Diffstat (limited to 'lib/Target/X86')
40 files changed, 2363 insertions, 1906 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index e0989b0..c352bfc 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -928,6 +928,18 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, Operands.erase(Operands.begin() + 1); } } + + // Transforms "int $3" into "int3" as a size optimization. We can't write an + // InstAlias with an immediate operand yet. + if (Name == "int" && Operands.size() == 2) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand*>(Operands[0])->setTokenValue("int3"); + } + } return false; } diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index d4a88d7..a9c90f8 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -485,7 +485,7 @@ struct InternalInstruction { consumed___ indicates that the byte was already consumed and does not need to be consumed again */ - /* The VEX.vvvv field, which contains a thrid register operand for some AVX + /* The VEX.vvvv field, which contains a third register operand for some AVX instructions */ Reg vvvv; diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index d006eca..68247d2 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -41,8 +41,15 @@ X86ATTInstPrinter::X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) &TM.getSubtarget<X86Subtarget>())); } +void X86ATTInstPrinter::printRegName(raw_ostream &OS, + unsigned RegNo) const { + OS << '%' << getRegisterName(RegNo); +} + void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { - printInstruction(MI, OS); + // Try to print any aliases first. + if (!printAliasInstr(MI, OS)) + printInstruction(MI, OS); // If verbose assembly is enabled, we can print some informative comments. if (CommentStream) diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index f24674f..5f939b6 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -26,11 +26,14 @@ class X86ATTInstPrinter : public MCInstPrinter { public: X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI); + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; // Methods used to print the alias of an instruction. unsigned ComputeAvailableFeatures(const X86Subtarget *Subtarget) const; + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. bool printAliasInstr(const MCInst *MI, raw_ostream &OS); // Autogenerated by tblgen.
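A note on the "int $3" rewrite in the X86AsmParser hunk above: the generic "int imm8" form encodes as the two-byte sequence CD ib, while int3 has a dedicated one-byte opcode, CC, so canonicalizing "int $3" to "int3" saves one byte per breakpoint. A minimal standalone sketch of the same operand rewrite; Operand and canonicalizeInt3 are made-up illustrative names, not the LLVM MC API:

#include <string>
#include <vector>

struct Operand { bool IsImm = false; long Imm = 0; std::string Tok; };

// Mirrors the parser hunk above: for "int $3", drop the immediate operand
// and retarget the mnemonic token so the one-byte int3 encoding is used.
static void canonicalizeInt3(std::vector<Operand> &Ops) {
  if (Ops.size() == 2 && Ops[0].Tok == "int" &&
      Ops[1].IsImm && Ops[1].Imm == 3) {
    Ops.pop_back();      // erase the $3 immediate
    Ops[0].Tok = "int3"; // rewrite the mnemonic
  }
}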
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 47253eb..5f581ba 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -29,6 +29,10 @@ using namespace llvm; #define GET_INSTRUCTION_NAME #include "X86GenAsmWriter1.inc" +void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) { printInstruction(MI, OS); diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index ca99dc0..c8030c3 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -27,6 +27,7 @@ public: X86IntelInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI) : MCInstPrinter(MAI) {} + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS); virtual StringRef getOpcodeName(unsigned Opcode) const; diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt index e21d69a..bcfdf0b 100644 --- a/lib/Target/X86/README-X86-64.txt +++ b/lib/Target/X86/README-X86-64.txt @@ -36,7 +36,7 @@ _conv: cmovb %rcx, %rax ret -Seems like the jb branch has high likelyhood of being taken. It would have +Seems like the jb branch has high likelihood of being taken. It would have saved a few instructions. //===---------------------------------------------------------------------===// @@ -124,51 +124,6 @@ if we have whole-function selectiondags. //===---------------------------------------------------------------------===// -Take the following C code -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640): - -struct u1 -{ - float x; - float y; -}; - -float foo(struct u1 u) -{ - return u.x + u.y; -} - -Optimizes to the following IR: -define float @foo(double %u.0) nounwind readnone { -entry: - %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2] - %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1] - %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1] - %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1] - %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1] - %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1] - %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1] - ret float %0 -} - -And current llvm-gcc/clang output: - movd %xmm0, %rax - movd %eax, %xmm1 - shrq $32, %rax - movd %eax, %xmm0 - addss %xmm1, %xmm0 - ret - -We really shouldn't move the floats to RAX, only to immediately move them -straight back to the XMM registers. - -There really isn't any good way to handle this purely in IR optimizers; it -could possibly be handled by changing the output of the fronted, though. It -would also be feasible to add a x86-specific DAGCombine to optimize the -bitcast+trunc+(lshr+)bitcast combination. - -//===---------------------------------------------------------------------===// - Take the following code (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653): extern unsigned long table[]; diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 1ac2305..560947a 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -7,14 +7,6 @@ copy (3-addr bswap + memory support?) This is available on Atom processors. //===---------------------------------------------------------------------===// -CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. 
The X86 -backend knows how to three-addressify this shift, but it appears the register -allocator isn't even asking it to do so in this case. We should investigate -why this isn't happening, it could have significant impact on other important -cases for X86 as well. - -//===---------------------------------------------------------------------===// - This should be one DIV/IDIV instruction, not a libcall: unsigned test(unsigned long long X, unsigned Y) { @@ -1572,7 +1564,7 @@ Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations: 1. ix86_pad_returns inserts a noop before ret instructions if immediately - preceeded by a conditional branch or is the target of a jump. + preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. @@ -1736,26 +1728,6 @@ are functionally identical. //===---------------------------------------------------------------------===// Take the following C code: -int x(int y) { return (y & 63) << 14; } - -Code produced by gcc: - andl $63, %edi - sall $14, %edi - movl %edi, %eax - ret - -Code produced by clang: - shll $14, %edi - movl %edi, %eax - andl $1032192, %eax - ret - -The code produced by gcc is 3 bytes shorter. This sort of construct often -shows up with bitfields. - -//===---------------------------------------------------------------------===// - -Take the following C code: int f(int a, int b) { return (unsigned char)a == (unsigned char)b; } We generate the following IR with clang: @@ -2016,3 +1988,81 @@ We currently generate: We could save an instruction here by commuting the addss. //===---------------------------------------------------------------------===// + +This (from PR9661): + +float clamp_float(float a) { + if (a > 1.0f) + return 1.0f; + else if (a < 0.0f) + return 0.0f; + else + return a; +} + +Could compile to: + +clamp_float: # @clamp_float + movss .LCPI0_0(%rip), %xmm1 + minss %xmm1, %xmm0 + pxor %xmm1, %xmm1 + maxss %xmm1, %xmm0 + ret + +with -ffast-math. + +//===---------------------------------------------------------------------===// + +This function (from PR9803): + +int clamp2(int a) { + if (a > 5) + a = 5; + if (a < 0) + return 0; + return a; +} + +Compiles to: + +_clamp2: ## @clamp2 + pushq %rbp + movq %rsp, %rbp + cmpl $5, %edi + movl $5, %ecx + cmovlel %edi, %ecx + testl %ecx, %ecx + movl $0, %eax + cmovnsl %ecx, %eax + popq %rbp + ret + +The move of 0 could be scheduled above the test to make it an xor reg,reg. + +//===---------------------------------------------------------------------===// + +GCC PR48986. We currently compile this: + +void bar(void); +void yyy(int* p) { + if (__sync_fetch_and_add(p, -1) == 1) + bar(); +} + +into: + movl $-1, %eax + lock + xaddl %eax, (%rdi) + cmpl $1, %eax + je LBB0_2 + +Instead we could generate: + + lock + dec %rdi + je LBB0_2 + +The trick is to match "fetch_and_add(X, -C) == C". + +//===---------------------------------------------------------------------===// + diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index efb6c8c..7bb9676 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -1,13 +1,13 @@ //===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===// -// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details.
-// +// //===----------------------------------------------------------------------===// // -// This is a target description file for the Intel i386 architecture, refered to +// This is a target description file for the Intel i386 architecture, referred to // here as the "X86" architecture. // //===----------------------------------------------------------------------===// @@ -32,7 +32,7 @@ def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all - // SSE1+ processors support them. + // SSE1+ processors support them. [FeatureMMX, FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", @@ -50,7 +50,8 @@ def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41, FeaturePOPCNT]>; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", - "Enable 3DNow! instructions">; + "Enable 3DNow! instructions", + [FeatureMMX]>; def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", "Enable 3DNow! Athlon instructions", [Feature3DNow]>; @@ -100,8 +101,10 @@ def : Proc<"i686", []>; def : Proc<"pentiumpro", [FeatureCMOV]>; def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; def : Proc<"pentium3", [FeatureSSE1]>; +def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; +def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; @@ -121,14 +124,14 @@ def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. // FIXME: Disabling AVX for now since it's not ready. 
-def : Proc<"sandybridge", [FeatureSSE42, Feature64Bit, +def : Proc<"corei7-avx", [FeatureSSE42, Feature64Bit, FeatureAES, FeatureCLMUL]>; def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>; -def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>; -def : Proc<"athlon", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; -def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"k6-2", [Feature3DNow]>; +def : Proc<"k6-3", [Feature3DNow]>; +def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; @@ -156,8 +159,8 @@ def : Proc<"shanghai", [Feature3DNowA, Feature64Bit, FeatureSSE4A, Feature3DNowA]>; def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>; -def : Proc<"c3", [FeatureMMX, Feature3DNow]>; +def : Proc<"winchip2", [Feature3DNow]>; +def : Proc<"c3", [Feature3DNow]>; def : Proc<"c3-2", [FeatureSSE1]>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index a7581eb..4d7d96d 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Object/MachOFormat.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -28,6 +29,13 @@ #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; +// Option to allow disabling arithmetic relaxation to workaround PR9807, which +// is useful when running bitwise comparison experiments on Darwin. We should be +// able to remove this once PR9807 is resolved. +static cl::opt<bool> +MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", + cl::desc("Disable relaxation of arithmetic instruction for X86")); + static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: assert(0 && "invalid fixup kind!"); @@ -201,6 +209,9 @@ bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) return true; + if (MCDisableArithRelaxation) + return false; + // Check if this instruction is ever relaxable. 
if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) return false; @@ -414,34 +425,26 @@ public: TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_32AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_32AsmBackend(T); - else - return new WindowsX86AsmBackend(T, false); - default: - return new ELFX86_32AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, false); + + return new ELFX86_32AsmBackend(T, TheTriple.getOS()); } TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const std::string &TT) { - switch (Triple(TT).getOS()) { - case Triple::Darwin: + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return new DarwinX86_64AsmBackend(T); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (Triple(TT).getEnvironment() == Triple::MachO) - return new DarwinX86_64AsmBackend(T); - else - return new WindowsX86AsmBackend(T, true); - default: - return new ELFX86_64AsmBackend(T, Triple(TT).getOS()); - } + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, true); + + return new ELFX86_64AsmBackend(T, TheTriple.getOS()); } diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 8874486..f1b9972 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -23,6 +23,7 @@ #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Operator.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -77,10 +78,8 @@ private: bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); - bool X86FastEmitStore(EVT VT, const Value *Val, - const X86AddressMode &AM); - bool X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); @@ -109,11 +108,11 @@ private: bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); - bool X86SelectExtractValue(const Instruction *I); - bool X86VisitIntrinsicCall(const IntrinsicInst &I); bool X86SelectCall(const Instruction *I); + bool DoSelectCall(const Instruction *I, const char *MemIntName); + const X86InstrInfo *getInstrInfo() const { return getTargetMachine()->getInstrInfo(); } @@ -125,6 +124,8 @@ private: unsigned TargetMaterializeAlloca(const AllocaInst *C); + unsigned TargetMaterializeFloatZero(const ConstantFP *CF); + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { @@ -133,6 +134,11 @@ private: } bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false); + + bool IsMemcpySmall(uint64_t Len); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); }; } // end anonymous namespace. 
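The IsMemcpySmall and TryEmitSmallMemcpy helpers declared at the end of the class above (and defined later in this patch) inline small constant-length memcpys, up to 32 bytes on x86-64 and 16 on x86-32, as a run of the widest integer loads and stores available. A self-contained C++ sketch of the chunk-size rule, assuming as the patch does that i64 accesses are only used in 64-bit mode; nextChunkBytes is an illustrative name, not the LLVM API:

#include <cassert>
#include <cstdint>

// Width of the next load/store pair, as in TryEmitSmallMemcpy: e.g. a
// 15-byte copy on x86-64 lowers to i64 + i32 + i16 + i8 accesses.
static unsigned nextChunkBytes(uint64_t Len, bool Is64Bit) {
  if (Len >= 8 && Is64Bit) return 8; // MVT::i64
  if (Len >= 4) return 4;            // MVT::i32
  if (Len >= 2) return 2;            // MVT::i16
  assert(Len == 1 && "caller consumes Len chunk by chunk");
  return 1;                          // MVT::i8
}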
@@ -224,8 +230,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool -X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, - const X86AddressMode &AM) { +X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -395,43 +400,45 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { const Value *Op = *i; if (const StructType *STy = dyn_cast<StructType>(*GTI)) { const StructLayout *SL = TD.getStructLayout(STy); - unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); - Disp += SL->getElementOffset(Idx); - } else { - uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); - for (;;) { - if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { - // Constant-offset addressing. - Disp += CI->getSExtValue() * S; - break; - } - if (isa<AddOperator>(Op) && - (!isa<Instruction>(Op) || - FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] - == FuncInfo.MBB) && - isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { - // An add (in the same block) with a constant operand. Fold the - // constant. - ConstantInt *CI = - cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); - Disp += CI->getSExtValue() * S; - // Iterate on the other operand. - Op = cast<AddOperator>(Op)->getOperand(0); - continue; - } - if (IndexReg == 0 && - (!AM.GV || !Subtarget->isPICStyleRIPRel()) && - (S == 1 || S == 2 || S == 4 || S == 8)) { - // Scaled-index addressing. - Scale = S; - IndexReg = getRegForGEPIndex(Op).first; - if (IndexReg == 0) - return false; - break; - } - // Unsupported. - goto unsupported_gep; + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // An array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; } + // Unsupported. + goto unsupported_gep; } } // Check for displacement overflow. @@ -445,7 +452,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (X86SelectAddress(U->getOperand(0), AM)) return true; - // If we couldn't merge the sub value into this addr mode, revert back to + // If we couldn't merge the gep value into this addr mode, revert back to // our address and just match the value instead of completely failing.
AM = SavedAM; break; @@ -457,91 +464,91 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // Handle constant address. if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { - // Can't handle alternate code models yet. + // Can't handle alternate code models or TLS yet. if (TM.getCodeModel() != CodeModel::Small) return false; - // RIP-relative addresses can't have additional register operands. - if (Subtarget->isPICStyleRIPRel() && - (AM.Base.Reg != 0 || AM.IndexReg != 0)) - return false; - - // Can't handle TLS yet. if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->isThreadLocal()) return false; - // Okay, we've committed to selecting this global. Set up the basic address. - AM.GV = GV; - - // Allow the subtarget to classify the global. - unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - // If this reference is relative to the pic base, set it now. - if (isGlobalRelativeToPICBase(GVFlags)) { - // FIXME: How do we know Base.Reg is free?? - AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); - } + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } - // Unless the ABI requires an extra load, return a direct reference to - // the global. - if (!isGlobalStubReference(GVFlags)) { - if (Subtarget->isPICStyleRIPRel()) { - // Use rip-relative addressing if we can. Above we verified that the - // base and index registers are unused. - assert(AM.Base.Reg == 0 && AM.IndexReg == 0); - AM.Base.Reg = X86::RIP; + // Unless the ABI requires an extra load, return a direct reference to + // the global. + if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; } - AM.GVOpFlags = GVFlags; - return true; - } - // Ok, we need to do a load from a stub. If we've already loaded from this - // stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); - unsigned LoadReg; - if (I != LocalValueMap.end() && I->second != 0) { - LoadReg = I->second; - } else { - // Issue load from stub. - unsigned Opc = 0; - const TargetRegisterClass *RC = NULL; - X86AddressMode StubAM; - StubAM.Base.Reg = AM.Base.Reg; - StubAM.GV = GV; - StubAM.GVOpFlags = GVFlags; - - // Prepare for inserting code in the local-value area. - SavePoint SaveInsertPt = enterLocalValueArea(); - - if (TLI.getPointerTy() == MVT::i64) { - Opc = X86::MOV64rm; - RC = X86::GR64RegisterClass; - - if (Subtarget->isPICStyleRIPRel()) - StubAM.Base.Reg = X86::RIP; + // Ok, we need to do a load from a stub. 
If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; } else { - Opc = X86::MOV32rm; - RC = X86::GR32RegisterClass; - } + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = X86::GR64RegisterClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = X86::GR32RegisterClass; + } - LoadReg = createResultReg(RC); - MachineInstrBuilder LoadMI = - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); - addFullAddress(LoadMI, StubAM); + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); - // Ok, back to normal mode. - leaveLocalValueArea(SaveInsertPt); + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); - // Prevent loading GV stub multiple times in same MBB. - LocalValueMap[V] = LoadReg; - } + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } - // Now construct the final address. Note that the Disp, Scale, - // and Index values may already be set here. - AM.Base.Reg = LoadReg; - AM.GV = 0; - return true; + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = 0; + return true; + } } // If all else fails, try to materialize the value in a register. @@ -699,7 +706,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ValLocs; - CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext()); + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, + I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); @@ -719,18 +727,38 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Only handle register returns for now. if (!VA.isRegLoc()) return false; - // TODO: For now, don't try to handle cases where getLocInfo() - // says Full but the types don't match. - if (TLI.getValueType(RV->getType()) != VA.getValVT()) - return false; // The calling-convention tables for x87 returns don't tell // the whole story. if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) return false; - // Make the copy. unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? 
ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. unsigned DstReg = VA.getLocReg(); const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. @@ -862,12 +890,9 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { unsigned NEReg = createResultReg(&X86::GR8RegClass); unsigned PReg = createResultReg(&X86::GR8RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETNEr), NEReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::SETPr), PReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::OR8rr), ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg) .addReg(PReg).addReg(NEReg); UpdateValueMap(I, ResultReg); return true; @@ -914,18 +939,31 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { bool X86FastISel::X86SelectZExt(const Instruction *I) { // Handle zero-extension from i1 to i8, which is common. - if (I->getType()->isIntegerTy(8) && - I->getOperand(0)->getType()->isIntegerTy(1)) { - unsigned ResultReg = getRegForValue(I->getOperand(0)); - if (ResultReg == 0) return false; - // Set the high bits to zero. - ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); - if (ResultReg == 0) return false; - UpdateValueMap(I, ResultReg); - return true; + if (!I->getOperand(0)->getType()->isIntegerTy(1)) + return false; + + EVT DstVT = TLI.getValueType(I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Set the high bits to zero. + ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + if (ResultReg == 0) + return false; + + if (DstVT != MVT::i8) { + ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; } - return false; + UpdateValueMap(I, ResultReg); + return true; } @@ -1008,71 +1046,49 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { FuncInfo.MBB->addSuccessor(TrueMBB); return true; } - } else if (ExtractValueInst *EI = - dyn_cast<ExtractValueInst>(BI->getCondition())) { - // Check to see if the branch instruction is from an "arithmetic with - // overflow" intrinsic. The main way these intrinsics are used is: - // - // %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - // %sum = extractvalue { i32, i1 } %t, 0 - // %obit = extractvalue { i32, i1 } %t, 1 - // br i1 %obit, label %overflow, label %normal - // - // The %sum and %obit are converted in an ADD and a SETO/SETB before - // reaching the branch. Therefore, we search backwards through the MBB - // looking for the SETO/SETB instruction. If an instruction modifies the - // EFLAGS register before we reach the SETO/SETB instruction, then we can't - // convert the branch into a JO/JB instruction. 
- if (const IntrinsicInst *CI = - dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){ - if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow || - CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) { - const MachineInstr *SetMI = 0; - unsigned Reg = getRegForValue(EI); - - for (MachineBasicBlock::const_reverse_iterator - RI = FuncInfo.MBB->rbegin(), RE = FuncInfo.MBB->rend(); - RI != RE; ++RI) { - const MachineInstr &MI = *RI; - - if (MI.definesRegister(Reg)) { - if (MI.isCopy()) { - Reg = MI.getOperand(1).getReg(); - continue; - } - - SetMI = &MI; - break; - } - - const TargetInstrDesc &TID = MI.getDesc(); - if (TID.hasImplicitDefOfPhysReg(X86::EFLAGS) || - MI.hasUnmodeledSideEffects()) - break; - } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. + MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); - if (SetMI) { - unsigned OpCode = SetMI->getOpcode(); - - if (OpCode == X86::SETOr || OpCode == X86::SETBr) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(OpCode == X86::SETOr ? X86::JO_4 : X86::JB_4)) - .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DL); - FuncInfo.MBB->addSuccessor(TrueMBB); - return true; - } + unsigned JmpOpc = X86::JNE_4; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_4; } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; } } } // Otherwise do a clumsy setcc and re-test it. + // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. 
unsigned OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) - .addReg(OpReg).addReg(OpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4)) .addMBB(TrueMBB); FastEmitBranch(FalseMBB, DL); @@ -1081,42 +1097,42 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } bool X86FastISel::X86SelectShift(const Instruction *I) { - unsigned CReg = 0, OpReg = 0, OpImm = 0; + unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = NULL; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break; - case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break; - case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break; + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break; - case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break; - case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break; + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(32)) { CReg = X86::ECX; RC = &X86::GR32RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break; - case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break; - case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break; + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(64)) { CReg = X86::RCX; RC = &X86::GR64RegClass; switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break; - case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break; - case Instruction::Shl: OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break; + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; default: return false; } } else { @@ -1130,15 +1146,6 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; - // Fold immediate in shl(x,3). 
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { - unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm), - ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff); - UpdateValueMap(I, ResultReg); - return true; - } - unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), @@ -1238,18 +1245,13 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { } bool X86FastISel::X86SelectTrunc(const Instruction *I) { - if (Subtarget->is64Bit()) - // All other cases should be handled by the tblgen generated code. - return false; EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); - // This code only handles truncation to byte right now. + // This code only handles truncation to byte. if (DstVT != MVT::i8 && DstVT != MVT::i1) - // All other cases should be handled by the tblgen generated code. return false; - if (SrcVT != MVT::i16 && SrcVT != MVT::i32) - // All other cases should be handled by the tblgen generated code. + if (!TLI.isTypeLegal(SrcVT)) return false; unsigned InputReg = getRegForValue(I->getOperand(0)); @@ -1257,16 +1259,26 @@ // Unhandled operand. Halt "fast" selection and bail. return false; - // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) - ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; - unsigned CopyReg = createResultReg(CopyRC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - CopyReg).addReg(InputReg); + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + UpdateValueMap(I, InputReg); + return true; + } - // Then issue an extract_subreg. + if (!Subtarget->is64Bit()) { + // If we're on x86-32, we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) + ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CopyReg).addReg(InputReg); + InputReg = CopyReg; + } + + // Issue an extract_subreg. unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, - CopyReg, /*Kill=*/true, + InputReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; @@ -1275,35 +1287,92 @@ return true; } -bool X86FastISel::X86SelectExtractValue(const Instruction *I) { - const ExtractValueInst *EI = cast<ExtractValueInst>(I); - const Value *Agg = EI->getAggregateOperand(); +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 32 : 16); +} - if (const IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) { - switch (CI->getIntrinsicID()) { - default: break; - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: { - // Cheat a little. We know that the registers for "add" and "seto" are - // allocated sequentially. However, we only keep track of the register - // for "add" in the value map. Use extractvalue's index to get the - // correct register for "seto".
- unsigned OpReg = getRegForValue(Agg); - if (OpReg == 0) - return false; - UpdateValueMap(I, OpReg + *EI->idx_begin()); - return true; - } +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + assert(Len == 1); + VT = MVT::i8; } + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, Reg); + RV &= X86FastEmitStore(VT, Reg, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; } - return false; + return true; } bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // FIXME: Handle more intrinsics. switch (I.getIntrinsicID()) { default: return false; + case Intrinsic::memcpy: { + const MemCpyInst &MCI = cast<MemCpyInst>(I); + // Don't handle volatile or variable length memcpys. + if (MCI.isVolatile()) + return false; + + if (isa<ConstantInt>(MCI.getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. + uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI.getRawDest(), DestAM) || + !X86SelectAddress(MCI.getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memcpy"); + } + case Intrinsic::memset: { + const MemSetInst &MSI = cast<MemSetInst>(I); + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memset"); + } case Intrinsic::stackprotector: { // Emit code inline code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); @@ -1314,33 +1383,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // Grab the frame index. X86AddressMode AM; if (!X86SelectAddress(Slot, AM)) return false; - if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; - - return true; - } - case Intrinsic::objectsize: { - ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1)); - const Type *Ty = I.getCalledFunction()->getReturnType(); - - assert(CI && "Non-constant type in Intrinsic::objectsize?"); - - MVT VT; - if (!isTypeLegal(Ty, VT)) - return false; - - unsigned OpC = 0; - if (VT == MVT::i32) - OpC = X86::MOV32ri; - else if (VT == MVT::i64) - OpC = X86::MOV64ri; - else - return false; - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg). - addImm(CI->isZero() ? 
-1ULL : 0); - UpdateValueMap(&I, ResultReg); return true; } case Intrinsic::dbg_declare: { @@ -1362,11 +1405,10 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: { + // FIXME: Should fold immediates. + // Replace "add with overflow" intrinsics with an "add" instruction followed - // by a seto/setc instruction. Later on, when the "extractvalue" - // instructions are encountered, we use the fact that two registers were - // created sequentially to get the correct registers for the "sum" and the - // "overflow bit". + // by a seto/setc instruction. const Function *Callee = I.getCalledFunction(); const Type *RetTy = cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); @@ -1392,27 +1434,18 @@ else return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + // The call to CreateRegs builds two sequential registers, to store + // both of the returned values. + unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg) .addReg(Reg1).addReg(Reg2); - unsigned DestReg1 = UpdateValueMap(&I, ResultReg); - - // If the add with overflow is an intra-block value then we just want to - // create temporaries for it like normal. If it is a cross-block value then - // UpdateValueMap will return the cross-block register used. Since we - // *really* want the value to be live in the register pair known by - // UpdateValueMap, we have to use DestReg1+1 as the destination register in - // the cross block case. In the non-cross-block case, we should just make - // another register for the value. - if (DestReg1 != ResultReg) - ResultReg = DestReg1+1; - else - ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8)); unsigned Opc = X86::SETBr; if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) Opc = X86::SETOr; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1); + + UpdateValueMap(&I, ResultReg, 2); return true; } } @@ -1430,11 +1463,18 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) return X86VisitIntrinsicCall(*II); + return DoSelectCall(I, 0); +} + +// Select either a call, or an llvm.memcpy/memmove/memset intrinsic +bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + // Handle only C and fastcc calling conventions for now. ImmutableCallSite CS(CI); CallingConv::ID CC = CS.getCallingConv(); - if (CC != CallingConv::C && - CC != CallingConv::Fast && + if (CC != CallingConv::C && CC != CallingConv::Fast && CC != CallingConv::X86_FastCall) return false; @@ -1443,22 +1483,28 @@ if (CC == CallingConv::Fast && GuaranteedTailCallOpt) return false; - // Let SDISel handle vararg functions. const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); const FunctionType *FTy = cast<FunctionType>(PT->getElementType()); - if (FTy->isVarArg()) + bool isVarArg = FTy->isVarArg(); + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. Special handling for x86-64 is implemented.
+ if (isVarArg && Subtarget->isTargetWin64()) return false; // Fast-isel doesn't know about callee-pop yet. - if (Subtarget->IsCalleePop(FTy->isVarArg(), CC)) + if (Subtarget->IsCalleePop(isVarArg, CC)) return false; - // Handle *simple* calls for now. - const Type *RetTy = CS.getType(); - MVT RetVT; - if (RetTy->isVoidTy()) - RetVT = MVT::isVoid; - else if (!isTypeLegal(RetTy, RetVT, true)) + // Check whether the function can return without sret-demotion. + SmallVector<ISD::OutputArg, 4> Outs; + SmallVector<uint64_t, 4> Offsets; + GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(), + Outs, TLI, &Offsets); + bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), + *FuncInfo.MF, FTy->isVarArg(), + Outs, FTy->getContext()); + if (!CanLowerReturn) return false; // Materialize callee address in a register. FIXME: GV address can be @@ -1475,13 +1521,6 @@ } else return false; - // Allow calls which produce i1 results. - bool AndToI1 = false; - if (RetVT == MVT::i1) { - RetVT = MVT::i8; - AndToI1 = true; - } - // Deal with call operands first. SmallVector<const Value *, 8> ArgVals; SmallVector<unsigned, 8> Args; @@ -1493,9 +1532,11 @@ ArgFlags.reserve(CS.arg_size()); for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) { - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; + // If we're lowering a mem intrinsic instead of a regular call, skip the + // last two arguments, which should not be passed to the underlying functions. + if (MemIntName && e-i <= 2) + break; + Value *ArgVal = *i; ISD::ArgFlagsTy Flags; unsigned AttrInd = i - CS.arg_begin() + 1; if (CS.paramHasAttr(AttrInd, Attribute::SExt)) Flags.setSExt(); if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) Flags.setZExt(); - // FIXME: Only handle *easy* calls for now. - if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || - CS.paramHasAttr(AttrInd, Attribute::Nest) || - CS.paramHasAttr(AttrInd, Attribute::ByVal)) - return false; + if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) { + const PointerType *Ty = cast<PointerType>(ArgVal->getType()); + const Type *ElementTy = Ty->getElementType(); + unsigned FrameSize = TD.getTypeAllocSize(ElementTy); + unsigned FrameAlign = CS.getParamAlignment(AttrInd); + if (!FrameAlign) + FrameAlign = TLI.getByValTypeAlignment(ElementTy); + Flags.setByVal(); + Flags.setByValSize(FrameSize); + Flags.setByValAlign(FrameAlign); + if (!IsMemcpySmall(FrameSize)) + return false; + } + + if (CS.paramHasAttr(AttrInd, Attribute::InReg)) + Flags.setInReg(); + if (CS.paramHasAttr(AttrInd, Attribute::Nest)) + Flags.setNest(); + + // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all fastisel supported + // calling conventions on x86. + if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) { + if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 || + CI->getBitWidth() == 16) { + if (Flags.isSExt()) + ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext())); + else + ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); + } + } + + unsigned ArgReg; - const Type *ArgTy = (*i)->getType(); + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1".
+ if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) && + cast<TruncInst>(ArgVal)->getParent() == I->getParent() && + ArgVal->hasOneUse()) { + ArgVal = cast<TruncInst>(ArgVal)->getOperand(0); + ArgReg = getRegForValue(ArgVal); + if (ArgReg == 0) return false; + + MVT ArgVT; + if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; + + ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, + ArgVal->hasOneUse(), 1); + } else { + ArgReg = getRegForValue(ArgVal); + } + + if (ArgReg == 0) return false; + + const Type *ArgTy = ArgVal->getType(); MVT ArgVT; if (!isTypeLegal(ArgTy, ArgVT)) return false; + if (ArgVT == MVT::x86mmx) + return false; unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); Flags.setOrigAlign(OriginalAlignment); - Args.push_back(Arg); - ArgVals.push_back(*i); + Args.push_back(ArgReg); + ArgVals.push_back(ArgVal); ArgVTs.push_back(ArgVT); ArgFlags.push_back(Flags); } // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext()); + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, + I->getParent()->getContext()); // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetWin64()) CCInfo.AllocateStack(32, 8); - } CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); @@ -1555,6 +1645,8 @@ default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); assert(Emitted && "Failed to emit a sext!"); (void)Emitted; @@ -1562,6 +1654,8 @@ break; } case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); assert(Emitted && "Failed to emit a zext!"); (void)Emitted; @@ -1569,9 +1663,8 @@ break; } case CCValAssign::AExt: { - // We don't handle MMX parameters yet. - if (VA.getLocVT().isVector() && VA.getLocVT().getSizeInBits() == 128) - return false; + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); if (!Emitted) @@ -1605,14 +1698,21 @@ AM.Base.Reg = StackPtr; AM.Disp = LocMemOffset; const Value *ArgVal = ArgVals[VA.getValNo()]; - - // If this is a really simple value, emit this with the Value* version of - // X86FastEmitStore. If it isn't simple, we don't want to do this, as it - // can cause us to reevaluate the argument. - if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) + ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()]; + + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = Arg; + bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()); + assert(Res && "memcpy length already checked!"); (void)Res; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + // of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument.
X86FastEmitStore(ArgVT, ArgVal, AM); - else + } else { X86FastEmitStore(ArgVT, Arg, AM); + } } } @@ -1624,6 +1724,17 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { X86::EBX).addReg(Base); } + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { + // Count the number of XMM registers allocated. + static const unsigned XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { @@ -1662,7 +1773,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -1670,80 +1782,100 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { } - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) - .addGlobalAddress(GV, 0, OpFlags); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)); + if (MemIntName) + MIB.addExternalSymbol(MemIntName, OpFlags); + else + MIB.addGlobalAddress(GV, 0, OpFlags); } // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX); + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) + MIB.addReg(X86::AL); + // Add implicit physical register uses to the call. for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) MIB.addReg(RegArgs[i]); // Issue CALLSEQ_END unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode(); + unsigned NumBytesCallee = 0; + if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet)) + NumBytesCallee = 4; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) - .addImm(NumBytes).addImm(0); + .addImm(NumBytes).addImm(NumBytesCallee); + + // Build info for return calling conv lowering code. + // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo. + SmallVector<ISD::InputArg, 32> Ins; + SmallVector<EVT, 4> RetTys; + ComputeValueVTs(TLI, I->getType(), RetTys); + for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { + EVT VT = RetTys[i]; + EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); + unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); + for (unsigned j = 0; j != NumRegs; ++j) { + ISD::InputArg MyFlags; + MyFlags.VT = RegisterVT.getSimpleVT(); + MyFlags.Used = !CS.getInstruction()->use_empty(); + if (CS.paramHasAttr(0, Attribute::SExt)) + MyFlags.Flags.setSExt(); + if (CS.paramHasAttr(0, Attribute::ZExt)) + MyFlags.Flags.setZExt(); + if (CS.paramHasAttr(0, Attribute::InReg)) + MyFlags.Flags.setInReg(); + Ins.push_back(MyFlags); + } + } - // Now handle call return value (if any). + // Now handle call return values. SmallVector<unsigned, 4> UsedRegs; - if (RetVT != MVT::isVoid) { - SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext()); - CCInfo.AnalyzeCallResult(RetVT, RetCC_X86); - - // Copy all of the result registers out of their specified physreg. 
- assert(RVLocs.size() == 1 && "Can't handle multi-value calls!"); - EVT CopyVT = RVLocs[0].getValVT(); - TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, + I->getParent()->getContext()); + unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + EVT CopyVT = RVLocs[i].getValVT(); + unsigned CopyReg = ResultReg + i; // If this is a call to a function that returns an fp value on the x87 fp // stack, but where we prefer to use the value in xmm registers, copy it // out as F80 and use a truncate to move it from fp stack reg to xmm reg. - if ((RVLocs[0].getLocReg() == X86::ST0 || - RVLocs[0].getLocReg() == X86::ST1) && + if ((RVLocs[i].getLocReg() == X86::ST0 || + RVLocs[i].getLocReg() == X86::ST1) && isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) { CopyVT = MVT::f80; - DstRC = X86::RFP80RegisterClass; + CopyReg = createResultReg(X86::RFP80RegisterClass); } - unsigned ResultReg = createResultReg(DstRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - ResultReg).addReg(RVLocs[0].getLocReg()); - UsedRegs.push_back(RVLocs[0].getLocReg()); + CopyReg).addReg(RVLocs[i].getLocReg()); + UsedRegs.push_back(RVLocs[i].getLocReg()); - if (CopyVT != RVLocs[0].getValVT()) { + if (CopyVT != RVLocs[i].getValVT()) { // Round the F80 the right size, which also moves to the appropriate xmm // register. This is accomplished by storing the F80 value in memory and // then loading it back. Ewww... - EVT ResVT = RVLocs[0].getValVT(); + EVT ResVT = RVLocs[i].getValVT(); unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; int FI = MFI.CreateStackObject(MemSize, MemSize, false); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)), FI) - .addReg(ResultReg); - DstRC = ResVT == MVT::f32 - ? X86::FR32RegisterClass : X86::FR64RegisterClass; + .addReg(CopyReg); Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; - ResultReg = createResultReg(DstRC); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(Opc), ResultReg), FI); - } - - if (AndToI1) { - // Mask out all but lowest bit for some call which produces an i1. - unsigned AndResult = createResultReg(X86::GR8RegisterClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1); - ResultReg = AndResult; + TII.get(Opc), ResultReg + i), FI); } - - UpdateValueMap(I, ResultReg); } + if (RVLocs.size()) + UpdateValueMap(I, ResultReg, RVLocs.size()); + // Set all unused physreg defs as dead. static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); @@ -1782,8 +1914,6 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectFPExt(I); case Instruction::FPTrunc: return X86SelectFPTrunc(I); - case Instruction::ExtractValue: - return X86SelectExtractValue(I); case Instruction::IntToPtr: // Deliberate fall-through. case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); @@ -1856,10 +1986,13 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { if (isa<GlobalValue>(C)) { X86AddressMode AM; if (X86SelectAddress(C, AM)) { - if (TLI.getPointerTy() == MVT::i32) - Opc = X86::LEA32r; - else - Opc = X86::LEA64r; + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. 
+ if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + return AM.Base.Reg; + + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg), AM); @@ -1921,6 +2054,45 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { return ResultReg; } +unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return false; + + // Get opcode and regclass for the given zero. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: + if (Subtarget->hasSSE1()) { + Opc = X86::FsFLD0SS; + RC = X86::FR32RegisterClass; + } else { + Opc = X86::LD_Fp032; + RC = X86::RFP32RegisterClass; + } + break; + case MVT::f64: + if (Subtarget->hasSSE2()) { + Opc = X86::FsFLD0SD; + RC = X86::FR64RegisterClass; + } else { + Opc = X86::LD_Fp064; + RC = X86::RFP64RegisterClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + return ResultReg; +} + + /// TryToFoldLoad - The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 3aaa693..325d061 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1307,7 +1307,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // set up by FpSET_ST0, and our StackTop is off by one because of it. unsigned Op0 = getFPReg(MI->getOperand(0)); // Restore the actual StackTop from before Fp_SET_ST0. - // Note we can't handle Fp_SET_ST1 without a preceeding Fp_SET_ST0, and we + // Note we can't handle Fp_SET_ST1 without a preceding Fp_SET_ST0, and we // are not enforcing the constraint. ++StackTop; unsigned RegOnTop = getStackEntry(0); // This reg must remain in st(0). diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 071fbe0..cd4e954 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1,4 +1,4 @@ -//=======- X86FrameLowering.cpp - X86 Frame Information ------------*- C++ -*-====// +//=======- X86FrameLowering.cpp - X86 Frame Information --------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" @@ -159,8 +160,10 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, Opc = isSub ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) : (Is64Bit ? 
X86::POP64r : X86::POP32r); - BuildMI(MBB, MBBI, DL, TII.get(Opc)) + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); Offset -= ThisVal; continue; } @@ -170,6 +173,8 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(ThisVal); + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. Offset -= ThisVal; } @@ -296,7 +301,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, // FIXME: This is dirty hack. The code itself is pretty mess right now. // It should be rewritten from scratch and generalized sometimes. - // Determine maximum offset (minumum due to stack growth). + // Determine maximum offset (minimum due to stack growth). int64_t MaxOffset = 0; for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) @@ -354,7 +359,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || - !Fn->doesNotThrow() || UnwindTablesMandatory; + Fn->needsUnwindTableEntry(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); @@ -408,7 +413,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { TII.get(getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta)), StackPtr) .addReg(StackPtr) - .addImm(-TailCallReturnAddrDelta); + .addImm(-TailCallReturnAddrDelta) + .setMIFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } @@ -446,7 +452,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) - .addReg(FramePtr, RegState::Kill); + .addReg(FramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); if (needsFrameMoves) { // Mark the place where EBP/RBP was saved. @@ -473,7 +480,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Update EBP with the new base value... BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) - .addReg(StackPtr); + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); if (needsFrameMoves) { // Mark effective beginning of when frame pointer becomes valid. @@ -615,7 +623,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII, *RegInfo); - if ((NumBytes || PushedRegs) && needsFrameMoves) { + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. 
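+ // Once the frame pointer is established, the CFA is defined relative to
+ // EBP/RBP, so a stack-pointer-only adjustment no longer needs its own
+ // unwind label; hence the !HasFP check above.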
MCSymbol *Label = MMI.getContext().CreateTempSymbol(); BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); @@ -641,7 +649,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } void X86FrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { + MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); @@ -785,7 +793,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, assert(Offset >= 0 && "Offset should never be negative"); if (Offset) { - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo); } @@ -829,7 +837,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, int delta = -1*X86FI->getTCReturnAddrDelta(); MBBI = MBB.getLastNonDebugInstr(); - // Check for possible merge with preceeding ADD instruction. + // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo); } @@ -918,7 +926,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, // X86RegisterInfo::emitPrologue will handle spilling of frame register. continue; CalleeFrameSize += SlotSize; - BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill); + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); } X86FI->setCalleeSavedFrameSize(CalleeFrameSize); diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b0ec6e..1fcc274 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -189,6 +189,7 @@ namespace { SDNode *Select(SDNode *N); SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); + SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); @@ -1329,6 +1330,8 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { return ResNode; } +// FIXME: Figure out some way to unify this with the 'or' and other code +// below. 
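+// The motivating pattern is "__sync_fetch_and_add(&x, n)" with an unused
+// result, which can then be selected as a single "lock add" to memory
+// instead of a "lock xadd" that ties up a register for the dead old value.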
SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { if (Node->hasAnyUseOfValue(0)) return 0; @@ -1479,6 +1482,158 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { } } +enum AtomicOpc { + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr + } +}; + +SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return 0; + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. + // FIXME: Same as for 'add' and 'sub', try to merge those down here. + SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return 0; + + // Which index into the table. 
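+ // The first index selects the operation, the second the operand form:
+ // plain register or an immediate, with the narrowest encodable immediate
+ // chosen below via immSext8/i64immSExt32.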
+ enum AtomicOpc Op; + switch (Node->getOpcode()) { + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + default: + return 0; + } + + bool isCN = false; + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val); + if (CN) { + isCN = true; + Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT); + } + + unsigned Opc = 0; + switch (NVT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: + if (isCN) + Opc = AtomicOpcTbl[Op][ConstantI8]; + else + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI16]; + else + Opc = AtomicOpcTbl[Op][ConstantI16]; + } else + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI32]; + else + Opc = AtomicOpcTbl[Op][ConstantI32]; + } else + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI64]; + else if (i64immSExt32(Val.getNode())) + Opc = AtomicOpcTbl[Op][ConstantI64]; + } else + Opc = AtomicOpcTbl[Op][I64]; + break; + } + + DebugLoc dl = Node->getDebugLoc(); + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); +} + /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has /// any uses which require the SF or OF bits to be accurate. static bool HasNoSignedComparisonUses(SDNode *N) { @@ -1580,6 +1735,89 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return RetVal; break; } + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: { + SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + if (RetVal) + return RetVal; + break; + } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val) + break; + + unsigned ShlOp, Op = 0; + EVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. 
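+ // For example, "(x << 8) | 0x1800" becomes "(x | 0x18) << 8": 0x18 fits
+ // in a sign-extended imm8, so OR32ri8 replaces OR32ri and the immediate
+ // shrinks from four bytes to one.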
+ if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. + if (NVT == CstVT) + break; + + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + + switch (Opcode) { + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + + switch (Opcode) { + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. + SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal)); + break; + } case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); @@ -1768,17 +2006,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = - SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16, + SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, MVT::Other, Ops, array_lengthof(Ops)), 0); Chain = Move.getValue(1); ReplaceUses(N0.getValue(1), Chain); } else { Move = - SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0); + SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0); Chain = CurDAG->getEntryNode(); } - Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue()); + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue()); InFlag = Chain.getValue(1); } else { InFlag = diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cd1d201..1cdf2b6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -222,7 +222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); - + // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. if (Subtarget->is64Bit()) @@ -574,6 +574,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + // Lower this to FGETSIGNx86 plus an AND. + setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); + setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); + // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); @@ -927,7 +931,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // Can turn SHL into an integer multiply. setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); - setOperationAction(ISD::SRL, MVT::v4i32, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. 
f32 vectors are @@ -949,6 +952,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } + if (Subtarget->hasSSE2()) { + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); + setOperationAction(ISD::SRL, MVT::v16i8, Custom); + + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v8i16, Custom); + + setOperationAction(ISD::SRA, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v8i16, Custom); + } + if (Subtarget->hasSSE42()) setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); @@ -1081,6 +1097,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SINT_TO_FP); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); @@ -1096,6 +1113,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; + + setPrefFunctionAlignment(4); } @@ -1247,11 +1266,6 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { - return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; -} - // FIXME: Why this routine is here? Move to RegInfo! std::pair<const TargetRegisterClass*, uint8_t> X86TargetLowering::findRepresentativeClass(EVT VT) const{ @@ -1306,11 +1320,12 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, #include "X86GenCallingConv.inc" bool -X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, +X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1325,7 +1340,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -1476,8 +1491,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -1518,20 +1533,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1)); - } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { - // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 
- if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - MVT::v2i64, InFlag).getValue(1); - Val = Chain.getValue(0); - Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, - Val, DAG.getConstant(0, MVT::i64)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - MVT::i64, InFlag).getValue(1); - Val = Chain.getValue(0); - } - Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); @@ -1680,7 +1681,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -1952,7 +1953,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, return SDValue(OutRetAddr.getNode(), 1); } -/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call +/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, @@ -2007,7 +2008,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, getTargetMachine(), + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 @@ -2043,7 +2044,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue RetAddrFrIdx; - // Load return adress for tail calls. + // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); @@ -2200,7 +2201,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<SDValue, 8> MemOpChains2; SDValue FIN; int FI = 0; - // Do not flag preceeding copytoreg stuff together with the following stuff. + // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); if (GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -2270,6 +2271,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportLinkage()) { unsigned char OpFlags = 0; + bool ExtraLoad = false; + unsigned WrapperKind = ISD::DELETED_NODE; // On ELF targets, in both X86-64 and X86-32 mode, direct calls to // external symbols most go through the PLT in PIC mode. If the symbol @@ -2281,15 +2284,34 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. 
OpFlags = X86II::MO_DARWIN_STUB; + } else if (Subtarget->isPICStyleRIPRel() && + isa<Function>(GV) && + cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { + // If the function is marked as non-lazy, generate an indirect call + // which loads from the GOT directly. This avoids runtime overhead + // at the cost of eager binding (and one extra byte of encoding). + OpFlags = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + ExtraLoad = true; } Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), G->getOffset(), OpFlags); + + // Add a wrapper if needed. + if (WrapperKind != ISD::DELETED_NODE) + Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); + // Add extra indirection if needed. + if (ExtraLoad) + Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(), + false, false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -2300,7 +2322,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, getTargetMachine().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && - Subtarget->getDarwinVers() < 9) { + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. @@ -2528,16 +2551,30 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (RegInfo->needsStackRealignment(MF)) return false; - // Do not sibcall optimize vararg calls unless the call site is not passing - // any arguments. - if (isVarArg && !Outs.empty()) - return false; - // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + if (isVarArg && !Outs.empty()) { + + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. + if (Subtarget->isTargetWin64()) + return false; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. // Therefore if it's not used by the call it is not safe to optimize this into // a sibcall. @@ -2550,8 +2587,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } if (Unused) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CalleeCC, false, getTargetMachine(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; @@ -2564,13 +2601,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // results are returned in the same way as what the caller expects. 
if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, getTargetMachine(), - RVLocs1, *DAG.getContext()); + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, getTargetMachine(), - RVLocs2, *DAG.getContext()); + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -2596,8 +2633,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (Subtarget->isTargetWin64()) { @@ -4018,7 +4055,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The -/// search can start in two diferent directions, from left or right. +/// search can start in two different directions, from left or right. static unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, bool ZerosFromLeft, SelectionDAG &DAG) { @@ -6617,9 +6654,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { } -/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and /// take a 2 x i32 value to shift plus a shift amount. -SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -6708,12 +6745,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, unsigned ByteSize = SrcVT.getSizeInBits()/8; - int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); - + FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); + MachineMemOperand *MMO; + if (FI) { + int SSFI = FI->getIndex(); + MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); + } else { + MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + StackSlot = StackSlot.getOperand(1); + } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, @@ -7204,6 +7247,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } +SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). 
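+ // FGETSIGNx86 selects to MOVMSKPS/MOVMSKPD, which copies one sign bit
+ // per vector element into the low bits of a GPR; for a scalar source
+ // only bit zero is meaningful, hence the explicit mask below.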
+ SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, + DAG.getConstant(1, VT)); + return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, @@ -8779,16 +8833,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { return Res; } -SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); LLVMContext *Context = DAG.getContext(); - assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); + // Must have SSE2. + if (!Subtarget->hasSSE2()) return SDValue(); + + // Optimize shl/srl/sra with constant shift amount. + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + // Lower SHL with variable shift amount. + // Cannot lower SHL without SSE4.1 or later. 
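+ // The variable-amount v4i32 path below scales by 2^amt assembled through
+ // the f32 exponent field and then multiplies, and the v16i8 path selects
+ // among pre-shifted copies with pblendvb; both rely on SSE4.1.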
+ if (!Subtarget->hasSSE41()) return SDValue(); - if (VT == MVT::v4i32) { + if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), Op.getOperand(1), DAG.getConstant(23, MVT::i32)); @@ -8807,7 +8916,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } - if (VT == MVT::v16i8) { + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { // a = a << 5; Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), @@ -9112,7 +9221,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: - case ISD::SRL_PARTS: return LowerShift(Op, DAG); + case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); @@ -9120,6 +9229,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VSETCC: return LowerVSETCC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); @@ -9140,7 +9250,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL_V2I64(Op, DAG); - case ISD::SHL: return LowerSHL(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: return LowerShift(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: @@ -9307,6 +9419,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; + case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; + case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; @@ -10984,14 +11098,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; - // cOMpute the element's address. + // Compute the element's address. SDValue Idx = Extract->getOperand(1); unsigned EltSize = InputVector.getValueType().getVectorElementType().getSizeInBits()/8; uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), + SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), StackPtr, OffsetVal); // Load the scalar.
@@ -11264,15 +11378,28 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) return SDValue(); + SDValue FalseOp = N->getOperand(0); + SDValue TrueOp = N->getOperand(1); + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + SDValue Cond = N->getOperand(3); + if (CC == X86::COND_E || CC == X86::COND_NE) { + switch (Cond.getOpcode()) { + default: break; + case X86ISD::BSR: + case X86ISD::BSF: + // If operand of BSR / BSF are proven never zero, then ZF cannot be set. + if (DAG.isKnownNeverZero(Cond.getOperand(0))) + return (CC == X86::COND_E) ? FalseOp : TrueOp; + } + } + // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. - if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). - X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); - if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); @@ -11282,7 +11409,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); @@ -11300,7 +11426,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); @@ -11339,7 +11464,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); - SDValue Cond = N->getOperand(3); Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, DAG.getConstant(CC, MVT::i8), Cond); // Zero extend the condition if needed. @@ -11574,12 +11698,94 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, } +// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..)) +// where both setccs reference the same FP CMP, and rewrite for CMPEQSS +// and friends. Likewise for OR -> CMPNEQSS. +static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + unsigned opcode; + + // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but + // we're requiring SSE2 for both. + if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CMP0 = N0->getOperand(1); + SDValue CMP1 = N1->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // The SETCCs should both refer to the same CMP. 
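+ // That is, we are matching and/or(setcc(cc0, CMP), setcc(cc1, CMP))
+ // where cc0/cc1 form the E/NP or NE/P pairs that scalar FP equality
+ // tests expand into around a ucomiss/ucomisd.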
+ if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + return SDValue(); + + SDValue CMP00 = CMP0->getOperand(0); + SDValue CMP01 = CMP0->getOperand(1); + EVT VT = CMP00.getValueType(); + + if (VT == MVT::f32 || VT == MVT::f64) { + bool ExpectingFlags = false; + // Check for any users that want flags: + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); + !ExpectingFlags && UI != UE; ++UI) + switch (UI->getOpcode()) { + default: + case ISD::BR_CC: + case ISD::BRCOND: + case ISD::SELECT: + ExpectingFlags = true; + break; + case ISD::CopyToReg: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + break; + } + + if (!ExpectingFlags) { + enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + + if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { + X86::CondCode tmp = cc0; + cc0 = cc1; + cc1 = tmp; + } + + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { + bool is64BitFP = (CMP00.getValueType() == MVT::f64); + X86ISD::NodeType NTOperator = is64BitFP ? + X86ISD::FSETCCsd : X86ISD::FSETCCss; + // FIXME: need symbolic constants for these magic numbers. + // See X86ATTInstPrinter.cpp:printSSECC(). + unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; + SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, + DAG.getConstant(x86cc, MVT::i8)); + SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, + OnesOrZeroesF); + SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, + DAG.getConstant(1, MVT::i32)); + SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); + return OneBitOfTruth; + } + } + } + } + return SDValue(); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + // Want to form PANDN nodes, in the hopes of then easily combining them with // OR and AND nodes to form PBLEND/PSIGN. EVT VT = N->getValueType(0); @@ -11609,6 +11815,10 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + EVT VT = N->getValueType(0); if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) return SDValue(); @@ -11976,6 +12186,26 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { + DebugLoc dl = N->getDebugLoc(); + SDValue Op0 = N->getOperand(0); + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have + // a 32-bit target where SSE doesn't support i64->FP operations. 
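+ // When the i64 operand is already a load, fild can read it straight
+ // from memory; folding it here avoids first splitting the value into
+ // two i32 registers and spilling them back to a stack slot.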
+ if (Op0.getOpcode() == ISD::LOAD) { + LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); + EVT VT = Ld->getValueType(0); + if (!Ld->isVolatile() && !N->getValueType(0).isVector() && + ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && + !XTLI->getSubtarget()->is64Bit() && + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG); + DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); + return FILDChain; + } + } + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -12060,6 +12290,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); @@ -12216,7 +12447,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. - // FIXME: this should verify that we are targetting a 486 or better. If not, + // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical ops // instead of emitting the bswap asm. For now, we don't support 486 or lower // so don't worry about this. @@ -12489,12 +12720,16 @@ LowerXConstraint(EVT ConstraintVT) const { /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, - char Constraint, + std::string &Constraint, std::vector<SDValue>&Ops, SelectionDAG &DAG) const { SDValue Result(0, 0); - switch (Constraint) { + // Only support length 1 constraints for now. + if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { @@ -12686,7 +12921,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return std::make_pair(0U, X86::GR8RegisterClass); if (VT == MVT::i16) return std::make_pair(0U, X86::GR16RegisterClass); - if (VT == MVT::i32 || !Subtarget->is64Bit()) + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) return std::make_pair(0U, X86::GR32RegisterClass); return std::make_pair(0U, X86::GR64RegisterClass); case 'R': // LEGACY_REGS diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 6301057..d61a125 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -94,6 +94,15 @@ namespace llvm { // one's or all zero's. SETCC_CARRY, // R = carry_bit ? ~0 : 0 + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCCss, FSETCCsd, + + /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values, + /// result in an integer GPR. Needs masking for scalar result. + FGETSIGNx86, + /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. 
Operand 2 is the condition code, and operand 3 is the /// flag operand produced by a CMP or TEST instruction. It also writes a @@ -592,7 +601,7 @@ namespace llvm { /// true it means one of the asm constraint of the inline asm instruction /// being processed is 'm'. virtual void LowerAsmOperandForConstraint(SDValue Op, - char ConstraintLetter, + std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const; @@ -674,15 +683,15 @@ namespace llvm { /// or null if the target does not support "fast" ISel. virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; - /// getFunctionAlignment - Return the Log2 alignment of this function. - virtual unsigned getFunctionAlignment(const Function *F) const; - /// getStackCookieLocation - Return true if the target stores stack /// protector cookies at a fixed offset in some non-standard address /// space, and populates the address space and offset as /// appropriate. virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const; + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, + SelectionDAG &DAG) const; + protected: std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(EVT VT) const; @@ -773,9 +782,7 @@ namespace llvm { SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; - SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, - SelectionDAG &DAG) const; + SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -786,6 +793,7 @@ namespace llvm { SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; @@ -808,7 +816,7 @@ namespace llvm { SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const; @@ -850,9 +858,10 @@ namespace llvm { ISD::NodeType ExtendKind) const; virtual bool - CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const; + CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const; void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG, unsigned NewOp) const; diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 45d1c6b..dd4f6a5 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -12,66 +12,91 @@ // //===----------------------------------------------------------------------===// -// FIXME: We don't support 
any intrinsics for these instructions yet. +class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +} -class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, Requires<[Has3DNow]> { +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; + let Constraints = "$src1 = $dst"; } -class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic> - : I<o, F, (outs VR64:$dst), ins, - !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), []>, - TB, Requires<[Has3DNow]>, Has3DNow0F0FOpcode { +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + Has3DNow0F0FOpcode { // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. let isAsmParserOnly = 1; } +multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; +} + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; +} + +multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; +} -let Constraints = "$src1 = $dst" in { - // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. - // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. 
- multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { - def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn>; - def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn>; - } +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) + (bitconvert (load_mmx addr:$src))))]>; } -defm PAVGUSB : I3DNow_binop_rm<0xBF, "pavgusb">; -defm PF2ID : I3DNow_binop_rm<0x1D, "pf2id">; -defm PFACC : I3DNow_binop_rm<0xAE, "pfacc">; -defm PFADD : I3DNow_binop_rm<0x9E, "pfadd">; -defm PFCMPEQ : I3DNow_binop_rm<0xB0, "pfcmpeq">; -defm PFCMPGE : I3DNow_binop_rm<0x90, "pfcmpge">; -defm PFCMPGT : I3DNow_binop_rm<0xA0, "pfcmpgt">; -defm PFMAX : I3DNow_binop_rm<0xA4, "pfmax">; -defm PFMIN : I3DNow_binop_rm<0x94, "pfmin">; -defm PFMUL : I3DNow_binop_rm<0xB4, "pfmul">; -defm PFRCP : I3DNow_binop_rm<0x96, "pfrcp">; -defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1">; -defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2">; -defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1">; -defm PFRSQRT : I3DNow_binop_rm<0x97, "pfrsqrt">; -defm PFSUB : I3DNow_binop_rm<0x9A, "pfsub">; -defm PFSUBR : I3DNow_binop_rm<0xAA, "pfsubr">; -defm PI2FD : I3DNow_binop_rm<0x0D, "pi2fd">; -defm PMULHRW : I3DNow_binop_rm<0xB7, "pmulhrw">; +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>; def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr), "prefetch $addr", []>; - + // FIXME: Disassembler gets a bogus decode conflict.
-let isAsmParserOnly = 1 in { +let isAsmParserOnly = 1 in def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr), "prefetchw $addr", []>; -} // "3DNowA" instructions -defm PF2IW : I3DNow_binop_rm<0x1C, "pf2iw">; -defm PI2FW : I3DNow_binop_rm<0x0C, "pi2fw">; -defm PFNACC : I3DNow_binop_rm<0x8A, "pfnacc">; -defm PFPNACC : I3DNow_binop_rm<0x8E, "pfpnacc">; -defm PSWAPD : I3DNow_binop_rm<0xBB, "pswapd">; +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f0ea068..9f7a4b0 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -163,7 +163,7 @@ def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), } // Defs = [EFLAGS] -// Suprisingly enough, these are not two address instructions! +// Surprisingly enough, these are not two address instructions! let Defs = [EFLAGS] in { // Register-Integer Signed Integer Multiply def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 4c915d9..adcc747 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -214,6 +214,30 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), (SETBr)>; +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + //===----------------------------------------------------------------------===// // String Pseudo Instructions // @@ -519,85 +543,98 @@ def Int_MemBarrierNoSSE64 : RI<0x09, MRM1r, (outs), (ins GR64:$zero), Requires<[In64BitMode]>, LOCK; -// Optimized codegen when the non-memory output is not used. 
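The replacement multiclass below derives every concrete opcode from its RegOpc/ImmOpc parameters by bit-slicing: it keeps the top seven bits and forces bit 0 to 0 for the byte form and 1 for the 16/32/64-bit forms, which is exactly the x86 encoding rule that the low opcode bit of these ALU groups selects the operand size. A quick C++ rendering of that arithmetic (sizedOpc is illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

// Keep bits 7..1 of the base opcode and force bit 0: 0 selects the byte
// form, 1 the 16/32/64-bit forms; the same thing the
// {RegOpc{7}, ..., 0/1} bit lists spell out.
static uint8_t sizedOpc(uint8_t BaseOpc, bool Wide) {
  return (BaseOpc & 0xFE) | (Wide ? 1 : 0);
}

int main() {
  assert(sizedOpc(0x00, false) == 0x00); // add r/m8, r8
  assert(sizedOpc(0x00, true) == 0x01);  // add r/m16/32/64, r
  assert(sizedOpc(0x28, true) == 0x29);  // sub r/m16/32/64, r
  return 0;
}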
+// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { -def LOCK_ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), - "lock\n\t" - "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2), - "lock\n\t" - "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs), - (ins i64mem:$dst, i64i32imm :$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; - -def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2), - "lock\n\t" - "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2), - "lock\n\t" - "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs), - (ins i64mem:$dst, i64i8imm :$src2), - "lock\n\t" - "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; - -def LOCK_SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2), - "lock\n\t" - "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, OpSize, LOCK; +def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat("lock\n\t", 
mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +} -def LOCK_SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2), - "lock\n\t" - "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs), - (ins i64mem:$dst, i64i32imm:$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +} +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM6m, "xor">; -def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2), - "lock\n\t" - "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK; -def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2), - "lock\n\t" - "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK; -def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs), - (ins i64mem:$dst, i64i8imm :$src2), - "lock\n\t" - "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK; +// Optimized codegen when the non-memory output is not used. +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "lock\n\t" @@ -960,7 +997,8 @@ def : Pat<(extloadi64i32 addr:$src), // anyext. 
Define these to do an explicit zero-extend to // avoid partial-register updates. -def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>; +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; // Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. @@ -1127,9 +1165,9 @@ def : Pat<(and GR32:$src1, 0xff), Requires<[In32BitMode]>; // r & (2^8-1) ==> movz def : Pat<(and GR16:$src1, 0xff), - (MOVZX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src1, - GR16_ABCD)), - sub_8bit))>, + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, Requires<[In32BitMode]>; // r & (2^32-1) ==> movz @@ -1147,7 +1185,8 @@ def : Pat<(and GR32:$src1, 0xff), Requires<[In64BitMode]>; // r & (2^8-1) ==> movz def : Pat<(and GR16:$src1, 0xff), - (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)))>, + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, Requires<[In64BitMode]>; @@ -1159,10 +1198,11 @@ def : Pat<(sext_inreg GR32:$src, i8), GR32_ABCD)), sub_8bit))>, Requires<[In32BitMode]>; + def : Pat<(sext_inreg GR16:$src, i8), - (MOVSX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, - GR16_ABCD)), - sub_8bit))>, + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, Requires<[In32BitMode]>; def : Pat<(sext_inreg GR64:$src, i32), @@ -1175,9 +1215,19 @@ def : Pat<(sext_inreg GR32:$src, i8), (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, Requires<[In64BitMode]>; def : Pat<(sext_inreg GR16:$src, i8), - (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, sub_8bit)))>, + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, Requires<[In64BitMode]>; +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; // trunc patterns def : Pat<(i16 (trunc GR32:$src)), @@ -1318,6 +1368,11 @@ def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), // (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; @@ -1474,12 +1529,6 @@ def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; -// Optimize multiply by 2 with EFLAGS result. -let AddedComplexity = 2 in { -def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>; -def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>; -} - // Patterns for nodes that do not produce flags, for instructions that do. 
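The movzx/movsx rewrites above all follow one idea: extend into the full 32-bit register, which avoids a partial-register update, then take the low 16 bits with EXTRACT_SUBREG, since the low half of the 32-bit result matches what the 16-bit instruction would have produced. A small self-contained C++ check of that equivalence (names are mine, not LLVM's):

#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v <= 127; ++v) {
    int8_t x = (int8_t)v;
    // movsx r16, r8 vs. movsx r32, r8 followed by taking the low 16 bits.
    assert((int16_t)x == (int16_t)(int32_t)x);
    // movzx r16, r8 vs. movzx r32, r8 followed by taking the low 16 bits.
    assert((uint16_t)(uint8_t)x == (uint16_t)(uint32_t)(uint8_t)x);
  }
  return 0;
}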
// addition diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index 867c0f8..2e1d523 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -38,22 +38,11 @@ let neverHasSideEffects = 1 in { // Sign/Zero extenders -// Use movsbl intead of movsbw; we don't care about the high 16 bits -// of the register here. This has a smaller encoding and avoids a -// partial-register update. Actual movsbw included for the disassembler. -def MOVSX16rr8W : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -def MOVSX16rm8W : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; - -// FIXME: Use a pat pattern or define a syntax here. -let isCodeGenOnly=1 in { -def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src), - "", [(set GR16:$dst, (sext GR8:$src))]>, TB; -def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src), - "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB; -} -def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sext GR8:$src))]>, TB; def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), @@ -66,20 +55,10 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "movs{wl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB; -// Use movzbl intead of movzbw; we don't care about the high 16 bits -// of the register here. This has a smaller encoding and avoids a -// partial-register update. Actual movzbw included for the disassembler. -def MOVZX16rr8W : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -def MOVZX16rm8W : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), - "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; -// FIXME: Use a pat pattern or define a syntax here. 
-let isCodeGenOnly=1 in { -def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src), - "", [(set GR16:$dst, (zext GR8:$src))]>, TB; -def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src), - "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB; -} +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (zext GR8:$src))]>, TB; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 3cbfac1..7c9a9f7 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,8 +38,11 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; +def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; +def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 21df57c..b3237d5 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -232,7 +232,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE) continue; @@ -335,7 +335,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl0[i][1] & TB_NOT_REVERSABLE) continue; @@ -460,7 +460,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries"); RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl1[i][1] & TB_NOT_REVERSABLE) continue; @@ -682,7 +682,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!"); RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align); - // If this is not a reversable operation (because there is a many->one) + // If this is not a reversible operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. 
if (OpTbl2[i][1] & TB_NOT_REVERSABLE) continue; @@ -916,7 +916,6 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::MOVSDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MMX_MOVD64rm: @@ -1790,7 +1789,6 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, .addMBB(UnCondBrIter->getOperand(0).getMBB()); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4)) .addMBB(TargetBB); - MBB.addSuccessor(TargetBB); OldInst->eraseFromParent(); UnCondBrIter->eraseFromParent(); @@ -2016,62 +2014,48 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, bool isStackAligned, const TargetMachine &TM, bool load) { - switch (RC->getID()) { + switch (RC->getSize()) { default: - llvm_unreachable("Unknown regclass"); - case X86::GR64RegClassID: - case X86::GR64_ABCDRegClassID: - case X86::GR64_NOREXRegClassID: - case X86::GR64_NOREX_NOSPRegClassID: - case X86::GR64_NOSPRegClassID: - case X86::GR64_TCRegClassID: - case X86::GR64_TCW64RegClassID: - return load ? X86::MOV64rm : X86::MOV64mr; - case X86::GR32RegClassID: - case X86::GR32_ABCDRegClassID: - case X86::GR32_ADRegClassID: - case X86::GR32_NOREXRegClassID: - case X86::GR32_NOSPRegClassID: - case X86::GR32_TCRegClassID: - return load ? X86::MOV32rm : X86::MOV32mr; - case X86::GR16RegClassID: - case X86::GR16_ABCDRegClassID: - case X86::GR16_NOREXRegClassID: - return load ? X86::MOV16rm : X86::MOV16mr; - case X86::GR8RegClassID: - // Copying to or from a physical H register on x86-64 requires a NOREX - // move. Otherwise use a normal move. - if (isHReg(Reg) && - TM.getSubtarget<X86Subtarget>().is64Bit()) - return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; - else - return load ? X86::MOV8rm : X86::MOV8mr; - case X86::GR8_ABCD_LRegClassID: - case X86::GR8_NOREXRegClassID: - return load ? X86::MOV8rm :X86::MOV8mr; - case X86::GR8_ABCD_HRegClassID: + llvm_unreachable("Unknown spill size"); + case 1: + assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); if (TM.getSubtarget<X86Subtarget>().is64Bit()) - return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; - else - return load ? X86::MOV8rm : X86::MOV8mr; - case X86::RFP80RegClassID: + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; + return load ? X86::MOV8rm : X86::MOV8mr; + case 2: + assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); + return load ? X86::MOV16rm : X86::MOV16mr; + case 4: + if (X86::GR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOV32rm : X86::MOV32mr; + if (X86::FR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOVSSrm : X86::MOVSSmr; + if (X86::RFP32RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + llvm_unreachable("Unknown 4-byte regclass"); + case 8: + if (X86::GR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOV64rm : X86::MOV64mr; + if (X86::FR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOVSDrm : X86::MOVSDmr; + if (X86::VR64RegClass.hasSubClassEq(RC)) + return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; + if (X86::RFP64RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp64m : X86::ST_Fp64m; + llvm_unreachable("Unknown 8-byte regclass"); + case 10: + assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? 
X86::LD_Fp80m : X86::ST_FpP80m; - case X86::RFP64RegClassID: - return load ? X86::LD_Fp64m : X86::ST_Fp64m; - case X86::RFP32RegClassID: - return load ? X86::LD_Fp32m : X86::ST_Fp32m; - case X86::FR32RegClassID: - return load ? X86::MOVSSrm : X86::MOVSSmr; - case X86::FR64RegClassID: - return load ? X86::MOVSDrm : X86::MOVSDmr; - case X86::VR128RegClassID: + case 16: + assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? X86::MOVAPSrm : X86::MOVAPSmr; else return load ? X86::MOVUPSrm : X86::MOVUPSmr; - case X86::VR64RegClassID: - return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; } } @@ -2241,6 +2225,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, bool isTwoAddr = NumOps > 1 && MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1; + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return NULL; + MachineInstr *NewMI = NULL; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires @@ -2429,7 +2419,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 4; break; default: - llvm_unreachable("Don't know how to fold this instruction!"); + return 0; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; @@ -2535,6 +2525,12 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, case X86::TEST32rr: case X86::TEST64rr: return true; + case X86::ADD32ri: + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return false; + break; } } @@ -2845,11 +2841,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } switch (Opc2) { @@ -2869,11 +2863,9 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: - case X86::MOVUPSrm_Int: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: - case X86::MOVDQUrm_Int: break; } diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 4625b4c..d895023 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -449,7 +449,6 @@ namespace X86II { SSEDomainShift = SegOvrShift + 2, OpcodeShift = SSEDomainShift + 2, - OpcodeMask = 0xFFULL << OpcodeShift, //===------------------------------------------------------------------===// /// VEX - The opcode prefix used by AVX instructions @@ -807,7 +806,7 @@ public: int64_t &Offset1, int64_t &Offset2) const; /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to -/// determine (in conjuction with areLoadsFromSameBasePtr) if two loads should +/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled together. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together.
This function takes two integers that represent the load offsets diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index f832a7c..8cab808 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -23,6 +23,9 @@ def SDTIntShiftDOp: SDTypeProfile<1, 3, def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; +def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; @@ -459,7 +462,7 @@ def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; include "X86InstrFormats.td" //===----------------------------------------------------------------------===// -// Pattern fragments... +// Pattern fragments. // // X86 specific condition code. These correspond to CondCode in @@ -481,21 +484,21 @@ def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE def X86_COND_S : PatLeaf<(i8 15)>; -def immSext8 : PatLeaf<(imm), [{ return immSext8(N); }]>; +let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. + def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; + def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; + def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; +} -def i16immSExt8 : PatLeaf<(i16 immSext8)>; -def i32immSExt8 : PatLeaf<(i32 immSext8)>; -def i64immSExt8 : PatLeaf<(i64 immSext8)>; -def i64immSExt32 : PatLeaf<(i64 imm), [{ return i64immSExt32(N); }]>; -def i64immZExt32 : PatLeaf<(i64 imm), [{ - // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // unsignedsign extended field. - return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue(); -}]>; +def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; + + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit +// unsigned field. +def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>; -def i64immZExt32SExt8 : PatLeaf<(i64 imm), [{ - uint64_t v = N->getZExtValue(); - return v == (uint32_t)v && (int32_t)v == (int8_t)v; +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm; }]>; // Helper fragments for loads. @@ -1437,7 +1440,7 @@ def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on ST1. // For example, "fxch" -> "fxch %st(1)" -def : InstAlias<"faddp", (ADD_FPrST0 ST1)>; +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; @@ -1455,13 +1458,15 @@ def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with // gas.
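Stepping back briefly to the ImmLeaf predicates introduced above: they are nothing more than sign/zero-extension round-trip tests on the immediate. The same checks in plain C++ (a sketch, not LLVM code; the helper names are mine):

#include <cassert>
#include <cstdint>

// Mirrors i64immSExt32: the value survives a round trip through int32_t,
// i.e. it sign-extends from 32 bits.
static bool isSExt32(int64_t Imm) { return Imm == (int32_t)Imm; }
// Mirrors i64immZExt32: the value fits in an unsigned 32-bit field.
static bool isZExt32(int64_t Imm) { return (uint64_t)Imm == (uint32_t)Imm; }

int main() {
  assert(isSExt32(-1) && !isZExt32(-1));
  assert(!isSExt32(0x80000000LL) && isZExt32(0x80000000LL));
  return 0;
}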
-multiclass FpUnaryAlias<string Mnemonic, Instruction Inst> { - def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), (Inst RST:$op)>; - def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), (Inst ST0)>; +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + (Inst ST0), EmitAlias>; } defm : FpUnaryAlias<"fadd", ADD_FST0r>; -defm : FpUnaryAlias<"faddp", ADD_FPrST0>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; defm : FpUnaryAlias<"fsub", SUB_FST0r>; defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; @@ -1472,8 +1477,8 @@ defm : FpUnaryAlias<"fdiv", DIV_FST0r>; defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; -defm : FpUnaryAlias<"fcomi", COM_FIr>; -defm : FpUnaryAlias<"fucomi", UCOM_FIr>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; defm : FpUnaryAlias<"fcompi", COM_FIPr>; defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; @@ -1481,8 +1486,9 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they // commute. We also allow fdiv[r]p/fsubrp even though they don't commute, // solely because gas supports it. -def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op)>; +def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; @@ -1534,29 +1540,31 @@ def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq $src, $dst", (MMX_MOVD64to64rr VR64:$dst, GR64:$src)>; -def : InstAlias<"movq $src, $dst", (MMX_MOVD64from64rr GR64:$dst, VR64:$src)>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl. 
(as in rep; movsd) def : InstAlias<"movsd", (MOVSD)>; // movsx aliases -def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src)>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src)>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src)>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src)>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. // outb %dx -> outb %al, %dx diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index bb2165a..b2d9fca 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -285,7 +285,7 @@ let Constraints = "$src1 = $dst" in defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>; defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>; defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>; -defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 8f08e68..7774057 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -135,18 +135,16 @@ class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, // is used instead. Register-to-register movss/movsd is not modeled as an // INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable // in terms of a copy, and just mentioned, we don't use movss/movsd for copies. 
-let isAsmParserOnly = 0 in { - def VMOVSSrr : sse12_move_rr<FR32, v4f32, - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; - def VMOVSDrr : sse12_move_rr<FR64, v2f64, - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; +def VMOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V; +def VMOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V; - let canFoldAsLoad = 1, isReMaterializable = 1 in { - def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; +let canFoldAsLoad = 1, isReMaterializable = 1 in { + def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX; - let AddedComplexity = 20 in - def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; - } + let AddedComplexity = 20 in + def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX; } let Constraints = "$src1 = $dst" in { @@ -218,14 +216,12 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>; -let isAsmParserOnly = 0 in { def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", [(store FR32:$src, addr:$dst)]>, XS, VEX; def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>, XD, VEX; -} // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), @@ -251,7 +247,6 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in [(set RC:$dst, (ld_frag addr:$src))], d>; } -let isAsmParserOnly = 0 in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, VEX; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -269,7 +264,6 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle>, VEX; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, 0>, OpSize, VEX; -} defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle>, TB; defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, @@ -279,7 +273,6 @@ defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, 0>, TB, OpSize; -let isAsmParserOnly = 0 in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX; @@ -304,7 +297,6 @@ def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [(store (v4f64 VR256:$src), addr:$dst)]>, VEX; -} def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), @@ -328,32 +320,14 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), [(store (v2f64 VR128:$src), addr:$dst)]>; // Intrinsic forms of MOVUPS/D load and store -let isAsmParserOnly = 0 in { - let canFoldAsLoad = 1, isReMaterializable = 1 in - def VMOVUPSrm_Int : VPSI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>, VEX; - def 
VMOVUPDrm_Int : VPDI<0x10, MRMSrcMem, (outs VR128:$dst), - (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>, VEX; - def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movups\t{$src, $dst|$dst, $src}", - [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; -} -let canFoldAsLoad = 1, isReMaterializable = 1 in -def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movups\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; -def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "movupd\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; +def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX; +def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX; def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movups\t{$src, $dst|$dst, $src}", @@ -382,7 +356,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, SSEPackedDouble>, TB, OpSize; } -let isAsmParserOnly = 0, AddedComplexity = 20 in { +let AddedComplexity = 20 in { defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", @@ -395,7 +369,6 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { "\t{$src2, $dst|$dst, $src2}">; } -let isAsmParserOnly = 0 in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -404,7 +377,6 @@ def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), @@ -416,7 +388,6 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. 
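That lowering is easy to see in intrinsics form; a minimal sketch assuming SSE2 and <emmintrin.h> (function names are mine):

#include <emmintrin.h>

// Store form: movhpd writes the high double straight to memory.
void store_high(double *p, __m128d v) { _mm_storeh_pd(p, v); }

// Register form: unpack high-to-low, then read element 0; this is the
// shape the custom lowering described above produces.
double extract_high(__m128d v) {
  return _mm_cvtsd_f64(_mm_unpackhi_pd(v, v));
}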
-let isAsmParserOnly = 0 in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -429,7 +400,6 @@ def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", [(store (f64 (vector_extract @@ -441,7 +411,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), (v2f64 (unpckh VR128:$src, (undef))), (iPTR 0))), addr:$dst)]>; -let isAsmParserOnly = 0, AddedComplexity = 20 in { +let AddedComplexity = 20 in { def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -516,7 +486,6 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; } -let isAsmParserOnly = 0 in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, @@ -542,7 +511,6 @@ defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, VEX_4V; defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, VEX_4V, VEX_W; -} defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si\t{$src, $dst|$dst, $src}">, XS; @@ -591,27 +559,25 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; } -let isAsmParserOnly = 0 in { - defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, - f32mem, load, "cvtss2si">, XS, VEX; - defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, - XS, VEX, VEX_W; - defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, - f128mem, load, "cvtsd2si">, XD, VEX; - defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, - int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, - XD, VEX, VEX_W; - - // FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ - // Get rid of this hack or rename the intrinsics, there are several - // intructions that only match with the intrinsic form, why create duplicates - // to let them be recognized by the assembler? 
- defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; - defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, - "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; -} +defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + f32mem, load, "cvtss2si">, XS, VEX; +defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">, + XS, VEX, VEX_W; +defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + f128mem, load, "cvtsd2si">, XD, VEX; +defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, + XD, VEX, VEX_W; + +// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ +// Get rid of this hack or rename the intrinsics; there are several +// instructions that only match with the intrinsic form, why create duplicates +// to let them be recognized by the assembler? +defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; +defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, f32mem, load, "cvtss2si">, XS; defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, @@ -622,18 +588,16 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 0 in { - defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; - defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, - VEX_W; - defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; - defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, - VEX_4V, VEX_W; -} +defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; +defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, + VEX_W; +defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; +defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, + VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in { defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, @@ -653,7 +617,6 @@ let Constraints = "$src1 = $dst" in { /// SSE 1 Only // Aliases for intrinsics -let isAsmParserOnly = 0 in { defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS, VEX; defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -664,7 +627,6 @@ defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si">, XD, VEX, VEX_W; -} defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, f32mem, load, "cvttss2si">, XS; defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, @@ -676,7 +638,7 @@ defm Int_CVTTSD2SI64 :
sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, f128mem, load, "cvttsd2si{q}">, XD, REX_W; -let isAsmParserOnly = 0, Pattern = []<dag> in { +let Pattern = []<dag> in { defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX; defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, @@ -702,7 +664,6 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/, /// SSE 2 Only // Convert scalar double to scalar single -let isAsmParserOnly = 0 in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -711,7 +672,6 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V; -} def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, Requires<[HasAVX]>; @@ -723,7 +683,6 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 0 in defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>, XS, VEX_4V; @@ -732,7 +691,7 @@ defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS; // Convert scalar single to scalar double -let isAsmParserOnly = 0 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -741,7 +700,6 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>; -} def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>; @@ -754,7 +712,6 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, Requires<[HasSSE2, OptForSize]>; -let isAsmParserOnly = 0 in { def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -767,7 +724,6 @@ def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, (load addr:$src2)))]>, XS, VEX_4V, Requires<[HasAVX]>; -} let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), @@ -788,7 +744,7 @@ def : Pat<(extloadf32 addr:$src), Requires<[HasSSE2, OptForSpeed]>; // Convert doubleword to packed single/double fp -let isAsmParserOnly = 0 in { // SSE2 instructions without OpSize prefix +// SSE2 instructions without OpSize prefix def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, @@ -798,7 +754,6 @@ def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2ps (bitconvert (memopv2i64 addr:$src))))]>, TB, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps 
VR128:$src))]>, @@ -810,7 +765,7 @@ def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), TB, Requires<[HasSSE2]>; // FIXME: why the non-intrinsic version is described as SSE3? -let isAsmParserOnly = 0 in { // SSE2 instructions with XS prefix +// SSE2 instructions with XS prefix def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -820,7 +775,6 @@ def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtdq2pd (bitconvert (memopv2i64 addr:$src))))]>, XS, VEX, Requires<[HasAVX]>; -} def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, @@ -833,7 +787,6 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), // Convert packed single/double fp to doubleword -let isAsmParserOnly = 0 in { def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -842,13 +795,11 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 0 in { def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>, @@ -858,7 +809,6 @@ def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>, VEX; -} def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>; @@ -867,7 +817,7 @@ def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 0 in { // SSE2 packed instructions with XD prefix +// SSE2 packed instructions with XD prefix def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -877,7 +827,6 @@ def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtpd2dq (memop addr:$src)))]>, XD, VEX, Requires<[HasAVX]>; -} def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, @@ -890,7 +839,7 @@ def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // Convert with truncation packed single/double fp to doubleword -let isAsmParserOnly = 0 in { // SSE2 packed instructions with XS prefix +// SSE2 packed instructions with XS prefix def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; 
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), @@ -899,7 +848,6 @@ def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; -} def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -910,7 +858,6 @@ def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), (int_x86_sse2_cvttps2dq (memop addr:$src)))]>; -let isAsmParserOnly = 0 in { def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -921,9 +868,7 @@ def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttps2dq (memop addr:$src)))]>, XS, VEX, Requires<[HasAVX]>; -} -let isAsmParserOnly = 0 in { def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", @@ -934,7 +879,6 @@ def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>, VEX; -} def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>; @@ -943,7 +887,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>; -let isAsmParserOnly = 0 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
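The problem those explicit forms solve: with a register source the xmm/ymm register name pins the operand size, but a memory source gives the assembler nothing to infer from, so the x/y-suffixed mnemonics state the width outright. Illustrative GNU-syntax inline asm (assumes an AVX-capable assembler and CPU; truncate4 is a made-up helper):

#include <cstdint>

void truncate4(const double *src, int32_t *dst) {
  // "vcvttpd2dq (%reg), %xmm0" alone would be ambiguous: 128-bit or
  // 256-bit load? The y suffix pins the source width to 256 bits.
  asm("vcvttpd2dqy (%1), %%xmm0\n\t"
      "vmovdqu %%xmm0, (%0)"
      : : "r"(dst), "r"(src) : "xmm0", "memory");
}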
@@ -963,10 +906,9 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} // Convert packed single to packed double -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; @@ -982,7 +924,6 @@ def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; -let isAsmParserOnly = 0 in { def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -992,7 +933,6 @@ def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]>, VEX, Requires<[HasAVX]>; -} def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, @@ -1004,7 +944,6 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), TB, Requires<[HasSSE2]>; // Convert packed double to packed single -let isAsmParserOnly = 0 in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. @@ -1024,14 +963,12 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; -} def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; -let isAsmParserOnly = 0 in { def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1040,7 +977,6 @@ def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps (memop addr:$src)))]>; -} def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; @@ -1109,7 +1045,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, asm_alt, []>; } -let neverHasSideEffects = 1, isAsmParserOnly = 0 in { +let neverHasSideEffects = 1 in { defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">, @@ -1120,13 +1056,37 @@ let neverHasSideEffects = 1, isAsmParserOnly = 0 in { XD, VEX_4V; } +let Constraints = "$src1 = $dst" in { +def CMPSSrr : SIi8<0xC2, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, SSECC:$cc), + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), FR32:$src2, 
imm:$cc))]>, XS; +def CMPSSrm : SIi8<0xC2, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, SSECC:$cc), + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), (loadf32 addr:$src2), imm:$cc))]>, XS; +def CMPSDrr : SIi8<0xC2, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, SSECC:$cc), + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), FR64:$src2, imm:$cc))]>, XD; +def CMPSDrm : SIi8<0xC2, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, SSECC:$cc), + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), (loadf64 addr:$src2), imm:$cc))]>, XD; +} let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in { - defm CMPSS : sse12_cmp_scalar<FR32, f32mem, - "cmp${cc}ss\t{$src, $dst|$dst, $src}", - "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}">, XS; - defm CMPSD : sse12_cmp_scalar<FR64, f64mem, - "cmp${cc}sd\t{$src, $dst|$dst, $src}", - "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}">, XD; +def CMPSSrr_alt : SIi8<0xC2, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2), + "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS; +def CMPSSrm_alt : SIi8<0xC2, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2), + "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS; +def CMPSDrr_alt : SIi8<0xC2, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2), + "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD; +def CMPSDrm_alt : SIi8<0xC2, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2), + "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD; } multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, @@ -1142,14 +1102,12 @@ multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, } // Aliases to match intrinsics which expect XMM operand(s). 
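A sketch of the two spellings these CMPSS/CMPSD defs accept (registers arbitrary; 1 is the SSE "lt" predicate encoding):
  cmpltss %xmm1, %xmm0        # $cc form: the condition is part of the mnemonic
  cmpss   $1, %xmm1, %xmm0    # *_alt form: the predicate is a raw immediate (1 = lt)
Both assemble to the same 0xC2 encoding; the *_alt defs carry no patterns and exist only so the assembler accepts the immediate spelling.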
-let isAsmParserOnly = 0 in { - defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, - "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, - XS, VEX_4V; - defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, - "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, - XD, VEX_4V; -} +defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, + XS, VEX_4V; +defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, + XD, VEX_4V; let Constraints = "$src1 = $dst" in { defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS; @@ -1172,28 +1130,26 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, } let Defs = [EFLAGS] in { - let isAsmParserOnly = 0 in { - defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, VEX; - defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, OpSize, VEX; - let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, - "comiss", SSEPackedSingle>, VEX; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, - "comisd", SSEPackedDouble>, OpSize, VEX; - } - - defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, - load, "ucomiss", SSEPackedSingle>, VEX; - defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, - load, "ucomisd", SSEPackedDouble>, OpSize, VEX; - - defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, - load, "comiss", SSEPackedSingle>, VEX; - defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, - load, "comisd", SSEPackedDouble>, OpSize, VEX; + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, VEX; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, OpSize, VEX; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, VEX; + defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, OpSize, VEX; } + + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, OpSize, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss", SSEPackedSingle>, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd", SSEPackedDouble>, OpSize, VEX; defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, "ucomiss", SSEPackedSingle>, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, @@ -1239,24 +1195,22 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, asm_alt, [], d>; } -let isAsmParserOnly = 0 in { - defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, 
$dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, - "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedSingle>, VEX_4V; - defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, - "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", - "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; -} +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedSingle>, VEX_4V; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", + "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src, $dst|$dst, $src}", @@ -1296,20 +1250,18 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; } -let isAsmParserOnly = 0 in { - defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv4f32, SSEPackedSingle>, TB, VEX_4V; - defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, - "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - memopv8f32, SSEPackedSingle>, TB, VEX_4V; - defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; - defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, - "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", - memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; -} +defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv4f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv8f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -1342,33 +1294,31 @@ multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt, } let AddedComplexity = 10 in { - let isAsmParserOnly = 0 in { - defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, - VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPD: sse12_unpack_interleave<0x15, 
unpckh, v2f64, memopv2f64, - VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, - VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, - VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - - defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, - VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, - VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, - VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, VEX_4V; - defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, - VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, OpSize, VEX_4V; - } + defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + + defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; + defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, VEX_4V; + defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, @@ -1401,35 +1351,46 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, } // Mask creation +defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; +defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, VEX; +defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, OpSize, + VEX; defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", SSEPackedSingle>, TB; defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", SSEPackedDouble>, TB, OpSize; -let 
isAsmParserOnly = 0 in { - defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; - defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, - "movmskps", SSEPackedSingle>, VEX; - defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, - "movmskpd", SSEPackedDouble>, OpSize, - VEX; +// X86fgetsign +def MOVMSKPDrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src), + "movmskpd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize; +def MOVMSKPDrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src), + "movmskpd\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize; +def MOVMSKPSrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src), + "movmskps\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB; +def MOVMSKPSrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src), + "movmskps\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB; - // Assembler Only - def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; - def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; - def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), - "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, - VEX; -} +// Assembler Only +def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; +def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; +def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions @@ -1484,13 +1445,11 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), /// multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, SDNode OpNode> { - let isAsmParserOnly = 0 in { - defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V; - defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; - } + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V; let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, @@ 
-1516,7 +1475,7 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, SDNode OpNode, int HasPat = 0, list<list<dag>> Pattern = []> { - let isAsmParserOnly = 0, Pattern = []<dag> in { + let Pattern = []<dag> in { defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, !if(HasPat, Pattern[0], // rr @@ -1563,7 +1522,6 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, /// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms /// -let isAsmParserOnly = 0 in { multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, [], [], 0>, VEX_4V; @@ -1571,7 +1529,6 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr> { defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, [], [], 0>, OpSize, VEX_4V; } -} // AVX 256-bit packed logical ops forms defm VAND : sse12_fp_packed_logical_y<0x54, "and">; @@ -1669,38 +1626,36 @@ multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> { } // Binary Arithmetic instructions -let isAsmParserOnly = 0 in { - defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_s_int<0x58, "add", 0>, - basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, - basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; - defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_s_int<0x59, "mul", 0>, - basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, - basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; +defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", 0>, + basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; +defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", 0>, + basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; - let isCommutable = 0 in { - defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, - basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, - basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; - defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_s_int<0x5E, "div", 0>, - basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, - basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; - defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_s_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, - basic_sse12_fp_binop_p_int<0x5F, "max", 0>, - basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, - basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; - defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_s_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, - basic_sse12_fp_binop_p_int<0x5D, "min", 0>, - basic_sse12_fp_binop_p_y_int<0x5D, "min">, - basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; - } +let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", 0>, 
+ basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p_y_int<0x5D, "min">, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; } let Constraints = "$src1 = $dst" in { @@ -1901,7 +1856,7 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Square root. defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>, sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>, @@ -1957,67 +1912,54 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, // SSE 1 & 2 - Non-temporal stores //===----------------------------------------------------------------------===// -let isAsmParserOnly = 0 in { - def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX; - def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX; - - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs), +let AddedComplexity = 400 in { // Prefer non-temporal versions + def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX; - - let AddedComplexity = 400 in { // Prefer non-temporal versions - def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v2f64 VR128:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f32 VR128:$src), + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>, VEX; - def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTPDYmr : 
VPDI<0x2B, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), - (ins f256mem:$dst, VR256:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v4f64 VR256:$src), - addr:$dst)]>, VEX; - let ExeDomain = SSEPackedInt in - def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + let ExeDomain = SSEPackedInt in + def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + + def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>; + + def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f32 VR256:$src), + [(alignednontemporalstore (v4f64 VR256:$src), addr:$dst)]>, VEX; - } + let ExeDomain = SSEPackedInt in + def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; } def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), @@ -2027,18 +1969,6 @@ def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src), def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), (VMOVNTPSYmr addr:$dst, VR256:$src)>; -def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntps\t{$src, $dst|$dst, $src}", - [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; -def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "movntpd\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; - -let ExeDomain = SSEPackedInt in -def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), - "movntdq\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; - let AddedComplexity = 400 in { // Prefer non-temporal versions def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -2056,22 +1986,19 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntdq\t{$src, $dst|$dst, $src}", [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>; + // There is no AVX form for instructions below this point def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti\t{$src, $dst|$dst, $src}", [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, TB, Requires<[HasSSE2]>; - def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movnti\t{$src, $dst|$dst, $src}", [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, TB, Requires<[HasSSE2]>; - } -def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "movnti\t{$src, $dst|$dst, $src}", - [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, - TB, 
Requires<[HasSSE2]>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Misc Instructions (No AVX form) @@ -2079,13 +2006,13 @@ def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), // Prefetch intrinsic. def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), - "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>; + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>; def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), - "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>; + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>; def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), - "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>; + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>; def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), - "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>; + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>; // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, @@ -2136,16 +2063,23 @@ def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>; def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; +// FIXME: According to the Intel manual, DEST[127:64] <- SRC1[127:64], while +// in the non-AVX version bits 127:64 aren't touched. Find a better way to +// represent this instead of always zeroing SRC1. One possible solution is +// to represent the instruction with something similar to the "$src1 = $dst" +// constraint but without the tied operands. +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)), addr:$src)>, + Requires<[HasAVX, OptForSpeed]>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 - Load/Store MXCSR register //===----------------------------------------------------------------------===// -let isAsmParserOnly = 0 in { - def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; - def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; -} +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; @@ -2158,45 +2092,43 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let ExeDomain = SSEPackedInt in { // SSE integer instructions -let isAsmParserOnly = 0 in { - let neverHasSideEffects = 1 in { - def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - } - def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - - let canFoldAsLoad = 1, mayLoad = 1 in { - def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
- def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +let neverHasSideEffects = 1 in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +} +def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; +def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; - let mayStore = 1 in { - def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i256mem:$dst, VR256:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - let Predicates = [HasAVX] in { - def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), +let canFoldAsLoad = 1, mayLoad = 1 in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; - } - } +} +} + +let mayStore = 1 in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +} } let neverHasSideEffects = 1 in @@ -2228,23 +2160,11 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), } // Intrinsic forms of MOVDQU load and store -let isAsmParserOnly = 0 in { -let canFoldAsLoad = 1 in -def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, VEX, Requires<[HasAVX]>; def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, XS, VEX, Requires<[HasAVX]>; -} -let canFoldAsLoad = 1 in -def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqu\t{$src, $dst|$dst, $src}", - [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, Requires<[HasSSE2]>; def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 
"movdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, @@ -2349,7 +2269,7 @@ multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, // 128-bit Integer Arithmetic -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; @@ -2439,7 +2359,7 @@ defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, VEX_4V; @@ -2586,7 +2506,7 @@ let Predicates = [HasSSE2] in { // SSE2 - Packed Integer Comparison Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, 0>, VEX_4V; defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, @@ -2640,7 +2560,7 @@ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), // SSE2 - Packed Integer Pack Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, 0, 0>, VEX_4V; defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, @@ -2678,7 +2598,7 @@ def mi : Ii8<0x70, MRMSrcMem, } } // ExeDomain = SSEPackedInt -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let AddedComplexity = 5 in defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize, VEX; @@ -2726,7 +2646,7 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, addr:$src2))))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16, @@ -2836,7 +2756,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -2849,7 +2769,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, imm:$src2))]>; // Insert -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V; def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), @@ -2868,13 +2788,11 @@ let Constraints = "$src1 = $dst" in let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 0 in { def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; -} def PMOVMSKBrr : 
PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; @@ -2887,7 +2805,6 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), let ExeDomain = SSEPackedInt in { -let isAsmParserOnly = 0 in { let Uses = [EDI] in def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2898,7 +2815,6 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), "maskmovdqu\t{$mask, $src|$src, $mask}", [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX; -} let Uses = [EDI] in def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), @@ -2916,7 +2832,6 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), //===---------------------------------------------------------------------===// // Move Int Doubleword to Packed Double Int -let isAsmParserOnly = 0 in { def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2926,7 +2841,6 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), [(set VR128:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, VEX; -} def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -2945,7 +2859,6 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), // Move Int Doubleword to Single Scalar -let isAsmParserOnly = 0 in { def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX; @@ -2954,7 +2867,6 @@ def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, VEX; -} def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (bitconvert GR32:$src))]>; @@ -2964,7 +2876,6 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; // Move Packed Doubleword Int to Packed Double Int -let isAsmParserOnly = 0 in { def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -2974,7 +2885,6 @@ def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (vector_extract (v4i32 VR128:$src), (iPTR 0))), addr:$dst)]>, VEX; -} def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), @@ -3000,14 +2910,12 @@ def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; // Move Scalar Single to Double Int -let isAsmParserOnly = 0 in { def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX; def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX; -} def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32:$src))]>; @@ -3016,7 +2924,7 @@ def 
MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; // movd / movq to XMM register zero-extends -let AddedComplexity = 15, isAsmParserOnly = 0 in { +let AddedComplexity = 15 in { def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v4i32 (X86vzmovl @@ -3040,7 +2948,6 @@ def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), } let AddedComplexity = 20 in { -let isAsmParserOnly = 0 in def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3066,7 +2973,6 @@ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), //===---------------------------------------------------------------------===// // Move Quadword Int to Packed Quadword Int -let isAsmParserOnly = 0 in def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3079,7 +2985,6 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix // Move Packed Quadword Int to Quadword Int -let isAsmParserOnly = 0 in def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(store (i64 (vector_extract (v2i64 VR128:$src), @@ -3093,7 +2998,6 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; // Store / copy the lower 64 bits of an XMM register. -let isAsmParserOnly = 0 in def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; @@ -3101,7 +3005,7 @@ def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; -let AddedComplexity = 20, isAsmParserOnly = 0 in +let AddedComplexity = 20 in def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, @@ -3126,7 +3030,7 @@ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; // Moving from XMM to XMM and clearing the upper 64 bits. Note that there is // a bug in the IA32 documentation: movq xmm1, xmm2 does clear the high bits.
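A two-line illustration of the zero-extending behavior described above (registers arbitrary):
  movd %eax,  %xmm0           # xmm0[31:0] = eax,         xmm0[127:32] zeroed
  movq %xmm1, %xmm0           # xmm0[63:0] = xmm1[63:0],  xmm0[127:64] zeroed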
-let isAsmParserOnly = 0, AddedComplexity = 15 in +let AddedComplexity = 15 in def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, @@ -3137,7 +3041,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, XS, Requires<[HasSSE2]>; -let AddedComplexity = 20, isAsmParserOnly = 0 in +let AddedComplexity = 20 in def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl @@ -3155,7 +3059,6 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), } // Instructions to match in the assembler -let isAsmParserOnly = 0 in { def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), @@ -3163,13 +3066,12 @@ def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), // Recognize "movd" with GR64 destination, but encode as a "movq" def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; -} // Instructions for the disassembler // xr = XMM register // xm = mem64 -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3211,7 +3113,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, //===---------------------------------------------------------------------===// // Convert Packed Double FP to Packed DW Integers -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // The assembler can recognize rr 256-bit instructions by seeing a ymm // register, but the same isn't true when using memory operands instead. // Provide other assembly rr and rm forms to address this explicitly. 
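Touching on the "movd"-with-a-GR64-destination alias defined above (VMOVQd64rr_alt), both of these AT&T spellings produce the same movq encoding (registers arbitrary; the same holds for the plain SSE2 forms):
  movd %xmm0, %rax            # accepted for compatibility...
  movq %xmm0, %rax            # ...and encoded identically to this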
@@ -3239,7 +3141,7 @@ def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; // Convert Packed DW Integers to Packed Double FP -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -3290,7 +3192,7 @@ def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX; @@ -3321,7 +3223,7 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), []>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // FIXME: Merge above classes when we have patterns for the ymm version defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; @@ -3329,7 +3231,7 @@ let isAsmParserOnly = 0, Predicates = [HasAVX] in { defm MOVDDUP : sse3_replicate_dfp<"movddup">; // Move Unaligned Integer -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vlddqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; @@ -3393,7 +3295,7 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX], +let Predicates = [HasAVX], ExeDomain = SSEPackedDouble in { defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, f128mem, 0>, TB, XD, VEX_4V; @@ -3446,7 +3348,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, int_x86_sse3_hadd_ps, 0>, VEX_4V; defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, @@ -3498,7 +3400,7 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (mem_frag128 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, int_x86_ssse3_pabs_b_128>, VEX; defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16, @@ -3540,7 +3442,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, int_x86_ssse3_phadd_w_128, 0>, VEX_4V; @@ -3632,7 +3534,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> { []>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PALIGN : ssse3_palign<"palignr">; @@ -3696,6 +3598,16 @@ let Predicates = [HasSSE2] in def : Pat<(fextend 
(loadf32 addr:$src)), (CVTSS2SDrm addr:$src)>; +// FIXME: According to the Intel manual, DEST[127:64] <- SRC1[127:64], while +// in the non-AVX version bits 127:64 aren't touched. Find a better way to +// represent this instead of always zeroing SRC1. One possible solution is +// to represent the instruction with something similar to the "$src1 = $dst" +// constraint but without the tied operands. +let Predicates = [HasAVX] in + def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)), + addr:$src)>; + // bit_convert let Predicates = [HasXMMInt] in { def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; @@ -3987,7 +3899,7 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, VEX; defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, @@ -4053,7 +3965,7 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, VEX; defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, @@ -4094,7 +4006,7 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, VEX; defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, @@ -4136,7 +4048,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4158,7 +4070,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -4180,7 +4092,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -4201,7 +4113,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize, REX_W; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -4224,7 +4136,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), (ins VR128:$src1, i32i8imm:$src2), @@ -4264,7 +4176,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>,
OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -4290,7 +4202,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -4316,7 +4228,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -4349,7 +4261,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { let Constraints = "$src1 = $dst" in defm INSERTPS : SS41I_insertf32<0x21, "insertps">; -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), @@ -4519,7 +4431,7 @@ multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, } // FP round - roundss, roundps, roundsd, roundpd -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { // Intrinsic form defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, memopv4f32, memopv2f64, @@ -4554,7 +4466,7 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", // ptest instruction: we'll lower to this in X86ISelLowering, primarily from // the Intel intrinsic that corresponds to this.
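Roughly what the lowering in the comment above buys (registers and the label are arbitrary): ptest ANDs its two operands and sets ZF when the result is all zeroes, so an "any bits set?" vector test needs no extra moves or compares.
  vptest %xmm1, %xmm0         # ZF = ((xmm0 AND xmm1) == 0)
  jz     .Lno_common_bits     # hypothetical label: taken when the AND is all zero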
-let Defs = [EFLAGS], isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "vptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, @@ -4597,7 +4509,7 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, OpSize, VEX; } -let Defs = [EFLAGS], isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Defs = [EFLAGS], Predicates = [HasAVX] in { defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; @@ -4646,7 +4558,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, (bitconvert (memopv8i16 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", int_x86_sse41_phminposuw>, VEX; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", @@ -4672,7 +4584,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, 0>, VEX_4V; @@ -4739,7 +4651,7 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; @@ -4771,7 +4683,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, VR128, memopv16i8, i128mem, 0>, VEX_4V; @@ -4812,7 +4724,7 @@ let Constraints = "$src1 = $dst" in { } /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, PatFrag mem_frag, Intrinsic IntId> { @@ -4851,14 +4763,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), + "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, OpSize; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, - "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"), + "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; @@ -4872,7 +4784,7 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0), (PBLENDVBrr0 VR128:$src1, VR128:$src2)>; -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse41_movntdqa
addr:$src))]>, @@ -4906,7 +4818,7 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 0, Predicates = [HasAVX] in +let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, 0>, VEX_4V; let Constraints = "$src1 = $dst" in @@ -4938,8 +4850,7 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in { defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; } -let Defs = [XMM0, EFLAGS], isAsmParserOnly = 0, - Predicates = [HasAVX] in { +let Defs = [XMM0, EFLAGS], Predicates = [HasAVX] in { def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; @@ -4974,7 +4885,7 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; } -let isAsmParserOnly = 0, Predicates = [HasAVX], +let Predicates = [HasAVX], Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src3, i8imm:$src5), @@ -5009,7 +4920,7 @@ let Defs = [ECX, EFLAGS] in { } } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">, VEX; defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">, @@ -5048,7 +4959,7 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in { } } -let isAsmParserOnly = 0, Predicates = [HasAVX] in { +let Predicates = [HasAVX] in { defm VPCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">, VEX; defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">, @@ -5080,66 +4991,66 @@ defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>; // This set of instructions are only rm, the only difference is the size // of r and m. 
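A quick usage sketch for the CRC32 instructions renamed in the hunk below, assuming SSE4.2 and the documented <nmmintrin.h> intrinsics (the helper name crc32c_bytes is illustrative only):

    #include <nmmintrin.h>  // SSE4.2 intrinsics
    #include <cstddef>
    #include <cstdint>

    // Accumulates CRC-32C a byte at a time via crc32{b}; the wider
    // _mm_crc32_u32/_mm_crc32_u64 intrinsics map to the r32/r64 forms.
    uint32_t crc32c_bytes(uint32_t Crc, const uint8_t *P, size_t N) {
      for (size_t I = 0; I != N; ++I)
        Crc = _mm_crc32_u8(Crc, P[I]);
      return Crc;
    }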
let Constraints = "$src1 = $dst" in { - def CRC32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), + def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i8mem:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_8 GR32:$src1, + (int_x86_sse42_crc32_32_8 GR32:$src1, (load addr:$src2)))]>; - def CRC32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR8:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>; - def CRC32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; + def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i16mem:$src2), "crc32{w} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_16 GR32:$src1, + (int_x86_sse42_crc32_32_16 GR32:$src1, (load addr:$src2)))]>, OpSize; - def CRC32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR16:$src2), "crc32{w} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>, + (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, OpSize; - def CRC32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "crc32{l} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_32 GR32:$src1, + (int_x86_sse42_crc32_32_32 GR32:$src1, (load addr:$src2)))]>; - def CRC32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "crc32{l} \t{$src2, $src1|$src1, $src2}", [(set GR32:$dst, - (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>; - def CRC64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), + (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>; + def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i8mem:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_8 GR64:$src1, + (int_x86_sse42_crc32_64_8 GR64:$src1, (load addr:$src2)))]>, REX_W; - def CRC64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), + def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR8:$src2), "crc32{b} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>, + (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, REX_W; - def CRC64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), + def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), "crc32{q} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_64 GR64:$src1, + (int_x86_sse42_crc32_64_64 GR64:$src1, (load addr:$src2)))]>, REX_W; - def CRC64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), + def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "crc32{q} \t{$src2, $src1|$src1, $src2}", [(set GR64:$dst, - (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>, + (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, REX_W; } @@ -5167,7 +5078,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, } // Perform One Round of an AES Encryption/Decryption Flow -let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", int_x86_aesni_aesenc, 0>, 
VEX_4V; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", @@ -5207,7 +5118,7 @@ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), (AESDECLASTrm VR128:$src1, addr:$src2)>; // Perform the AES InvMixColumn Transformation -let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1), "vaesimc\t{$src1, $dst|$dst, $src1}", @@ -5235,7 +5146,7 @@ def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), OpSize; // AES Round Key Generation Assist -let isAsmParserOnly = 0, Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, HasAES] in { def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5271,7 +5182,6 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), // Only the AVX version of CLMUL instructions are described here. // Carry-less Multiplication instructions -let isAsmParserOnly = 0 in { def VPCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -5297,13 +5207,10 @@ defm VPCLMULHQLQDQ : avx_vpclmul<"vpclmulhqlqdq">; defm VPCLMULLQHQDQ : avx_vpclmul<"vpclmullqhqdq">; defm VPCLMULLQLQDQ : avx_vpclmul<"vpclmullqlqdq">; -} // isAsmParserOnly - //===----------------------------------------------------------------------===// // AVX Instructions //===----------------------------------------------------------------------===// -let isAsmParserOnly = 0 in { // Load from memory and broadcast to all elements of the destination operand class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -5437,8 +5344,6 @@ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>; -} // isAsmParserOnly - def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3), (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 2710425..f73cff3 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -34,9 +34,16 @@ let Uses = [EFLAGS] in def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. 
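The size optimization the comment above refers to is a single byte: int3 has a dedicated one-byte opcode, while the generic form carries an imm8. Illustrative encodings per the Intel SDM (the commented-out alias just below records the intended tablegen form):

    // INT3 is 0xCC; INT imm8 is 0xCD ib.
    static const unsigned char Int3Enc[] = { 0xCC };       // "int3",   1 byte
    static const unsigned char IntImm3[] = { 0xCD, 0x03 }; // "int $3", 2 bytes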
+//def : InstAlias<"int\t$3", (INT3)>; + + def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)]>; + def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; def SYSRETL : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB; def SYSRETQ :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index 6686214..2e1ec63 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -15,7 +15,9 @@ #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ELF.h" using namespace llvm; @@ -69,7 +71,22 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) { DwarfUsesInlineInfoSection = true; // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::Create(4, Context); + return MCBinaryExpr::CreateAdd(Res, Four, Context); +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { } X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { @@ -89,7 +106,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { SupportsDebugInformation = true; // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfTable; + ExceptionsType = ExceptionHandling::DwarfCFI; // OpenBSD has buggy support for .quad in 32-bit mode, just split into two // .words. diff --git a/lib/Target/X86/X86MCAsmInfo.h b/lib/Target/X86/X86MCAsmInfo.h index 5815225..2cd4c8e 100644 --- a/lib/Target/X86/X86MCAsmInfo.h +++ b/lib/Target/X86/X86MCAsmInfo.h @@ -25,6 +25,14 @@ namespace llvm { explicit X86MCAsmInfoDarwin(const Triple &Triple); }; + struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + virtual const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const; + }; + struct X86ELFMCAsmInfo : public MCAsmInfo { explicit X86ELFMCAsmInfo(const Triple &Triple); virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const; diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp index a2bd638..55aceba 100644 --- a/lib/Target/X86/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/X86MCCodeEmitter.cpp @@ -514,7 +514,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, } // To only check operands before the memory address ones, start - // the search from the begining + // the search from the beginning if (IsDestMem) CurOp = 0; @@ -1015,7 +1015,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, } else { unsigned FixupKind; // FIXME: Is there a better way to know that we need a signed relocation? 
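As a partial answer to the FIXME above: the *ri32/*mi32/*i32 forms embed an imm32 that the CPU sign-extends to 64 bits, so the relocated value has to fit the signed, not unsigned, 32-bit range. A hedged helper sketch (not LLVM API):

    #include <cstdint>

    // A value fits one of these sign-extended imm32 fields only if
    // sign-extending its low 32 bits reproduces it exactly.
    static bool fitsSignExtended32(int64_t V) {
      return static_cast<int64_t>(static_cast<int32_t>(V)) == V;
    }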
-    if (MI.getOpcode() == X86::MOV64ri32 ||
+    if (MI.getOpcode() == X86::ADD64ri32 ||
+        MI.getOpcode() == X86::MOV64ri32 ||
         MI.getOpcode() == X86::MOV64mi32 ||
         MI.getOpcode() == X86::PUSH64i32)
       FixupKind = X86::reloc_signed_4byte;
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index cbe6db2..793156f 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -355,10 +355,6 @@ ReSimplify:
     assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
            "LEA has segment specified!");
     break;
-  case X86::MOVZX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
-  case X86::MOVZX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
-  case X86::MOVSX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rr8); break;
-  case X86::MOVSX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rm8); break;
   case X86::MOVZX64rr32:  LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
   case X86::MOVZX64rm32:  LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
   case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 1f464f4..1ad6203 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -73,29 +73,61 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
   }
 }

-/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
-/// specific numbering, used in debug info and exception tables.
-int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
-  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
-  unsigned Flavour = DWARFFlavour::X86_64;
-
+static unsigned getFlavour(const X86Subtarget *Subtarget, bool isEH) {
   if (!Subtarget->is64Bit()) {
     if (Subtarget->isTargetDarwin()) {
       if (isEH)
-        Flavour = DWARFFlavour::X86_32_DarwinEH;
+        return DWARFFlavour::X86_32_DarwinEH;
       else
-        Flavour = DWARFFlavour::X86_32_Generic;
+        return DWARFFlavour::X86_32_Generic;
     } else if (Subtarget->isTargetCygMing()) {
       // Unsupported by now, just quick fallback
-      Flavour = DWARFFlavour::X86_32_Generic;
+      return DWARFFlavour::X86_32_Generic;
     } else {
-      Flavour = DWARFFlavour::X86_32_Generic;
+      return DWARFFlavour::X86_32_Generic;
     }
   }
+  return DWARFFlavour::X86_64;
+}
+
+/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
+/// specific numbering, used in debug info and exception tables.
+int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  unsigned Flavour = getFlavour(Subtarget, isEH);

   return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
 }

+/// getLLVMRegNum - This function maps DWARF register numbers to LLVM registers.
+int X86RegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + unsigned Flavour = getFlavour(Subtarget, isEH); + + return X86GenRegisterInfo::getLLVMRegNumFull(DwarfRegNo, Flavour); +} + +int +X86RegisterInfo::getSEHRegNum(unsigned i) const { + int reg = getX86RegNum(i); + switch (i) { + case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: + case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: + case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: + case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: + case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: + case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: + case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: + case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: + case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: + case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: + reg += 8; + } + return reg; +} + /// getX86RegNum - This function maps LLVM register identifiers to their X86 /// specific numbering, which is used in various places encoding instructions. unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) { @@ -229,19 +261,13 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, } break; case X86::sub_8bit_hi: - if (B == &X86::GR8_ABCD_HRegClass) { - if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || - A == &X86::GR64_NOREXRegClass || - A == &X86::GR64_NOSPRegClass || - A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_ABCDRegClass; - else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass || - A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass) - return &X86::GR32_ABCDRegClass; - else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass || - A == &X86::GR16_NOREXRegClass) - return &X86::GR16_ABCDRegClass; - } + if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass)) + switch (A->getSize()) { + case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass); + case 4: return getCommonSubClass(A, &X86::GR32_ABCDRegClass); + case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass); + default: return 0; + } break; case X86::sub_16bit: if (B == &X86::GR16RegClass) { @@ -285,9 +311,16 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, A == &X86::GR64_NOREX_NOSPRegClass) return &X86::GR64_ABCDRegClass; } else if (B == &X86::GR32_NOREXRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass) + return &X86::GR64_NOREXRegClass; + else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_NOREX_NOSPRegClass; + else if (A == &X86::GR64_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + } else if (B == &X86::GR32_NOREX_NOSPRegClass) { if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) - return &X86::GR64_NOREXRegClass; + return &X86::GR64_NOREX_NOSPRegClass; else if (A == &X86::GR64_ABCDRegClass) return &X86::GR64_ABCDRegClass; } @@ -308,6 +341,33 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, return 0; } +const TargetRegisterClass* +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ + const TargetRegisterClass *Super = RC; + 
TargetRegisterClass::sc_iterator I = RC->superclasses_begin(); + do { + switch (Super->getID()) { + case X86::GR8RegClassID: + case X86::GR16RegClassID: + case X86::GR32RegClassID: + case X86::GR64RegClassID: + case X86::FR32RegClassID: + case X86::FR64RegClassID: + case X86::RFP32RegClassID: + case X86::RFP64RegClassID: + case X86::RFP80RegClassID: + case X86::VR128RegClassID: + case X86::VR256RegClassID: + // Don't return a super-class that would shrink the spill size. + // That can happen with the vector and float classes. + if (Super->getSize() == RC->getSize()) + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(unsigned Kind) const { switch (Kind) { @@ -446,6 +506,34 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::ST5); Reserved.set(X86::ST6); Reserved.set(X86::ST7); + + // Mark the segment registers as reserved. + Reserved.set(X86::CS); + Reserved.set(X86::SS); + Reserved.set(X86::DS); + Reserved.set(X86::ES); + Reserved.set(X86::FS); + Reserved.set(X86::GS); + + // Reserve the registers that only exist in 64-bit mode. + if (!Is64Bit) { + for (unsigned n = 0; n != 8; ++n) { + const unsigned GPR64[] = { + X86::R8, X86::R9, X86::R10, X86::R11, + X86::R12, X86::R13, X86::R14, X86::R15 + }; + for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; + ++AI) + Reserved.set(Reg); + + // XMM8, XMM9, ... + assert(X86::XMM15 == X86::XMM8+7); + for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; + ++AI) + Reserved.set(Reg); + } + } + return Reserved; } @@ -470,7 +558,7 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { // FIXME: It's more complicated than this... if (0 && requiresRealignment && MFI->hasVarSizedObjects()) report_fatal_error( - "Stack realignment in presense of dynamic allocas is not supported"); + "Stack realignment in presence of dynamic allocas is not supported"); // If we've requested that we force align the stack do so now. if (ForceStackAlign) diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index cccddfa..dd3d3dc 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -80,6 +80,10 @@ public: /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum /// (created by TableGen) for target dependencies. int getDwarfRegNum(unsigned RegNum, bool isEH) const; + int getLLVMRegNum(unsigned RegNum, bool isEH) const; + + // FIXME: This should be tablegen'd like getDwarfRegNum is + int getSEHRegNum(unsigned i) const; /// Code Generation virtual methods... /// @@ -91,6 +95,9 @@ public: getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned Idx) const; + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer /// values. 
const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 612fac2..590b38b 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -41,80 +41,83 @@ let Namespace = "X86" in { // 8-bit registers // Low registers - def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>; - def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>; - def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>; - def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>; - - // X86-64 only - def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>; - def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>; - def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>; - def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>; - def R8B : Register<"r8b">, DwarfRegNum<[8, -2, -2]>; - def R9B : Register<"r9b">, DwarfRegNum<[9, -2, -2]>; - def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>; - def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>; - def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>; - def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>; - def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>; - def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>; + def AL : Register<"al">; + def DL : Register<"dl">; + def CL : Register<"cl">; + def BL : Register<"bl">; + + // X86-64 only, requires REX. + let CostPerUse = 1 in { + def SIL : Register<"sil">; + def DIL : Register<"dil">; + def BPL : Register<"bpl">; + def SPL : Register<"spl">; + def R8B : Register<"r8b">; + def R9B : Register<"r9b">; + def R10B : Register<"r10b">; + def R11B : Register<"r11b">; + def R12B : Register<"r12b">; + def R13B : Register<"r13b">; + def R14B : Register<"r14b">; + def R15B : Register<"r15b">; + } // High registers. On x86-64, these cannot be used in any instruction // with a REX prefix. 
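The CostPerUse = 1 annotations being introduced here give the register allocator a mild bias against registers whose every use costs an extra REX prefix byte. An illustrative size comparison, with byte sequences following the Intel SDM encoding rules:

    // add r/m32, r32 is opcode 0x01 /r; extended registers need a REX prefix.
    static const unsigned char AddEaxEcx[] = { 0x01, 0xC8 };       // addl %ecx, %eax (2 bytes)
    static const unsigned char AddR8dR9d[] = { 0x45, 0x01, 0xC8 }; // addl %r9d, %r8d (3 bytes)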
- def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>; - def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>; - def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>; - def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>; + def AH : Register<"ah">; + def DH : Register<"dh">; + def CH : Register<"ch">; + def BH : Register<"bh">; // 16-bit registers let SubRegIndices = [sub_8bit, sub_8bit_hi] in { - def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>; - def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>; - def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>; - def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>; + def AX : RegisterWithSubRegs<"ax", [AL,AH]>; + def DX : RegisterWithSubRegs<"dx", [DL,DH]>; + def CX : RegisterWithSubRegs<"cx", [CL,CH]>; + def BX : RegisterWithSubRegs<"bx", [BL,BH]>; } let SubRegIndices = [sub_8bit] in { - def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>; - def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>; - def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>; - def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>; + def SI : RegisterWithSubRegs<"si", [SIL]>; + def DI : RegisterWithSubRegs<"di", [DIL]>; + def BP : RegisterWithSubRegs<"bp", [BPL]>; + def SP : RegisterWithSubRegs<"sp", [SPL]>; } - def IP : Register<"ip">, DwarfRegNum<[16]>; - - // X86-64 only - let SubRegIndices = [sub_8bit] in { - def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>; - def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>; - def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>; - def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>; - def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>; - def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>; - def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>; - def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>; + def IP : Register<"ip">; + + // X86-64 only, requires REX. 
+ let SubRegIndices = [sub_8bit], CostPerUse = 1 in { + def R8W : RegisterWithSubRegs<"r8w", [R8B]>; + def R9W : RegisterWithSubRegs<"r9w", [R9B]>; + def R10W : RegisterWithSubRegs<"r10w", [R10B]>; + def R11W : RegisterWithSubRegs<"r11w", [R11B]>; + def R12W : RegisterWithSubRegs<"r12w", [R12B]>; + def R13W : RegisterWithSubRegs<"r13w", [R13B]>; + def R14W : RegisterWithSubRegs<"r14w", [R14B]>; + def R15W : RegisterWithSubRegs<"r15w", [R15B]>; } // 32-bit registers let SubRegIndices = [sub_16bit] in { - def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>; - def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>; - def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>; - def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>; - def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>; - def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>; - def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>; - def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>; - def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>; - - // X86-64 only - def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>; - def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>; - def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>; - def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>; - def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>; - def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>; - def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>; - def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>; - } + def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>; + def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>; + def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>; + def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>; + def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>; + def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>; + def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>; + def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>; + def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>; + + // X86-64 only, requires REX + let CostPerUse = 1 in { + def R8D : RegisterWithSubRegs<"r8d", [R8W]>; + def R9D : RegisterWithSubRegs<"r9d", [R9W]>; + def R10D : RegisterWithSubRegs<"r10d", [R10W]>; + def R11D : RegisterWithSubRegs<"r11d", [R11W]>; + def R12D : RegisterWithSubRegs<"r12d", [R12W]>; + def R13D : RegisterWithSubRegs<"r13d", [R13W]>; + def R14D : RegisterWithSubRegs<"r14d", [R14W]>; + def R15D : RegisterWithSubRegs<"r15d", [R15W]>; + }} // 64-bit registers, X86-64 only let SubRegIndices = [sub_32bit] in { @@ -127,6 +130,8 @@ let Namespace = "X86" in { def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>; def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>; + // These also require REX. 
+ let CostPerUse = 1 in { def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>; def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>; def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>; @@ -136,7 +141,7 @@ let Namespace = "X86" in { def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>; def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>; def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>; - } + }} // MMX Registers. These are actually aliased to ST0 .. ST7 def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>; @@ -170,6 +175,7 @@ let Namespace = "X86" in { def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>; // X86-64 only + let CostPerUse = 1 in { def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>; def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>; def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>; @@ -178,26 +184,26 @@ let Namespace = "X86" in { def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>; def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>; def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>; - } + }} // YMM Registers, used by AVX instructions let SubRegIndices = [sub_xmm] in { - def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>; - def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>; - def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>; - def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>; - def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>; - def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>; - def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegNum<[23, 27, 27]>; - def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>; - def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegNum<[25, -2, -2]>; - def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegNum<[26, -2, -2]>; - def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>; - def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>; - def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>; - def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>; - def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>; - def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>; + def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>; + def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>; + def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>; + def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>; + def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>; + def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>; + def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>; + def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>; + def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>; + def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>; + def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>; + def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>; + def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>; + def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>; + def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>; + def 
YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>; } // Floating point stack registers @@ -273,8 +279,8 @@ let Namespace = "X86" in { // require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d" // cannot be encoded. def GR8 : RegisterClass<"X86", [i8], 8, - [AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, - R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> { + (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, + R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -317,152 +323,38 @@ def GR8 : RegisterClass<"X86", [i8], 8, } def GR16 : RegisterClass<"X86", [i16], 16, - [AX, CX, DX, SI, DI, BX, BP, SP, - R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> { + (add AX, CX, DX, SI, DI, BX, BP, SP, + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR16_AO_64[] = { - X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, - X86::R8W, X86::R9W, X86::R10W, X86::R11W, - X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP - }; - - GR16Class::iterator - GR16Class::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR16_AO_64; - else - return begin(); - } - - GR16Class::iterator - GR16Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate SP or BP. - return array_endof(X86_GR16_AO_64) - 1; - else - // If not, just don't allocate SP. - return array_endof(X86_GR16_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate SP or BP. - return begin() + 6; - else - // If not, just don't allocate SP. 
- return begin() + 7; - } - } - }]; } def GR32 : RegisterClass<"X86", [i32], 32, - [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, - R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> { + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR32_AO_64[] = { - X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, - X86::R8D, X86::R9D, X86::R10D, X86::R11D, - X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP - }; - - GR32Class::iterator - GR32Class::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR32_AO_64; - else - return begin(); - } - - GR32Class::iterator - GR32Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return array_endof(X86_GR32_AO_64) - 1; - else - // If not, just don't allocate ESP. - return array_endof(X86_GR32_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return begin() + 6; - else - // If not, just don't allocate ESP. - return begin() + 7; - } - } - }]; } // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since // RIP isn't really a register and it can't be used anywhere except in an // address, but it doesn't cause trouble. def GR64 : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP, RSP, RIP]> { + (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP, RIP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32 sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64Class::iterator - GR64Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (!Subtarget.is64Bit()) - return begin(); // None of these are allocatable in 32-bit. - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - return end()-3; // If so, don't allocate RIP, RSP or RBP - else - return end()-2; // If not, just don't allocate RIP or RSP - } - }]; } // Segment registers for use by MOV instructions (and others) that have a // segment register as one operand. Always contain a 16-bit segment // descriptor. -def SEGMENT_REG : RegisterClass<"X86", [i16], 16, [CS, DS, SS, ES, FS, GS]>; +def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; // Debug registers. 
-def DEBUG_REG : RegisterClass<"X86", [i32], 32, - [DR0, DR1, DR2, DR3, DR4, DR5, DR6, DR7]>; +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>; // Control registers. -def CONTROL_REG : RegisterClass<"X86", [i64], 64, - [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7, CR8, - CR9, CR10, CR11, CR12, CR13, CR14, CR15]>; +def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; // GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of // GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d" @@ -470,38 +362,38 @@ def CONTROL_REG : RegisterClass<"X86", [i64], 64, // that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, // and GR64_ABCD are classes for registers that support 8-bit h-register // operations. -def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]>; -def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, [AH, CH, DH, BH]>; -def GR16_ABCD : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> { +def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; +def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)]; } -def GR32_ABCD : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> { +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi), (GR16_ABCD sub_16bit)]; } -def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> { +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi), (GR16_ABCD sub_16bit), (GR32_ABCD sub_32bit)]; } -def GR32_TC : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX]> { +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; } -def GR64_TC : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI, - R8, R9, R11]> { +def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, + R8, R9, R11, RIP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32_TC sub_32bit)]; } -def GR64_TCW64 : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, - R8, R9, R11]>; +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, + R8, R9, R11)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, - [AL, CL, DL, AH, CH, DH, BL, BH]> { + (add AL, CL, DL, AH, CH, DH, BL, BH)> { let MethodProtos = [{ iterator allocation_order_begin(const MachineFunction &MF) const; iterator allocation_order_end(const MachineFunction &MF) const; @@ -535,232 +427,62 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8, } // GR16_NOREX - GR16 registers which do not require a REX prefix. 
def GR16_NOREX : RegisterClass<"X86", [i16], 16, - [AX, CX, DX, SI, DI, BX, BP, SP]> { + (add AX, CX, DX, SI, DI, BX, BP, SP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR16_NOREXClass::iterator - GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP / EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate SP or BP. - return end() - 2; - else - // If not, just don't allocate SP. - return end() - 1; - } - }]; } // GR32_NOREX - GR32 registers which do not require a REX prefix. def GR32_NOREX : RegisterClass<"X86", [i32], 32, - [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> { + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR32_NOREXClass::iterator - GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP / EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate ESP or EBP. - return end() - 2; - else - // If not, just don't allocate ESP. - return end() - 1; - } - }]; } // GR64_NOREX - GR64 registers which do not require a REX prefix. def GR64_NOREX : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP]> { + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit), (GR32_NOREX sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOREXClass::iterator - GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate RIP, RSP or RBP. - return end() - 3; - else - // If not, just don't allocate RIP or RSP. - return end() - 2; - } - }]; } // GR32_NOSP - GR32 registers except ESP. 
-def GR32_NOSP : RegisterClass<"X86", [i32], 32, - [EAX, ECX, EDX, ESI, EDI, EBX, EBP, - R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> { +def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; - let MethodProtos = [{ - iterator allocation_order_begin(const MachineFunction &MF) const; - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - static const unsigned X86_GR32_NOSP_AO_64[] = { - X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, - X86::R8D, X86::R9D, X86::R10D, X86::R11D, - X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP - }; - - GR32_NOSPClass::iterator - GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (Subtarget.is64Bit()) - return X86_GR32_NOSP_AO_64; - else - return begin(); - } - - GR32_NOSPClass::iterator - GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (Subtarget.is64Bit()) { - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate EBP. - return array_endof(X86_GR32_NOSP_AO_64) - 1; - else - // If not, any reg in this class is ok. - return array_endof(X86_GR32_NOSP_AO_64); - } else { - // Does the function dedicate EBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate EBP. - return begin() + 6; - else - // If not, any reg in this class is ok. - return begin() + 7; - } - } - }]; } // GR64_NOSP - GR64 registers except RSP (and RIP). -def GR64_NOSP : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP]> { +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> { let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit), (GR32_NOSP sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOSPClass::iterator - GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - if (!Subtarget.is64Bit()) - return begin(); // None of these are allocatable in 32-bit. - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - return end()-1; // If so, don't allocate RBP - else - return end(); // If not, any reg in this class is ok. - } - }]; +} + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + (and GR32_NOREX, GR32_NOSP)> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), + (GR16_NOREX sub_16bit)]; } // GR64_NOREX_NOSP - GR64_NOREX registers except RSP. 
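The set operators used in these definitions ((add ...), (sub ...), (and ...), (sequence ...)) replace the hand-written allocation-order methods: membership is now computed at tblgen time. For the (and GR64_NOREX, GR64_NOSP) definition just below, this is plain set intersection; a conceptual sketch, not tblgen's implementation:

    #include <bitset>
    #include <cstddef>

    // Register classes as bit-sets of registers: (and A, B) keeps the
    // members common to both classes.
    template <std::size_t N>
    std::bitset<N> classAnd(const std::bitset<N> &A, const std::bitset<N> &B) {
      return A & B;
    }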
def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, - [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> { + (and GR64_NOREX, GR64_NOSP)> { let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), (GR16_NOREX sub_16bit), - (GR32_NOREX sub_32bit)]; - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - GR64_NOREX_NOSPClass::iterator - GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const - { - const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering *TFI = TM.getFrameLowering(); - const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); - // Does the function dedicate RBP to being a frame ptr? - if (TFI->hasFP(MF) || MFI->getReserveFP()) - // If so, don't allocate RBP. - return end() - 1; - else - // If not, any reg in this class is ok. - return end(); - } - }]; + (GR32_NOREX_NOSP sub_32bit)]; } // A class to support the 'A' assembler constraint: EAX then EDX. -def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> { +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> { let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi), (GR16_ABCD sub_16bit)]; } // Scalar SSE2 floating point registers. -def FR32 : RegisterClass<"X86", [f32], 32, - [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - FR32Class::iterator - FR32Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. - else - return end(); - } - }]; -} +def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; -def FR64 : RegisterClass<"X86", [f64], 64, - [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - FR64Class::iterator - FR64Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. - else - return end(); - } - }]; -} +def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; // FIXME: This sets up the floating point register files as though they are f64 @@ -769,85 +491,31 @@ def FR64 : RegisterClass<"X86", [f64], 64, // faster on common hardware. In reality, this should be controlled by a // command line option or something. 
-def RFP32 : RegisterClass<"X86",[f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; -def RFP64 : RegisterClass<"X86",[f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; -def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; +def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; +def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; +def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; // Floating point stack registers (these are not allocatable by the // register allocator - the floating point stackifier is responsible // for transforming FPn allocations to STn registers) -def RST : RegisterClass<"X86", [f80, f64, f32], 32, - [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> { - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - RSTClass::iterator - RSTClass::allocation_order_end(const MachineFunction &MF) const { - return begin(); - } - }]; +def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { + let isAllocatable = 0; } // Generic vector registers: VR64 and VR128. -def VR64: RegisterClass<"X86", [x86mmx], 64, - [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>; -def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, - [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, - XMM12, XMM13, XMM14, XMM15]> { +def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; +def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add FR32)> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)]; - - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - VR128Class::iterator - VR128Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. - else - return end(); - } - }]; } def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256, - [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, - YMM8, YMM9, YMM10, YMM11, - YMM12, YMM13, YMM14, YMM15]> { + (sequence "YMM%u", 0, 15)> { let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)]; - - let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - VR256Class::iterator - VR256Class::allocation_order_end(const MachineFunction &MF) const { - const TargetMachine &TM = MF.getTarget(); - const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); - if (!Subtarget.is64Bit()) - return end()-8; // Only YMM0 to YMM7 are available in 32-bit mode. - else - return end(); - } - }]; } // Status flags registers. -def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> { +def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { let CopyCost = -1; // Don't allow copying of status registers. - - // EFLAGS is not allocatable. 
- let MethodProtos = [{ - iterator allocation_order_end(const MachineFunction &MF) const; - }]; - let MethodBodies = [{ - CCRClass::iterator - CCRClass::allocation_order_end(const MachineFunction &MF) const { - return allocation_order_begin(MF); - } - }]; + let isAllocatable = 0; } diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 42e8193..02754f9 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -178,7 +178,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - // This requires the copy size to be a constant, preferrably + // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 1ee7312..481e821 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -144,7 +144,8 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { /// passed as the second argument. Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. - if (getDarwinVers() >= 10) + if (getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 6)) return "__bzero"; return 0; @@ -264,6 +265,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { HasCLMUL = IsIntel && ((ECX >> 1) & 0x1); HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); + HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); HasAES = IsIntel && ((ECX >> 25) & 0x1); if (IsIntel || IsAMD) { diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 0a62a02..286a798 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -165,9 +165,15 @@ public: bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } - bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } - bool isTargetFreeBSD() const { return TargetTriple.getOS() == Triple::FreeBSD; } - bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; } + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { + return TargetTriple.getOS() == Triple::FreeBSD; + } + bool isTargetSolaris() const { + return TargetTriple.getOS() == Triple::Solaris; + } // ELF is a reasonably sane default and the only other X86 targets we // support are Darwin and Windows. Just use "not those". @@ -215,13 +221,6 @@ public: return PICStyle == PICStyles::StubDynamicNoPIC || PICStyle == PICStyles::StubPIC; } - /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard, - /// 10 = Snow Leopard, etc. - unsigned getDarwinVers() const { - if (isTargetDarwin()) return TargetTriple.getDarwinMajorNumber(); - return 0; - } - /// ClassifyGlobalReference - Classify a global variable reference for the /// current subtarget according to how we should reference it in a non-pcrel /// context. 
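The subtarget changes above drop the bespoke getDarwinVers() in favor of asking the triple directly; a minimal sketch of the new idiom using the Triple API relied on by this change:

    #include "llvm/ADT/Triple.h"

    // __bzero exists on Mac OS X 10.6 and later; the triple query avoids
    // hand-decoding Darwin major version numbers (Darwin 10 == OS X 10.6).
    static bool hasBZeroEntry(const llvm::Triple &T) {
      return T.isMacOSX() && !T.isMacOSXVersionLT(10, 6);
    }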
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8fb9470..7483329 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -26,19 +26,18 @@ using namespace llvm; static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: - return new X86MCAsmInfoDarwin(TheTriple); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return new X86MCAsmInfoDarwin(TheTriple); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) { + if (TheTriple.getArch() == Triple::x86_64) + return new X86_64MCAsmInfoDarwin(TheTriple); else - return new X86MCAsmInfoCOFF(TheTriple); - default: - return new X86ELFMCAsmInfo(TheTriple); + return new X86MCAsmInfoDarwin(TheTriple); } + + if (TheTriple.isOSWindows()) + return new X86MCAsmInfoCOFF(TheTriple); + + return new X86ELFMCAsmInfo(TheTriple); } static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, @@ -48,19 +47,14 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, bool RelaxAll, bool NoExecStack) { Triple TheTriple(TT); - switch (TheTriple.getOS()) { - case Triple::Darwin: + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - case Triple::MinGW32: - case Triple::Cygwin: - case Triple::Win32: - if (TheTriple.getEnvironment() == Triple::MachO) - return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); - else - return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); - default: - return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); - } + + if (TheTriple.isOSWindows()) + return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); + + return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeX86Target() { diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index c15dfbb..1231798 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -38,6 +38,12 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); } +MCSymbol *X8664_MachoTargetObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} + unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4; @@ -52,7 +58,7 @@ unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8632_ELFTargetObjectFile::getFDEEncoding() const { +unsigned X8632_ELFTargetObjectFile::getFDEEncoding(bool FDE) const { if (TM.getRelocationModel() == Reloc::PIC_) return DW_EH_PE_pcrel | DW_EH_PE_sdata4; else @@ -91,17 +97,14 @@ unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const { return DW_EH_PE_absptr; } -unsigned X8664_ELFTargetObjectFile::getFDEEncoding() const { - CodeModel::Model Model = TM.getCodeModel(); - if (TM.getRelocationModel() == Reloc::PIC_) - return DW_EH_PE_pcrel | (Model == CodeModel::Small || - Model == CodeModel::Medium ? 
- DW_EH_PE_sdata4 : DW_EH_PE_sdata8); +unsigned X8664_ELFTargetObjectFile::getFDEEncoding(bool CFI) const { + if (CFI) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - if (Model == CodeModel::Small || Model == CodeModel::Medium) - return DW_EH_PE_udata4; + if (TM.getRelocationModel() == Reloc::PIC_) + return DW_EH_PE_pcrel | DW_EH_PE_sdata4; - return DW_EH_PE_absptr; + return DW_EH_PE_udata4; } unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index f2fd49c..e21b5bf 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -25,6 +25,12 @@ namespace llvm { getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, MachineModuleInfo *MMI, unsigned Encoding, MCStreamer &Streamer) const; + + // getCFIPersonalitySymbol - The symbol that gets passed to + // .cfi_personality. + virtual MCSymbol * + getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const; }; class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF { @@ -34,7 +40,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; @@ -45,7 +51,7 @@ namespace llvm { :TM(tm) { } virtual unsigned getPersonalityEncoding() const; virtual unsigned getLSDAEncoding() const; - virtual unsigned getFDEEncoding() const; + virtual unsigned getFDEEncoding(bool CFI) const; virtual unsigned getTTypeEncoding() const; }; |
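For reference on the encodings returned by the getFDEEncoding overloads above: a DW_EH_PE value composes a data format with an application modifier. A sketch using the standard constants from the DWARF exception-handling ABI (assumed here, not quoted from LLVM's headers):

    #include <cstdio>

    enum {
      DW_EH_PE_absptr = 0x00, DW_EH_PE_udata4 = 0x03,
      DW_EH_PE_sdata4 = 0x0B, DW_EH_PE_pcrel  = 0x10,
    };

    int main() {
      // The CFI case above, PC-relative signed 4-byte data, encodes as 0x1B.
      std::printf("0x%02X\n", DW_EH_PE_pcrel | DW_EH_PE_sdata4);
      return 0;
    }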