Diffstat (limited to 'lib/Target/X86')
59 files changed, 3472 insertions, 2921 deletions
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 655eeb5..8fe549b 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -44,8 +44,6 @@ private: bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); - X86Operand *ParseOperand(); X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); @@ -71,6 +69,7 @@ public: setAvailableFeatures(ComputeAvailableFeatures( &TM.getSubtarget<X86Subtarget>())); } + virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands); @@ -622,6 +621,11 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { StringRef PatchedName = Name; + // FIXME: Hack to recognize setneb as setne. + if (PatchedName.startswith("set") && PatchedName.endswith("b") && + PatchedName != "setb" && PatchedName != "setnb") + PatchedName = PatchedName.substr(0, Name.size()-1); + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. const MCExpr *ExtraImmOp = 0; if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && @@ -707,7 +711,8 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, bool isPrefix = Name == "lock" || Name == "rep" || Name == "repe" || Name == "repz" || - Name == "repne" || Name == "repnz"; + Name == "repne" || Name == "repnz" || + Name == "rex64" || Name == "data16"; // This does the actual operand parsing. Don't parse any more if we have a @@ -744,13 +749,16 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, } if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = getLexer().getLoc(); Parser.EatToEndOfStatement(); - return TokError("unexpected token in argument list"); + return Error(Loc, "unexpected token in argument list"); } } if (getLexer().is(AsmToken::EndOfStatement)) Parser.Lex(); // Consume the EndOfStatement + else if (isPrefix && getLexer().is(AsmToken::Slash)) + Parser.Lex(); // Consume the prefix separator Slash // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely @@ -767,6 +775,19 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, delete &Op; } } + // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". + if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && + Operands.size() == 3) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + delete &Op; + } + } // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to // "shift <op>". 
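The "setneb" patch in the hunk above is easiest to see in isolation. Below is a minimal standalone C++ sketch of that normalization (plain std::string instead of LLVM's StringRef, and a hypothetical patchSetccName helper — neither appears in the patch): the parser strips a trailing size-suffix 'b' from set<cc> mnemonics before instruction matching, but must leave "setb" and "setnb" alone, because there the trailing 'b' is the condition code ("below" / "not below"), not an operand-size suffix.

    #include <cassert>
    #include <string>

    // Sketch of the FIXME hack above: "setneb" -> "setne", but keep
    // "setb"/"setnb", where 'b' means below/not-below, not byte size.
    static std::string patchSetccName(std::string Name) {
      bool StartsWithSet = Name.compare(0, 3, "set") == 0;
      bool EndsWithB = !Name.empty() && Name.back() == 'b';
      if (StartsWithSet && EndsWithB && Name != "setb" && Name != "setnb")
        Name.erase(Name.size() - 1);  // drop the size suffix
      return Name;
    }

    int main() {
      assert(patchSetccName("setneb") == "setne");
      assert(patchSetccName("setb")  == "setb");   // 'b' = below
      assert(patchSetccName("setnb") == "setnb");  // 'b' = not below
      return 0;
    }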
@@ -834,6 +855,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, case Match_MissingFeature: Error(IDLoc, "instruction requires a CPU feature not currently enabled"); return true; + case Match_ConversionFail: + return Error(IDLoc, "unable to convert operands to instruction"); case Match_InvalidOperand: WasOriginallyInvalidOperand = true; break; diff --git a/lib/Target/X86/AsmPrinter/Android.mk b/lib/Target/X86/AsmPrinter/Android.mk deleted file mode 100644 index 3f5538a..0000000 --- a/lib/Target/X86/AsmPrinter/Android.mk +++ /dev/null @@ -1,57 +0,0 @@ -LOCAL_PATH := $(call my-dir) - -x86_asm_printer_TBLGEN_TABLES := \ - X86GenAsmWriter.inc \ - X86GenAsmWriter1.inc \ - X86GenInstrNames.inc \ - X86GenRegisterNames.inc \ - X86GenRegisterInfo.h.inc - -x86_asm_printer_SRC_FILES := \ - X86AsmPrinter.cpp - -# For the host -# ===================================================== -include $(CLEAR_VARS) -include $(CLEAR_TBLGEN_VARS) - -TBLGEN_TABLES := $(x86_asm_printer_TBLGEN_TABLES) - -TBLGEN_TD_DIR := $(LOCAL_PATH)/.. - -LOCAL_SRC_FILES := $(x86_asm_printer_SRC_FILES) - -LOCAL_C_INCLUDES += \ - $(LOCAL_PATH)/.. - -LOCAL_MODULE:= libLLVMX86AsmPrinter - -LOCAL_MODULE_TAGS := optional - -include $(LLVM_HOST_BUILD_MK) -include $(LLVM_TBLGEN_RULES_MK) -include $(BUILD_HOST_STATIC_LIBRARY) - -# For the device -# ===================================================== -ifeq ($(TARGET_ARCH),x86) -include $(CLEAR_VARS) -include $(CLEAR_TBLGEN_VARS) - -TBLGEN_TABLES := $(x86_asm_printer_TBLGEN_TABLES) - -TBLGEN_TD_DIR := $(LOCAL_PATH)/.. - -LOCAL_SRC_FILES := $(x86_asm_printer_SRC_FILES) - -LOCAL_C_INCLUDES += \ - $(LOCAL_PATH)/.. - -LOCAL_MODULE:= libLLVMX86AsmPrinter - -LOCAL_MODULE_TAGS := optional - -include $(LLVM_DEVICE_BUILD_MK) -include $(LLVM_TBLGEN_RULES_MK) -include $(BUILD_STATIC_LIBRARY) -endif diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp deleted file mode 100644 index d0aa290..0000000 --- a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp +++ /dev/null @@ -1,722 +0,0 @@ -//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to X86 machine code. 
-// -//===----------------------------------------------------------------------===// - -#include "X86AsmPrinter.h" -#include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86IntelInstPrinter.h" -#include "X86MCInstLower.h" -#include "X86.h" -#include "X86COFFMachineModuleInfo.h" -#include "X86MachineFunctionInfo.h" -#include "X86TargetMachine.h" -#include "llvm/CallingConv.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/Type.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Assembly/Writer.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Support/COFF.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/Mangler.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegistry.h" -#include "llvm/ADT/SmallString.h" -using namespace llvm; - -//===----------------------------------------------------------------------===// -// Primitive Helper Functions. -//===----------------------------------------------------------------------===// - -/// runOnMachineFunction - Emit the function body. -/// -bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { - SetupMachineFunction(MF); - - if (Subtarget->isTargetCOFF()) { - bool Intrn = MF.getFunction()->hasInternalLinkage(); - OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC - : COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION - << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer.EndCOFFSymbolDef(); - } - - // Have common code print out the function header with linkage info etc. - EmitFunctionHeader(); - - // Emit the rest of the function body. - EmitFunctionBody(); - - // We didn't modify anything. - return false; -} - -/// printSymbolOperand - Print a raw symbol reference operand. This handles -/// jump tables, constant pools, global address and external symbols, all of -/// which print to a label with various suffixes for relocation types etc. -void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, - raw_ostream &O) { - switch (MO.getType()) { - default: llvm_unreachable("unknown symbol type!"); - case MachineOperand::MO_JumpTableIndex: - O << *GetJTISymbol(MO.getIndex()); - break; - case MachineOperand::MO_ConstantPoolIndex: - O << *GetCPISymbol(MO.getIndex()); - printOffset(MO.getOffset(), O); - break; - case MachineOperand::MO_GlobalAddress: { - const GlobalValue *GV = MO.getGlobal(); - - MCSymbol *GVSym; - if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) - GVSym = GetSymbolWithGlobalValueBase(GV, "$stub"); - else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || - MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE || - MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) - GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - else - GVSym = Mang->getSymbol(GV); - - // Handle dllimport linkage. 
- if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) - GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName()); - - if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || - MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { - MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); - if (StubSym.getPointer() == 0) - StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); - } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ - MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym); - if (StubSym.getPointer() == 0) - StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); - } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { - MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); - if (StubSym.getPointer() == 0) - StubSym = MachineModuleInfoImpl:: - StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); - } - - // If the name begins with a dollar-sign, enclose it in parens. We do this - // to avoid having it look like an integer immediate to the assembler. - if (GVSym->getName()[0] != '$') - O << *GVSym; - else - O << '(' << *GVSym << ')'; - printOffset(MO.getOffset(), O); - break; - } - case MachineOperand::MO_ExternalSymbol: { - const MCSymbol *SymToPrint; - if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { - SmallString<128> TempNameStr; - TempNameStr += StringRef(MO.getSymbolName()); - TempNameStr += StringRef("$stub"); - - MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str()); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); - if (StubSym.getPointer() == 0) { - TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end()); - StubSym = MachineModuleInfoImpl:: - StubValueTy(OutContext.GetOrCreateSymbol(TempNameStr.str()), - true); - } - SymToPrint = StubSym.getPointer(); - } else { - SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName()); - } - - // If the name begins with a dollar-sign, enclose it in parens. We do this - // to avoid having it look like an integer immediate to the assembler. - if (SymToPrint->getName()[0] != '$') - O << *SymToPrint; - else - O << '(' << *SymToPrint << '('; - break; - } - } - - switch (MO.getTargetFlags()) { - default: - llvm_unreachable("Unknown target flag on GV operand"); - case X86II::MO_NO_FLAG: // No flag. - break; - case X86II::MO_DARWIN_NONLAZY: - case X86II::MO_DLLIMPORT: - case X86II::MO_DARWIN_STUB: - // These affect the name of the symbol, not any suffix. 
- break; - case X86II::MO_GOT_ABSOLUTE_ADDRESS: - O << " + [.-" << *MF->getPICBaseSymbol() << ']'; - break; - case X86II::MO_PIC_BASE_OFFSET: - case X86II::MO_DARWIN_NONLAZY_PIC_BASE: - case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: - O << '-' << *MF->getPICBaseSymbol(); - break; - case X86II::MO_TLSGD: O << "@TLSGD"; break; - case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break; - case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break; - case X86II::MO_TPOFF: O << "@TPOFF"; break; - case X86II::MO_NTPOFF: O << "@NTPOFF"; break; - case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break; - case X86II::MO_GOT: O << "@GOT"; break; - case X86II::MO_GOTOFF: O << "@GOTOFF"; break; - case X86II::MO_PLT: O << "@PLT"; break; - case X86II::MO_TLVP: O << "@TLVP"; break; - case X86II::MO_TLVP_PIC_BASE: - O << "@TLVP" << '-' << *MF->getPICBaseSymbol(); - break; - } -} - -/// print_pcrel_imm - This is used to print an immediate value that ends up -/// being encoded as a pc-relative value. These print slightly differently, for -/// example, a $ is not emitted. -void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNo); - switch (MO.getType()) { - default: llvm_unreachable("Unknown pcrel immediate operand"); - case MachineOperand::MO_Register: - // pc-relativeness was handled when computing the value in the reg. - printOperand(MI, OpNo, O); - return; - case MachineOperand::MO_Immediate: - O << MO.getImm(); - return; - case MachineOperand::MO_MachineBasicBlock: - O << *MO.getMBB()->getSymbol(); - return; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: - printSymbolOperand(MO, O); - return; - } -} - - -void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier) { - const MachineOperand &MO = MI->getOperand(OpNo); - switch (MO.getType()) { - default: llvm_unreachable("unknown operand type!"); - case MachineOperand::MO_Register: { - O << '%'; - unsigned Reg = MO.getReg(); - if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - EVT VT = (strcmp(Modifier+6,"64") == 0) ? - MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : - ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); - Reg = getX86SubSuperRegister(Reg, VT); - } - O << X86ATTInstPrinter::getRegisterName(Reg); - return; - } - - case MachineOperand::MO_Immediate: - O << '$' << MO.getImm(); - return; - - case MachineOperand::MO_JumpTableIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_ExternalSymbol: { - O << '$'; - printSymbolOperand(MO, O); - break; - } - } -} - -void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op, - raw_ostream &O) { - unsigned char value = MI->getOperand(Op).getImm(); - assert(value <= 7 && "Invalid ssecc argument!"); - switch (value) { - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; - } -} - -void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op, - raw_ostream &O, const char *Modifier) { - const MachineOperand &BaseReg = MI->getOperand(Op); - const MachineOperand &IndexReg = MI->getOperand(Op+2); - const MachineOperand &DispSpec = MI->getOperand(Op+3); - - // If we really don't want to print out (rip), don't. 
- bool HasBaseReg = BaseReg.getReg() != 0; - if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") && - BaseReg.getReg() == X86::RIP) - HasBaseReg = false; - - // HasParenPart - True if we will print out the () part of the mem ref. - bool HasParenPart = IndexReg.getReg() || HasBaseReg; - - if (DispSpec.isImm()) { - int DispVal = DispSpec.getImm(); - if (DispVal || !HasParenPart) - O << DispVal; - } else { - assert(DispSpec.isGlobal() || DispSpec.isCPI() || - DispSpec.isJTI() || DispSpec.isSymbol()); - printSymbolOperand(MI->getOperand(Op+3), O); - } - - if (HasParenPart) { - assert(IndexReg.getReg() != X86::ESP && - "X86 doesn't allow scaling by ESP"); - - O << '('; - if (HasBaseReg) - printOperand(MI, Op, O, Modifier); - - if (IndexReg.getReg()) { - O << ','; - printOperand(MI, Op+2, O, Modifier); - unsigned ScaleVal = MI->getOperand(Op+1).getImm(); - if (ScaleVal != 1) - O << ',' << ScaleVal; - } - O << ')'; - } -} - -void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op, - raw_ostream &O, const char *Modifier) { - assert(isMem(MI, Op) && "Invalid memory reference!"); - const MachineOperand &Segment = MI->getOperand(Op+4); - if (Segment.getReg()) { - printOperand(MI, Op+4, O, Modifier); - O << ':'; - } - printLeaMemReference(MI, Op, O, Modifier); -} - -void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op, - raw_ostream &O) { - O << *MF->getPICBaseSymbol() << '\n'; - O << *MF->getPICBaseSymbol() << ':'; -} - -bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, - raw_ostream &O) { - unsigned Reg = MO.getReg(); - switch (Mode) { - default: return true; // Unknown mode. - case 'b': // Print QImode register - Reg = getX86SubSuperRegister(Reg, MVT::i8); - break; - case 'h': // Print QImode high register - Reg = getX86SubSuperRegister(Reg, MVT::i8, true); - break; - case 'w': // Print HImode register - Reg = getX86SubSuperRegister(Reg, MVT::i16); - break; - case 'k': // Print SImode register - Reg = getX86SubSuperRegister(Reg, MVT::i32); - break; - case 'q': // Print DImode register - Reg = getX86SubSuperRegister(Reg, MVT::i64); - break; - } - - O << '%' << X86ATTInstPrinter::getRegisterName(Reg); - return false; -} - -/// PrintAsmOperand - Print out an operand for an inline asm expression. -/// -bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) return true; // Unknown modifier. - - const MachineOperand &MO = MI->getOperand(OpNo); - - switch (ExtraCode[0]) { - default: return true; // Unknown modifier. - case 'a': // This is an address. Currently only 'i' and 'r' are expected. - if (MO.isImm()) { - O << MO.getImm(); - return false; - } - if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) { - printSymbolOperand(MO, O); - if (Subtarget->isPICStyleRIPRel()) - O << "(%rip)"; - return false; - } - if (MO.isReg()) { - O << '('; - printOperand(MI, OpNo, O); - O << ')'; - return false; - } - return true; - - case 'c': // Don't print "$" before a global var name or constant. 
- if (MO.isImm()) - O << MO.getImm(); - else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) - printSymbolOperand(MO, O); - else - printOperand(MI, OpNo, O); - return false; - - case 'A': // Print '*' before a register (it must be a register) - if (MO.isReg()) { - O << '*'; - printOperand(MI, OpNo, O); - return false; - } - return true; - - case 'b': // Print QImode register - case 'h': // Print QImode high register - case 'w': // Print HImode register - case 'k': // Print SImode register - case 'q': // Print DImode register - if (MO.isReg()) - return printAsmMRegister(MO, ExtraCode[0], O); - printOperand(MI, OpNo, O); - return false; - - case 'P': // This is the operand of a call, treat specially. - print_pcrel_imm(MI, OpNo, O); - return false; - - case 'n': // Negate the immediate or print a '-' before the operand. - // Note: this is a temporary solution. It should be handled target - // independently as part of the 'MC' work. - if (MO.isImm()) { - O << -MO.getImm(); - return false; - } - O << '-'; - } - } - - printOperand(MI, OpNo, O); - return false; -} - -bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNo, unsigned AsmVariant, - const char *ExtraCode, - raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) return true; // Unknown modifier. - - switch (ExtraCode[0]) { - default: return true; // Unknown modifier. - case 'b': // Print QImode register - case 'h': // Print QImode high register - case 'w': // Print HImode register - case 'k': // Print SImode register - case 'q': // Print SImode register - // These only apply to registers, ignore on mem. - break; - case 'P': // Don't print @PLT, but do print as memory. - printMemReference(MI, OpNo, O, "no-rip"); - return false; - } - } - printMemReference(MI, OpNo, O); - return false; -} - -void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { - if (Subtarget->isTargetDarwin()) - OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); -} - - -void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetDarwin()) { - // All darwin targets use mach-o. - MachineModuleInfoMachO &MMIMacho = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); - - // Output stubs for dynamically-linked functions. - MachineModuleInfoMachO::SymbolListTy Stubs; - - Stubs = MMIMacho.GetFnStubList(); - if (!Stubs.empty()) { - const MCSection *TheSection = - OutContext.getMachOSection("__IMPORT", "__jump_table", - MCSectionMachO::S_SYMBOL_STUBS | - MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE | - MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, - 5, SectionKind::getMetadata()); - OutStreamer.SwitchSection(TheSection); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$stub: - OutStreamer.EmitLabel(Stubs[i].first); - // .indirect_symbol _foo - OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(), - MCSA_IndirectSymbol); - // hlt; hlt; hlt; hlt; hlt hlt = 0xf4 = -12. - const char HltInsts[] = { -12, -12, -12, -12, -12 }; - OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/); - } - - Stubs.clear(); - OutStreamer.AddBlankLine(); - } - - // Output stubs for external and common global variables. 
- Stubs = MMIMacho.GetGVStubList(); - if (!Stubs.empty()) { - const MCSection *TheSection = - OutContext.getMachOSection("__IMPORT", "__pointers", - MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS, - SectionKind::getMetadata()); - OutStreamer.SwitchSection(TheSection); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$non_lazy_ptr: - OutStreamer.EmitLabel(Stubs[i].first); - // .indirect_symbol _foo - MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; - OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), - MCSA_IndirectSymbol); - // .long 0 - if (MCSym.getInt()) - // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); - else - // Internal to current translation unit. - // - // When we place the LSDA into the TEXT section, the type info - // pointers need to be indirect and pc-rel. We accomplish this by - // using NLPs. However, sometimes the types are local to the file. So - // we need to fill in the value for the NLP in those cases. - OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), - OutContext), - 4/*size*/, 0/*addrspace*/); - } - Stubs.clear(); - OutStreamer.AddBlankLine(); - } - - Stubs = MMIMacho.GetHiddenGVStubList(); - if (!Stubs.empty()) { - OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); - EmitAlignment(2); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$non_lazy_ptr: - OutStreamer.EmitLabel(Stubs[i].first); - // .long _foo - OutStreamer.EmitValue(MCSymbolRefExpr:: - Create(Stubs[i].second.getPointer(), - OutContext), - 4/*size*/, 0/*addrspace*/); - } - Stubs.clear(); - OutStreamer.AddBlankLine(); - } - - // Funny Darwin hack: This flag tells the linker that no global symbols - // contain code that falls through to other global symbols (e.g. the obvious - // implementation of multiple entry points). If this doesn't occur, the - // linker can safely perform dead code stripping. Since LLVM never - // generates code that does this, it is always safe to set. - OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - } - - if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() && - MMI->callsExternalVAFunctionWithFloatingPointArguments()) { - StringRef SymbolName = Subtarget->is64Bit() ? 
"_fltused" : "__fltused"; - MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName); - OutStreamer.EmitSymbolAttribute(S, MCSA_Global); - } - - if (Subtarget->isTargetCOFF()) { - X86COFFMachineModuleInfo &COFFMMI = - MMI->getObjFileInfo<X86COFFMachineModuleInfo>(); - - // Emit type information for external functions - typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator; - for (externals_iterator I = COFFMMI.externals_begin(), - E = COFFMMI.externals_end(); - I != E; ++I) { - OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); - OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL); - OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION - << COFF::SCT_COMPLEX_TYPE_SHIFT); - OutStreamer.EndCOFFSymbolDef(); - } - - // Necessary for dllexport support - std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; - - const TargetLoweringObjectFileCOFF &TLOFCOFF = - static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); - - for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) - if (I->hasDLLExportLinkage()) - DLLExportedFns.push_back(Mang->getSymbol(I)); - - for (Module::const_global_iterator I = M.global_begin(), - E = M.global_end(); I != E; ++I) - if (I->hasDLLExportLinkage()) - DLLExportedGlobals.push_back(Mang->getSymbol(I)); - - // Output linker support code for dllexported globals on windows. - if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) { - OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection()); - SmallString<128> name; - for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) { - if (Subtarget->isTargetWindows()) - name = " /EXPORT:"; - else - name = " -export:"; - name += DLLExportedGlobals[i]->getName(); - if (Subtarget->isTargetWindows()) - name += ",DATA"; - else - name += ",data"; - OutStreamer.EmitBytes(name, 0); - } - - for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) { - if (Subtarget->isTargetWindows()) - name = " /EXPORT:"; - else - name = " -export:"; - name += DLLExportedFns[i]->getName(); - OutStreamer.EmitBytes(name, 0); - } - } - } - - if (Subtarget->isTargetELF()) { - const TargetLoweringObjectFileELF &TLOFELF = - static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering()); - - MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>(); - - // Output stubs for external and common global variables. - MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); - if (!Stubs.empty()) { - OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const TargetData *TD = TM.getTargetData(); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - OutStreamer.EmitLabel(Stubs[i].first); - OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(), 0); - } - Stubs.clear(); - } - } -} - -MachineLocation -X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - - if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); - } - return Location; -} - -void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &O) { - // Only the target-dependent form of DBG_VALUE should get here. 
- // Referencing the offset and metadata as NOps-2 and NOps-1 is - // probably portable to other targets; frame pointer location is not. - unsigned NOps = MI->getNumOperands(); - assert(NOps==7); - O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); - if (V.getContext().isSubprogram()) - O << DISubprogram(V.getContext()).getDisplayName() << ":"; - O << V.getName(); - O << " <- "; - // Frame address. Currently handles register +- offset only. - O << '['; - if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) - printOperand(MI, 0, O); - else - O << "undef"; - O << '+'; printOperand(MI, 3, O); - O << ']'; - O << "+"; - printOperand(MI, NOps-2, O); -} - - - -//===----------------------------------------------------------------------===// -// Target Registry Stuff -//===----------------------------------------------------------------------===// - -static MCInstPrinter *createX86MCInstPrinter(const Target &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI) { - if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI); - if (SyntaxVariant == 1) - return new X86IntelInstPrinter(MAI); - return 0; -} - -// Force static initialization. -extern "C" void LLVMInitializeX86AsmPrinter() { - RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target); - RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target); - - TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,createX86MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,createX86MCInstPrinter); -} diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index f5f9cfa..b5fa94f 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -24,11 +24,12 @@ set(sources X86ELFWriterInfo.cpp X86FastISel.cpp X86FloatingPoint.cpp - X86FrameInfo.cpp + X86FrameLowering.cpp X86ISelDAGToDAG.cpp X86ISelLowering.cpp X86InstrInfo.cpp X86JITInfo.cpp + X86MachObjectWriter.cpp X86MCAsmInfo.cpp X86MCCodeEmitter.cpp X86MCInstLower.cpp @@ -40,14 +41,24 @@ set(sources ) if( CMAKE_CL_64 ) + # A workaround for a bug in cmake 2.8.3. See PR 8885. + if( CMAKE_VERSION STREQUAL "2.8.3" ) + include(CMakeDetermineCompilerId) + endif() + # end of workaround. 
enable_language(ASM_MASM) ADD_CUSTOM_COMMAND( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj + MAIN_DEPENDENCY X86CompilationCallback_Win64.asm COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm ) set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj) endif() add_llvm_target(X86CodeGen ${sources}) +add_subdirectory(AsmParser) +add_subdirectory(Disassembler) +add_subdirectory(InstPrinter) +add_subdirectory(TargetInfo) +add_subdirectory(Utils) diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt index 97589c0..972a0d9 100644 --- a/lib/Target/X86/Disassembler/CMakeLists.txt +++ b/lib/Target/X86/Disassembler/CMakeLists.txt @@ -5,7 +5,7 @@ add_llvm_library(LLVMX86Disassembler X86DisassemblerDecoder.c ) # workaround for hanging compilation on MSVC9 and 10 -if( MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) +if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) set_property( SOURCE X86Disassembler.cpp PROPERTY COMPILE_FLAGS "/Od" diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 691e2d7..f777756 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -168,16 +168,16 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, switch (insn.displacementSize) { default: break; - case 8: + case 1: type = TYPE_MOFFS8; break; - case 16: + case 2: type = TYPE_MOFFS16; break; - case 32: + case 4: type = TYPE_MOFFS32; break; - case 64: + case 8: type = TYPE_MOFFS64; break; } diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index 1fd6685..a9d28c9 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -290,7 +290,7 @@ static int readPrefixes(struct InternalInstruction* insn) { BOOL isPrefix = TRUE; BOOL prefixGroups[4] = { FALSE }; uint64_t prefixLocation; - uint8_t byte; + uint8_t byte = 0; BOOL hasAdSize = FALSE; BOOL hasOpSize = FALSE; @@ -388,6 +388,7 @@ static int readPrefixes(struct InternalInstruction* insn) { } } else { unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; } if (insn->mode == MODE_16BIT) { @@ -511,7 +512,8 @@ static int getIDWithAttrMask(uint16_t* instructionID, insn->opcode); if (hasModRMExtension) { - readModRM(insn); + if (readModRM(insn)) + return -1; *instructionID = decode(insn->opcodeType, instructionClass, @@ -747,7 +749,7 @@ static int readSIB(struct InternalInstruction* insn) { insn->sibIndex = SIB_INDEX_NONE; break; default: - insn->sibIndex = (EABase)(sibIndexBase + index); + insn->sibIndex = (SIBIndex)(sibIndexBase + index); if (insn->sibIndex == SIB_INDEX_sib || insn->sibIndex == SIB_INDEX_sib64) insn->sibIndex = SIB_INDEX_NONE; @@ -794,7 +796,7 @@ static int readSIB(struct InternalInstruction* insn) { } break; default: - insn->sibBase = (EABase)(sibBaseBase + base); + insn->sibBase = (SIBBase)(sibBaseBase + base); break; } @@ -860,7 +862,8 @@ static int readModRM(struct InternalInstruction* insn) { if (insn->consumedModRM) return 0; - consumeByte(insn, &insn->modRM); + if (consumeByte(insn, &insn->modRM)) + return -1; insn->consumedModRM = TRUE; mod = 
modFromModRM(insn->modRM); diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 4f4fbcd..d0dc8b5 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -399,7 +399,7 @@ struct InternalInstruction { /* The segment override type */ SegmentOverride segmentOverride; - /* Sizes of various critical pieces of data */ + /* Sizes of various critical pieces of data, in bytes */ uint8_t registerSize; uint8_t addressSize; uint8_t displacementSize; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index abcb716..1425b86 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -22,7 +22,7 @@ #ifndef X86DISASSEMBLERDECODERCOMMON_H #define X86DISASSEMBLERDECODERCOMMON_H -#include "llvm/System/DataTypes.h" +#include "llvm/Support/DataTypes.h" #define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers #define CONTEXTS_SYM x86DisassemblerContexts diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index da9d5a3..c642acc 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -16,7 +16,7 @@ #include "X86GenInstrNames.inc" #include "llvm/MC/MCInst.h" #include "llvm/Support/raw_ostream.h" -#include "../X86ShuffleDecode.h" +#include "../Utils/X86ShuffleDecode.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -111,28 +111,28 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // FALL THROUGH. case X86::PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(16, ShuffleMask); + DecodePUNPCKLBWMask(16, ShuffleMask); break; case X86::PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(8, ShuffleMask); + DecodePUNPCKLWDMask(8, ShuffleMask); break; case X86::PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(4, ShuffleMask); + DecodePUNPCKLDQMask(4, ShuffleMask); break; case X86::PUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PUNPCKLQDQrm: Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodePUNPCKLMask(2, ShuffleMask); + DecodePUNPCKLQDQMask(2, ShuffleMask); break; case X86::SHUFPDrri: @@ -153,16 +153,44 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPDrm: - DecodeUNPCKLPMask(2, ShuffleMask); + DecodeUNPCKLPDMask(2, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPDrm: + DecodeUNPCKLPDMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VUNPCKLPDYrm: + DecodeUNPCKLPDMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; case X86::UNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPSrm: - DecodeUNPCKLPMask(4, ShuffleMask); + DecodeUNPCKLPSMask(4, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSrm: + DecodeUNPCKLPSMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPSYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSYrm: + DecodeUNPCKLPSMask(8, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; case X86::UNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index a92a651..0484529 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -21,7 +21,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include "X86GenInstrNames.inc" -#include <ctype.h> +#include <cctype> using namespace llvm; // Include the auto-generated portion of the assembly writer. diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile index 9c6415d..12fb090 100644 --- a/lib/Target/X86/Makefile +++ b/lib/Target/X86/Makefile @@ -20,6 +20,6 @@ BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \ X86GenCallingConv.inc X86GenSubtarget.inc \ X86GenEDInfo.inc -DIRS = InstPrinter AsmParser Disassembler TargetInfo +DIRS = InstPrinter AsmParser Disassembler TargetInfo Utils include $(LEVEL)/Makefile.common diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index b2116e0..f16ec02 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -923,4 +923,15 @@ The insertps's of $0 are pointless complex copies. //===---------------------------------------------------------------------===// +If SSE4.1 is available we should inline rounding functions instead of emitting +a libcall. +floor: roundsd $0x01, %xmm, %xmm +ceil: roundsd $0x02, %xmm, %xmm + +and likewise for the single precision versions. + +Currently, SelectionDAGBuilder doesn't turn calls to these functions into the +corresponding nodes and some targets (including X86) aren't ready for them. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt index 78c4dc0..e21d69a 100644 --- a/lib/Target/X86/README-X86-64.txt +++ b/lib/Target/X86/README-X86-64.txt @@ -41,50 +41,6 @@ saved a few instructions. 
//===---------------------------------------------------------------------===// -Poor codegen: - -int X[2]; -int b; -void test(void) { - memset(X, b, 2*sizeof(X[0])); -} - -llc: - movq _b@GOTPCREL(%rip), %rax - movzbq (%rax), %rax - movq %rax, %rcx - shlq $8, %rcx - orq %rax, %rcx - movq %rcx, %rax - shlq $16, %rax - orq %rcx, %rax - movq %rax, %rcx - shlq $32, %rcx - movq _X@GOTPCREL(%rip), %rdx - orq %rax, %rcx - movq %rcx, (%rdx) - ret - -gcc: - movq _b@GOTPCREL(%rip), %rax - movabsq $72340172838076673, %rdx - movzbq (%rax), %rax - imulq %rdx, %rax - movq _X@GOTPCREL(%rip), %rdx - movq %rax, (%rdx) - ret - -And the codegen is even worse for the following -(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103): - void fill1(char *s, int a) - { - __builtin_memset(s, a, 15); - } - -For this version, we duplicate the computation of the constant to store. - -//===---------------------------------------------------------------------===// - It's not possible to reference AH, BH, CH, and DH registers in an instruction requiring REX prefix. However, divb and mulb both produce results in AH. If isel emits a CopyFromReg which gets turned into a movb and that can be allocated a diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index a305ae6..abd1515 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -67,19 +67,6 @@ cmovs, we should expand to a conditional branch like GCC produces. //===---------------------------------------------------------------------===// -Compile this: -_Bool f(_Bool a) { return a!=1; } - -into: - movzbl %dil, %eax - xorl $1, %eax - ret - -(Although note that this isn't a legal way to express the code that llvm-gcc -currently generates for that function.) - -//===---------------------------------------------------------------------===// - Some isel ideas: 1. Dynamic programming based approach when compile time if not an @@ -109,6 +96,37 @@ It appears icc use push for parameter passing. Need to investigate. //===---------------------------------------------------------------------===// +This: + +void foo(void); +void bar(int x, int *P) { + x >>= 2; + if (x) + foo(); + *P = x; +} + +compiles into: + + movq %rsi, %rbx + movl %edi, %r14d + sarl $2, %r14d + testl %r14d, %r14d + je LBB0_2 + +Instead of doing an explicit test, we can use the flags off the sar. This +occurs in a bigger testcase like this, which is pretty common: + +#include <vector> +int test1(std::vector<int> &X) { + int Sum = 0; + for (long i = 0, e = X.size(); i != e; ++i) + X[i] = 0; + return Sum; +} + +//===---------------------------------------------------------------------===// + Only use inc/neg/not instructions on processors where they are faster than add/sub/xor. They are slower on the P4 due to only updating some processor flags. @@ -394,72 +412,8 @@ boundary to improve performance. //===---------------------------------------------------------------------===// -Codegen: - -int f(int a, int b) { - if (a == 4 || a == 6) - b++; - return b; -} - - -as: - -or eax, 2 -cmp eax, 6 -jz label - -//===---------------------------------------------------------------------===// - GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting -simplifications for integer "x cmp y ? a : b". For example, instead of: - -int G; -void f(int X, int Y) { - G = X < 0 ? 
14 : 13; -} - -compiling to: - -_f: - movl $14, %eax - movl $13, %ecx - movl 4(%esp), %edx - testl %edx, %edx - cmovl %eax, %ecx - movl %ecx, _G - ret - -it could be: -_f: - movl 4(%esp), %eax - sarl $31, %eax - notl %eax - addl $14, %eax - movl %eax, _G - ret - -etc. - -Another is: -int usesbb(unsigned int a, unsigned int b) { - return (a < b ? -1 : 0); -} -to: -_usesbb: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - sbbl %eax, %eax - ret - -instead of: -_usesbb: - xorl %eax, %eax - movl 8(%esp), %ecx - cmpl %ecx, 4(%esp) - movl $4294967295, %ecx - cmovb %ecx, %eax - ret +simplifications for integer "x cmp y ? a : b". //===---------------------------------------------------------------------===// @@ -756,23 +710,17 @@ This: { return !full_add(a, b).second; } Should compile to: + addl %esi, %edi + setae %al + movzbl %al, %eax + ret - - _Z11no_overflowjj: - addl %edi, %esi - setae %al - ret - -FIXME: That code looks wrong; bool return is normally defined as zext. - -on x86-64, not: - -__Z11no_overflowjj: - addl %edi, %esi - cmpl %edi, %esi - setae %al - movzbl %al, %eax - ret +on x86-64, instead of the rather stupid-looking: + addl %esi, %edi + setb %al + xorb $1, %al + movzbl %al, %eax + ret //===---------------------------------------------------------------------===// @@ -1040,10 +988,10 @@ _foo: instead of: _foo: - movl $255, %eax - orl 4(%esp), %eax - andl $65535, %eax - ret + movl $65280, %eax + andl 4(%esp), %eax + orl $255, %eax + ret //===---------------------------------------------------------------------===// @@ -1165,58 +1113,6 @@ abs: //===---------------------------------------------------------------------===// -Consider: -int test(unsigned long a, unsigned long b) { return -(a < b); } - -We currently compile this to: - -define i32 @test(i32 %a, i32 %b) nounwind { - %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1] - %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1] - %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1] - ret i32 %tmp5 -} - -and - -_test: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setb %al - movzbl %al, %eax - negl %eax - ret - -Several deficiencies here. First, we should instcombine zext+neg into sext: - -define i32 @test2(i32 %a, i32 %b) nounwind { - %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1] - %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1] - ret i32 %tmp34 -} - -However, before we can do that, we have to fix the bad codegen that we get for -sext from bool: - -_test2: - movl 8(%esp), %eax - cmpl %eax, 4(%esp) - setb %al - movzbl %al, %eax - shll $31, %eax - sarl $31, %eax - ret - -This code should be at least as good as the code above. Once this is fixed, we -can optimize this specific case even more to: - - movl 8(%esp), %eax - xorl %ecx, %ecx - cmpl %eax, 4(%esp) - sbbl %ecx, %ecx - -//===---------------------------------------------------------------------===// - Take the following code (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): @@ -1605,6 +1501,8 @@ loop, the value comes into the loop as two values, and RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the constructed BUILD_PAIR which represents the cast value. +This can be handled by making CodeGenPrepare sink the cast. + //===---------------------------------------------------------------------===// Test instructions can be eliminated by using EFLAGS values from arithmetic @@ -1736,46 +1634,6 @@ Ideal output: //===---------------------------------------------------------------------===// -Testcase: -int x(int a) { return (a & 0x80) ? 
0x100 : 0; } -int y(int a) { return (a & 0x80) *2; } - -Current: - testl $128, 4(%esp) - setne %al - movzbl %al, %eax - shll $8, %eax - ret - -Better: - movl 4(%esp), %eax - addl %eax, %eax - andl $256, %eax - ret - -This is another general instcombine transformation that is profitable on all -targets. In LLVM IR, these functions look like this: - -define i32 @x(i32 %a) nounwind readnone { -entry: - %0 = and i32 %a, 128 - %1 = icmp eq i32 %0, 0 - %iftmp.0.0 = select i1 %1, i32 0, i32 256 - ret i32 %iftmp.0.0 -} - -define i32 @y(i32 %a) nounwind readnone { -entry: - %0 = shl i32 %a, 1 - %1 = and i32 %0, 256 - ret i32 %1 -} - -Replacing an icmp+select with a shift should always be considered profitable in -instcombine. - -//===---------------------------------------------------------------------===// - Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch properly. @@ -1960,3 +1818,132 @@ load, making it non-trivial to determine if there's anything between the load and the store which would prohibit narrowing. //===---------------------------------------------------------------------===// + +This code: +void foo(unsigned x) { + if (x == 0) bar(); + else if (x == 1) qux(); +} + +currently compiles into: +_foo: + movl 4(%esp), %eax + cmpl $1, %eax + je LBB0_3 + testl %eax, %eax + jne LBB0_4 + +the testl could be removed: +_foo: + movl 4(%esp), %eax + cmpl $1, %eax + je LBB0_3 + jb LBB0_4 + +0 is the only unsigned number < 1. + +//===---------------------------------------------------------------------===// + +This code: + +%0 = type { i32, i1 } + +define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp { +entry: + %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x) + %cmp = extractvalue %0 %uadd, 1 + %inc = zext i1 %cmp to i32 + %add = add i32 %x, %sum + %z.0 = add i32 %add, %inc + ret i32 %z.0 +} + +declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone + +compiles to: + +_add32carry: ## @add32carry + addl %esi, %edi + sbbl %ecx, %ecx + movl %edi, %eax + subl %ecx, %eax + ret + +But it could be: + +_add32carry: + leal (%rsi,%rdi), %eax + cmpl %esi, %eax + adcl $0, %eax + ret + +//===---------------------------------------------------------------------===// + +The hot loop of 256.bzip2 contains code that looks a bit like this: + +int foo(char *P, char *Q, int x, int y) { + if (P[0] != Q[0]) + return P[0] < Q[0]; + if (P[1] != Q[1]) + return P[1] < Q[1]; + if (P[2] != Q[2]) + return P[2] < Q[2]; + return P[3] < Q[3]; +} + +In the real code, we get a lot more wrong than this. However, even in this +code we generate: + +_foo: ## @foo +## BB#0: ## %entry + movb (%rsi), %al + movb (%rdi), %cl + cmpb %al, %cl + je LBB0_2 +LBB0_1: ## %if.then + cmpb %al, %cl + jmp LBB0_5 +LBB0_2: ## %if.end + movb 1(%rsi), %al + movb 1(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## BB#3: ## %if.end38 + movb 2(%rsi), %al + movb 2(%rdi), %cl + cmpb %al, %cl + jne LBB0_1 +## BB#4: ## %if.end60 + movb 3(%rdi), %al + cmpb 3(%rsi), %al +LBB0_5: ## %if.end60 + setl %al + movzbl %al, %eax + ret + +Note that we generate jumps to LBB0_1 which does a redundant compare. The +redundant compare also forces the register values to be live, which prevents +folding one of the loads into the compare. 
In contrast, GCC 4.2 produces: + +_foo: + movzbl (%rsi), %eax + cmpb %al, (%rdi) + jne L10 +L12: + movzbl 1(%rsi), %eax + cmpb %al, 1(%rdi) + jne L10 + movzbl 2(%rsi), %eax + cmpb %al, 2(%rdi) + jne L10 + movzbl 3(%rdi), %eax + cmpb 3(%rsi), %al +L10: + setl %al + movzbl %al, %eax + ret + +which is "perfect". + +//===---------------------------------------------------------------------===// + diff --git a/lib/Target/X86/Utils/CMakeLists.txt b/lib/Target/X86/Utils/CMakeLists.txt new file mode 100644 index 0000000..3ad5f99 --- /dev/null +++ b/lib/Target/X86/Utils/CMakeLists.txt @@ -0,0 +1,6 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMX86Utils + X86ShuffleDecode.cpp + ) +add_dependencies(LLVMX86Utils X86CodeGenTable_gen) diff --git a/lib/Target/X86/Utils/Makefile b/lib/Target/X86/Utils/Makefile new file mode 100644 index 0000000..1df6f0f --- /dev/null +++ b/lib/Target/X86/Utils/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/X86/Utils/Makefile -----------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMX86Utils + +# Hack: we need to include 'main' x86 target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp new file mode 100644 index 0000000..cd06060 --- /dev/null +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -0,0 +1,190 @@ +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. +// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecode.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { + // Defaults the copying the dest value. + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + + // Decode the immediate. + unsigned ZMask = Imm & 15; + unsigned CountD = (Imm >> 4) & 3; + unsigned CountS = (Imm >> 6) & 3; + + // CountS selects which input element to use. + unsigned InVal = 4+CountS; + // CountD specifies which element of destination to update. + ShuffleMask[CountD] = InVal; + // ZMask zaps values, potentially overriding the CountD elt. 
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(NElts+i); + + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(NElts+i); +} + +void DecodePSHUFMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts; ++i) { + ShuffleMask.push_back(Imm % NElts); + Imm /= NElts; + } +} + +void DecodePSHUFHWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back(4+(Imm & 3)); + Imm >>= 2; + } +} + +void DecodePSHUFLWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm & 3)); + Imm >>= 2; + } + ShuffleMask.push_back(4); + ShuffleMask.push_back(5); + ShuffleMask.push_back(6); + ShuffleMask.push_back(7); +} + +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i8, NElts), ShuffleMask); +} + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i16, NElts), ShuffleMask); +} + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); +} + +void DecodePUNPCKLMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(VT, ShuffleMask); +} + +void DecodePUNPCKHMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i+NElts/2); + ShuffleMask.push_back(i+NElts+NElts/2); + } +} + +void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + // Part that reads from dest. + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(Imm % NElts); + Imm /= NElts; + } + // Part that reads from src. + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(Imm % NElts + NElts); + Imm /= NElts; + } +} + +void DecodeUNPCKHPMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i+NElts/2); // Reads from dest + ShuffleMask.push_back(i+NElts+NElts/2); // Reads from src + } +} + +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); +} + +/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// etc. 
VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts / 2; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start; i != End; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumSectionElts); // Reads from src/src2 + } + // Process the next 128 bits. + Start += NumSectionElts; + End += NumSectionElts; + } +} + +} // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h new file mode 100644 index 0000000..b18f670 --- /dev/null +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -0,0 +1,87 @@ +//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_SHUFFLE_DECODE_H +#define X86_SHUFFLE_DECODE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ValueTypes.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +enum { + SM_SentinelZero = ~0U +}; + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePSHUFMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePSHUFHWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePSHUFLWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKHMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKHPMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +/// DecodeUNPCKLPMask - This decodes the shuffle masks for 
unpcklps/unpcklpd +/// etc. VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask); + +} // llvm namespace + +#endif diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 27e8850..0ca4366 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -15,6 +15,7 @@ #ifndef TARGET_X86_H #define TARGET_X86_H +#include "llvm/Support/DataTypes.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -23,11 +24,13 @@ class FunctionPass; class JITCodeEmitter; class MCCodeEmitter; class MCContext; +class MCObjectWriter; class MachineCodeEmitter; class Target; class TargetAsmBackend; class X86TargetMachine; class formatted_raw_ostream; +class raw_ostream; /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. @@ -74,6 +77,13 @@ FunctionPass *createEmitX86CodeToMemory(); /// FunctionPass *createX86MaxStackAlignmentHeuristicPass(); + +/// createX86MachObjectWriter - Construct an X86 Mach-O object writer. +MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); + extern Target TheX86_32Target, TheX86_64Target; } // End llvm namespace diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 923f894..efb6c8c 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -23,6 +23,9 @@ include "llvm/Target/Target.td" def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", "Enable conditional move instructions">; +def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", + "Support POPCNT instruction">; + def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", "Enable MMX instructions">; @@ -45,7 +48,7 @@ def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41", [FeatureSSSE3]>; def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", - [FeatureSSE41]>; + [FeatureSSE41, FeaturePOPCNT]>; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", "Enable 3DNow! instructions">; def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", @@ -63,7 +66,8 @@ def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", "IsUAMemFast", "true", "Fast unaligned memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", - "Support SSE 4a instructions">; + "Support SSE 4a instructions", + [FeaturePOPCNT]>; def FeatureAVX : SubtargetFeature<"avx", "HasAVX", "true", "Enable AVX instructions">; @@ -112,11 +116,13 @@ def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, FeatureFastUAMem]>; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem, FeatureAES]>; -// Sandy Bridge does not have FMA -// FIXME: Wikipedia says it does... it should have AES as well. -def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>; +def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem, FeatureAES, FeatureCLMUL]>; +// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, +// rather than a superset. +// FIXME: Disabling AVX for now since it's not ready. 
+def : Proc<"sandybridge", [FeatureSSE42, Feature64Bit, + FeatureAES, FeatureCLMUL]>; def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>; diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/X86AsmBackend.cpp index 8910c42..da5f5b1 100644 --- a/lib/Target/X86/X86AsmBackend.cpp +++ b/lib/Target/X86/X86AsmBackend.cpp @@ -12,52 +12,82 @@ #include "X86FixupKinds.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCObjectFormat.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" +#include "llvm/Object/MachOFormat.h" #include "llvm/Support/ELF.h" -#include "llvm/Support/MachO.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetRegistry.h" #include "llvm/Target/TargetAsmBackend.h" using namespace llvm; - static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: assert(0 && "invalid fixup kind!"); - case X86::reloc_pcrel_1byte: + case FK_PCRel_1: case FK_Data_1: return 0; - case X86::reloc_pcrel_2byte: + case FK_PCRel_2: case FK_Data_2: return 1; - case X86::reloc_pcrel_4byte: + case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: case X86::reloc_signed_4byte: case X86::reloc_global_offset_table: case FK_Data_4: return 2; + case FK_PCRel_8: case FK_Data_8: return 3; } } namespace { + +class X86ELFObjectWriter : public MCELFObjectTargetWriter { +public: + X86ELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine, + bool HasRelocationAddend) + : MCELFObjectTargetWriter(is64Bit, OSType, EMachine, HasRelocationAddend) {} +}; + class X86AsmBackend : public TargetAsmBackend { public: X86AsmBackend(const Target &T) - : TargetAsmBackend(T) {} + : TargetAsmBackend() {} + + unsigned getNumFixupKinds() const { + return X86::NumTargetFixupKinds; + } - void ApplyFixup(const MCFixup &Fixup, MCDataFragment &DF, + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, + { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel}, + { "reloc_signed_4byte", 0, 4 * 8, 0}, + { "reloc_global_offset_table", 0, 4 * 8, 0} + }; + + if (Kind < FirstTargetFixupKind) + return TargetAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); - assert(Fixup.getOffset() + Size <= DF.getContents().size() && + assert(Fixup.getOffset() + Size <= DataSize && "Invalid fixup offset!"); for (unsigned i = 0; i != Size; ++i) - DF.getContents()[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); } bool MayNeedRelaxation(const MCInst &Inst) const; @@ -153,6 +183,9 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::CMP32mi8: return X86::CMP32mi; case X86::CMP64ri8: return X86::CMP64ri32; case X86::CMP64mi8: return X86::CMP64mi32; + + // PUSH + case X86::PUSHi8: return X86::PUSHi32; } } @@ -211,10 +244,8 @@ void 
X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { /// WriteNopData - Write optimal nops to the output file for the \arg Count /// bytes. This returns the number of bytes written. It may return 0 if /// the \arg Count is more than the maximum optimal nops. -/// -/// FIXME this is X86 32-bit specific and should move to a better place. bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { - static const uint8_t Nops[16][16] = { + static const uint8_t Nops[10][10] = { // nop {0x90}, // xchg %ax,%ax @@ -235,30 +266,16 @@ bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, // nopw %cs:0L(%[re]ax,%[re]ax,1) {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, - // nopw %cs:0L(%[re]ax,%[re]ax,1) - {0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, - // nopw 0(%[re]ax,%[re]ax,1) - // nopw 0(%[re]ax,%[re]ax,1) - {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, - 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, - // nopw 0(%[re]ax,%[re]ax,1) - // nopl 0L(%[re]ax) */ - {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, - 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, - // nopl 0L(%[re]ax) - // nopl 0L(%[re]ax) - {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, - 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, - // nopl 0L(%[re]ax) - // nopl 0L(%[re]ax,%[re]ax,1) - {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, - 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00} }; // Write an optimal sequence for the first 15 bytes. - uint64_t OptimalCount = (Count < 16) ? Count : 15; - for (uint64_t i = 0, e = OptimalCount; i != e; i++) - OW->Write8(Nops[OptimalCount - 1][i]); + const uint64_t OptimalCount = (Count < 16) ? Count : 15; + const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10; + for (uint64_t i = 0, e = Prefixes; i != e; i++) + OW->Write8(0x66); + const uint64_t Rest = OptimalCount - Prefixes; + for (uint64_t i = 0, e = Rest; i != e; i++) + OW->Write8(Nops[Rest - 1][i]); // Finish with single byte nops. 
for (uint64_t i = OptimalCount, e = Count; i != e; ++i) @@ -271,28 +288,16 @@ bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { namespace { class ELFX86AsmBackend : public X86AsmBackend { - MCELFObjectFormat Format; - public: Triple::OSType OSType; ELFX86AsmBackend(const Target &T, Triple::OSType _OSType) : X86AsmBackend(T), OSType(_OSType) { - HasScatteredSymbols = true; HasReliableSymbolDifference = true; } - virtual const MCObjectFormat &getObjectFormat() const { - return Format; - } - virtual bool doesSectionRequireSymbols(const MCSection &Section) const { const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section); - return ES.getFlags() & MCSectionELF::SHF_MERGE; - } - - bool isVirtualSection(const MCSection &Section) const { - const MCSectionELF &SE = static_cast<const MCSectionELF&>(Section); - return SE.getType() == MCSectionELF::SHT_NOBITS; + return ES.getFlags() & ELF::SHF_MERGE; } }; @@ -301,15 +306,10 @@ public: ELFX86_32AsmBackend(const Target &T, Triple::OSType OSType) : ELFX86AsmBackend(T, OSType) {} - unsigned getPointerSize() const { - return 4; - } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(OS, /*Is64Bit=*/false, - OSType, ELF::EM_386, - /*IsLittleEndian=*/true, - /*HasRelocationAddend=*/false); + return createELFObjectWriter(new X86ELFObjectWriter(false, OSType, + ELF::EM_386, false), + OS, /*IsLittleEndian*/ true); } }; @@ -318,69 +318,31 @@ public: ELFX86_64AsmBackend(const Target &T, Triple::OSType OSType) : ELFX86AsmBackend(T, OSType) {} - unsigned getPointerSize() const { - return 8; - } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createELFObjectWriter(OS, /*Is64Bit=*/true, - OSType, ELF::EM_X86_64, - /*IsLittleEndian=*/true, - /*HasRelocationAddend=*/true); + return createELFObjectWriter(new X86ELFObjectWriter(true, OSType, + ELF::EM_X86_64, true), + OS, /*IsLittleEndian*/ true); } }; class WindowsX86AsmBackend : public X86AsmBackend { bool Is64Bit; - MCCOFFObjectFormat Format; public: WindowsX86AsmBackend(const Target &T, bool is64Bit) : X86AsmBackend(T) , Is64Bit(is64Bit) { - HasScatteredSymbols = true; - } - - virtual const MCObjectFormat &getObjectFormat() const { - return Format; - } - - unsigned getPointerSize() const { - if (Is64Bit) - return 8; - else - return 4; } MCObjectWriter *createObjectWriter(raw_ostream &OS) const { return createWinCOFFObjectWriter(OS, Is64Bit); } - - bool isVirtualSection(const MCSection &Section) const { - const MCSectionCOFF &SE = static_cast<const MCSectionCOFF&>(Section); - return SE.getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA; - } }; class DarwinX86AsmBackend : public X86AsmBackend { - MCMachOObjectFormat Format; - public: DarwinX86AsmBackend(const Target &T) - : X86AsmBackend(T) { - HasScatteredSymbols = true; - } - - virtual const MCObjectFormat &getObjectFormat() const { - return Format; - } - - bool isVirtualSection(const MCSection &Section) const { - const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); - return (SMO.getType() == MCSectionMachO::S_ZEROFILL || - SMO.getType() == MCSectionMachO::S_GB_ZEROFILL || - SMO.getType() == MCSectionMachO::S_THREAD_LOCAL_ZEROFILL); - } + : X86AsmBackend(T) { } }; class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { @@ -388,14 +350,10 @@ public: DarwinX86_32AsmBackend(const Target &T) : DarwinX86AsmBackend(T) {} - unsigned getPointerSize() const { - return 4; - } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - 
return createMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPUTypeI386, - MachO::CPUSubType_I386_ALL, - /*IsLittleEndian=*/true); + return createX86MachObjectWriter(OS, /*Is64Bit=*/false, + object::mach::CTM_i386, + object::mach::CSX86_ALL); } }; @@ -406,14 +364,10 @@ public: HasReliableSymbolDifference = true; } - unsigned getPointerSize() const { - return 8; - } - MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createMachObjectWriter(OS, /*Is64Bit=*/true, MachO::CPUTypeX86_64, - MachO::CPUSubType_I386_ALL, - /*IsLittleEndian=*/true); + return createX86MachObjectWriter(OS, /*Is64Bit=*/true, + object::mach::CTM_x86_64, + object::mach::CSX86_ALL); } virtual bool doesSectionRequireSymbols(const MCSection &Section) const { @@ -460,7 +414,10 @@ TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T, case Triple::MinGW32: case Triple::Cygwin: case Triple::Win32: - return new WindowsX86AsmBackend(T, false); + if (Triple(TT).getEnvironment() == Triple::MachO) + return new DarwinX86_32AsmBackend(T); + else + return new WindowsX86AsmBackend(T, false); default: return new ELFX86_32AsmBackend(T, Triple(TT).getOS()); } @@ -471,10 +428,13 @@ TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T, switch (Triple(TT).getOS()) { case Triple::Darwin: return new DarwinX86_64AsmBackend(T); - case Triple::MinGW64: + case Triple::MinGW32: case Triple::Cygwin: case Triple::Win32: - return new WindowsX86AsmBackend(T, true); + if (Triple(TT).getEnvironment() == Triple::MachO) + return new DarwinX86_64AsmBackend(T); + else + return new WindowsX86AsmBackend(T, true); default: return new ELFX86_64AsmBackend(T, Triple(TT).getOS()); } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 56863d9..5635175 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -61,7 +61,7 @@ def RetCC_X86_32_C : CallingConv<[ // weirdly; this is really the sse-regparm calling convention) in which // case they use XMM0, otherwise it is the same as the common X86 calling // conv. - CCIfInReg<CCIfSubtarget<"hasSSE2()", + CCIfInReg<CCIfSubtarget<"hasXMMInt()", CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>, CCDelegateTo<RetCC_X86Common> @@ -73,8 +73,8 @@ def RetCC_X86_32_Fast : CallingConv<[ // SSE2. // This can happen when a float, 2 x float, or 3 x float vector is split by // target lowering, and is returned in 1-3 sse regs. - CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, - CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f32], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f64], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, // For integers, ECX can be used as an extra return register CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>, @@ -163,12 +163,12 @@ def CC_X86_64_C : CallingConv<[ // registers on Darwin. CCIfType<[x86mmx], CCIfSubtarget<"isTargetDarwin()", - CCIfSubtarget<"hasSSE2()", + CCIfSubtarget<"hasXMMInt()", CCPromoteToType<v2i64>>>>, // The first 8 FP/Vector arguments are passed in XMM registers. CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfSubtarget<"hasSSE1()", + CCIfSubtarget<"hasXMM()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, // The first 8 256-bit vector arguments are passed in YMM registers. 
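A minimal standalone C++ sketch of what the hasSSE2()/hasSSE1() to hasXMMInt()/hasXMM() predicate swap in this file means: AVX-capable subtargets should keep qualifying for XMM argument and return registers even where the .td no longer names an SSE level. The struct and function names below are invented for illustration; the real predicates live in X86Subtarget and the TableGen-generated calling-convention code.

#include <cassert>

// Sketch of the subtarget predicates assumed by the .td changes above.
struct X86SubtargetSketch {
  bool HasSSE1, HasSSE2, HasAVX;
  bool hasXMM()    const { return HasSSE1 || HasAVX; }  // any XMM support
  bool hasXMMInt() const { return HasSSE2 || HasAVX; }  // SSE2-level XMM
};

// Mirror of RetCC_X86_32_C's 'inreg' float/double rule: XMM0-XMM2 when
// SSE2-level XMM registers exist, otherwise fall back to the x87 stack.
const char *pickFPReturnReg(const X86SubtargetSketch &ST, unsigned Idx) {
  static const char *const XMMRegs[] = {"XMM0", "XMM1", "XMM2"};
  static const char *const X87Regs[] = {"ST0", "ST1"};
  if (ST.hasXMMInt() && Idx < 3)
    return XMMRegs[Idx];
  assert(Idx < 2 && "only two x87 return slots modeled in this sketch");
  return X87Regs[Idx];
}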
@@ -215,6 +215,13 @@ def CC_X86_Win64_C : CallingConv<[ // The first 4 integer arguments are passed in integer registers. CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], [XMM0, XMM1, XMM2, XMM3]>>, @@ -245,7 +252,7 @@ def CC_X86_64_GHC : CallingConv<[ // Pass in STG registers: F1, F2, F3, F4, D1, D2 CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - CCIfSubtarget<"hasSSE1()", + CCIfSubtarget<"hasXMM()", CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> ]>; @@ -263,7 +270,7 @@ def CC_X86_32_Common : CallingConv<[ // The first 3 float or double arguments, if marked 'inreg' and if the call // is not a vararg call and if SSE2 is available, are passed in SSE registers. CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64], - CCIfSubtarget<"hasSSE2()", + CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, // The first 3 __m64 (except for v1i64) vector arguments are passed in mmx @@ -362,7 +369,7 @@ def CC_X86_32_FastCC : CallingConv<[ // The first 3 float or double arguments, if the call is not a vararg // call and if SSE2 is available, are passed in SSE registers. CCIfNotVarArg<CCIfType<[f32,f64], - CCIfSubtarget<"hasSSE2()", + CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, // Doubles get 8-byte slots that are 8-byte aligned. diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp index 3b721b4..f1d7ede 100644 --- a/lib/Target/X86/X86ELFWriterInfo.cpp +++ b/lib/Target/X86/X86ELFWriterInfo.cpp @@ -14,6 +14,7 @@ #include "X86ELFWriterInfo.h" #include "X86Relocations.h" #include "llvm/Function.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" @@ -35,13 +36,13 @@ unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { if (is64Bit) { switch(MachineRelTy) { case X86::reloc_pcrel_word: - return R_X86_64_PC32; + return ELF::R_X86_64_PC32; case X86::reloc_absolute_word: - return R_X86_64_32; + return ELF::R_X86_64_32; case X86::reloc_absolute_word_sext: - return R_X86_64_32S; + return ELF::R_X86_64_32S; case X86::reloc_absolute_dword: - return R_X86_64_64; + return ELF::R_X86_64_64; case X86::reloc_picrel_word: default: llvm_unreachable("unknown x86_64 machine relocation type"); @@ -49,9 +50,9 @@ unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { } else { switch(MachineRelTy) { case X86::reloc_pcrel_word: - return R_386_PC32; + return ELF::R_386_PC32; case X86::reloc_absolute_word: - return R_386_32; + return ELF::R_386_32; case X86::reloc_absolute_word_sext: case X86::reloc_absolute_dword: case X86::reloc_picrel_word: @@ -66,18 +67,18 @@ long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, long int Modifier) const { if (is64Bit) { switch(RelTy) { - case R_X86_64_PC32: return Modifier - 4; - case R_X86_64_32: - case R_X86_64_32S: - case R_X86_64_64: + case ELF::R_X86_64_PC32: return Modifier - 4; + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_64: return Modifier; default: llvm_unreachable("unknown x86_64 relocation type"); } } else { switch(RelTy) { - case R_386_PC32: return Modifier - 4; - 
case R_386_32: return Modifier; + case ELF::R_386_PC32: return Modifier - 4; + case ELF::R_386_32: return Modifier; default: llvm_unreachable("unknown x86 relocation type"); } @@ -88,19 +89,19 @@ long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const { if (is64Bit) { switch(RelTy) { - case R_X86_64_PC32: - case R_X86_64_32: - case R_X86_64_32S: + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: return 32; - case R_X86_64_64: + case ELF::R_X86_64_64: return 64; default: llvm_unreachable("unknown x86_64 relocation type"); } } else { switch(RelTy) { - case R_386_PC32: - case R_386_32: + case ELF::R_386_PC32: + case ELF::R_386_32: return 32; default: llvm_unreachable("unknown x86 relocation type"); @@ -112,20 +113,20 @@ unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const { bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { if (is64Bit) { switch(RelTy) { - case R_X86_64_PC32: + case ELF::R_X86_64_PC32: return true; - case R_X86_64_32: - case R_X86_64_32S: - case R_X86_64_64: + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_64: return false; default: llvm_unreachable("unknown x86_64 relocation type"); } } else { switch(RelTy) { - case R_386_PC32: + case ELF::R_386_PC32: return true; - case R_386_32: + case ELF::R_386_32: return false; default: llvm_unreachable("unknown x86 relocation type"); @@ -143,7 +144,7 @@ long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset, unsigned RelOffset, unsigned RelTy) const { - if (RelTy == R_X86_64_PC32 || RelTy == R_386_PC32) + if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32) return SymOffset - (RelOffset + 4); else assert("computeRelocation unknown for this relocation type"); diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h index 7d37d95..a45b5bb 100644 --- a/lib/Target/X86/X86ELFWriterInfo.h +++ b/lib/Target/X86/X86ELFWriterInfo.h @@ -20,23 +20,6 @@ namespace llvm { class X86ELFWriterInfo : public TargetELFWriterInfo { - // ELF Relocation types for X86 - enum X86RelocationType { - R_386_NONE = 0, - R_386_32 = 1, - R_386_PC32 = 2 - }; - - // ELF Relocation types for X86_64 - enum X86_64RelocationType { - R_X86_64_NONE = 0, - R_X86_64_64 = 1, - R_X86_64_PC32 = 2, - R_X86_64_32 = 10, - R_X86_64_32S = 11, - R_X86_64_PC64 = 24 - }; - public: X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_); virtual ~X86ELFWriterInfo(); diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index b938539..6fa9284 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -36,7 +36,7 @@ using namespace llvm; namespace { - + class X86FastISel : public FastISel { /// Subtarget - Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. @@ -46,7 +46,7 @@ class X86FastISel : public FastISel { /// unsigned StackPtr; - /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. @@ -69,12 +69,12 @@ public: /// possible. 
virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo, const LoadInst *LI); - + #include "X86GenFastISel.inc" private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); - + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); bool X86FastEmitStore(EVT VT, const Value *Val, @@ -84,12 +84,12 @@ private: bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); - + bool X86SelectAddress(const Value *V, X86AddressMode &AM); bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); bool X86SelectLoad(const Instruction *I); - + bool X86SelectStore(const Instruction *I); bool X86SelectRet(const Instruction *I); @@ -105,7 +105,7 @@ private: bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); - + bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); @@ -134,7 +134,7 @@ private: bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false); }; - + } // end anonymous namespace. bool X86FastISel::isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1) { @@ -250,7 +250,7 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, Opc = Subtarget->hasSSE2() ? X86::MOVSDmr : X86::ST_Fp64m; break; } - + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)), AM).addReg(Val); return true; @@ -261,7 +261,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Val)) Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext())); - + // If this is a store of a simple constant, fold the constant into the store. if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { unsigned Opc = 0; @@ -278,7 +278,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, Opc = X86::MOV64mi32; break; } - + if (Opc) { addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)), AM) @@ -287,11 +287,11 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, return true; } } - + unsigned ValReg = getRegForValue(Val); if (ValReg == 0) - return false; - + return false; + return X86FastEmitStore(VT, ValReg, AM); } @@ -303,7 +303,7 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned &ResultReg) { unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src, /*TODO: Kill=*/false); - + if (RR != 0) { ResultReg = RR; return true; @@ -320,11 +320,11 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // Don't walk into other basic blocks; it's possible we haven't // visited them yet, so the instructions may not yet be assigned // virtual registers. - if (FuncInfo.MBBMap[I->getParent()] != FuncInfo.MBB) - return false; - - Opcode = I->getOpcode(); - U = I; + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { Opcode = C->getOpcode(); U = C; @@ -438,7 +438,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { AM.Disp = (uint32_t)Disp; if (X86SelectAddress(U->getOperand(0), AM)) return true; - + // If we couldn't merge the sub value into this addr mode, revert back to // our address and just match the value instead of completely failing. AM = SavedAM; @@ -467,7 +467,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // Okay, we've committed to selecting this global. 
Set up the basic address. AM.GV = GV; - + // Allow the subtarget to classify the global. unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); @@ -476,7 +476,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // FIXME: How do we know Base.Reg is free?? AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } - + // Unless the ABI requires an extra load, return a direct reference to // the global. if (!isGlobalStubReference(GVFlags)) { @@ -489,7 +489,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { AM.GVOpFlags = GVFlags; return true; } - + // Ok, we need to do a load from a stub. If we've already loaded from this // stub, reuse the loaded pointer, otherwise emit the load now. DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); @@ -511,14 +511,14 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { if (TLI.getPointerTy() == MVT::i64) { Opc = X86::MOV64rm; RC = X86::GR64RegisterClass; - + if (Subtarget->isPICStyleRIPRel()) StubAM.Base.Reg = X86::RIP; } else { Opc = X86::MOV32rm; RC = X86::GR32RegisterClass; } - + LoadReg = createResultReg(RC); MachineInstrBuilder LoadMI = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); @@ -530,7 +530,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // Prevent loading GV stub multiple times in same MBB. LocalValueMap[V] = LoadReg; } - + // Now construct the final address. Note that the Disp, Scale, // and Index values may already be set here. AM.Base.Reg = LoadReg; @@ -597,14 +597,18 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; - // Can't handle TLS or DLLImport. + // Can't handle DLLImport. + if (GV->hasDLLImportLinkage()) + return false; + + // Can't handle TLS. if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) - if (GVar->isThreadLocal() || GVar->hasDLLImportLinkage()) + if (GVar->isThreadLocal()) return false; // Okay, we've committed to selecting this global. Set up the basic address. AM.GV = GV; - + // No ABI requires an extra load for anything other than DLLImport, which // we rejected above. Return a direct reference to the global. if (Subtarget->isPICStyleRIPRel()) { @@ -617,7 +621,7 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { } else if (Subtarget->isPICStyleGOT()) { AM.GVOpFlags = X86II::MO_GOTOFF; } - + return true; } @@ -702,7 +706,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { return false; CCValAssign &VA = ValLocs[0]; - + // Don't bother handling odd stuff for now. if (VA.getLocInfo() != CCValAssign::Full) return false; @@ -792,11 +796,11 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT) { unsigned Op0Reg = getRegForValue(Op0); if (Op0Reg == 0) return false; - + // Handle 'null' like i32/i64 0. if (isa<ConstantPointerNull>(Op1)) Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext())); - + // We have two options: compare with register or immediate. If the RHS of // the compare is an immediate that we can fold into this compare, use // CMPri, otherwise use CMPrr. 
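The CMPri/CMPrr decision described in the comment above can be sketched in a few lines. chooseCompare and its signature are invented for illustration and are not FastISel's API; the point is only that a constant right-hand side folds into the compare when it fits the operand width.

#include <cstdint>

enum class CmpForm { CMPri, CMPrr };

// Illustrative only: pick the compare encoding for "cmp LHS, RHS".
CmpForm chooseCompare(bool RHSIsConst, int64_t RHSVal, unsigned Bits) {
  // A constant folds into CMPri only if it fits the operand width when
  // sign-extended; otherwise RHS must be materialized into a register.
  const int64_t Hi = int64_t((uint64_t(1) << (Bits - 1)) - 1);
  const int64_t Lo = -Hi - 1;
  if (RHSIsConst && RHSVal >= Lo && RHSVal <= Hi)
    return CmpForm::CMPri;
  return CmpForm::CMPrr;
}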
@@ -808,16 +812,16 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, return true; } } - + unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); if (CompareOpc == 0) return false; - + unsigned Op1Reg = getRegForValue(Op1); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareOpc)) .addReg(Op0Reg) .addReg(Op1Reg); - + return true; } @@ -835,13 +839,13 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { case CmpInst::FCMP_OEQ: { if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) return false; - + unsigned EReg = createResultReg(&X86::GR8RegClass); unsigned NPReg = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETEr), EReg); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNPr), NPReg); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); UpdateValueMap(I, ResultReg); return true; @@ -874,7 +878,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break; case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; - + case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; @@ -896,7 +900,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { // Emit a compare of Op0/Op1. if (!X86FastEmitCompare(Op0, Op1, VT)) return false; - + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(SetCCOpc), ResultReg); UpdateValueMap(I, ResultReg); return true; @@ -961,7 +965,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break; case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; - + case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break; case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break; case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break; @@ -975,7 +979,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { default: return false; } - + const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); if (SwapArgs) std::swap(Op0, Op1); @@ -983,7 +987,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Emit a compare of the LHS and RHS, setting the flags. if (!X86FastEmitCompare(Op0, Op1, VT)) return false; - + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BranchOpc)) .addMBB(TrueMBB); @@ -1036,8 +1040,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } const TargetInstrDesc &TID = MI.getDesc(); - if (TID.hasUnmodeledSideEffects() || - TID.hasImplicitDefOfPhysReg(X86::EFLAGS)) + if (TID.hasImplicitDefOfPhysReg(X86::EFLAGS) || + MI.hasUnmodeledSideEffects()) break; } @@ -1119,16 +1123,16 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; - + // Fold immediate in shl(x,3). 
if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) { unsigned ResultReg = createResultReg(RC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm), + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpImm), ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff); UpdateValueMap(I, ResultReg); return true; } - + unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), @@ -1152,10 +1156,10 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { MVT VT; if (!isTypeLegal(I->getType(), VT)) return false; - + // We only use cmov here, if we don't have a cmov instruction bail. if (!Subtarget->hasCMov()) return false; - + unsigned Opc = 0; const TargetRegisterClass *RC = NULL; if (VT == MVT::i16) { @@ -1168,7 +1172,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { Opc = X86::CMOVE64rr; RC = &X86::GR64RegClass; } else { - return false; + return false; } unsigned Op0Reg = getRegForValue(I->getOperand(0)); @@ -1233,7 +1237,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { return false; EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); - + // This code only handles truncation to byte right now. if (DstVT != MVT::i8 && DstVT != MVT::i1) // All other cases should be handled by the tblgen generated code. @@ -1304,21 +1308,21 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { // Grab the frame index. X86AddressMode AM; if (!X86SelectAddress(Slot, AM)) return false; - + if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; - + return true; } case Intrinsic::objectsize: { ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1)); const Type *Ty = I.getCalledFunction()->getReturnType(); - + assert(CI && "Non-constant type in Intrinsic::objectsize?"); - + MVT VT; if (!isTypeLegal(Ty, VT)) return false; - + unsigned OpC = 0; if (VT == MVT::i32) OpC = X86::MOV32ri; @@ -1326,7 +1330,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { OpC = X86::MOV64ri; else return false; - + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg). addImm(CI->isZero() ? -1ULL : 0); @@ -1398,7 +1402,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { ResultReg = DestReg1+1; else ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8)); - + unsigned Opc = X86::SETBr; if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) Opc = X86::SETOr; @@ -1516,10 +1520,10 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext()); - + // Allocate shadow area for Win64 - if (Subtarget->isTargetWin64()) { - CCInfo.AllocateStack(32, 8); + if (Subtarget->isTargetWin64()) { + CCInfo.AllocateStack(32, 8); } CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); @@ -1539,7 +1543,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { CCValAssign &VA = ArgLocs[i]; unsigned Arg = Args[VA.getValNo()]; EVT ArgVT = ArgVTs[VA.getValNo()]; - + // Promote the value if needed. 
switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -1547,16 +1551,14 @@ case CCValAssign::SExt: { bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); - assert(Emitted && "Failed to emit a sext!"); Emitted=Emitted; - Emitted = true; + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::ZExt: { bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); - assert(Emitted && "Failed to emit a zext!"); Emitted=Emitted; - Emitted = true; + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } @@ -1572,21 +1574,21 @@ if (!Emitted) Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), Arg, ArgVT, Arg); - - assert(Emitted && "Failed to emit an aext!"); Emitted=Emitted; + + assert(Emitted && "Failed to emit an aext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::BCvt: { unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(), - ISD::BIT_CONVERT, Arg, /*TODO: Kill=*/false); + ISD::BITCAST, Arg, /*TODO: Kill=*/false); assert(BC != 0 && "Failed to emit a bitcast!"); Arg = BC; ArgVT = VA.getLocVT(); break; } } - + if (VA.isRegLoc()) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg); @@ -1597,7 +1599,7 @@ AM.Base.Reg = StackPtr; AM.Disp = LocMemOffset; const Value *ArgVal = ArgVals[VA.getValNo()]; - + // If this is a really simple value, emit this with the Value* version of // X86FastEmitStore. If it isn't simple, we don't want to do this, as it // can cause us to reevaluate the argument. @@ -1609,13 +1611,13 @@ } // ELF / PIC requires GOT in the EBX register before function calls via PLT - // GOT pointer. + // GOT pointer. if (Subtarget->isPICStyleGOT()) { unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); } - + // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { @@ -1629,7 +1631,7 @@ CallOpc = X86::CALL32r; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) .addReg(CalleeOp); - + } else { // Direct call. assert(GV && "Not a direct call"); @@ -1640,10 +1642,10 @@ CallOpc = X86::CALL64pcrel32; else CallOpc = X86::CALLpcrel32; - + // See if we need any target-specific flags on the GV operand. unsigned char OpFlags = 0; - + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to // external symbols must go through the PLT in PIC mode. If the symbol // has hidden or protected visibility, or if it is static or local, then @@ -1660,8 +1662,8 @@ // automatically synthesizes these stubs.
OpFlags = X86II::MO_DARWIN_STUB; } - - + + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) .addGlobalAddress(GV, 0, OpFlags); } @@ -1690,7 +1692,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { assert(RVLocs.size() == 1 && "Can't handle multi-value calls!"); EVT CopyVT = RVLocs[0].getValVT(); TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT); - + // If this is a call to a function that returns an fp value on the x87 fp // stack, but where we prefer to use the value in xmm registers, copy it // out as F80 and use a truncate to move it from fp stack reg to xmm reg. @@ -1728,7 +1730,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) { if (AndToI1) { // Mask out all but lowest bit for some call which produces an i1. unsigned AndResult = createResultReg(X86::GR8RegisterClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1); ResultReg = AndResult; } @@ -1798,7 +1800,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { MVT VT; if (!isTypeLegal(C->getType(), VT)) return false; - + // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = NULL; @@ -1843,7 +1845,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { // No f80 support yet. return false; } - + // Materialize addresses with LEA instructions. if (isa<GlobalValue>(C)) { X86AddressMode AM; @@ -1859,14 +1861,14 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { } return 0; } - + // MachineConstantPool wants an explicit alignment. unsigned Align = TD.getPrefTypeAlignment(C->getType()); if (Align == 0) { // Alignment of vector types. FIXME! Align = TD.getTypeAllocSize(C->getType()); } - + // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; unsigned char OpFlag = 0; @@ -1922,20 +1924,20 @@ bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, X86AddressMode AM; if (!X86SelectAddress(LI->getOperand(0), AM)) return false; - + X86InstrInfo &XII = (X86InstrInfo&)TII; - + unsigned Size = TD.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); SmallVector<MachineOperand, 8> AddrOps; AM.getFullAddress(AddrOps); - + MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); if (Result == 0) return false; - - MI->getParent()->insert(MI, Result); + + FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); return true; } diff --git a/lib/Target/X86/X86FixupKinds.h b/lib/Target/X86/X86FixupKinds.h index 64ee3eb..17d242a 100644 --- a/lib/Target/X86/X86FixupKinds.h +++ b/lib/Target/X86/X86FixupKinds.h @@ -15,17 +15,17 @@ namespace llvm { namespace X86 { enum Fixups { - reloc_pcrel_4byte = FirstTargetFixupKind, // 32-bit pcrel, e.g. a branch. - reloc_pcrel_1byte, // 8-bit pcrel, e.g. branch_1 - reloc_pcrel_2byte, // 16-bit pcrel, e.g. callw - reloc_riprel_4byte, // 32-bit rip-relative + reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 // this will be sign extended at // runtime. - reloc_global_offset_table // 32-bit, relative to the start + reloc_global_offset_table, // 32-bit, relative to the start // of the instruction. Used only // for _GLOBAL_OFFSET_TABLE_. 
+ // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; } } diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 5da6d3a..3aaa693 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -32,6 +32,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -51,6 +52,7 @@ namespace { struct FPS : public MachineFunctionPass { static char ID; FPS() : MachineFunctionPass(ID) { + initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); // This is really only to keep valgrind quiet. // The logic in isLive() is too much for it. memset(Stack, 0, sizeof(Stack)); @@ -59,6 +61,7 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); + AU.addRequired<EdgeBundles>(); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); MachineFunctionPass::getAnalysisUsage(AU); @@ -94,7 +97,7 @@ namespace { // FixStack[i] == getStackEntry(i) for all i < FixCount. unsigned char FixStack[8]; - LiveBundle(unsigned m = 0) : Mask(m), FixCount(0) {} + LiveBundle() : Mask(0), FixCount(0) {} // Have the live registers been assigned a stack order yet? bool isFixed() const { return !Mask || FixCount; } @@ -104,10 +107,8 @@ namespace { // with no live FP registers. SmallVector<LiveBundle, 8> LiveBundles; - // Map each MBB in the current function to an (ingoing, outgoing) index into - // LiveBundles. Blocks with no FP registers live in or out map to (0, 0) - // and are not actually stored in the map. - DenseMap<MachineBasicBlock*, std::pair<unsigned, unsigned> > BlockBundle; + // The edge bundle analysis provides indices into the LiveBundles vector. + EdgeBundles *Bundles; // Return a bitmask of FP registers in block's live-in list. unsigned calcLiveInMask(MachineBasicBlock *MBB) { @@ -284,6 +285,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Early exit. if (!FPIsUsed) return false; + Bundles = &getAnalysis<EdgeBundles>(); TII = MF.getTarget().getInstrInfo(); // Prepare cross-MBB liveness. @@ -308,7 +310,6 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { if (Processed.insert(BB)) Changed |= processBasicBlock(MF, *BB); - BlockBundle.clear(); LiveBundles.clear(); return Changed; @@ -321,90 +322,16 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { /// registers may be implicitly defined, or not used by all successors. void FPS::bundleCFG(MachineFunction &MF) { assert(LiveBundles.empty() && "Stale data in LiveBundles"); - assert(BlockBundle.empty() && "Stale data in BlockBundle"); - SmallPtrSet<MachineBasicBlock*, 8> PropDown, PropUp; + LiveBundles.resize(Bundles->getNumBundles()); - // LiveBundle[0] is the empty live-in set. - LiveBundles.resize(1); - - // First gather the actual live-in masks for all MBBs. + // Gather the actual live-in masks for all MBBs. for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { MachineBasicBlock *MBB = I; const unsigned Mask = calcLiveInMask(MBB); if (!Mask) continue; - // Ingoing bundle index. - unsigned &Idx = BlockBundle[MBB].first; - // Already assigned an ingoing bundle? - if (Idx) - continue; - // Allocate a new LiveBundle struct for this block's live-ins. 
- const unsigned BundleIdx = Idx = LiveBundles.size(); - DEBUG(dbgs() << "Creating LB#" << BundleIdx << ": in:BB#" - << MBB->getNumber()); - LiveBundles.push_back(Mask); - LiveBundle &Bundle = LiveBundles.back(); - - // Make sure all predecessors have the same live-out set. - PropUp.insert(MBB); - - // Keep pushing liveness up and down the CFG until convergence. - // Only critical edges cause iteration here, but when they do, multiple - // blocks can be assigned to the same LiveBundle index. - do { - // Assign BundleIdx as liveout from predecessors in PropUp. - for (SmallPtrSet<MachineBasicBlock*, 16>::iterator I = PropUp.begin(), - E = PropUp.end(); I != E; ++I) { - MachineBasicBlock *MBB = *I; - for (MachineBasicBlock::const_pred_iterator LinkI = MBB->pred_begin(), - LinkE = MBB->pred_end(); LinkI != LinkE; ++LinkI) { - MachineBasicBlock *PredMBB = *LinkI; - // PredMBB's liveout bundle should be set to LIIdx. - unsigned &Idx = BlockBundle[PredMBB].second; - if (Idx) { - assert(Idx == BundleIdx && "Inconsistent CFG"); - continue; - } - Idx = BundleIdx; - DEBUG(dbgs() << " out:BB#" << PredMBB->getNumber()); - // Propagate to siblings. - if (PredMBB->succ_size() > 1) - PropDown.insert(PredMBB); - } - } - PropUp.clear(); - - // Assign BundleIdx as livein to successors in PropDown. - for (SmallPtrSet<MachineBasicBlock*, 16>::iterator I = PropDown.begin(), - E = PropDown.end(); I != E; ++I) { - MachineBasicBlock *MBB = *I; - for (MachineBasicBlock::const_succ_iterator LinkI = MBB->succ_begin(), - LinkE = MBB->succ_end(); LinkI != LinkE; ++LinkI) { - MachineBasicBlock *SuccMBB = *LinkI; - // LinkMBB's livein bundle should be set to BundleIdx. - unsigned &Idx = BlockBundle[SuccMBB].first; - if (Idx) { - assert(Idx == BundleIdx && "Inconsistent CFG"); - continue; - } - Idx = BundleIdx; - DEBUG(dbgs() << " in:BB#" << SuccMBB->getNumber()); - // Propagate to siblings. - if (SuccMBB->pred_size() > 1) - PropUp.insert(SuccMBB); - // Also accumulate the bundle liveness mask from the liveins here. - Bundle.Mask |= calcLiveInMask(SuccMBB); - } - } - PropDown.clear(); - } while (!PropUp.empty()); - DEBUG({ - dbgs() << " live:"; - for (unsigned i = 0; i < 8; ++i) - if (Bundle.Mask & (1<<i)) - dbgs() << " %FP" << i; - dbgs() << '\n'; - }); + // Update MBB ingoing bundle mask. + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask; } } @@ -492,13 +419,15 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { return Changed; } -/// setupBlockStack - Use the BlockBundle map to set up our model of the stack +/// setupBlockStack - Use the live bundles to set up our model of the stack /// to match predecessors' live out stack. void FPS::setupBlockStack() { DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber() << " derived from " << MBB->getName() << ".\n"); StackTop = 0; - const LiveBundle &Bundle = LiveBundles[BlockBundle.lookup(MBB).first]; + // Get the live-in bundle for MBB. + const LiveBundle &Bundle = + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)]; if (!Bundle.Mask) { DEBUG(dbgs() << "Block has no FP live-ins.\n"); @@ -535,7 +464,8 @@ void FPS::finishBlockStack() { DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber() << " derived from " << MBB->getName() << ".\n"); - unsigned BundleIdx = BlockBundle.lookup(MBB).second; + // Get MBB's live-out bundle. + unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true); LiveBundle &Bundle = LiveBundles[BundleIdx]; // We may need to kill and define some registers to match successors. 
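The rewrite above drops the hand-rolled BlockBundle fixed-point propagation in favor of the EdgeBundles analysis. A compressed sketch of the new flow, assuming an EdgeBundles-like interface (an illustration, not the pass itself): every block's ingoing side maps to a bundle id, and each block's FP live-in mask is simply ORed into its bundle.

#include <algorithm>
#include <cstddef>
#include <vector>

// Stand-in for llvm::EdgeBundles: per-block bundle ids for each edge side.
struct BundlesSketch {
  std::vector<unsigned> InId, OutId;
  unsigned getBundle(unsigned BB, bool Out) const {
    return Out ? OutId[BB] : InId[BB];
  }
  unsigned getNumBundles() const {
    unsigned N = 0;
    for (unsigned Id : InId)  N = std::max(N, Id + 1);
    for (unsigned Id : OutId) N = std::max(N, Id + 1);
    return N;
  }
};

// Equivalent of the rewritten bundleCFG(): no iteration to convergence is
// needed, because the bundle analysis already merged blocks that share a
// critical edge into the same bundle.
std::vector<unsigned> gatherBundleMasks(const BundlesSketch &Bundles,
                                        const std::vector<unsigned> &LiveIn) {
  std::vector<unsigned> Mask(Bundles.getNumBundles(), 0);
  for (std::size_t BB = 0; BB != LiveIn.size(); ++BB)
    Mask[Bundles.getBundle(unsigned(BB), /*Out=*/false)] |= LiveIn[BB];
  return Mask;
}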
diff --git a/lib/Target/X86/X86FrameInfo.h b/lib/Target/X86/X86FrameInfo.h deleted file mode 100644 index 7828e17..0000000 --- a/lib/Target/X86/X86FrameInfo.h +++ /dev/null @@ -1,46 +0,0 @@ -//===-- X86TargetFrameInfo.h - Define TargetFrameInfo for X86 ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef X86_FRAMEINFO_H -#define X86_FRAMEINFO_H - -#include "X86Subtarget.h" -#include "llvm/Target/TargetFrameInfo.h" - -namespace llvm { - class MCSymbol; - -class X86FrameInfo : public TargetFrameInfo { -protected: - const X86Subtarget &STI; - -public: - explicit X86FrameInfo(const X86Subtarget &sti) - : TargetFrameInfo(StackGrowsDown, - sti.getStackAlignment(), - (sti.isTargetWin64() ? -40 : (sti.is64Bit() ? -8 : -4))), - STI(sti) { - } - - void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label, - unsigned FramePtr) const; - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - void emitPrologue(MachineFunction &MF) const; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/X86/X86FrameInfo.cpp b/lib/Target/X86/X86FrameLowering.cpp index 98c579b..3246c43 100644 --- a/lib/Target/X86/X86FrameInfo.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -1,4 +1,4 @@ -//=======- X86FrameInfo.cpp - X86 Frame Information ------------*- C++ -*-====// +//=======- X86FrameLowering.cpp - X86 Frame Information ------------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -7,14 +7,15 @@ // //===----------------------------------------------------------------------===// // -// This file contains the X86 implementation of TargetFrameInfo class. +// This file contains the X86 implementation of TargetFrameLowering class. // //===----------------------------------------------------------------------===// -#include "X86FrameInfo.h" +#include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" #include "llvm/Function.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -24,12 +25,33 @@ #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" +#include "llvm/ADT/SmallSet.h" using namespace llvm; // FIXME: completely move here. extern cl::opt<bool> ForceStackAlign; +bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. 
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineModuleInfo &MMI = MF.getMMI(); + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + + return (DisableFramePointerElim(MF) || + RI->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken() || + MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MMI.callsUnwindInit()); +} + static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { if (is64Bit) { if (isInt<8>(Imm)) @@ -54,12 +76,70 @@ static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { } } +/// findDeadCallerSavedReg - Return a caller-saved register that isn't live +/// when it reaches the "return" instruction. We can then pop a stack object +/// to this register without worry about clobbering it. +static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetRegisterInfo &TRI, + bool Is64Bit) { + const MachineFunction *MF = MBB.getParent(); + const Function *F = MF->getFunction(); + if (!F || MF->getMMI().callsEHReturn()) + return 0; + + static const unsigned CallerSavedRegs32Bit[] = { + X86::EAX, X86::EDX, X86::ECX + }; + + static const unsigned CallerSavedRegs64Bit[] = { + X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, + X86::R8, X86::R9, X86::R10, X86::R11 + }; + + unsigned Opc = MBBI->getOpcode(); + switch (Opc) { + default: return 0; + case X86::RET: + case X86::RETI: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: { + SmallSet<unsigned, 8> Uses; + for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MBBI->getOperand(i); + if (!MO.isReg() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + for (const unsigned *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI) + Uses.insert(*AsI); + } + + const unsigned *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; + for (; *CS; ++CS) + if (!Uses.count(*CS)) + return *CS; + } + } + + return 0; +} + + /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, int64_t NumBytes, bool Is64Bit, - const TargetInstrInfo &TII) { + unsigned StackPtr, int64_t NumBytes, + bool Is64Bit, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; unsigned Opc = isSub ? @@ -70,10 +150,26 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, while (Offset) { uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; + if (ThisVal == (Is64Bit ? 8 : 4)) { + // Use push / pop instead. + unsigned Reg = isSub + ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) + : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); + if (Reg) { + Opc = isSub + ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) + : (Is64Bit ? X86::POP64r : X86::POP32r); + BuildMI(MBB, MBBI, DL, TII.get(Opc)) + .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); + Offset -= ThisVal; + continue; + } + } + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr) - .addImm(ThisVal); + .addReg(StackPtr) + .addImm(ThisVal); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. 
Offset -= ThisVal; } @@ -180,12 +276,10 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -void X86FrameInfo::emitCalleeSavedFrameMoves(MachineFunction &MF, +void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label, unsigned FramePtr) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); MachineModuleInfo &MMI = MF.getMMI(); // Add callee saved registers to move list. @@ -193,14 +287,11 @@ void X86FrameInfo::emitCalleeSavedFrameMoves(MachineFunction &MF, if (CSI.empty()) return; std::vector<MachineMove> &Moves = MMI.getFrameMoves(); - const TargetData *TD = MF.getTarget().getTargetData(); - bool HasFP = RegInfo->hasFP(MF); + const TargetData *TD = TM.getTargetData(); + bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. - int stackGrowth = - (MF.getTarget().getFrameInfo()->getStackGrowthDirection() == - TargetFrameInfo::StackGrowsUp ? - TD->getPointerSize() : -TD->getPointerSize()); + int stackGrowth = -TD->getPointerSize(); // FIXME: This is dirty hack. The code itself is pretty mess right now. // It should be rewritten from scratch and generalized sometimes. @@ -227,7 +318,7 @@ void X86FrameInfo::emitCalleeSavedFrameMoves(MachineFunction &MF, // move" for this extra "PUSH", the linker will lose track of the fact that // the frame pointer should have the value of the first "PUSH" when it's // trying to unwind. - // + // // FIXME: This looks inelegant. It's possibly correct, but it's covering up // another bug. I.e., one where we generate a prolog like this: // @@ -253,23 +344,20 @@ void X86FrameInfo::emitCalleeSavedFrameMoves(MachineFunction &MF, /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to /// generate the exception handling frames. -void X86FrameInfo::emitPrologue(MachineFunction &MF) const { +void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86Subtarget *Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); - const X86InstrInfo &TII = - *static_cast<const X86InstrInfo*>(MF.getTarget().getInstrInfo()); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86InstrInfo &TII = *TM.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || !Fn->doesNotThrow() || UnwindTablesMandatory; uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. - bool HasFP = RegInfo->hasFP(MF); + bool HasFP = hasFP(MF); bool Is64Bit = STI.is64Bit(); bool IsWin64 = STI.isTargetWin64(); unsigned StackAlign = getStackAlignment(); @@ -309,11 +397,6 @@ void X86FrameInfo::emitPrologue(MachineFunction &MF) const { if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI->setStackSize(StackSize); - } else if (IsWin64) { - // We need to always allocate 32 bytes as register spill area. 
- // FIXME: We might reuse these 32 bytes for leaf functions. - StackSize += 32; - MFI->setStackSize(StackSize); } // Insert stack pointer adjustment for later moving of return addr. Only @@ -376,7 +459,6 @@ void X86FrameInfo::emitPrologue(MachineFunction &MF) const { MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth); Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); } else { - // FIXME: Verify & implement for FP MachineLocation SPDst(StackPtr); MachineLocation SPSrc(StackPtr, stackGrowth); Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); @@ -470,12 +552,15 @@ void X86FrameInfo::emitPrologue(MachineFunction &MF) const { // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. if (NumBytes >= 4096 && - (Subtarget->isTargetCygMing() || Subtarget->isTargetWin32())) { + (STI.isTargetCygMing() || STI.isTargetWin32()) && + !STI.isTargetEnvMacho()) { // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); const char *StackProbeSymbol = - Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; + STI.isTargetWindows() ? "_chkstk" : "_alloca"; + if (Is64Bit && STI.isTargetCygMing()) + StackProbeSymbol = "__chkstk"; unsigned CallOp = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; if (!isEAXAlive) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) @@ -504,9 +589,11 @@ void X86FrameInfo::emitPrologue(MachineFunction &MF) const { StackPtr, false, NumBytes - 4); MBB.insert(MBBI, MI); } - } else if (NumBytes >= 4096 && Subtarget->isTargetWin64()) { + } else if (NumBytes >= 4096 && + STI.isTargetWin64() && + !STI.isTargetEnvMacho()) { // Sanity check that EAX is not livein for this function. It should - // should not be, so throw an assert. + // not be, so throw an assert. assert(!isEAXLiveIn(MF) && "EAX is livein in the Win64 case!"); // Handle the 64-bit Windows ABI case where we need to call __chkstk. @@ -516,9 +603,11 @@ void X86FrameInfo::emitPrologue(MachineFunction &MF) const { BuildMI(MBB, MBBI, DL, TII.get(X86::WINCALL64pcrel32)) .addExternalSymbol("__chkstk") .addReg(StackPtr, RegState::Define | RegState::Implicit); - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII); + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + TII, *RegInfo); } else if (NumBytes) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII); + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + TII, *RegInfo); if ((NumBytes || PushedRegs) && needsFrameMoves) { // Mark end of stack pointer adjustment. 
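The 4096-byte threshold in the stack-probe calls above comes from the Windows stack model: pages below the stack are committed lazily behind a guard page, so an allocation larger than one page has to touch every page in address order instead of moving the stack pointer in a single jump. A conceptual model of what a _chkstk-style probe does (not the real runtime routine):

    #include <cstddef>
    #include <cstdio>

    // Touch each 4 KB page of a downward-growing allocation in order, so the
    // OS can extend the committed region one guard page at a time.
    void probePages(volatile unsigned char *sp, std::size_t Size) {
      const std::size_t PageSize = 4096;
      for (std::size_t Off = PageSize; Off <= Size; Off += PageSize)
        sp[-static_cast<std::ptrdiff_t>(Off)] = 0;  // touch the page
    }

    int main() {
      static unsigned char stack[16384];
      probePages(stack + sizeof(stack), 8192);      // touches two pages
      std::printf("probed\n");
    }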
@@ -533,7 +622,6 @@ void X86FrameInfo::emitPrologue(MachineFunction &MF) const { -StackSize + stackGrowth); Moves.push_back(MachineMove(Label, SPDst, SPSrc)); } else { - // FIXME: Verify & implement for FP MachineLocation SPDst(StackPtr); MachineLocation SPSrc(StackPtr, stackGrowth); Moves.push_back(MachineMove(Label, SPDst, SPSrc)); @@ -546,15 +634,14 @@ void X86FrameInfo::emitPrologue(MachineFunction &MF) const { } } -void X86FrameInfo::emitEpilogue(MachineFunction &MF, +void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); - const X86InstrInfo &TII = - *static_cast<const X86InstrInfo*>(MF.getTarget().getInstrInfo()); - MachineBasicBlock::iterator MBBI = prior(MBB.end()); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86InstrInfo &TII = *TM.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); bool Is64Bit = STI.is64Bit(); @@ -596,7 +683,7 @@ void X86FrameInfo::emitEpilogue(MachineFunction &MF, MaxAlign = MaxAlign ? MaxAlign : 4; } - if (RegInfo->hasFP(MF)) { + if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; if (RegInfo->needsStackRealignment(MF)) @@ -617,7 +704,7 @@ void X86FrameInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PI = prior(MBBI); unsigned Opc = PI->getOpcode(); - if (Opc != X86::POP32r && Opc != X86::POP64r && + if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && !PI->getDesc().isTerminator()) break; @@ -638,7 +725,7 @@ void X86FrameInfo::emitEpilogue(MachineFunction &MF, // We cannot use LEA here, because stack pointer was realigned. We need to // deallocate local frame back. if (CSSize) { - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo); MBBI = prior(LastCSPop); } @@ -659,12 +746,12 @@ void X86FrameInfo::emitEpilogue(MachineFunction &MF, } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo); } // We're returning from function via eh_return. if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { - MBBI = prior(MBB.end()); + MBBI = MBB.getLastNonDebugInstr(); MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); BuildMI(MBB, MBBI, DL, @@ -676,7 +763,7 @@ void X86FrameInfo::emitEpilogue(MachineFunction &MF, RetOpcode == X86::TCRETURNmi64) { bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; // Tail call return: adjust the stack pointer and jump to callee. - MBBI = prior(MBB.end()); + MBBI = MBB.getLastNonDebugInstr(); MachineOperand &JumpTarget = MBBI->getOperand(0); MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); assert(StackAdjust.isImm() && "Expecting immediate value."); @@ -694,15 +781,22 @@ void X86FrameInfo::emitEpilogue(MachineFunction &MF, if (Offset) { // Check for possible merge with preceding ADD instruction.
Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII); + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo); } // Jump to label or value in register. if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) - ? X86::TAILJMPd : X86::TAILJMPd64)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) + ? X86::TAILJMPd : X86::TAILJMPd64)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi) @@ -727,10 +821,196 @@ void X86FrameInfo::emitEpilogue(MachineFunction &MF, (X86FI->getTCReturnAddrDelta() < 0)) { // Add the return addr area delta back since we are not tail calling. int delta = -1*X86FI->getTCReturnAddrDelta(); - MBBI = prior(MBB.end()); + MBBI = MBB.getLastNonDebugInstr(); // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII); + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo); + } +} + +void +X86FrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const { + // Calculate the number of bytes used for storing the return address + int stackGrowth = (STI.is64Bit() ? -8 : -4); + const X86RegisterInfo *RI = TM.getRegisterInfo(); + + // Initial state of the frame pointer is esp+stackGrowth. + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(RI->getStackRegister(), stackGrowth); + Moves.push_back(MachineMove(0, Dst, Src)); + + // Add return address to move list + MachineLocation CSDst(RI->getStackRegister(), stackGrowth); + MachineLocation CSSrc(RI->getRARegister()); + Moves.push_back(MachineMove(0, CSDst, CSSrc)); +} + +int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { + const X86RegisterInfo *RI = + static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + uint64_t StackSize = MFI->getStackSize(); + + if (RI->needsStackRealignment(MF)) { + if (FI < 0) { + // Skip the saved EBP. + Offset += RI->getSlotSize(); + } else { + unsigned Align = MFI->getObjectAlignment(FI); + assert((-(Offset + StackSize)) % Align == 0); + Align = 0; + return Offset + StackSize; + } + // FIXME: Support tail calls + } else { + if (!hasFP(MF)) + return Offset + StackSize; + + // Skip the saved EBP.
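getInitialFrameState above encodes the state at function entry: the CALL has already pushed the return address, so both initial moves sit one slot away from the stack register, and stackGrowth is negative because the x86 stack grows down. A small sketch of the offset arithmetic (the MachineMove source/destination conventions themselves are LLVM-internal, so only the numbers are shown):

    #include <cstdio>

    // On entry, one slot (the return address) already separates the stack
    // register from the caller's frame: -8 on x86-64, -4 on x86-32.
    int main() {
      const bool modes[] = {false, true};
      for (bool is64Bit : modes) {
        int slotSize    = is64Bit ? 8 : 4;
        int stackGrowth = -slotSize;   // matches (STI.is64Bit() ? -8 : -4)
        std::printf("%d-bit: stackGrowth=%d, return-address slot=%d bytes\n",
                    is64Bit ? 64 : 32, stackGrowth, slotSize);
      }
    }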
+ Offset += RI->getSlotSize(); + + // Skip the RETADDR move area + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + Offset -= TailCallReturnAddrDelta; + } + + return Offset; +} + +bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + + unsigned SlotSize = STI.is64Bit() ? 8 : 4; + unsigned FPReg = TRI->getFrameRegister(MF); + unsigned CalleeFrameSize = 0; + + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + + // Push GPRs. It increases frame size. + unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + if (Reg == FPReg) + // X86RegisterInfo::emitPrologue will handle spilling of frame register. + continue; + CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill); + } + + X86FI->setCalleeSavedFrameSize(CalleeFrameSize); + + // Spill XMM regs. X86 has no push/pop instructions for XMM registers, + // so they must be spilled to the stack frame instead. + // Note that only the Win64 ABI might spill XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), + RC, TRI); + } + + return true; +} + +bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // Reload XMMs from stack frame. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), + RC, TRI); + } + + // POP GPRs. + unsigned FPReg = TRI->getFrameRegister(MF); + unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + if (Reg == FPReg) + // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
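The frame-pointer path of getFrameIndexOffset above reduces to three steps: take the object's offset relative to the local area, skip the saved EBP/RBP slot, and account for the return-address move area when a tail-call delta was reserved. A simplified standalone model (the values in main are illustrative):

    #include <cstdio>

    // Simplified model of the hasFP(MF), no-realignment path above.
    int frameIndexOffset(int ObjectOffset, int OffsetOfLocalArea, int SlotSize,
                         int TailCallReturnAddrDelta) {
      int Offset = ObjectOffset - OffsetOfLocalArea;
      Offset += SlotSize;                  // skip the saved EBP/RBP
      if (TailCallReturnAddrDelta < 0)     // skip the RETADDR move area
        Offset -= TailCallReturnAddrDelta;
      return Offset;
    }

    int main() {
      // e.g. a local at object offset -24, local area at -8, 8-byte slots:
      std::printf("%d\n", frameIndexOffset(-24, -8, 8, 0));  // -8 from RBP
    }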
+ continue; + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + } + return true; +} + +void +X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + unsigned SlotSize = RegInfo->getSlotSize(); + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + + if (TailCallReturnAddrDelta < 0) { + // create RETURNADDR area + // arg + // arg + // RETADDR + // { ... + // RETADDR area + // ... + // } + // [EBP] + MFI->CreateFixedObject(-TailCallReturnAddrDelta, + (-1U*SlotSize)+TailCallReturnAddrDelta, true); + } + + if (hasFP(MF)) { + assert((TailCallReturnAddrDelta <= 0) && + "The Delta should always be zero or negative"); + const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); + + // Create a frame entry for the EBP register that must be saved. + int FrameIdx = MFI->CreateFixedObject(SlotSize, + -(int)SlotSize + + TFI.getOffsetOfLocalArea() + + TailCallReturnAddrDelta, + true); + assert(FrameIdx == MFI->getObjectIndexBegin() && + "Slot for EBP register must be last in order to be found!"); + FrameIdx = 0; } } diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h new file mode 100644 index 0000000..d71108c --- /dev/null +++ b/lib/Target/X86/X86FrameLowering.h @@ -0,0 +1,65 @@ +//=-- X86TargetFrameLowering.h - Define frame lowering for X86 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class implements X86-specific bits of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_FRAMELOWERING_H +#define X86_FRAMELOWERING_H + +#include "X86Subtarget.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + class MCSymbol; + class X86TargetMachine; + +class X86FrameLowering : public TargetFrameLowering { + const X86TargetMachine &TM; + const X86Subtarget &STI; +public: + explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti) + : TargetFrameLowering(StackGrowsDown, + sti.getStackAlignment(), + (sti.is64Bit() ? -8 : -4)), + TM(tm), STI(sti) { + } + + void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label, + unsigned FramePtr) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. 
+ void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool hasFP(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; + + void getInitialFrameState(std::vector<MachineMove> &Moves) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 0226278..9b0ec6e 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -506,7 +506,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { N->getOperand(0), MemTmp, MachinePointerInfo(), MemVT, false, false, 0); - SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, DstVT, dl, Store, MemTmp, + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, MachinePointerInfo(), MemVT, false, false, 0); @@ -530,9 +530,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() { void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI) { const TargetInstrInfo *TII = TM.getInstrInfo(); - if (Subtarget->isTargetCygMing()) + if (Subtarget->isTargetCygMing()) { + unsigned CallOp = + Subtarget->is64Bit() ? X86::WINCALL64pcrel32 : X86::CALLpcrel32; BuildMI(BB, DebugLoc(), - TII->get(X86::CALLpcrel32)).addExternalSymbol("__main"); + TII->get(CallOp)).addExternalSymbol("__main"); + } } void X86DAGToDAGISel::EmitFunctionEntryCode() { @@ -682,25 +685,6 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { return false; } -/// isLogicallyAddWithConstant - Return true if this node is semantically an -/// add of a value with a constantint. -static bool isLogicallyAddWithConstant(SDValue V, SelectionDAG *CurDAG) { - // Check for (add x, Cst) - if (V->getOpcode() == ISD::ADD) - return isa<ConstantSDNode>(V->getOperand(1)); - - // Check for (or x, Cst), where Cst & x == 0. - if (V->getOpcode() != ISD::OR || - !isa<ConstantSDNode>(V->getOperand(1))) - return false; - - // Handle "X | C" as "X + C" iff X is known to have C bits clear. - ConstantSDNode *CN = cast<ConstantSDNode>(V->getOperand(1)); - - // Check to see if the LHS & C is zero. - return CurDAG->MaskedValueIsZero(V->getOperand(0), CN->getAPIntValue()); -} - bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { bool is64Bit = Subtarget->is64Bit(); @@ -786,7 +770,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. 
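The folding that the comment above describes rests on distributing the scale: (X + C) << S equals (X << S) + (C << S), so X can stay in the index register while C << S migrates into the displacement field. A quick standalone check of the identity:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t X = 5, C = 3;
      unsigned S = 2;                           // scale factor of 4
      int64_t scaledSum = (X + C) << S;         // what the DAG computes
      int64_t folded    = (X << S) + (C << S);  // index*scale + disp
      std::printf("%lld == %lld\n", (long long)scaledSum, (long long)folded);
    }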
- if (isLogicallyAddWithConstant(ShVal, CurDAG)) { + if (CurDAG->isBaseWithConstantOffset(ShVal)) { AM.IndexReg = ShVal.getNode()->getOperand(0); ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); @@ -930,24 +914,18 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); - SDValue LHS = Handle.getValue().getNode()->getOperand(0); - SDValue RHS = Handle.getValue().getNode()->getOperand(1); X86ISelAddressMode Backup = AM; - if (!MatchAddressRecursively(LHS, AM, Depth+1) && - !MatchAddressRecursively(RHS, AM, Depth+1)) + if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) return false; AM = Backup; - LHS = Handle.getValue().getNode()->getOperand(0); - RHS = Handle.getValue().getNode()->getOperand(1); - + // Try again after commuting the operands. - if (!MatchAddressRecursively(RHS, AM, Depth+1) && - !MatchAddressRecursively(LHS, AM, Depth+1)) + if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& + !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) return false; AM = Backup; - LHS = Handle.getValue().getNode()->getOperand(0); - RHS = Handle.getValue().getNode()->getOperand(1); // If we couldn't fold both operands into the address at the same time, // see if we can just put each operand into a register and fold at least @@ -955,17 +933,19 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode() && !AM.IndexReg.getNode()) { - AM.Base_Reg = LHS; - AM.IndexReg = RHS; + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); AM.Scale = 1; return false; } + N = Handle.getValue(); break; } case ISD::OR: // Handle "X | C" as "X + C" iff X is known to have C bits clear. 
- if (isLogicallyAddWithConstant(N, CurDAG)) { + if (CurDAG->isBaseWithConstantOffset(N)) { X86ISelAddressMode Backup = AM; ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1)); uint64_t Offset = CN->getSExtValue(); @@ -1600,7 +1580,32 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return RetVal; break; } - + case X86ISD::UMUL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + unsigned LoReg; + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; + case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; + case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break; + case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break; + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, 2); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); + return NULL; + } + case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue N0 = Node->getOperand(0); @@ -1650,14 +1655,15 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; SDNode *CNode = - CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops, + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops, array_lengthof(Ops)); InFlag = SDValue(CNode, 1); + // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); } else { - InFlag = - SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag); + InFlag = SDValue(CNode, 0); } // Prevent use of AH in a REX instruction by referencing AX instead. @@ -1696,7 +1702,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { ReplaceUses(SDValue(Node, 1), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } - + return NULL; } @@ -1781,7 +1787,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if (isSigned && !signBitIsZero) { // Sign extend the low part into the high part. InFlag = - SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Flag, InFlag),0); + SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. SDValue ClrNode = @@ -1795,14 +1801,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; SDNode *CNode = - CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops, + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops, array_lengthof(Ops)); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); } else { InFlag = - SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0); + SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); } // Prevent use of AH in a REX instruction by referencing AX instead. 
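The X86ISD::UMUL selection above reflects how x86 widening multiplies work: one factor and the result are pinned to fixed registers (AL/AX/EAX/RAX, with the high half landing in AH or the DX family for the wider forms), and the flags report whether the product overflowed the low half. A portable model of the 8-bit case:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t al = 200, src = 3;
      uint16_t ax = uint16_t(uint16_t(al) * src);  // MUL r/m8: AX = AL * r/m8
      bool overflow = (ax >> 8) != 0;              // CF/OF iff high half nonzero
      std::printf("AX=%u overflow=%d\n", unsigned(ax), int(overflow));
    }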
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 6233af5..c21aa43 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16,9 +16,9 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86ISelLowering.h" -#include "X86ShuffleDecode.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" +#include "Utils/X86ShuffleDecode.h" #include "llvm/CallingConv.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" @@ -28,6 +28,7 @@ #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" +#include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -44,7 +45,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/VectorExtras.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" @@ -55,43 +55,171 @@ using namespace dwarf; STATISTIC(NumTailCalls, "Number of tail calls"); -static cl::opt<bool> -DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); - // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, SDValue V2); -static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { +static SDValue Insert128BitVector(SDValue Result, + SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl); + +static SDValue Extract128BitVector(SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl); + +static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG); + + +/// Generate a DAG to grab 128 bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 instruction or a +/// simple subregister reference. Idx is an index in the 128 bits we +/// want. It need not be aligned to a 128-bit boundary. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl) { + EVT VT = Vec.getValueType(); + assert(VT.getSizeInBits() == 256 && "Unexpected vector size!"); + + EVT ElVT = VT.getVectorElementType(); + + int Factor = VT.getSizeInBits() / 128; + + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), + ElVT, + VT.getVectorNumElements() / Factor); + + // Extract from UNDEF is UNDEF. + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getNode(ISD::UNDEF, dl, ResultVT); + + if (isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR + // we can match to VEXTRACTF128. + unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + + SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, + VecIdx); + + return Result; + } + + return SDValue(); +} + +/// Generate a DAG to put 128 bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128 instruction or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit boundary. That makes +/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, + SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl) { + if (isa<ConstantSDNode>(Idx)) { + EVT VT = Vec.getValueType(); + assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); + + EVT ElVT = VT.getVectorElementType(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + EVT ResultVT = Result.getValueType(); - bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); + // Insert the relevant 128 bits. + unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); - if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) { - if (is64Bit) return new X8664_MachoTargetObjectFile(); + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + + Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, + VecIdx); + return Result; + } + + return SDValue(); +} + +/// Given two vectors, concat them. +static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG) { + DebugLoc dl = Lower.getDebugLoc(); + + assert(Lower.getValueType() == Upper.getValueType() && "Mismatched vectors!"); + + EVT VT = EVT::getVectorVT(*DAG.getContext(), + Lower.getValueType().getVectorElementType(), + Lower.getValueType().getVectorNumElements() * 2); + + // TODO: Generalize to arbitrary vector length (this assumes 256-bit vectors). + assert(VT.getSizeInBits() == 256 && "Unsupported vector concat!"); + + // Insert the upper subvector. + SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper, + DAG.getConstant( + // This is half the length of the result + // vector. Start inserting the upper 128 + // bits here. + Lower.getValueType().getVectorNumElements(), + MVT::i32), + DAG, dl); + + // Insert the lower subvector. + Vec = Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32), DAG, dl); + return Vec; +} + +static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + bool is64Bit = Subtarget->is64Bit(); + + if (Subtarget->isTargetEnvMacho()) { + if (is64Bit) + return new X8664_MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); - } else if (TM.getSubtarget<X86Subtarget>().isTargetELF() ){ - if (is64Bit) return new X8664_ELFTargetObjectFile(TM); + } + + if (Subtarget->isTargetELF()) { + if (is64Bit) + return new X8664_ELFTargetObjectFile(TM); return new X8632_ELFTargetObjectFile(TM); - } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) { - return new TargetLoweringObjectFileCOFF(); } + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) + return new TargetLoweringObjectFileCOFF(); llvm_unreachable("unknown subtarget type"); } X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) : TargetLowering(TM, createTLOF(TM)) { Subtarget = &TM.getSubtarget<X86Subtarget>(); - X86ScalarSSEf64 = Subtarget->hasSSE2(); - X86ScalarSSEf32 = Subtarget->hasSSE1(); + X86ScalarSSEf64 = Subtarget->hasXMMInt(); + X86ScalarSSEf32 = Subtarget->hasXMM(); X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); TD = getTargetData(); // Set up the TargetLowering object. + static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird, it always uses i8 for shift amounts and setcc results. 
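The chunk arithmetic shared by Extract128BitVector and Insert128BitVector above rounds an arbitrary element index down to the first element of the 128-bit lane containing it, which is exactly what NormalizedIdxVal computes. It is easy to sanity-check in isolation:

    #include <cstdio>

    // Same computation as NormalizedIdxVal: chunk number times elements per
    // 128-bit chunk.
    unsigned normalizeIdx(unsigned IdxVal, unsigned EltBits) {
      unsigned ElemsPerChunk = 128 / EltBits;
      return ((IdxVal * EltBits) / 128) * ElemsPerChunk;
    }

    int main() {
      // v8f32 (32-bit elements): indices 0..3 map to base 0, 4..7 to base 4.
      for (unsigned i = 0; i < 8; ++i)
        std::printf("elt %u -> subvector base %u\n", i, normalizeIdx(i, 32));
    }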
- setShiftAmountType(MVT::i8); setBooleanContents(ZeroOrOneBooleanContent); setSchedulingPreference(Sched::RegPressure); setStackPointerRegisterToSaveRestore(X86StackPtr); @@ -226,12 +354,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { - setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); - setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); + setOperationAction(ISD::BITCAST , MVT::f32 , Expand); + setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget->is64Bit()) { - setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand); + setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. - setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand); + setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } @@ -245,30 +373,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - setOperationAction(ISD::MULHS , MVT::i8 , Expand); - setOperationAction(ISD::MULHU , MVT::i8 , Expand); - setOperationAction(ISD::SDIV , MVT::i8 , Expand); - setOperationAction(ISD::UDIV , MVT::i8 , Expand); - setOperationAction(ISD::SREM , MVT::i8 , Expand); - setOperationAction(ISD::UREM , MVT::i8 , Expand); - setOperationAction(ISD::MULHS , MVT::i16 , Expand); - setOperationAction(ISD::MULHU , MVT::i16 , Expand); - setOperationAction(ISD::SDIV , MVT::i16 , Expand); - setOperationAction(ISD::UDIV , MVT::i16 , Expand); - setOperationAction(ISD::SREM , MVT::i16 , Expand); - setOperationAction(ISD::UREM , MVT::i16 , Expand); - setOperationAction(ISD::MULHS , MVT::i32 , Expand); - setOperationAction(ISD::MULHU , MVT::i32 , Expand); - setOperationAction(ISD::SDIV , MVT::i32 , Expand); - setOperationAction(ISD::UDIV , MVT::i32 , Expand); - setOperationAction(ISD::SREM , MVT::i32 , Expand); - setOperationAction(ISD::UREM , MVT::i32 , Expand); - setOperationAction(ISD::MULHS , MVT::i64 , Expand); - setOperationAction(ISD::MULHU , MVT::i64 , Expand); - setOperationAction(ISD::SDIV , MVT::i64 , Expand); - setOperationAction(ISD::UDIV , MVT::i64 , Expand); - setOperationAction(ISD::SREM , MVT::i64 , Expand); - setOperationAction(ISD::UREM , MVT::i64 , Expand); + for (unsigned i = 0, e = 4; i != e; ++i) { + MVT VT = IntVTs[i]; + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + + // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 
+ setOperationAction(ISD::ADDC, VT, Custom); + setOperationAction(ISD::ADDE, VT, Custom); + setOperationAction(ISD::SUBC, VT, Custom); + setOperationAction(ISD::SUBE, VT, Custom); + } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); @@ -285,21 +404,27 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); - setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTTZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i8 , Custom); - setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); - setOperationAction(ISD::CTPOP , MVT::i32 , Expand); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) { - setOperationAction(ISD::CTPOP , MVT::i64 , Expand); setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ , MVT::i64 , Custom); } + if (Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP , MVT::i8 , Promote); + } else { + setOperationAction(ISD::CTPOP , MVT::i8 , Expand); + setOperationAction(ISD::CTPOP , MVT::i16 , Expand); + setOperationAction(ISD::CTPOP , MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + } + setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); setOperationAction(ISD::BSWAP , MVT::i16 , Expand); @@ -307,7 +432,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. setOperationAction(ISD::SELECT , MVT::i8 , Custom); - setOperationAction(ISD::SELECT , MVT::i16 , Custom); + setOperationAction(ISD::SELECT , MVT::i16 , Custom); setOperationAction(ISD::SELECT , MVT::i32 , Custom); setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); @@ -350,7 +475,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); } - if (Subtarget->hasSSE1()) + if (Subtarget->hasXMM()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); // We may not have a libcall for MEMBARRIER so we should lower this. 
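The CTPOP handling above makes the trade-off explicit: with POPCNT the operation is a single instruction (the i8 form is promoted to a wider type first), while without it the Expand action falls back to a shift-and-mask sequence along these lines (the exact expansion lives in the generic legalizer; this is the classic parallel bit count):

    #include <cstdint>
    #include <cstdio>

    uint32_t popcount32(uint32_t v) {
      v = v - ((v >> 1) & 0x55555555u);                  // 2-bit sums
      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);  // 4-bit sums
      v = (v + (v >> 4)) & 0x0F0F0F0Fu;                  // 8-bit sums
      return (v * 0x01010101u) >> 24;                    // total in top byte
    }

    int main() { std::printf("%u\n", popcount32(0xF0F0F0F0u)); }  // 16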
@@ -364,15 +489,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setShouldFoldAtomicFences(true); // Expand certain atomics - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); - - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); - setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); + for (unsigned i = 0, e = 4; i != e; ++i) { + MVT VT = IntVTs[i]; + setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); + } if (!Subtarget->is64Bit()) { setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); @@ -521,13 +642,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { - bool ignored; - APFloat TmpFlt(+0.0); - TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, - &ignored); + APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS + + bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, &ignored); @@ -573,8 +693,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); + setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); @@ -622,7 +743,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. - if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { + if (!UseSoftFloat && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); // No operations on x86mmx supported, everything uses intrinsics. 
} @@ -654,12 +775,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SELECT, MVT::v4i16, Expand); setOperationAction(ISD::SELECT, MVT::v2i32, Expand); setOperationAction(ISD::SELECT, MVT::v1i64, Expand); - setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Expand); - setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Expand); - setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Expand); - setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Expand); + setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); + setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); + setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); + setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); - if (!UseSoftFloat && Subtarget->hasSSE1()) { + if (!UseSoftFloat && Subtarget->hasXMM()) { addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); @@ -676,7 +797,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); } - if (!UseSoftFloat && Subtarget->hasSSE2()) { + if (!UseSoftFloat && Subtarget->hasXMMInt()) { addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM @@ -821,9 +942,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (Subtarget->hasSSE42()) { + if (Subtarget->hasSSE42()) setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); - } if (!UseSoftFloat && Subtarget->hasAVX()) { addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); @@ -836,27 +956,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::LOAD, MVT::v8i32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); setOperationAction(ISD::LOAD, MVT::v4i64, Legal); + setOperationAction(ISD::FADD, MVT::v8f32, Legal); setOperationAction(ISD::FSUB, MVT::v8f32, Legal); setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); - //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); - //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); - //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); - //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom); - - // Operations to consider commented out -v16i16 v32i8 - //setOperationAction(ISD::ADD, MVT::v16i16, Legal); - setOperationAction(ISD::ADD, MVT::v8i32, Custom); - setOperationAction(ISD::ADD, MVT::v4i64, Custom); - //setOperationAction(ISD::SUB, MVT::v32i8, Legal); - //setOperationAction(ISD::SUB, MVT::v16i16, Legal); - setOperationAction(ISD::SUB, MVT::v8i32, Custom); - setOperationAction(ISD::SUB, MVT::v4i64, Custom); - //setOperationAction(ISD::MUL, MVT::v16i16, Legal); + setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); @@ -864,85 +971,66 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); - setOperationAction(ISD::VSETCC, MVT::v4f64, Custom); - // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); - // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); - setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); - - // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom); - // 
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom); - // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom); - - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom); - -#if 0 - // Not sure we want to do this since there are no 256-bit integer - // operations in AVX - - // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - // This includes 256-bit vectors - for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) { - EVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to custom lower non-power-of-2 vectors - if (!isPowerOf2_32(VT.getVectorNumElements())) + // Custom lower build_vector, vector_shuffle, scalar_to_vector, + // insert_vector_elt extract_subvector and extract_vector_elt for + // 256-bit types. + for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; + ++i) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)i; + // Do not attempt to custom lower non-256-bit vectors + if (!isPowerOf2_32(MVT(VT).getVectorNumElements()) + || (MVT(VT).getSizeInBits() < 256)) continue; - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - } + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + } + // Custom-lower insert_subvector and extract_subvector based on + // the result type. + for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; + ++i) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)i; + // Do not attempt to custom lower non-256-bit vectors + if (!isPowerOf2_32(MVT(VT).getVectorNumElements())) + continue; - if (Subtarget->is64Bit()) { - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom); + if (MVT(VT).getSizeInBits() == 128) { + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + } + else if (MVT(VT).getSizeInBits() == 256) { + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + } } -#endif - -#if 0 - // Not sure we want to do this since there are no 256-bit integer - // operations in AVX - // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64. - // Including 256-bit vectors - for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) { - EVT VT = (MVT::SimpleValueType)i; + // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. + // Don't promote loads because we need them for VPERM vector index versions. 
- if (!VT.is256BitVector()) { + for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; + VT++) { + if (!isPowerOf2_32(MVT((MVT::SimpleValueType)VT).getVectorNumElements()) + || (MVT((MVT::SimpleValueType)VT).getSizeInBits() < 256)) continue; - } - setOperationAction(ISD::AND, VT, Promote); - AddPromotedToType (ISD::AND, VT, MVT::v4i64); - setOperationAction(ISD::OR, VT, Promote); - AddPromotedToType (ISD::OR, VT, MVT::v4i64); - setOperationAction(ISD::XOR, VT, Promote); - AddPromotedToType (ISD::XOR, VT, MVT::v4i64); - setOperationAction(ISD::LOAD, VT, Promote); - AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); - setOperationAction(ISD::SELECT, VT, Promote); - AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); + setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote); + AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v4i64); + setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote); + AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v4i64); + setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote); + AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v4i64); + //setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote); + //AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v4i64); + setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote); + AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v4i64); } - - setTruncStoreAction(MVT::f64, MVT::f32, Expand); -#endif } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - // Add/Sub/Mul with overflow operations are custom lowered. - setOperationAction(ISD::SADDO, MVT::i32, Custom); - setOperationAction(ISD::UADDO, MVT::i32, Custom); - setOperationAction(ISD::SSUBO, MVT::i32, Custom); - setOperationAction(ISD::USUBO, MVT::i32, Custom); - setOperationAction(ISD::SMULO, MVT::i32, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -950,14 +1038,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. - if (Subtarget->is64Bit()) { - setOperationAction(ISD::SADDO, MVT::i64, Custom); - setOperationAction(ISD::UADDO, MVT::i64, Custom); - setOperationAction(ISD::SSUBO, MVT::i64, Custom); - setOperationAction(ISD::USUBO, MVT::i64, Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); + for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { + // Add/Sub/Mul with overflow operations are custom lowered. + MVT VT = IntVTs[i]; + setOperationAction(ISD::SADDO, VT, Custom); + setOperationAction(ISD::UADDO, VT, Custom); + setOperationAction(ISD::SSUBO, VT, Custom); + setOperationAction(ISD::USUBO, VT, Custom); + setOperationAction(ISD::SMULO, VT, Custom); + setOperationAction(ISD::UMULO, VT, Custom); } + // There are no 8-bit 3-address imul/mul instructions + setOperationAction(ISD::SMULO, MVT::i8, Expand); + setOperationAction(ISD::UMULO, MVT::i8, Expand); + if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. 
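The UMULO custom lowering above pays off because x86 computes the overflow bit for free: the MUL/IMUL family sets CF/OF exactly when the product no longer fits in the low half of the result, so no separate comparison is needed. A portable model of the check being performed:

    #include <cstdint>
    #include <cstdio>

    // Unsigned multiply-with-overflow: overflow iff the widened product has
    // a nonzero high half (what MUL's CF/OF encode).
    bool umulo32(uint32_t a, uint32_t b, uint32_t &lo) {
      uint64_t wide = uint64_t(a) * b;
      lo = uint32_t(wide);
      return (wide >> 32) != 0;
    }

    int main() {
      uint32_t lo;
      bool ovf = umulo32(0x10000u, 0x10000u, lo);
      std::printf("lo=%u ovf=%d\n", lo, int(ovf));  // lo=0 ovf=1
    }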
setLibcallName(RTLIB::SHL_I128, 0); @@ -974,6 +1069,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); if (Subtarget->is64Bit()) @@ -981,11 +1079,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) computeRegisterProperties(); - // FIXME: These should be based on subtarget info. Plus, the values should - // be smaller when we are in optimizing for size mode. + // On Darwin, -Os means optimize for size without hurting performance, so + // do not reduce the limit. maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores + maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores + maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores + maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; } @@ -1036,7 +1137,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { } unsigned Align = 4; - if (Subtarget->hasSSE1()) + if (Subtarget->hasXMM()) getMaxByValAlign(Ty, Align); return Align; } @@ -1077,7 +1178,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && Subtarget->getStackAlignment() >= 8 && - Subtarget->hasSSE2()) { + Subtarget->hasXMMInt()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. return MVT::f64; @@ -1144,6 +1245,7 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; } +// FIXME: Why is this routine here? Move to RegInfo! std::pair<const TargetRegisterClass*, uint8_t> X86TargetLowering::findRepresentativeClass(EVT VT) const{ const TargetRegisterClass *RRC = 0; @@ -1169,17 +1271,20 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{ return std::make_pair(RRC, Cost); } +// FIXME: Why is this routine here? Move to RegInfo! unsigned X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { default: return 0; case X86::GR32RegClassID: return 4 - FPDiff; case X86::GR64RegClassID: - return 8 - FPDiff; + return 12 - FPDiff; case X86::VR128RegClassID: return Subtarget->is64Bit() ? 10 : 4; case X86::VR64RegClassID: @@ -1263,14 +1368,14 @@ X86TargetLowering::LowerReturn(SDValue Chain, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { + (Subtarget->is64Bit() && !Subtarget->hasXMM())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now.
if (ValVT == MVT::f64 && - (Subtarget->is64Bit() && !Subtarget->hasSSE2())) + (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to @@ -1291,13 +1396,13 @@ X86TargetLowering::LowerReturn(SDValue Chain, if (Subtarget->is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { - ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget->hasSSE2()) - ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy); + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); } } } @@ -1336,6 +1441,28 @@ X86TargetLowering::LowerReturn(SDValue Chain, MVT::Other, &RetOps[0], RetOps.size()); } +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() != ISD::CopyToReg && + Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != X86ISD::RET_FLAG) + return false; + HasRet = true; + } + + return HasRet; +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// @@ -1360,7 +1487,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && - ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -1381,7 +1508,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; if (CopyVT == MVT::f80) Opc = isST0 ? 
X86::FpGET_ST0_80:X86::FpGET_ST1_80; SDValue Ops[] = { Chain, InFlag }; - Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, + Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Glue, Ops, 2), 1); Val = Chain.getValue(0); @@ -1404,7 +1531,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, MVT::i64, InFlag).getValue(1); Val = Chain.getValue(0); } - Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); + Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); @@ -1542,6 +1669,12 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) { + CCInfo.AllocateStack(32, 8); + } + CCInfo.AnalyzeFormalArguments(Ins, CC_X86); unsigned LastVal = ~0U; @@ -1587,7 +1720,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) - ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. @@ -1669,17 +1802,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, TotalNumIntRegs); bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); - assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + assert(!(NumXMMRegs && !Subtarget->hasXMM()) && "SSE register cannot be used when SSE is disabled!"); assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); - if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) + if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) // Kernel mode asks for SSE to be disabled, so don't push them // on the stack. TotalNumXMMRegs = 0; if (IsWin64) { - const TargetFrameInfo &TFI = *getTargetMachine().getFrameInfo(); + const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; @@ -1776,8 +1909,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, DebugLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { - const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 
32 : 0); - unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); + unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); if (Flags.isByVal()) @@ -1836,6 +1968,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isTargetWin64(); bool IsStructRet = CallIsStructReturn(Outs); bool IsSibcall = false; @@ -1861,6 +1994,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) { + CCInfo.AllocateStack(32, 8); + } + CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. @@ -1920,14 +2059,14 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, case CCValAssign::AExt: if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { // Special case: passing MMX values in XMM registers. - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); + Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. @@ -1943,7 +2082,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - if (isVarArg && Subtarget->isTargetWin64()) { + if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; @@ -2009,7 +2148,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, } } - if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { + if (Is64Bit && isVarArg && !IsWin64) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -2024,7 +2163,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); - assert((Subtarget->hasSSE1() || !NumXMMRegs) + assert((Subtarget->hasXMM() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); Chain = DAG.getCopyToReg(Chain, dl, X86::AL, @@ -2140,8 +2279,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; - // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external - // symbols should go through the PLT. + // On ELF targets, in either X86-64 or X86-32 mode, direct calls to + // external symbols should go through the PLT. if (Subtarget->isTargetELF() && getTargetMachine().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; @@ -2158,7 +2297,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, } // Returns a chain & a flag for retval copy to use. 
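The AllocateStack(32, 8) calls introduced above fold the Win64 shadow area into the calling-convention state itself, which is why LowerMemOpCallTo no longer adds a hand-rolled FirstStackArgOffset: the 32 bytes are four 8-byte home slots for RCX/RDX/R8/R9 that the caller reserves just above the return address, and genuine stack arguments begin after them. A sketch of the layout as seen from the callee's entry RSP:

    #include <cstdio>

    int main() {
      const int SlotSize = 8;
      int retAddr       = 0;                         // [RSP] on entry
      int shadowBase    = retAddr + SlotSize;        // home slot for RCX
      int firstStackArg = shadowBase + 4 * SlotSize; // fifth argument onward
      std::printf("shadow @ RSP+%d, first stack arg @ RSP+%d\n",
                  shadowBase, firstStackArg);        // +8 and +40
    }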
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector<SDValue, 8> Ops; if (!IsSibcall && isTailCall) { @@ -2184,7 +2323,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. - if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) + if (Is64Bit && isVarArg && !IsWin64) Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); if (InFlag.getNode()) @@ -2271,7 +2410,7 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const TargetFrameInfo &TFI = *TM.getFrameInfo(); + const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; @@ -2298,7 +2437,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); - if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) + if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) @@ -2444,14 +2583,17 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (Subtarget->isTargetWin64()) { + CCInfo.AllocateStack(32, 8); + } + CCInfo.AnalyzeCallOperands(Outs, CC_X86); if (CCInfo.getNextStackOffset()) { MachineFunction &MF = DAG.getMachineFunction(); if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) return false; - if (Subtarget->isTargetWin64()) - // Win64 ABI has additional complications. - return false; // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. @@ -2545,6 +2687,10 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLDQ: @@ -2612,6 +2758,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVSD: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLDQ: @@ -2713,8 +2863,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. 
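The hunk below relaxes the hasOneUse() guards around the load-commuting heuristic in TranslateX86CC; the rewrite stays sound because the condition code is swapped along with the operands. A self-contained model of that invariant (the Pred enum and helpers are illustrative stand-ins, not LLVM's getSetCCSwappedOperands):

    #include <cassert>
    #include <initializer_list>
    enum Pred { LT, GT, LE, GE, EQ, NE };
    Pred swapPred(Pred P) {
      switch (P) {
      case LT: return GT; case GT: return LT;
      case LE: return GE; case GE: return LE;
      default: return P; // EQ and NE are symmetric
      }
    }
    bool cmp(Pred P, int L, int R) {
      switch (P) {
      case LT: return L < R;  case GT: return L > R;
      case LE: return L <= R; case GE: return L >= R;
      case EQ: return L == R; default: return L != R;
      }
    }
    int main() {
      // Swapping operands and the predicate together never changes the result.
      for (int L = -2; L <= 2; ++L)
        for (int R = -2; R <= 2; ++R)
          for (Pred P : {LT, GT, LE, GE, EQ, NE})
            assert(cmp(P, L, R) == cmp(swapPred(P), R, L));
    }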
- if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && - !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { + if (ISD::isNON_EXTLoad(LHS.getNode()) && + !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } @@ -3023,7 +3173,8 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { unsigned NumElems = N->getValueType(0).getVectorNumElements(); - if (NumElems != 2 && NumElems != 4) + if ((NumElems != 2 && NumElems != 4) + || N->getValueType(0).getSizeInBits() > 128) return false; for (unsigned i = 0; i < NumElems/2; ++i) @@ -3045,19 +3196,36 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) return false; - for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (V2IsSplat) { - if (!isUndefOrEqual(BitI1, NumElts)) - return false; - } else { - if (!isUndefOrEqual(BitI1, j + NumElts)) + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElts / NumSections; + + unsigned Start = 0; + unsigned End = NumSectionElts; + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = Start, j = s * NumSectionElts; + i != End; + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) + return false; + } } + // Process the next 128 bits. + Start += NumSectionElts; + End += NumSectionElts; } + return true; } @@ -3105,14 +3273,27 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) return false; - for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { - int BitI = Mask[i]; - int BitI1 = Mask[i+1]; - if (!isUndefOrEqual(BitI, j)) - return false; - if (!isUndefOrEqual(BitI1, j)) - return false; + // Handle vector lengths > 128 bits. Define a "section" as a set of + // 128 bits. AVX defines UNPCK* to operate independently on 128-bit + // sections. + unsigned NumSections = VT.getSizeInBits() / 128; + if (NumSections == 0 ) NumSections = 1; // Handle MMX + unsigned NumSectionElts = NumElems / NumSections; + + for (unsigned s = 0; s < NumSections; ++s) { + for (unsigned i = s * NumSectionElts, j = s * NumSectionElts; + i != NumSectionElts * (s + 1); + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } } + return true; } @@ -3263,6 +3444,44 @@ bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { return true; } +/// isVEXTRACTF128Index - Return true if the specified +/// EXTRACT_SUBVECTOR operand specifies a vector extract that is +/// suitable for input to VEXTRACTF128. +bool X86::isVEXTRACTF128Index(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) + return false; + + // The index should be aligned on a 128-bit boundary. 
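As a concrete reading of the alignment test in isVEXTRACTF128Index just below: for a 256-bit v8i32 only element indices 0 and 4 begin a 128-bit half, and the matching VEXTRACTF128 immediate (computed later in this hunk by getExtractVEXTRACTF128Immediate) is the index divided by the elements-per-chunk. A standalone sketch with illustrative constants:

    #include <cstdio>
    int main() {
      const unsigned VBits = 256, VL = 8;     // e.g. v8i32
      const unsigned ElSize = VBits / VL;     // 32 bits per element
      const unsigned PerChunk = 128 / ElSize; // 4 elements per 128-bit half
      for (unsigned Index = 0; Index < VL; ++Index)
        if ((Index * ElSize) % 128 == 0)      // the check used below
          std::printf("index %u ok, VEXTRACTF128 imm = %u\n",
                      Index, Index / PerChunk);
        else
          std::printf("index %u rejected\n", Index);
    }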
+ uint64_t Index = + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + + unsigned VL = N->getValueType(0).getVectorNumElements(); + unsigned VBits = N->getValueType(0).getSizeInBits(); + unsigned ElSize = VBits / VL; + bool Result = (Index * ElSize) % 128 == 0; + + return Result; +} + +/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR +/// operand specifies a subvector insert that is suitable for input to +/// VINSERTF128. +bool X86::isVINSERTF128Index(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) + return false; + + // The index should be aligned on a 128-bit boundary. + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + unsigned VL = N->getValueType(0).getVectorNumElements(); + unsigned VBits = N->getValueType(0).getSizeInBits(); + unsigned ElSize = VBits / VL; + bool Result = (Index * ElSize) % 128 == 0; + + return Result; +} + /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. unsigned X86::getShuffleSHUFImmediate(SDNode *N) { @@ -3331,6 +3550,42 @@ unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { return (Val - i) * EltSize; } +/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 +/// instructions. +unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) + llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + + EVT VecVT = N->getOperand(0).getValueType(); + EVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + + return Index / NumElemsPerChunk; +} + +/// getInsertVINSERTF128Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 +/// instructions. +unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) + llvm_unreachable("Illegal insert subvector for VINSERTF128"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + EVT VecVT = N->getValueType(0); + EVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + + return Index / NumElemsPerChunk; +} + /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { @@ -3499,7 +3754,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); } - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } /// getOnesVector - Returns a vector of specified type with all bits set. @@ -3512,7 +3767,7 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } @@ -3597,9 +3852,9 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { // Perform the splat. 
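The SplatMask built just below replicates one source lane into every output lane. Reference semantics on plain arrays (a sketch, not SelectionDAG code):

    #include <cassert>
    int main() {
      int V[4] = {10, 20, 30, 40};
      int EltNo = 2;
      int Mask[4] = {EltNo, EltNo, EltNo, EltNo};
      int Out[4];
      for (int i = 0; i < 4; ++i)
        Out[i] = V[Mask[i]]; // vector_shuffle with an all-EltNo mask
      for (int i = 0; i < 4; ++i)
        assert(Out[i] == 30); // every lane now holds element EltNo
    }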
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); + V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1); V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); + return DAG.getNode(ISD::BITCAST, dl, VT, V1); } /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified @@ -3671,11 +3926,15 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLDQ: case X86ISD::PUNPCKLQDQ: - DecodePUNPCKLMask(NumElems, ShuffleMask); + DecodePUNPCKLMask(VT, ShuffleMask); break; case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: - DecodeUNPCKLPMask(NumElems, ShuffleMask); + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: + DecodeUNPCKLPMask(VT, ShuffleMask); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, ShuffleMask); @@ -3723,7 +3982,7 @@ SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, } // Actual nodes that may contain scalar elements - if (Opcode == ISD::BIT_CONVERT) { + if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); @@ -3912,7 +4171,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, } } - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); } /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. @@ -3953,10 +4212,11 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, const TargetLowering &TLI, DebugLoc dl) { EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; - SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, - DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); + DAG.getConstant(NumBits, + TLI.getShiftAmountTy(SrcOp.getValueType())))); } SDValue @@ -3979,8 +4239,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { FI = FINode->getIndex(); Offset = 0; - } else if (Ptr.getOpcode() == ISD::ADD && - isa<ConstantSDNode>(Ptr.getOperand(1)) && + } else if (DAG.isBaseWithConstantOffset(Ptr) && isa<FrameIndexSDNode>(Ptr.getOperand(0))) { FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); @@ -4021,8 +4280,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, LD->getPointerInfo().getWithOffset(StartOffset), false, false, 0); // Canonicalize it to a v4i32 shuffle. 
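The pervasive ISD::BIT_CONVERT to ISD::BITCAST renaming in this file is mechanical: both denote a bit-preserving reinterpretation, never a numeric conversion. A host-side analogue on an IEEE-754 machine (sketch):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    int main() {
      float F = 1.0f;
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits)); // "bitcast float -> i32"
      assert(Bits == 0x3f800000u);          // same 32 bits, new type
      float Back;
      std::memcpy(&Back, &Bits, sizeof(Back));
      assert(Back == 1.0f);                 // round-trips exactly
    }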
- V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getVectorShuffle(MVT::v4i32, dl, V1, DAG.getUNDEF(MVT::v4i32),&Mask[0])); } @@ -4090,7 +4349,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i32, LDBase->getMemOperand()); - return DAG.getNode(ISD::BIT_CONVERT, DL, VT, ResNode); + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); } @@ -4098,6 +4357,34 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); + + EVT VT = Op.getValueType(); + EVT ExtVT = VT.getVectorElementType(); + + unsigned NumElems = Op.getNumOperands(); + + // For AVX-length vectors, build the individual 128-bit pieces and + // use shuffles to put them in place. + if (VT.getSizeInBits() > 256 && + Subtarget->hasAVX() && + !ISD::isBuildVectorAllZeros(Op.getNode())) { + SmallVector<SDValue, 8> V; + V.resize(NumElems); + for (unsigned i = 0; i < NumElems; ++i) { + V[i] = Op.getOperand(i); + } + + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); + + // Build the lower subvector. + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); + // Build the upper subvector. + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], + NumElems/2); + + return ConcatVectors(Lower, Upper, DAG); + } + // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. // All one's are handled with pcmpeqd. In AVX, zero's are handled with // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd @@ -4116,11 +4403,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); } - EVT VT = Op.getValueType(); - EVT ExtVT = VT.getVectorElementType(); unsigned EVTBits = ExtVT.getSizeInBits(); - unsigned NumElems = Op.getNumOperands(); unsigned NumZero = 0; unsigned NumNonZero = 0; unsigned NonZeros = 0; @@ -4182,7 +4466,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DAG.getUNDEF(Item.getValueType()), &Mask[0]); } - return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); } } @@ -4206,7 +4490,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), DAG); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); + return DAG.getNode(ISD::BITCAST, dl, VT, Item); } } @@ -4399,21 +4683,21 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || ResVT == MVT::v8i16 || ResVT == MVT::v16i8); int Mask[2]; - SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); + SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); InVec = Op.getOperand(1); if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { unsigned NumElts = ResVT.getVectorNumElements(); - VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); + VecOp = 
DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); } else { - InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); + InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); Mask[0] = 0; Mask[1] = 2; VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); } - return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); + return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); } // v8i16 shuffles - Prefer shuffles in the following order: @@ -4495,9 +4779,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); NewV = DAG.getVectorShuffle(MVT::v2i64, dl, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); - NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); + NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the // source words for the shuffle, to aid later transformations. @@ -4566,12 +4850,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); } - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); if (!TwoInputs) - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); // Calculate the shuffle mask for the second input, shuffle it, and // OR it with the first shuffled input. @@ -4586,12 +4870,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); } - V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); + V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); } // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, @@ -4758,8 +5042,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, // No SSSE3 - Calculate in place words and then fix all out of place words // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from // the 16 different words that comprise the two doublequadword input vectors. - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); - V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); SDValue NewV = V2Only ? 
V2 : V1; for (int i = 0; i != 8; ++i) { int Elt0 = MaskVals[i*2]; @@ -4797,7 +5081,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DAG.getIntPtrConstant(Elt1 / 2)); if ((Elt1 & 1) == 0) InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, - DAG.getConstant(8, TLI.getShiftAmountTy())); + DAG.getConstant(8, + TLI.getShiftAmountTy(InsElt.getValueType()))); else if (Elt0 >= 0) InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, DAG.getConstant(0xFF00, MVT::i16)); @@ -4811,7 +5096,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); if ((Elt0 & 1) != 0) InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, - DAG.getConstant(8, TLI.getShiftAmountTy())); + DAG.getConstant(8, + TLI.getShiftAmountTy(InsElt0.getValueType()))); else if (Elt1 >= 0) InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, DAG.getConstant(0x00FF, MVT::i16)); @@ -4821,7 +5107,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, DAG.getIntPtrConstant(i)); } - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide @@ -4865,8 +5151,8 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, MaskVec.push_back(StartIdx / Scale); } - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); } @@ -4885,11 +5171,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && - SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && + SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { // PR2108 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, OpVT, @@ -4899,9 +5185,9 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT, } } - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, - DAG.getNode(ISD::BIT_CONVERT, dl, + DAG.getNode(ISD::BITCAST, dl, OpVT, SrcOp))); } @@ -5012,6 +5298,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { // Break it into (shuffle shuffle_hi, shuffle_lo). Locs.clear(); + Locs.resize(4); SmallVector<int,8> LoMask(4U, -1); SmallVector<int,8> HiMask(4U, -1); @@ -5055,7 +5342,7 @@ LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } static bool MayFoldVectorLoad(SDValue V) { - if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); @@ -5072,7 +5359,7 @@ static bool MayFoldVectorLoad(SDValue V) { // one use. Remove this version after this bug get fixed. 
// rdar://8434668, PR8156 static bool RelaxedMayFoldVectorLoad(SDValue V) { - if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); @@ -5110,7 +5397,7 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, // If the bit convert changed the number of elements, it is unsafe // to examine the mask. bool HasShuffleIntoBitcast = false; - if (V.getOpcode() == ISD::BIT_CONVERT) { + if (V.getOpcode() == ISD::BITCAST) { EVT SrcVT = V.getOperand(0).getValueType(); if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) return false; @@ -5125,7 +5412,7 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); // Skip one more bit_convert if necessary - if (V.getOpcode() == ISD::BIT_CONVERT) + if (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); if (ISD::isNormalLoad(V.getNode())) { @@ -5162,8 +5449,8 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Canonizalize to v2f64. - V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, V1); - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); + return DAG.getNode(ISD::BITCAST, dl, VT, getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, V1, DAG)); } @@ -5225,6 +5512,10 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) CanFoldLoad = true; + // Both of them can't be memory operations though. + if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) + CanFoldLoad = false; + if (CanFoldLoad) { if (HasSSE2 && NumElems == 2) return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); @@ -5253,16 +5544,20 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { X86::getShuffleSHUFImmediate(SVOp), DAG); } -static inline unsigned getUNPCKLOpcode(EVT VT) { +static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) { switch(VT.getSimpleVT().SimpleTy) { case MVT::v4i32: return X86ISD::PUNPCKLDQ; case MVT::v2i64: return X86ISD::PUNPCKLQDQ; - case MVT::v4f32: return X86ISD::UNPCKLPS; - case MVT::v2f64: return X86ISD::UNPCKLPD; + case MVT::v4f32: + return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS; + case MVT::v2f64: + return Subtarget->hasAVX() ? 
X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; + case MVT::v8f32: return X86ISD::VUNPCKLPSY; + case MVT::v4f64: return X86ISD::VUNPCKLPDY; case MVT::v16i8: return X86ISD::PUNPCKLBW; case MVT::v8i16: return X86ISD::PUNPCKLWD; default: - llvm_unreachable("Unknow type for unpckl"); + llvm_unreachable("Unknown type for unpckl"); } return 0; } @@ -5276,7 +5571,7 @@ static inline unsigned getUNPCKHOpcode(EVT VT) { case MVT::v16i8: return X86ISD::PUNPCKHBW; case MVT::v8i16: return X86ISD::PUNPCKHWD; default: - llvm_unreachable("Unknow type for unpckh"); + llvm_unreachable("Unknown type for unpckh"); } return 0; } @@ -5317,7 +5612,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (VT == MVT::v8i16 || VT == MVT::v16i8) { SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); if (NewOp.getNode()) - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, NewOp); + return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. @@ -5386,7 +5681,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // unpckh_undef). Only use pshufd if speed is more important than size. if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG); if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); @@ -5507,7 +5802,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V1, V2, DAG); if (X86::isUNPCKHMask(SVOp)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); @@ -5534,7 +5830,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V2, V1, DAG); if (X86::isUNPCKHMask(NewSVOp)) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); @@ -5557,8 +5854,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && SVOp->getSplatIndex() == 0 && V2IsUndef) { - if (VT == MVT::v2f64) - return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); + if (VT == MVT::v2f64) { + X86ISD::NodeType Opcode = + getSubtarget()->hasAVX() ? 
X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD; + return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG); + } if (VT == MVT::v2i64) return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); } @@ -5585,7 +5885,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (X86::isUNPCKL_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), + dl, VT, V1, V1, DAG); if (X86::isUNPCKH_v_undef_Mask(SVOp)) if (VT != MVT::v2i64 && VT != MVT::v2f64) return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); @@ -5627,7 +5928,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, if (Idx == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, dl, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), Op.getOperand(1))); @@ -5648,14 +5949,14 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, if ((User->getOpcode() != ISD::STORE || (isa<ConstantSDNode>(Op.getOperand(1)) && cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && - (User->getOpcode() != ISD::BIT_CONVERT || + (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); - return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); } else if (VT == MVT::i32) { // ExtractPS works with constant index. if (isa<ConstantSDNode>(Op.getOperand(1))) @@ -5671,6 +5972,38 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (!isa<ConstantSDNode>(Op.getOperand(1))) return SDValue(); + SDValue Vec = Op.getOperand(0); + EVT VecVT = Vec.getValueType(); + + // If this is a 256-bit vector result, first extract the 128-bit + // vector and then extract from the 128-bit vector. + if (VecVT.getSizeInBits() > 128) { + DebugLoc dl = Op.getNode()->getDebugLoc(); + unsigned NumElems = VecVT.getVectorNumElements(); + SDValue Idx = Op.getOperand(1); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + // Get the 128-bit vector. + bool Upper = IdxVal >= ExtractNumElems; + Vec = Extract128BitVector(Vec, Idx, DAG, dl); + + // Extract from it. + SDValue ScaledIdx = Idx; + if (Upper) + ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx, + DAG.getConstant(ExtractNumElems, + Idx.getValueType())); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, + ScaledIdx); + } + + assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); + if (Subtarget->hasSSE41()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) @@ -5686,7 +6019,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, if (Idx == 0) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getNode(ISD::BIT_CONVERT, dl, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. 
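The 256-bit EXTRACT_VECTOR_ELT path added above splits the operation in two: grab the containing 128-bit half with Extract128BitVector, then extract with a rebased index. The same index arithmetic replayed on a plain array (sketch):

    #include <cassert>
    int main() {
      int Vec[8] = {0, 1, 2, 3, 4, 5, 6, 7};            // stand-in for a v8i32
      unsigned NumElems = 8;
      unsigned ExtractNumElems = NumElems / (256 / 128); // 4 per 128-bit half
      unsigned IdxVal = 6;
      bool Upper = IdxVal >= ExtractNumElems;            // element 6: high half
      const int *Half = Upper ? Vec + ExtractNumElems : Vec;
      unsigned ScaledIdx = Upper ? IdxVal - ExtractNumElems : IdxVal;
      assert(Half[ScaledIdx] == 6);                      // same element, new index
    }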
@@ -5783,17 +6116,45 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); EVT EltVT = VT.getVectorElementType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2 = Op.getOperand(2); + + // If this is a 256-bit vector result, first insert into a 128-bit + // vector and then insert into the 256-bit vector. + if (VT.getSizeInBits() > 128) { + if (!isa<ConstantSDNode>(N2)) + return SDValue(); + + // Get the 128-bit vector. + unsigned NumElems = VT.getVectorNumElements(); + unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); + bool Upper = IdxVal >= NumElems / 2; + + SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl); + + // Insert into it. + SDValue ScaledN2 = N2; + if (Upper) + ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2, + DAG.getConstant(NumElems / + (VT.getSizeInBits() / 128), + N2.getValueType())); + Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0, + N1, ScaledN2); + + // Insert the 128-bit vector + // FIXME: Why UNDEF? + return Insert128BitVector(N0, Op, N2, DAG, dl); + } + if (Subtarget->hasSSE41()) return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); if (EltVT == MVT::i8) return SDValue(); - DebugLoc dl = Op.getDebugLoc(); - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - SDValue N2 = Op.getOperand(2); - if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. @@ -5808,7 +6169,25 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { + LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); + EVT OpVT = Op.getValueType(); + + // If this is a 256-bit vector result, first insert into a 128-bit + // vector and then insert into the 256-bit vector. + if (OpVT.getSizeInBits() > 128) { + // Insert into a 128-bit vector. + EVT VT128 = EVT::getVectorVT(*Context, + OpVT.getVectorElementType(), + OpVT.getVectorNumElements() / 2); + + Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); + + // Insert the 128-bit vector. + return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, + DAG.getConstant(0, MVT::i32), + DAG, dl); + } if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) @@ -5817,10 +6196,47 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && "Expected an SSE type!"); - return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); } +// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in +// a simple subregister reference or explicit instructions to grab +// upper bits of a vector. 
+SDValue +X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->hasAVX()) { + DebugLoc dl = Op.getNode()->getDebugLoc(); + SDValue Vec = Op.getNode()->getOperand(0); + SDValue Idx = Op.getNode()->getOperand(1); + + if (Op.getNode()->getValueType(0).getSizeInBits() == 128 + && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { + return Extract128BitVector(Vec, Idx, DAG, dl); + } + } + return SDValue(); +} + +// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a +// simple superregister reference or explicit instructions to insert +// the upper bits of a vector. +SDValue +X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->hasAVX()) { + DebugLoc dl = Op.getNode()->getDebugLoc(); + SDValue Vec = Op.getNode()->getOperand(0); + SDValue SubVec = Op.getNode()->getOperand(1); + SDValue Idx = Op.getNode()->getOperand(2); + + if (Op.getNode()->getValueType(0).getSizeInBits() == 256 + && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { + return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); + } + } + return SDValue(); +} + // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise @@ -6015,7 +6431,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags) { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); DebugLoc dl = GA->getDebugLoc(); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), @@ -6150,7 +6566,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { OpFlag = X86II::MO_TLVP; DebugLoc DL = Op.getDebugLoc(); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, - getPointerTy(), + GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); @@ -6163,8 +6579,10 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // Lowering the machine isd will make sure everything is in the right // location. - SDValue Args[] = { Offset }; - SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); + SDValue Chain = DAG.getEntryNode(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Args[] = { Chain, Offset }; + Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
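Both lowerings above reduce to the Extract128BitVector/Insert128BitVector helpers when the sizes line up as 256 and 128 bits. An array-level model of what those helpers compute for the lane-aligned case (a sketch, not their real implementation):

    #include <cassert>
    #include <cstring>
    int main() {
      int Vec[8] = {0, 1, 2, 3, 4, 5, 6, 7}; // 256-bit v8i32
      int Sub[4];                            // 128-bit v4i32
      unsigned Idx = 4;                      // must start a 128-bit lane
      std::memcpy(Sub, Vec + Idx, sizeof(Sub));    // EXTRACT_SUBVECTOR
      assert(Sub[0] == 4 && Sub[3] == 7);
      int Repl[4] = {40, 50, 60, 70};
      std::memcpy(Vec + Idx, Repl, sizeof(Repl));  // INSERT_SUBVECTOR
      assert(Vec[4] == 40 && Vec[7] == 70);
    }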
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -6269,7 +6687,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) - Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); + Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); @@ -6388,7 +6806,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, MachinePointerInfo::getConstantPool(), false, false, 16); SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); - SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); + SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(), false, false, 16); @@ -6418,19 +6836,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, DAG.getIntPtrConstant(0))); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), DAG.getIntPtrConstant(0)); // Or the load with the bias. SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), DAG.getIntPtrConstant(0)); // Subtract the bias. @@ -6525,7 +6943,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? - SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), + SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(), MVT::f32, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. @@ -6688,11 +7106,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo::getConstantPool(), false, false, 16); if (VT.isVector()) { - return DAG.getNode(ISD::BIT_CONVERT, dl, VT, + return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::XOR, dl, MVT::v2i64, - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op.getOperand(0)), - DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); } else { return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); } @@ -6744,7 +7162,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, DAG.getConstant(32, MVT::i32)); - SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); + SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, DAG.getIntPtrConstant(0)); } @@ -7003,8 +7421,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower (X & (1 << N)) == 0 to BT(X, N). 
// Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). - if (Op0.getOpcode() == ISD::AND && - Op0.hasOneUse() && + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && Op1.getOpcode() == ISD::Constant && cast<ConstantSDNode>(Op1)->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { @@ -7013,19 +7430,25 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return NewSetCC; } - // Look for "(setcc) == / != 1" to avoid unncessary setcc. - if (Op0.getOpcode() == X86ISD::SETCC && - Op1.getOpcode() == ISD::Constant && + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if (Op1.getOpcode() == ISD::Constant && (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || cast<ConstantSDNode>(Op1)->isNullValue()) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast<ConstantSDNode>(Op1)->isNullValue(); - if (Invert) + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. + if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast<ConstantSDNode>(Op1)->isNullValue(); + if (!Invert) return Op0; + CCode = X86::GetOppositeBranchCondition(CCode); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } } bool isFP = Op1.getValueType().isFloatingPoint(); @@ -7033,17 +7456,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (X86CC == X86::COND_INVALID) return SDValue(); - SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); - - // Use sbb x, x to materialize carry bit into a GPR. 
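The sbb idiom mentioned in the comment being removed below (and reintroduced via X86ISD::SETCC_CARRY in the new SELECT lowering) turns the carry flag into a 0/-1 mask: subtracting a register from itself with borrow leaves 0 when CF=0 and all ones when CF=1. C-level model (sketch):

    #include <cassert>
    #include <cstdint>
    int32_t setcc_carry(bool CF) {
      return -(int32_t)CF; // 0 -> 0x00000000, 1 -> 0xffffffff, like sbb eax, eax
    }
    int main() {
      // On x86, cmp a, b with unsigned a < b sets the carry flag.
      uint32_t a = 1, b = 2;
      bool CF = a < b;
      assert(setcc_carry(CF) == -1); // all-ones mask
      assert(setcc_carry(false) == 0);
    }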
- if (X86CC == X86::COND_B) - return DAG.getNode(ISD::AND, dl, MVT::i8, - DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), Cond), - DAG.getConstant(1, MVT::i8)); - + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), Cond); + DAG.getConstant(X86CC, MVT::i8), EFLAGS); } SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { @@ -7167,6 +7582,8 @@ static bool isX86LogicalCmp(SDValue Op) { if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || + Opc == X86ISD::ADC || + Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || Opc == X86ISD::INC || @@ -7176,13 +7593,28 @@ static bool isX86LogicalCmp(SDValue Op) { Opc == X86ISD::AND)) return true; + if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) + return true; + return false; } +static bool isZero(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isNullValue(); +} + +static bool isAllOnes(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isAllOnesValue(); +} + SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Cond = Op.getOperand(0); - DebugLoc dl = Op.getDebugLoc(); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); SDValue CC; if (Cond.getOpcode() == ISD::SETCC) { @@ -7191,30 +7623,40 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = NewCond; } - // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) - SDValue Op1 = Op.getOperand(1); - SDValue Op2 = Op.getOperand(2); + // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y + // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y + // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y + // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && - cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { + Cond.getOperand(1).getOpcode() == X86ISD::CMP && + isZero(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); - if (Cmp.getOpcode() == X86ISD::CMP) { - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); + + unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + + if ((isAllOnes(Op1) || isAllOnes(Op2)) && + (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { + SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + + SDValue CmpOp0 = Cmp.getOperand(0); + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, + CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); + + SDValue Res = // Res = 0 or -1. + DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), Cmp); + + if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + Res = DAG.getNOT(DL, Res, Res.getValueType()); + ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); - ConstantSDNode *RHSC = - dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); - if (N1C && N1C->isAllOnesValue() && - N2C && N2C->isNullValue() && - RHSC && RHSC->isNullValue()) { - SDValue CmpOp0 = Cmp.getOperand(0); - Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, - CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); - return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), - DAG.getConstant(X86::COND_B, MVT::i8), Cmp); - } + if (N2C == 0 || !N2C->isNullValue()) + Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); + return Res; } } - // Look pass (and (setcc_carry (cmp ...)), 1). 
+ // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); @@ -7252,7 +7694,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); + SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); if (NewSetCC.getNode()) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); @@ -7266,11 +7708,28 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = EmitTest(Cond, X86::COND_NE, DAG); } + // a < b ? -1 : 0 -> RES = ~setcc_carry + // a < b ? 0 : -1 -> RES = setcc_carry + // a >= b ? -1 : 0 -> RES = setcc_carry + // a >= b ? 0 : -1 -> RES = ~setcc_carry + if (Cond.getOpcode() == X86ISD::CMP) { + unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); + + if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && + (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), Cond); + if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + return DAG.getNOT(DL, Res, Res.getValueType()); + return Res; + } + } + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; - return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); } // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or @@ -7469,7 +7928,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); Flag = Chain.getValue(1); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); Flag = Chain.getValue(1); @@ -7577,7 +8036,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(!UseSoftFloat && !(DAG.getMachineFunction() .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && - Subtarget->hasSSE1()); + Subtarget->hasXMM()); } // Insert VAARG_64 node into the DAG @@ -7893,7 +8352,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const } EVT VT = Op.getValueType(); - ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); + ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(NewIntNo, MVT::i32), Op.getOperand(1), ShAmt); @@ -8148,7 +8607,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const TargetFrameInfo &TFI = *TM.getFrameInfo(); + const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); EVT VT = Op.getValueType(); DebugLoc DL = Op.getDebugLoc(); @@ -8327,7 +8786,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { false, false, 16); Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); - Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); } @@ -8353,9 +8812,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(4, MVT::i32)); - R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), - R, M, Op); + R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); @@ -8370,15 +8827,12 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, DAG.getConstant(2, MVT::i32)); - R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), - R, M, Op); + R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op); // a += a Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); // return pblendv(r, r+r, a); - R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), + R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); return R; } @@ -8395,8 +8849,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { SDValue RHS = N->getOperand(1); unsigned BaseOp = 0; unsigned Cond = 0; - DebugLoc dl = Op.getDebugLoc(); - + DebugLoc DL = Op.getDebugLoc(); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: @@ -8435,19 +8888,29 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { BaseOp = X86ISD::SMUL; Cond = X86::COND_O; break; - case ISD::UMULO: - BaseOp = X86ISD::UMUL; - Cond = X86::COND_B; - break; + case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), + MVT::i32); + SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(X86::COND_O, MVT::i32), + SDValue(Sum.getNode(), 2)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); + return Sum; + } } // Also sets EFLAGS. 
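The new UMULO lowering above leans on x86's MUL, which reports unsigned overflow through CF/OF rather than via a compare; the third result of the X86ISD::UMUL node carries those flags into the SETCC. A portable model of the overflow condition (sketch):

    #include <cassert>
    #include <cstdint>
    bool umul_overflows(uint32_t A, uint32_t B, uint32_t &Lo) {
      uint64_t Wide = (uint64_t)A * B; // widening multiply, like MUL's EDX:EAX
      Lo = (uint32_t)Wide;
      return (Wide >> 32) != 0;        // nonzero high half <=> CF/OF set
    }
    int main() {
      uint32_t Lo;
      assert(!umul_overflows(3, 5, Lo) && Lo == 15);
      assert(umul_overflows(0x80000000u, 2, Lo) && Lo == 0);
    }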
SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); - SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); + SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); SDValue SetCC = - DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), - DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), + DAG.getConstant(Cond, MVT::i32), + SDValue(Sum.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); return Sum; @@ -8520,7 +8983,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(3), DAG.getTargetConstant(size, MVT::i8), cpIn.getValue(1) }; - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, 5, T, MMO); @@ -8532,7 +8995,7 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "Result not type legalized?"); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); @@ -8548,16 +9011,15 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } -SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, +SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); - assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && - Subtarget->hasMMX() && !DisableMMX) && - "Unexpected custom BIT_CONVERT"); + assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && + Subtarget->hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && - "Unexpected custom BIT_CONVERT"); + "Unexpected custom BITCAST"); // i64 <=> MMX conversions are Legal. if (SrcVT==MVT::i64 && DstVT.isVector()) return Op; @@ -8569,6 +9031,7 @@ SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, // All other conversions need to be expanded. return SDValue(); } + SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); @@ -8583,6 +9046,32 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { cast<AtomicSDNode>(Node)->getAlignment()); } +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getNode()->getValueType(0); + + // Let legalize expand this if it isn't a legal type yet. 
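The ADDC/ADDE -> X86ISD::ADD/ADC (and SUBC/SUBE -> SUB/SBB) mapping below exists so multi-word arithmetic can chain through EFLAGS. Reference model of a 64-bit add assembled from two 32-bit halves (sketch):

    #include <cassert>
    #include <cstdint>
    int main() {
      uint64_t A = 0x1ffffffffull, B = 0x300000001ull;
      uint32_t ALo = (uint32_t)A, AHi = (uint32_t)(A >> 32);
      uint32_t BLo = (uint32_t)B, BHi = (uint32_t)(B >> 32);
      uint32_t Lo = ALo + BLo;         // ADDC: add, producing a carry-out
      uint32_t Carry = Lo < ALo;       // carry-out of the low half
      uint32_t Hi = AHi + BHi + Carry; // ADDE: add-with-carry, i.e. ADC
      assert(((uint64_t)Hi << 32 | Lo) == A + B);
    }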
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: assert(0 && "Invalid code"); + case ISD::ADDC: Opc = X86ISD::ADD; break; + case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; + case ISD::SUBC: Opc = X86ISD::SUB; break; + case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1), Op.getOperand(2)); +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -8596,6 +9085,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); @@ -8640,7 +9131,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); - case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); + case ISD::BITCAST: return LowerBITCAST(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); } } @@ -8677,6 +9172,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, default: assert(false && "Do not know how to custom type legalize this operation!"); return; + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + // We don't want to expand or promote these. 
+ return; case ISD::FP_TO_SINT: { std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, true); @@ -8690,7 +9191,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::READCYCLECOUNTER: { - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = N->getOperand(0); SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, @@ -8726,7 +9227,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Ops[] = { swapInH.getValue(0), N->getOperand(1), swapInH.getValue(1) }; - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3, T, MMO); @@ -8803,6 +9304,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; + case X86ISD::PANDN: return "X86ISD::PANDN"; + case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; + case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; + case X86ISD::PSIGND: return "X86ISD::PSIGND"; + case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; @@ -8836,6 +9342,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; + case X86ISD::ADC: return "X86ISD::ADC"; + case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; case X86ISD::INC: return "X86ISD::INC"; @@ -8869,6 +9377,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; + case X86ISD::VUNPCKLPS: return "X86ISD::VUNPCKLPS"; + case X86ISD::VUNPCKLPD: return "X86ISD::VUNPCKLPD"; + case X86ISD::VUNPCKLPSY: return "X86ISD::VUNPCKLPSY"; + case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; @@ -9403,15 +9915,12 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && "Target must have SSE4.2 or AVX features enabled"); DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - unsigned Opc; - if (!Subtarget->hasAVX()) { if (memArg) Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; @@ -9424,20 +9933,59 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, Opc = numArgs == 3 ? 
X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
  }

-  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
-
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);
-
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
-
-  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  MI->eraseFromParent();
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  // Address into RAX/EAX, other two args into ECX, EDX.
+  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+  for (int i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+
+  unsigned ValOps = X86::AddrNumOperands;
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+    .addReg(MI->getOperand(ValOps).getReg());
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
+    .addReg(MI->getOperand(ValOps+1).getReg());
+
+  // The instruction doesn't actually take any operands though.
+  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+
+  MI->eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  // First arg in ECX, the second in EAX.
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+    .addReg(MI->getOperand(0).getReg());
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
+    .addReg(MI->getOperand(1).getReg());
+
+  // The instruction doesn't actually take any operands though.
+  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
+
+  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

@@ -9925,6 +10473,30 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  default: assert(false && "Unexpected instr type to insert");
+  case X86::TAILJMPd64:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
+    assert(!"TAILJMP64 would not be touched here.");
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+    // Defs of TCRETURNxx64 have Win64's callee-saved registers as a subset.
+    // On AMD64, additional defs should be added before register allocation.
+    if (!Subtarget->isTargetWin64()) {
+      MI->addRegisterDefined(X86::RSI);
+      MI->addRegisterDefined(X86::RDI);
+      MI->addRegisterDefined(X86::XMM6);
+      MI->addRegisterDefined(X86::XMM7);
+      MI->addRegisterDefined(X86::XMM8);
+      MI->addRegisterDefined(X86::XMM9);
+      MI->addRegisterDefined(X86::XMM10);
+      MI->addRegisterDefined(X86::XMM11);
+      MI->addRegisterDefined(X86::XMM12);
+      MI->addRegisterDefined(X86::XMM13);
+      MI->addRegisterDefined(X86::XMM14);
+      MI->addRegisterDefined(X86::XMM15);
+    }
+    return BB;
  case X86::WIN_ALLOCA:
    return EmitLoweredWinAlloca(MI, BB);
  case X86::TLSCall_32:
@@ -10040,6 +10612,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  case X86::VPCMPESTRM128MEM:
    return EmitPCMP(MI, BB, 5, true /* in mem */);

+  // Thread synchronization.
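
The fixed-register shuffling in EmitMonitor/EmitMwait matches the ISA contract for these instructions. For reference, the same contract is visible through the SSE3 intrinsics in pmmintrin.h; a small usage sketch (the wrapper function is illustrative, not part of this change):

#include <pmmintrin.h>

// monitor takes the address in RAX/EAX with extensions/hints in ECX/EDX;
// mwait takes extensions in ECX and hints in EAX -- the same registers the
// two custom inserters above copy the pseudo's operands into.
static void wait_for_store(volatile int *Flag) {
  _mm_monitor((const void *)Flag, 0, 0);
  if (*Flag == 0)
    _mm_mwait(0, 0);
}
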
+  case X86::MONITOR:
+    return EmitMonitor(MI, BB);
+  case X86::MWAIT:
+    return EmitMwait(MI, BB);
+
  // Atomic Lowering.
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
@@ -10233,6 +10811,8 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
    default: break;
    case X86ISD::ADD:
    case X86ISD::SUB:
+    case X86ISD::ADC:
+    case X86ISD::SBB:
    case X86ISD::SMUL:
    case X86ISD::UMUL:
    case X86ISD::INC:
@@ -10281,13 +10861,18 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
-                                     const TargetLowering &TLI) {
+                                     TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);

  if (VT.getSizeInBits() != 128)
    return SDValue();

+  // Don't create instructions with illegal types after legalize types has run.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
+    return SDValue();
+
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
@@ -10944,6 +11529,36 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
  return SDValue();
}
+
+static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Want to form PANDN nodes, in the hopes of then easily combining them with
+  // OR and AND nodes to form PBLEND/PSIGN.
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v2i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Check LHS for vnot
+  if (N0.getOpcode() == ISD::XOR &&
+      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::PANDN, DL, VT, N0.getOperand(0), N1);
+
+  // Check RHS for vnot
+  if (N1.getOpcode() == ISD::XOR &&
+      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::PANDN, DL, VT, N1.getOperand(0), N0);
+
+  return SDValue();
+}
+
 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
@@ -10951,12 +11566,99 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();

  EVT VT = N->getValueType(0);
-  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
    return SDValue();

-  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
+
+  // look for psign/blend
+  if (Subtarget->hasSSSE3()) {
+    if (VT == MVT::v2i64) {
+      // Canonicalize pandn to RHS
+      if (N0.getOpcode() == X86ISD::PANDN)
+        std::swap(N0, N1);
+      // or (and (m, x), (pandn m, y))
+      if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::PANDN) {
+        SDValue Mask = N1.getOperand(0);
+        SDValue X = N1.getOperand(1);
+        SDValue Y;
+        if (N0.getOperand(0) == Mask)
+          Y = N0.getOperand(1);
+        if (N0.getOperand(1) == Mask)
+          Y = N0.getOperand(0);
+
+        // Check to see if the mask appeared in both the AND and the PANDN.
+        if (!Y.getNode())
+          return SDValue();
+
+        // Validate that X, Y, and Mask are BITCASTs, and see through them.
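
PerformAndCombine above rewrites one fixed shape. A scalar model of the node it forms and the pattern it recognizes, as a sketch (helper names are illustrative):

#include <cstdint>

// X86ISD::PANDN is "and with not'd first operand".
static uint64_t pandn(uint64_t A, uint64_t B) { return ~A & B; }

// The combine matches (and (xor a, all-ones), b) -- a vnot on either side --
// which computes the same value as pandn(a, b).
static uint64_t vnot_and(uint64_t A, uint64_t B) { return (A ^ ~0ULL) & B; }
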
+        if (Mask.getOpcode() != ISD::BITCAST ||
+            X.getOpcode() != ISD::BITCAST ||
+            Y.getOpcode() != ISD::BITCAST)
+          return SDValue();
+
+        // Look through mask bitcast.
+        Mask = Mask.getOperand(0);
+        EVT MaskVT = Mask.getValueType();
+
+        // Validate that the Mask operand is a vector sra node. The sra node
+        // will be an intrinsic.
+        if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+          return SDValue();
+
+        // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+        // there is no psrai.b
+        switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
+        case Intrinsic::x86_sse2_psrai_w:
+        case Intrinsic::x86_sse2_psrai_d:
+          break;
+        default: return SDValue();
+        }
+
+        // Check that the SRA is all signbits.
+        SDValue SraC = Mask.getOperand(2);
+        unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+        if ((SraAmt + 1) != EltBits)
+          return SDValue();
+
+        DebugLoc DL = N->getDebugLoc();
+
+        // Now we know we at least have a pblendvb with the mask val. See if
+        // we can form a psignb/w/d.
+        // psign = x.type == y.type == mask.type && y = sub(0, x);
+        X = X.getOperand(0);
+        Y = Y.getOperand(0);
+        if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+            ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+            X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){
+          unsigned Opc = 0;
+          switch (EltBits) {
+          case 8: Opc = X86ISD::PSIGNB; break;
+          case 16: Opc = X86ISD::PSIGNW; break;
+          case 32: Opc = X86ISD::PSIGND; break;
+          default: break;
+          }
+          if (Opc) {
+            SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1));
+            return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign);
+          }
+        }
+        // PBLENDVB only available on SSE 4.1
+        if (!Subtarget->hasSSE41())
+          return SDValue();
+
+        X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
+        Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
+        Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
+        Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask);
+        return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
+      }
+    }
+  }
+
+  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
@@ -11175,13 +11877,13 @@ static SDValue PerformBTCombine(SDNode *N,

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
-  if (Op.getOpcode() == ISD::BIT_CONVERT)
+  if (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
-    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}
@@ -11212,19 +11914,106 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
  return SDValue();
}

+// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned X86CC = N->getConstantOperandVal(0);
+  SDValue EFLAG = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
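
With the all-sign-bits guarantee from the psrai check above (every lane is 0 or ~0), the matched (or (and m, a), (pandn m, b)) is a per-lane select, and PSIGN covers the special case where one side is the negation of the other. A lane-level sketch only; operand roles here are illustrative, not the DAG nodes' exact operand order:

#include <cstdint>

// Per-lane select on an all-sign-bits mask: M == -1 picks A, M == 0 picks B.
static int16_t lane_select(int16_t M, int16_t A, int16_t B) {
  return (int16_t)((M & A) | (~M & B));
}

// PBLENDVB consults only bit 7 of each mask byte, which is why any
// all-sign-bits mask still works after everything is bitcast to v16i8.
static uint8_t blendv_byte(uint8_t A, uint8_t B, uint8_t M) {
  return (M & 0x80) ? A : B;   // assumed selection polarity, for illustration
}
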
+ if (X86CC == X86::COND_B) + return DAG.getNode(ISD::AND, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAG), + DAG.getConstant(1, MVT::i8)); + + return SDValue(); +} + +// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS +static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, + X86TargetLowering::DAGCombinerInfo &DCI) { + // If the LHS and RHS of the ADC node are zero, then it can't overflow and + // the result is either zero or one (depending on the input carry bit). + // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. + if (X86::isZeroNode(N->getOperand(0)) && + X86::isZeroNode(N->getOperand(1)) && + // We don't have a good way to replace an EFLAGS use, so only do this when + // dead right now. + SDValue(N, 1).use_empty()) { + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); + SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B,MVT::i8), + N->getOperand(2)), + DAG.getConstant(1, VT)); + return DCI.CombineTo(N, Res1, CarryOut); + } + + return SDValue(); +} + +// fold (add Y, (sete X, 0)) -> adc 0, Y +// (add Y, (setne X, 0)) -> sbb -1, Y +// (sub (sete X, 0), Y) -> sbb 0, Y +// (sub (setne X, 0), Y) -> adc -1, Y +static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) { + DebugLoc DL = N->getDebugLoc(); + + // Look through ZExts. + SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0); + if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) + return SDValue(); + + SDValue SetCC = Ext.getOperand(0); + if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) + return SDValue(); + + X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + SDValue Cmp = SetCC.getOperand(1); + if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || + !X86::isZeroNode(Cmp.getOperand(1)) || + !Cmp.getOperand(0).getValueType().isInteger()) + return SDValue(); + + SDValue CmpOp0 = Cmp.getOperand(0); + SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, + DAG.getConstant(1, CmpOp0.getValueType())); + + SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); + if (CC == X86::COND_NE) + return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); + return DAG.getNode(N->getOpcode() == ISD::SUB ? 
X86ISD::SBB : X86ISD::ADC, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(0, OtherVal.getValueType()), NewCmp); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case ISD::ADD: + case ISD::SUB: return OptimizeConditonalInDecrement(N, DAG); + case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); + case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); case X86ISD::FXOR: @@ -11233,6 +12022,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); + case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); case X86ISD::SHUFPS: // Handle all target specific shuffles case X86ISD::SHUFPD: case X86ISD::PALIGN: @@ -11248,6 +12038,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PUNPCKLQDQ: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPS: + case X86ISD::VUNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: @@ -11255,7 +12049,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); + case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI); } return SDValue(); @@ -11362,38 +12156,8 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support //===----------------------------------------------------------------------===// -static bool LowerToBSwap(CallInst *CI) { - // FIXME: this should verify that we are targetting a 486 or better. If not, - // we will turn this bswap into something that will be lowered to logical ops - // instead of emitting the bswap asm. For now, we don't support 486 or lower - // so don't worry about this. - - // Verify this is a simple bswap. - if (CI->getNumArgOperands() != 1 || - CI->getType() != CI->getArgOperand(0)->getType() || - !CI->getType()->isIntegerTy()) - return false; - - const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - - // Okay, we can do this xform, do so now. 
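
The arithmetic behind the OptimizeConditonalInDecrement fold above can be checked directly: cmp X,1 sets the carry exactly when X == 0, and the emitted adc/sbb then folds the 0/1 comparison result into the other operand. A self-contained check (names are illustrative):

#include <cassert>
#include <cstdint>

// (add Y, (setne X, 0)) -> sbb -1, Y:
// Y - (-1) - CF == Y + 1 - (X == 0) == Y + (X != 0).
static uint32_t folded_add_setne(uint32_t X, uint32_t Y) {
  uint32_t CF = X < 1u ? 1u : 0u;    // carry out of the new CMP X, 1
  return Y - (uint32_t)-1 - CF;      // the emitted X86ISD::SBB
}

int main() {
  for (uint32_t X : {0u, 1u, 7u})
    for (uint32_t Y : {0u, 5u, 0xFFFFFFFFu})
      assert(folded_add_setne(X, Y) == Y + (X != 0));
}
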
-  const Type *Tys[] = { Ty };
-  Module *M = CI->getParent()->getParent()->getParent();
-  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
-
-  Value *Op = CI->getArgOperand(0);
-  Op = CallInst::Create(Int, Op, CI->getName(), CI);
-
-  CI->replaceAllUsesWith(Op);
-  CI->eraseFromParent();
-  return true;
-}
-
 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
-  InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();
@@ -11408,6 +12172,10 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.

+    // FIXME: this should verify that we are targeting a 486 or better. If not,
+    // we will turn this bswap into something that will be lowered to logical ops
+    // instead of emitting the bswap asm. For now, we don't support 486 or lower
+    // so don't worry about this.
    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
-      return LowerToBSwap(CI);
+      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      if (!Ty || Ty->getBitWidth() % 16 != 0)
+        return false;
+      return IntrinsicLowering::LowerToByteSwap(CI);
    }
    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      AsmPieces.clear();
-      const std::string &Constraints = IA->getConstraintString();
-      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
+      const std::string &ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
-        return LowerToBSwap(CI);
+        const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+        if (!Ty || Ty->getBitWidth() % 16 != 0)
+          return false;
+        return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
@@ -11455,36 +12229,45 @@
      if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
          Words[2] == "${0:w}") {
        AsmPieces.clear();
-        const std::string &Constraints = IA->getConstraintString();
-        SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
+        const std::string &ConstraintsStr = IA->getConstraintString();
+        SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
        std::sort(AsmPieces.begin(), AsmPieces.end());
        if (AsmPieces.size() == 4 &&
            AsmPieces[0] == "~{cc}" &&
            AsmPieces[1] == "~{dirflag}" &&
            AsmPieces[2] == "~{flags}" &&
            AsmPieces[3] == "~{fpsr}") {
-          return LowerToBSwap(CI);
+          const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+          if (!Ty || Ty->getBitWidth() % 16 != 0)
+            return false;
+          return IntrinsicLowering::LowerToByteSwap(CI);
        }
      }
    }
  }
-
-  if (CI->getType()->isIntegerTy(64) &&
-      Constraints.size() >= 2 &&
-      Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
-      Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
-    // bswap %eax / bswap %edx / xchgl %eax, %edx ->
llvm.bswap.i64 - SmallVector<StringRef, 4> Words; - SplitString(AsmPieces[0], Words, " \t"); - if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { - Words.clear(); - SplitString(AsmPieces[1], Words, " \t"); - if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { + + if (CI->getType()->isIntegerTy(64)) { + InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); + if (Constraints.size() >= 2 && + Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && + Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { + // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 + SmallVector<StringRef, 4> Words; + SplitString(AsmPieces[0], Words, " \t"); + if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { Words.clear(); - SplitString(AsmPieces[2], Words, " \t,"); - if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && - Words[2] == "%edx") { - return LowerToBSwap(CI); + SplitString(AsmPieces[1], Words, " \t"); + if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { + Words.clear(); + SplitString(AsmPieces[2], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && + Words[2] == "%edx") { + const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + return IntrinsicLowering::LowerToByteSwap(CI); + } } } } @@ -11575,12 +12358,12 @@ TargetLowering::ConstraintWeight weight = CW_SpecificReg; break; case 'y': - if (type->isX86_MMXTy() && !DisableMMX && Subtarget->hasMMX()) + if (type->isX86_MMXTy() && Subtarget->hasMMX()) weight = CW_SpecificReg; break; case 'x': case 'Y': - if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) + if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) weight = CW_Register; break; case 'I': @@ -11650,9 +12433,9 @@ LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { - if (Subtarget->hasSSE2()) + if (Subtarget->hasXMMInt()) return "Y"; - if (Subtarget->hasSSE1()) + if (Subtarget->hasXMM()) return "x"; } @@ -11882,10 +12665,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (!Subtarget->hasMMX()) break; return std::make_pair(0U, X86::VR64RegisterClass); case 'Y': // SSE_REGS if SSE2 allowed - if (!Subtarget->hasSSE2()) break; + if (!Subtarget->hasXMMInt()) break; // FALL THROUGH. case 'x': // SSE_REGS if SSE1 allowed - if (!Subtarget->hasSSE1()) break; + if (!Subtarget->hasXMM()) break; switch (VT.getSimpleVT().SimpleTy) { default: break; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 776299e..6ec4a7d 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -86,13 +86,13 @@ namespace llvm { /// X86 bit-test instructions. BT, - /// X86 SetCC. Operand 0 is condition code, and operand 1 is the flag - /// operand produced by a CMP instruction. + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. SETCC, // Same as SETCC except it's materialized with a sbb and the value is all // one's or all zero's. - SETCC_CARRY, + SETCC_CARRY, // R = carry_bit ? ~0 : 0 /// X86 conditional moves. Operand 0 and operand 1 are the two values /// to select from. 
Operand 2 is the condition code, and operand 3 is the @@ -160,6 +160,15 @@ namespace llvm { /// PSHUFB - Shuffle 16 8-bit values within a vector. PSHUFB, + /// PANDN - and with not'd value. + PANDN, + + /// PSIGNB/W/D - Copy integer sign. + PSIGNB, PSIGNW, PSIGND, + + /// PBLENDVB - Variable blend + PBLENDVB, + /// FMAX, FMIN - Floating point max and min. /// FMAX, FMIN, @@ -200,10 +209,12 @@ namespace llvm { PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ, PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ, - // ADD, SUB, SMUL, UMUL, etc. - Arithmetic operations with FLAGS results. - ADD, SUB, SMUL, UMUL, + // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. + ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, + UMUL, // LOW, HI, FLAGS = umul LHS, RHS + // MUL_IMM - X86 specific multiply by immediate. MUL_IMM, @@ -237,6 +248,10 @@ namespace llvm { MOVSS, UNPCKLPS, UNPCKLPD, + VUNPCKLPS, + VUNPCKLPD, + VUNPCKLPSY, + VUNPCKLPDY, UNPCKHPS, UNPCKHPD, PUNPCKLBW, @@ -256,6 +271,12 @@ namespace llvm { // WIN_ALLOCA - Windows's _chkstk call to do stack probing. WIN_ALLOCA, + // Memory barrier + MEMBARRIER, + MFENCE, + SFENCE, + LFENCE, + // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG, // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG - // Atomic 64-bit binary operations. @@ -267,12 +288,6 @@ namespace llvm { ATOMNAND64_DAG, ATOMSWAP64_DAG, - // Memory barrier - MEMBARRIER, - MFENCE, - SFENCE, - LFENCE, - // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap. LCMPXCHG_DAG, LCMPXCHG8_DAG, @@ -397,6 +412,16 @@ namespace llvm { /// specifies a shuffle of elements that is suitable for input to PALIGNR. bool isPALIGNRMask(ShuffleVectorSDNode *N); + /// isVEXTRACTF128Index - Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF128. + bool isVEXTRACTF128Index(SDNode *N); + + /// isVINSERTF128Index - Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF128. + bool isVINSERTF128Index(SDNode *N); + /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* /// instructions. @@ -414,6 +439,16 @@ namespace llvm { /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. unsigned getShufflePALIGNRImmediate(SDNode *N); + /// getExtractVEXTRACTF128Immediate - Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF128 instructions. + unsigned getExtractVEXTRACTF128Immediate(SDNode *N); + + /// getInsertVINSERTF128Immediate - Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF128 instructions. + unsigned getInsertVINSERTF128Immediate(SDNode *N); + /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. 
bool isZeroNode(SDValue Elt); @@ -432,6 +467,8 @@ namespace llvm { virtual unsigned getJumpTableEncoding() const; + virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i8; } + virtual const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid, @@ -730,6 +767,8 @@ namespace llvm { SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, @@ -740,7 +779,7 @@ namespace llvm { SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const; - SDValue LowerBIT_CONVERT(SDValue op, SelectionDAG &DAG) const; + SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; @@ -805,6 +844,8 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; + virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, @@ -821,6 +862,13 @@ namespace llvm { MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB, unsigned argNum, bool inMem) const; + /// Utility functions to emit monitor and mwait instructions. These + /// need to make sure that the arguments to the intrinsic are in the + /// correct registers. + MachineBasicBlock *EmitMonitor(MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const; + /// Utility function to emit atomic bitwise operations (and, or, xor). /// It takes the bitwise instruction to expand, the associated machine basic /// block, and the associated X86 opcodes for reg/reg and reg/imm. @@ -871,6 +919,9 @@ namespace llvm { MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI, + MachineBasicBlock *BB) const; + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent, for use with the given x86 condition code. 
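
The new getShiftAmountTy override above returns MVT::i8 because x86 variable shifts take their count in CL and ignore the high count bits, so nothing is lost by the narrow type. A one-line model of the 64-bit case, as a sketch:

#include <cstdint>

// x86 masks a 64-bit shift count to its low 6 bits (5 bits for 32-bit ops),
// so an i8 shift-amount type is always wide enough.
static uint64_t shl64_model(uint64_t V, uint8_t CL) { return V << (CL & 63); }
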
SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f82e1c6..f0ea068 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -60,11 +60,12 @@ def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), - "mul{l}\t$src", - []>; // EAX,EDX = EAX*GR32 + "mul{l}\t$src", // EAX,EDX = EAX*GR32 + [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), - "mul{q}\t$src", []>; // RAX,RDX = RAX*GR64 + "mul{q}\t$src", // RAX,RDX = RAX*GR64 + [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>; let Defs = [AL,EFLAGS,AX], Uses = [AL] in def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), @@ -630,6 +631,15 @@ class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; +// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result, and has EFLAGS as input. +class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, + EFLAGS))]>; + // BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding). class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> : ITy<opcode, MRMSrcReg, typeinfo, @@ -668,6 +678,14 @@ class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; +// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]". +class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), + EFLAGS))]>; + // BinOpRI - Instructions like "add reg, reg, imm". class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f, dag outlist, list<dag> pattern> @@ -698,6 +716,14 @@ class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; +// BinOpRI_RFF - Instructions like "adc reg, reg, imm". +class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, + EFLAGS))]>; + // BinOpRI8 - Instructions like "add reg, reg, imm8". class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f, dag outlist, list<dag> pattern> @@ -728,6 +754,14 @@ class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; +// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8". 
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, + EFLAGS))]>; + // BinOpMR - Instructions like "add [mem], reg". class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, list<dag> pattern> @@ -742,6 +776,14 @@ class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst), (implicit EFLAGS)]>; +// BinOpMR_RMW_FF - Instructions like "adc [mem], reg". +class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS), + addr:$dst), + (implicit EFLAGS)]>; + // BinOpMR_F - Instructions like "cmp [mem], reg". class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> @@ -765,6 +807,14 @@ class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo, typeinfo.ImmOperator:$src), addr:$dst), (implicit EFLAGS)]>; +// BinOpMI_RMW_FF - Instructions like "adc [mem], imm". +class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)]>; + // BinOpMI_F - Instructions like "cmp [mem], imm". class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo, SDPatternOperator opnode, Format f, bits<8> opcode = 0x80> @@ -790,6 +840,14 @@ class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, typeinfo.Imm8Operator:$src), addr:$dst), (implicit EFLAGS)]>; +// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". +class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)]>; + // BinOpMI8_F - Instructions like "cmp [mem], imm8". class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> @@ -875,22 +933,24 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } -/// ArithBinOp_R - This is an arithmetic binary operator where the pattern is -/// defined with "(set GPR:$dst, (...". It would be really nice to find a way -/// to factor this with the other ArithBinOp_*. +/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and +/// SBB. 
/// -multiclass ArithBinOp_R<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, - string mnemonic, Format RegMRM, Format MemMRM, - SDNode opnode, - bit CommutableRR, bit ConvertibleToThreeAddress> { +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, bit CommutableRR, + bit ConvertibleToThreeAddress> { let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def #NAME#8rr : BinOpRR_R<BaseOpc, mnemonic, Xi8 , opnode>; - def #NAME#16rr : BinOpRR_R<BaseOpc, mnemonic, Xi16, opnode>; - def #NAME#32rr : BinOpRR_R<BaseOpc, mnemonic, Xi32, opnode>; - def #NAME#64rr : BinOpRR_R<BaseOpc, mnemonic, Xi64, opnode>; + def #NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; + def #NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def #NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def #NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; } // isCommutable def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; @@ -898,40 +958,40 @@ multiclass ArithBinOp_R<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; - def #NAME#8rm : BinOpRM_R<BaseOpc2, mnemonic, Xi8 , opnode>; - def #NAME#16rm : BinOpRM_R<BaseOpc2, mnemonic, Xi16, opnode>; - def #NAME#32rm : BinOpRM_R<BaseOpc2, mnemonic, Xi32, opnode>; - def #NAME#64rm : BinOpRM_R<BaseOpc2, mnemonic, Xi64, opnode>; + def #NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; + def #NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; + def #NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; + def #NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
- def #NAME#16ri8 : BinOpRI8_R<0x82, mnemonic, Xi16, opnode, RegMRM>; - def #NAME#32ri8 : BinOpRI8_R<0x82, mnemonic, Xi32, opnode, RegMRM>; - def #NAME#64ri8 : BinOpRI8_R<0x82, mnemonic, Xi64, opnode, RegMRM>; - - def #NAME#8ri : BinOpRI_R<0x80, mnemonic, Xi8 , opnode, RegMRM>; - def #NAME#16ri : BinOpRI_R<0x80, mnemonic, Xi16, opnode, RegMRM>; - def #NAME#32ri : BinOpRI_R<0x80, mnemonic, Xi32, opnode, RegMRM>; - def #NAME#64ri32: BinOpRI_R<0x80, mnemonic, Xi64, opnode, RegMRM>; + def #NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; + def #NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; + def #NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def #NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def #NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def #NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def #NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; } } // Constraints = "$src1 = $dst" - def #NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; - def #NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; - def #NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; - def #NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + def #NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; + def #NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; + def #NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; + def #NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def #NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; + def #NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def #NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def #NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; - def #NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>; - def #NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; - def #NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; - def #NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; + def #NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>; + def #NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def #NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def #NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL>; def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX>; @@ -1017,13 +1077,12 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, // Arithmetic. let Uses = [EFLAGS] in { - // FIXME: Delete ArithBinOp_R if these switch off adde/sube. 
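
A scalar model of what the new _RFF/_RMW_FF forms compute, e.g. BinOpMI_RMW_FF for "adcl $imm, (mem)": load, add with the incoming carry, store, and produce the outgoing carry. A sketch with illustrative names:

#include <cstdint>

// Read-modify-write adc: it both consumes EFLAGS (carry in) and defines it
// (carry out), which is exactly what the EFLAGS-in/EFLAGS-out patterns above
// express at the SelectionDAG level.
static bool adc_mem_imm(uint32_t *Mem, uint32_t Imm, bool CFIn) {
  uint64_t Wide = (uint64_t)*Mem + Imm + (CFIn ? 1 : 0);
  *Mem = (uint32_t)Wide;
  return (Wide >> 32) != 0;
}
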
- defm ADC : ArithBinOp_R<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, adde, 1, 0>; - defm SBB : ArithBinOp_R<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, sube, 0, 0>; + defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, + 1, 0>; + defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, + 0, 0>; } - - defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index b299b90..4c915d9 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -193,9 +193,26 @@ def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "", } // isCodeGenOnly +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C64r)>; +def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and +// will be eliminated and that the sbb can be extended up to a wider type. When +// this happens, it is great. However, if we are left with an 8-bit sbb and an +// and, we might as well just match it as a setb. +def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), + (SETBr)>; //===----------------------------------------------------------------------===// // String Pseudo Instructions @@ -244,8 +261,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], Uses = [ESP] in def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), - "leal\t$sym, %eax; " - "call\t___tls_get_addr@PLT", + "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, Requires<[In32BitMode]>; @@ -259,11 +275,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], Uses = [RSP] in def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), - ".byte\t0x66; " - "leaq\t$sym(%rip), %rdi; " - ".word\t0x6666; " - "rex64; " - "call\t__tls_get_addr@PLT", + "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, Requires<[In64BitMode]>; @@ -271,7 +283,7 @@ def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), // For i386, the address of the thunk is passed on the stack, on return the // address of the variable is in %eax. %ecx is trashed during the function // call. All other registers are preserved. -let Defs = [EAX, ECX], +let Defs = [EAX, ECX, EFLAGS], Uses = [ESP], usesCustomInserter = 1 in def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), @@ -281,8 +293,8 @@ def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), // For x86_64, the address of the thunk is passed in %rdi, on return // the address of the variable is in %rax. All other registers are preserved. 
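
Why the setb canonicalization noted above is sound, in scalar form (a sketch; the helper name is illustrative):

#include <cstdint>

// "sbb %r, %r" computes r - r - CF = -CF: all-ones when the carry is set,
// zero otherwise. Masking with 1 recovers setb's 0/1 result, and when the
// mask is eliminated by a later combine, the wider all-ones form is free.
static uint8_t setb_via_sbb(bool CF) {
  uint8_t Sbb = CF ? 0xFF : 0x00;    // the X86setcc_c / SETB_Cxr value
  return Sbb & 1;                    // the (and (sbb r,r), 1) pattern
}
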
-let Defs = [RAX], - Uses = [RDI], +let Defs = [RAX, EFLAGS], + Uses = [RSP, RDI], usesCustomInserter = 1 in def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLSCall_64", @@ -837,38 +849,38 @@ def : Pat<(X86call (i64 texternalsym:$dst)), // tailcall stuff def : Pat<(X86tcret GR32_TC:$dst, imm:$off), (TCRETURNri GR32_TC:$dst, imm:$off)>, - Requires<[In32BitMode]>; + Requires<[In32BitMode]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a // callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[In32BitMode, IsNotPIC]>; + Requires<[In32BitMode, IsNotPIC]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, - Requires<[In32BitMode]>; + Requires<[In32BitMode]>; def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, - Requires<[In32BitMode]>; + Requires<[In32BitMode]>; -def : Pat<(X86tcret GR64_TC:$dst, imm:$off), - (TCRETURNri64 GR64_TC:$dst, imm:$off)>, - Requires<[In64BitMode]>; +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode]>; def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode]>; def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), (TCRETURNdi64 texternalsym:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode]>; // Normal calls, with various flavors of addresses. def : Pat<(X86call (i32 tglobaladdr:$dst)), @@ -878,43 +890,6 @@ def : Pat<(X86call (i32 texternalsym:$dst)), def : Pat<(X86call (i32 imm:$dst)), (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; -// X86 specific add which produces a flag. -def : Pat<(addc GR32:$src1, GR32:$src2), - (ADD32rr GR32:$src1, GR32:$src2)>; -def : Pat<(addc GR32:$src1, (load addr:$src2)), - (ADD32rm GR32:$src1, addr:$src2)>; -def : Pat<(addc GR32:$src1, imm:$src2), - (ADD32ri GR32:$src1, imm:$src2)>; -def : Pat<(addc GR32:$src1, i32immSExt8:$src2), - (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; - -def : Pat<(addc GR64:$src1, GR64:$src2), - (ADD64rr GR64:$src1, GR64:$src2)>; -def : Pat<(addc GR64:$src1, (load addr:$src2)), - (ADD64rm GR64:$src1, addr:$src2)>; -def : Pat<(addc GR64:$src1, i64immSExt8:$src2), - (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(addc GR64:$src1, i64immSExt32:$src2), - (ADD64ri32 GR64:$src1, imm:$src2)>; - -def : Pat<(subc GR32:$src1, GR32:$src2), - (SUB32rr GR32:$src1, GR32:$src2)>; -def : Pat<(subc GR32:$src1, (load addr:$src2)), - (SUB32rm GR32:$src1, addr:$src2)>; -def : Pat<(subc GR32:$src1, imm:$src2), - (SUB32ri GR32:$src1, imm:$src2)>; -def : Pat<(subc GR32:$src1, i32immSExt8:$src2), - (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; - -def : Pat<(subc GR64:$src1, GR64:$src2), - (SUB64rr GR64:$src1, GR64:$src2)>; -def : Pat<(subc GR64:$src1, (load addr:$src2)), - (SUB64rm GR64:$src1, addr:$src2)>; -def : Pat<(subc GR64:$src1, i64immSExt8:$src2), - (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; -def : Pat<(subc GR64:$src1, imm:$src2), - (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; - // Comparisons. 
// TEST R,R is smaller than CMP R,0 @@ -1043,8 +1018,9 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. let AddedComplexity = 5 in { // Try this before the selecting to OR -let isCommutable = 1, isConvertibleToThreeAddress = 1, +let isConvertibleToThreeAddress = 1, Constraints = "$src1 = $dst", Defs = [EFLAGS] in { +let isCommutable = 1 in { def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "", // orw/addw REG, REG [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; @@ -1054,6 +1030,7 @@ def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "", // orq/addq REG, REG [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>; +} // isCommutable // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. @@ -1647,4 +1624,3 @@ def : Pat<(and GR64:$src1, i64immSExt8:$src2), (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; def : Pat<(and GR64:$src1, i64immSExt32:$src2), (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; - diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index ee20ebc..77f4725 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -1,10 +1,10 @@ //===- X86InstrControl.td - Control Flow Instructions ------*- tablegen -*-===// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file describes the X86 jump, return, call, and related instructions. 
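
The or_is_add PatFrag defined earlier rests on a simple identity: when two operands share no set bits, OR and ADD produce the same value, so the ADDxxrr_DB pseudos can later pick whichever instruction encodes better (ADD is three-addressable via LEA). A quick check:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Hi = 0xABCD0000, Lo = 0x1234;   // disjoint bit ranges
  assert((Hi & Lo) == 0);                  // or_is_add's precondition
  assert((Hi | Lo) == Hi + Lo);            // the identity it relies on
}
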
@@ -26,7 +26,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
                    [(X86retflag timm:$amt)]>;
  def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
                   "retw\t$amt",
-                   [(X86retflag timm:$amt)]>, OpSize;
+                   []>, OpSize;
  def LRETL : I <0xCB, RawFrm, (outs), (ins),
                 "lretl", []>;
  def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
@@ -43,7 +43,7 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
                     "jmp\t$dst", [(br bb:$dst)]>;
  def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
                       "jmp\t$dst", []>;
-  def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+  def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
                       "jmp{q}\t$dst", []>;
}
@@ -108,16 +108,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
  def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
                 [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>;
-  def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
+  def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
                         (ins i16imm:$off, i16imm:$seg),
                         "ljmp{w}\t{$seg, $off|$off, $seg}", []>, OpSize;
  def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
                         (ins i32imm:$off, i16imm:$seg),
-                         "ljmp{l}\t{$seg, $off|$off, $seg}", []>;
+                         "ljmp{l}\t{$seg, $off|$off, $seg}", []>;
  def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
                    "ljmp{q}\t{*}$dst", []>;
-  def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+  def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
                    "ljmp{w}\t{*}$dst", []>, OpSize;
  def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
                    "ljmp{l}\t{*}$dst", []>;
@@ -152,14 +152,14 @@ let isCall = 1 in
  def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
                  "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
                  Requires<[In32BitMode]>;
-
-  def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
+
+  def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
                          (ins i16imm:$off, i16imm:$seg),
                          "lcall{w}\t{$seg, $off|$off, $seg}", []>, OpSize;
  def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
                          (ins i32imm:$off, i16imm:$seg),
                          "lcall{l}\t{$seg, $off|$off, $seg}", []>;
-
+
  def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
                     "lcall{w}\t{*}$dst", []>, OpSize;
  def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
@@ -182,16 +182,13 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
    Uses = [ESP] in {
-  def TCRETURNdi : I<0, Pseudo, (outs),
-                     (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
-                     "#TC_RETURN $dst $offset", []>;
-  def TCRETURNri : I<0, Pseudo, (outs),
-                     (ins GR32_TC:$dst, i32imm:$offset, variable_ops),
-                     "#TC_RETURN $dst $offset", []>;
+  def TCRETURNdi : PseudoI<(outs),
+                     (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>;
+  def TCRETURNri : PseudoI<(outs),
+                     (ins GR32_TC:$dst, i32imm:$offset, variable_ops), []>;
  let mayLoad = 1 in
-  def TCRETURNmi : I<0, Pseudo, (outs),
-                     (ins i32mem_TC:$dst, i32imm:$offset, variable_ops),
-                     "#TC_RETURN $dst $offset", []>;
+  def TCRETURNmi : PseudoI<(outs),
+                     (ins i32mem_TC:$dst, i32imm:$offset, variable_ops), []>;

  // FIXME: These should be pseudo instructions that are lowered when going to
  // mcinst.
@@ -199,7 +196,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, (ins i32imm_pcrel:$dst, variable_ops), "jmp\t$dst # TAILCALL", []>; - def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops), + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops), "", []>; // FIXME: Remove encoding when JIT is dead. let mayLoad = 1 in def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops), @@ -221,7 +218,7 @@ let isCall = 1 in XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], Uses = [RSP] in { - + // NOTE: this pattern doesn't match "X86call imm", because we do not know // that the offset between an arbitrary immediate and the call will fit in // the 32-bit pcrel field that we have. @@ -235,12 +232,12 @@ let isCall = 1 in def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>, Requires<[In64BitMode, NotWin64]>; - + def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), "lcall{q}\t{*}$dst", []>; } - // FIXME: We need to teach codegen about single list of call-clobbered + // FIXME: We need to teach codegen about single list of call-clobbered // registers. let isCall = 1, isCodeGenOnly = 1 in // All calls clobber the non-callee saved registers. RSP is marked as @@ -259,40 +256,39 @@ let isCall = 1, isCodeGenOnly = 1 in def WINCALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), "call{q}\t{*}$dst", [(X86call GR64:$dst)]>, Requires<[IsWin64]>; - def WINCALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst,variable_ops), + def WINCALL64m : I<0xFF, MRM2m, (outs), + (ins i64mem:$dst,variable_ops), "call{q}\t{*}$dst", - [(X86call (loadi64 addr:$dst))]>, + [(X86call (loadi64 addr:$dst))]>, Requires<[IsWin64]>; } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1 in - let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + // AMD64 cc clobbers RSI, RDI, XMM6-XMM15. 
+ let Defs = [RAX, RCX, RDX, R8, R9, R10, R11, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, - XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, - XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - Uses = [RSP] in { - def TCRETURNdi64 : I<0, Pseudo, (outs), - (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops), - "#TC_RETURN $dst $offset", []>; - def TCRETURNri64 : I<0, Pseudo, (outs), (ins GR64_TC:$dst, i32imm:$offset, - variable_ops), - "#TC_RETURN $dst $offset", []>; + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS], + Uses = [RSP], + usesCustomInserter = 1 in { + def TCRETURNdi64 : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops), + []>; + def TCRETURNri64 : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset, variable_ops), []>; let mayLoad = 1 in - def TCRETURNmi64 : I<0, Pseudo, (outs), - (ins i64mem_TC:$dst, i32imm:$offset, variable_ops), - "#TC_RETURN $dst $offset", []>; + def TCRETURNmi64 : PseudoI<(outs), + (ins i64mem_TC:$dst, i32imm:$offset, variable_ops), []>; def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst, variable_ops), "jmp\t$dst # TAILCALL", []>; - def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins GR64_TC:$dst, variable_ops), + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops), "jmp{q}\t{*}$dst # TAILCALL", []>; let mayLoad = 1 in def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops), "jmp{q}\t{*}$dst # TAILCALL", []>; } - diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 9124f90..b506f5e 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -34,12 +34,12 @@ def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, - [SDNPHasChain, SDNPInFlag, SDNPMayStore, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, SDNPMemOperand]>; def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, - [SDNPHasChain, SDNPOutFlag, SDNPMayLoad, + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, SDNPMemOperand]>; def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; @@ -73,41 +73,23 @@ def fpimmneg1 : PatLeaf<(fpimm), [{ // Some 'special' instructions let usesCustomInserter = 1 in { // Expanded after instruction selection. 
- def FP32_TO_INT16_IN_MEM : I<0, Pseudo, - (outs), (ins i16mem:$dst, RFP32:$src), - "##FP32_TO_INT16_IN_MEM PSEUDO!", + def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; - def FP32_TO_INT32_IN_MEM : I<0, Pseudo, - (outs), (ins i32mem:$dst, RFP32:$src), - "##FP32_TO_INT32_IN_MEM PSEUDO!", + def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>; - def FP32_TO_INT64_IN_MEM : I<0, Pseudo, - (outs), (ins i64mem:$dst, RFP32:$src), - "##FP32_TO_INT64_IN_MEM PSEUDO!", + def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src), [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>; - def FP64_TO_INT16_IN_MEM : I<0, Pseudo, - (outs), (ins i16mem:$dst, RFP64:$src), - "##FP64_TO_INT16_IN_MEM PSEUDO!", + def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src), [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>; - def FP64_TO_INT32_IN_MEM : I<0, Pseudo, - (outs), (ins i32mem:$dst, RFP64:$src), - "##FP64_TO_INT32_IN_MEM PSEUDO!", + def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src), [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>; - def FP64_TO_INT64_IN_MEM : I<0, Pseudo, - (outs), (ins i64mem:$dst, RFP64:$src), - "##FP64_TO_INT64_IN_MEM PSEUDO!", + def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src), [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>; - def FP80_TO_INT16_IN_MEM : I<0, Pseudo, - (outs), (ins i16mem:$dst, RFP80:$src), - "##FP80_TO_INT16_IN_MEM PSEUDO!", + def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src), [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>; - def FP80_TO_INT32_IN_MEM : I<0, Pseudo, - (outs), (ins i32mem:$dst, RFP80:$src), - "##FP80_TO_INT32_IN_MEM PSEUDO!", + def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src), [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; - def FP80_TO_INT64_IN_MEM : I<0, Pseudo, - (outs), (ins i64mem:$dst, RFP80:$src), - "##FP80_TO_INT64_IN_MEM PSEUDO!", + def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src), [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; } @@ -643,8 +625,12 @@ def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE; def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), "fxsave\t$dst", []>, TB; +def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), + "fxsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), "fxrstor\t$src", []>, TB; +def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstorq\t$src", []>, TB, REX_W, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index a440359..0660072 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -41,6 +41,8 @@ def MRM_F8 : Format<41>; def MRM_F9 : Format<42>; def RawFrmImm8 : Format<43>; def RawFrmImm16 : Format<44>; +def MRM_D0 : Format<45>; +def MRM_D1 : Format<46>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -135,17 +137,17 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix? bits<4> Prefix = 0; // Which prefix byte does this inst have? 
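
The FP*_TO_INT*_IN_MEM pseudos rewritten as PseudoI above are expanded after selection into an x87 sequence that temporarily forces the FPU rounding control to round-toward-zero, because FIST honors the control word while C-style float-to-int conversion must truncate. A rough user-level analogue of that save/set/restore dance, using the C99 floating-point environment (a sketch, not the expanded machine code):

    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    // Convert like FP*_TO_INT*_IN_MEM: save rounding mode, truncate, restore.
    static long long truncating_convert(double x) {
      int OldMode = std::fegetround();  // analogous to FNSTCW (save control word)
      std::fesetround(FE_TOWARDZERO);   // analogous to FLDCW with RC = truncate
      long long r = std::llrint(x);     // analogous to FISTP (honors rounding mode)
      std::fesetround(OldMode);         // restore the caller's control word
      return r;
    }

    int main() {
      std::printf("%lld\n", truncating_convert(2.7));   // prints 2, not 3
      std::printf("%lld\n", truncating_convert(-2.7));  // prints -2, not -3
    }
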
- bit hasREX_WPrefix = 0; // Does this inst requires the REX.W prefix? + bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? bits<2> SegOvrBits = 0; // Segment override prefix. Domain ExeDomain = d; - bit hasVEXPrefix = 0; // Does this inst requires a VEX prefix? + bit hasVEXPrefix = 0; // Does this inst require a VEX prefix? bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field? - bit hasVEX_4VPrefix = 0; // Does this inst requires the VEX.VVVV field? - bit hasVEX_i8ImmReg = 0; // Does this inst requires the last source register + bit hasVEX_4VPrefix = 0; // Does this inst require the VEX.VVVV field? + bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register // to be encoded in a immediate field? - bit hasVEX_L = 0; // Does this inst uses large (256-bit) registers? + bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? // TSFlags layout should be kept in sync with X86InstrInfo.h. @@ -168,6 +170,11 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{37} = has3DNow0F0FOpcode; } +class PseudoI<dag oops, dag iops, list<dag> pattern> + : X86Inst<0, Pseudo, NoImm, oops, iops, ""> { + let Pattern = pattern; +} + class I<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, Domain d = GenericDomain> : X86Inst<o, f, NoImm, outs, ins, asm, d> { diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 70c3d07..3cbfac1 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -43,6 +43,21 @@ def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; +def X86pandn : SDNode<"X86ISD::PANDN", + SDTypeProfile<1, 2, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psignb : SDNode<"X86ISD::PSIGNB", + SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psignw : SDNode<"X86ISD::PSIGNW", + SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psignd : SDNode<"X86ISD::PSIGND", + SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86pblendv : SDNode<"X86ISD::PBLENDVB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; def X86pextrw : SDNode<"X86ISD::PEXTRW", @@ -117,6 +132,8 @@ def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>; +def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>; def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; @@ -327,6 +344,18 @@ def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{ return getI8Imm(X86::getShufflePALIGNRImmediate(N)); }]>; +// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index +// to VEXTRACTF128 imm. 
+def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N)); +}]>; + +// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to +// VINSERTF128 imm. +def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERTF128Immediate(N)); +}]>; + def splat_lo : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); @@ -417,3 +446,16 @@ def palign : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N)); }], SHUFFLE_get_palign_imm>; + +def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACTF128Index(N); +}], EXTRACT_get_vextractf128_imm>; + +def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERTF128Index(N); +}], INSERT_get_vinsertf128_imm>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 79d9872..21df57c 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -58,7 +58,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) TB_NOT_REVERSABLE = 1U << 31, TB_FLAGS = TB_NOT_REVERSABLE }; - + static const unsigned OpTbl2Addr[][2] = { { X86::ADC32ri, X86::ADC32mi }, { X86::ADC32ri8, X86::ADC32mi8 }, @@ -231,16 +231,16 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) unsigned MemOp = OpTbl2Addr[i][1] & ~TB_FLAGS; assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U); - + // If this is not a reversable operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE) continue; - + // Index 0, folded load and store, no alignment requirement. unsigned AuxInfo = 0 | (1 << 4) | (1 << 5); - - assert(!MemOp2RegOpTable.count(MemOp) && + + assert(!MemOp2RegOpTable.count(MemOp) && "Duplicated entries in unfolding maps?"); MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo); } @@ -334,12 +334,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) unsigned Align = OpTbl0[i][3]; assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?"); RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align); - + // If this is not a reversable operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl0[i][1] & TB_NOT_REVERSABLE) continue; - + // Index 0, folded load or store. 
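
The EXTRACT_get_vextractf128_imm / INSERT_get_vinsertf128_imm xforms added above convert a subvector element index into the VEXTRACTF128/VINSERTF128 lane immediate. Judging by the helper names, the immediate selects which 128-bit half of the 256-bit register is touched; a sketch of that index math, assuming 128-bit-aligned extracts (the function name is mine, not LLVM's):

    #include <cassert>

    // Turn an extract_subvector element index into a VEXTRACTF128 lane
    // immediate: which 128-bit half of the 256-bit source is extracted.
    static unsigned vextractf128Imm(unsigned EltIndex, unsigned EltBits) {
      unsigned EltsPerLane = 128 / EltBits;
      assert(EltIndex % EltsPerLane == 0 && "extract must be 128-bit aligned");
      return EltIndex / EltsPerLane; // 0 = low half, 1 = high half
    }

    int main() {
      assert(vextractf128Imm(0, 32) == 0); // v8f32 elements 0..3
      assert(vextractf128Imm(4, 32) == 1); // v8f32 elements 4..7
      assert(vextractf128Imm(2, 64) == 1); // v4f64 elements 2..3
    }
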
unsigned AuxInfo = 0 | (FoldedLoad << 4) | ((FoldedLoad^1) << 5); assert(!MemOp2RegOpTable.count(MemOp) && "Duplicated entries?"); @@ -369,8 +369,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, 16 }, @@ -461,12 +459,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) unsigned Align = OpTbl1[i][2]; assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries"); RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align); - + // If this is not a reversable operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl1[i][1] & TB_NOT_REVERSABLE) continue; - + // Index 1, folded load unsigned AuxInfo = 1 | (1 << 4); assert(!MemOp2RegOpTable.count(MemOp) && "Duplicate entries"); @@ -568,6 +566,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::IMUL16rr, X86::IMUL16rm, 0 }, { X86::IMUL32rr, X86::IMUL32rm, 0 }, { X86::IMUL64rr, X86::IMUL64rm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, { X86::MAXPDrr, X86::MAXPDrm, 16 }, { X86::MAXPDrr_Int, X86::MAXPDrm_Int, 16 }, { X86::MAXPSrr, X86::MAXPSrm, 16 }, @@ -678,15 +678,15 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) unsigned RegOp = OpTbl2[i][0]; unsigned MemOp = OpTbl2[i][1] & ~TB_FLAGS; unsigned Align = OpTbl2[i][2]; - + assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!"); RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align); - + // If this is not a reversable operation (because there is a many->one) // mapping, don't insert the reverse of the operation into MemOp2RegOpTable. if (OpTbl2[i][1] & TB_NOT_REVERSABLE) continue; - + // Index 2, folded load unsigned AuxInfo = 2 | (1 << 4); assert(!MemOp2RegOpTable.count(MemOp) && @@ -808,7 +808,7 @@ static bool isFrameStoreOpcode(int Opcode) { return false; } -unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, +unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { if (isFrameLoadOpcode(MI->getOpcode())) if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) @@ -816,7 +816,7 @@ unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return 0; } -unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, +unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const { if (isFrameLoadOpcode(MI->getOpcode())) { unsigned Reg; @@ -946,10 +946,10 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, isPICBase = true; } return isPICBase; - } + } return false; } - + case X86::LEA32r: case X86::LEA64r: { if (MI->getOperand(2).isImm() && @@ -1124,9 +1124,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); - + // Build and insert into an implicit UNDEF value. This is OK because - // well be shifting and then extracting the lower 16-bits. + // well be shifting and then extracting the lower 16-bits. 
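
The AuxInfo value computed in these loops packs three facts into one unsigned, as the surrounding code suggests: the folded operand index in the low bits, "folded load" in bit 4, and "folded store" in bit 5 (two-address folds set both). A small sketch of that packing (the field layout is inferred from the code above, so treat it as an assumption):

    #include <cassert>

    // MemOp2RegOpTable packs the folded operand index (low 4 bits), a
    // folded-load flag (bit 4) and a folded-store flag (bit 5).
    static unsigned packAuxInfo(unsigned OpIdx, bool Load, bool Store) {
      return OpIdx | (unsigned(Load) << 4) | (unsigned(Store) << 5);
    }

    int main() {
      unsigned TwoAddr = packAuxInfo(0, true, true);   // 0 | (1<<4) | (1<<5)
      assert((TwoAddr & 0xf) == 0 && ((TwoAddr >> 4) & 1) && ((TwoAddr >> 5) & 1));
      unsigned Idx1Load = packAuxInfo(1, true, false); // 1 | (1<<4)
      assert((Idx1Load & 0xf) == 1 && ((Idx1Load >> 4) & 1) && !((Idx1Load >> 5) & 1));
    }
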
// This has the potential to cause partial register stall. e.g. // movw (%rbp,%rcx,2), %dx // leal -65(%rdx), %esi @@ -1162,7 +1162,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: - addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm()); + addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm()); break; case X86::ADD16rr: case X86::ADD16rr_DB: { @@ -1177,7 +1177,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, } else { leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because - // well be shifting and then extracting the lower 16-bits. + // well be shifting and then extracting the lower 16-bits. BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); InsMI2 = BuildMI(*MFI, MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) @@ -1244,7 +1244,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::SHUFPSrri: { assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0; - + unsigned B = MI->getOperand(1).getReg(); unsigned C = MI->getOperand(2).getReg(); if (B != C) return 0; @@ -1392,7 +1392,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, RC = X86::GR32_NOSPRegisterClass; } - + unsigned Src2 = MI->getOperand(2).getReg(); bool isKill2 = MI->getOperand(2).isKill(); @@ -1471,7 +1471,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, LV->replaceKillInstruction(Dest, MI, NewMI); } - MFI->insert(MBBI, NewMI); // Insert the new inst + MFI->insert(MBBI, NewMI); // Insert the new inst return NewMI; } @@ -1692,7 +1692,7 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { const TargetInstrDesc &TID = MI->getDesc(); if (!TID.isTerminator()) return false; - + // Conditional branch is a special case. if (TID.isBranch() && !TID.isBarrier()) return true; @@ -1701,7 +1701,7 @@ bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { return !isPredicated(MI); } -bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, +bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -1862,7 +1862,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { I = MBB.end(); ++Count; } - + return Count; } @@ -2025,6 +2025,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, case X86::GR64_NOREX_NOSPRegClassID: case X86::GR64_NOSPRegClassID: case X86::GR64_TCRegClassID: + case X86::GR64_TCW64RegClassID: return load ? X86::MOV64rm : X86::MOV64mr; case X86::GR32RegClassID: case X86::GR32_ABCDRegClassID: @@ -2151,76 +2152,6 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, NewMIs.push_back(MIB); } -bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; - - DebugLoc DL = MBB.findDebugLoc(MI); - - bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); - bool isWin64 = TM.getSubtarget<X86Subtarget>().isTargetWin64(); - unsigned SlotSize = is64Bit ? 
8 : 4; - - MachineFunction &MF = *MBB.getParent(); - unsigned FPReg = RI.getFrameRegister(MF); - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - unsigned CalleeFrameSize = 0; - - unsigned Opc = is64Bit ? X86::PUSH64r : X86::PUSH32r; - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - // Add the callee-saved register as live-in. It's killed at the spill. - MBB.addLiveIn(Reg); - if (Reg == FPReg) - // X86RegisterInfo::emitPrologue will handle spilling of frame register. - continue; - if (!X86::VR128RegClass.contains(Reg) && !isWin64) { - CalleeFrameSize += SlotSize; - BuildMI(MBB, MI, DL, get(Opc)).addReg(Reg, RegState::Kill); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - RC, &RI); - } - } - - X86FI->setCalleeSavedFrameSize(CalleeFrameSize); - return true; -} - -bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; - - DebugLoc DL = MBB.findDebugLoc(MI); - - MachineFunction &MF = *MBB.getParent(); - unsigned FPReg = RI.getFrameRegister(MF); - bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); - bool isWin64 = TM.getSubtarget<X86Subtarget>().isTargetWin64(); - unsigned Opc = is64Bit ? X86::POP64r : X86::POP32r; - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - if (Reg == FPReg) - // X86RegisterInfo::emitEpilogue will handle restoring of frame register. - continue; - if (!X86::VR128RegClass.contains(Reg) && !isWin64) { - BuildMI(MBB, MI, DL, get(Opc), Reg); - } else { - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - RC, &RI); - } - } - return true; -} - MachineInstr* X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, @@ -2247,7 +2178,7 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, MIB.addOperand(MOs[i]); if (NumAddrOps < 4) // FrameIndex only addOffset(MIB, 0); - + // Loop over the rest of the ri operands, converting them over. unsigned NumOps = MI->getDesc().getNumOperands()-2; for (unsigned i = 0; i != NumOps; ++i) { @@ -2268,7 +2199,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); MachineInstrBuilder MIB(NewMI); - + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (i == OpNo) { @@ -2317,7 +2248,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, if (isTwoAddr && NumOps >= 2 && i < 2 && MI->getOperand(0).isReg() && MI->getOperand(1).isReg() && - MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { + MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; } else if (i == 0) { // If operand 0 @@ -2331,14 +2262,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI); if (NewMI) return NewMI; - + OpcodeTablePtr = &RegOp2MemOpTable0; } else if (i == 1) { OpcodeTablePtr = &RegOp2MemOpTable1; } else if (i == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; } - + // If table selected... 
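
The lookup that follows the "If table selected" comment is a plain map probe: each RegOp2MemOpTable* maps a register-form opcode to its memory-form twin plus a minimum alignment for the folded operand. A C++ analogue using the standard library (the opcode enumerators are placeholders, not real LLVM opcode values):

    #include <cstdio>
    #include <unordered_map>

    // Mirrors DenseMap<unsigned, std::pair<unsigned, unsigned>>: reg-form
    // opcode -> (mem-form opcode, minimum operand alignment; 0 = none).
    using FoldTable = std::unordered_map<unsigned, std::pair<unsigned, unsigned>>;

    int main() {
      enum { ADD32rr = 1, ADD32rm = 2, MAXPDrr = 3, MAXPDrm = 4 }; // illustrative IDs
      FoldTable Tbl2 = {{ADD32rr, {ADD32rm, 0}}, {MAXPDrr, {MAXPDrm, 16}}};

      unsigned Opc = MAXPDrr;
      auto It = Tbl2.find(Opc);
      if (It != Tbl2.end())
        std::printf("fold %u -> %u, min align %u\n", Opc, It->second.first,
                    It->second.second);
    }
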
if (OpcodeTablePtr) { // Find the Opcode to fuse @@ -2386,8 +2317,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, return NewMI; } } - - // No fusion + + // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; return NULL; @@ -2398,7 +2329,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, int FrameIndex) const { - // Check switch flag + // Check switch flag if (NoFusing) return NULL; if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize)) @@ -2450,7 +2381,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops, MachineInstr *LoadMI) const { - // Check switch flag + // Check switch flag if (NoFusing) return NULL; if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize)) @@ -2490,9 +2421,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, Alignment = 16; break; case X86::FsFLD0SD: + case X86::VFsFLD0SD: Alignment = 8; break; case X86::FsFLD0SS: + case X86::VFsFLD0SS: Alignment = 4; break; default: @@ -2556,9 +2489,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineConstantPool &MCP = *MF.getConstantPool(); const Type *Ty; unsigned Opc = LoadMI->getOpcode(); - if (Opc == X86::FsFLD0SS) + if (Opc == X86::FsFLD0SS || Opc == X86::VFsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); - else if (Opc == X86::FsFLD0SD) + else if (Opc == X86::FsFLD0SD || Opc == X86::VFsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); @@ -2591,13 +2524,13 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl<unsigned> &Ops) const { - // Check switch flag + // Check switch flag if (NoFusing) return 0; if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { switch (MI->getOpcode()) { default: return false; - case X86::TEST8rr: + case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: case X86::TEST64rr: @@ -2618,7 +2551,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; - if (isTwoAddr && NumOps >= 2 && OpNum < 2) { + if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; } else if (OpNum == 0) { // If operand 0 switch (Opc) { @@ -2634,7 +2567,7 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, } else if (OpNum == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; } - + if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) return true; return TargetInstrInfoImpl::canFoldMemoryOperand(MI, Ops); @@ -2704,7 +2637,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, // Emit the data processing instruction. 
MachineInstr *DataMI = MF.CreateMachineInstr(TID, MI->getDebugLoc(), true); MachineInstrBuilder MIB(DataMI); - + if (FoldedStore) MIB.addReg(Reg, RegState::Define); for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) @@ -3152,12 +3085,8 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } -bool X86InstrInfo:: -hasHighOperandLatency(const InstrItineraryData *ItinData, - const MachineRegisterInfo *MRI, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - switch (DefMI->getOpcode()) { +bool X86InstrInfo::isHighLatencyDef(int opc) const { + switch (opc) { default: return false; case X86::DIVSDrm: case X86::DIVSDrm_Int: @@ -3187,6 +3116,14 @@ hasHighOperandLatency(const InstrItineraryData *ItinData, } } +bool X86InstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return isHighLatencyDef(DefMI->getOpcode()); +} + namespace { /// CGBR - Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. @@ -3224,11 +3161,11 @@ namespace { PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass); else PC = GlobalBaseReg; - + // Operand of MovePCtoStack is completely ignored by asm printer. It's // only used in JIT code emission as displacement to pc. BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0); - + // If we're using vanilla 'GOT' PIC style, we should use relative addressing // not to pc, but to _GLOBAL_OFFSET_TABLE_ external. if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) { diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 46db68e..81bbd3d 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -33,15 +33,15 @@ namespace X86 { AddrScaleAmt = 1, AddrIndexReg = 2, AddrDisp = 3, - + /// AddrSegmentReg - The operand # of the segment in the memory operand. AddrSegmentReg = 4, /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; - - + + // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. enum CondCode { @@ -72,16 +72,16 @@ namespace X86 { COND_INVALID }; - + // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); - + /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(X86::CondCode CC); } - + /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. /// @@ -90,14 +90,14 @@ namespace X86II { enum TOF { //===------------------------------------------------------------------===// // X86 Specific MachineOperand flags. - + MO_NO_FLAG, - + /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a /// relocation of: /// SYMBOL_LABEL + [. - PICBASELABEL] MO_GOT_ABSOLUTE_ADDRESS, - + /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the /// immediate should get the value of the symbol minus the PIC base label: /// SYMBOL_LABEL - PICBASELABEL @@ -106,77 +106,77 @@ namespace X86II { /// MO_GOT - On a symbol operand this indicates that the immediate is the /// offset to the GOT entry for the symbol name from the base of the GOT. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. 
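
The Addr* enum touched above fixes the five-operand layout of every x86 memory reference: base register, scale amount, index register, displacement, segment. A sketch of how those pieces combine into an effective address (plain values stand in for MachineOperands, and segment handling is omitted as it contributes nothing under flat segmentation):

    #include <cstdint>
    #include <cstdio>

    // The five operands of an x86 memory reference, in Addr* enum order.
    struct X86Mem {
      uint64_t Base;     // AddrBaseReg  (value held in the base register)
      unsigned ScaleAmt; // AddrScaleAmt (1, 2, 4 or 8)
      uint64_t Index;    // AddrIndexReg (value held in the index register)
      int32_t Disp;      // AddrDisp
      // AddrSegmentReg omitted: a flat segment adds a zero base.
    };

    static uint64_t effectiveAddress(const X86Mem &M) {
      return M.Base + uint64_t(M.ScaleAmt) * M.Index + int64_t(M.Disp);
    }

    int main() {
      X86Mem M{0x1000, 4, 3, -8}; // like  -8(%base,%index,4)
      std::printf("0x%llx\n", (unsigned long long)effectiveAddress(M)); // 0x1004
    }
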
/// SYMBOL_LABEL @GOT MO_GOT, - + /// MO_GOTOFF - On a symbol operand this indicates that the immediate is - /// the offset to the location of the symbol name from the base of the GOT. + /// the offset to the location of the symbol name from the base of the GOT. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @GOTOFF MO_GOTOFF, - + /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is /// offset to the GOT entry for the symbol name from the current code - /// location. + /// location. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @GOTPCREL MO_GOTPCREL, - + /// MO_PLT - On a symbol operand this indicates that the immediate is - /// offset to the PLT entry of symbol name from the current code location. + /// offset to the PLT entry of symbol name from the current code location. /// - /// See the X86-64 ELF ABI supplement for more details. + /// See the X86-64 ELF ABI supplement for more details. /// SYMBOL_LABEL @PLT MO_PLT, - + /// MO_TLSGD - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TLSGD MO_TLSGD, - + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @GOTTPOFF MO_GOTTPOFF, - + /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @INDNTPOFF MO_INDNTPOFF, - + /// MO_TPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @TPOFF MO_TPOFF, - + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is /// some TLS offset. /// - /// See 'ELF Handling for Thread-Local Storage' for more details. + /// See 'ELF Handling for Thread-Local Storage' for more details. /// SYMBOL_LABEL @NTPOFF MO_NTPOFF, - + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the /// reference is actually to the "__imp_FOO" symbol. This is used for /// dllimport linkage on windows. MO_DLLIMPORT, - + /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the /// reference is actually to the "FOO$stub" symbol. This is used for calls /// and jumps to external functions on Tiger and earlier. MO_DARWIN_STUB, - + /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub. @@ -186,19 +186,19 @@ namespace X86II { /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub. 
MO_DARWIN_NONLAZY_PIC_BASE, - + /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE", /// which is a PIC-base-relative reference to a hidden dyld lazy pointer /// stub. MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, - + /// MO_TLVP - On a symbol operand this indicates that the immediate is /// some TLS offset. /// /// This is the TLS offset for the Darwin TLS mechanism. MO_TLVP, - + /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate /// is some TLS offset from the picbase. /// @@ -239,7 +239,7 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { return false; } } - + /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. /// @@ -299,7 +299,7 @@ namespace X86II { // MRMInitReg - This form is used for instructions whose source and // destinations are the same register. MRMInitReg = 32, - + //// MRM_C1 - A mod/rm byte of exactly 0xC1. MRM_C1 = 33, MRM_C2 = 34, @@ -311,12 +311,14 @@ namespace X86II { MRM_F0 = 40, MRM_F8 = 41, MRM_F9 = 42, + MRM_D0 = 45, + MRM_D1 = 46, /// RawFrmImm8 - This is used for the ENTER instruction, which has two /// immediates, the first of which is a 16-bit immediate (specified by /// the imm encoding) and the second is a 8-bit fixed value. RawFrmImm8 = 43, - + /// RawFrmImm16 - This is used for CALL FAR instructions, which have two /// immediates, the first of which is a 16 or 32-bit immediate (specified by /// the imm encoding) and the second is a 16-bit fixed value. In the AMD @@ -368,7 +370,7 @@ namespace X86II { // T8, TA - Prefix after the 0x0F prefix. T8 = 13 << Op0Shift, TA = 14 << Op0Shift, - + // TF - Prefix before and after 0x0F TF = 15 << Op0Shift, @@ -471,7 +473,7 @@ namespace X86II { /// if a VR256 register is used, but some AVX instructions also have this /// field marked when using a f256 memory references. VEX_L = 1U << 4, - + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction @@ -480,18 +482,18 @@ namespace X86II { /// this flag to indicate that the encoder should do the wacky 3DNow! thing. Has3DNow0F0FOpcode = 1U << 5 }; - + // getBaseOpcodeFor - This function returns the "base" X86 opcode for the // specified machine instruction. // static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { return TSFlags >> X86II::OpcodeShift; } - + static inline bool hasImm(uint64_t TSFlags) { return (TSFlags & X86II::ImmMask) != 0; } - + /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field /// of the specified instruction. static inline unsigned getSizeOfImm(uint64_t TSFlags) { @@ -506,7 +508,7 @@ namespace X86II { case X86II::Imm64: return 8; } } - + /// isImmPCRel - Return true if the immediate of the specified instruction's /// TSFlags indicates that it is pc relative. static inline unsigned isImmPCRel(uint64_t TSFlags) { @@ -523,7 +525,7 @@ namespace X86II { return false; } } - + /// getMemoryOperandNo - The function returns the MCInst operand # for the /// first field of the memory operand. If the instruction doesn't have a /// memory operand, this returns -1. @@ -551,7 +553,7 @@ namespace X86II { unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). - + // FIXME: Maybe lea should have its own form? This is a horrible hack. 
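
MRM_D0 and MRM_D1 join the MRM_Cx/MRM_Fx family of formats whose ModRM byte is a fixed constant rather than encoding operands. The constant is just the packed mod/reg/rm fields; a quick check of the bit math (the helper is illustrative):

    #include <cassert>
    #include <cstdint>

    // A ModRM byte is mod (2 bits) | reg (3 bits) | rm (3 bits), high to low.
    static uint8_t modRM(unsigned Mod, unsigned Reg, unsigned RM) {
      assert(Mod < 4 && Reg < 8 && RM < 8);
      return uint8_t((Mod << 6) | (Reg << 3) | RM);
    }

    int main() {
      assert(modRM(3, 2, 0) == 0xD0); // the new MRM_D0 form
      assert(modRM(3, 2, 1) == 0xD1); // the new MRM_D1 form
      assert(modRM(3, 0, 1) == 0xC1); // the existing MRM_C1 form
    }
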
//if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || // Opcode == X86::LEA16r || Opcode == X86::LEA32r) @@ -577,6 +579,8 @@ namespace X86II { case X86II::MRM_F0: case X86II::MRM_F8: case X86II::MRM_F9: + case X86II::MRM_D0: + case X86II::MRM_D1: return -1; } } @@ -609,7 +613,7 @@ inline static bool isMem(const MachineInstr *MI, unsigned Op) { class X86InstrInfo : public TargetInstrInfoImpl { X86TargetMachine &TM; const X86RegisterInfo RI; - + /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, /// RegOp2MemOpTable2 - Load / store folding opcode maps. /// @@ -617,7 +621,7 @@ class X86InstrInfo : public TargetInstrInfoImpl { DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable0; DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable1; DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable2; - + /// MemOp2RegOpTable - Load / store unfolding opcode map. /// DenseMap<unsigned, std::pair<unsigned, unsigned> > MemOp2RegOpTable; @@ -742,17 +746,6 @@ public: MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl<MachineInstr*> &NewMIs) const; - - virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const; - - virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const; - virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, @@ -802,7 +795,7 @@ public: virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex = 0) const; - + /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler /// to determine if two loads are loading from the same base address. 
It /// should only return true if the base pointers are the same and the @@ -836,7 +829,7 @@ public: return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } - + static bool isX86_64ExtendedReg(const MachineOperand &MO) { if (!MO.isReg()) return false; return isX86_64ExtendedReg(MO.getReg()); @@ -865,11 +858,13 @@ public: const SmallVectorImpl<MachineOperand> &MOs, unsigned Size, unsigned Alignment) const; + bool isHighLatencyDef(int opc) const; + bool hasHighOperandLatency(const InstrItineraryData *ItinData, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const; - + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 0a60eb7..f832a7c 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -35,6 +35,20 @@ def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// RES1, RES2, FLAGS = op LHS, RHS +def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTX86BrCond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; @@ -78,7 +92,7 @@ def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; @@ -114,10 +128,10 @@ def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, - [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8, - [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary, [SDNPHasChain, SDNPMayStore, @@ -141,7 +155,7 @@ def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, - [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def X86vastart_save_xmm_regs : SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", @@ -153,43 +167,45 @@ def X86vaarg64 : SDNPMemOperand]>; def X86callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, - [SDNPHasChain, SDNPOutFlag]>; + [SDNPHasChain, SDNPOutGlue]>; def X86callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, - [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, - 
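
SDTBinaryArithWithFlagsInOut, added above, gives ADC/SBB their natural shape: two results (value, EFLAGS) and three operands (LHS, RHS, incoming EFLAGS), so the carry flag is both consumed and produced. A C-level model of that dataflow, chaining two 32-bit adds into a 64-bit one:

    #include <cstdint>
    #include <cstdio>

    // Models "RES, EFLAGS = adc LHS, RHS, EFLAGS": carry in, carry out.
    static uint32_t adc32(uint32_t A, uint32_t B, bool &Carry) {
      uint64_t Wide = uint64_t(A) + uint64_t(B) + (Carry ? 1 : 0);
      Carry = (Wide >> 32) != 0; // carry-out replaces carry-in
      return uint32_t(Wide);
    }

    int main() {
      bool C = false;
      uint32_t Lo = adc32(0xFFFFFFFFu, 1u, C);       // Lo = 0, C = 1
      uint32_t Hi = adc32(0u, 0u, C);                // Hi = 1, C = 0
      std::printf("%08X%08X carry=%d\n", Hi, Lo, C); // 0000000100000000 carry=0
    }
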
[SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore]>; + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, - [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, SDNPMayLoad]>; def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutFlag, SDNPSideEffect]>; + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, - [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, [SDNPHasChain]>; def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, - [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, [SDNPCommutative]>; -def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags, +def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, [SDNPCommutative]>; +def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; +def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; @@ -203,10 +219,10 @@ def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void, - [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>; + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, - []>; + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; //===----------------------------------------------------------------------===// // X86 Operand Definitions. @@ -257,6 +273,10 @@ def i8mem_NOREX : Operand<i64> { let ParserMatchClass = X86MemAsmOperand; } +// GPRs available for tailcall. +// It represents GR64_TC or GR64_TCW64. +def ptr_rc_tailcall : PointerLikeRegClass<2>; + // Special i32mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled // after callee-saved register are popped. @@ -271,7 +291,8 @@ def i32mem_TC : Operand<i32> { // after callee-saved register are popped. 
def i64mem_TC : Operand<i64> { let PrintMethod = "printi64mem"; - let MIOperandInfo = (ops GR64_TC, i8imm, GR64_TC, i32imm, i8imm); + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, + ptr_rc_tailcall, i32imm, i8imm); let ParserMatchClass = X86MemAsmOperand; } @@ -394,26 +415,26 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", def HasCMov : Predicate<"Subtarget->hasCMov()">; def NoCMov : Predicate<"!Subtarget->hasCMov()">; -// FIXME: temporary hack to let codegen assert or generate poor code in case -// no AVX version of the desired intructions is present, this is better for -// incremental dev (without fallbacks it's easier to spot what's missing) -def HasMMX : Predicate<"Subtarget->hasMMX() && !Subtarget->hasAVX()">; +def HasMMX : Predicate<"Subtarget->hasMMX()">; def Has3DNow : Predicate<"Subtarget->has3DNow()">; def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; -def HasSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; -def HasSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; -def HasSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; -def HasSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; -def HasSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; -def HasSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; -def HasSSE4A : Predicate<"Subtarget->hasSSE4A() && !Subtarget->hasAVX()">; +def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; +def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">; + +def HasAES : Predicate<"Subtarget->hasAES()">; def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">; def HasFMA3 : Predicate<"Subtarget->hasFMA3()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; -def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; -def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; +def FPStackf32 : Predicate<"!Subtarget->hasXMM()">; +def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">; def In32BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate; def In64BitMode : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; @@ -430,7 +451,6 @@ def OptForSize : Predicate<"OptForSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; -def HasAES : Predicate<"Subtarget->hasAES()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. @@ -1144,6 +1164,12 @@ def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), // Lock instruction prefix def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; +// Rex64 instruction prefix +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>; + +// Data16 instruction prefix +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; + // Repeat string operation instruction prefixes // These uses the DF flag in the EFLAGS register to inc or dec ECX let Defs = [ECX], Uses = [ECX,EFLAGS] in { @@ -1267,6 +1293,12 @@ def : MnemonicAlias<"cdqe", "cltq">; // lret maps to lretl, it is not ambiguous with lretq. 
def : MnemonicAlias<"lret", "lretl">; +def : MnemonicAlias<"leavel", "leave">, Requires<[In32BitMode]>; +def : MnemonicAlias<"leaveq", "leave">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; + def : MnemonicAlias<"pop", "popl">, Requires<[In32BitMode]>; def : MnemonicAlias<"pop", "popq">, Requires<[In64BitMode]>; def : MnemonicAlias<"popf", "popfl">, Requires<[In32BitMode]>; @@ -1374,6 +1406,9 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "q">; def : InstAlias<"aad", (AAD8i8 10)>; def : InstAlias<"aam", (AAM8i8 10)>; +// Disambiguate the mem/imm form of bt-without-a-suffix as btl. +def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>; + // clr aliases. def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>; def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>; @@ -1444,14 +1479,15 @@ defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they -// commute. We also allow fdivrp/fsubrp even though they don't commute, solely -// because gas supports it. +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op)>; def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; -// We accepts "fnstsw %eax" even though it only writes %ax. +// We accept "fnstsw %eax" even though it only writes %ax. def : InstAlias<"fnstsw %eax", (FNSTSW8r)>; def : InstAlias<"fnstsw %al" , (FNSTSW8r)>; def : InstAlias<"fnstsw" , (FNSTSW8r)>; @@ -1497,6 +1533,10 @@ def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; +// Match 'movq GR64, MMX' as an alias for movd. +def : InstAlias<"movq $src, $dst", (MMX_MOVD64to64rr VR64:$dst, GR64:$src)>; +def : InstAlias<"movq $src, $dst", (MMX_MOVD64from64rr GR64:$dst, VR64:$src)>; + // movsd with no operands (as opposed to the SSE scalar move of a double) is an // alias for movsl. (as in rep; movsd) def : InstAlias<"movsd", (MOVSD)>; @@ -1586,4 +1626,3 @@ def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>; def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>; def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>; def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>; - diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index e4e92cc..45e9051 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -180,6 +180,12 @@ def : Pat<(v4f32 (scalar_to_vector FR32:$src)), // Implicitly promote a 64-bit scalar to a vector. def : Pat<(v2f64 (scalar_to_vector FR64:$src)), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>; +// Implicitly promote a 32-bit scalar to a vector. +def : Pat<(v8f32 (scalar_to_vector FR32:$src)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>; +// Implicitly promote a 64-bit scalar to a vector. 
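
The MnemonicAlias lines here (leavel/leaveq, loopz/loopnz, and the earlier lret) rewrite a spelled mnemonic to its canonical form before instruction matching, with some aliases gated on the current mode. A sketch of that pre-matching rewrite (the table is a tiny illustrative subset, not the generated matcher):

    #include <cstdio>
    #include <string>
    #include <unordered_map>

    // Canonicalize a mnemonic the way MnemonicAlias does; mode-gated
    // aliases check In32BitMode/In64BitMode before firing.
    static std::string canonicalize(const std::string &Name, bool In64BitMode) {
      static const std::unordered_map<std::string, std::string> Plain = {
          {"loopz", "loope"}, {"loopnz", "loopne"}, {"lret", "lretl"}};
      auto It = Plain.find(Name);
      if (It != Plain.end()) return It->second;
      if (Name == "leavel" && !In64BitMode) return "leave";
      if (Name == "leaveq" && In64BitMode) return "leave";
      return Name; // no alias applies
    }

    int main() {
      std::printf("%s\n", canonicalize("loopz", false).c_str());  // loope
      std::printf("%s\n", canonicalize("leaveq", true).c_str());  // leave
    }
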
+def : Pat<(v4f64 (scalar_to_vector FR64:$src)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>; let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this @@ -706,6 +712,9 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V; } +def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, + Requires<[HasAVX]>; + def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (fround FR64:$src))]>; @@ -733,6 +742,9 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>; } +def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>, + Requires<[HasAVX]>; + def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (fextend FR32:$src))]>, XS, @@ -1434,6 +1446,12 @@ def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>, TB, OpSize; +def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, + Requires<[HasAVX]>, TB, OpSize, VEX_4V; +def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, + Requires<[HasAVX]>, TB, OpSize, VEX_4V; } // Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper @@ -1566,19 +1584,13 @@ defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; let isCommutable = 0 in defm ANDN : sse12_fp_packed_logical<0x55, "andn", undef /* dummy */, 1, [ // single r+r - [(set VR128:$dst, (v2i64 (and (xor VR128:$src1, - (bc_v2i64 (v4i32 immAllOnesV))), - VR128:$src2)))], + [(set VR128:$dst, (X86pandn VR128:$src1, VR128:$src2))], // double r+r - [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), - (bc_v2i64 (v2f64 VR128:$src2))))], + [], // single r+m - [(set VR128:$dst, (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)), - (bc_v2i64 (v4i32 immAllOnesV))), - (memopv2i64 addr:$src2))))], + [(set VR128:$dst, (X86pandn VR128:$src1, (memopv2i64 addr:$src2)))], // double r+m - [(set VR128:$dst, (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), - (memopv2i64 addr:$src2)))]]>; + []]>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Arithmetic Instructions @@ -2189,6 +2201,10 @@ let neverHasSideEffects = 1 in def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", []>; +def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + []>, XS, Requires<[HasSSE2]>; + let canFoldAsLoad = 1, mayLoad = 1 in { def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", @@ -2518,15 +2534,11 @@ let ExeDomain = SSEPackedInt in { } def PANDNrr : PDI<0xDF, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), - VR128:$src2)))]>; + "pandn\t{$src2, $dst|$dst, $src2}", []>; def PANDNrm : PDI<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "pandn\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), - (memopv2i64 addr:$src2))))]>; + "pandn\t{$src2, 
$dst|$dst, $src2}", []>; } } // Constraints = "$src1 = $dst" @@ -3590,6 +3602,13 @@ def : Pat<(X86pshufb VR128:$src, VR128:$mask), def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>; +def : Pat<(X86psignb VR128:$src1, VR128:$src2), + (PSIGNBrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>; +def : Pat<(X86psignw VR128:$src1, VR128:$src2), + (PSIGNWrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>; +def : Pat<(X86psignd VR128:$src1, VR128:$src2), + (PSIGNDrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>; + //===---------------------------------------------------------------------===// // SSSE3 - Packed Align Instruction Patterns //===---------------------------------------------------------------------===// @@ -3640,10 +3659,27 @@ def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)), //===---------------------------------------------------------------------===// // Thread synchronization -def MONITOR : I<0x01, MRM_C8, (outs), (ins), "monitor", - [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>; -def MWAIT : I<0x01, MRM_C9, (outs), (ins), "mwait", - [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; +let usesCustomInserter = 1 in { +def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), + [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>; +def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2), + [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>; +} + +let Uses = [EAX, ECX, EDX] in +def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB, + Requires<[HasSSE3]>; +let Uses = [ECX, EAX] in +def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", []>, TB, + Requires<[HasSSE3]>; + +def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>; +def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>, + Requires<[In32BitMode]>; +def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>, + Requires<[In64BitMode]>; //===---------------------------------------------------------------------===// // Non-Instruction Patterns @@ -3659,7 +3695,7 @@ let Predicates = [HasSSE2] in (CVTSS2SDrm addr:$src)>; // bit_convert -let Predicates = [HasSSE2] in { +let Predicates = [HasXMMInt] in { def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; @@ -3692,6 +3728,10 @@ let Predicates = [HasSSE2] in { def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; } +let Predicates = [HasAVX] in { + def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; +} + // Move scalar to XMM zero-extended // movd to XMM register zero-extends let AddedComplexity = 15 in { @@ -3865,27 +3905,6 @@ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>; -// Some special case pandn patterns. 
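
The PANDN hunks drop the hand-written and(xor(x, all-ones), y) patterns: the new X86pandn node states the (~src1 & src2) semantics directly, which is also why the "special case pandn patterns" removed below become redundant. A scalar check that the two formulations agree:

    #include <cassert>
    #include <cstdint>

    // PANDN computes (~src1) & src2; the old patterns matched the
    // equivalent and(xor(src1, all-ones), src2).
    static uint64_t pandn(uint64_t Src1, uint64_t Src2) { return ~Src1 & Src2; }

    int main() {
      uint64_t AllOnes = ~uint64_t(0);
      uint64_t A = 0xF0F0F0F0F0F0F0F0ull, B = 0x123456789ABCDEF0ull;
      assert(pandn(A, B) == ((A ^ AllOnes) & B)); // the two forms agree
      assert(pandn(A, B) == 0x020406080A0C0E00ull);
    }
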
-def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))), - VR128:$src2)), - (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; -def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))), - VR128:$src2)), - (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; -def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), - VR128:$src2)), - (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; - -def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))), - (memop addr:$src2))), - (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; -def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))), - (memop addr:$src2))), - (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; -def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), - (memop addr:$src2))), - (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; - // vector -> vector casts def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>; @@ -4588,22 +4607,25 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>; //===----------------------------------------------------------------------===// def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "popcnt{w}\t{$src, $dst|$dst, $src}", []>, OpSize, XS; -let mayLoad = 1 in + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop GR16:$src))]>, OpSize, XS; def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), - "popcnt{w}\t{$src, $dst|$dst, $src}", []>, OpSize, XS; + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop (loadi16 addr:$src)))]>, OpSize, XS; def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "popcnt{l}\t{$src, $dst|$dst, $src}", []>, XS; -let mayLoad = 1 in + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop GR32:$src))]>, XS; def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "popcnt{l}\t{$src, $dst|$dst, $src}", []>, XS; + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop (loadi32 addr:$src)))]>, XS; def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "popcnt{q}\t{$src, $dst|$dst, $src}", []>, XS; -let mayLoad = 1 in + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop GR64:$src))]>, XS; def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "popcnt{q}\t{$src, $dst|$dst, $src}", []>, XS; + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop (loadi64 addr:$src)))]>, XS; @@ -4845,6 +4867,9 @@ defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; +def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0), + (PBLENDVBrr0 VR128:$src1, VR128:$src2)>; + let isAsmParserOnly = 1, Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", @@ -4896,12 +4921,12 @@ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), // Packed Compare Implicit Length Strings, Return Mask multiclass pseudo_pcmpistrm<string asm> { - def REG : Ii8<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), !strconcat(asm, "rr PSEUDO"), + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), [(set VR128:$dst, 
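
The POPCNT defs above now carry (ctpop ...) patterns instead of empty ones, so population count selects straight to the hardware instruction. The generic semantics being matched, in Kernighan-loop form:

    #include <cassert>
    #include <cstdint>

    // Reference semantics of ctpop: count the set bits.
    static unsigned ctpop64(uint64_t X) {
      unsigned N = 0;
      while (X) {
        X &= X - 1; // clear the lowest set bit
        ++N;
      }
      return N;
    }

    int main() {
      assert(ctpop64(0) == 0);
      assert(ctpop64(0xFFull) == 8);
      assert(ctpop64(0x8000000000000001ull) == 2);
      // With a C++20 compiler, std::popcount(X) would give the same results.
    }
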
(int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, imm:$src3))]>; - def MEM : Ii8<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), !strconcat(asm, "rm PSEUDO"), + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, (load addr:$src2), imm:$src3))]>; } @@ -4932,12 +4957,12 @@ let Defs = [XMM0, EFLAGS] in { // Packed Compare Explicit Length Strings, Return Mask multiclass pseudo_pcmpestrm<string asm> { - def REG : Ii8<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src3, i8imm:$src5), !strconcat(asm, "rr PSEUDO"), + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; - def MEM : Ii8<0, Pseudo, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src3, i8imm:$src5), !strconcat(asm, "rm PSEUDO"), + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>; } @@ -5419,6 +5444,23 @@ def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3), (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; + def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), @@ -5426,6 +5468,23 @@ def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; @@ -5563,11 +5622,15 @@ def : Pat<(X86Movddup (bc_v2f64 // Shuffle with UNPCKLPS def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>, 
Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), (UNPCKLPSrm VR128:$src1, addr:$src2)>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), (UNPCKLPSrr VR128:$src1, VR128:$src2)>; @@ -5584,20 +5647,24 @@ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), // Shuffle with UNPCKLPD def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), - (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; + (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), - (UNPCKLPSrm VR128:$src1, addr:$src2)>; + (UNPCKLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; +def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), (UNPCKLPDrr VR128:$src1, VR128:$src2)>; // Shuffle with UNPCKHPD def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), - (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; + (VUNPCKHPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>; def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), - (UNPCKLPSrm VR128:$src1, addr:$src2)>; + (UNPCKHPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), (VUNPCKHPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>; @@ -5782,6 +5849,9 @@ def : Pat<(X86Movlps VR128:$src1, def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>; +def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>; + // Shuffle with MOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 1a58ba0..be7d5ef 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -207,10 +207,15 @@ def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; -def STRr : I<0x00, MRM1r, (outs GR16:$dst), (ins), - "str{w}\t{$dst}", []>, TB; -def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), - "str{w}\t{$dst}", []>, TB; +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t{$dst}", []>, TB, OpSize; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t{$dst}", []>, TB; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t{$dst}", []>, TB; +def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), + "str{w}\t{$dst}", []>, TB; + def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t{$src}", []>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), @@ -388,3 +393,8 @@ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB; def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB; +let Defs = [RDX, RAX], Uses 
= [RCX] in + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + +let Uses = [RDX, RAX, RCX] in + def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index e83f359..3f88fa6 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -19,7 +19,7 @@ #include "llvm/Function.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/System/Valgrind.h" +#include "llvm/Support/Valgrind.h" #include <cstdlib> #include <cstring> using namespace llvm; @@ -127,9 +127,17 @@ extern "C" { "movaps %xmm6, 96(%rsp)\n" "movaps %xmm7, 112(%rsp)\n" // JIT callee +#ifdef _WIN64 + "subq $32, %rsp\n" + "movq %rbp, %rcx\n" // Pass prev frame and return address + "movq 8(%rbp), %rdx\n" + "call " ASMPREFIX "X86CompilationCallback2\n" + "addq $32, %rsp\n" +#else "movq %rbp, %rdi\n" // Pass prev frame and return address "movq 8(%rbp), %rsi\n" "call " ASMPREFIX "X86CompilationCallback2\n" +#endif // Restore all XMM arg registers "movaps 112(%rsp), %xmm7\n" "movaps 96(%rsp), %xmm6\n" @@ -333,7 +341,7 @@ extern "C" { extern "C" { #if !(defined (X86_64_JIT) && defined(_MSC_VER)) // the following function is called only from this translation unit, - // unless we are under 64bit Windows with MSC, where there is + // unless we are under 64bit Windows with MSC, where there is // no support for inline assembly static #endif @@ -462,7 +470,7 @@ TargetJITInfo::StubLayout X86JITInfo::getStubLayout() { void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, JITCodeEmitter &JCE) { - // Note, we cast to intptr_t here to silence a -pedantic warning that + // Note, we cast to intptr_t here to silence a -pedantic warning that // complains about casting a function pointer to a normal pointer. #if defined (X86_32_JIT) && !defined (_MSC_VER) bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback && diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp index f45fdf5..6686214 100644 --- a/lib/Target/X86/X86MCAsmInfo.cpp +++ b/lib/Target/X86/X86MCAsmInfo.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" using namespace llvm; enum AsmWriterFlavorTy { @@ -68,7 +69,7 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) { DwarfUsesInlineInfoSection = true; // Exceptions handling - ExceptionsType = ExceptionHandling::Dwarf; + ExceptionsType = ExceptionHandling::DwarfTable; } X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { @@ -88,8 +89,8 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { SupportsDebugInformation = true; // Exceptions handling - ExceptionsType = ExceptionHandling::Dwarf; - + ExceptionsType = ExceptionHandling::DwarfTable; + // OpenBSD has buggy support for .quad in 32-bit mode, just split into two // .words. 
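// [Editor's note] Sketch of the OpenBSD workaround described above: when the
// assembler's 64-bit data directive is unreliable in 32-bit mode, a quad can
// be emitted as two 32-bit halves, low half first on little-endian x86. The
// directive spelling here is illustrative:
#include <cstdint>
#include <cstdio>

static void emitQuadAsTwoWords(uint64_t V) {
  std::printf(".long 0x%08x\n", (unsigned)(V & 0xffffffffu));
  std::printf(".long 0x%08x\n", (unsigned)(V >> 32));
}

int main() {
  emitQuadAsTwoWords(0x1122334455667788ull); // low dword 0x55667788 first
  return 0;
}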
if (T.getOS() == Triple::OpenBSD && T.getArch() == Triple::x86) @@ -98,13 +99,15 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { const MCSection *X86ELFMCAsmInfo:: getNonexecutableStackSection(MCContext &Ctx) const { - return Ctx.getELFSection(".note.GNU-stack", MCSectionELF::SHT_PROGBITS, + return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0, SectionKind::getMetadata()); } X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { - if (Triple.getArch() == Triple::x86_64) + if (Triple.getArch() == Triple::x86_64) { GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + } AsmTransCBE = x86_asm_table; AssemblerDialect = AsmWriterFlavor; diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp index fea37f8..0e3b571 100644 --- a/lib/Target/X86/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/X86MCCodeEmitter.cpp @@ -38,29 +38,6 @@ public: ~X86MCCodeEmitter() {} - unsigned getNumFixupKinds() const { - return 7; - } - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { - const static MCFixupKindInfo Infos[] = { - { "reloc_pcrel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, - { "reloc_pcrel_1byte", 0, 1 * 8, MCFixupKindInfo::FKF_IsPCRel }, - { "reloc_pcrel_2byte", 0, 2 * 8, MCFixupKindInfo::FKF_IsPCRel }, - { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, - { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, - { "reloc_signed_4byte", 0, 4 * 8, 0}, - { "reloc_global_offset_table", 0, 4 * 8, 0} - }; - - if (Kind < FirstTargetFixupKind) - return MCCodeEmitter::getFixupKindInfo(Kind); - - assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && - "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; - } - static unsigned GetX86RegNum(const MCOperand &MO) { return X86RegisterInfo::getX86RegNum(MO.getReg()); } @@ -173,13 +150,7 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { unsigned Size = X86II::getSizeOfImm(TSFlags); bool isPCRel = X86II::isImmPCRel(TSFlags); - switch (Size) { - default: assert(0 && "Unknown immediate size"); - case 1: return isPCRel ? MCFixupKind(X86::reloc_pcrel_1byte) : FK_Data_1; - case 2: return isPCRel ? MCFixupKind(X86::reloc_pcrel_2byte) : FK_Data_2; - case 4: return isPCRel ? MCFixupKind(X86::reloc_pcrel_4byte) : FK_Data_4; - case 8: assert(!isPCRel); return FK_Data_8; - } + return MCFixup::getKindForSize(Size, isPCRel); } /// Is32BitMemOperand - Return true if the specified instruction with a memory @@ -218,18 +189,22 @@ void X86MCCodeEmitter:: EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { - // If this is a simple integer displacement that doesn't require a relocation, - // emit it now. + const MCExpr *Expr = NULL; if (DispOp.isImm()) { - // FIXME: is this right for pc-rel encoding?? Probably need to emit this as - // a fixup if so. - EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS); - return; + // If this is a simple integer displacement that doesn't require a relocation, + // emit it now. + if (FixupKind != FK_PCRel_1 && + FixupKind != FK_PCRel_2 && + FixupKind != FK_PCRel_4) { + EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS); + return; + } + Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx); + } else { + Expr = DispOp.getExpr(); } // If we have an immoffset, add it to the expression. 
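// [Editor's note] Earlier in this hunk, the hand-written switch mapping an
// immediate's size and PC-relativity to a fixup kind was replaced by
// MCFixup::getKindForSize. A standalone restatement of that mapping, with
// the kind names returned as strings for illustration:
#include <cassert>
#include <string>

static std::string fixupKindForSize(unsigned Size, bool IsPCRel) {
  switch (Size) {
  case 1: return IsPCRel ? "FK_PCRel_1" : "FK_Data_1";
  case 2: return IsPCRel ? "FK_PCRel_2" : "FK_Data_2";
  case 4: return IsPCRel ? "FK_PCRel_4" : "FK_Data_4";
  case 8: assert(!IsPCRel && "8-byte fixups are never pc-relative here");
          return "FK_Data_8";
  }
  assert(false && "Unknown immediate size");
  return "";
}

int main() {
  assert(fixupKindForSize(4, true) == "FK_PCRel_4");
  return 0;
}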
- const MCExpr *Expr = DispOp.getExpr(); - if (FixupKind == FK_Data_4 && StartsWithGlobalOffsetTable(Expr)) { assert(ImmOffset == 0); @@ -239,13 +214,13 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, // If the fixup is pc-relative, we need to bias the value to be relative to // the start of the field, not the end of the field. - if (FixupKind == MCFixupKind(X86::reloc_pcrel_4byte) || + if (FixupKind == FK_PCRel_4 || FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load)) ImmOffset -= 4; - if (FixupKind == MCFixupKind(X86::reloc_pcrel_2byte)) + if (FixupKind == FK_PCRel_2) ImmOffset -= 2; - if (FixupKind == MCFixupKind(X86::reloc_pcrel_1byte)) + if (FixupKind == FK_PCRel_1) ImmOffset -= 1; if (ImmOffset) @@ -1004,6 +979,14 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, EmitByte(BaseOpcode, CurByte, OS); EmitByte(0xF9, CurByte, OS); break; + case X86II::MRM_D0: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xD0, CurByte, OS); + break; + case X86II::MRM_D1: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xD1, CurByte, OS); + break; } // If there is a remaining operand, it must be a trailing immediate. Emit it @@ -1021,7 +1004,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, Fixups); } else { unsigned FixupKind; - if (MI.getOpcode() == X86::MOV64ri32 || MI.getOpcode() == X86::MOV64mi32) + // FIXME: Is there a better way to know that we need a signed relocation? + if (MI.getOpcode() == X86::MOV64ri32 || + MI.getOpcode() == X86::MOV64mi32 || + MI.getOpcode() == X86::PUSH64i32) FixupKind = X86::reloc_signed_4byte; else FixupKind = getImmFixupKind(TSFlags); diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 05a5de6..cbe6db2 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -374,6 +374,8 @@ ReSimplify: case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::VFsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; + case X86::VFsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break; case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break; case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; @@ -525,6 +527,66 @@ ReSimplify: } } +static void LowerTlsAddr(MCStreamer &OutStreamer, + X86MCInstLower &MCInstLowering, + const MachineInstr &MI) { + bool is64Bits = MI.getOpcode() == X86::TLS_addr64; + MCContext &context = OutStreamer.getContext(); + + if (is64Bits) { + MCInst prefix; + prefix.setOpcode(X86::DATA16_PREFIX); + OutStreamer.EmitInstruction(prefix); + } + MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); + const MCSymbolRefExpr *symRef = + MCSymbolRefExpr::Create(sym, MCSymbolRefExpr::VK_TLSGD, context); + + MCInst LEA; + if (is64Bits) { + LEA.setOpcode(X86::LEA64r); + LEA.addOperand(MCOperand::CreateReg(X86::RDI)); // dest + LEA.addOperand(MCOperand::CreateReg(X86::RIP)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(0)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg + } else { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::CreateReg(0)); // 
base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg + } + OutStreamer.EmitInstruction(LEA); + + if (is64Bits) { + MCInst prefix; + prefix.setOpcode(X86::DATA16_PREFIX); + OutStreamer.EmitInstruction(prefix); + prefix.setOpcode(X86::DATA16_PREFIX); + OutStreamer.EmitInstruction(prefix); + prefix.setOpcode(X86::REX64_PREFIX); + OutStreamer.EmitInstruction(prefix); + } + + MCInst call; + if (is64Bits) + call.setOpcode(X86::CALL64pcrel32); + else + call.setOpcode(X86::CALLpcrel32); + StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; + MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name); + const MCSymbolRefExpr *tlsRef = + MCSymbolRefExpr::Create(tlsGetAddr, + MCSymbolRefExpr::VK_PLT, + context); + + call.addOperand(MCOperand::CreateExpr(tlsRef)); + OutStreamer.EmitInstruction(call); +} void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(Mang, *MF, *this); @@ -559,7 +621,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // Lower these as normal, but add some comments. OutStreamer.AddComment("TAILCALL"); break; - + + case X86::TLS_addr32: + case X86::TLS_addr64: + return LowerTlsAddr(OutStreamer, MCInstLowering, *MI); + case X86::MOVPC32r: { MCInst TmpInst; // This is a pseudo op for a two instruction sequence with a label, which diff --git a/lib/Target/X86/X86MachObjectWriter.cpp b/lib/Target/X86/X86MachObjectWriter.cpp new file mode 100644 index 0000000..8f3dd32 --- /dev/null +++ b/lib/Target/X86/X86MachObjectWriter.cpp @@ -0,0 +1,32 @@ +//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "llvm/MC/MCMachObjectWriter.h" +using namespace llvm; + +namespace { +class X86MachObjectWriter : public MCMachObjectTargetWriter { +public: + X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/Is64Bit) {} +}; +} + +MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter(new X86MachObjectWriter(Is64Bit, + CPUType, + CPUSubtype), + OS, /*IsLittleEndian=*/true); +} diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 0f505d1..2f6bd88 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -31,7 +31,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -60,7 +60,7 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); Is64Bit = Subtarget->is64Bit(); IsWin64 = Subtarget->isTargetWin64(); - StackAlign = TM.getFrameInfo()->getStackAlignment(); + StackAlign = TM.getFrameLowering()->getStackAlignment(); if (Is64Bit) { SlotSize = 8; @@ -316,10 +316,16 @@ X86RegisterInfo::getPointerRegClass(unsigned Kind) const { if (TM.getSubtarget<X86Subtarget>().is64Bit()) return &X86::GR64RegClass; return &X86::GR32RegClass; - case 1: // Normal GRPs except the stack pointer (for encoding reasons). + case 1: // Normal GPRs except the stack pointer (for encoding reasons). if (TM.getSubtarget<X86Subtarget>().is64Bit()) return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; + case 2: // Available for tailcall (not callee-saved GPRs). + if (TM.getSubtarget<X86Subtarget>().isTargetWin64()) + return &X86::GR64_TCW64RegClass; + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + return &X86::GR64_TCRegClass; + return &X86::GR32_TCRegClass; } } @@ -388,6 +394,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + // Set the stack-pointer register and its aliases as reserved. Reserved.set(X86::RSP); Reserved.set(X86::ESP); @@ -400,7 +408,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::IP); // Set the frame-pointer register and its aliases as reserved if needed. - if (hasFP(MF)) { + if (TFI->hasFP(MF)) { Reserved.set(X86::RBP); Reserved.set(X86::EBP); Reserved.set(X86::BP); @@ -425,21 +433,6 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. 
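// [Editor's note] The X86RegisterInfo::hasFP implementation deleted just
// below (it moved to the new TargetFrameLowering interface) ORs together the
// conditions that force a dedicated frame pointer. A condensed sketch with
// the machine-function queries stubbed out as plain booleans:
struct FrameQueries {
  bool DisableFPElim, NeedsRealignment, HasVarSizedObjects,
       FrameAddressTaken, ForceFramePointer, CallsUnwindInit;
};

static bool needsFramePointer(const FrameQueries &Q) {
  return Q.DisableFPElim || Q.NeedsRealignment || Q.HasVarSizedObjects ||
         Q.FrameAddressTaken || Q.ForceFramePointer || Q.CallsUnwindInit;
}

int main() {
  FrameQueries Q = {false, false, true, false, false, false};
  return needsFramePointer(Q) ? 0 : 1; // variable-sized allocas force an FP
}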
-bool X86RegisterInfo::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const MachineModuleInfo &MMI = MF.getMMI(); - - return (DisableFramePointerElim(MF) || - needsStackRealignment(MF) || - MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || - MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit()); -} - bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); return (RealignStack && @@ -458,62 +451,25 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { if (0 && requiresRealignment && MFI->hasVarSizedObjects()) report_fatal_error( "Stack realignment in presense of dynamic allocas is not supported"); - + // If we've requested that we force align the stack do so now. if (ForceStackAlign) return canRealignStack(MF); - - return requiresRealignment && canRealignStack(MF); -} -bool X86RegisterInfo::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); + return requiresRealignment && canRealignStack(MF); } bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { - if (Reg == FramePtr && hasFP(MF)) { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (Reg == FramePtr && TFI->hasFP(MF)) { FrameIdx = MF.getFrameInfo()->getObjectIndexBegin(); return true; } return false; } -int -X86RegisterInfo::getFrameIndexOffset(const MachineFunction &MF, int FI) const { - const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - int Offset = MFI->getObjectOffset(FI) - TFI.getOffsetOfLocalArea(); - uint64_t StackSize = MFI->getStackSize(); - - if (needsStackRealignment(MF)) { - if (FI < 0) { - // Skip the saved EBP. - Offset += SlotSize; - } else { - unsigned Align = MFI->getObjectAlignment(FI); - assert((-(Offset + StackSize)) % Align == 0); - Align = 0; - return Offset + StackSize; - } - // FIXME: Support tail calls - } else { - if (!hasFP(MF)) - return Offset + StackSize; - - // Skip the saved EBP. - Offset += SlotSize; - - // Skip the RETADDR move area - const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); - if (TailCallReturnAddrDelta < 0) - Offset -= TailCallReturnAddrDelta; - } - - return Offset; -} - static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { if (is64Bit) { if (isInt<8>(Imm)) @@ -541,69 +497,70 @@ static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { void X86RegisterInfo:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - if (!hasReservedCallFrame(MF)) { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + bool reseveCallFrame = TFI->hasReservedCallFrame(MF); + int Opcode = I->getOpcode(); + bool isDestroy = Opcode == getCallFrameDestroyOpcode(); + DebugLoc DL = I->getDebugLoc(); + uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t CalleeAmt = isDestroy ? 
I->getOperand(1).getImm() : 0; + I = MBB.erase(I); + + if (!reseveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the // adjcallstackdown instruction into 'add ESP, <amt>' // TODO: consider using push / pop instead of sub + store / add - MachineInstr *Old = I; - uint64_t Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; - - MachineInstr *New = 0; - if (Old->getOpcode() == getCallFrameSetupOpcode()) { - New = BuildMI(MF, Old->getDebugLoc(), - TII.get(getSUBriOpcode(Is64Bit, Amount)), - StackPtr) - .addReg(StackPtr) - .addImm(Amount); - } else { - assert(Old->getOpcode() == getCallFrameDestroyOpcode()); - - // Factor out the amount the callee already popped. - uint64_t CalleeAmt = Old->getOperand(1).getImm(); - Amount -= CalleeAmt; - - if (Amount) { - unsigned Opc = getADDriOpcode(Is64Bit, Amount); - New = BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), StackPtr) - .addReg(StackPtr) - .addImm(Amount); - } - } + if (Amount == 0) + return; + + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + + MachineInstr *New = 0; + if (Opcode == getCallFrameSetupOpcode()) { + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(Is64Bit, Amount)), + StackPtr) + .addReg(StackPtr) + .addImm(Amount); + } else { + assert(Opcode == getCallFrameDestroyOpcode()); - if (New) { - // The EFLAGS implicit def is dead. - New->getOperand(3).setIsDead(); + // Factor out the amount the callee already popped. + Amount -= CalleeAmt; - // Replace the pseudo instruction with a new instruction. - MBB.insert(I, New); + if (Amount) { + unsigned Opc = getADDriOpcode(Is64Bit, Amount); + New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(Amount); } } - } else if (I->getOpcode() == getCallFrameDestroyOpcode()) { - // If we are performing frame pointer elimination and if the callee pops - // something off the stack pointer, add it back. We do this until we have - // more advanced stack pointer tracking ability. - if (uint64_t CalleeAmt = I->getOperand(1).getImm()) { - unsigned Opc = getSUBriOpcode(Is64Bit, CalleeAmt); - MachineInstr *Old = I; - MachineInstr *New = - BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), - StackPtr) - .addReg(StackPtr) - .addImm(CalleeAmt); + if (New) { // The EFLAGS implicit def is dead. New->getOperand(3).setIsDead(); + + // Replace the pseudo instruction with a new instruction. MBB.insert(I, New); } + + return; } - MBB.erase(I); + if (Opcode == getCallFrameDestroyOpcode() && CalleeAmt) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. + unsigned Opc = getSUBriOpcode(Is64Bit, CalleeAmt); + MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(CalleeAmt); + + // The EFLAGS implicit def is dead. 
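// [Editor's note] The expression used earlier in this function,
// (Amount + StackAlign - 1) / StackAlign * StackAlign, rounds the outgoing
// argument area up to the next alignment boundary. Standalone check:
#include <cassert>
#include <cstdint>

static uint64_t roundUpToAlignment(uint64_t Amount, uint64_t StackAlign) {
  return (Amount + StackAlign - 1) / StackAlign * StackAlign;
}

int main() {
  assert(roundUpToAlignment(20, 16) == 32);
  assert(roundUpToAlignment(32, 16) == 32); // already aligned: unchanged
  return 0;
}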
+ New->getOperand(3).setIsDead(); + MBB.insert(I, New); + } } void @@ -614,6 +571,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned i = 0; MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); while (!MI.getOperand(i).isFI()) { ++i; @@ -630,7 +588,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else if (AfterFPPop) BasePtr = StackPtr; else - BasePtr = (hasFP(MF) ? FramePtr : StackPtr); + BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. @@ -640,11 +598,10 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FIOffset; if (AfterFPPop) { // Tail call jmp happens after FP is popped. - const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo(); const MachineFrameInfo *MFI = MF.getFrameInfo(); - FIOffset = MFI->getObjectOffset(FrameIndex) - TFI.getOffsetOfLocalArea(); + FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); } else - FIOffset = getFrameIndexOffset(MF, FrameIndex); + FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); if (MI.getOperand(i+3).isImm()) { // Offset is a 32-bit integer. @@ -657,68 +614,14 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } -void -X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { - MachineFrameInfo *MFI = MF.getFrameInfo(); - - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); - - if (TailCallReturnAddrDelta < 0) { - // create RETURNADDR area - // arg - // arg - // RETADDR - // { ... - // RETADDR area - // ... - // } - // [EBP] - MFI->CreateFixedObject(-TailCallReturnAddrDelta, - (-1U*SlotSize)+TailCallReturnAddrDelta, true); - } - - if (hasFP(MF)) { - assert((TailCallReturnAddrDelta <= 0) && - "The Delta should always be zero or negative"); - const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo(); - - // Create a frame entry for the EBP register that must be saved. - int FrameIdx = MFI->CreateFixedObject(SlotSize, - -(int)SlotSize + - TFI.getOffsetOfLocalArea() + - TailCallReturnAddrDelta, - true); - assert(FrameIdx == MFI->getObjectIndexBegin() && - "Slot for EBP register must be last in order to be found!"); - FrameIdx = 0; - } -} - unsigned X86RegisterInfo::getRARegister() const { return Is64Bit ? X86::RIP // Should have dwarf #16. : X86::EIP; // Should have dwarf #8. } unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return hasFP(MF) ? FramePtr : StackPtr; -} - -void -X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const { - // Calculate amount of bytes used for return address storing - int stackGrowth = (Is64Bit ? -8 : -4); - - // Initial state of the frame pointer is esp+stackGrowth. - MachineLocation Dst(MachineLocation::VirtualFP); - MachineLocation Src(StackPtr, stackGrowth); - Moves.push_back(MachineMove(0, Dst, Src)); - - // Add return address to move list - MachineLocation CSDst(StackPtr, stackGrowth); - MachineLocation CSSrc(getRARegister()); - Moves.push_back(MachineMove(0, CSDst, CSSrc)); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + return TFI->hasFP(MF) ? 
FramePtr : StackPtr; } unsigned X86RegisterInfo::getEHExceptionRegister() const { @@ -917,13 +820,13 @@ namespace { // Be over-conservative: scan over all vreg defs and find whether vector // registers are used. If yes, there is a possibility that vector register // will be spilled and thus require dynamic stack realignment. - for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister; - RegNum < RI.getLastVirtReg(); ++RegNum) - if (RI.getRegClass(RegNum)->getAlignment() > StackAlignment) { + for (unsigned i = 0, e = RI.getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (RI.getRegClass(Reg)->getAlignment() > StackAlignment) { FuncInfo->setReserveFP(true); return true; } - + } // Nothing to do return false; } diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 0796191..064be64 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -111,14 +111,10 @@ public: /// register scavenger to determine what registers are free. BitVector getReservedRegs(const MachineFunction &MF) const; - bool hasFP(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; bool needsStackRealignment(const MachineFunction &MF) const; - bool hasReservedCallFrame(const MachineFunction &MF) const; - bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const; @@ -129,9 +125,6 @@ public: void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, RegScavenger *RS = NULL) const; - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS = NULL) const; - // Debug information queries. unsigned getRARegister() const; unsigned getFrameRegister(const MachineFunction &MF) const; @@ -139,9 +132,6 @@ public: // FIXME: Move to FrameInfok unsigned getSlotSize() const { return SlotSize; } - int getFrameIndexOffset(const MachineFunction &MF, int FI) const; - void getInitialFrameState(std::vector<MachineMove> &Moves) const; - // Exception handling queries. unsigned getEHExceptionRegister() const; unsigned getEHHandlerRegister() const; diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 8fc26d7..612fac2 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -1,10 +1,10 @@ //===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// -// +// // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. -// +// //===----------------------------------------------------------------------===// // // This file describes the X86 Register file, defining the registers themselves, @@ -34,8 +34,8 @@ let Namespace = "X86" in { // because the register file generator is smart enough to figure out that // AL aliases AX if we tell it that AX aliased AL (for example). - // Dwarf numbering is different for 32-bit and 64-bit, and there are - // variations by target as well. Currently the first entry is for X86-64, + // Dwarf numbering is different for 32-bit and 64-bit, and there are + // variations by target as well. 
Currently the first entry is for X86-64, // second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux // and debug information on X86-32/Darwin) @@ -81,7 +81,7 @@ let Namespace = "X86" in { def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>; } def IP : Register<"ip">, DwarfRegNum<[16]>; - + // X86-64 only let SubRegIndices = [sub_8bit] in { def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>; @@ -103,8 +103,8 @@ let Namespace = "X86" in { def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>; def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>; def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>; - def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>; - + def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>; + // X86-64 only def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>; def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>; @@ -208,7 +208,7 @@ let Namespace = "X86" in { def ST4 : Register<"st(4)">, DwarfRegNum<[37, 16, 15]>; def ST5 : Register<"st(5)">, DwarfRegNum<[38, 17, 16]>; def ST6 : Register<"st(6)">, DwarfRegNum<[39, 18, 17]>; - def ST7 : Register<"st(7)">, DwarfRegNum<[40, 19, 18]>; + def ST7 : Register<"st(7)">, DwarfRegNum<[40, 19, 18]>; // Status flags register def EFLAGS : Register<"flags">; @@ -220,7 +220,7 @@ let Namespace = "X86" in { def ES : Register<"es">; def FS : Register<"fs">; def GS : Register<"gs">; - + // Debug registers def DR0 : Register<"dr0">; def DR1 : Register<"dr1">; @@ -230,7 +230,7 @@ let Namespace = "X86" in { def DR5 : Register<"dr5">; def DR6 : Register<"dr6">; def DR7 : Register<"dr7">; - + // Control registers def CR0 : Register<"cr0">; def CR1 : Register<"cr1">; @@ -261,10 +261,10 @@ let Namespace = "X86" in { // implicitly defined to be the register allocation order. // -// List call-clobbered registers before callee-save registers. RBX, RBP, (and +// List call-clobbered registers before callee-save registers. RBX, RBP, (and // R12, R13, R14, and R15 for X86-64) are callee-save registers. // In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and -// R8B, ... R15B. +// R8B, ... R15B. // Allocate R12 and R13 last, as these require an extra byte when // encoded in x86_64 instructions. // FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in @@ -299,14 +299,14 @@ def GR8 : RegisterClass<"X86", [i8], 8, GR8Class::iterator GR8Class::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); // Does the function dedicate RBP / EBP to being a frame ptr? if (!Subtarget.is64Bit()) // In 32-mode, none of the 8-bit registers aliases EBP or ESP. return begin() + 8; - else if (RI->hasFP(MF) || MFI->getReserveFP()) + else if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate SPL or BPL. 
return array_endof(X86_GR8_AO_64) - 1; else @@ -344,12 +344,12 @@ def GR16 : RegisterClass<"X86", [i16], 16, GR16Class::iterator GR16Class::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); if (Subtarget.is64Bit()) { // Does the function dedicate RBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate SP or BP. return array_endof(X86_GR16_AO_64) - 1; else @@ -357,7 +357,7 @@ def GR16 : RegisterClass<"X86", [i16], 16, return array_endof(X86_GR16_AO_64); } else { // Does the function dedicate EBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate SP or BP. return begin() + 6; else @@ -396,12 +396,12 @@ def GR32 : RegisterClass<"X86", [i32], 32, GR32Class::iterator GR32Class::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); if (Subtarget.is64Bit()) { // Does the function dedicate RBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate ESP or EBP. return array_endof(X86_GR32_AO_64) - 1; else @@ -409,7 +409,7 @@ def GR32 : RegisterClass<"X86", [i32], 32, return array_endof(X86_GR32_AO_64); } else { // Does the function dedicate EBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate ESP or EBP. return begin() + 6; else @@ -436,13 +436,13 @@ def GR64 : RegisterClass<"X86", [i64], 64, GR64Class::iterator GR64Class::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); if (!Subtarget.is64Bit()) return begin(); // None of these are allocatable in 32-bit. // Does the function dedicate RBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) return end()-3; // If so, don't allocate RIP, RSP or RBP else return end()-2; // If not, just don't allocate RIP or RSP @@ -496,6 +496,9 @@ def GR64_TC : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI, (GR32_TC sub_32bit)]; } +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, + R8, R9, R11]>; + // GR8_NOREX - GR8 registers which do not require a REX prefix. 
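// [Editor's note] The GR64_TCW64 class added above enumerates the 64-bit
// GPRs a Win64 tail call is allowed to use; per the definition these are
// exactly RAX, RCX, RDX, R8, R9 and R11. Illustrative table only:
#include <cstdio>

static const char *const GR64_TCW64Regs[] = {
  "rax", "rcx", "rdx", "r8", "r9", "r11",
};

int main() {
  for (const char *R : GR64_TCW64Regs)
    std::printf("%s\n", R);
  return 0;
}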
def GR8_NOREX : RegisterClass<"X86", [i8], 8, [AL, CL, DL, AH, CH, DH, BL, BH]> { @@ -541,10 +544,10 @@ def GR16_NOREX : RegisterClass<"X86", [i16], 16, GR16_NOREXClass::iterator GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); // Does the function dedicate RBP / EBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate SP or BP. return end() - 2; else @@ -565,10 +568,10 @@ def GR32_NOREX : RegisterClass<"X86", [i32], 32, GR32_NOREXClass::iterator GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); // Does the function dedicate RBP / EBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate ESP or EBP. return end() - 2; else @@ -590,10 +593,10 @@ def GR64_NOREX : RegisterClass<"X86", [i64], 64, GR64_NOREXClass::iterator GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); // Does the function dedicate RBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate RIP, RSP or RBP. return end() - 3; else @@ -632,12 +635,12 @@ def GR32_NOSP : RegisterClass<"X86", [i32], 32, GR32_NOSPClass::iterator GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); if (Subtarget.is64Bit()) { // Does the function dedicate RBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate EBP. return array_endof(X86_GR32_NOSP_AO_64) - 1; else @@ -645,7 +648,7 @@ def GR32_NOSP : RegisterClass<"X86", [i32], 32, return array_endof(X86_GR32_NOSP_AO_64); } else { // Does the function dedicate EBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate EBP. return begin() + 6; else @@ -670,13 +673,13 @@ def GR64_NOSP : RegisterClass<"X86", [i64], 64, GR64_NOSPClass::iterator GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); if (!Subtarget.is64Bit()) return begin(); // None of these are allocatable in 32-bit. // Does the function dedicate RBP to being a frame ptr? 
- if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) return end()-1; // If so, don't allocate RBP else return end(); // If not, any reg in this class is ok. @@ -698,10 +701,10 @@ def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const { const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetFrameLowering *TFI = TM.getFrameLowering(); const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); // Does the function dedicate RBP to being a frame ptr? - if (RI->hasFP(MF) || MFI->getReserveFP()) + if (TFI->hasFP(MF) || MFI->getReserveFP()) // If so, don't allocate RBP. return end() - 1; else diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 0c5a0ac..42e8193 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -136,7 +136,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, Dst, InFlag); InFlag = Chain.getValue(1); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); @@ -150,7 +150,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, X86::ECX, Left, InFlag); InFlag = Chain.getValue(1); - Tys = DAG.getVTList(MVT::Other, MVT::Flag); + Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); } else if (BytesLeft) { @@ -230,7 +230,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, Src, InFlag); InFlag = Chain.getValue(1); - SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, array_lengthof(Ops)); diff --git a/lib/Target/X86/X86ShuffleDecode.h b/lib/Target/X86/X86ShuffleDecode.h deleted file mode 100644 index df04052..0000000 --- a/lib/Target/X86/X86ShuffleDecode.h +++ /dev/null @@ -1,155 +0,0 @@ -//===-- X86ShuffleDecode.h - X86 shuffle decode logic ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Define several functions to decode x86 specific shuffle semantics into a -// generic vector mask. -// -//===----------------------------------------------------------------------===// - -#ifndef X86_SHUFFLE_DECODE_H -#define X86_SHUFFLE_DECODE_H - -#include "llvm/ADT/SmallVector.h" -using namespace llvm; - -//===----------------------------------------------------------------------===// -// Vector Mask Decoding -//===----------------------------------------------------------------------===// - -enum { - SM_SentinelZero = ~0U -}; - -static inline -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { - // Defaults the copying the dest value. - ShuffleMask.push_back(0); - ShuffleMask.push_back(1); - ShuffleMask.push_back(2); - ShuffleMask.push_back(3); - - // Decode the immediate. 
- unsigned ZMask = Imm & 15; - unsigned CountD = (Imm >> 4) & 3; - unsigned CountS = (Imm >> 6) & 3; - - // CountS selects which input element to use. - unsigned InVal = 4+CountS; - // CountD specifies which element of destination to update. - ShuffleMask[CountD] = InVal; - // ZMask zaps values, potentially overriding the CountD elt. - if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; - if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; - if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; - if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; -} - -// <3,1> or <6,7,2,3> -static void DecodeMOVHLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = NElts/2; i != NElts; ++i) - ShuffleMask.push_back(NElts+i); - - for (unsigned i = NElts/2; i != NElts; ++i) - ShuffleMask.push_back(i); -} - -// <0,2> or <0,1,4,5> -static void DecodeMOVLHPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) - ShuffleMask.push_back(i); - - for (unsigned i = 0; i != NElts/2; ++i) - ShuffleMask.push_back(NElts+i); -} - -static void DecodePSHUFMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts; ++i) { - ShuffleMask.push_back(Imm % NElts); - Imm /= NElts; - } -} - -static void DecodePSHUFHWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - ShuffleMask.push_back(0); - ShuffleMask.push_back(1); - ShuffleMask.push_back(2); - ShuffleMask.push_back(3); - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back(4+(Imm & 3)); - Imm >>= 2; - } -} - -static void DecodePSHUFLWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != 4; ++i) { - ShuffleMask.push_back((Imm & 3)); - Imm >>= 2; - } - ShuffleMask.push_back(4); - ShuffleMask.push_back(5); - ShuffleMask.push_back(6); - ShuffleMask.push_back(7); -} - -static void DecodePUNPCKLMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i); - ShuffleMask.push_back(i+NElts); - } -} - -static void DecodePUNPCKHMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i+NElts/2); - ShuffleMask.push_back(i+NElts+NElts/2); - } -} - -static void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { - // Part that reads from dest. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts); - Imm /= NElts; - } - // Part that reads from src. - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(Imm % NElts + NElts); - Imm /= NElts; - } -} - -static void DecodeUNPCKHPMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i+NElts/2); // Reads from dest - ShuffleMask.push_back(i+NElts+NElts/2); // Reads from src - } -} - - -/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// etc. NElts indicates the number of elements in the vector allowing it to -/// handle different datatypes and vector widths. 
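// [Editor's note] Worked example for the DecodePSHUFMask logic above: each
// base-NElts digit of the immediate picks one source element. Restated
// standalone, with a check that pshufd $0x1b reverses the four lanes:
#include <cassert>
#include <vector>

static void decodePSHUFMask(unsigned NElts, unsigned Imm,
                            std::vector<unsigned> &Mask) {
  for (unsigned i = 0; i != NElts; ++i) {
    Mask.push_back(Imm % NElts);
    Imm /= NElts;
  }
}

int main() {
  std::vector<unsigned> Mask;
  decodePSHUFMask(4, 0x1B, Mask);
  assert(Mask[0] == 3 && Mask[1] == 2 && Mask[2] == 1 && Mask[3] == 0);
  return 0;
}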
-static void DecodeUNPCKLPMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i); // Reads from dest - ShuffleMask.push_back(i+NElts); // Reads from src - } -} - -#endif diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 0d02e5e..1ee7312 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -1,4 +1,4 @@ -//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===// +//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// // // The LLVM Compiler Infrastructure // @@ -18,7 +18,7 @@ #include "llvm/GlobalValue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/System/Host.h" +#include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/SmallVector.h" @@ -256,13 +256,14 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { if ((ECX >> 9) & 1) X86SSELevel = SSSE3; if ((ECX >> 19) & 1) X86SSELevel = SSE41; if ((ECX >> 20) & 1) X86SSELevel = SSE42; + // FIXME: AVX codegen support is not ready. + //if ((ECX >> 28) & 1) { HasAVX = true; X86SSELevel = NoMMXSSE; } bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; HasCLMUL = IsIntel && ((ECX >> 1) & 0x1); HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); - HasAVX = ((ECX >> 28) & 0x1); HasAES = IsIntel && ((ECX >> 25) & 0x1); if (IsIntel || IsAMD) { @@ -289,6 +290,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, , X863DNowLevel(NoThreeDNow) , HasCMov(false) , HasX86_64(false) + , HasPOPCNT(false) , HasSSE4A(false) , HasAVX(false) , HasAES(false) @@ -315,11 +317,13 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, ParseSubtargetFeatures(FS, CPU); // All X86-64 CPUs also have SSE2, however user might request no SSE via // -mattr, so don't force SSELevel here. + if (HasAVX) + X86SSELevel = NoMMXSSE; } else { // Otherwise, use CPUID to auto-detect feature set. AutoDetectSubtargetFeatures(); // Make sure SSE2 is enabled; it is available on all X86-64 CPUs. - if (Is64Bit && X86SSELevel < SSE2) + if (Is64Bit && !HasAVX && X86SSELevel < SSE2) X86SSELevel = SSE2; } @@ -338,9 +342,10 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, assert((!Is64Bit || HasX86_64) && "64-bit code requested on a subtarget that doesn't support it!"); - // Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64 - // bit targets. - if (isTargetDarwin() || Is64Bit) + // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both + // 32 and 64 bit) and for all 64-bit targets. + if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() || + isTargetSolaris() || Is64Bit) stackAlignment = 16; if (StackAlignment) diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 2e73124..0a62a02 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -65,6 +65,9 @@ protected: /// bool HasX86_64; + /// HasPOPCNT - True if the processor supports POPCNT. + bool HasPOPCNT; + /// HasSSE4A - True if the processor supports SSE4A instructions. bool HasSSE4A; @@ -100,7 +103,7 @@ protected: /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. /// unsigned MaxInlineSizeThreshold; - + /// TargetTriple - What processor and OS we're targeting. 
Triple TargetTriple; @@ -150,7 +153,10 @@ public: bool hasSSE4A() const { return HasSSE4A; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } + bool hasPOPCNT() const { return HasPOPCNT; } bool hasAVX() const { return HasAVX; } + bool hasXMM() const { return hasSSE1() || hasAVX(); } + bool hasXMMInt() const { return hasSSE2() || hasAVX(); } bool hasAES() const { return HasAES; } bool hasCLMUL() const { return HasCLMUL; } bool hasFMA3() const { return HasFMA3; } @@ -160,23 +166,23 @@ public: bool hasVectorUAMem() const { return HasVectorUAMem; } bool isTargetDarwin() const { return TargetTriple.getOS() == Triple::Darwin; } - + bool isTargetFreeBSD() const { return TargetTriple.getOS() == Triple::FreeBSD; } + bool isTargetSolaris() const { return TargetTriple.getOS() == Triple::Solaris; } + // ELF is a reasonably sane default and the only other X86 targets we // support are Darwin and Windows. Just use "not those". - bool isTargetELF() const { + bool isTargetELF() const { return !isTargetDarwin() && !isTargetWindows() && !isTargetCygMing(); } bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; } - bool isTargetMingw() const { - return TargetTriple.getOS() == Triple::MinGW32 || - TargetTriple.getOS() == Triple::MinGW64; } + bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; } bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; } bool isTargetCygMing() const { return isTargetMingw() || isTargetCygwin(); } - + /// isTargetCOFF - Return true if this is any COFF/Windows target variant. bool isTargetCOFF() const { return isTargetMingw() || isTargetCygwin() || isTargetWindows(); @@ -186,6 +192,10 @@ public: return Is64Bit && (isTargetMingw() || isTargetWindows()); } + bool isTargetEnvMacho() const { + return isTargetDarwin() || (TargetTriple.getEnvironment() == Triple::MachO); + } + bool isTargetWin32() const { return !Is64Bit && (isTargetMingw() || isTargetWindows()); } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 4c01e60..8fb9470 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -30,10 +30,12 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { case Triple::Darwin: return new X86MCAsmInfoDarwin(TheTriple); case Triple::MinGW32: - case Triple::MinGW64: case Triple::Cygwin: case Triple::Win32: - return new X86MCAsmInfoCOFF(TheTriple); + if (TheTriple.getEnvironment() == Triple::MachO) + return new X86MCAsmInfoDarwin(TheTriple); + else + return new X86MCAsmInfoCOFF(TheTriple); default: return new X86ELFMCAsmInfo(TheTriple); } @@ -43,22 +45,25 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT, MCContext &Ctx, TargetAsmBackend &TAB, raw_ostream &_OS, MCCodeEmitter *_Emitter, - bool RelaxAll) { + bool RelaxAll, + bool NoExecStack) { Triple TheTriple(TT); switch (TheTriple.getOS()) { case Triple::Darwin: return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); case Triple::MinGW32: - case Triple::MinGW64: case Triple::Cygwin: case Triple::Win32: - return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); + if (TheTriple.getEnvironment() == Triple::MachO) + return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll); + else + return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll); default: - return createELFStreamer(Ctx, 
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 4c01e60..8fb9470 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -30,10 +30,12 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
   case Triple::Darwin:
     return new X86MCAsmInfoDarwin(TheTriple);
   case Triple::MinGW32:
-  case Triple::MinGW64:
   case Triple::Cygwin:
   case Triple::Win32:
-    return new X86MCAsmInfoCOFF(TheTriple);
+    if (TheTriple.getEnvironment() == Triple::MachO)
+      return new X86MCAsmInfoDarwin(TheTriple);
+    else
+      return new X86MCAsmInfoCOFF(TheTriple);
   default:
     return new X86ELFMCAsmInfo(TheTriple);
   }
@@ -43,22 +45,25 @@ static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
                                     MCContext &Ctx, TargetAsmBackend &TAB,
                                     raw_ostream &_OS,
                                     MCCodeEmitter *_Emitter,
-                                    bool RelaxAll) {
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
   Triple TheTriple(TT);
   switch (TheTriple.getOS()) {
   case Triple::Darwin:
     return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
   case Triple::MinGW32:
-  case Triple::MinGW64:
   case Triple::Cygwin:
   case Triple::Win32:
-    return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll);
+    if (TheTriple.getEnvironment() == Triple::MachO)
+      return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
+    else
+      return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll);
   default:
-    return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
+    return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
   }
 }

-extern "C" void LLVMInitializeX86Target() { 
+extern "C" void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
   RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
@@ -91,11 +96,11 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
                                          const std::string &FS)
   : X86TargetMachine(T, TT, FS, false),
     DataLayout(getSubtargetImpl()->isTargetDarwin() ?
-               "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-n8:16:32" :
+               "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-n8:16:32" :
               (getSubtargetImpl()->isTargetCygMing() ||
                getSubtargetImpl()->isTargetWindows()) ?
-               "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-n8:16:32" :
-               "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-n8:16:32"),
+               "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-n8:16:32" :
+               "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-n8:16:32"),
     InstrInfo(*this),
     TSInfo(*this),
     TLInfo(*this),
@@ -106,7 +111,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
 X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
                                          const std::string &FS)
   : X86TargetMachine(T, TT, FS, true),
-    DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-n8:16:32:64"),
+    DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-n8:16:32:64"),
     InstrInfo(*this),
     TSInfo(*this),
     TLInfo(*this),
@@ -115,11 +120,11 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,

 /// X86TargetMachine ctor - Create an X86 target.
 ///
-X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT, 
+X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT,
                                    const std::string &FS, bool is64Bit)
-  : LLVMTargetMachine(T, TT), 
+  : LLVMTargetMachine(T, TT),
     Subtarget(TT, FS, is64Bit),
-    FrameInfo(Subtarget),
+    FrameLowering(*this, Subtarget),
     ELFWriterInfo(is64Bit, true) {
   DefRelocModel = getRelocationModel();
@@ -227,12 +232,12 @@ bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
                                       JITCodeEmitter &JCE) {
   // FIXME: Move this to TargetJITInfo!
   // On Darwin, do not override 64-bit setting made in X86TargetMachine().
-  if (DefRelocModel == Reloc::Default && 
+  if (DefRelocModel == Reloc::Default &&
       (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit())) {
     setRelocationModel(Reloc::Static);
     Subtarget.setPICStyle(PICStyles::None);
   }
-  
+
   PM.add(createX86JITCodeEmitterPass(*this, JCE));
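Two recurring themes in the X86TargetMachine.cpp hunks above: Windows-family triples whose environment component is MachO (e.g. a hypothetical i686-pc-win32-macho) are now routed to the Darwin-flavored MCAsmInfo and MachO streamer, and every X86 data layout string gains an f128:128:128 component. That component reads f<size>:<abi-align>:<preferred-align>, all in bits, so fp128 values now get 16-byte ABI and preferred alignment on all X86 variants. A toy decoder for just that one component (a sketch for illustration, not LLVM's actual TargetData parser):

    #include <cassert>
    #include <sstream>
    #include <string>

    // Decodes one "f<size>:<abi>:<pref>" data layout component (all in bits).
    struct FloatAlign { unsigned SizeBits = 0, ABIBits = 0, PrefBits = 0; };

    FloatAlign parseFloatSpec(const std::string &Spec) {
      std::istringstream IS(Spec);
      char F = 0, C1 = 0, C2 = 0;
      FloatAlign FA;
      IS >> F >> FA.SizeBits >> C1 >> FA.ABIBits >> C2 >> FA.PrefBits;
      assert(F == 'f' && C1 == ':' && C2 == ':' && "malformed float spec");
      return FA;
    }

    int main() {
      FloatAlign FA = parseFloatSpec("f128:128:128"); // the component added above
      assert(FA.SizeBits == 128);
      assert(FA.ABIBits / 8 == 16 && FA.PrefBits / 8 == 16); // 16-byte alignment
      return 0;
    }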
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 8ba7a71..5973922 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -18,13 +18,13 @@
 #include "X86ELFWriterInfo.h"
 #include "X86InstrInfo.h"
 #include "X86ISelLowering.h"
-#include "X86FrameInfo.h"
+#include "X86FrameLowering.h"
 #include "X86JITInfo.h"
 #include "X86SelectionDAGInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"

 namespace llvm {

@@ -32,7 +32,7 @@ class formatted_raw_ostream;

 class X86TargetMachine : public LLVMTargetMachine {
   X86Subtarget      Subtarget;
-  X86FrameInfo      FrameInfo;
+  X86FrameLowering  FrameLowering;
   X86ELFWriterInfo  ELFWriterInfo;
   Reloc::Model      DefRelocModel; // Reloc model before it's overridden.
@@ -48,7 +48,9 @@ public:
   virtual const X86InstrInfo *getInstrInfo() const {
     llvm_unreachable("getInstrInfo not implemented");
   }
-  virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+  virtual const TargetFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
   virtual X86JITInfo *getJITInfo() {
     llvm_unreachable("getJITInfo not implemented");
   }
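The FrameInfo-to-FrameLowering rename in the last two files follows a common accessor pattern: the target machine owns the concrete X86FrameLowering object by value, and generic code reaches it only through the abstract base returned by getFrameLowering(). A standalone sketch of that shape, with illustrative names rather than the real LLVM headers:

    #include <cstdio>

    struct TargetFrameLowering {                 // abstract interface
      virtual ~TargetFrameLowering() {}
      virtual void emitPrologue() const = 0;
    };

    struct X86FrameLoweringSketch : TargetFrameLowering {
      void emitPrologue() const override { std::puts("x86 prologue"); }
    };

    struct TargetMachineSketch {
      virtual ~TargetMachineSketch() {}
      virtual const TargetFrameLowering *getFrameLowering() const = 0;
    };

    struct X86TargetMachineSketch : TargetMachineSketch {
      X86FrameLoweringSketch FrameLowering;      // concrete member, as in the hunk
      const TargetFrameLowering *getFrameLowering() const override {
        return &FrameLowering;
      }
    };

    int main() {
      X86TargetMachineSketch TM;
      TM.getFrameLowering()->emitPrologue();     // generic code sees only the base
      return 0;
    }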