diff options
Diffstat (limited to 'lib/Target')
240 files changed, 85965 insertions, 0 deletions
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h new file mode 100644 index 0000000..8134dcc --- /dev/null +++ b/lib/Target/ARM/ARM.h @@ -0,0 +1,109 @@ +//===-- ARM.h - Top-level interface for ARM representation---- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// ARM back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_ARM_H +#define TARGET_ARM_H + +#include <iosfwd> +#include <cassert> + +namespace llvm { + +class ARMTargetMachine; +class FunctionPass; +class MachineCodeEmitter; + +// Enums corresponding to ARM condition codes +namespace ARMCC { + enum CondCodes { + EQ, + NE, + HS, + LO, + MI, + PL, + VS, + VC, + HI, + LS, + GE, + LT, + GT, + LE, + AL + }; + + inline static CondCodes getOppositeCondition(CondCodes CC){ + switch (CC) { + default: assert(0 && "Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } + } +} + +inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) { + switch (CC) { + default: assert(0 && "Unknown condition code"); + case ARMCC::EQ: return "eq"; + case ARMCC::NE: return "ne"; + case ARMCC::HS: return "hs"; + case ARMCC::LO: return "lo"; + case ARMCC::MI: return "mi"; + case ARMCC::PL: return "pl"; + case ARMCC::VS: return "vs"; + case ARMCC::VC: return "vc"; + case ARMCC::HI: return "hi"; + case ARMCC::LS: return "ls"; 
+ case ARMCC::GE: return "ge"; + case ARMCC::LT: return "lt"; + case ARMCC::GT: return "gt"; + case ARMCC::LE: return "le"; + case ARMCC::AL: return "al"; + } +} + +FunctionPass *createARMISelDag(ARMTargetMachine &TM); +FunctionPass *createARMCodePrinterPass(std::ostream &O, ARMTargetMachine &TM); +FunctionPass *createARMCodeEmitterPass(ARMTargetMachine &TM, + MachineCodeEmitter &MCE); +FunctionPass *createARMLoadStoreOptimizationPass(); +FunctionPass *createARMConstantIslandPass(); + +} // end namespace llvm; + +// Defines symbolic names for ARM registers. This defines a mapping from +// register name to register number. +// +#include "ARMGenRegisterNames.inc" + +// Defines symbolic names for the ARM instructions. +// +#include "ARMGenInstrNames.inc" + + +#endif diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td new file mode 100644 index 0000000..0272004 --- /dev/null +++ b/lib/Target/ARM/ARM.td @@ -0,0 +1,119 @@ +//===- ARM.td - Describe the ARM Target Machine -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "../Target.td" + +//===----------------------------------------------------------------------===// +// ARM Subtarget features. 
+// + +def ArchV4T : SubtargetFeature<"v4t", "ARMArchVersion", "V4T", + "ARM v4T">; +def ArchV5T : SubtargetFeature<"v5t", "ARMArchVersion", "V5T", + "ARM v5T">; +def ArchV5TE : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE", + "ARM v5TE, v5TEj, v5TExp">; +def ArchV6 : SubtargetFeature<"v6", "ARMArchVersion", "V6", + "ARM v6">; +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFP2", "true", + "Enable VFP2 instructions ">; + +//===----------------------------------------------------------------------===// +// ARM Processors supported. +// + +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +// V4 Processors. +def : Proc<"generic", []>; +def : Proc<"arm8", []>; +def : Proc<"arm810", []>; +def : Proc<"strongarm", []>; +def : Proc<"strongarm110", []>; +def : Proc<"strongarm1100", []>; +def : Proc<"strongarm1110", []>; + +// V4T Processors. +def : Proc<"arm7tdmi", [ArchV4T]>; +def : Proc<"arm7tdmi-s", [ArchV4T]>; +def : Proc<"arm710t", [ArchV4T]>; +def : Proc<"arm720t", [ArchV4T]>; +def : Proc<"arm9", [ArchV4T]>; +def : Proc<"arm9tdmi", [ArchV4T]>; +def : Proc<"arm920", [ArchV4T]>; +def : Proc<"arm920t", [ArchV4T]>; +def : Proc<"arm922t", [ArchV4T]>; +def : Proc<"arm940t", [ArchV4T]>; +def : Proc<"ep9312", [ArchV4T]>; + +// V5T Processors. +def : Proc<"arm10tdmi", [ArchV5T]>; +def : Proc<"arm1020t", [ArchV5T]>; + +// V5TE Processors. +def : Proc<"arm9e", [ArchV5TE]>; +def : Proc<"arm926ej-s", [ArchV5TE]>; +def : Proc<"arm946e-s", [ArchV5TE]>; +def : Proc<"arm966e-s", [ArchV5TE]>; +def : Proc<"arm968e-s", [ArchV5TE]>; +def : Proc<"arm10e", [ArchV5TE]>; +def : Proc<"arm1020e", [ArchV5TE]>; +def : Proc<"arm1022e", [ArchV5TE]>; +def : Proc<"xscale", [ArchV5TE]>; +def : Proc<"iwmmxt", [ArchV5TE]>; + +// V6 Processors. 
+def : Proc<"arm1136j-s", [ArchV6]>; +def : Proc<"arm1136jf-s", [ArchV6, FeatureVFP2]>; +def : Proc<"arm1176jz-s", [ArchV6]>; +def : Proc<"arm1176jzf-s", [ArchV6, FeatureVFP2]>; +def : Proc<"mpcorenovfp", [ArchV6]>; +def : Proc<"mpcore", [ArchV6, FeatureVFP2]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "ARMRegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "ARMInstrInfo.td" + +def ARMInstrInfo : InstrInfo { + // Define how we want to layout our target-specific information field. + let TSFlagsFields = ["AddrModeBits", + "SizeFlag", + "IndexModeBits", + "Opcode"]; + let TSFlagsShifts = [0, + 4, + 7, + 9]; +} + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def ARM : Target { + // Pull in Instruction Info: + let InstructionSet = ARMInstrInfo; +} diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h new file mode 100644 index 0000000..3f47a69 --- /dev/null +++ b/lib/Target/ARM/ARMAddressingModes.h @@ -0,0 +1,394 @@ +//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM addressing mode implementation stuff. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H +#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H + +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> + +namespace llvm { + +/// ARM_AM - ARM Addressing Mode Stuff +namespace ARM_AM { + enum ShiftOpc { + no_shift = 0, + asr, + lsl, + lsr, + ror, + rrx + }; + + enum AddrOpc { + add = '+', sub = '-' + }; + + static inline const char *getShiftOpcStr(ShiftOpc Op) { + switch (Op) { + default: assert(0 && "Unknown shift opc!"); + case ARM_AM::asr: return "asr"; + case ARM_AM::lsl: return "lsl"; + case ARM_AM::lsr: return "lsr"; + case ARM_AM::ror: return "ror"; + case ARM_AM::rrx: return "rrx"; + } + } + + static inline ShiftOpc getShiftOpcForNode(SDOperand N) { + switch (N.getOpcode()) { + default: return ARM_AM::no_shift; + case ISD::SHL: return ARM_AM::lsl; + case ISD::SRL: return ARM_AM::lsr; + case ISD::SRA: return ARM_AM::asr; + case ISD::ROTR: return ARM_AM::ror; + //case ISD::ROTL: // Only if imm -> turn into ROTR. + // Can't handle RRX here, because it would require folding a flag into + // the addressing mode. :( This causes us to miss certain things. + //case ARMISD::RRX: return ARM_AM::rrx; + } + } + + enum AMSubMode { + bad_am_submode = 0, + ia, + ib, + da, + db + }; + + static inline const char *getAMSubModeStr(AMSubMode Mode) { + switch (Mode) { + default: assert(0 && "Unknown addressing sub-mode!"); + case ARM_AM::ia: return "ia"; + case ARM_AM::ib: return "ib"; + case ARM_AM::da: return "da"; + case ARM_AM::db: return "db"; + } + } + + static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) { + switch (Mode) { + default: assert(0 && "Unknown addressing sub-mode!"); + case ARM_AM::ia: return isLD ? "fd" : "ea"; + case ARM_AM::ib: return isLD ? "ed" : "fa"; + case ARM_AM::da: return isLD ? "fa" : "ed"; + case ARM_AM::db: return isLD ? 
"ea" : "fd"; + } + } + + /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits. + /// + static inline unsigned rotr32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val >> Amt) | (Val << ((32-Amt)&31)); + } + + /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits. + /// + static inline unsigned rotl32(unsigned Val, unsigned Amt) { + assert(Amt < 32 && "Invalid rotate amount"); + return (Val << Amt) | (Val >> ((32-Amt)&31)); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #1: shift_operand with registers + //===--------------------------------------------------------------------===// + // + // This 'addressing mode' is used for arithmetic instructions. It can + // represent things like: + // reg + // reg [asr|lsl|lsr|ror|rrx] reg + // reg [asr|lsl|lsr|ror|rrx] imm + // + // This is stored three operands [rega, regb, opc]. The first is the base + // reg, the second is the shift amount (or reg0 if not present or imm). The + // third operand encodes the shift opcode and the imm if a reg isn't present. + // + static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) { + return ShOp | (Imm << 3); + } + static inline unsigned getSORegOffset(unsigned Op) { + return Op >> 3; + } + static inline ShiftOpc getSORegShOp(unsigned Op) { + return (ShiftOpc)(Op & 7); + } + + /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return + /// the 8-bit imm value. + static inline unsigned getSOImmValImm(unsigned Imm) { + return Imm & 0xFF; + } + /// getSOImmValRotate - Given an encoded imm field for the reg/imm form, return + /// the rotate amount. + static inline unsigned getSOImmValRot(unsigned Imm) { + return (Imm >> 8) * 2; + } + + /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand, + /// computing the rotate amount to use. 
If this immediate value cannot be + /// handled with a single shifter-op, determine a good rotate amount that will + /// take a maximal chunk of bits out of the immediate. + static inline unsigned getSOImmValRotate(unsigned Imm) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the rotate amount. + unsigned TZ = CountTrailingZeros_32(Imm); + + // Rotate amount must be even. Something like 0x200 must be rotated 8 bits, + // not 9. + unsigned RotAmt = TZ & ~1; + + // If we can handle this spread, return it. + if ((rotr32(Imm, RotAmt) & ~255U) == 0) + return (32-RotAmt)&31; // HW rotates right, not left. + + // For values like 0xF000000F, we should skip the first run of ones, then + // retry the hunt. + if (Imm & 1) { + unsigned TrailingOnes = CountTrailingZeros_32(~Imm); + if (TrailingOnes != 32) { // Avoid overflow on 0xFFFFFFFF + // Restart the search for a high-order bit after the initial seconds of + // ones. + unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1)); + + // Rotate amount must be even. + unsigned RotAmt2 = TZ2 & ~1; + + // If this fits, use it. + if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0) + return (32-RotAmt2)&31; // HW rotates right, not left. + } + } + + // Otherwise, we have no way to cover this span of bits with a single + // shifter_op immediate. Return a chunk of bits that will be useful to + // handle. + return (32-RotAmt)&31; // HW rotates right, not left. + } + + /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit + /// into an shifter_operand immediate operand, return the 12-bit encoding for + /// it. If not, return -1. + static inline int getSOImmVal(unsigned Arg) { + // 8-bit (or less) immediates are trivially shifter_operands with a rotate + // of zero. 
+ if ((Arg & ~255U) == 0) return Arg; + + unsigned RotAmt = getSOImmValRotate(Arg); + + // If this cannot be handled with a single shifter_op, bail out. + if (rotr32(~255U, RotAmt) & Arg) + return -1; + + // Encode this correctly. + return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8); + } + + /// isSOImmTwoPartVal - Return true if the specified value can be obtained by + /// or'ing together two SOImmVal's. + static inline bool isSOImmTwoPartVal(unsigned V) { + // If this can be handled with a single shifter_op, bail out. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + if (V == 0) + return false; + + // If this can be handled with two shifter_op's, accept. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + return V == 0; + } + + /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal, + /// return the first chunk of it. + static inline unsigned getSOImmTwoPartFirst(unsigned V) { + return rotr32(255U, getSOImmValRotate(V)) & V; + } + + /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal, + /// return the second chunk of it. + static inline unsigned getSOImmTwoPartSecond(unsigned V) { + // Mask out the first hunk. + V = rotr32(~255U, getSOImmValRotate(V)) & V; + + // Take what's left. + assert(V == (rotr32(255U, getSOImmValRotate(V)) & V)); + return V; + } + + /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed + /// by a left shift. Returns the shift amount to use. + static inline unsigned getThumbImmValShift(unsigned Imm) { + // 8-bit (or less) immediates are trivially immediate operand with a shift + // of zero. + if ((Imm & ~255U) == 0) return 0; + + // Use CTZ to compute the shift amount. + return CountTrailingZeros_32(Imm); + } + + /// isThumbImmShiftedVal - Return true if the specified value can be obtained + /// by left shifting a 8-bit immediate. 
+ static inline bool isThumbImmShiftedVal(unsigned V) { + // If this can be handled with + V = (~255U << getThumbImmValShift(V)) & V; + return V == 0; + } + + /// getThumbImmNonShiftedVal - If V is a value that satisfies + /// isThumbImmShiftedVal, return the non-shiftd value. + static inline unsigned getThumbImmNonShiftedVal(unsigned V) { + return V >> getThumbImmValShift(V); + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #2 + //===--------------------------------------------------------------------===// + // + // This is used for most simple load/store instructions. + // + // addrmode2 := reg +/- reg shop imm + // addrmode2 := reg +/- imm12 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. + // + // If this addressing mode is a frame index (before prolog/epilog insertion + // and code rewriting), this operand will have the form: FI#, reg0, <offs> + // with no shift amount for the frame offset. + // + static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) { + assert(Imm12 < (1 << 12) && "Imm too large!"); + bool isSub = Opc == sub; + return Imm12 | ((int)isSub << 12) | (SO << 13); + } + static inline unsigned getAM2Offset(unsigned AM2Opc) { + return AM2Opc & ((1 << 12)-1); + } + static inline AddrOpc getAM2Op(unsigned AM2Opc) { + return ((AM2Opc >> 12) & 1) ? sub : add; + } + static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) { + return (ShiftOpc)(AM2Opc >> 13); + } + + + //===--------------------------------------------------------------------===// + // Addressing Mode #3 + //===--------------------------------------------------------------------===// + // + // This is used for sign-extending loads, and load/store-pair instructions. 
+ // + // addrmode3 := reg +/- reg + // addrmode3 := reg +/- imm8 + // + // The first operand is always a Reg. The second operand is a reg if in + // reg/reg form, otherwise it's reg#0. The third field encodes the operation + // in bit 8, the immediate in bits 0-7. + + /// getAM3Opc - This function encodes the addrmode3 opc field. + static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset; + } + static inline unsigned char getAM3Offset(unsigned AM3Opc) { + return AM3Opc & 0xFF; + } + static inline AddrOpc getAM3Op(unsigned AM3Opc) { + return ((AM3Opc >> 8) & 1) ? sub : add; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #4 + //===--------------------------------------------------------------------===// + // + // This is used for load / store multiple instructions. + // + // addrmode4 := reg, <mode> + // + // The four modes are: + // IA - Increment after + // IB - Increment before + // DA - Decrement after + // DB - Decrement before + // + // If the 4th bit (writeback)is set, then the base register is updated after + // the memory transfer. + + static inline AMSubMode getAM4SubMode(unsigned Mode) { + return (AMSubMode)(Mode & 0x7); + } + + static inline unsigned getAM4ModeImm(AMSubMode SubMode, bool WB = false) { + return (int)SubMode | ((int)WB << 3); + } + + static inline bool getAM4WBFlag(unsigned Mode) { + return (Mode >> 3) & 1; + } + + //===--------------------------------------------------------------------===// + // Addressing Mode #5 + //===--------------------------------------------------------------------===// + // + // This is used for coprocessor instructions, such as FP load/stores. + // + // addrmode5 := reg +/- imm8*4 + // + // The first operand is always a Reg. The third field encodes the operation + // in bit 8, the immediate in bits 0-7. + // + // This can also be used for FP load/store multiple ops. 
The third field encodes + // writeback mode in bit 8, the number of registers (or 2 times the number of + // registers for DPR ops) in bits 0-7. In addition, bit 9-11 encodes one of the + // following two sub-modes: + // + // IA - Increment after + // DB - Decrement before + + /// getAM5Opc - This function encodes the addrmode5 opc field. + static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) { + bool isSub = Opc == sub; + return ((int)isSub << 8) | Offset; + } + static inline unsigned char getAM5Offset(unsigned AM5Opc) { + return AM5Opc & 0xFF; + } + static inline AddrOpc getAM5Op(unsigned AM5Opc) { + return ((AM5Opc >> 8) & 1) ? sub : add; + } + + /// getAM5Opc - This function encodes the addrmode5 opc field for FLDM and + /// FSTM instructions. + static inline unsigned getAM5Opc(AMSubMode SubMode, bool WB, + unsigned char Offset) { + assert((SubMode == ia || SubMode == db) && + "Illegal addressing mode 5 sub-mode!"); + return ((int)SubMode << 9) | ((int)WB << 8) | Offset; + } + static inline AMSubMode getAM5SubMode(unsigned AM5Opc) { + return (AMSubMode)((AM5Opc >> 9) & 0x7); + } + static inline bool getAM5WBFlag(unsigned AM5Opc) { + return ((AM5Opc >> 8) & 1); + } + +} // end namespace ARM_AM +} // end namespace llvm + +#endif + diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp new file mode 100644 index 0000000..5e65226 --- /dev/null +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -0,0 +1,1029 @@ +//===-- ARMAsmPrinter.cpp - ARM LLVM assembly writer ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format ARM assembly language. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "ARM.h" +#include "ARMTargetMachine.h" +#include "ARMAddressingModes.h" +#include "ARMConstantPoolValue.h" +#include "ARMMachineFunctionInfo.h" +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DwarfWriter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Mangler.h" +#include "llvm/Support/MathExtras.h" +#include <cctype> +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +namespace { + struct VISIBILITY_HIDDEN ARMAsmPrinter : public AsmPrinter { + ARMAsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T) + : AsmPrinter(O, TM, T), DW(O, this, T), AFI(NULL), InCPMode(false) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + } + + DwarfWriter DW; + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when printing asm code for different targets. + const ARMSubtarget *Subtarget; + + /// AFI - Keep a pointer to ARMFunctionInfo for the current + /// MachineFunction + ARMFunctionInfo *AFI; + + /// We name each basic block in a Function with a unique number, so + /// that we can consistently refer to them later. 
This is cleared + /// at the beginning of each call to runOnMachineFunction(). + /// + typedef std::map<const Value *, unsigned> ValueMapTy; + ValueMapTy NumberForBB; + + /// Keeps the set of GlobalValues that require non-lazy-pointers for + /// indirect access. + std::set<std::string> GVNonLazyPtrs; + + /// Keeps the set of external function GlobalAddresses that the asm + /// printer should generate stubs for. + std::set<std::string> FnStubs; + + /// True if asm printer is printing a series of CONSTPOOL_ENTRY. + bool InCPMode; + + virtual const char *getPassName() const { + return "ARM Assembly Printer"; + } + + void printOperand(const MachineInstr *MI, int opNum, + const char *Modifier = 0); + void printSOImmOperand(const MachineInstr *MI, int opNum); + void printSOImm2PartOperand(const MachineInstr *MI, int opNum); + void printSORegOperand(const MachineInstr *MI, int opNum); + void printAddrMode2Operand(const MachineInstr *MI, int OpNo); + void printAddrMode2OffsetOperand(const MachineInstr *MI, int OpNo); + void printAddrMode3Operand(const MachineInstr *MI, int OpNo); + void printAddrMode3OffsetOperand(const MachineInstr *MI, int OpNo); + void printAddrMode4Operand(const MachineInstr *MI, int OpNo, + const char *Modifier = 0); + void printAddrMode5Operand(const MachineInstr *MI, int OpNo, + const char *Modifier = 0); + void printAddrModePCOperand(const MachineInstr *MI, int OpNo, + const char *Modifier = 0); + void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNo); + void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNo, + unsigned Scale); + void printThumbAddrModeS1Operand(const MachineInstr *MI, int OpNo); + void printThumbAddrModeS2Operand(const MachineInstr *MI, int OpNo); + void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNo); + void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNo); + void printPredicateOperand(const MachineInstr *MI, int opNum); + void printSBitModifierOperand(const MachineInstr *MI, int 
opNum); + void printPCLabel(const MachineInstr *MI, int opNum); + void printRegisterList(const MachineInstr *MI, int opNum); + void printCPInstOperand(const MachineInstr *MI, int opNum, + const char *Modifier); + void printJTBlockOperand(const MachineInstr *MI, int opNum); + + virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + + bool printInstruction(const MachineInstr *MI); // autogenerated. + void printMachineInstruction(const MachineInstr *MI); + bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + + virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { + printDataDirective(MCPV->getType()); + + ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MCPV; + GlobalValue *GV = ACPV->getGV(); + std::string Name = GV ? Mang->getValueName(GV) : TAI->getGlobalPrefix(); + if (!GV) + Name += ACPV->getSymbol(); + if (ACPV->isNonLazyPointer()) { + GVNonLazyPtrs.insert(Name); + O << TAI->getPrivateGlobalPrefix() << Name << "$non_lazy_ptr"; + } else if (ACPV->isStub()) { + FnStubs.insert(Name); + O << TAI->getPrivateGlobalPrefix() << Name << "$stub"; + } else + O << Name; + if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")"; + if (ACPV->getPCAdjustment() != 0) { + O << "-(" << TAI->getPrivateGlobalPrefix() << "PC" + << utostr(ACPV->getLabelId()) + << "+" << (unsigned)ACPV->getPCAdjustment(); + if (ACPV->mustAddCurrentAddress()) + O << "-."; + O << ")"; + } + O << "\n"; + + // If the constant pool value is a extern weak symbol, remember to emit + // the weak reference. 
+ if (GV && GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineModuleInfo>(); + } + }; +} // end of anonymous namespace + +#include "ARMGenAsmWriter.inc" + +/// createARMCodePrinterPass - Returns a pass that prints the ARM +/// assembly code for a MachineFunction to the given output stream, +/// using the given target machine description. This should work +/// regardless of whether the function is in SSA form. +/// +FunctionPass *llvm::createARMCodePrinterPass(std::ostream &o, + ARMTargetMachine &tm) { + return new ARMAsmPrinter(o, tm, tm.getTargetAsmInfo()); +} + +/// runOnMachineFunction - This uses the printInstruction() +/// method to print assembly for each instruction. +/// +bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + AFI = MF.getInfo<ARMFunctionInfo>(); + + DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>()); + + SetupMachineFunction(MF); + O << "\n"; + + // NOTE: we don't print out constant pools here, they are handled as + // instructions. + + O << "\n"; + // Print out labels for the function. 
+ const Function *F = MF.getFunction(); + switch (F->getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::InternalLinkage: + SwitchToTextSection("\t.text", F); + break; + case Function::ExternalLinkage: + SwitchToTextSection("\t.text", F); + O << "\t.globl\t" << CurrentFnName << "\n"; + break; + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + if (Subtarget->isTargetDarwin()) { + SwitchToTextSection( + ".section __TEXT,__textcoal_nt,coalesced,pure_instructions", F); + O << "\t.globl\t" << CurrentFnName << "\n"; + O << "\t.weak_definition\t" << CurrentFnName << "\n"; + } else { + O << TAI->getWeakRefDirective() << CurrentFnName << "\n"; + } + break; + } + + const char *VisibilityDirective = NULL; + if (F->hasHiddenVisibility()) + VisibilityDirective = TAI->getHiddenDirective(); + else if (F->hasProtectedVisibility()) + VisibilityDirective = TAI->getProtectedDirective(); + + if (VisibilityDirective) + O << VisibilityDirective << CurrentFnName << "\n"; + + if (AFI->isThumbFunction()) { + EmitAlignment(1, F, AFI->getAlign()); + O << "\t.code\t16\n"; + O << "\t.thumb_func"; + if (Subtarget->isTargetDarwin()) + O << "\t" << CurrentFnName; + O << "\n"; + InCPMode = false; + } else + EmitAlignment(2, F); + + O << CurrentFnName << ":\n"; + // Emit pre-function debug information. + DW.BeginFunction(&MF); + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block. + if (I != MF.begin()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + printMachineInstruction(II); + } + } + + if (TAI->hasDotTypeDotSizeDirective()) + O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n"; + + // Emit post-function debug information. 
+ DW.EndFunction(); + + return false; +} + +void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum, + const char *Modifier) { + const MachineOperand &MO = MI->getOperand(opNum); + switch (MO.getType()) { + case MachineOperand::MO_Register: + if (MRegisterInfo::isPhysicalRegister(MO.getReg())) + O << TM.getRegisterInfo()->get(MO.getReg()).Name; + else + assert(0 && "not implemented"); + break; + case MachineOperand::MO_Immediate: { + if (!Modifier || strcmp(Modifier, "no_hash") != 0) + O << "#"; + + O << (int)MO.getImmedValue(); + break; + } + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + case MachineOperand::MO_GlobalAddress: { + bool isCallOp = Modifier && !strcmp(Modifier, "call"); + GlobalValue *GV = MO.getGlobal(); + std::string Name = Mang->getValueName(GV); + bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() || + GV->hasLinkOnceLinkage()); + if (isExt && isCallOp && Subtarget->isTargetDarwin() && + TM.getRelocationModel() != Reloc::Static) { + O << TAI->getPrivateGlobalPrefix() << Name << "$stub"; + FnStubs.insert(Name); + } else + O << Name; + + if (MO.getOffset() > 0) + O << '+' << MO.getOffset(); + else if (MO.getOffset() < 0) + O << MO.getOffset(); + + if (isCallOp && Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_) + O << "(PLT)"; + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + break; + } + case MachineOperand::MO_ExternalSymbol: { + bool isCallOp = Modifier && !strcmp(Modifier, "call"); + std::string Name(TAI->getGlobalPrefix()); + Name += MO.getSymbolName(); + if (isCallOp && Subtarget->isTargetDarwin() && + TM.getRelocationModel() != Reloc::Static) { + O << TAI->getPrivateGlobalPrefix() << Name << "$stub"; + FnStubs.insert(Name); + } else + O << Name; + if (isCallOp && Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_) + O << "(PLT)"; + break; + } + case MachineOperand::MO_ConstantPoolIndex: + O << 
TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + << '_' << MO.getConstantPoolIndex(); + break; + case MachineOperand::MO_JumpTableIndex: + O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << MO.getJumpTableIndex(); + break; + default: + O << "<unknown operand type>"; abort (); break; + } +} + +static void printSOImm(std::ostream &O, int64_t V, const TargetAsmInfo *TAI) { + assert(V < (1 << 12) && "Not a valid so_imm value!"); + unsigned Imm = ARM_AM::getSOImmValImm(V); + unsigned Rot = ARM_AM::getSOImmValRot(V); + + // Print low-level immediate formation info, per + // A5.1.3: "Data-processing operands - Immediate". + if (Rot) { + O << "#" << Imm << ", " << Rot; + // Pretty printed version. + O << ' ' << TAI->getCommentString() << ' ' << (int)ARM_AM::rotr32(Imm, Rot); + } else { + O << "#" << Imm; + } +} + +/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit +/// immediate in bits 0-7. +void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum) { + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isImmediate() && "Not a valid so_imm value!"); + printSOImm(O, MO.getImmedValue(), TAI); +} + +/// printSOImm2PartOperand - SOImm is broken into two pieces using a mov +/// followed by a or to materialize. 
+void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum) { + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isImmediate() && "Not a valid so_imm value!"); + unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImmedValue()); + unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImmedValue()); + printSOImm(O, ARM_AM::getSOImmVal(V1), TAI); + O << "\n\torr"; + printPredicateOperand(MI, 2); + O << " "; + printOperand(MI, 0); + O << ", "; + printOperand(MI, 0); + O << ", "; + printSOImm(O, ARM_AM::getSOImmVal(V2), TAI); +} + +// so_reg is a 4-operand unit corresponding to register forms of the A5.1 +// "Addressing Mode 1 - Data-processing operands" forms. This includes: +// REG 0 0 - e.g. R5 +// REG REG 0,SH_OPC - e.g. R5, ROR R3 +// REG 0 IMM,SH_OPC - e.g. R5, LSL #3 +void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + assert(MRegisterInfo::isPhysicalRegister(MO1.getReg())); + O << TM.getRegisterInfo()->get(MO1.getReg()).Name; + + // Print the shift opc. + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImmedValue())) + << " "; + + if (MO2.getReg()) { + assert(MRegisterInfo::isPhysicalRegister(MO2.getReg())); + O << TM.getRegisterInfo()->get(MO2.getReg()).Name; + assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0); + } else { + O << "#" << ARM_AM::getSORegOffset(MO3.getImm()); + } +} + +void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + if (!MO1.isRegister()) { // FIXME: This is for CP entries, but isn't right. 
+ printOperand(MI, Op); + return; + } + + O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name; + + if (!MO2.getReg()) { + if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0. + O << ", #" + << (char)ARM_AM::getAM2Op(MO3.getImm()) + << ARM_AM::getAM2Offset(MO3.getImm()); + O << "]"; + return; + } + + O << ", " + << (char)ARM_AM::getAM2Op(MO3.getImm()) + << TM.getRegisterInfo()->get(MO2.getReg()).Name; + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImmedValue())) + << " #" << ShImm; + O << "]"; +} + +void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op){ + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + + if (!MO1.getReg()) { + unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); + assert(ImmOffs && "Malformed indexed load / store!"); + O << "#" + << (char)ARM_AM::getAM2Op(MO2.getImm()) + << ImmOffs; + return; + } + + O << (char)ARM_AM::getAM2Op(MO2.getImm()) + << TM.getRegisterInfo()->get(MO1.getReg()).Name; + + if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm())) + O << ", " + << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImmedValue())) + << " #" << ShImm; +} + +void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + assert(MRegisterInfo::isPhysicalRegister(MO1.getReg())); + O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name; + + if (MO2.getReg()) { + O << ", " + << (char)ARM_AM::getAM3Op(MO3.getImm()) + << TM.getRegisterInfo()->get(MO2.getReg()).Name + << "]"; + return; + } + + if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm())) + O << ", #" + << (char)ARM_AM::getAM3Op(MO3.getImm()) + << ImmOffs; + O << "]"; +} + +void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int 
Op){ + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + + if (MO1.getReg()) { + O << (char)ARM_AM::getAM3Op(MO2.getImm()) + << TM.getRegisterInfo()->get(MO1.getReg()).Name; + return; + } + + unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); + assert(ImmOffs && "Malformed indexed load / store!"); + O << "#" + << (char)ARM_AM::getAM3Op(MO2.getImm()) + << ImmOffs; +} + +void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op, + const char *Modifier) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm()); + if (Modifier && strcmp(Modifier, "submode") == 0) { + if (MO1.getReg() == ARM::SP) { + bool isLDM = (MI->getOpcode() == ARM::LDM || + MI->getOpcode() == ARM::LDM_RET); + O << ARM_AM::getAMSubModeAltStr(Mode, isLDM); + } else + O << ARM_AM::getAMSubModeStr(Mode); + } else { + printOperand(MI, Op); + if (ARM_AM::getAM4WBFlag(MO2.getImm())) + O << "!"; + } +} + +void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op, + const char *Modifier) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + + if (!MO1.isRegister()) { // FIXME: This is for CP entries, but isn't right. + printOperand(MI, Op); + return; + } + + assert(MRegisterInfo::isPhysicalRegister(MO1.getReg())); + + if (Modifier && strcmp(Modifier, "submode") == 0) { + ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm()); + if (MO1.getReg() == ARM::SP) { + bool isFLDM = (MI->getOpcode() == ARM::FLDMD || + MI->getOpcode() == ARM::FLDMS); + O << ARM_AM::getAMSubModeAltStr(Mode, isFLDM); + } else + O << ARM_AM::getAMSubModeStr(Mode); + return; + } else if (Modifier && strcmp(Modifier, "base") == 0) { + // Used for FSTM{D|S} and LSTM{D|S} operations. 
+ O << TM.getRegisterInfo()->get(MO1.getReg()).Name; + if (ARM_AM::getAM5WBFlag(MO2.getImm())) + O << "!"; + return; + } + + O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name; + + if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) { + O << ", #" + << (char)ARM_AM::getAM5Op(MO2.getImm()) + << ImmOffs*4; + } + O << "]"; +} + +void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op, + const char *Modifier) { + if (Modifier && strcmp(Modifier, "label") == 0) { + printPCLabel(MI, Op+1); + return; + } + + const MachineOperand &MO1 = MI->getOperand(Op); + assert(MRegisterInfo::isPhysicalRegister(MO1.getReg())); + O << "[pc, +" << TM.getRegisterInfo()->get(MO1.getReg()).Name << "]"; +} + +void +ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name; + O << ", " << TM.getRegisterInfo()->get(MO2.getReg()).Name << "]"; +} + +void +ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op, + unsigned Scale) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + const MachineOperand &MO3 = MI->getOperand(Op+2); + + if (!MO1.isRegister()) { // FIXME: This is for CP entries, but isn't right. 
+ printOperand(MI, Op); + return; + } + + O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name; + if (MO3.getReg()) + O << ", " << TM.getRegisterInfo()->get(MO3.getReg()).Name; + else if (unsigned ImmOffs = MO2.getImm()) { + O << ", #" << ImmOffs; + if (Scale > 1) + O << " * " << Scale; + } + O << "]"; +} + +void +ARMAsmPrinter::printThumbAddrModeS1Operand(const MachineInstr *MI, int Op) { + printThumbAddrModeRI5Operand(MI, Op, 1); +} +void +ARMAsmPrinter::printThumbAddrModeS2Operand(const MachineInstr *MI, int Op) { + printThumbAddrModeRI5Operand(MI, Op, 2); +} +void +ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op) { + printThumbAddrModeRI5Operand(MI, Op, 4); +} + +void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) { + const MachineOperand &MO1 = MI->getOperand(Op); + const MachineOperand &MO2 = MI->getOperand(Op+1); + O << "[" << TM.getRegisterInfo()->get(MO1.getReg()).Name; + if (unsigned ImmOffs = MO2.getImm()) + O << ", #" << ImmOffs << " * 4"; + O << "]"; +} + +void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int opNum) { + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(opNum).getImmedValue(); + if (CC != ARMCC::AL) + O << ARMCondCodeToString(CC); +} + +void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int opNum){ + unsigned Reg = MI->getOperand(opNum).getReg(); + if (Reg) { + assert(Reg == ARM::CPSR && "Expect ARM CPSR register!"); + O << 's'; + } +} + +void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int opNum) { + int Id = (int)MI->getOperand(opNum).getImmedValue(); + O << TAI->getPrivateGlobalPrefix() << "PC" << Id; +} + +void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int opNum) { + O << "{"; + for (unsigned i = opNum, e = MI->getNumOperands(); i != e; ++i) { + printOperand(MI, i); + if (i != e-1) O << ", "; + } + O << "}"; +} + +void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNo, + const char *Modifier) { + 
assert(Modifier && "This operand only works with a modifier!"); + // There are two aspects to a CONSTANTPOOL_ENTRY operand, the label and the + // data itself. + if (!strcmp(Modifier, "label")) { + unsigned ID = MI->getOperand(OpNo).getImm(); + O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + << '_' << ID << ":\n"; + } else { + assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE"); + unsigned CPI = MI->getOperand(OpNo).getConstantPoolIndex(); + + const MachineConstantPoolEntry &MCPE = // Chasing pointers is fun? + MI->getParent()->getParent()->getConstantPool()->getConstants()[CPI]; + + if (MCPE.isMachineConstantPoolEntry()) + EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); + else { + EmitGlobalConstant(MCPE.Val.ConstVal); + // remember to emit the weak reference + if (const GlobalValue *GV = dyn_cast<GlobalValue>(MCPE.Val.ConstVal)) + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + } + } +} + +void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNo) { + const MachineOperand &MO1 = MI->getOperand(OpNo); + const MachineOperand &MO2 = MI->getOperand(OpNo+1); // Unique Id + unsigned JTI = MO1.getJumpTableIndex(); + O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << JTI << '_' << MO2.getImmedValue() << ":\n"; + + const char *JTEntryDirective = TAI->getJumpTableDirective(); + if (!JTEntryDirective) + JTEntryDirective = TAI->getData32bitsDirective(); + + const MachineFunction *MF = MI->getParent()->getParent(); + MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; + bool UseSet= TAI->getSetDirective() && TM.getRelocationModel() == Reloc::PIC_; + std::set<MachineBasicBlock*> JTSets; + for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { + MachineBasicBlock *MBB = JTBBs[i]; + if (UseSet && JTSets.insert(MBB).second) + printSetLabel(JTI, 
MO2.getImmedValue(), MBB); + + O << JTEntryDirective << ' '; + if (UseSet) + O << TAI->getPrivateGlobalPrefix() << getFunctionNumber() + << '_' << JTI << '_' << MO2.getImmedValue() + << "_set_" << MBB->getNumber(); + else if (TM.getRelocationModel() == Reloc::PIC_) { + printBasicBlockLabel(MBB, false, false); + // If the arch uses custom Jump Table directives, don't calc relative to JT + if (!TAI->getJumpTableDirective()) + O << '-' << TAI->getPrivateGlobalPrefix() << "JTI" + << getFunctionNumber() << '_' << JTI << '_' << MO2.getImmedValue(); + } else + printBasicBlockLabel(MBB, false, false); + if (i != e-1) + O << '\n'; + } +} + + +bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode){ + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'c': // Don't print "$" before a global var name or constant. + case 'P': // Print a VFP double precision register. + printOperand(MI, OpNo); + return false; + case 'Q': + if (TM.getTargetData()->isLittleEndian()) + break; + // Fallthrough + case 'R': + if (TM.getTargetData()->isBigEndian()) + break; + // Fallthrough + case 'H': // Write second word of DI / DF reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNo).isRegister() || + OpNo+1 == MI->getNumOperands() || + !MI->getOperand(OpNo+1).isRegister()) + return true; + ++OpNo; // Return the high-part. 
+ } + } + + printOperand(MI, OpNo); + return false; +} + +void ARMAsmPrinter::printMachineInstruction(const MachineInstr *MI) { + ++EmittedInsts; + + int Opc = MI->getOpcode(); + switch (Opc) { + case ARM::CONSTPOOL_ENTRY: + if (!InCPMode && AFI->isThumbFunction()) { + EmitAlignment(2); + InCPMode = true; + } + break; + default: { + if (InCPMode && AFI->isThumbFunction()) + InCPMode = false; + switch (Opc) { + case ARM::PICADD: + case ARM::PICLD: + case ARM::PICLDZH: + case ARM::PICLDZB: + case ARM::PICLDH: + case ARM::PICLDB: + case ARM::PICLDSH: + case ARM::PICLDSB: + case ARM::PICSTR: + case ARM::PICSTRH: + case ARM::PICSTRB: + case ARM::tPICADD: + break; + default: + O << "\t"; + break; + } + }} + + // Call the autogenerated instruction printer routines. + printInstruction(MI); +} + +bool ARMAsmPrinter::doInitialization(Module &M) { + // Emit initial debug information. + DW.BeginModule(&M); + + AsmPrinter::doInitialization(M); + + // Darwin wants symbols to be quoted if they have complex names. 
+ if (Subtarget->isTargetDarwin()) + Mang->setUseQuotes(true); + + return false; +} + +bool ARMAsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (!I->hasInitializer()) // External global require no code + continue; + + if (EmitSpecialLLVMGlobal(I)) { + if (Subtarget->isTargetDarwin() && + TM.getRelocationModel() == Reloc::Static) { + if (I->getName() == "llvm.global_ctors") + O << ".reference .constructors_used\n"; + else if (I->getName() == "llvm.global_dtors") + O << ".reference .destructors_used\n"; + } + continue; + } + + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + const Type *Type = C->getType(); + unsigned Size = TD->getTypeSize(Type); + unsigned Align = TD->getPreferredAlignmentLog(I); + + const char *VisibilityDirective = NULL; + if (I->hasHiddenVisibility()) + VisibilityDirective = TAI->getHiddenDirective(); + else if (I->hasProtectedVisibility()) + VisibilityDirective = TAI->getProtectedDirective(); + + if (VisibilityDirective) + O << VisibilityDirective << name << "\n"; + + if (Subtarget->isTargetELF()) + O << "\t.type " << name << ",%object\n"; + + if (C->isNullValue()) { + if (I->hasExternalLinkage()) { + if (const char *Directive = TAI->getZeroFillDirective()) { + O << "\t.globl\t" << name << "\n"; + O << Directive << "__DATA__, __common, " << name << ", " + << Size << ", " << Align << "\n"; + continue; + } + } + + if (!I->hasSection() && + (I->hasInternalLinkage() || I->hasWeakLinkage() || + I->hasLinkOnceLinkage())) { + if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. 
+ if (!NoZerosInBSS && TAI->getBSSSection()) + SwitchToDataSection(TAI->getBSSSection(), I); + else + SwitchToDataSection(TAI->getDataSection(), I); + if (TAI->getLCOMMDirective() != NULL) { + if (I->hasInternalLinkage()) { + O << TAI->getLCOMMDirective() << name << "," << Size; + if (Subtarget->isTargetDarwin()) + O << "," << Align; + } else + O << TAI->getCOMMDirective() << name << "," << Size; + } else { + if (I->hasInternalLinkage()) + O << "\t.local\t" << name << "\n"; + O << TAI->getCOMMDirective() << name << "," << Size; + if (TAI->getCOMMDirectiveTakesAlignment()) + O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align); + } + O << "\t\t" << TAI->getCommentString() << " " << I->getName() << "\n"; + continue; + } + } + + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + if (Subtarget->isTargetDarwin()) { + O << "\t.globl " << name << "\n" + << "\t.weak_definition " << name << "\n"; + SwitchToDataSection("\t.section __DATA,__const_coal,coalesced", I); + } else { + std::string SectionName("\t.section\t.llvm.linkonce.d." + + name + + ",\"aw\",%progbits"); + SwitchToDataSection(SectionName.c_str(), I); + O << "\t.weak " << name << "\n"; + } + break; + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. 
+ case GlobalValue::ExternalLinkage: + O << "\t.globl " << name << "\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: { + if (I->isConstant()) { + const ConstantArray *CVA = dyn_cast<ConstantArray>(C); + if (TAI->getCStringSection() && CVA && CVA->isCString()) { + SwitchToDataSection(TAI->getCStringSection(), I); + break; + } + } + // FIXME: special handling for ".ctors" & ".dtors" sections + if (I->hasSection() && + (I->getSection() == ".ctors" || + I->getSection() == ".dtors")) { + assert(!Subtarget->isTargetDarwin()); + std::string SectionName = ".section " + I->getSection(); + SectionName += ",\"aw\",%progbits"; + SwitchToDataSection(SectionName.c_str()); + } else { + if (C->isNullValue() && !NoZerosInBSS && TAI->getBSSSection()) + SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSBSSSection() : + TAI->getBSSSection(), I); + else if (!I->isConstant()) + SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSDataSection() : + TAI->getDataSection(), I); + else if (I->isThreadLocal()) + SwitchToDataSection(TAI->getTLSDataSection()); + else { + // Read-only data. 
+ bool HasReloc = C->ContainsRelocations(); + if (HasReloc && + Subtarget->isTargetDarwin() && + TM.getRelocationModel() != Reloc::Static) + SwitchToDataSection("\t.const_data\n"); + else if (!HasReloc && Size == 4 && + TAI->getFourByteConstantSection()) + SwitchToDataSection(TAI->getFourByteConstantSection(), I); + else if (!HasReloc && Size == 8 && + TAI->getEightByteConstantSection()) + SwitchToDataSection(TAI->getEightByteConstantSection(), I); + else if (!HasReloc && Size == 16 && + TAI->getSixteenByteConstantSection()) + SwitchToDataSection(TAI->getSixteenByteConstantSection(), I); + else if (TAI->getReadOnlySection()) + SwitchToDataSection(TAI->getReadOnlySection(), I); + else + SwitchToDataSection(TAI->getDataSection(), I); + } + } + + break; + } + default: + assert(0 && "Unknown linkage type!"); + break; + } + + EmitAlignment(Align, I); + O << name << ":\t\t\t\t" << TAI->getCommentString() << " " << I->getName() + << "\n"; + if (TAI->hasDotTypeDotSizeDirective()) + O << "\t.size " << name << ", " << Size << "\n"; + // If the initializer is a extern weak symbol, remember to emit the weak + // reference! 
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + + EmitGlobalConstant(C); + O << '\n'; + } + + if (Subtarget->isTargetDarwin()) { + SwitchToDataSection(""); + + // Output stubs for dynamically-linked functions + unsigned j = 1; + for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end(); + i != e; ++i, ++j) { + if (TM.getRelocationModel() == Reloc::PIC_) + SwitchToTextSection(".section __TEXT,__picsymbolstub4,symbol_stubs," + "none,16", 0); + else + SwitchToTextSection(".section __TEXT,__symbol_stub4,symbol_stubs," + "none,12", 0); + + EmitAlignment(2); + O << "\t.code\t32\n"; + + O << "L" << *i << "$stub:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\tldr ip, L" << *i << "$slp\n"; + if (TM.getRelocationModel() == Reloc::PIC_) { + O << "L" << *i << "$scv:\n"; + O << "\tadd ip, pc, ip\n"; + } + O << "\tldr pc, [ip, #0]\n"; + O << "L" << *i << "$slp:\n"; + if (TM.getRelocationModel() == Reloc::PIC_) + O << "\t.long\tL" << *i << "$lazy_ptr-(L" << *i << "$scv+8)\n"; + else + O << "\t.long\tL" << *i << "$lazy_ptr\n"; + SwitchToDataSection(".lazy_symbol_pointer", 0); + O << "L" << *i << "$lazy_ptr:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\t.long\tdyld_stub_binding_helper\n"; + } + O << "\n"; + + // Output non-lazy-pointers for external and common global variables. + if (GVNonLazyPtrs.begin() != GVNonLazyPtrs.end()) + SwitchToDataSection(".non_lazy_symbol_pointer", 0); + for (std::set<std::string>::iterator i = GVNonLazyPtrs.begin(), + e = GVNonLazyPtrs.end(); i != e; ++i) { + O << "L" << *i << "$non_lazy_ptr:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\t.long\t0\n"; + } + + // Emit initial debug information. + DW.EndModule(); + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). 
If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + O << "\t.subsections_via_symbols\n"; + } else { + // Emit final debug information for ELF. + DW.EndModule(); + } + + AsmPrinter::doFinalization(M); + return false; // success +} diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp new file mode 100644 index 0000000..ed1d287 --- /dev/null +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -0,0 +1,92 @@ +//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the Raul Herbster and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the pass that transforms the ARM machine instructions into +// relocatable machine code. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-emitter" +#include "ARMInstrInfo.h" +#include "ARMSubtarget.h" +#include "ARMTargetMachine.h" +#include "ARM.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/MachineCodeEmitter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Function.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Compiler.h" +using namespace llvm; + +STATISTIC(NumEmitted, "Number of machine instructions emitted"); + +namespace { + class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass { + const ARMInstrInfo *II; + const TargetData *TD; + TargetMachine &TM; + MachineCodeEmitter &MCE; + public: + static char ID; + explicit Emitter(TargetMachine &tm, MachineCodeEmitter &mce) + : MachineFunctionPass((intptr_t)&ID), II(0), TD(0), TM(tm), + MCE(mce) {} + Emitter(TargetMachine &tm, MachineCodeEmitter &mce, + const ARMInstrInfo &ii, const TargetData &td) + : MachineFunctionPass((intptr_t)&ID), II(&ii), TD(&td), TM(tm), + MCE(mce) {} + + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "ARM Machine Code Emitter"; + } + + void emitInstruction(const MachineInstr &MI); + + private: + + }; + char Emitter::ID = 0; +} + +/// createARMCodeEmitterPass - Return a pass that emits the collected ARM code +/// to the specified MCE object. 
+FunctionPass *llvm::createARMCodeEmitterPass(ARMTargetMachine &TM, + MachineCodeEmitter &MCE) { + return new Emitter(TM, MCE); +} + +bool Emitter::runOnMachineFunction(MachineFunction &MF) { + assert((MF.getTarget().getRelocationModel() != Reloc::Default || + MF.getTarget().getRelocationModel() != Reloc::Static) && + "JIT relocation model must be set to static or default!"); + II = ((ARMTargetMachine&)MF.getTarget()).getInstrInfo(); + TD = ((ARMTargetMachine&)MF.getTarget()).getTargetData(); + + do { + MCE.startFunction(MF); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + MBB != E; ++MBB) { + MCE.StartMachineBasicBlock(MBB); + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) + emitInstruction(*I); + } + } while (MCE.finishFunction(MF)); + + return false; +} + +void Emitter::emitInstruction(const MachineInstr &MI) { + NumEmitted++; // Keep track of the # of mi's emitted +} diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp new file mode 100644 index 0000000..1b93631 --- /dev/null +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -0,0 +1,1277 @@ +//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that splits the constant pool up into 'islands' +// which are scattered through-out the function. This is required due to the +// limited pc-relative displacements that ARM has. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-cp-islands" +#include "ARM.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMInstrInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumCPEs, "Number of constpool entries"); +STATISTIC(NumSplit, "Number of uncond branches inserted"); +STATISTIC(NumCBrFixed, "Number of cond branches fixed"); +STATISTIC(NumUBrFixed, "Number of uncond branches fixed"); + +namespace { + /// ARMConstantIslands - Due to limited PC-relative displacements, ARM + /// requires constant pool entries to be scattered among the instructions + /// inside a function. To do this, it completely ignores the normal LLVM + /// constant pool; instead, it places constants wherever it feels like with + /// special instructions. + /// + /// The terminology used in this pass includes: + /// Islands - Clumps of constants placed in the function. + /// Water - Potential places where an island could be formed. + /// CPE - A constant pool entry that has been placed somewhere, which + /// tracks a list of users. + class VISIBILITY_HIDDEN ARMConstantIslands : public MachineFunctionPass { + /// NextUID - Assign unique ID's to CPE's. + unsigned NextUID; + + /// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed + /// by MBB Number. The two-byte pads required for Thumb alignment are + /// counted as part of the following block (i.e., the offset and size for + /// a padded block will both be ==2 mod 4). + std::vector<unsigned> BBSizes; + + /// BBOffsets - the offset of each MBB in bytes, starting from 0. 
+  /// The two-byte pads required for Thumb alignment are counted as part of
+  /// the following block.
+  std::vector<unsigned> BBOffsets;
+
+  /// WaterList - A sorted list of basic blocks where islands could be placed
+  /// (i.e. blocks that don't fall through to the following block, due
+  /// to a return, unreachable, or unconditional branch).
+  std::vector<MachineBasicBlock*> WaterList;
+
+  /// CPUser - One user of a constant pool, keeping the machine instruction
+  /// pointer, the constant pool being referenced, and the max displacement
+  /// allowed from the instruction to the CP.
+  struct CPUser {
+    MachineInstr *MI;
+    MachineInstr *CPEMI;
+    unsigned MaxDisp;
+    CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp)
+      : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp) {}
+  };
+
+  /// CPUsers - Keep track of all of the machine instructions that use various
+  /// constant pools and their max displacement.
+  std::vector<CPUser> CPUsers;
+
+  /// CPEntry - One per constant pool entry, keeping the machine instruction
+  /// pointer, the constpool index, and the number of CPUser's which
+  /// reference this entry.
+  struct CPEntry {
+    MachineInstr *CPEMI;
+    unsigned CPI;
+    unsigned RefCount;
+    CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+      : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+  };
+
+  /// CPEntries - Keep track of all of the constant pool entry machine
+  /// instructions. For each original constpool index (i.e. those that
+  /// existed upon entry to this pass), it keeps a vector of entries.
+  /// Original elements are cloned as we go along; the clones are
+  /// put in the vector of the original element, but have distinct CPIs.
+  std::vector<std::vector<CPEntry> > CPEntries;
+
+  /// ImmBranch - One per immediate branch, keeping the machine instruction
+  /// pointer, conditional or unconditional, the max displacement,
+  /// and (if isCond is true) the corresponding unconditional branch
+  /// opcode.
+  struct ImmBranch {
+    MachineInstr *MI;
+    unsigned MaxDisp : 31;   // packed with isCond into one word
+    bool isCond : 1;
+    int UncondBr;
+    ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+      : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+  };
+
+  /// ImmBranches - Keep track of all the immediate branch instructions.
+  ///
+  std::vector<ImmBranch> ImmBranches;
+
+  /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+  ///
+  SmallVector<MachineInstr*, 4> PushPopMIs;
+
+  /// HasFarJump - True if any far jump instruction has been emitted during
+  /// the branch fix up pass.
+  bool HasFarJump;
+
+  // Cached per-function state, (re)initialized at the top of
+  // runOnMachineFunction.
+  const TargetInstrInfo *TII;
+  ARMFunctionInfo *AFI;
+  bool isThumb;
+public:
+  static char ID;
+  ARMConstantIslands() : MachineFunctionPass((intptr_t)&ID) {}
+
+  virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+  virtual const char *getPassName() const {
+    return "ARM constant island placement and branch shortening pass";
+  }
+
+private:
+  void DoInitialPlacement(MachineFunction &Fn,
+                          std::vector<MachineInstr*> &CPEMIs);
+  CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+  void InitialFunctionScan(MachineFunction &Fn,
+                           const std::vector<MachineInstr*> &CPEMIs);
+  MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI);
+  void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+  void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta);
+  bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
+  int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
+  bool LookForWater(CPUser&U, unsigned UserOffset,
+                    MachineBasicBlock** NewMBB);
+  MachineBasicBlock* AcceptWater(MachineBasicBlock *WaterBB,
+                          std::vector<MachineBasicBlock*>::iterator IP);
+  void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                      MachineBasicBlock** NewMBB);
+  bool HandleConstantPoolUser(MachineFunction &Fn, unsigned CPUserIndex);
+  void RemoveDeadCPEMI(MachineInstr *CPEMI);
+  bool RemoveUnusedCPEntries();
+  bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset, 
+                    MachineInstr *CPEMI, unsigned Disp,
+                    bool DoDump);
+  bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                      CPUser &U);
+  bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset,
+                       unsigned Disp, bool NegativeOK);
+  bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+  bool FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br);
+  bool FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br);
+  bool FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br);
+  bool UndoLRSpillRestore();
+
+  unsigned GetOffsetOf(MachineInstr *MI) const;
+  void dumpBBs();
+  void verify(MachineFunction &Fn);
+};
+char ARMConstantIslands::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void ARMConstantIslands::verify(MachineFunction &Fn) {
+  assert(BBOffsets.size() == BBSizes.size());
+  // Offsets must be exactly the running sum of sizes.
+  for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i)
+    assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]);
+  if (isThumb) {
+    for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+         MBBI != E; ++MBBI) {
+      MachineBasicBlock *MBB = MBBI;
+      // For a Thumb constpool island, offset and size must have the same
+      // parity mod 4 (the 2-byte pad, when present, is counted in the size).
+      if (!MBB->empty() &&
+          MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY)
+        assert((BBOffsets[MBB->getNumber()]%4 == 0 &&
+                BBSizes[MBB->getNumber()]%4 == 0) ||
+               (BBOffsets[MBB->getNumber()]%4 != 0 &&
+                BBSizes[MBB->getNumber()]%4 != 0));
+    }
+  }
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+  for (unsigned J = 0, E = BBOffsets.size(); J !=E; ++J) {
+    DOUT << "block " << J << " offset " << BBOffsets[J] <<
+                            " size " << BBSizes[J] << "\n";
+  }
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  return new ARMConstantIslands();
+}
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &Fn) {
+  MachineConstantPool &MCP = *Fn.getConstantPool();
+  
+  TII = Fn.getTarget().getInstrInfo();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  isThumb = AFI->isThumbFunction();
+
+  HasFarJump = false;
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  Fn.RenumberBlocks();
+
+  // Thumb functions containing constant pools get 2-byte alignment. This is so
+  // we can keep exact track of where the alignment padding goes. Set default.
+  AFI->setAlign(isThumb ? 1U : 2U);
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP.isEmpty()) {
+    DoInitialPlacement(Fn, CPEMIs);
+    if (isThumb)
+      AFI->setAlign(2U);
+  }
+  
+  // The next UID to take is the first unused one.
+  NextUID = CPEMIs.size();
+  
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  InitialFunctionScan(Fn, CPEMIs);
+  CPEMIs.clear();
+  
+  // Remove dead constant pool entries.
+  RemoveUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  bool MadeChange = false;
+  while (true) {
+    bool Change = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      Change |= HandleConstantPoolUser(Fn, i);
+    DEBUG(dumpBBs());
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      Change |= FixUpImmediateBr(Fn, ImmBranches[i]);
+    DEBUG(dumpBBs());
+    if (!Change)
+      break;
+    MadeChange = true;
+  }
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify(Fn);
+
+  // If LR has been forced spilled and no far jumps (i.e. BL) has been issued.
+  // Undo the spill / restore of LR if possible.
+  if (!HasFarJump && AFI->isLRSpilledForFarJump() && isThumb)
+    MadeChange |= UndoLRSpillRestore();
+
+  // Release per-function state.
+  BBSizes.clear();
+  BBOffsets.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  PushPopMIs.clear();
+
+  return MadeChange;
+}
+
+/// DoInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void ARMConstantIslands::DoInitialPlacement(MachineFunction &Fn,
+                                        std::vector<MachineInstr*> &CPEMIs){
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = new MachineBasicBlock();
+  Fn.getBasicBlockList().push_back(BB);
+  
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs =
+    Fn.getConstantPool()->getConstants();
+  
+  const TargetData &TD = *Fn.getTarget().getTargetData();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeSize(CPs[i].getType());
+    // Verify that all constant pool entries are a multiple of 4 bytes.  If not,
+    // we would have to pad them out or something so that instructions stay
+    // aligned.
+    assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!");
+    MachineInstr *CPEMI =
+      BuildMI(BB, TII->get(ARM::CONSTPOOL_ENTRY))
+                           .addImm(i).addConstantPoolIndex(i).addImm(Size);
+    CPEMIs.push_back(CPEMI);
+
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    std::vector<CPEntry> CPEs;
+    CPEs.push_back(CPEntry(CPEMI, i));
+    CPEntries.push_back(CPEs);
+    NumCPEs++;
+    DOUT << "Moved CPI#" << i << " to end of function as #" << i << "\n";
+  }
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  if (next(MBBI) == MBB->getParent()->end())  // Can't fall off end of function.
+    return false;
+  
+  // Fallthrough only if the layout successor is also a CFG successor.
+  MachineBasicBlock *NextBB = next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+  
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return NULL;
+}
+
+/// InitialFunctionScan - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void ARMConstantIslands::InitialFunctionScan(MachineFunction &Fn,
+                                 const std::vector<MachineInstr*> &CPEMIs) {
+  unsigned Offset = 0;
+  for (MachineFunction::iterator MBBI = Fn.begin(), E = Fn.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+    
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+    
+    unsigned MBBSize = 0;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      // Add instruction size to MBBSize.
+      MBBSize += ARM::GetInstSize(I);
+
+      int Opc = I->getOpcode();
+      if (TII->isBranch(Opc)) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        case ARM::tBR_JTr:
+          // A Thumb table jump may involve padding; for the offsets to
+          // be right, functions containing these must be 4-byte aligned.
+          AFI->setAlign(2U);
+          if ((Offset+MBBSize)%4 != 0)
+            MBBSize += 2;           // padding
+          continue;   // Does not get an entry in ImmBranches
+        default:
+          continue;  // Ignore other JT branches
+        case ARM::Bcc:
+          isCond = true;
+          UOpc = ARM::B;
+          // Fallthrough
+        case ARM::B:
+          Bits = 24;
+          Scale = 4;
+          break;
+        case ARM::tBcc:
+          isCond = true;
+          UOpc = ARM::tB;
+          Bits = 8;
+          Scale = 2;
+          break;
+        case ARM::tB:
+          Bits = 11;
+          Scale = 2;
+          break;
+        }
+
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
+        PushPopMIs.push_back(I);
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+        if (I->getOperand(op).isConstantPoolIndex()) {
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+          
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          unsigned TSFlags = I->getInstrDescriptor()->TSFlags;
+          switch (TSFlags & ARMII::AddrModeMask) {
+          default: 
+            // Constant pool entries can reach anything.
+            if (I->getOpcode() == ARM::CONSTPOOL_ENTRY)
+              continue;
+            if (I->getOpcode() == ARM::tLEApcrel) {
+              Bits = 8;  // Taking the address of a CP entry.
+              break;
+            }
+            assert(0 && "Unknown addressing mode for CP reference!");
+          case ARMII::AddrMode1: // AM1: 8 bits << 2
+            Bits = 8;
+            Scale = 4;  // Taking the address of a CP entry.
+            break;
+          case ARMII::AddrMode2:
+            Bits = 12;  // +-offset_12
+            break;
+          case ARMII::AddrMode3:
+            Bits = 8;   // +-offset_8
+            break;
+            // addrmode4 has no immediate offset.
+          case ARMII::AddrMode5:
+            Bits = 8;
+            Scale = 4;  // +-(offset_8*4)
+            break;
+          case ARMII::AddrModeT1:
+            Bits = 5;  // +offset_5
+            break;
+          case ARMII::AddrModeT2:
+            Bits = 5;
+            Scale = 2;  // +(offset_5*2)
+            break;
+          case ARMII::AddrModeT4:
+            Bits = 5;
+            Scale = 4;  // +(offset_5*4)
+            break;
+          case ARMII::AddrModeTs:
+            Bits = 8;
+            Scale = 4;  // +(offset_8*4)
+            break;
+          }
+
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getConstantPoolIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+        }
+    }
+
+    // In thumb mode, if this block is a constpool island, we may need padding
+    // so it's aligned on 4 byte boundary.
+    if (isThumb &&
+        !MBB.empty() &&
+        MBB.begin()->getOpcode() == ARM::CONSTPOOL_ENTRY &&
+        (Offset%4) != 0)
+      MBBSize += 2;
+
+    BBSizes.push_back(MBBSize);
+    BBOffsets.push_back(Offset);
+    Offset += MBBSize;
+  }
+}
+
+/// GetOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBOffsets[MBB->getNumber()];
+
+  // If we're looking for a CONSTPOOL_ENTRY in Thumb, see if this block has
+  // alignment padding, and compensate if so.
+  if (isThumb && 
+      MI->getOpcode() == ARM::CONSTPOOL_ENTRY &&
+      Offset%4 != 0)
+    Offset += 2;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    if (&*I == MI) return Offset;
+    Offset += ARM::GetInstSize(I);
+  }
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// UpdateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+  
+  // Insert a size into BBSizes to align it properly with the (newly
+  // renumbered) block numbers.
+  BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
+
+  // Likewise for BBOffsets.
+  BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);
+  
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having 
+  // available water after it.
+  std::vector<MachineBasicBlock*>::iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update datastructures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB = new MachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  OrigBB->getParent()->getBasicBlockList().insert(MBBI, NewBB);
+  
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+  
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  BuildMI(OrigBB, TII->get(isThumb ? ARM::tB : ARM::B)).addMBB(NewBB);
+  NumSplit++;
+  
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  while (!OrigBB->succ_empty()) {
+    MachineBasicBlock *Succ = *OrigBB->succ_begin();
+    OrigBB->removeSuccessor(Succ);
+    NewBB->addSuccessor(Succ);
+    
+    // This pass should be run after register allocation, so there should be no
+    // PHI nodes to update.
+    assert((Succ->empty() || Succ->begin()->getOpcode() != TargetInstrInfo::PHI)
+           && "PHI nodes should be eliminated by now!");
+  }
+  
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+  
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as UpdateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+  
+  // Insert a size into BBSizes to align it properly with the (newly
+  // renumbered) block numbers.
+  BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
+  
+  // Likewise for BBOffsets.
+  BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having 
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  std::vector<MachineBasicBlock*>::iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+
+  // Figure out how large the first NewMBB is.  (It cannot
+  // contain a constpool_entry or tablejump.)
+  unsigned NewBBSize = 0;
+  for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end();
+       I != E; ++I)
+    NewBBSize += ARM::GetInstSize(I);
+  
+  unsigned OrigBBI = OrigBB->getNumber();
+  unsigned NewBBI = NewBB->getNumber();
+  // Set the size of NewBB in BBSizes.
+  BBSizes[NewBBI] = NewBBSize;
+  
+  // We removed instructions from UserMBB, subtract that off from its size.
+  // Add 2 or 4 to the block to count the unconditional branch we added to it.
+  unsigned delta = isThumb ? 2 : 4;
+  BBSizes[OrigBBI] -= NewBBSize - delta;
+
+  // ...and adjust BBOffsets for NewBB accordingly.
+  BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI];
+
+  // All BBOffsets following these blocks must be modified.
+  AdjustBBOffsetsAfter(NewBB, delta);
+
+  return NewBB;
+}
+
+/// OffsetIsInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, 
+                      unsigned TrialOffset, unsigned MaxDisp, bool NegativeOK) {
+  // On Thumb offsets==2 mod 4 are rounded down by the hardware for 
+  // purposes of the displacement computation; compensate for that here.  
+  // Effectively, the valid range of displacements is 2 bytes smaller for such
+  // references.
+  if (isThumb && UserOffset%4 !=0)
+    UserOffset -= 2;
+  // CPEs will be rounded up to a multiple of 4.
+  if (isThumb && TrialOffset%4 != 0)
+    TrialOffset += 2;
+
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset-UserOffset <= MaxDisp)
+      return true;
+  } else if (NegativeOK) {
+    if (UserOffset-TrialOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// WaterIsInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+
+bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset,
+                         MachineBasicBlock* Water, CPUser &U)
+{
+  unsigned MaxDisp = U.MaxDisp;
+  // The CPE would go at the end of the Water block.
+  unsigned CPEOffset = BBOffsets[Water->getNumber()] + 
+                       BBSizes[Water->getNumber()];
+
+  // If the CPE is to be inserted before the instruction, that will raise
+  // the offset of the instruction.  (Currently applies only to ARM, so
+  // no alignment compensation attempted here.)
+  if (CPEOffset < UserOffset)
+    UserOffset += U.CPEMI->getOperand(2).getImm();
+
+  return OffsetIsInRange (UserOffset, CPEOffset, MaxDisp, !isThumb);
+}
+
+/// CPEIsInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
+                                      MachineInstr *CPEMI,
+                                      unsigned MaxDisp, bool DoDump) {
+  unsigned CPEOffset  = GetOffsetOf(CPEMI);
+  assert(CPEOffset%4 == 0 && "Misaligned CPE");
+
+  if (DoDump) {
+    DOUT << "User of CPE#" << CPEMI->getOperand(0).getImm()
+         << " max delta=" << MaxDisp
+         << " insn address=" << UserOffset
+         << " CPE address=" << CPEOffset
+         << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI;
+  }
+
+  return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, !isThumb);
+}
+
+/// BBIsJumpedOver - Return true if the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  // Only unconditional B / tB count; any other terminator is not a
+  // jump-around.
+  if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+
+/// AdjustBBOffsetsAfter - Add delta to the offset of every block following BB.
+/// In Thumb mode also fix up the 2-byte alignment padding of constpool
+/// islands and Thumb jump tables whose offset parity changes; such fixups
+/// mutate delta as they go.  delta can be negative.
+void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB, 
+                                              int delta) {
+  MachineFunction::iterator MBBI = BB; MBBI = next(MBBI);
+  for(unsigned i=BB->getNumber()+1; i<BB->getParent()->getNumBlockIDs(); i++) {
+    BBOffsets[i] += delta;
+    // If some existing blocks have padding, adjust the padding as needed, a
+    // bit tricky.  delta can be negative so don't use % on that.
+    // Note: MBBI is only consulted (and only advanced) on the Thumb path;
+    // for ARM the loop body is just the BBOffsets update above.
+    if (isThumb) {
+      MachineBasicBlock *MBB = MBBI;
+      if (!MBB->empty()) {
+        // Constant pool entries require padding.
+        if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+          unsigned oldOffset = BBOffsets[i] - delta;
+          if (oldOffset%4==0 && BBOffsets[i]%4!=0) {
+            // add new padding
+            BBSizes[i] += 2;
+            delta += 2;
+          } else if (oldOffset%4!=0 && BBOffsets[i]%4==0) {
+            // remove existing padding
+            BBSizes[i] -=2;
+            delta -= 2;
+          }
+        }
+        // Thumb jump tables require padding.  They should be at the end;
+        // following unconditional branches are removed by AnalyzeBranch.
+        MachineInstr *ThumbJTMI = NULL;
+        if (prior(MBB->end())->getOpcode() == ARM::tBR_JTr)
+          ThumbJTMI = prior(MBB->end());
+        if (ThumbJTMI) {
+          unsigned newMIOffset = GetOffsetOf(ThumbJTMI);
+          unsigned oldMIOffset = newMIOffset - delta;
+          if (oldMIOffset%4 == 0 && newMIOffset%4 != 0) {
+            // remove existing padding
+            BBSizes[i] -= 2;
+            delta -= 2;
+          } else if (oldMIOffset%4 != 0 && newMIOffset%4 == 0) {
+            // add new padding
+            BBSizes[i] += 2;
+            delta += 2;
+          }
+        }
+        // If the padding fixups cancelled the delta entirely, no later
+        // offsets change; stop early.
+        if (delta==0)
+          return;
+      }
+      MBBI = next(MBBI);
+    }
+  }
+}
+
+/// DecrementOldEntry - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed 
+/// the entry, false if we didn't.
+
+bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    RemoveDeadCPEMI(CPEMI);
+    CPE->CPEMI = NULL;
+    NumCPEs--;
+    return true;
+  }
+  return false;
+}
+
+/// LookForExistingCPEntry - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, true)) {
+    DOUT << "In range\n";
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getConstantPoolIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == NULL)
+      continue;
+    if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, false)) {
+      DOUT << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n";
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isConstantPoolIndex()) {
+          UserMI->getOperand(j).setConstantPoolIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return DecrementOldEntry(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  // tB: signed 11-bit offset scaled by 2; B: signed 24-bit offset scaled by 4.
+  return (Opc == ARM::tB) ? ((1<<10)-1)*2 : ((1<<23)-1)*4;
+}
+
+/// AcceptWater - Small amount of common code factored out of the following.
+
+MachineBasicBlock* ARMConstantIslands::AcceptWater(MachineBasicBlock *WaterBB,
+                          std::vector<MachineBasicBlock*>::iterator IP) {
+  DOUT << "found water in range\n";
+  // Remove the original WaterList entry; we want subsequent
+  // insertions in this vicinity to go after the one we're
+  // about to insert.  This considerably reduces the number
+  // of times we have to move the same CPE more than once.
+  WaterList.erase(IP);
+  // CPE goes before following block (NewMBB).
+  return next(MachineFunction::iterator(WaterBB));
+}
+
+/// LookForWater - look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, *NewMBB
+/// is set to the WaterList entry.
+/// For ARM, we prefer the water that's farthest away.  For Thumb, prefer
+/// water that will not introduce padding to water that will; within each
+/// group, prefer the water that's farthest away.
+
+bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
+                                      MachineBasicBlock** NewMBB) {
+  std::vector<MachineBasicBlock*>::iterator IPThatWouldPad;
+  MachineBasicBlock* WaterBBThatWouldPad = NULL;
+  if (!WaterList.empty()) {
+    // Walk backwards so the first in-range hit is the farthest water.
+    for (std::vector<MachineBasicBlock*>::iterator IP = prior(WaterList.end()),
+           B = WaterList.begin();; --IP) {
+      MachineBasicBlock* WaterBB = *IP;
+      if (WaterIsInRange(UserOffset, WaterBB, U)) {
+        if (isThumb &&
+            (BBOffsets[WaterBB->getNumber()] +
+             BBSizes[WaterBB->getNumber()])%4 != 0) {
+          // This is valid Water, but would introduce padding.  Remember
+          // it in case we don't find any Water that doesn't do this.
+          if (!WaterBBThatWouldPad) {
+            WaterBBThatWouldPad = WaterBB;
+            IPThatWouldPad = IP;
+          }
+        } else {
+          *NewMBB = AcceptWater(WaterBB, IP);
+          return true;
+        }
+      }
+      if (IP == B)
+        break;
+    }
+  }
+  // Fall back to padding-introducing water if that's all we found.
+  if (isThumb && WaterBBThatWouldPad) {
+    *NewMBB = AcceptWater(WaterBBThatWouldPad, IPThatWouldPad);
+    return true;
+  }
+  return false;
+}
+
+/// CreateNewWater - No existing WaterList entry will work for 
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case *NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+ +void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, + unsigned UserOffset, MachineBasicBlock** NewMBB) { + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + MachineBasicBlock *UserMBB = UserMI->getParent(); + unsigned OffsetOfNextBlock = BBOffsets[UserMBB->getNumber()] + + BBSizes[UserMBB->getNumber()]; + assert(OffsetOfNextBlock== BBOffsets[UserMBB->getNumber()+1]); + + // If the use is at the end of the block, or the end of the block + // is within range, make new water there. (The addition below is + // for the unconditional branch we will be adding: 4 bytes on ARM, + // 2 on Thumb. Possible Thumb alignment padding is allowed for + // inside OffsetIsInRange. + // If the block ends in an unconditional branch already, it is water, + // and is known to be out of range, so we'll always be adding a branch.) + if (&UserMBB->back() == UserMI || + OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb ? 2: 4), + U.MaxDisp, !isThumb)) { + DOUT << "Split at end of block\n"; + if (&UserMBB->back() == UserMI) + assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!"); + *NewMBB = next(MachineFunction::iterator(UserMBB)); + // Add an unconditional branch from UserMBB to fallthrough block. + // Record it for branch lengthening; this new branch will not get out of + // range, but if the preceding conditional branch is out of range, the + // targets will be exchanged, and the altered branch may be out of + // range, so the machinery has to know about it. + int UncondBr = isThumb ? ARM::tB : ARM::B; + BuildMI(UserMBB, TII->get(UncondBr)).addMBB(*NewMBB); + unsigned MaxDisp = getUnconditionalBrDisp(UncondBr); + ImmBranches.push_back(ImmBranch(&UserMBB->back(), + MaxDisp, false, UncondBr)); + int delta = isThumb ? 2 : 4; + BBSizes[UserMBB->getNumber()] += delta; + AdjustBBOffsetsAfter(UserMBB, delta); + } else { + // What a big block. Find a place within the block to split it. 
+ // This is a little tricky on Thumb since instructions are 2 bytes + // and constant pool entries are 4 bytes: if instruction I references + // island CPE, and instruction I+1 references CPE', it will + // not work well to put CPE as far forward as possible, since then + // CPE' cannot immediately follow it (that location is 2 bytes + // farther away from I+1 than CPE was from I) and we'd need to create + // a new island. So, we make a first guess, then walk through the + // instructions between the one currently being looked at and the + // possible insertion point, and make sure any other instructions + // that reference CPEs will be able to use the same island area; + // if not, we back up the insertion point. + + // The 4 in the following is for the unconditional branch we'll be + // inserting (allows for long branch on Thumb). Alignment of the + // island is handled inside OffsetIsInRange. + unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4; + // This could point off the end of the block if we've already got + // constant pool entries following this block; only the last one is + // in the water list. Back past any possible branches (allow for a + // conditional and a maximally long unconditional). + if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1]) + BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] - + (isThumb ? 6 : 8); + unsigned EndInsertOffset = BaseInsertOffset + + CPEMI->getOperand(2).getImm(); + MachineBasicBlock::iterator MI = UserMI; + ++MI; + unsigned CPUIndex = CPUserIndex+1; + for (unsigned Offset = UserOffset+ARM::GetInstSize(UserMI); + Offset < BaseInsertOffset; + Offset += ARM::GetInstSize(MI), + MI = next(MI)) { + if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) { + if (!OffsetIsInRange(Offset, EndInsertOffset, + CPUsers[CPUIndex].MaxDisp, !isThumb)) { + BaseInsertOffset -= (isThumb ? 2 : 4); + EndInsertOffset -= (isThumb ? 
2 : 4); + } + // This is overly conservative, as we don't account for CPEMIs + // being reused within the block, but it doesn't matter much. + EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm(); + CPUIndex++; + } + } + DOUT << "Split in middle of big block\n"; + *NewMBB = SplitBlockBeforeInstr(prior(MI)); + } +} + +/// HandleConstantPoolUser - Analyze the specified user, checking to see if it +/// is out-of-range. If so, pick it up the constant pool value and move it some +/// place in-range. Return true if we changed any addresses (thus must run +/// another pass of branch lengthening), false otherwise. +bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &Fn, + unsigned CPUserIndex){ + CPUser &U = CPUsers[CPUserIndex]; + MachineInstr *UserMI = U.MI; + MachineInstr *CPEMI = U.CPEMI; + unsigned CPI = CPEMI->getOperand(1).getConstantPoolIndex(); + unsigned Size = CPEMI->getOperand(2).getImm(); + MachineBasicBlock *NewMBB; + // Compute this only once, it's expensive. The 4 or 8 is the value the + // hardware keeps in the PC (2 insns ahead of the reference). + unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8); + + // Special case: tLEApcrel are two instructions MI's. The actual user is the + // second instruction. + if (UserMI->getOpcode() == ARM::tLEApcrel) + UserOffset += 2; + + // See if the current entry is within range, or there is a clone of it + // in range. + int result = LookForExistingCPEntry(U, UserOffset); + if (result==1) return false; + else if (result==2) return true; + + // No existing clone of this CPE is within range. + // We will be generating a new clone. Get a UID for it. + unsigned ID = NextUID++; + + // Look for water where we can place this CPE. We look for the farthest one + // away that will work. Forward references only for now (although later + // we might find some that are backwards). + + if (!LookForWater(U, UserOffset, &NewMBB)) { + // No water found. 
+ DOUT << "No water found\n"; + CreateNewWater(CPUserIndex, UserOffset, &NewMBB); + } + + // Okay, we know we can put an island before NewMBB now, do it! + MachineBasicBlock *NewIsland = new MachineBasicBlock(); + Fn.getBasicBlockList().insert(NewMBB, NewIsland); + + // Update internal data structures to account for the newly inserted MBB. + UpdateForInsertedWaterBlock(NewIsland); + + // Decrement the old entry, and remove it if refcount becomes 0. + DecrementOldEntry(CPI, CPEMI); + + // Now that we have an island to add the CPE to, clone the original CPE and + // add it to the island. + U.CPEMI = BuildMI(NewIsland, TII->get(ARM::CONSTPOOL_ENTRY)) + .addImm(ID).addConstantPoolIndex(CPI).addImm(Size); + CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1)); + NumCPEs++; + + BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()]; + // Compensate for .align 2 in thumb mode. + if (isThumb && BBOffsets[NewIsland->getNumber()]%4 != 0) + Size += 2; + // Increase the size of the island block to account for the new entry. + BBSizes[NewIsland->getNumber()] += Size; + AdjustBBOffsetsAfter(NewIsland, Size); + + // Finally, change the CPI in the instruction operand to be ID. + for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) + if (UserMI->getOperand(i).isConstantPoolIndex()) { + UserMI->getOperand(i).setConstantPoolIndex(ID); + break; + } + + DOUT << " Moved CPE to #" << ID << " CPI=" << CPI << "\t" << *UserMI; + + return true; +} + +/// RemoveDeadCPEMI - Remove a dead constant pool entry instruction. Update +/// sizes and offsets of impacted basic blocks. +void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { + MachineBasicBlock *CPEBB = CPEMI->getParent(); + unsigned Size = CPEMI->getOperand(2).getImm(); + CPEMI->eraseFromParent(); + BBSizes[CPEBB->getNumber()] -= Size; + // All succeeding offsets have the current size value added in, fix this. 
+ if (CPEBB->empty()) { + // In thumb mode, the size of island may be padded by two to compensate for + // the alignment requirement. Then it will now be 2 when the block is + // empty, so fix this. + // All succeeding offsets have the current size value added in, fix this. + if (BBSizes[CPEBB->getNumber()] != 0) { + Size += BBSizes[CPEBB->getNumber()]; + BBSizes[CPEBB->getNumber()] = 0; + } + } + AdjustBBOffsetsAfter(CPEBB, -Size); + // An island has only one predecessor BB and one successor BB. Check if + // this BB's predecessor jumps directly to this BB's successor. This + // shouldn't happen currently. + assert(!BBIsJumpedOver(CPEBB) && "How did this happen?"); + // FIXME: remove the empty blocks after all the work is done? +} + +/// RemoveUnusedCPEntries - Remove constant pool entries whose refcounts +/// are zero. +bool ARMConstantIslands::RemoveUnusedCPEntries() { + unsigned MadeChange = false; + for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { + std::vector<CPEntry> &CPEs = CPEntries[i]; + for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { + if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { + RemoveDeadCPEMI(CPEs[j].CPEMI); + CPEs[j].CPEMI = NULL; + MadeChange = true; + } + } + } + return MadeChange; +} + +/// BBIsInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, + unsigned MaxDisp) { + unsigned PCAdj = isThumb ? 4 : 8; + unsigned BrOffset = GetOffsetOf(MI) + PCAdj; + unsigned DestOffset = BBOffsets[DestBB->getNumber()]; + + DOUT << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxDisp + << " from " << GetOffsetOf(MI) << " to " << DestOffset + << " offset " << int(DestOffset-BrOffset) << "\t" << *MI; + + if (BrOffset <= DestOffset) { + // Branch before the Dest. 
+    // Forward branch: distance is dest - (pc at branch).
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    // Backward branch.
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &Fn, ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMachineBasicBlock();
+
+  // Check to see if the DestBB is already in-range.
+  if (BBIsInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  // Dispatch on branch kind; each helper returns true if it changed anything.
+  if (!Br.isCond)
+    return FixUpUnconditionalBr(Fn, Br);
+  return FixUpConditionalBr(Fn, Br);
+}
+
+/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to reach the destination.
+bool
+ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &Fn, ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  // Only Thumb unconditional branches are fixed up this way.
+  assert(isThumb && "Expected a Thumb function!");
+
+  // Use BL to implement far jump: (1 << 21) * 2 == +/-4MB reach.
+  Br.MaxDisp = (1 << 21) * 2;
+  MI->setInstrDescriptor(TII->get(ARM::tBfar));
+  // tBfar is 2 bytes larger than the tB it replaces; fix up block sizes and
+  // all following offsets.
+  BBSizes[MBB->getNumber()] += 2;
+  AdjustBBOffsetsAfter(MBB, 2);
+  // Remember that LR is clobbered, so the epilogue must not restore LR to pc.
+  HasFarJump = true;
+  NumUBrFixed++;
+
+  DOUT << "  Changed B to long jump " << *MI;
+
+  return true;
+}
+
+/// FixUpConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool +ARMConstantIslands::FixUpConditionalBr(MachineFunction &Fn, ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = MI->getOperand(0).getMachineBasicBlock(); + + // Add a unconditional branch to the destination and invert the branch + // condition to jump over it: + // blt L1 + // => + // bge L2 + // b L1 + // L2: + ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImmedValue(); + CC = ARMCC::getOppositeCondition(CC); + unsigned CCReg = MI->getOperand(2).getReg(); + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + NumCBrFixed++; + if (BMI != MI) { + if (next(MachineBasicBlock::iterator(MI)) == MBB->back() && + BMI->getOpcode() == Br.UncondBr) { + // Last MI in the BB is a unconditional branch. Can we simply invert the + // condition and swap destinations: + // beq L1 + // b L2 + // => + // bne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMachineBasicBlock(); + if (BBIsInRange(MI, NewDest, Br.MaxDisp)) { + DOUT << " Invert Bcc condition and swap its destination with " << *BMI; + BMI->getOperand(0).setMachineBasicBlock(DestBB); + MI->getOperand(0).setMachineBasicBlock(NewDest); + MI->getOperand(1).setImm(CC); + return true; + } + } + } + + if (NeedSplit) { + SplitBlockBeforeInstr(MI); + // No need for the branch to the next block. We're adding a unconditional + // branch to the destination. 
+ int delta = ARM::GetInstSize(&MBB->back()); + BBSizes[MBB->getNumber()] -= delta; + MachineBasicBlock* SplitBB = next(MachineFunction::iterator(MBB)); + AdjustBBOffsetsAfter(SplitBB, -delta); + MBB->back().eraseFromParent(); + // BBOffsets[SplitBB] is wrong temporarily, fixed below + } + MachineBasicBlock *NextBB = next(MachineFunction::iterator(MBB)); + + DOUT << " Insert B to BB#" << DestBB->getNumber() + << " also invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"; + + // Insert a new conditional branch and a new unconditional branch. + // Also update the ImmBranch as well as adding a new entry for the new branch. + BuildMI(MBB, TII->get(MI->getOpcode())).addMBB(NextBB) + .addImm(CC).addReg(CCReg); + Br.MI = &MBB->back(); + BBSizes[MBB->getNumber()] += ARM::GetInstSize(&MBB->back()); + BuildMI(MBB, TII->get(Br.UncondBr)).addMBB(DestBB); + BBSizes[MBB->getNumber()] += ARM::GetInstSize(&MBB->back()); + unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr); + ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr)); + + // Remove the old conditional branch. It may or may not still be in MBB. + BBSizes[MI->getParent()->getNumber()] -= ARM::GetInstSize(MI); + MI->eraseFromParent(); + + // The net size change is an addition of one unconditional branch. + int delta = ARM::GetInstSize(&MBB->back()); + AdjustBBOffsetsAfter(MBB, delta); + return true; +} + +/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spills +/// LR / restores LR to pc. 
+bool ARMConstantIslands::UndoLRSpillRestore() { + bool MadeChange = false; + for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { + MachineInstr *MI = PushPopMIs[i]; + if (MI->getOpcode() == ARM::tPOP_RET && + MI->getOperand(0).getReg() == ARM::PC && + MI->getNumExplicitOperands() == 1) { + BuildMI(MI->getParent(), TII->get(ARM::tBX_RET)); + MI->eraseFromParent(); + MadeChange = true; + } + } + return MadeChange; +} diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp new file mode 100644 index 0000000..30a8eaf --- /dev/null +++ b/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -0,0 +1,90 @@ +//===- ARMConstantPoolValue.cpp - ARM constantpool value --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Evan Cheng and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific constantpool value class. 
+// +//===----------------------------------------------------------------------===// + +#include "ARMConstantPoolValue.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/GlobalValue.h" +#include "llvm/Type.h" +using namespace llvm; + +ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, unsigned id, + ARMCP::ARMCPKind k, + unsigned char PCAdj, + const char *Modif, + bool AddCA) + : MachineConstantPoolValue((const Type*)gv->getType()), + GV(gv), S(NULL), LabelId(id), Kind(k), PCAdjust(PCAdj), + Modifier(Modif), AddCurrentAddress(AddCA) {} + +ARMConstantPoolValue::ARMConstantPoolValue(const char *s, unsigned id, + ARMCP::ARMCPKind k, + unsigned char PCAdj, + const char *Modif, + bool AddCA) + : MachineConstantPoolValue((const Type*)Type::Int32Ty), + GV(NULL), S(s), LabelId(id), Kind(k), PCAdjust(PCAdj), + Modifier(Modif), AddCurrentAddress(AddCA) {} + +ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, + ARMCP::ARMCPKind k, + const char *Modif) + : MachineConstantPoolValue((const Type*)Type::Int32Ty), + GV(gv), S(NULL), LabelId(0), Kind(k), PCAdjust(0), + Modifier(Modif) {} + +int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, + unsigned Alignment) { + unsigned AlignMask = (1 << Alignment)-1; + const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants(); + for (unsigned i = 0, e = Constants.size(); i != e; ++i) { + if (Constants[i].isMachineConstantPoolEntry() && + (Constants[i].Offset & AlignMask) == 0) { + ARMConstantPoolValue *CPV = + (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal; + if (CPV->GV == GV && + CPV->S == S && + CPV->LabelId == LabelId && + CPV->Kind == Kind && + CPV->PCAdjust == PCAdjust) + return i; + } + } + + return -1; +} + +void +ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) { + ID.AddPointer(GV); + ID.AddPointer(S); + ID.AddInteger(LabelId); + ID.AddInteger((unsigned)Kind); + ID.AddInteger(PCAdjust); +} + +void ARMConstantPoolValue::print(std::ostream &O) const 
{
+  // Print the symbol (GV takes precedence over S; exactly one is set).
+  if (GV)
+    O << GV->getName();
+  else
+    O << S;
+  // Darwin-style suffixes for indirect references.
+  if (isNonLazyPointer()) O << "$non_lazy_ptr";
+  else if (isStub()) O << "$stub";
+  if (Modifier) O << "(" << Modifier << ")";
+  // For pc-relative entries, subtract the pic label (plus the hardware PC
+  // adjustment), e.g. "sym-(LPIC1+8)".
+  if (PCAdjust != 0) {
+    O << "-(LPIC" << LabelId << "+"
+      << (unsigned)PCAdjust;
+    if (AddCurrentAddress)
+      O << "-.";
+    O << ")";
+  }
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 0000000..d71bcf0
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,75 @@
+//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+
+namespace llvm {
+
+namespace ARMCP {
+  // Discriminates what flavor of reference a constant pool entry holds.
+  enum ARMCPKind {
+    CPValue,        // Plain global/symbol reference.
+    CPNonLazyPtr,   // Indirect ($non_lazy_ptr) reference.
+    CPStub          // Stub ($stub) reference.
+  };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent PC relative displacement between the address of the load
+/// instruction and the global value being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+  GlobalValue *GV;         // GlobalValue being loaded.
+  const char *S;           // ExtSymbol being loaded.
+  unsigned LabelId;        // Label id of the load.
+  ARMCP::ARMCPKind Kind;   // non_lazy_ptr or stub?
+  unsigned char PCAdjust;  // Extra adjustment if constantpool is pc relative.
+                           // 8 for ARM, 4 for Thumb.
+  const char *Modifier;    // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  bool AddCurrentAddress;  // If true, print() emits "-." after the pic label.
+
+public:
+  // Pc-relative GlobalValue reference (entry type is the GV's own type).
+  ARMConstantPoolValue(GlobalValue *gv, unsigned id,
+                       ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       bool AddCurrentAddress = false);
+  // Pc-relative external symbol reference (entry type is i32).
+  ARMConstantPoolValue(const char *s, unsigned id,
+                       ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       bool AddCurrentAddress = false);
+  // Non-pc-relative GlobalValue reference.
+  ARMConstantPoolValue(GlobalValue *GV, ARMCP::ARMCPKind Kind,
+                       const char *Modifier);
+
+
+  // Accessors; exactly one of GV / S is non-null.
+  GlobalValue *getGV() const { return GV; }
+  const char *getSymbol() const { return S; }
+  const char *getModifier() const { return Modifier; }
+  bool hasModifier() const { return Modifier != NULL; }
+  bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+  unsigned getLabelId() const { return LabelId; }
+  bool isNonLazyPointer() const { return Kind == ARMCP::CPNonLazyPtr; }
+  bool isStub() const { return Kind == ARMCP::CPStub; }
+  unsigned char getPCAdjustment() const { return PCAdjust; }
+
+  // MachineConstantPoolValue overrides.
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  virtual void print(std::ostream &O) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMFrameInfo.h b/lib/Target/ARM/ARMFrameInfo.h
new file mode 100644
index 0000000..c56640a
--- /dev/null
+++ b/lib/Target/ARM/ARMFrameInfo.h
@@ -0,0 +1,33 @@
+//===-- ARMTargetFrameInfo.h - Define TargetFrameInfo for ARM ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef ARM_FRAMEINFO_H +#define ARM_FRAMEINFO_H + +#include "ARM.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "ARMSubtarget.h" + +namespace llvm { + +class ARMFrameInfo : public TargetFrameInfo { +public: + ARMFrameInfo(const ARMSubtarget &ST) + : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0) { + } +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp new file mode 100644 index 0000000..933651d --- /dev/null +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -0,0 +1,859 @@ +//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the ARM target. 
+// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMISelLowering.h" +#include "ARMTargetMachine.h" +#include "ARMAddressingModes.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SSARegMap.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" +using namespace llvm; + +//===--------------------------------------------------------------------===// +/// ARMDAGToDAGISel - ARM specific code to select ARM machine +/// instructions for SelectionDAG operations. +/// +namespace { +class ARMDAGToDAGISel : public SelectionDAGISel { + ARMTargetLowering Lowering; + + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const ARMSubtarget *Subtarget; + +public: + ARMDAGToDAGISel(ARMTargetMachine &TM) + : SelectionDAGISel(Lowering), Lowering(TM), + Subtarget(&TM.getSubtarget<ARMSubtarget>()) { + } + + virtual const char *getPassName() const { + return "ARM Instruction Selection"; + } + + SDNode *Select(SDOperand Op); + virtual void InstructionSelectBasicBlock(SelectionDAG &DAG); + bool SelectAddrMode2(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Offset, SDOperand &Opc); + bool SelectAddrMode2Offset(SDOperand Op, SDOperand N, + SDOperand &Offset, SDOperand &Opc); + bool SelectAddrMode3(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Offset, SDOperand &Opc); + bool SelectAddrMode3Offset(SDOperand Op, SDOperand N, + SDOperand &Offset, SDOperand &Opc); + bool SelectAddrMode5(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Offset); + + bool SelectAddrModePC(SDOperand Op, SDOperand N, SDOperand &Offset, + SDOperand &Label); + + bool SelectThumbAddrModeRR(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Offset); + bool SelectThumbAddrModeRI5(SDOperand Op, SDOperand N, unsigned Scale, + SDOperand &Base, SDOperand &OffImm, + SDOperand &Offset); + bool SelectThumbAddrModeS1(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &OffImm, SDOperand &Offset); + bool SelectThumbAddrModeS2(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &OffImm, SDOperand &Offset); + bool SelectThumbAddrModeS4(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &OffImm, SDOperand &Offset); + bool SelectThumbAddrModeSP(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &OffImm); + + bool SelectShifterOperandReg(SDOperand Op, SDOperand N, SDOperand &A, + SDOperand &B, SDOperand &C); + + // Include the pieces autogenerated from the target description. 
+#include "ARMGenDAGISel.inc" +}; +} + +void ARMDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) { + DEBUG(BB->dump()); + + DAG.setRoot(SelectRoot(DAG.getRoot())); + DAG.RemoveDeadNodes(); + + ScheduleAndEmitDAG(DAG); +} + +bool ARMDAGToDAGISel::SelectAddrMode2(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &Offset, + SDOperand &Opc) { + if (N.getOpcode() == ISD::MUL) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + // X * [3,5,9] -> X + X * [2,4,8] etc. + int RHSC = (int)RHS->getValue(); + if (RHSC & 1) { + RHSC = RHSC & ~1; + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + if (isPowerOf2_32(RHSC)) { + unsigned ShAmt = Log2_32(RHSC); + Base = Offset = N.getOperand(0); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, + ARM_AM::lsl), + MVT::i32); + return true; + } + } + } + } + + if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } else if (N.getOpcode() == ARMISD::Wrapper) { + Base = N.getOperand(0); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + MVT::i32); + return true; + } + + // Match simple R +/- imm12 operands. + if (N.getOpcode() == ISD::ADD) + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getValue(); + if ((RHSC >= 0 && RHSC < 0x1000) || + (RHSC < 0 && RHSC > -0x1000)) { // 12 bits. 
+ Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC, + ARM_AM::no_shift), + MVT::i32); + return true; + } + } + + // Otherwise this is R +/- [possibly shifted] R + ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1)); + unsigned ShAmt = 0; + + Base = N.getOperand(0); + Offset = N.getOperand(1); + + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) { + ShAmt = Sh->getValue(); + Offset = N.getOperand(1).getOperand(0); + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + // Try matching (R shl C) + (R). + if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) { + ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0)); + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't + // fold it. + if (ConstantSDNode *Sh = + dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) { + ShAmt = Sh->getValue(); + Offset = N.getOperand(0).getOperand(0); + Base = N.getOperand(1); + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDOperand Op, SDOperand N, + SDOperand &Offset, SDOperand &Opc) { + unsigned Opcode = Op.getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? 
cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + int Val = (int)C->getValue(); + if (Val >= 0 && Val < 0x1000) { // 12 bits. + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val, + ARM_AM::no_shift), + MVT::i32); + return true; + } + } + + Offset = N; + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); + unsigned ShAmt = 0; + if (ShOpcVal != ARM_AM::no_shift) { + // Check to see if the RHS of the shift is a constant, if not, we can't fold + // it. + if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShAmt = Sh->getValue(); + Offset = N.getOperand(0); + } else { + ShOpcVal = ARM_AM::no_shift; + } + } + + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), + MVT::i32); + return true; +} + + +bool ARMDAGToDAGISel::SelectAddrMode3(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &Offset, + SDOperand &Opc) { + if (N.getOpcode() == ISD::SUB) { + // X - C is canonicalize to X + -C, no need to handle it here. + Base = N.getOperand(0); + Offset = N.getOperand(1); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32); + return true; + } + + if (N.getOpcode() != ISD::ADD) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32); + return true; + } + + // If the RHS is +/- imm8, fold into addr mode. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getValue(); + if ((RHSC >= 0 && RHSC < 256) || + (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed. 
+ Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + Offset = CurDAG->getRegister(0, MVT::i32); + + ARM_AM::AddrOpc AddSub = ARM_AM::add; + if (RHSC < 0) { + AddSub = ARM_AM::sub; + RHSC = - RHSC; + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32); + return true; + } + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDOperand Op, SDOperand N, + SDOperand &Offset, SDOperand &Opc) { + unsigned Opcode = Op.getOpcode(); + ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) + ? cast<LoadSDNode>(Op)->getAddressingMode() + : cast<StoreSDNode>(Op)->getAddressingMode(); + ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC) + ? ARM_AM::add : ARM_AM::sub; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + int Val = (int)C->getValue(); + if (Val >= 0 && Val < 256) { + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32); + return true; + } + } + + Offset = N; + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32); + return true; +} + + +bool ARMDAGToDAGISel::SelectAddrMode5(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &Offset) { + if (N.getOpcode() != ISD::ADD) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } else if (N.getOpcode() == ARMISD::Wrapper) { + Base = N.getOperand(0); + } + Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), + MVT::i32); + return true; + } + + // If the RHS is +/- imm8, fold into addr mode. 
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getValue();
+    if ((RHSC & 3) == 0) {  // The constant is implicitly multiplied by 4.
+      RHSC >>= 2;
+      if ((RHSC >= 0 && RHSC < 256) ||
+          (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        }
+
+        // Encode the sign separately; the immediate itself is kept positive.
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                           MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  // Fallback: whole address in Base, zero offset.
+  Base = N;
+  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                     MVT::i32);
+  return true;
+}
+
+/// SelectAddrModePC - Match a pc-relative address formed by ARMISD::PIC_ADD;
+/// returns the base operand and the pic-label id as a target constant.
+bool ARMDAGToDAGISel::SelectAddrModePC(SDOperand Op, SDOperand N,
+                                       SDOperand &Offset, SDOperand &Label) {
+  // Only match single-use PIC_ADD nodes so the match can be folded away.
+  if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+    Offset = N.getOperand(0);
+    SDOperand N1 = N.getOperand(1);
+    Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getValue(),
+                                      MVT::i32);
+    return true;
+  }
+  return false;
+}
+
+/// SelectThumbAddrModeRR - Match a Thumb register+register address.
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDOperand Op, SDOperand N,
+                                            SDOperand &Base, SDOperand &Offset){
+  if (N.getOpcode() != ISD::ADD) {
+    Base = N;
+    // We must materialize a zero in a reg! Returning a constant here won't
+    // work since its node is -1 so it won't get added to the selection queue.
+    // Explicitly issue a tMOVi8 node!
+ Offset = SDOperand(CurDAG->getTargetNode(ARM::tMOVi8, MVT::i32, + CurDAG->getTargetConstant(0, MVT::i32)), 0); + return true; + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + return true; +} + +bool +ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDOperand Op, SDOperand N, + unsigned Scale, SDOperand &Base, + SDOperand &OffImm, SDOperand &Offset) { + if (Scale == 4) { + SDOperand TmpBase, TmpOffImm; + if (SelectThumbAddrModeSP(Op, N, TmpBase, TmpOffImm)) + return false; // We want to select tLDRspi / tSTRspi instead. + if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() == ISD::TargetConstantPool) + return false; // We want to select tLDRpci instead. + } + + if (N.getOpcode() != ISD::ADD) { + Base = (N.getOpcode() == ARMISD::Wrapper) ? N.getOperand(0) : N; + Offset = CurDAG->getRegister(0, MVT::i32); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + // Thumb does not have [sp, r] address mode. + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); + if ((LHSR && LHSR->getReg() == ARM::SP) || + (RHSR && RHSR->getReg() == ARM::SP)) { + Base = N; + Offset = CurDAG->getRegister(0, MVT::i32); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + // If the RHS is + imm5 * scale, fold into addr mode. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getValue(); + if ((RHSC & (Scale-1)) == 0) { // The constant is implicitly multiplied. 
+ RHSC /= Scale; + if (RHSC >= 0 && RHSC < 32) { + Base = N.getOperand(0); + Offset = CurDAG->getRegister(0, MVT::i32); + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + } + + Base = N.getOperand(0); + Offset = N.getOperand(1); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeS1(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &OffImm, + SDOperand &Offset) { + return SelectThumbAddrModeRI5(Op, N, 1, Base, OffImm, Offset); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeS2(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &OffImm, + SDOperand &Offset) { + return SelectThumbAddrModeRI5(Op, N, 2, Base, OffImm, Offset); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeS4(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &OffImm, + SDOperand &Offset) { + return SelectThumbAddrModeRI5(Op, N, 4, Base, OffImm, Offset); +} + +bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &OffImm) { + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + if (N.getOpcode() != ISD::ADD) + return false; + + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); + if (N.getOperand(0).getOpcode() == ISD::FrameIndex || + (LHSR && LHSR->getReg() == ARM::SP)) { + // If the RHS is + imm8 * scale, fold into addr mode. + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int RHSC = (int)RHS->getValue(); + if ((RHSC & 3) == 0) { // The constant is implicitly multiplied. 
+ RHSC >>= 2; + if (RHSC >= 0 && RHSC < 256) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32); + return true; + } + } + } + } + + return false; +} + +bool ARMDAGToDAGISel::SelectShifterOperandReg(SDOperand Op, + SDOperand N, + SDOperand &BaseReg, + SDOperand &ShReg, + SDOperand &Opc) { + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N); + + // Don't match base register only case. That is matched to a separate + // lower complexity pattern with explicit register operand. + if (ShOpcVal == ARM_AM::no_shift) return false; + + BaseReg = N.getOperand(0); + unsigned ShImmVal = 0; + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ShReg = CurDAG->getRegister(0, MVT::i32); + ShImmVal = RHS->getValue() & 31; + } else { + ShReg = N.getOperand(1); + } + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), + MVT::i32); + return true; +} + +/// getAL - Returns a ARMCC::AL immediate node. +static inline SDOperand getAL(SelectionDAG *CurDAG) { + return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, MVT::i32); +} + + +SDNode *ARMDAGToDAGISel::Select(SDOperand Op) { + SDNode *N = Op.Val; + unsigned Opcode = N->getOpcode(); + + if (Opcode >= ISD::BUILTIN_OP_END && Opcode < ARMISD::FIRST_NUMBER) + return NULL; // Already selected. + + switch (N->getOpcode()) { + default: break; + case ISD::Constant: { + unsigned Val = cast<ConstantSDNode>(N)->getValue(); + bool UseCP = true; + if (Subtarget->isThumb()) + UseCP = (Val > 255 && // MOV + ~Val > 255 && // MOV + MVN + !ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL + else + UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV + ARM_AM::getSOImmVal(~Val) == -1 && // MVN + !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs. 
+ if (UseCP) { + SDOperand CPIdx = + CurDAG->getTargetConstantPool(ConstantInt::get(Type::Int32Ty, Val), + TLI.getPointerTy()); + + SDNode *ResNode; + if (Subtarget->isThumb()) + ResNode = CurDAG->getTargetNode(ARM::tLDRcp, MVT::i32, MVT::Other, + CPIdx, CurDAG->getEntryNode()); + else { + SDOperand Ops[] = { + CPIdx, + CurDAG->getRegister(0, MVT::i32), + CurDAG->getTargetConstant(0, MVT::i32), + getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; + ResNode=CurDAG->getTargetNode(ARM::LDRcp, MVT::i32, MVT::Other, Ops, 6); + } + ReplaceUses(Op, SDOperand(ResNode, 0)); + return NULL; + } + + // Other cases are autogenerated. + break; + } + case ISD::FrameIndex: { + // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm. + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDOperand TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + if (Subtarget->isThumb()) + return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI, + CurDAG->getTargetConstant(0, MVT::i32)); + else { + SDOperand Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->SelectNodeTo(N, ARM::ADDri, MVT::i32, Ops, 5); + } + } + case ISD::ADD: { + // Select add sp, c to tADDhirr. 
+ SDOperand N0 = Op.getOperand(0); + SDOperand N1 = Op.getOperand(1); + RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(Op.getOperand(0)); + RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(Op.getOperand(1)); + if (LHSR && LHSR->getReg() == ARM::SP) { + std::swap(N0, N1); + std::swap(LHSR, RHSR); + } + if (RHSR && RHSR->getReg() == ARM::SP) { + AddToISelQueue(N0); + AddToISelQueue(N1); + return CurDAG->SelectNodeTo(N, ARM::tADDhirr, Op.getValueType(), N0, N1); + } + break; + } + case ISD::MUL: + if (Subtarget->isThumb()) + break; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + unsigned RHSV = C->getValue(); + if (!RHSV) break; + if (isPowerOf2_32(RHSV-1)) { // 2^n+1? + SDOperand V = Op.getOperand(0); + AddToISelQueue(V); + unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV-1)); + SDOperand Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32), + CurDAG->getTargetConstant(ShImm, MVT::i32), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7); + } + if (isPowerOf2_32(RHSV+1)) { // 2^n-1? 
+ SDOperand V = Op.getOperand(0); + AddToISelQueue(V); + unsigned ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, Log2_32(RHSV+1)); + SDOperand Ops[] = { V, V, CurDAG->getRegister(0, MVT::i32), + CurDAG->getTargetConstant(ShImm, MVT::i32), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7); + } + } + break; + case ARMISD::FMRRD: + AddToISelQueue(Op.getOperand(0)); + return CurDAG->getTargetNode(ARM::FMRRD, MVT::i32, MVT::i32, + Op.getOperand(0), getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32)); + case ARMISD::MULHILOU: { + AddToISelQueue(Op.getOperand(0)); + AddToISelQueue(Op.getOperand(1)); + SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getTargetNode(ARM::UMULL, MVT::i32, MVT::i32, Ops, 5); + } + case ARMISD::MULHILOS: { + AddToISelQueue(Op.getOperand(0)); + AddToISelQueue(Op.getOperand(1)); + SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1), + getAL(CurDAG), CurDAG->getRegister(0, MVT::i32), + CurDAG->getRegister(0, MVT::i32) }; + return CurDAG->getTargetNode(ARM::SMULL, MVT::i32, MVT::i32, Ops, 5); + } + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(Op); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + MVT::ValueType LoadedVT = LD->getLoadedVT(); + if (AM != ISD::UNINDEXED) { + SDOperand Offset, AMOpc; + bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + unsigned Opcode = 0; + bool Match = false; + if (LoadedVT == MVT::i32 && + SelectAddrMode2Offset(Op, LD->getOffset(), Offset, AMOpc)) { + Opcode = isPre ? ARM::LDR_PRE : ARM::LDR_POST; + Match = true; + } else if (LoadedVT == MVT::i16 && + SelectAddrMode3Offset(Op, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = (LD->getExtensionType() == ISD::SEXTLOAD) + ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST) + : (isPre ? 
ARM::LDRH_PRE : ARM::LDRH_POST); + } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) { + if (LD->getExtensionType() == ISD::SEXTLOAD) { + if (SelectAddrMode3Offset(Op, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST; + } + } else { + if (SelectAddrMode2Offset(Op, LD->getOffset(), Offset, AMOpc)) { + Match = true; + Opcode = isPre ? ARM::LDRB_PRE : ARM::LDRB_POST; + } + } + } + + if (Match) { + SDOperand Chain = LD->getChain(); + SDOperand Base = LD->getBasePtr(); + AddToISelQueue(Chain); + AddToISelQueue(Base); + AddToISelQueue(Offset); + SDOperand Ops[]= { Base, Offset, AMOpc, getAL(CurDAG), + CurDAG->getRegister(0, MVT::i32), Chain }; + return CurDAG->getTargetNode(Opcode, MVT::i32, MVT::i32, + MVT::Other, Ops, 6); + } + } + // Other cases are autogenerated. + break; + } + case ARMISD::BRCOND: { + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) + // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + + unsigned Opc = Subtarget->isThumb() ? 
ARM::tBcc : ARM::Bcc; + SDOperand Chain = Op.getOperand(0); + SDOperand N1 = Op.getOperand(1); + SDOperand N2 = Op.getOperand(2); + SDOperand N3 = Op.getOperand(3); + SDOperand InFlag = Op.getOperand(4); + assert(N1.getOpcode() == ISD::BasicBlock); + assert(N2.getOpcode() == ISD::Constant); + assert(N3.getOpcode() == ISD::Register); + + AddToISelQueue(Chain); + AddToISelQueue(N1); + AddToISelQueue(InFlag); + SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getValue()), MVT::i32); + SDOperand Ops[] = { N1, Tmp2, N3, Chain, InFlag }; + SDNode *ResNode = CurDAG->getTargetNode(Opc, MVT::Other, MVT::Flag, Ops, 5); + Chain = SDOperand(ResNode, 0); + InFlag = SDOperand(ResNode, 1); + ReplaceUses(SDOperand(Op.Val, 1), InFlag); + ReplaceUses(SDOperand(Op.Val, 0), SDOperand(Chain.Val, Chain.ResNo)); + return NULL; + } + case ARMISD::CMOV: { + bool isThumb = Subtarget->isThumb(); + MVT::ValueType VT = Op.getValueType(); + SDOperand N0 = Op.getOperand(0); + SDOperand N1 = Op.getOperand(1); + SDOperand N2 = Op.getOperand(2); + SDOperand N3 = Op.getOperand(3); + SDOperand InFlag = Op.getOperand(4); + assert(N2.getOpcode() == ISD::Constant); + assert(N3.getOpcode() == ISD::Register); + + // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc) + // Pattern complexity = 18 cost = 1 size = 0 + SDOperand CPTmp0; + SDOperand CPTmp1; + SDOperand CPTmp2; + if (!isThumb && VT == MVT::i32 && + SelectShifterOperandReg(Op, N1, CPTmp0, CPTmp1, CPTmp2)) { + AddToISelQueue(N0); + AddToISelQueue(CPTmp0); + AddToISelQueue(CPTmp1); + AddToISelQueue(CPTmp2); + AddToISelQueue(InFlag); + SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getValue()), MVT::i32); + SDOperand Ops[] = { N0, CPTmp0, CPTmp1, CPTmp2, Tmp2, N3, InFlag }; + return CurDAG->SelectNodeTo(Op.Val, ARM::MOVCCs, MVT::i32, Ops, 7); + } + + // Pattern: (ARMcmov:i32 
GPR:i32:$false, + // (imm:i32)<<P:Predicate_so_imm>><<X:so_imm_XFORM>>:$true, + // (imm:i32):$cc) + // Emits: (MOVCCi:i32 GPR:i32:$false, + // (so_imm_XFORM:i32 (imm:i32):$true), (imm:i32):$cc) + // Pattern complexity = 10 cost = 1 size = 0 + if (VT == MVT::i32 && + N3.getOpcode() == ISD::Constant && + Predicate_so_imm(N3.Val)) { + AddToISelQueue(N0); + AddToISelQueue(InFlag); + SDOperand Tmp1 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N1)->getValue()), MVT::i32); + Tmp1 = Transform_so_imm_XFORM(Tmp1.Val); + SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getValue()), MVT::i32); + SDOperand Ops[] = { N0, Tmp1, Tmp2, N3, InFlag }; + return CurDAG->SelectNodeTo(Op.Val, ARM::MOVCCi, MVT::i32, Ops, 5); + } + + // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Pattern complexity = 6 cost = 1 size = 0 + // + // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc) + // Pattern complexity = 6 cost = 11 size = 0 + // + // Also FCPYScc and FCPYDcc. + AddToISelQueue(N0); + AddToISelQueue(N1); + AddToISelQueue(InFlag); + SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getValue()), MVT::i32); + SDOperand Ops[] = { N0, N1, Tmp2, N3, InFlag }; + unsigned Opc = 0; + switch (VT) { + default: assert(false && "Illegal conditional move type!"); + break; + case MVT::i32: + Opc = isThumb ? 
ARM::tMOVCCr : ARM::MOVCCr; + break; + case MVT::f32: + Opc = ARM::FCPYScc; + break; + case MVT::f64: + Opc = ARM::FCPYDcc; + break; + } + return CurDAG->SelectNodeTo(Op.Val, Opc, VT, Ops, 5); + } + case ARMISD::CNEG: { + MVT::ValueType VT = Op.getValueType(); + SDOperand N0 = Op.getOperand(0); + SDOperand N1 = Op.getOperand(1); + SDOperand N2 = Op.getOperand(2); + SDOperand N3 = Op.getOperand(3); + SDOperand InFlag = Op.getOperand(4); + assert(N2.getOpcode() == ISD::Constant); + assert(N3.getOpcode() == ISD::Register); + + AddToISelQueue(N0); + AddToISelQueue(N1); + AddToISelQueue(InFlag); + SDOperand Tmp2 = CurDAG->getTargetConstant(((unsigned) + cast<ConstantSDNode>(N2)->getValue()), MVT::i32); + SDOperand Ops[] = { N0, N1, Tmp2, N3, InFlag }; + unsigned Opc = 0; + switch (VT) { + default: assert(false && "Illegal conditional move type!"); + break; + case MVT::f32: + Opc = ARM::FNEGScc; + break; + case MVT::f64: + Opc = ARM::FNEGDcc; + break; + } + return CurDAG->SelectNodeTo(Op.Val, Opc, VT, Ops, 5); + } + } + return SelectCode(Op); +} + +/// createARMISelDag - This pass converts a legalized DAG into a +/// ARM-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createARMISelDag(ARMTargetMachine &TM) { + return new ARMDAGToDAGISel(TM); +} diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp new file mode 100644 index 0000000..6f63fbd --- /dev/null +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -0,0 +1,1859 @@ +//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Evan Cheng and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that ARM uses to lower LLVM code into a +// selection DAG. 
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMConstantPoolValue.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/Instruction.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

/// ARMTargetLowering - Configure operation legality, libcall names and
/// register classes for the ARM target.  Subtarget features (Thumb, VFP2,
/// ARMv5T/v6) determine which operations are Legal, Custom or Expand.
ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    : TargetLowering(TM), ARMPCLabelIndex(0) {
  Subtarget = &TM.getSubtarget<ARMSubtarget>();

  if (Subtarget->isTargetDarwin()) {
    // Don't have these.
    setLibcallName(RTLIB::UINTTOFP_I64_F32, NULL);
    setLibcallName(RTLIB::UINTTOFP_I64_F64, NULL);

    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
      // Single-precision floating-point arithmetic.
      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");

      // Double-precision floating-point arithmetic.
      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");

      // Single-precision comparisons.  The *vfp routines return nonzero when
      // the predicate holds, hence the SETNE result comparisons below.
      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");

      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);

      // Double-precision comparisons.
      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");

      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);

      // Floating-point to integer conversions.
      // i64 conversions are done via library routines even when generating VFP
      // instructions, so use the same ones.
      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");

      // Conversions between floating types.
      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");

      // Integer to floating-point conversions.
      // i64 conversions are done via library routines even when generating VFP
      // instructions, so use the same ones.
      // FIXME: There appears to be some naming inconsistency in ARM libgcc: e.g.
      // __floatunsidf vs. __floatunssidfvfp.
      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
    }
  }

  addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb()) {
    addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
    addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
  }
  computeRegisterProperties();

  // ARM does not have f32 extending load.
  setLoadXAction(ISD::EXTLOAD, MVT::f32, Expand);

  // ARM supports all 4 flavors of integer indexed load / store.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im,  MVT::i1,  Legal);
    setIndexedLoadAction(im,  MVT::i8,  Legal);
    setIndexedLoadAction(im,  MVT::i16, Legal);
    setIndexedLoadAction(im,  MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i1,  Legal);
    setIndexedStoreAction(im, MVT::i8,  Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
  }

  // i64 operation support.
  if (Subtarget->isThumb()) {
    setOperationAction(ISD::MUL,   MVT::i64, Expand);
    setOperationAction(ISD::MULHU, MVT::i32, Expand);
    setOperationAction(ISD::MULHS, MVT::i32, Expand);
  } else {
    setOperationAction(ISD::MUL,   MVT::i64, Custom);
    setOperationAction(ISD::MULHU, MVT::i32, Custom);
    if (!Subtarget->hasV6Ops())
      setOperationAction(ISD::MULHS, MVT::i32, Custom);
  }
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb())
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  // These are expanded into libcalls.
  setOperationAction(ISD::SDIV,  MVT::i32, Expand);
  setOperationAction(ISD::UDIV,  MVT::i32, Expand);
  setOperationAction(ISD::SREM,  MVT::i32, Expand);
  setOperationAction(ISD::UREM,  MVT::i32, Expand);

  // Support label based line numbers.
  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);

  setOperationAction(ISD::RET,           MVT::Other, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);

  // Expand mem operations genericly.
  setOperationAction(ISD::MEMSET , MVT::Other, Expand);
  setOperationAction(ISD::MEMCPY , MVT::Other, Custom);
  setOperationAction(ISD::MEMMOVE , MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VASTART , MVT::Other, Expand);
  setOperationAction(ISD::VAARG , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY , MVT::Other, Expand);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Expand);

  if (!Subtarget->hasV6Ops()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  }
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb())
    // Turn f64->i64 into FMRRD iff target supports vfp2.
    setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);

  setOperationAction(ISD::SETCC , MVT::i32, Expand);
  setOperationAction(ISD::SETCC , MVT::f32, Expand);
  setOperationAction(ISD::SETCC , MVT::f64, Expand);
  setOperationAction(ISD::SELECT , MVT::i32, Expand);
  setOperationAction(ISD::SELECT , MVT::f32, Expand);
  setOperationAction(ISD::SELECT , MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  setOperationAction(ISD::BRCOND , MVT::Other, Expand);
  setOperationAction(ISD::BR_CC , MVT::i32, Custom);
  setOperationAction(ISD::BR_CC , MVT::f32, Custom);
  setOperationAction(ISD::BR_CC , MVT::f64, Custom);
  setOperationAction(ISD::BR_JT , MVT::Other, Custom);

  // NOTE(review): VASTART was set to Expand above; this later Custom setting
  // is the one that takes effect (last write wins) -- confirm intended.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  // FP Constants can't be immediates.
  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);

  // We don't support sin/cos/fmod/copysign
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

  // int <-> fp are custom expanded into bit_convert + ARMISD ops.
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

  setStackPointerRegisterToSaveRestore(ARM::SP);
  setSchedulingPreference(SchedulingForRegPressure);
  setIfCvtBlockSizeLimit(Subtarget->isThumb() ? 0 : 10);
  setIfCvtDupBlockSizeLimit(Subtarget->isThumb() ? 0 : 2);

  maxStoresPerMemcpy = 1;   //// temporary - rewrite interface to use type
}


/// getTargetNodeName - Return the human-readable name of an ARM-specific
/// DAG node opcode, or 0 for opcodes not defined by this target.
const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return 0;
  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
  case ARMISD::CALL:          return "ARMISD::CALL";
  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
  case ARMISD::tCALL:         return "ARMISD::tCALL";
  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
  case ARMISD::CMP:           return "ARMISD::CMP";
  case ARMISD::CMPNZ:         return "ARMISD::CMPNZ";
  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
  case ARMISD::CMOV:          return "ARMISD::CMOV";
  case ARMISD::CNEG:          return "ARMISD::CNEG";

  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
  case ARMISD::SITOF:         return "ARMISD::SITOF";
  case ARMISD::UITOF:         return "ARMISD::UITOF";
  case ARMISD::MULHILOU:      return "ARMISD::MULHILOU";
  case ARMISD::MULHILOS:      return "ARMISD::MULHILOS";

  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
  case ARMISD::RRX:           return "ARMISD::RRX";

  case ARMISD::FMRRD:         return "ARMISD::FMRRD";
  case ARMISD::FMDRR:         return "ARMISD::FMDRR";

  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
  }
}

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//


/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
  switch (CC) {
  default: assert(0 && "Unknown condition code!");
  case ISD::SETNE:  return ARMCC::NE;
  case ISD::SETEQ:  return ARMCC::EQ;
  case ISD::SETGT:  return ARMCC::GT;
  case ISD::SETGE:  return ARMCC::GE;
  case ISD::SETLT:  return ARMCC::LT;
  case ISD::SETLE:  return ARMCC::LE;
  case ISD::SETUGT: return ARMCC::HI;
  case ISD::SETUGE: return ARMCC::HS;
  case ISD::SETULT: return ARMCC::LO;
  case ISD::SETULE: return ARMCC::LS;
  }
}

/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. It
/// returns true if the operands should be inverted to form the proper
/// comparison.
static bool FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                        ARMCC::CondCodes &CondCode2) {
  bool Invert = false;
  // CondCode2 stays AL unless the predicate needs two conditional checks
  // (e.g. SETONE, SETUEQ).
  CondCode2 = ARMCC::AL;
  switch (CC) {
  default: assert(0 && "Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
  case ISD::SETGT:
  case ISD::SETOGT: CondCode = ARMCC::GT; break;
  case ISD::SETGE:
  case ISD::SETOGE: CondCode = ARMCC::GE; break;
  case ISD::SETOLT: CondCode = ARMCC::MI; break;
  case ISD::SETOLE: CondCode = ARMCC::GT; Invert = true; break;
  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
  case ISD::SETO:   CondCode = ARMCC::VC; break;
  case ISD::SETUO:  CondCode = ARMCC::VS; break;
  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
  case ISD::SETUGT: CondCode = ARMCC::HI; break;
  case ISD::SETUGE: CondCode = ARMCC::PL; break;
  case ISD::SETLT:
  case ISD::SETULT: CondCode = ARMCC::LT; break;
  case ISD::SETLE:
  case ISD::SETULE: CondCode = ARMCC::LE; break;
  case ISD::SETNE:
  case ISD::SETUNE: CondCode = ARMCC::NE; break;
  }
  return Invert;
}

/// HowToPassArgument - Given an argument type and the GPR/stack space
/// already consumed, compute how many GPRs (r0-r3) and stack bytes this
/// argument needs, plus any GPR/stack padding required for alignment.
static void
HowToPassArgument(MVT::ValueType ObjectVT, unsigned NumGPRs,
                  unsigned StackOffset, unsigned &NeededGPRs,
                  unsigned &NeededStackSize, unsigned &GPRPad,
                  unsigned &StackPad, unsigned Flags) {
  NeededStackSize = 0;
  NeededGPRs = 0;
  StackPad = 0;
  GPRPad = 0;
  // The argument's original alignment is packed into the flags word.
  unsigned align = (Flags >> ISD::ParamFlags::OrigAlignmentOffs);
  GPRPad = NumGPRs % ((align + 3)/4);
  StackPad = StackOffset % align;
  unsigned firstGPR = NumGPRs + GPRPad;
  switch (ObjectVT) {
  default: assert(0 && "Unhandled argument type!");
  case MVT::i32:
  case MVT::f32:
    if (firstGPR < 4)
      NeededGPRs = 1;
    else
      NeededStackSize = 4;
    break;
  case MVT::i64:
  case MVT::f64:
    // 64-bit values may be split: two GPRs, one GPR + 4 stack bytes, or
    // 8 stack bytes, depending on how many GPRs remain.
    if (firstGPR < 3)
      NeededGPRs = 2;
    else if (firstGPR == 3) {
      NeededGPRs = 1;
      NeededStackSize = 4;
    } else
      NeededStackSize = 8;
  }
}

/// LowerCALL - Lowering a ISD::CALL node into a callseq_start <-
/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
/// nodes.
SDOperand ARMTargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType RetVT= Op.Val->getValueType(0);
  SDOperand Chain    = Op.getOperand(0);
  unsigned CallConv  = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
  assert((CallConv == CallingConv::C ||
          CallConv == CallingConv::Fast) && "unknown calling convention");
  SDOperand Callee   = Op.getOperand(4);
  unsigned NumOps    = (Op.getNumOperands() - 5) / 2;
  unsigned ArgOffset = 0;   // Frame mechanisms handle retaddr slot
  unsigned NumGPRs = 0;     // GPRs used for parameter passing.

  // Count how many bytes are to be pushed on the stack.
  unsigned NumBytes = 0;

  // Add up all the space actually used.
  for (unsigned i = 0; i < NumOps; ++i) {
    unsigned ObjSize;
    unsigned ObjGPRs;
    unsigned StackPad;
    unsigned GPRPad;
    MVT::ValueType ObjectVT = Op.getOperand(5+2*i).getValueType();
    unsigned Flags = Op.getConstantOperandVal(5+2*i+1);
    HowToPassArgument(ObjectVT, NumGPRs, NumBytes, ObjGPRs, ObjSize,
                      GPRPad, StackPad, Flags);
    NumBytes += ObjSize + StackPad;
    NumGPRs += ObjGPRs + GPRPad;
  }

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  Chain = DAG.getCALLSEQ_START(Chain,
                               DAG.getConstant(NumBytes, MVT::i32));

  SDOperand StackPtr = DAG.getRegister(ARM::SP, MVT::i32);

  static const unsigned GPRArgRegs[] = {
    ARM::R0, ARM::R1, ARM::R2, ARM::R3
  };

  // Second pass: actually place each argument into registers or stack
  // slots, mirroring the accounting done above.
  NumGPRs = 0;
  std::vector<std::pair<unsigned, SDOperand> > RegsToPass;
  std::vector<SDOperand> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDOperand Arg = Op.getOperand(5+2*i);
    unsigned Flags = Op.getConstantOperandVal(5+2*i+1);
    MVT::ValueType ArgVT = Arg.getValueType();

    unsigned ObjSize;
    unsigned ObjGPRs;
    unsigned GPRPad;
    unsigned StackPad;
    HowToPassArgument(ArgVT, NumGPRs, ArgOffset, ObjGPRs,
                      ObjSize, GPRPad, StackPad, Flags);
    NumGPRs += GPRPad;
    ArgOffset += StackPad;
    if (ObjGPRs > 0) {
      switch (ArgVT) {
      default: assert(0 && "Unexpected ValueType for argument!");
      case MVT::i32:
        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs], Arg));
        break;
      case MVT::f32:
        // f32 travels in a GPR as its raw bits.
        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs],
                             DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Arg)));
        break;
      case MVT::i64: {
        SDOperand Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Arg,
                                   DAG.getConstant(0, getPointerTy()));
        SDOperand Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Arg,
                                   DAG.getConstant(1, getPointerTy()));
        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs], Lo));
        if (ObjGPRs == 2)
          RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs+1], Hi));
        else {
          // Split case: low half in the last GPR, high half on the stack.
          SDOperand PtrOff= DAG.getConstant(ArgOffset, StackPtr.getValueType());
          PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
          MemOpChains.push_back(DAG.getStore(Chain, Hi, PtrOff, NULL, 0));
        }
        break;
      }
      case MVT::f64: {
        // Move the double into two i32s (FMRRD), then place like i64.
        SDOperand Cvt = DAG.getNode(ARMISD::FMRRD,
                                    DAG.getVTList(MVT::i32, MVT::i32),
                                    &Arg, 1);
        RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs], Cvt));
        if (ObjGPRs == 2)
          RegsToPass.push_back(std::make_pair(GPRArgRegs[NumGPRs+1],
                                              Cvt.getValue(1)));
        else {
          SDOperand PtrOff= DAG.getConstant(ArgOffset, StackPtr.getValueType());
          PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
          MemOpChains.push_back(DAG.getStore(Chain, Cvt.getValue(1), PtrOff,
                                             NULL, 0));
        }
        break;
      }
      }
    } else {
      assert(ObjSize != 0);
      SDOperand PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
      PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
    }

    NumGPRs += ObjGPRs;
    ArgOffset += ObjSize;
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDOperand InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  bool isDirect = false;
  bool isARMFunc = false;
  bool isLocalARMFunc = false;
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    GlobalValue *GV = G->getGlobal();
    isDirect = true;
    bool isExt = (GV->isDeclaration() || GV->hasWeakLinkage() ||
                  GV->hasLinkOnceLinkage());
    bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
                   getTargetMachine().getRelocationModel() != Reloc::Static;
    isARMFunc = !Subtarget->isThumb() || isStub;
    // ARM call to a local ARM function is predicable.
    isLocalARMFunc = !Subtarget->isThumb() && !isExt;
    // tBX takes a register source operand.
    if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
      // Pre-v5T Thumb cannot BLX to ARM code: materialize the callee
      // address from a constant-pool stub and add the PIC label.
      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
                                                           ARMCP::CPStub, 4);
      SDOperand CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 2);
      CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
      Callee = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), CPAddr, NULL, 0);
      SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, getPointerTy(), Callee, PICLabel);
    } else
      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    isDirect = true;
    bool isStub = Subtarget->isTargetDarwin() &&
                  getTargetMachine().getRelocationModel() != Reloc::Static;
    isARMFunc = !Subtarget->isThumb() || isStub;
    // tBX takes a register source operand.
    const char *Sym = S->getSymbol();
    if (isARMFunc && Subtarget->isThumb() && !Subtarget->hasV5TOps()) {
      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(Sym, ARMPCLabelIndex,
                                                           ARMCP::CPStub, 4);
      SDOperand CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 2);
      CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
      Callee = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), CPAddr, NULL, 0);
      SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
      Callee = DAG.getNode(ARMISD::PIC_ADD, getPointerTy(), Callee, PICLabel);
    } else
      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
  }

  // FIXME: handle tail calls differently.
  unsigned CallOpc;
  if (Subtarget->isThumb()) {
    if (!Subtarget->hasV5TOps() && (!isDirect || isARMFunc))
      CallOpc = ARMISD::CALL_NOLINK;
    else
      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
  } else {
    CallOpc = (isDirect || Subtarget->hasV5TOps())
      ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
      : ARMISD::CALL_NOLINK;
  }
  if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb()) {
    // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK
    Chain = DAG.getCopyToReg(Chain, ARM::LR,
                             DAG.getNode(ISD::UNDEF, MVT::i32), InFlag);
    InFlag = Chain.getValue(1);
  }

  std::vector<MVT::ValueType> NodeTys;
  NodeTys.push_back(MVT::Other);   // Returns a chain
  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.

  std::vector<SDOperand> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.Val)
    Ops.push_back(InFlag);
  Chain = DAG.getNode(CallOpc, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  SDOperand CSOps[] = { Chain, DAG.getConstant(NumBytes, MVT::i32), InFlag };
  Chain = DAG.getNode(ISD::CALLSEQ_END,
                      DAG.getNodeValueTypes(MVT::Other, MVT::Flag),
                      ((RetVT != MVT::Other) ? 2 : 1), CSOps, 3);
  if (RetVT != MVT::Other)
    InFlag = Chain.getValue(1);

  std::vector<SDOperand> ResultVals;
  NodeTys.clear();

  // If the call has results, copy the values out of the ret val registers.
  switch (RetVT) {
  default: assert(0 && "Unexpected ret value!");
  case MVT::Other:
    break;
  case MVT::i32:
    Chain = DAG.getCopyFromReg(Chain, ARM::R0, MVT::i32, InFlag).getValue(1);
    ResultVals.push_back(Chain.getValue(0));
    if (Op.Val->getValueType(1) == MVT::i32) {
      // Returns a i64 value.
      Chain = DAG.getCopyFromReg(Chain, ARM::R1, MVT::i32,
                                 Chain.getValue(2)).getValue(1);
      ResultVals.push_back(Chain.getValue(0));
      NodeTys.push_back(MVT::i32);
    }
    NodeTys.push_back(MVT::i32);
    break;
  case MVT::f32:
    // f32 comes back in r0 as raw bits.
    Chain = DAG.getCopyFromReg(Chain, ARM::R0, MVT::i32, InFlag).getValue(1);
    ResultVals.push_back(DAG.getNode(ISD::BIT_CONVERT, MVT::f32,
                                     Chain.getValue(0)));
    NodeTys.push_back(MVT::f32);
    break;
  case MVT::f64: {
    // f64 comes back in r0/r1; recombine via FMDRR.
    SDOperand Lo = DAG.getCopyFromReg(Chain, ARM::R0, MVT::i32, InFlag);
    SDOperand Hi = DAG.getCopyFromReg(Lo, ARM::R1, MVT::i32, Lo.getValue(2));
    ResultVals.push_back(DAG.getNode(ARMISD::FMDRR, MVT::f64, Lo, Hi));
    NodeTys.push_back(MVT::f64);
    break;
  }
  }

  NodeTys.push_back(MVT::Other);

  if (ResultVals.empty())
    return Chain;

  ResultVals.push_back(Chain);
  SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys, &ResultVals[0],
                              ResultVals.size());
  return Res.getValue(Op.ResNo);
}

/// LowerRET - Lower ISD::RET into ARMISD::RET_FLAG, copying return values
/// into r0 (and r1 for 64-bit results) and marking those registers live-out.
static SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Copy;
  SDOperand Chain = Op.getOperand(0);
  switch(Op.getNumOperands()) {
  default:
    assert(0 && "Do not know how to return this many arguments!");
    abort();
  case 1: {
    // Void return.
    // NOTE(review): 'LR' is computed but unused here -- confirm whether it
    // was meant to be an operand of the RET_FLAG node.
    SDOperand LR = DAG.getRegister(ARM::LR, MVT::i32);
    return DAG.getNode(ARMISD::RET_FLAG, MVT::Other, Chain);
  }
  case 3:
    // Single 32-bit (or f32/f64) return value.
    Op = Op.getOperand(1);
    if (Op.getValueType() == MVT::f32) {
      Op = DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Op);
    } else if (Op.getValueType() == MVT::f64) {
      // Recursively legalize f64 -> i64.
      Op = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Op);
      return DAG.getNode(ISD::RET, MVT::Other, Chain, Op,
                         DAG.getConstant(0, MVT::i32));
    }
    Copy = DAG.getCopyToReg(Chain, ARM::R0, Op, SDOperand());
    if (DAG.getMachineFunction().liveout_empty())
      DAG.getMachineFunction().addLiveOut(ARM::R0);
    break;
  case 5:
    // Two 32-bit halves of a 64-bit return value: r1 gets the high half
    // first so r0's copy carries the final flag.
    Copy = DAG.getCopyToReg(Chain, ARM::R1, Op.getOperand(3), SDOperand());
    Copy = DAG.getCopyToReg(Copy, ARM::R0, Op.getOperand(1), Copy.getValue(1));
    // If we haven't noted the R0+R1 are live out, do so now.
    if (DAG.getMachineFunction().liveout_empty()) {
      DAG.getMachineFunction().addLiveOut(ARM::R0);
      DAG.getMachineFunction().addLiveOut(ARM::R1);
    }
    break;
  }

  //We must use RET_FLAG instead of BRIND because BRIND doesn't have a flag
  return DAG.getNode(ARMISD::RET_FLAG, MVT::Other, Copy, Copy.getValue(1));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target countpart wrapped in the ARMISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOVi.
static SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDOperand Res;
  if (CP->isMachineConstantPoolEntry())
    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                    CP->getAlignment());
  else
    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                    CP->getAlignment());
  return DAG.getNode(ARMISD::Wrapper, MVT::i32, Res);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model
SDOperand
ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                 SelectionDAG &DAG) {
  MVT::ValueType PtrVT = getPointerTy();
  unsigned char PCAdj = Subtarget->isThumb() ?
4 : 8;
  // Build a "tlsgd"-annotated constant-pool entry for the global; the
  // pc-label / PCAdj pair makes the entry pc-relative.
  ARMConstantPoolValue *CPV =
    new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue,
                             PCAdj, "tlsgd", true);
  SDOperand Argument = DAG.getTargetConstantPool(CPV, PtrVT, 2);
  Argument = DAG.getNode(ARMISD::Wrapper, MVT::i32, Argument);
  Argument = DAG.getLoad(PtrVT, DAG.getEntryNode(), Argument, NULL, 0);
  SDOperand Chain = Argument.getValue(1);

  SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
  Argument = DAG.getNode(ARMISD::PIC_ADD, PtrVT, Argument, PICLabel);

  // call __tls_get_addr.
  ArgListTy Args;
  ArgListEntry Entry;
  Entry.Node = Argument;
  Entry.Ty = (const Type *) Type::Int32Ty;
  Args.push_back(Entry);
  // Returns the address of the variable; the call result is the result of
  // this lowering.
  std::pair<SDOperand, SDOperand> CallResult =
    LowerCallTo(Chain, (const Type *) Type::Int32Ty, false, false,
                CallingConv::C, false,
                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG);
  return CallResult.first;
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or
// "local exec" model.  Declarations (defined in another module) take the
// initial-exec path via a "gottpoff" entry; locally-defined globals take
// the local-exec path via a "tpoff" entry.
SDOperand
ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                        SelectionDAG &DAG) {
  GlobalValue *GV = GA->getGlobal();
  SDOperand Offset;
  SDOperand Chain = DAG.getEntryNode();
  MVT::ValueType PtrVT = getPointerTy();
  // Get the Thread Pointer
  SDOperand ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, PtrVT);

  if (GV->isDeclaration()) {
    // initial exec model
    unsigned char PCAdj = Subtarget->isThumb() ?
ARMCP::CPValue, UseGOTOFF ? "GOTOFF":"GOT");
    SDOperand CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 2);
    CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
    // Load the GOT/GOTOFF offset out of the constant pool, then add the GOT
    // base to form the address.
    SDOperand Result = DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
    SDOperand Chain = Result.getValue(1);
    SDOperand GOT = DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, PtrVT);
    Result = DAG.getNode(ISD::ADD, PtrVT, Result, GOT);
    // A "GOT" (non-GOTOFF) entry holds the address of a GOT slot, so one
    // more load is needed to get the global's address.
    if (!UseGOTOFF)
      Result = DAG.getLoad(PtrVT, Chain, Result, NULL, 0);
    return Result;
  } else {
    // Non-PIC: the global's address is placed in the constant pool directly.
    SDOperand CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 2);
    CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
    return DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
  }
}

/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol
/// even in non-static mode.
static bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) {
  return RelocM != Reloc::Static &&
    (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
     (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode()));
}

/// LowerGlobalAddressDarwin - Lower a GlobalAddress for Darwin targets:
/// static model reads the address from the constant pool; PIC adds a
/// pc-relative fixup (PIC_ADD); indirect symbols need one extra load
/// through the non-lazy-pointer.
SDOperand ARMTargetLowering::LowerGlobalAddressDarwin(SDOperand Op,
                                                      SelectionDAG &DAG) {
  MVT::ValueType PtrVT = getPointerTy();
  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  bool IsIndirect = GVIsIndirectSymbol(GV, RelocM);
  SDOperand CPAddr;
  if (RelocM == Reloc::Static)
    CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 2);
  else {
    // PIC entries are pc-relative; Thumb's pc reads 4 ahead, ARM's 8.
    unsigned PCAdj = (RelocM != Reloc::PIC_)
      ? 0 : (Subtarget->isThumb() ? 4 : 8);
    ARMCP::ARMCPKind Kind = IsIndirect ? ARMCP::CPNonLazyPtr
                                       : ARMCP::CPValue;
    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMPCLabelIndex,
                                                         Kind, PCAdj);
    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 2);
  }
  CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);

  SDOperand Result = DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
  SDOperand Chain = Result.getValue(1);

  if (RelocM == Reloc::PIC_) {
    SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
    Result = DAG.getNode(ARMISD::PIC_ADD, PtrVT, Result, PICLabel);
  }
  // Indirect symbols require a load through the non-lazy-pointer slot.
  if (IsIndirect)
    Result = DAG.getLoad(PtrVT, Chain, Result, NULL, 0);

  return Result;
}

/// LowerGLOBAL_OFFSET_TABLE - Materialize the GOT base address (ELF only)
/// via a pc-relative "_GLOBAL_OFFSET_TABLE_" constant-pool entry.
SDOperand ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDOperand Op,
                                                      SelectionDAG &DAG){
  assert(Subtarget->isTargetELF() &&
         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
  MVT::ValueType PtrVT = getPointerTy();
  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
  ARMConstantPoolValue *CPV = new ARMConstantPoolValue("_GLOBAL_OFFSET_TABLE_",
                                                       ARMPCLabelIndex,
                                                       ARMCP::CPValue, PCAdj);
  SDOperand CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 2);
  CPAddr = DAG.getNode(ARMISD::Wrapper, MVT::i32, CPAddr);
  SDOperand Result = DAG.getLoad(PtrVT, DAG.getEntryNode(), CPAddr, NULL, 0);
  SDOperand PICLabel = DAG.getConstant(ARMPCLabelIndex++, MVT::i32);
  return DAG.getNode(ARMISD::PIC_ADD, PtrVT, Result, PICLabel);
}

static SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG,
                              unsigned VarArgsFrameIndex) {
  // vastart just stores the address of the VarArgsFrameIndex slot into the
  // memory location argument.
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
  SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
  return DAG.getStore(Op.getOperand(0), FR, Op.getOperand(1), SV->getValue(),
                      SV->getOffset());
}

/// LowerFORMAL_ARGUMENT - Lower one incoming argument of a
/// FORMAL_ARGUMENTS node.  NumGPRs and ArgOffset are in/out: they track how
/// many of R0-R3 and how much stack space have been consumed so far.
/// vRegs records the virtual register created for each used argument GPR.
static SDOperand LowerFORMAL_ARGUMENT(SDOperand Op, SelectionDAG &DAG,
                                      unsigned *vRegs, unsigned ArgNo,
                                      unsigned &NumGPRs, unsigned &ArgOffset) {
  MachineFunction &MF = DAG.getMachineFunction();
  MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType();
  SDOperand Root = Op.getOperand(0);
  std::vector<SDOperand> ArgValues;
  SSARegMap *RegMap = MF.getSSARegMap();

  static const unsigned GPRArgRegs[] = {
    ARM::R0, ARM::R1, ARM::R2, ARM::R3
  };

  unsigned ObjSize;
  unsigned ObjGPRs;
  unsigned GPRPad;
  unsigned StackPad;
  unsigned Flags = Op.getConstantOperandVal(ArgNo + 3);
  // Ask the calling-convention helper how this argument is split between
  // registers (ObjGPRs) and stack (ObjSize), including alignment padding.
  HowToPassArgument(ObjectVT, NumGPRs, ArgOffset, ObjGPRs,
                    ObjSize, GPRPad, StackPad, Flags);
  NumGPRs += GPRPad;
  ArgOffset += StackPad;

  SDOperand ArgValue;
  if (ObjGPRs == 1) {
    // One word arrives in a single GPR; f32 is passed as raw i32 bits.
    unsigned VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
    MF.addLiveIn(GPRArgRegs[NumGPRs], VReg);
    vRegs[NumGPRs] = VReg;
    ArgValue = DAG.getCopyFromReg(Root, VReg, MVT::i32);
    if (ObjectVT == MVT::f32)
      ArgValue = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, ArgValue);
  } else if (ObjGPRs == 2) {
    // Two words arrive in a GPR pair: i64 is rebuilt with BUILD_PAIR,
    // f64 with FMDRR.
    unsigned VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
    MF.addLiveIn(GPRArgRegs[NumGPRs], VReg);
    vRegs[NumGPRs] = VReg;
    ArgValue = DAG.getCopyFromReg(Root, VReg, MVT::i32);

    VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
    MF.addLiveIn(GPRArgRegs[NumGPRs+1], VReg);
    vRegs[NumGPRs+1] = VReg;
    SDOperand ArgValue2 = DAG.getCopyFromReg(Root, VReg, MVT::i32);

    if (ObjectVT == MVT::i64)
      ArgValue = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, ArgValue, ArgValue2);
    else
      ArgValue = DAG.getNode(ARMISD::FMDRR, MVT::f64, ArgValue, ArgValue2);
  }
  NumGPRs += ObjGPRs;

  if (ObjSize) {
    // If the argument is actually used, emit a load from the right stack
    // slot.
    if (!Op.Val->hasNUsesOfValue(0, ArgNo)) {
      MachineFrameInfo *MFI = MF.getFrameInfo();
      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
      SDOperand FIN = DAG.getFrameIndex(FI, MVT::i32);
      if (ObjGPRs == 0)
        ArgValue = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0);
      else {
        // Argument was split: low word(s) in GPRs, remainder on the stack.
        SDOperand ArgValue2 =
          DAG.getLoad(MVT::i32, Root, FIN, NULL, 0);
        if (ObjectVT == MVT::i64)
          ArgValue= DAG.getNode(ISD::BUILD_PAIR, MVT::i64, ArgValue, ArgValue2);
        else
          ArgValue= DAG.getNode(ARMISD::FMDRR, MVT::f64, ArgValue, ArgValue2);
      }
    } else {
      // Don't emit a dead load.
      ArgValue = DAG.getNode(ISD::UNDEF, ObjectVT);
    }

    ArgOffset += ObjSize;   // Move on to the next argument.
  }

  return ArgValue;
}

SDOperand
ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
  std::vector<SDOperand> ArgValues;
  SDOperand Root = Op.getOperand(0);
  unsigned ArgOffset = 0;   // Frame mechanisms handle retaddr slot
  unsigned NumGPRs = 0;     // GPRs used for parameter passing.
  unsigned VRegs[4];

  unsigned NumArgs = Op.Val->getNumValues()-1;
  for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
    ArgValues.push_back(LowerFORMAL_ARGUMENT(Op, DAG, VRegs, ArgNo,
                                             NumGPRs, ArgOffset));

  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
  if (isVarArg) {
    static const unsigned GPRArgRegs[] = {
      ARM::R0, ARM::R1, ARM::R2, ARM::R3
    };

    MachineFunction &MF = DAG.getMachineFunction();
    SSARegMap *RegMap = MF.getSSARegMap();
    MachineFrameInfo *MFI = MF.getFrameInfo();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
    // Size of the area needed to spill the remaining argument GPRs, rounded
    // up to the stack alignment.
    unsigned VARegSize = (4 - NumGPRs) * 4;
    unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
    if (VARegSaveSize) {
      // If this function is vararg, store any remaining integer argument regs
      // to their spots on the stack so that they may be loaded by dereferencing
      // the result of va_next.
      AFI->setVarArgsRegSaveSize(VARegSaveSize);
      VarArgsFrameIndex = MFI->CreateFixedObject(VARegSaveSize, ArgOffset +
                                                 VARegSaveSize - VARegSize);
      SDOperand FIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());

      SmallVector<SDOperand, 4> MemOps;
      for (; NumGPRs < 4; ++NumGPRs) {
        unsigned VReg = RegMap->createVirtualRegister(&ARM::GPRRegClass);
        MF.addLiveIn(GPRArgRegs[NumGPRs], VReg);
        SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i32);
        SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
                          DAG.getConstant(4, getPointerTy()));
      }
      if (!MemOps.empty())
        Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
                           &MemOps[0], MemOps.size());
    } else
      // This will point to the next argument passed via stack.
      VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset);
  }

  ArgValues.push_back(Root);

  // Return the new list of results.
  std::vector<MVT::ValueType> RetVT(Op.Val->value_begin(),
                                    Op.Val->value_end());
  return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size());
}

/// isFloatingPointZero - Return true if this is +0.0.
static bool isFloatingPointZero(SDOperand Op) {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->isExactlyValue(0.0);
  else if (ISD::isEXTLoad(Op.Val) || ISD::isNON_EXTLoad(Op.Val)) {
    // Maybe this has already been legalized into the constant pool?
    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
      SDOperand WrapperOp = Op.getOperand(1).getOperand(0);
      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
        if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
          return CFP->isExactlyValue(0.0);
    }
  }
  return false;
}

/// isLegalCmpImmediate - Return true if C can be encoded directly in a
/// compare: Thumb allows an 8-bit unsigned immediate, ARM any value with a
/// valid so_imm encoding.
static bool isLegalCmpImmediate(unsigned C, bool isThumb) {
  return ( isThumb && (C & ~255U) == 0) ||
         (!isThumb && ARM_AM::getSOImmVal(C) != -1);
}

/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
/// the given operands.
static SDOperand getARMCmp(SDOperand LHS, SDOperand RHS, ISD::CondCode CC,
                           SDOperand &ARMCC, SelectionDAG &DAG, bool isThumb) {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.Val)) {
    unsigned C = RHSC->getValue();
    if (!isLegalCmpImmediate(C, isThumb)) {
      // Constant does not fit, try adjusting it by one?
      switch (CC) {
      default: break;
      case ISD::SETLT:
      case ISD::SETGE:
        // x < C  <=>  x <= C-1 ;  x >= C  <=>  x > C-1
        if (isLegalCmpImmediate(C-1, isThumb)) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          RHS = DAG.getConstant(C-1, MVT::i32);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        // Unsigned variant; C must stay > 0 so C-1 doesn't wrap.
        if (C > 0 && isLegalCmpImmediate(C-1, isThumb)) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          RHS = DAG.getConstant(C-1, MVT::i32);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        // x <= C  <=>  x < C+1 ;  x > C  <=>  x >= C+1
        if (isLegalCmpImmediate(C+1, isThumb)) {
          CC = (CC == ISD::SETLE) ?
ISD::SETLT : ISD::SETGE;
          RHS = DAG.getConstant(C+1, MVT::i32);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        // Unsigned variant; C must stay below 0xffffffff so C+1 doesn't wrap.
        if (C < 0xffffffff && isLegalCmpImmediate(C+1, isThumb)) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          RHS = DAG.getConstant(C+1, MVT::i32);
        }
        break;
      }
    }
  }

  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
  ARMISD::NodeType CompareType;
  switch (CondCode) {
  default:
    CompareType = ARMISD::CMP;
    break;
  case ARMCC::EQ:
  case ARMCC::NE:
  case ARMCC::MI:
  case ARMCC::PL:
    // Uses only N and Z Flags
    CompareType = ARMISD::CMPNZ;
    break;
  }
  ARMCC = DAG.getConstant(CondCode, MVT::i32);
  return DAG.getNode(CompareType, MVT::Flag, LHS, RHS);
}

/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
/// Comparison against +0.0 uses the cheaper compare-with-zero form.
static SDOperand getVFPCmp(SDOperand LHS, SDOperand RHS, SelectionDAG &DAG) {
  SDOperand Cmp;
  if (!isFloatingPointZero(RHS))
    Cmp = DAG.getNode(ARMISD::CMPFP, MVT::Flag, LHS, RHS);
  else
    Cmp = DAG.getNode(ARMISD::CMPFPw0, MVT::Flag, LHS);
  return DAG.getNode(ARMISD::FMSTAT, MVT::Flag, Cmp);
}

/// LowerSELECT_CC - Lower ISD::SELECT_CC into ARMISD::CMOV nodes guarded by
/// an integer (CMP/CMPNZ) or VFP (fcmp+fmstat) comparison.
static SDOperand LowerSELECT_CC(SDOperand Op, SelectionDAG &DAG,
                                const ARMSubtarget *ST) {
  MVT::ValueType VT = Op.getValueType();
  SDOperand LHS = Op.getOperand(0);
  SDOperand RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDOperand TrueVal = Op.getOperand(2);
  SDOperand FalseVal = Op.getOperand(3);

  if (LHS.getValueType() == MVT::i32) {
    SDOperand ARMCC;
    SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDOperand Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb());
    return DAG.getNode(ARMISD::CMOV, VT, FalseVal, TrueVal, ARMCC, CCR, Cmp);
  }

  // FP select: FPCCToARMCC may map the condition to one or two ARM condition
  // codes; returning true requests the operands be swapped.
  ARMCC::CondCodes CondCode, CondCode2;
  if (FPCCToARMCC(CC, CondCode, CondCode2))
    std::swap(TrueVal, FalseVal);

  SDOperand ARMCC = DAG.getConstant(CondCode, MVT::i32);
  SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDOperand Cmp = getVFPCmp(LHS, RHS, DAG);

  SDOperand Result = DAG.getNode(ARMISD::CMOV, VT, FalseVal, TrueVal,
                                 ARMCC, CCR, Cmp);
  if (CondCode2 != ARMCC::AL) {
    SDOperand ARMCC2 = DAG.getConstant(CondCode2, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDOperand Cmp2 = getVFPCmp(LHS, RHS, DAG);
    Result = DAG.getNode(ARMISD::CMOV, VT, Result, TrueVal, ARMCC2, CCR, Cmp2);
  }
  return Result;
}

/// LowerBR_CC - Lower ISD::BR_CC into one (or, for some FP conditions, two)
/// ARMISD::BRCOND conditional branches.
static SDOperand LowerBR_CC(SDOperand Op, SelectionDAG &DAG,
                            const ARMSubtarget *ST) {
  SDOperand Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDOperand LHS = Op.getOperand(2);
  SDOperand RHS = Op.getOperand(3);
  SDOperand Dest = Op.getOperand(4);

  if (LHS.getValueType() == MVT::i32) {
    SDOperand ARMCC;
    SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDOperand Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, ST->isThumb());
    return DAG.getNode(ARMISD::BRCOND, MVT::Other, Chain, Dest, ARMCC, CCR,Cmp);
  }

  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
  ARMCC::CondCodes CondCode, CondCode2;
  if (FPCCToARMCC(CC, CondCode, CondCode2))
    // Swap the LHS/RHS of the comparison if needed.
    std::swap(LHS, RHS);

  SDOperand Cmp = getVFPCmp(LHS, RHS, DAG);
  SDOperand ARMCC = DAG.getConstant(CondCode, MVT::i32);
  SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
  SDOperand Ops[] = { Chain, Dest, ARMCC, CCR, Cmp };
  SDOperand Res = DAG.getNode(ARMISD::BRCOND, VTList, Ops, 5);
  // Some FP conditions need a second branch on the second condition code,
  // reusing the flag result of the first BRCOND.
  if (CondCode2 != ARMCC::AL) {
    ARMCC = DAG.getConstant(CondCode2, MVT::i32);
    SDOperand Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, VTList, Ops, 5);
  }
  return Res;
}

/// LowerBR_JT - Lower a jump-table branch: scale the index by 4, add the
/// wrapped jump-table address, load the target entry (adding the table base
/// again in PIC mode, where entries are table-relative), then BR_JT.
SDOperand ARMTargetLowering::LowerBR_JT(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Chain = Op.getOperand(0);
  SDOperand Table = Op.getOperand(1);
  SDOperand Index = Op.getOperand(2);

  MVT::ValueType PTy = getPointerTy();
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
  SDOperand UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
  SDOperand JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, MVT::i32, JTI, UId);
  Index = DAG.getNode(ISD::MUL, PTy, Index, DAG.getConstant(4, PTy));
  SDOperand Addr = DAG.getNode(ISD::ADD, PTy, Index, Table);
  bool isPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
  Addr = DAG.getLoad(isPIC ? (MVT::ValueType)MVT::i32 : PTy,
                     Chain, Addr, NULL, 0);
  Chain = Addr.getValue(1);
  if (isPIC)
    Addr = DAG.getNode(ISD::ADD, PTy, Addr, Table);
  return DAG.getNode(ARMISD::BR_JT, MVT::Other, Chain, Addr, JTI, UId);
}

/// LowerFP_TO_INT - Lower f32/f64 -> int via the VFP FTOSI/FTOUI nodes; the
/// conversion result sits in an f32 register and is bit_converted to i32.
static SDOperand LowerFP_TO_INT(SDOperand Op, SelectionDAG &DAG) {
  unsigned Opc =
    Op.getOpcode() == ISD::FP_TO_SINT ? ARMISD::FTOSI : ARMISD::FTOUI;
  Op = DAG.getNode(Opc, MVT::f32, Op.getOperand(0));
  return DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Op);
}

/// LowerINT_TO_FP - Lower int -> f32/f64 via SITOF/UITOF; the i32 input is
/// first moved into an f32 register as raw bits.
static SDOperand LowerINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  unsigned Opc =
    Op.getOpcode() == ISD::SINT_TO_FP ? ARMISD::SITOF : ARMISD::UITOF;

  Op = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Op.getOperand(0));
  return DAG.getNode(Opc, VT, Op);
}

static SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
  // Implement fcopysign with a fabs and a conditional fneg.
  SDOperand Tmp0 = Op.getOperand(0);
  SDOperand Tmp1 = Op.getOperand(1);
  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType SrcVT = Tmp1.getValueType();
  SDOperand AbsVal = DAG.getNode(ISD::FABS, VT, Tmp0);
  // Negate the absolute value iff the sign source compares < 0.0.
  SDOperand Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG);
  SDOperand ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
  SDOperand CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  return DAG.getNode(ARMISD::CNEG, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
}

static SDOperand LowerBIT_CONVERT(SDOperand Op, SelectionDAG &DAG) {
  // Turn f64->i64 into FMRRD.
  assert(Op.getValueType() == MVT::i64 &&
         Op.getOperand(0).getValueType() == MVT::f64);

  Op = Op.getOperand(0);
  SDOperand Cvt = DAG.getNode(ARMISD::FMRRD, DAG.getVTList(MVT::i32, MVT::i32),
                              &Op, 1);

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Cvt, Cvt.getValue(1));
}

static SDOperand LowerMUL(SDOperand Op, SelectionDAG &DAG) {
  // FIXME: All this code is target-independent.  Create a new target-indep
  // MULHILO node and move this code to the legalizer.
  //
  assert(Op.getValueType() == MVT::i64 &&
         "Only handles i64 expand right now!");

  SDOperand LL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
                             DAG.getConstant(0, MVT::i32));
  SDOperand RL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(1),
                             DAG.getConstant(0, MVT::i32));

  unsigned LHSSB = DAG.ComputeNumSignBits(Op.getOperand(0));
  unsigned RHSSB = DAG.ComputeNumSignBits(Op.getOperand(1));

  SDOperand Lo, Hi;
  // Figure out how to lower this multiply.
  if (LHSSB >= 33 && RHSSB >= 33) {
    // If the input values are both sign extended, we can emit a mulhs+mul.
    Lo = DAG.getNode(ISD::MUL, MVT::i32, LL, RL);
    Hi = DAG.getNode(ISD::MULHS, MVT::i32, LL, RL);
  } else if (LHSSB == 32 && RHSSB == 32 &&
             DAG.MaskedValueIsZero(Op.getOperand(0), 0xFFFFFFFF00000000ULL) &&
             DAG.MaskedValueIsZero(Op.getOperand(1), 0xFFFFFFFF00000000ULL)) {
    // If the inputs are zero extended, use mulhu.
    Lo = DAG.getNode(ISD::MUL, MVT::i32, LL, RL);
    Hi = DAG.getNode(ISD::MULHU, MVT::i32, LL, RL);
  } else {
    // General case: 64x64 -> 64 via a full unsigned 32x32 -> 64 multiply of
    // the low halves plus the two cross products folded into the high word.
    SDOperand LH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
                               DAG.getConstant(1, MVT::i32));
    SDOperand RH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(1),
                               DAG.getConstant(1, MVT::i32));

    // Lo,Hi = umul LHS, RHS.
    SDOperand Ops[] = { LL, RL };
    SDOperand UMul64 = DAG.getNode(ARMISD::MULHILOU,
                                   DAG.getVTList(MVT::i32, MVT::i32), Ops, 2);
    Lo = UMul64;
    Hi = UMul64.getValue(1);
    RH = DAG.getNode(ISD::MUL, MVT::i32, LL, RH);
    LH = DAG.getNode(ISD::MUL, MVT::i32, LH, RL);
    Hi = DAG.getNode(ISD::ADD, MVT::i32, Hi, RH);
    Hi = DAG.getNode(ISD::ADD, MVT::i32, Hi, LH);
  }

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
}

/// LowerMULHU - The high word of an unsigned 32x32 multiply is value #1 of
/// the two-result MULHILOU node.
static SDOperand LowerMULHU(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1) };
  return DAG.getNode(ARMISD::MULHILOU,
                     DAG.getVTList(MVT::i32, MVT::i32), Ops, 2).getValue(1);
}

/// LowerMULHS - Signed counterpart of LowerMULHU, using MULHILOS.
static SDOperand LowerMULHS(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Ops[] = { Op.getOperand(0), Op.getOperand(1) };
  return DAG.getNode(ARMISD::MULHILOS,
                     DAG.getVTList(MVT::i32, MVT::i32), Ops, 2).getValue(1);
}

static SDOperand LowerSRx(SDOperand Op, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
  assert(Op.getValueType() == MVT::i64 &&
         (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) &&
         "Unknown shift to lower!");

  // We only lower SRA, SRL of 1 here, all others use generic lowering.
  if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
      cast<ConstantSDNode>(Op.getOperand(1))->getValue() != 1)
    return SDOperand();

  // If we are in thumb mode, we don't have RRX.
  if (ST->isThumb()) return SDOperand();

  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
  SDOperand Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
                             DAG.getConstant(0, MVT::i32));
  SDOperand Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op.getOperand(0),
                             DAG.getConstant(1, MVT::i32));

  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
  // captures the result into a carry flag.
  unsigned Opc = Op.getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
  Hi = DAG.getNode(Opc, DAG.getVTList(MVT::i32, MVT::Flag), &Hi, 1);

  // The low part is an ARMISD::RRX operand, which shifts the carry in.
  Lo = DAG.getNode(ARMISD::RRX, MVT::i32, Lo, Hi.getValue(1));

  // Merge the pieces into a single i64 value.
  return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi);
}

/// LowerMEMCPY - Lower ISD::MEMCPY either to a call to the "memcpy" library
/// routine, or (for small, 4-byte-aligned, constant sizes) to inline
/// word-sized load/store groups that can later combine into ldm/stm.
SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Chain = Op.getOperand(0);
  SDOperand Dest = Op.getOperand(1);
  SDOperand Src = Op.getOperand(2);
  SDOperand Count = Op.getOperand(3);
  unsigned Align =
    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
  if (Align == 0) Align = 1;

  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Count);
  // Just call memcpy if:
  // not 4-byte aligned
  // size is unknown
  // size is >= the threshold.
  if ((Align & 3) != 0 ||
      !I ||
      I->getValue() >= 64 ||
      (I->getValue() & 3) != 0) {
    MVT::ValueType IntPtr = getPointerTy();
    TargetLowering::ArgListTy Args;
    TargetLowering::ArgListEntry Entry;
    Entry.Ty = getTargetData()->getIntPtrType();
    Entry.Node = Op.getOperand(1); Args.push_back(Entry);
    Entry.Node = Op.getOperand(2); Args.push_back(Entry);
    Entry.Node = Op.getOperand(3); Args.push_back(Entry);
    std::pair<SDOperand,SDOperand> CallResult =
      LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
                  DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
    return CallResult.second;
  }

  // Otherwise do repeated 4-byte loads and stores.  To be improved.
  assert((I->getValue() & 3) == 0);
  assert((Align & 3) == 0);
  unsigned NumMemOps = I->getValue() >> 2;
  unsigned EmittedNumMemOps = 0;
  unsigned SrcOff = 0, DstOff = 0;
  MVT::ValueType VT = MVT::i32;
  unsigned VTSize = 4;
  const unsigned MAX_LOADS_IN_LDM = 6;
  SDOperand LoadChains[MAX_LOADS_IN_LDM];
  SDOperand Loads[MAX_LOADS_IN_LDM];

  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
  // same number of stores.  The loads and stores will get combined into
  // ldm/stm later on.
  while (EmittedNumMemOps < NumMemOps) {
    unsigned i;
    for (i = 0; i < MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
      Loads[i] = DAG.getLoad(VT, Chain,
                             DAG.getNode(ISD::ADD, VT, Src,
                                         DAG.getConstant(SrcOff, VT)),
                             NULL, 0);
      LoadChains[i] = Loads[i].getValue(1);
      SrcOff += VTSize;
    }

    // The TokenFactor orders all loads in this group before any store.
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &LoadChains[0], i);

    for (i = 0; i < MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) {
      Chain = DAG.getStore(Chain, Loads[i],
                           DAG.getNode(ISD::ADD, VT, Dest,
                                       DAG.getConstant(DstOff, VT)),
                           NULL, 0);
      DstOff += VTSize;
    }
    EmittedNumMemOps += i;
  }

  return Chain;
}

/// LowerOperation - Dispatch a custom-lowered node to its handler.
SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
  switch (Op.getOpcode()) {
  default: assert(0 && "Don't know how to custom lower this!"); abort();
  case ISD::ConstantPool:     return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:
    return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
      LowerGlobalAddressELF(Op, DAG);
  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
  case ISD::CALL:             return LowerCALL(Op, DAG);
  case ISD::RET:              return LowerRET(Op, DAG);
  case ISD::SELECT_CC:        return LowerSELECT_CC(Op, DAG, Subtarget);
  case ISD::BR_CC:            return LowerBR_CC(Op, DAG, Subtarget);
  case ISD::BR_JT:            return LowerBR_JT(Op, DAG);
  case ISD::VASTART:          return LowerVASTART(Op, DAG, VarArgsFrameIndex);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:       return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:       return LowerFP_TO_INT(Op, DAG);
  case ISD::FCOPYSIGN:        return LowerFCOPYSIGN(Op, DAG);
  case ISD::BIT_CONVERT:      return LowerBIT_CONVERT(Op, DAG);
  case ISD::MUL:              return LowerMUL(Op, DAG);
  case ISD::MULHU:            return LowerMULHU(Op, DAG);
  case ISD::MULHS:            return LowerMULHS(Op, DAG);
  case ISD::SRL:
  case ISD::SRA:              return LowerSRx(Op, DAG, Subtarget);
  case ISD::FORMAL_ARGUMENTS:
    return LowerFORMAL_ARGUMENTS(Op, DAG);
  case ISD::RETURNADDR:
break; + case ISD::FRAMEADDR: break; + case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); + case ISD::MEMCPY: return LowerMEMCPY(Op, DAG); + } + return SDOperand(); +} + +//===----------------------------------------------------------------------===// +// ARM Scheduler Hooks +//===----------------------------------------------------------------------===// + +MachineBasicBlock * +ARMTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *BB) { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + switch (MI->getOpcode()) { + default: assert(false && "Unexpected instr type to insert"); + case ARM::tMOVCCr: { + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + ilist<MachineBasicBlock>::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB); + BuildMI(BB, TII->get(ARM::tBcc)).addMBB(sinkMBB) + .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg()); + MachineFunction *F = BB->getParent(); + F->getBasicBlockList().insert(It, copy0MBB); + F->getBasicBlockList().insert(It, sinkMBB); + // Update machine-CFG edges by first adding all successors of the current + // block to the new block which will contain the Phi node for the select. + for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), + e = BB->succ_end(); i != e; ++i) + sinkMBB->addSuccessor(*i); + // Next, remove all successors of the current block, and add the true + // and fallthrough blocks as its successors. 
+ while(!BB->succ_empty()) + BB->removeSuccessor(BB->succ_begin()); + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(BB, TII->get(ARM::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + delete MI; // The pseudo instruction is gone now. + return BB; + } + } +} + +//===----------------------------------------------------------------------===// +// ARM Optimization Hooks +//===----------------------------------------------------------------------===// + +/// isLegalAddressImmediate - Return true if the integer value can be used +/// as the offset of the target addressing mode for load / store of the +/// given type. +static bool isLegalAddressImmediate(int64_t V, MVT::ValueType VT, + const ARMSubtarget *Subtarget) { + if (V == 0) + return true; + + if (Subtarget->isThumb()) { + if (V < 0) + return false; + + unsigned Scale = 1; + switch (VT) { + default: return false; + case MVT::i1: + case MVT::i8: + // Scale == 1; + break; + case MVT::i16: + // Scale == 2; + Scale = 2; + break; + case MVT::i32: + // Scale == 4; + Scale = 4; + break; + } + + if ((V & (Scale - 1)) != 0) + return false; + V /= Scale; + return V == V & ((1LL << 5) - 1); + } + + if (V < 0) + V = - V; + switch (VT) { + default: return false; + case MVT::i1: + case MVT::i8: + case MVT::i32: + // +- imm12 + return V == V & ((1LL << 12) - 1); + case MVT::i16: + // +- imm8 + return V == V & ((1LL << 8) - 1); + case MVT::f32: + case MVT::f64: + if (!Subtarget->hasVFP2()) + return false; + if ((V & 3) != 0) + return false; + V >>= 2; + return V == V & ((1LL << 8) - 1); + } +} + +/// isLegalAddressingMode - Return true if the addressing mode 
/// represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // The constant offset must be encodable on its own.
  if (!isLegalAddressImmediate(AM.BaseOffs, getValueType(Ty), Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  case 1:
    if (Subtarget->isThumb())
      return false;
    // FALL THROUGH.
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    int Scale = AM.Scale;
    switch (getValueType(Ty)) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
    case MVT::i64:
      // This assumes i64 is legalized to a pair of i32. If not (i.e.
      // ldrd / strd are used, then its address mode is same as i16.
      // r + r
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
      // r + r
      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations.  This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (AM.Scale & 1) return false;
      return isPowerOf2_32(AM.Scale);
    }
    break;
  }
  return true;
}


/// getIndexedAddressParts - Shared helper for the pre-/post-indexed hooks
/// below: if Ptr is an ADD/SUB whose operands can be encoded as an
/// addressing-mode 2 or 3 base + offset, fill in Base / Offset / isInc and
/// return true.
static bool getIndexedAddressParts(SDNode *Ptr, MVT::ValueType VT,
                                   bool isSEXTLoad, SDOperand &Base,
                                   SDOperand &Offset, bool &isInc,
                                   SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getValue();
      // Negative constants become a positive offset with a decrement.
      if (RHSC < 0 && RHSC > -256) {
        isInc = false;
        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
        return true;
      }
    }
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      int RHSC = (int)RHS->getValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        isInc = false;
        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      // Prefer putting a shifted operand in the offset position, since
      // AM2 can fold the shift.
      ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0));
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use FLDM / FSTM to emulate indexed FP load / store.
  return false;
}

/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
bool
ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
                                             SDOperand &Offset,
                                             ISD::MemIndexedMode &AM,
                                             SelectionDAG &DAG) {
  // Pre-indexed forms are only generated for ARM mode here.
  if (Subtarget->isThumb())
    return false;

  MVT::ValueType VT;
  SDOperand Ptr;
  bool isSEXTLoad = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getLoadedVT();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getStoredVT();
  } else
    return false;

  bool isInc;
  bool isLegal = getIndexedAddressParts(Ptr.Val, VT, isSEXTLoad, Base, Offset,
                                        isInc, DAG);
  if (isLegal) {
    AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
    return true;
  }
  return false;
}

/// getPostIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if this node can be
/// combined with a load / store to form a post-indexed load / store.
bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                   SDOperand &Base,
                                                   SDOperand &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   SelectionDAG &DAG) {
  // Post-indexed forms are only generated for ARM mode here.
  if (Subtarget->isThumb())
    return false;

  MVT::ValueType VT;
  SDOperand Ptr;
  bool isSEXTLoad = false;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getLoadedVT();
    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getStoredVT();
  } else
    return false;

  bool isInc;
  // Note: Op (the candidate add/sub), not the memory node's own pointer,
  // is matched here.
  bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
                                        isInc, DAG);
  if (isLegal) {
    AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
    return true;
  }
  return false;
}

/// computeMaskedBitsForTargetNode - Determine which bits of an ARM-specific
/// node are known to be zero/one; currently only CMOV is handled.
void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
                                                       uint64_t Mask,
                                                       uint64_t &KnownZero,
                                                       uint64_t &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  KnownZero = 0;
  KnownOne = 0;
  switch (Op.getOpcode()) {
  default: break;
  case ARMISD::CMOV: {
    // Bits are known zero/one if known on the LHS and RHS.
    DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
    if (KnownZero == 0 && KnownOne == 0) return;

    uint64_t KnownZeroRHS, KnownOneRHS;
    DAG.ComputeMaskedBits(Op.getOperand(1), Mask,
                          KnownZeroRHS, KnownOneRHS, Depth+1);
    // Only bits known on BOTH arms of the cmov are known in the result.
    KnownZero &= KnownZeroRHS;
    KnownOne &= KnownOneRHS;
    return;
  }
  }
}

//===----------------------------------------------------------------------===//
//                           ARM Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
ARMTargetLowering::ConstraintType
ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:  break;
    case 'l': return C_RegisterClass;
    case 'w': return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// getRegForInlineAsmConstraint - Map a single-letter constraint to the
/// register class to allocate the operand from.
std::pair<unsigned, const TargetRegisterClass*>
ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT::ValueType VT) const {
  if (Constraint.size() == 1) {
    // GCC ARM Constraint Letters
    switch (Constraint[0]) {
    case 'l':
      // FIXME: in thumb mode, 'l' is only low-regs.
      // FALL THROUGH.
    case 'r':
      return std::make_pair(0U, ARM::GPRRegisterClass);
    case 'w':
      // 'w' selects a VFP register: SPR for f32, DPR for f64.
      if (VT == MVT::f32)
        return std::make_pair(0U, ARM::SPRRegisterClass);
      if (VT == MVT::f64)
        return std::make_pair(0U, ARM::DPRRegisterClass);
      break;
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}

/// getRegClassForInlineAsmConstraint - Return the explicit set of registers
/// an operand with the given single-letter constraint may be allocated to.
std::vector<unsigned> ARMTargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  MVT::ValueType VT) const {
  if (Constraint.size() != 1)
    return std::vector<unsigned>();

  switch (Constraint[0]) {      // GCC ARM Constraint Letters
  default: break;
  case 'l':
  case 'r':
    // All allocatable GPRs (PC and SP excluded).
    return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
                                 ARM::R4, ARM::R5, ARM::R6, ARM::R7,
                                 ARM::R8, ARM::R9, ARM::R10, ARM::R11,
                                 ARM::R12, ARM::LR, 0);
  case 'w':
    if (VT == MVT::f32)
      return make_vector<unsigned>(ARM::S0, ARM::S1, ARM::S2, ARM::S3,
                                   ARM::S4, ARM::S5, ARM::S6, ARM::S7,
                                   ARM::S8, ARM::S9, ARM::S10, ARM::S11,
                                   ARM::S12,ARM::S13,ARM::S14,ARM::S15,
                                   ARM::S16,ARM::S17,ARM::S18,ARM::S19,
                                   ARM::S20,ARM::S21,ARM::S22,ARM::S23,
                                   ARM::S24,ARM::S25,ARM::S26,ARM::S27,
                                   ARM::S28,ARM::S29,ARM::S30,ARM::S31, 0);
    if (VT == MVT::f64)
      return make_vector<unsigned>(ARM::D0, ARM::D1, ARM::D2, ARM::D3,
                                   ARM::D4, ARM::D5, ARM::D6, ARM::D7,
                                   ARM::D8, ARM::D9, ARM::D10,ARM::D11,
                                   ARM::D12,ARM::D13,ARM::D14,ARM::D15, 0);
    break;
  }

  return std::vector<unsigned>();
}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 0000000..2b66f23
--- /dev/null
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,144 @@
//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#ifndef ARMISELLOWERING_H
#define ARMISELLOWERING_H

#include "llvm/Target/TargetLowering.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include <vector>

namespace llvm {
  class ARMConstantPoolValue;
  class ARMSubtarget;

  namespace ARMISD {
    // ARM Specific DAG Nodes
    enum NodeType {
      // Start the numbering where the built-in ops and target ops leave off.
      FIRST_NUMBER = ISD::BUILTIN_OP_END+ARM::INSTRUCTION_LIST_END,

      Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
                    // TargetExternalSymbol, and TargetGlobalAddress.
      WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable

      CALL,         // Function call.
      CALL_PRED,    // Function call that's predicable.
      CALL_NOLINK,  // Function call with branch not branch-and-link.
      tCALL,        // Thumb function call.
      BRCOND,       // Conditional branch.
      BR_JT,        // Jumptable branch.
      RET_FLAG,     // Return with a flag operand.

      PIC_ADD,      // Add with a PC operand and a PIC label.

      CMP,          // ARM compare instructions.
      CMPNZ,        // ARM compare that uses only N or Z flags.
      CMPFP,        // ARM VFP compare instruction, sets FPSCR.
      CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
      FMSTAT,       // ARM fmstat instruction.
      CMOV,         // ARM conditional move instructions.
      CNEG,         // ARM conditional negate instructions.

      FTOSI,        // FP to sint within a FP register.
      FTOUI,        // FP to uint within a FP register.
      SITOF,        // sint to FP within a FP register.
      UITOF,        // uint to FP within a FP register.

      MULHILOU,     // Lo,Hi = umul LHS, RHS.
      MULHILOS,     // Lo,Hi = smul LHS, RHS.

      SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
      SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
      RRX,          // V = RRX X, Flag -> srl X, 1 + shift in carry flag.

      FMRRD,        // double to two gprs.
      FMDRR,        // Two gprs to double.

      THREAD_POINTER
    };
  }

  //===--------------------------------------------------------------------===//
  //  ARMTargetLowering - ARM Implementation of the TargetLowering interface

  class ARMTargetLowering : public TargetLowering {
    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
  public:
    ARMTargetLowering(TargetMachine &TM);

    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
    virtual const char *getTargetNodeName(unsigned Opcode) const;

    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
                                                       MachineBasicBlock *MBB);

    /// isLegalAddressingMode - Return true if the addressing mode represented
    /// by AM is legal for this target, for a load/store of the specified type.
    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;

    /// getPreIndexedAddressParts - returns true by value, base pointer and
    /// offset pointer and addressing mode by reference if the node's address
    /// can be legally represented as pre-indexed load / store address.
    virtual bool getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
                                           SDOperand &Offset,
                                           ISD::MemIndexedMode &AM,
                                           SelectionDAG &DAG);

    /// getPostIndexedAddressParts - returns true by value, base pointer and
    /// offset pointer and addressing mode by reference if this node can be
    /// combined with a load / store to form a post-indexed load / store.
    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                            SDOperand &Base, SDOperand &Offset,
                                            ISD::MemIndexedMode &AM,
                                            SelectionDAG &DAG);

    virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
                                                uint64_t Mask,
                                                uint64_t &KnownZero,
                                                uint64_t &KnownOne,
                                                const SelectionDAG &DAG,
                                                unsigned Depth) const;
    ConstraintType getConstraintType(const std::string &Constraint) const;
    std::pair<unsigned, const TargetRegisterClass*>
      getRegForInlineAsmConstraint(const std::string &Constraint,
                                   MVT::ValueType VT) const;
    std::vector<unsigned>
      getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                        MVT::ValueType VT) const;
  private:
    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
    /// make the right decision when generating code for different targets.
    const ARMSubtarget *Subtarget;

    /// ARMPCLabelIndex - Keep track the number of ARM PC labels created.
    ///
    unsigned ARMPCLabelIndex;

    // Per-opcode custom lowering helpers dispatched from LowerOperation.
    SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
    SDOperand LowerGlobalAddressDarwin(SDOperand Op, SelectionDAG &DAG);
    SDOperand LowerGlobalAddressELF(SDOperand Op, SelectionDAG &DAG);
    SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG);
    SDOperand LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                            SelectionDAG &DAG);
    SDOperand LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                   SelectionDAG &DAG);
    SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG);
    SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
    SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
    SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
  };
}

#endif  // ARMISELLOWERING_H
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 0000000..b404ec0
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,612 @@
//===- ARMInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===//
//
//                     The LLVM Compiler
// Infrastructure
//
// This file was developed by the "Instituto Nokia de Tecnologia" and
// is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the ARM implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "ARMInstrInfo.h"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMGenInstrInfo.inc"
#include "ARMMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hidden, off-by-default flag gating convertToThreeAddress below: splitting
// indexed loads/stores into un-indexed op + add/sub is still experimental.
static cl::opt<bool> EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
                                    cl::desc("Enable ARM 2-addr to 3-addr conv"));

// Construct with the tablegen-generated instruction descriptor table and a
// register-info object bound to this subtarget.
ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
  : TargetInstrInfo(ARMInsts, sizeof(ARMInsts)/sizeof(ARMInsts[0])),
    RI(*this, STI) {
}

/// getPointerRegClass - Pointers are held in the general-purpose registers.
const TargetRegisterClass *ARMInstrInfo::getPointerRegClass() const {
  return &ARM::GPRRegClass;
}

/// Return true if the instruction is a register to register move and
/// leave the source and dest operands in the passed parameters.
///
bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
                               unsigned &SrcReg, unsigned &DstReg) const {
  MachineOpCode oc = MI.getOpcode();
  switch (oc) {
  default:
    return false;
  case ARM::FCPYS:    // VFP single / double register copies.
  case ARM::FCPYD:
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    return true;
  case ARM::MOVr:     // ARM / Thumb GPR copies.
  case ARM::tMOVr:
    assert(MI.getInstrDescriptor()->numOperands >= 2 &&
           MI.getOperand(0).isRegister() &&
           MI.getOperand(1).isRegister() &&
           "Invalid ARM MOV instruction");
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    return true;
  }
}

/// isLoadFromStackSlot - If MI is a direct (no offset, no index register)
/// load from a frame index, set FrameIndex and return the destination reg.
unsigned ARMInstrInfo::isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const{
  switch (MI->getOpcode()) {
  default: break;
  case ARM::LDR:
    if (MI->getOperand(1).isFrameIndex() &&
        MI->getOperand(2).isReg() &&
        MI->getOperand(3).isImmediate() &&
        MI->getOperand(2).getReg() == 0 &&
        MI->getOperand(3).getImmedValue() == 0) {
      FrameIndex = MI->getOperand(1).getFrameIndex();
      return MI->getOperand(0).getReg();
    }
    break;
  case ARM::FLDD:
  case ARM::FLDS:
    if (MI->getOperand(1).isFrameIndex() &&
        MI->getOperand(2).isImmediate() &&
        MI->getOperand(2).getImmedValue() == 0) {
      FrameIndex = MI->getOperand(1).getFrameIndex();
      return MI->getOperand(0).getReg();
    }
    break;
  case ARM::tRestore:   // Thumb stack-slot reload pseudo.
    if (MI->getOperand(1).isFrameIndex() &&
        MI->getOperand(2).isImmediate() &&
        MI->getOperand(2).getImmedValue() == 0) {
      FrameIndex = MI->getOperand(1).getFrameIndex();
      return MI->getOperand(0).getReg();
    }
    break;
  }
  return 0;
}

/// isStoreToStackSlot - Mirror of isLoadFromStackSlot for stores: if MI is
/// a direct store to a frame index, set FrameIndex and return the source reg.
unsigned ARMInstrInfo::isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const {
  switch (MI->getOpcode()) {
  default: break;
  case ARM::STR:
    if (MI->getOperand(1).isFrameIndex() &&
        MI->getOperand(2).isReg() &&
        MI->getOperand(3).isImmediate() &&
        MI->getOperand(2).getReg() == 0 &&
        MI->getOperand(3).getImmedValue() == 0) {
      FrameIndex = MI->getOperand(1).getFrameIndex();
      return MI->getOperand(0).getReg();
    }
    break;
  case ARM::FSTD:
  case ARM::FSTS:
    if (MI->getOperand(1).isFrameIndex() &&
        MI->getOperand(2).isImmediate() &&
        MI->getOperand(2).getImmedValue() == 0) {
      FrameIndex = MI->getOperand(1).getFrameIndex();
      return MI->getOperand(0).getReg();
    }
    break;
  case ARM::tSpill:     // Thumb stack-slot spill pseudo.
    if (MI->getOperand(1).isFrameIndex() &&
        MI->getOperand(2).isImmediate() &&
        MI->getOperand(2).getImmedValue() == 0) {
      FrameIndex = MI->getOperand(1).getFrameIndex();
      return MI->getOperand(0).getReg();
    }
    break;
  }
  return 0;
}

/// getUnindexedOpcode - Map a pre-/post-indexed load or store opcode to its
/// plain (un-indexed) counterpart, or 0 if there is none.
static unsigned getUnindexedOpcode(unsigned Opc) {
  switch (Opc) {
  default: break;
  case ARM::LDR_PRE:
  case ARM::LDR_POST:
    return ARM::LDR;
  case ARM::LDRH_PRE:
  case ARM::LDRH_POST:
    return ARM::LDRH;
  case ARM::LDRB_PRE:
  case ARM::LDRB_POST:
    return ARM::LDRB;
  case ARM::LDRSH_PRE:
  case ARM::LDRSH_POST:
    return ARM::LDRSH;
  case ARM::LDRSB_PRE:
  case ARM::LDRSB_POST:
    return ARM::LDRSB;
  case ARM::STR_PRE:
  case ARM::STR_POST:
    return ARM::STR;
  case ARM::STRH_PRE:
  case ARM::STRH_POST:
    return ARM::STRH;
  case ARM::STRB_PRE:
  case ARM::STRB_POST:
    return ARM::STRB;
  }
  return 0;
}

/// convertToThreeAddress - Split an indexed load/store into an un-indexed
/// memory op plus a separate base-register update (add/sub), transferring
/// kill/dead info to the new instructions.  Gated by -enable-arm-3-addr-conv.
MachineInstr *
ARMInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                    MachineBasicBlock::iterator &MBBI,
                                    LiveVariables &LV) const {
  if (!EnableARM3Addr)
    return NULL;

  MachineInstr *MI = MBBI;
  unsigned TSFlags = MI->getInstrDescriptor()->TSFlags;
  bool isPre = false;
  switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
  default: return NULL;
  case ARMII::IndexModePre:
    isPre = true;
    break;
  case ARMII::IndexModePost:
    break;
  }

  // Try spliting an indexed load / store to a un-indexed one plus an add/sub
  // operation.
  unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
  if (MemOpc == 0)
    return NULL;

  MachineInstr *UpdateMI = NULL;
  MachineInstr *MemMI = NULL;
  unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
  unsigned NumOps = TID->numOperands;
  bool isLoad = (TID->Flags & M_LOAD_FLAG) != 0;
  // Operand layout (assumed here): writeback reg, then base, with the
  // offset / am-imm / predicate as the last three operands.
  const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
  const MachineOperand &Base = MI->getOperand(2);
  const MachineOperand &Offset = MI->getOperand(NumOps-3);
  unsigned WBReg = WB.getReg();
  unsigned BaseReg = Base.getReg();
  unsigned OffReg = Offset.getReg();
  unsigned OffImm = MI->getOperand(NumOps-2).getImm();
  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
  switch (AddrMode) {
  default:
    assert(false && "Unknown indexed op!");
    return NULL;
  case ARMII::AddrMode2: {
    bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
    unsigned Amt = ARM_AM::getAM2Offset(OffImm);
    if (OffReg == 0) {
      int SOImmVal = ARM_AM::getSOImmVal(Amt);
      if (SOImmVal == -1)
        // Can't encode it in a so_imm operand. This transformation will
        // add more than 1 instruction. Abandon!
        return NULL;
      UpdateMI = BuildMI(get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
        .addReg(BaseReg).addImm(SOImmVal)
        .addImm(Pred).addReg(0).addReg(0);
    } else if (Amt != 0) {
      ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
      unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
      UpdateMI = BuildMI(get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
        .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
        .addImm(Pred).addReg(0).addReg(0);
    } else
      UpdateMI = BuildMI(get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
        .addReg(BaseReg).addReg(OffReg)
        .addImm(Pred).addReg(0).addReg(0);
    break;
  }
  case ARMII::AddrMode3 : {
    bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
    unsigned Amt = ARM_AM::getAM3Offset(OffImm);
    if (OffReg == 0)
      // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
      UpdateMI = BuildMI(get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
        .addReg(BaseReg).addImm(Amt)
        .addImm(Pred).addReg(0).addReg(0);
    else
      UpdateMI = BuildMI(get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
        .addReg(BaseReg).addReg(OffReg)
        .addImm(Pred).addReg(0).addReg(0);
    break;
  }
  }

  // Pre-indexed: update the base first, then access through it.
  // Post-indexed: access through the old base, then update.
  std::vector<MachineInstr*> NewMIs;
  if (isPre) {
    if (isLoad)
      MemMI = BuildMI(get(MemOpc), MI->getOperand(0).getReg())
        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
    else
      MemMI = BuildMI(get(MemOpc)).addReg(MI->getOperand(1).getReg())
        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
    NewMIs.push_back(MemMI);
    NewMIs.push_back(UpdateMI);
  } else {
    if (isLoad)
      MemMI = BuildMI(get(MemOpc), MI->getOperand(0).getReg())
        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
    else
      MemMI = BuildMI(get(MemOpc)).addReg(MI->getOperand(1).getReg())
        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
    if (WB.isDead())
      UpdateMI->getOperand(0).setIsDead();
    NewMIs.push_back(UpdateMI);
    NewMIs.push_back(MemMI);
  }

  // Transfer LiveVariables states, kill / dead info.
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (MO.isRegister() && MO.getReg() &&
        MRegisterInfo::isVirtualRegister(MO.getReg())) {
      unsigned Reg = MO.getReg();
      LiveVariables::VarInfo &VI = LV.getVarInfo(Reg);
      if (MO.isDef()) {
        MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
        if (MO.isDead())
          LV.addVirtualRegisterDead(Reg, NewMI);
        // Update the defining instruction.
        if (VI.DefInst == MI)
          VI.DefInst = NewMI;
      }
      if (MO.isUse() && MO.isKill()) {
        for (unsigned j = 0; j < 2; ++j) {
          // Look at the two new MI's in reverse order.
          MachineInstr *NewMI = NewMIs[j];
          int NIdx = NewMI->findRegisterUseOperandIdx(Reg);
          if (NIdx == -1)
            continue;
          LV.addVirtualRegisterKilled(Reg, NewMI);
          if (VI.removeKill(MI))
            VI.Kills.push_back(NewMI);
          break;
        }
      }
    }
  }

  // NewMIs was built in reverse; insert so that NewMIs[0] executes first.
  MFI->insert(MBBI, NewMIs[1]);
  MFI->insert(MBBI, NewMIs[0]);
  return NewMIs[0];
}

// Branch analysis.
bool ARMInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 std::vector<MachineOperand> &Cond) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
    if (LastOpc == ARM::B || LastOpc == ARM::tB) {
      TBB = LastInst->getOperand(0).getMachineBasicBlock();
      return false;
    }
    if (LastOpc == ARM::Bcc || LastOpc == ARM::tBcc) {
      // Block ends with fall-through condbranch.
      TBB = LastInst->getOperand(0).getMachineBasicBlock();
      Cond.push_back(LastInst->getOperand(1));
      Cond.push_back(LastInst->getOperand(2));
      return false;
    }
    return true;  // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = I;

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
    return true;

  // If the block ends with ARM::B/ARM::tB and a ARM::Bcc/ARM::tBcc, handle it.
  unsigned SecondLastOpc = SecondLastInst->getOpcode();
  if ((SecondLastOpc == ARM::Bcc && LastOpc == ARM::B) ||
      (SecondLastOpc == ARM::tBcc && LastOpc == ARM::tB)) {
    // Conditional branch followed by unconditional: classic two-way branch.
    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
    Cond.push_back(SecondLastInst->getOperand(1));
    Cond.push_back(SecondLastInst->getOperand(2));
    FBB = LastInst->getOperand(0).getMachineBasicBlock();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if ((SecondLastOpc == ARM::B || SecondLastOpc==ARM::tB) &&
      (LastOpc == ARM::B || LastOpc == ARM::tB)) {
    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
    I = LastInst;
    I->eraseFromParent();
    return false;
  }

  // Likewise if it ends with a branch table followed by an unconditional branch.
  // The branch folder can create these, and we must get rid of them for
  // correctness of Thumb constant islands.
  if ((SecondLastOpc == ARM::BR_JTr || SecondLastOpc==ARM::BR_JTm ||
       SecondLastOpc == ARM::BR_JTadd || SecondLastOpc==ARM::tBR_JTr) &&
      (LastOpc == ARM::B || LastOpc == ARM::tB)) {
    I = LastInst;
    I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}


/// RemoveBranch - Strip up to two branch terminators (conditional and/or
/// unconditional) from the end of MBB; returns how many were removed.
unsigned ARMInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  MachineFunction &MF = *MBB.getParent();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  // Pick the ARM or Thumb opcodes depending on the function's mode.
  int BOpc = AFI->isThumbFunction() ? ARM::tB : ARM::B;
  int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;

  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin()) return 0;
  --I;
  if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc)
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) return 1;
  --I;
  if (I->getOpcode() != BccOpc)
    return 1;

  // Remove the branch.
  I->eraseFromParent();
  return 2;
}

/// InsertBranch - Append a one- or two-way branch (per Cond/FBB) to MBB;
/// returns the number of instructions inserted.
unsigned ARMInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                MachineBasicBlock *FBB,
                                const std::vector<MachineOperand> &Cond) const {
  MachineFunction &MF = *MBB.getParent();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  int BOpc = AFI->isThumbFunction() ? ARM::tB : ARM::B;
  int BccOpc = AFI->isThumbFunction() ? ARM::tBcc : ARM::Bcc;

  // Shouldn't be a fall through.
  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
  assert((Cond.size() == 2 || Cond.size() == 0) &&
         "ARM branch conditions have two components!");

  if (FBB == 0) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, get(BOpc)).addMBB(TBB);
    else
      BuildMI(&MBB, get(BccOpc)).addMBB(TBB)
        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
    return 1;
  }

  // Two-way conditional branch.
  BuildMI(&MBB, get(BccOpc)).addMBB(TBB)
    .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
  BuildMI(&MBB, get(BOpc)).addMBB(FBB);
  return 2;
}

/// BlockHasNoFallThrough - True if MBB ends in an instruction that never
/// falls through (return, unconditional branch, or jumptable dispatch).
bool ARMInstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
  if (MBB.empty()) return false;

  switch (MBB.back().getOpcode()) {
  case ARM::BX_RET:   // Return.
  case ARM::LDM_RET:
  case ARM::tBX_RET:
  case ARM::tBX_RET_vararg:
  case ARM::tPOP_RET:
  case ARM::B:
  case ARM::tB:       // Uncond branch.
  case ARM::tBR_JTr:
  case ARM::BR_JTr:   // Jumptable branch.
  case ARM::BR_JTm:   // Jumptable branch through mem.
  case ARM::BR_JTadd: // Jumptable branch add to pc.
    return true;
  default: return false;
  }
}

/// ReverseBranchCondition - Invert the condition code in Cond[0]; ARM can
/// always reverse, so this never fails (returns false).
bool ARMInstrInfo::
ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
  ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
  Cond[0].setImm(ARMCC::getOppositeCondition(CC));
  return false;
}

/// isPredicated - An instruction is predicated when its predicate operand
/// holds anything other than AL (always).
bool ARMInstrInfo::isPredicated(const MachineInstr *MI) const {
  int PIdx = MI->findFirstPredOperandIdx();
  return PIdx != -1 && MI->getOperand(PIdx).getImmedValue() != ARMCC::AL;
}

/// PredicateInstruction - Apply the two-component predicate in Pred to MI.
/// Unconditional branches are first rewritten to their conditional forms;
/// other instructions get their existing predicate operands overwritten.
bool ARMInstrInfo::PredicateInstruction(MachineInstr *MI,
                              const std::vector<MachineOperand> &Pred) const {
  unsigned Opc = MI->getOpcode();
  if (Opc == ARM::B || Opc == ARM::tB) {
    // B/tB carry no predicate operands; switch to Bcc/tBcc and append them.
    MI->setInstrDescriptor(get(Opc == ARM::B ? ARM::Bcc : ARM::tBcc));
    MI->addImmOperand(Pred[0].getImmedValue());
    MI->addRegOperand(Pred[1].getReg(), false);
    return true;
  }

  int PIdx = MI->findFirstPredOperandIdx();
  if (PIdx != -1) {
    MachineOperand &PMO = MI->getOperand(PIdx);
    PMO.setImm(Pred[0].getImmedValue());
    MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
    return true;
  }
  return false;
}

/// SubsumesPredicate - True if condition Pred1 logically implies Pred2
/// (e.g. HS subsumes HI, AL subsumes everything).
bool
ARMInstrInfo::SubsumesPredicate(const std::vector<MachineOperand> &Pred1,
                                const std::vector<MachineOperand> &Pred2) const{
  if (Pred1.size() > 2 || Pred2.size() > 2)
    return false;

  ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImmedValue();
  ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImmedValue();
  if (CC1 == CC2)
    return true;

  switch (CC1) {
  default:
    return false;
  case ARMCC::AL:
    return true;
  case ARMCC::HS:
    return CC2 == ARMCC::HI;
  case ARMCC::LS:
    return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
  case ARMCC::GE:
    return CC2 == ARMCC::GT;
  case ARMCC::LE:
    return CC2 == ARMCC::LT;
  }
}

/// DefinesPredicate - If MI writes CPSR, collect those operands into Pred
/// and return true.
bool ARMInstrInfo::DefinesPredicate(MachineInstr *MI,
                                    std::vector<MachineOperand> &Pred) const {
  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
  // Fast reject: no implicit defs and no optional-def slot means no CPSR def.
  if (!TID->ImplicitDefs && (TID->Flags & M_HAS_OPTIONAL_DEF) == 0)
    return false;

  bool Found = false;
  for
(unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() == ARM::CPSR) { + Pred.push_back(MO); + Found = true; + } + } + + return Found; +} + + +/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI) DISABLE_INLINE; +static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT, + unsigned JTI) { + return JT[JTI].MBBs.size(); +} + +/// GetInstSize - Return the size of the specified MachineInstr. +/// +unsigned ARM::GetInstSize(MachineInstr *MI) { + MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const TargetAsmInfo *TAI = MF->getTarget().getTargetAsmInfo(); + + // Basic size info comes from the TSFlags field. + const TargetInstrDescriptor *TID = MI->getInstrDescriptor(); + unsigned TSFlags = TID->TSFlags; + + switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) { + default: + // If this machine instr is an inline asm, measure it. + if (MI->getOpcode() == ARM::INLINEASM) + return TAI->getInlineAsmLength(MI->getOperand(0).getSymbolName()); + if (MI->getOpcode() == ARM::LABEL) + return 0; + assert(0 && "Unknown or unset size field for instr!"); + break; + case ARMII::Size8Bytes: return 8; // Arm instruction x 2. + case ARMII::Size4Bytes: return 4; // Arm instruction. + case ARMII::Size2Bytes: return 2; // Thumb instruction. + case ARMII::SizeSpecial: { + switch (MI->getOpcode()) { + case ARM::CONSTPOOL_ENTRY: + // If this machine instr is a constant pool entry, its size is recorded as + // operand #2. + return MI->getOperand(2).getImm(); + case ARM::BR_JTr: + case ARM::BR_JTm: + case ARM::BR_JTadd: + case ARM::tBR_JTr: { + // These are jumptable branches, i.e. a branch followed by an inlined + // jumptable. The size is 4 + 4 * number of entries. 
+ unsigned NumOps = TID->numOperands; + MachineOperand JTOP = + MI->getOperand(NumOps - ((TID->Flags & M_PREDICABLE) ? 3 : 2)); + unsigned JTI = JTOP.getJumpTableIndex(); + MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); + const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables(); + assert(JTI < JT.size()); + // Thumb instructions are 2 byte aligned, but JT entries are 4 byte + // 4 aligned. The assembler / linker may add 2 byte padding just before + // the JT entries. The size does not include this padding; the + // constant islands pass does separate bookkeeping for it. + // FIXME: If we know the size of the function is less than (1 << 16) *2 + // bytes, we can use 16-bit entries instead. Then there won't be an + // alignment issue. + return getNumJTEntries(JT, JTI) * 4 + + (MI->getOpcode()==ARM::tBR_JTr ? 2 : 4); + } + default: + // Otherwise, pseudo-instruction sizes are zero. + return 0; + } + } + } +} + +/// GetFunctionSize - Returns the size of the specified MachineFunction. +/// +unsigned ARM::GetFunctionSize(MachineFunction &MF) { + unsigned FnSize = 0; + for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + for (MachineBasicBlock::iterator I = MBB.begin(),E = MBB.end(); I != E; ++I) + FnSize += ARM::GetInstSize(I); + } + return FnSize; +} diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h new file mode 100644 index 0000000..2c158b8 --- /dev/null +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -0,0 +1,133 @@ +//===- ARMInstrInfo.h - ARM Instruction Information -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMINSTRUCTIONINFO_H +#define ARMINSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "ARMRegisterInfo.h" + +namespace llvm { + class ARMSubtarget; + +/// ARMII - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace ARMII { + enum { + //===------------------------------------------------------------------===// + // Instruction Flags. + + //===------------------------------------------------------------------===// + // This three-bit field describes the addressing mode used. Zero is unused + // so that we can tell if we forgot to set a value. + + AddrModeMask = 0xf, + AddrMode1 = 1, + AddrMode2 = 2, + AddrMode3 = 3, + AddrMode4 = 4, + AddrMode5 = 5, + AddrModeT1 = 6, + AddrModeT2 = 7, + AddrModeT4 = 8, + AddrModeTs = 9, // i8 * 4 for pc and sp relative data + + // Size* - Flags to keep track of the size of an instruction. + SizeShift = 4, + SizeMask = 7 << SizeShift, + SizeSpecial = 1, // 0 byte pseudo or special case. + Size8Bytes = 2, + Size4Bytes = 3, + Size2Bytes = 4, + + // IndexMode - Unindex, pre-indexed, or post-indexed. Only valid for load + // and store ops + IndexModeShift = 7, + IndexModeMask = 3 << IndexModeShift, + IndexModePre = 1, + IndexModePost = 2, + + // Opcode + OpcodeShift = 9, + OpcodeMask = 0xf << OpcodeShift + }; +} + +class ARMInstrInfo : public TargetInstrInfo { + const ARMRegisterInfo RI; +public: + ARMInstrInfo(const ARMSubtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). 
+ /// + virtual const MRegisterInfo &getRegisterInfo() const { return RI; } + + /// getPointerRegClass - Return the register class to use to hold pointers. + /// This is used for addressing modes. + virtual const TargetRegisterClass *getPointerRegClass() const; + + /// Return true if the instruction is a register to register move and + /// leave the source and dest operands in the passed parameters. + /// + virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg) const; + virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const; + virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const; + + virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables &LV) const; + + // Branch analysis. + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + std::vector<MachineOperand> &Cond) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond) const; + virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const; + virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const; + + // Predication support. + virtual bool isPredicated(const MachineInstr *MI) const; + + virtual + bool PredicateInstruction(MachineInstr *MI, + const std::vector<MachineOperand> &Pred) const; + + virtual + bool SubsumesPredicate(const std::vector<MachineOperand> &Pred1, + const std::vector<MachineOperand> &Pred1) const; + + virtual bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; +}; + + // Utility routines + namespace ARM { + /// GetInstSize - Returns the size of the specified MachineInstr. 
+ /// + unsigned GetInstSize(MachineInstr *MI); + + /// GetFunctionSize - Returns the size of the specified MachineFunction. + /// + unsigned GetFunctionSize(MachineFunction &MF); + } +} + +#endif diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td new file mode 100644 index 0000000..adc203b --- /dev/null +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -0,0 +1,1320 @@ +//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM instructions in TableGen format. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM specific DAG Nodes. +// + +// Type profiles. +def SDT_ARMCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>; + +def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>; + +def SDT_ARMcall : SDTypeProfile<0, -1, [SDTCisInt<0>]>; + +def SDT_ARMCMov : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisVT<3, i32>]>; + +def SDT_ARMBrcond : SDTypeProfile<0, 2, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>; + +def SDT_ARMBrJT : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; + +def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>; + +def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<1>, SDTCisVT<2, i32>]>; + +def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; + +// Node definitions. 
+def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; +def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; + +def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeq, + [SDNPHasChain, SDNPOutFlag]>; +def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeq, + [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>; + +def ARMcall : SDNode<"ARMISD::CALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; +def ARMcall_pred : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; +def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + +def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTRet, + [SDNPHasChain, SDNPOptInFlag]>; + +def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, + [SDNPInFlag]>; +def ARMcneg : SDNode<"ARMISD::CNEG", SDT_ARMCMov, + [SDNPInFlag]>; + +def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, + [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>; + +def ARMbrjt : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT, + [SDNPHasChain]>; + +def ARMcmp : SDNode<"ARMISD::CMP", SDT_ARMCmp, + [SDNPOutFlag]>; + +def ARMcmpNZ : SDNode<"ARMISD::CMPNZ", SDT_ARMCmp, + [SDNPOutFlag]>; + +def ARMpic_add : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>; + +def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>; +def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>; +def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInFlag ]>; + +def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>; + +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. 
+// +def HasV5T : Predicate<"Subtarget->hasV5TOps()">; +def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">; +def HasV6 : Predicate<"Subtarget->hasV6Ops()">; +def IsThumb : Predicate<"Subtarget->isThumb()">; +def IsARM : Predicate<"!Subtarget->isThumb()">; + +//===----------------------------------------------------------------------===// +// ARM Flag Definitions. + +class RegConstraint<string C> { + string Constraints = C; +} + +//===----------------------------------------------------------------------===// +// ARM specific transformation functions and pattern fragments. +// + +// so_imm_XFORM - Return a so_imm value packed into the format described for +// so_imm def below. +def so_imm_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(N->getValue()), + MVT::i32); +}]>; + +// so_imm_neg_XFORM - Return a so_imm value packed into the format described for +// so_imm_neg def below. +def so_imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(-(int)N->getValue()), + MVT::i32); +}]>; + +// so_imm_not_XFORM - Return a so_imm value packed into the format described for +// so_imm_not def below. +def so_imm_not_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(~(int)N->getValue()), + MVT::i32); +}]>; + +// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24. +def rot_imm : PatLeaf<(i32 imm), [{ + int32_t v = (int32_t)N->getValue(); + return v == 8 || v == 16 || v == 24; +}]>; + +/// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15]. +def imm1_15 : PatLeaf<(i32 imm), [{ + return (int32_t)N->getValue() >= 1 && (int32_t)N->getValue() < 16; +}]>; + +/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31]. 
+def imm16_31 : PatLeaf<(i32 imm), [{ + return (int32_t)N->getValue() >= 16 && (int32_t)N->getValue() < 32; +}]>; + +def so_imm_neg : + PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(-(int)N->getValue()) != -1; }], + so_imm_neg_XFORM>; + +def so_imm_not : + PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(~(int)N->getValue()) != -1; }], + so_imm_not_XFORM>; + +// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits. +def sext_16_node : PatLeaf<(i32 GPR:$a), [{ + return CurDAG->ComputeNumSignBits(SDOperand(N,0)) >= 17; +}]>; + + + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// Branch target. +def brtarget : Operand<OtherVT>; + +// A list of registers separated by comma. Used by load/store multiple. +def reglist : Operand<i32> { + let PrintMethod = "printRegisterList"; +} + +// An operand for the CONSTPOOL_ENTRY pseudo-instruction. +def cpinst_operand : Operand<i32> { + let PrintMethod = "printCPInstOperand"; +} + +def jtblock_operand : Operand<i32> { + let PrintMethod = "printJTBlockOperand"; +} + +// Local PC labels. +def pclabel : Operand<i32> { + let PrintMethod = "printPCLabel"; +} + +// shifter_operand operands: so_reg and so_imm. +def so_reg : Operand<i32>, // reg reg imm + ComplexPattern<i32, 3, "SelectShifterOperandReg", + [shl,srl,sra,rotr]> { + let PrintMethod = "printSORegOperand"; + let MIOperandInfo = (ops GPR, GPR, i32imm); +} + +// so_imm - Match a 32-bit shifter_operand immediate operand, which is an +// 8-bit immediate rotated by an arbitrary number of bits. so_imm values are +// represented in the imm field in the same 12-bit form that they are encoded +// into so_imm instructions: the 8-bit immediate is the least significant bits +// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11]. 
+def so_imm : Operand<i32>, + PatLeaf<(imm), + [{ return ARM_AM::getSOImmVal(N->getValue()) != -1; }], + so_imm_XFORM> { + let PrintMethod = "printSOImmOperand"; +} + +// Break so_imm's up into two pieces. This handles immediates with up to 16 +// bits set in them. This uses so_imm2part to match and so_imm2part_[12] to +// get the first/second pieces. +def so_imm2part : Operand<i32>, + PatLeaf<(imm), + [{ return ARM_AM::isSOImmTwoPartVal((unsigned)N->getValue()); }]> { + let PrintMethod = "printSOImm2PartOperand"; +} + +def so_imm2part_1 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getValue()); + return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32); +}]>; + +def so_imm2part_2 : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getValue()); + return CurDAG->getTargetConstant(ARM_AM::getSOImmVal(V), MVT::i32); +}]>; + + +// Define ARM specific addressing modes. + +// addrmode2 := reg +/- reg shop imm +// addrmode2 := reg +/- imm12 +// +def addrmode2 : Operand<i32>, + ComplexPattern<i32, 3, "SelectAddrMode2", []> { + let PrintMethod = "printAddrMode2Operand"; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + +def am2offset : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode2Offset", []> { + let PrintMethod = "printAddrMode2OffsetOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addrmode3 := reg +/- reg +// addrmode3 := reg +/- imm8 +// +def addrmode3 : Operand<i32>, + ComplexPattern<i32, 3, "SelectAddrMode3", []> { + let PrintMethod = "printAddrMode3Operand"; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm); +} + +def am3offset : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode3Offset", []> { + let PrintMethod = "printAddrMode3OffsetOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addrmode4 := reg, <mode|W> +// +def addrmode4 : Operand<i32>, + ComplexPattern<i32, 2, "", []> { + let PrintMethod = 
"printAddrMode4Operand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addrmode5 := reg +/- imm8*4 +// +def addrmode5 : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrMode5", []> { + let PrintMethod = "printAddrMode5Operand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// addrmodepc := pc + reg +// +def addrmodepc : Operand<i32>, + ComplexPattern<i32, 2, "SelectAddrModePC", []> { + let PrintMethod = "printAddrModePCOperand"; + let MIOperandInfo = (ops GPR, i32imm); +} + +// ARM Predicate operand. Default to 14 = always (AL). Second part is CC +// register whose default is 0 (no register). +def pred : PredicateOperand<OtherVT, (ops i32imm, CCR), + (ops (i32 14), (i32 zero_reg))> { + let PrintMethod = "printPredicateOperand"; +} + +// Conditional code result for instructions whose 's' bit is set, e.g. subs. +// +def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> { + let PrintMethod = "printSBitModifierOperand"; +} + +//===----------------------------------------------------------------------===// +// ARM Instruction flags. These need to match ARMInstrInfo.h. +// + +// Addressing mode. +class AddrMode<bits<4> val> { + bits<4> Value = val; +} +def AddrModeNone : AddrMode<0>; +def AddrMode1 : AddrMode<1>; +def AddrMode2 : AddrMode<2>; +def AddrMode3 : AddrMode<3>; +def AddrMode4 : AddrMode<4>; +def AddrMode5 : AddrMode<5>; +def AddrModeT1 : AddrMode<6>; +def AddrModeT2 : AddrMode<7>; +def AddrModeT4 : AddrMode<8>; +def AddrModeTs : AddrMode<9>; + +// Instruction size. +class SizeFlagVal<bits<3> val> { + bits<3> Value = val; +} +def SizeInvalid : SizeFlagVal<0>; // Unset. +def SizeSpecial : SizeFlagVal<1>; // Pseudo or special. +def Size8Bytes : SizeFlagVal<2>; +def Size4Bytes : SizeFlagVal<3>; +def Size2Bytes : SizeFlagVal<4>; + +// Load / store index mode. 
+class IndexMode<bits<2> val> { + bits<2> Value = val; +} +def IndexModeNone : IndexMode<0>; +def IndexModePre : IndexMode<1>; +def IndexModePost : IndexMode<2>; + +//===----------------------------------------------------------------------===// +// ARM Instruction templates. +// + +// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode. +class ARMPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM]; +} +class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV5TE]; +} +class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsARM, HasV6]; +} + +class InstARM<bits<4> opcod, AddrMode am, SizeFlagVal sz, IndexMode im, + string cstr> + : Instruction { + let Namespace = "ARM"; + + bits<4> Opcode = opcod; + AddrMode AM = am; + bits<4> AddrModeBits = AM.Value; + + SizeFlagVal SZ = sz; + bits<3> SizeFlag = SZ.Value; + + IndexMode IM = im; + bits<2> IndexModeBits = IM.Value; + + let Constraints = cstr; +} + +class PseudoInst<dag ops, string asm, list<dag> pattern> + : InstARM<0, AddrModeNone, SizeSpecial, IndexModeNone, ""> { + let OperandList = ops; + let AsmString = asm; + let Pattern = pattern; +} + +// Almost all ARM instructions are predicable. +class I<dag oprnds, AddrMode am, SizeFlagVal sz, IndexMode im, + string opc, string asm, string cstr, list<dag> pattern> + // FIXME: Set all opcodes to 0 for now. + : InstARM<0, am, sz, im, cstr> { + let OperandList = !con(oprnds, (ops pred:$p)); + let AsmString = !strconcat(opc, !strconcat("${p}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +// Same as I except it can optionally modify CPSR. +class sI<dag oprnds, AddrMode am, SizeFlagVal sz, IndexMode im, + string opc, string asm, string cstr, list<dag> pattern> + // FIXME: Set all opcodes to 0 for now. 
+ : InstARM<0, am, sz, im, cstr> { + let OperandList = !con(oprnds, (ops pred:$p, cc_out:$s)); + let AsmString = !strconcat(opc, !strconcat("${p}${s}", asm)); + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +class AI<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrModeNone, Size4Bytes, IndexModeNone, opc, asm, "", pattern>; +class AsI<dag ops, string opc, string asm, list<dag> pattern> + : sI<ops, AddrModeNone, Size4Bytes, IndexModeNone, opc, asm, "", pattern>; +class AI1<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrMode1, Size4Bytes, IndexModeNone, opc, asm, "", pattern>; +class AsI1<dag ops, string opc, string asm, list<dag> pattern> + : sI<ops, AddrMode1, Size4Bytes, IndexModeNone, opc, asm, "", pattern>; +class AI2<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrMode2, Size4Bytes, IndexModeNone, opc, asm, "", pattern>; +class AI3<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrMode3, Size4Bytes, IndexModeNone, opc, asm, "", pattern>; +class AI4<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrMode4, Size4Bytes, IndexModeNone, opc, asm, "", pattern>; +class AI1x2<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrMode1, Size8Bytes, IndexModeNone, opc, asm, "", pattern>; + +// Pre-indexed ops +class AI2pr<dag ops, string opc, string asm, string cstr, list<dag> pattern> + : I<ops, AddrMode2, Size4Bytes, IndexModePre, opc, asm, cstr, pattern>; +class AI3pr<dag ops, string opc, string asm, string cstr, list<dag> pattern> + : I<ops, AddrMode3, Size4Bytes, IndexModePre, opc, asm, cstr, pattern>; + +// Post-indexed ops +class AI2po<dag ops, string opc, string asm, string cstr, list<dag> pattern> + : I<ops, AddrMode2, Size4Bytes, IndexModePost, opc, asm, cstr, pattern>; +class AI3po<dag ops, string opc, string asm, string cstr, list<dag> pattern> + : I<ops, AddrMode3, Size4Bytes, IndexModePost, opc, asm, cstr, pattern>; + + +class 
BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>; + + +/// AI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a +/// binop that produces a value. +multiclass AsI1_bin_irs<string opc, PatFrag opnode> { + def ri : AsI1<(ops GPR:$dst, GPR:$a, so_imm:$b), + opc, " $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>; + def rr : AsI1<(ops GPR:$dst, GPR:$a, GPR:$b), + opc, " $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>; + def rs : AsI1<(ops GPR:$dst, GPR:$a, so_reg:$b), + opc, " $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>; +} + +/// ASI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the +/// instruction modifies the CSPR register. +multiclass ASI1_bin_s_irs<string opc, PatFrag opnode> { + def ri : AI1<(ops GPR:$dst, GPR:$a, so_imm:$b), + opc, "s $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, Imp<[], [CPSR]>; + def rr : AI1<(ops GPR:$dst, GPR:$a, GPR:$b), + opc, "s $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, Imp<[], [CPSR]>; + def rs : AI1<(ops GPR:$dst, GPR:$a, so_reg:$b), + opc, "s $dst, $a, $b", + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, Imp<[], [CPSR]>; +} + +/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test +/// patterns. Similar to AsI1_bin_irs except the instruction does not produce +/// a explicit result, only implicitly set CPSR. +multiclass AI1_cmp_irs<string opc, PatFrag opnode> { + def ri : AI1<(ops GPR:$a, so_imm:$b), + opc, " $a, $b", + [(opnode GPR:$a, so_imm:$b)]>, Imp<[], [CPSR]>; + def rr : AI1<(ops GPR:$a, GPR:$b), + opc, " $a, $b", + [(opnode GPR:$a, GPR:$b)]>, Imp<[], [CPSR]>; + def rs : AI1<(ops GPR:$a, so_reg:$b), + opc, " $a, $b", + [(opnode GPR:$a, so_reg:$b)]>, Imp<[], [CPSR]>; +} + +/// AI_unary_rrot - A unary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. 
+multiclass AI_unary_rrot<string opc, PatFrag opnode> { + def r : AI<(ops GPR:$dst, GPR:$Src), + opc, " $dst, $Src", + [(set GPR:$dst, (opnode GPR:$Src))]>, Requires<[IsARM, HasV6]>; + def r_rot : AI<(ops GPR:$dst, GPR:$Src, i32imm:$rot), + opc, " $dst, $Src, ror $rot", + [(set GPR:$dst, (opnode (rotr GPR:$Src, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]>; +} + +/// AI_bin_rrot - A binary operation with two forms: one whose operand is a +/// register and one whose operand is a register rotated by 8/16/24. +multiclass AI_bin_rrot<string opc, PatFrag opnode> { + def rr : AI<(ops GPR:$dst, GPR:$LHS, GPR:$RHS), + opc, " $dst, $LHS, $RHS", + [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>, + Requires<[IsARM, HasV6]>; + def rr_rot : AI<(ops GPR:$dst, GPR:$LHS, GPR:$RHS, i32imm:$rot), + opc, " $dst, $LHS, $RHS, ror $rot", + [(set GPR:$dst, (opnode GPR:$LHS, + (rotr GPR:$RHS, rot_imm:$rot)))]>, + Requires<[IsARM, HasV6]>; +} + +// Special cases. +class XI<dag oprnds, AddrMode am, SizeFlagVal sz, IndexMode im, + string asm, string cstr, list<dag> pattern> + // FIXME: Set all opcodes to 0 for now. 
+ : InstARM<0, am, sz, im, cstr> { + let OperandList = oprnds; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsARM]; +} + +class AXI<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrModeNone, Size4Bytes, IndexModeNone, asm, "", pattern>; +class AXI1<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode1, Size4Bytes, IndexModeNone, asm, "", pattern>; +class AXI2<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode2, Size4Bytes, IndexModeNone, asm, "", pattern>; +class AXI3<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode3, Size4Bytes, IndexModeNone, asm, "", pattern>; +class AXI4<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode4, Size4Bytes, IndexModeNone, asm, "", pattern>; + +class AXIx2<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrModeNone, Size8Bytes, IndexModeNone, asm, "", pattern>; + +// BR_JT instructions +class JTI<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrModeNone, SizeSpecial, IndexModeNone, asm, "", pattern>; +class JTI1<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode1, SizeSpecial, IndexModeNone, asm, "", pattern>; +class JTI2<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode2, SizeSpecial, IndexModeNone, asm, "", pattern>; + +/// AsXI1_bin_c_irs - Same as AsI1_bin_irs but without the predicate operand and +/// setting carry bit. But it can optionally set CPSR. 
+multiclass AsXI1_bin_c_irs<string opc, PatFrag opnode> { + def ri : AXI1<(ops GPR:$dst, GPR:$a, so_imm:$b, cc_out:$s), + !strconcat(opc, "${s} $dst, $a, $b"), + [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>, Imp<[CPSR], []>; + def rr : AXI1<(ops GPR:$dst, GPR:$a, GPR:$b, cc_out:$s), + !strconcat(opc, "${s} $dst, $a, $b"), + [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>, Imp<[CPSR], []>; + def rs : AXI1<(ops GPR:$dst, GPR:$a, so_reg:$b, cc_out:$s), + !strconcat(opc, "${s} $dst, $a, $b"), + [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>, Imp<[CPSR], []>; +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// +def IMPLICIT_DEF_GPR : +PseudoInst<(ops GPR:$rD, pred:$p), + "@ IMPLICIT_DEF_GPR $rD", + [(set GPR:$rD, (undef))]>; + + +/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in +/// the function. The first operand is the ID# for this instruction, the second +/// is the index into the MachineConstantPool that this is, the third is the +/// size in bytes of this constant pool entry. 
+let isNotDuplicable = 1 in +def CONSTPOOL_ENTRY : +PseudoInst<(ops cpinst_operand:$instid, cpinst_operand:$cpidx, i32imm:$size), + "${instid:label} ${cpidx:cpentry}", []>; + +def ADJCALLSTACKUP : +PseudoInst<(ops i32imm:$amt, pred:$p), + "@ ADJCALLSTACKUP $amt", + [(ARMcallseq_end imm:$amt)]>, Imp<[SP],[SP]>; + +def ADJCALLSTACKDOWN : +PseudoInst<(ops i32imm:$amt, pred:$p), + "@ ADJCALLSTACKDOWN $amt", + [(ARMcallseq_start imm:$amt)]>, Imp<[SP],[SP]>; + +def DWARF_LOC : +PseudoInst<(ops i32imm:$line, i32imm:$col, i32imm:$file), + ".loc $file, $line, $col", + [(dwarf_loc (i32 imm:$line), (i32 imm:$col), (i32 imm:$file))]>; + +let isNotDuplicable = 1 in { +def PICADD : AXI1<(ops GPR:$dst, GPR:$a, pclabel:$cp, pred:$p), + "$cp:\n\tadd$p $dst, pc, $a", + [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>; + +let isLoad = 1, AddedComplexity = 10 in { +def PICLD : AXI2<(ops GPR:$dst, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tldr$p $dst, $addr", + [(set GPR:$dst, (load addrmodepc:$addr))]>; + +def PICLDZH : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tldr${p}h $dst, $addr", + [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>; + +def PICLDZB : AXI2<(ops GPR:$dst, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tldr${p}b $dst, $addr", + [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>; + +def PICLDH : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tldr${p}h $dst, $addr", + [(set GPR:$dst, (extloadi16 addrmodepc:$addr))]>; + +def PICLDB : AXI2<(ops GPR:$dst, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tldr${p}b $dst, $addr", + [(set GPR:$dst, (extloadi8 addrmodepc:$addr))]>; + +def PICLDSH : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tldr${p}sh $dst, $addr", + [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>; + +def PICLDSB : AXI3<(ops GPR:$dst, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tldr${p}sb $dst, $addr", + [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>; +} +let isStore = 1, 
AddedComplexity = 10 in { +def PICSTR : AXI2<(ops GPR:$src, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tstr$p $src, $addr", + [(store GPR:$src, addrmodepc:$addr)]>; + +def PICSTRH : AXI3<(ops GPR:$src, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tstr${p}h $src, $addr", + [(truncstorei16 GPR:$src, addrmodepc:$addr)]>; + +def PICSTRB : AXI2<(ops GPR:$src, addrmodepc:$addr, pred:$p), + "${addr:label}:\n\tstr${p}b $src, $addr", + [(truncstorei8 GPR:$src, addrmodepc:$addr)]>; +} +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +let isReturn = 1, isTerminator = 1 in + def BX_RET : AI<(ops), "bx", " lr", [(ARMretflag)]>; + +// FIXME: remove when we have a way to marking a MI with these properties. +let isLoad = 1, isReturn = 1, isTerminator = 1 in + def LDM_RET : AXI4<(ops addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops), + "ldm${p}${addr:submode} $addr, $dst1", + []>; + +let isCall = 1, noResults = 1, + Defs = [R0, R1, R2, R3, R12, LR, + D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in { + def BL : AXI<(ops i32imm:$func, variable_ops), + "bl ${func:call}", + [(ARMcall tglobaladdr:$func)]>; + + def BL_pred : AI<(ops i32imm:$func, variable_ops), + "bl", " ${func:call}", + [(ARMcall_pred tglobaladdr:$func)]>; + + // ARMv5T and above + def BLX : AXI<(ops GPR:$dst, variable_ops), + "blx $dst", + [(ARMcall GPR:$dst)]>, Requires<[IsARM, HasV5T]>; + let Uses = [LR] in { + // ARMv4T + def BX : AXIx2<(ops GPR:$dst, variable_ops), + "mov lr, pc\n\tbx $dst", + [(ARMcall_nolink GPR:$dst)]>; + } +} + +let isBranch = 1, isTerminator = 1, noResults = 1 in { + // B is "predicable" since it can be xformed into a Bcc. 
+ let isBarrier = 1 in { + let isPredicable = 1 in + def B : AXI<(ops brtarget:$dst), "b $dst", + [(br bb:$dst)]>; + + let isNotDuplicable = 1 in { + def BR_JTr : JTI<(ops GPR:$dst, jtblock_operand:$jt, i32imm:$id), + "mov pc, $dst \n$jt", + [(ARMbrjt GPR:$dst, tjumptable:$jt, imm:$id)]>; + def BR_JTm : JTI2<(ops addrmode2:$dst, jtblock_operand:$jt, i32imm:$id), + "ldr pc, $dst \n$jt", + [(ARMbrjt (i32 (load addrmode2:$dst)), tjumptable:$jt, + imm:$id)]>; + def BR_JTadd : JTI1<(ops GPR:$dst, GPR:$idx, jtblock_operand:$jt, i32imm:$id), + "add pc, $dst, $idx \n$jt", + [(ARMbrjt (add GPR:$dst, GPR:$idx), tjumptable:$jt, + imm:$id)]>; + } + } + + // FIXME: should be able to write a pattern for ARMBrcond, but can't use + // a two-value operand where a dag node expects two operands. :( + def Bcc : AI<(ops brtarget:$dst), "b", " $dst", + [/*(ARMbrcond bb:$dst, imm:$cc, CCR:$ccr)*/]>; +} + +//===----------------------------------------------------------------------===// +// Load / store Instructions. +// + +// Load +let isLoad = 1 in { +def LDR : AI2<(ops GPR:$dst, addrmode2:$addr), + "ldr", " $dst, $addr", + [(set GPR:$dst, (load addrmode2:$addr))]>; + +// Special LDR for loads from non-pc-relative constpools. 
+let isReMaterializable = 1 in +def LDRcp : AI2<(ops GPR:$dst, addrmode2:$addr), + "ldr", " $dst, $addr", []>; + +// Loads with zero extension +def LDRH : AI3<(ops GPR:$dst, addrmode3:$addr), + "ldr", "h $dst, $addr", + [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>; + +def LDRB : AI2<(ops GPR:$dst, addrmode2:$addr), + "ldr", "b $dst, $addr", + [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>; + +// Loads with sign extension +def LDRSH : AI3<(ops GPR:$dst, addrmode3:$addr), + "ldr", "sh $dst, $addr", + [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>; + +def LDRSB : AI3<(ops GPR:$dst, addrmode3:$addr), + "ldr", "sb $dst, $addr", + [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>; + +// Load doubleword +def LDRD : AI3<(ops GPR:$dst, addrmode3:$addr), + "ldr", "d $dst, $addr", + []>, Requires<[IsARM, HasV5T]>; + +// Indexed loads +def LDR_PRE : AI2pr<(ops GPR:$dst, GPR:$base_wb, addrmode2:$addr), + "ldr", " $dst, $addr!", "$addr.base = $base_wb", []>; + +def LDR_POST : AI2po<(ops GPR:$dst, GPR:$base_wb, GPR:$base, am2offset:$offset), + "ldr", " $dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRH_PRE : AI3pr<(ops GPR:$dst, GPR:$base_wb, addrmode3:$addr), + "ldr", "h $dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRH_POST : AI3po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am3offset:$offset), + "ldr", "h $dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRB_PRE : AI2pr<(ops GPR:$dst, GPR:$base_wb, addrmode2:$addr), + "ldr", "b $dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRB_POST : AI2po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am2offset:$offset), + "ldr", "b $dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRSH_PRE : AI3pr<(ops GPR:$dst, GPR:$base_wb, addrmode3:$addr), + "ldr", "sh $dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRSH_POST: AI3po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am3offset:$offset), + "ldr", "sh $dst, [$base], $offset", "$base = $base_wb", []>; + +def LDRSB_PRE : AI3pr<(ops GPR:$dst, GPR:$base_wb, 
addrmode3:$addr), + "ldr", "sb $dst, $addr!", "$addr.base = $base_wb", []>; + +def LDRSB_POST: AI3po<(ops GPR:$dst, GPR:$base_wb, GPR:$base,am3offset:$offset), + "ldr", "sb $dst, [$base], $offset", "$base = $base_wb", []>; +} // isLoad + +// Store +let isStore = 1 in { +def STR : AI2<(ops GPR:$src, addrmode2:$addr), + "str", " $src, $addr", + [(store GPR:$src, addrmode2:$addr)]>; + +// Stores with truncate +def STRH : AI3<(ops GPR:$src, addrmode3:$addr), + "str", "h $src, $addr", + [(truncstorei16 GPR:$src, addrmode3:$addr)]>; + +def STRB : AI2<(ops GPR:$src, addrmode2:$addr), + "str", "b $src, $addr", + [(truncstorei8 GPR:$src, addrmode2:$addr)]>; + +// Store doubleword +def STRD : AI3<(ops GPR:$src, addrmode3:$addr), + "str", "d $src, $addr", + []>, Requires<[IsARM, HasV5T]>; + +// Indexed stores +def STR_PRE : AI2pr<(ops GPR:$base_wb, GPR:$src, GPR:$base, am2offset:$offset), + "str", " $src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, + (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>; + +def STR_POST : AI2po<(ops GPR:$base_wb, GPR:$src, GPR:$base,am2offset:$offset), + "str", " $src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, + (post_store GPR:$src, GPR:$base, am2offset:$offset))]>; + +def STRH_PRE : AI3pr<(ops GPR:$base_wb, GPR:$src, GPR:$base,am3offset:$offset), + "str", "h $src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, + (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>; + +def STRH_POST: AI3po<(ops GPR:$base_wb, GPR:$src, GPR:$base,am3offset:$offset), + "str", "h $src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, (post_truncsti16 GPR:$src, + GPR:$base, am3offset:$offset))]>; + +def STRB_PRE : AI2pr<(ops GPR:$base_wb, GPR:$src, GPR:$base,am2offset:$offset), + "str", "b $src, [$base, $offset]!", "$base = $base_wb", + [(set GPR:$base_wb, (pre_truncsti8 GPR:$src, + GPR:$base, am2offset:$offset))]>; + +def STRB_POST: AI2po<(ops GPR:$base_wb, GPR:$src, 
GPR:$base,am2offset:$offset), + "str", "b $src, [$base], $offset", "$base = $base_wb", + [(set GPR:$base_wb, (post_truncsti8 GPR:$src, + GPR:$base, am2offset:$offset))]>; +} // isStore + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +let isLoad = 1 in +def LDM : AXI4<(ops addrmode4:$addr, pred:$p, reglist:$dst1, variable_ops), + "ldm${p}${addr:submode} $addr, $dst1", + []>; + +let isStore = 1 in +def STM : AXI4<(ops addrmode4:$addr, pred:$p, reglist:$src1, variable_ops), + "stm${p}${addr:submode} $addr, $src1", + []>; + +//===----------------------------------------------------------------------===// +// Move Instructions. +// + +def MOVr : AsI1<(ops GPR:$dst, GPR:$src), + "mov", " $dst, $src", []>; +def MOVs : AsI1<(ops GPR:$dst, so_reg:$src), + "mov", " $dst, $src", [(set GPR:$dst, so_reg:$src)]>; + +let isReMaterializable = 1 in +def MOVi : AsI1<(ops GPR:$dst, so_imm:$src), + "mov", " $dst, $src", [(set GPR:$dst, so_imm:$src)]>; + +def MOVrx : AsI1<(ops GPR:$dst, GPR:$src), + "mov", " $dst, $src, rrx", + [(set GPR:$dst, (ARMrrx GPR:$src))]>; + +// These aren't really mov instructions, but we have to define them this way +// due to flag operands. + +def MOVsrl_flag : AI1<(ops GPR:$dst, GPR:$src), + "mov", "s $dst, $src, lsr #1", + [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, Imp<[], [CPSR]>; +def MOVsra_flag : AI1<(ops GPR:$dst, GPR:$src), + "mov", "s $dst, $src, asr #1", + [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, Imp<[], [CPSR]>; + +//===----------------------------------------------------------------------===// +// Extend Instructions. 
+// + +// Sign extenders + +defm SXTB : AI_unary_rrot<"sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>; +defm SXTH : AI_unary_rrot<"sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>; + +defm SXTAB : AI_bin_rrot<"sxtab", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>; +defm SXTAH : AI_bin_rrot<"sxtah", + BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>; + +// TODO: SXT(A){B|H}16 + +// Zero extenders + +let AddedComplexity = 16 in { +defm UXTB : AI_unary_rrot<"uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>; +defm UXTH : AI_unary_rrot<"uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>; +defm UXTB16 : AI_unary_rrot<"uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>; + +def : ARMV6Pat<(and (shl GPR:$Src, 8), 0xFF00FF), + (UXTB16r_rot GPR:$Src, 24)>; +def : ARMV6Pat<(and (srl GPR:$Src, 8), 0xFF00FF), + (UXTB16r_rot GPR:$Src, 8)>; + +defm UXTAB : AI_bin_rrot<"uxtab", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>; +defm UXTAH : AI_bin_rrot<"uxtah", + BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>; +} + +// This isn't safe in general, the add is two 16-bit units, not a 32-bit add. +//defm UXTAB16 : xxx<"uxtab16", 0xff00ff>; + +// TODO: UXT(A){B|H}16 + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +defm ADD : AsI1_bin_irs<"add", BinOpFrag<(add node:$LHS, node:$RHS)>>; +defm SUB : AsI1_bin_irs<"sub", BinOpFrag<(sub node:$LHS, node:$RHS)>>; + +// ADD and SUB with 's' bit set. +defm ADDS : ASI1_bin_s_irs<"add", BinOpFrag<(addc node:$LHS, node:$RHS)>>; +defm SUBS : ASI1_bin_s_irs<"sub", BinOpFrag<(subc node:$LHS, node:$RHS)>>; + +// FIXME: Do not allow ADC / SBC to be predicated for now. +defm ADC : AsXI1_bin_c_irs<"adc", BinOpFrag<(adde node:$LHS, node:$RHS)>>; +defm SBC : AsXI1_bin_c_irs<"sbc", BinOpFrag<(sube node:$LHS, node:$RHS)>>; + +// These don't define reg/reg forms, because they are handled above. 
+def RSBri : AsI1<(ops GPR:$dst, GPR:$a, so_imm:$b), + "rsb", " $dst, $a, $b", + [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]>; + +def RSBrs : AsI1<(ops GPR:$dst, GPR:$a, so_reg:$b), + "rsb", " $dst, $a, $b", + [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]>; + +// RSB with 's' bit set. +def RSBSri : AI1<(ops GPR:$dst, GPR:$a, so_imm:$b), + "rsb", "s $dst, $a, $b", + [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]>, Imp<[], [CPSR]>; +def RSBSrs : AI1<(ops GPR:$dst, GPR:$a, so_reg:$b), + "rsb", "s $dst, $a, $b", + [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]>, Imp<[], [CPSR]>; + +// FIXME: Do not allow RSC to be predicated for now. But they can set CPSR. +def RSCri : AXI1<(ops GPR:$dst, GPR:$a, so_imm:$b, cc_out:$s), + "rsc${s} $dst, $a, $b", + [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>, Imp<[CPSR], []>; +def RSCrs : AXI1<(ops GPR:$dst, GPR:$a, so_reg:$b, cc_out:$s), + "rsc${s} $dst, $a, $b", + [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>, Imp<[CPSR], []>; + +// (sub X, imm) gets canonicalized to (add X, -imm). Match this form. +def : ARMPat<(add GPR:$src, so_imm_neg:$imm), + (SUBri GPR:$src, so_imm_neg:$imm)>; + +//def : ARMPat<(addc GPR:$src, so_imm_neg:$imm), +// (SUBSri GPR:$src, so_imm_neg:$imm)>; +//def : ARMPat<(adde GPR:$src, so_imm_neg:$imm), +// (SBCri GPR:$src, so_imm_neg:$imm)>; + +// Note: These are implemented in C++ code, because they have to generate +// ADD/SUBrs instructions, which use a complex pattern that a xform function +// cannot produce. +// (mul X, 2^n+1) -> (add (X << n), X) +// (mul X, 2^n-1) -> (rsb X, (X << n)) + + +//===----------------------------------------------------------------------===// +// Bitwise Instructions. 
+// + +defm AND : AsI1_bin_irs<"and", BinOpFrag<(and node:$LHS, node:$RHS)>>; +defm ORR : AsI1_bin_irs<"orr", BinOpFrag<(or node:$LHS, node:$RHS)>>; +defm EOR : AsI1_bin_irs<"eor", BinOpFrag<(xor node:$LHS, node:$RHS)>>; +defm BIC : AsI1_bin_irs<"bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>; + +def MVNr : AsI<(ops GPR:$dst, GPR:$src), + "mvn", " $dst, $src", [(set GPR:$dst, (not GPR:$src))]>; +def MVNs : AsI<(ops GPR:$dst, so_reg:$src), + "mvn", " $dst, $src", [(set GPR:$dst, (not so_reg:$src))]>; +let isReMaterializable = 1 in +def MVNi : AsI<(ops GPR:$dst, so_imm:$imm), + "mvn", " $dst, $imm", [(set GPR:$dst, so_imm_not:$imm)]>; + +def : ARMPat<(and GPR:$src, so_imm_not:$imm), + (BICri GPR:$src, so_imm_not:$imm)>; + +//===----------------------------------------------------------------------===// +// Multiply Instructions. +// + +def MUL : AsI<(ops GPR:$dst, GPR:$a, GPR:$b), + "mul", " $dst, $a, $b", + [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>; + +def MLA : AsI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$c), + "mla", " $dst, $a, $b, $c", + [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>; + +// Extra precision multiplies with low / high results +def SMULL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b), + "smull", " $ldst, $hdst, $a, $b", []>; + +def UMULL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b), + "umull", " $ldst, $hdst, $a, $b", []>; + +// Multiply + accumulate +def SMLAL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b), + "smlal", " $ldst, $hdst, $a, $b", []>; + +def UMLAL : AsI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b), + "umlal", " $ldst, $hdst, $a, $b", []>; + +def UMAAL : AI<(ops GPR:$ldst, GPR:$hdst, GPR:$a, GPR:$b), + "umaal", " $ldst, $hdst, $a, $b", []>, + Requires<[IsARM, HasV6]>; + +// Most significant word multiply +def SMMUL : AI<(ops GPR:$dst, GPR:$a, GPR:$b), + "smmul", " $dst, $a, $b", + [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>, + Requires<[IsARM, HasV6]>; + +def SMMLA : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$c), + "smmla", " $dst, 
$a, $b, $c", + [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>, + Requires<[IsARM, HasV6]>; + + +def SMMLS : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$c), + "smmls", " $dst, $a, $b, $c", + [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>, + Requires<[IsARM, HasV6]>; + +multiclass AI_smul<string opc, PatFrag opnode> { + def BB : AI<(ops GPR:$dst, GPR:$a, GPR:$b), + !strconcat(opc, "bb"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sext_inreg GPR:$b, i16)))]>, + Requires<[IsARM, HasV5TE]>; + def BT : AI<(ops GPR:$dst, GPR:$a, GPR:$b), + !strconcat(opc, "bt"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16), + (sra GPR:$b, 16)))]>, + Requires<[IsARM, HasV5TE]>; + def TB : AI<(ops GPR:$dst, GPR:$a, GPR:$b), + !strconcat(opc, "tb"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, 16), + (sext_inreg GPR:$b, i16)))]>, + Requires<[IsARM, HasV5TE]>; + def TT : AI<(ops GPR:$dst, GPR:$a, GPR:$b), + !strconcat(opc, "tt"), " $dst, $a, $b", + [(set GPR:$dst, (opnode (sra GPR:$a, 16), + (sra GPR:$b, 16)))]>, + Requires<[IsARM, HasV5TE]>; + def WB : AI<(ops GPR:$dst, GPR:$a, GPR:$b), + !strconcat(opc, "wb"), " $dst, $a, $b", + [(set GPR:$dst, (sra (opnode GPR:$a, + (sext_inreg GPR:$b, i16)), 16))]>, + Requires<[IsARM, HasV5TE]>; + def WT : AI<(ops GPR:$dst, GPR:$a, GPR:$b), + !strconcat(opc, "wt"), " $dst, $a, $b", + [(set GPR:$dst, (sra (opnode GPR:$a, + (sra GPR:$b, 16)), 16))]>, + Requires<[IsARM, HasV5TE]>; +} + +multiclass AI_smla<string opc, PatFrag opnode> { + def BB : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc), + !strconcat(opc, "bb"), " $dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, + (opnode (sext_inreg GPR:$a, i16), + (sext_inreg GPR:$b, i16))))]>, + Requires<[IsARM, HasV5TE]>; + def BT : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc), + !strconcat(opc, "bt"), " $dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16), + (sra GPR:$b, 16))))]>, + Requires<[IsARM, 
HasV5TE]>; + def TB : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc), + !strconcat(opc, "tb"), " $dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16), + (sext_inreg GPR:$b, i16))))]>, + Requires<[IsARM, HasV5TE]>; + def TT : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc), + !strconcat(opc, "tt"), " $dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16), + (sra GPR:$b, 16))))]>, + Requires<[IsARM, HasV5TE]>; + + def WB : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc), + !strconcat(opc, "wb"), " $dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, + (sext_inreg GPR:$b, i16)), 16)))]>, + Requires<[IsARM, HasV5TE]>; + def WT : AI<(ops GPR:$dst, GPR:$a, GPR:$b, GPR:$acc), + !strconcat(opc, "wt"), " $dst, $a, $b, $acc", + [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a, + (sra GPR:$b, 16)), 16)))]>, + Requires<[IsARM, HasV5TE]>; +} + +defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; +defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; + +// TODO: Halfword multiple accumulate long: SMLAL<x><y> +// TODO: Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD + +//===----------------------------------------------------------------------===// +// Misc. Arithmetic Instructions. 
+// + +def CLZ : AI<(ops GPR:$dst, GPR:$src), + "clz", " $dst, $src", + [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]>; + +def REV : AI<(ops GPR:$dst, GPR:$src), + "rev", " $dst, $src", + [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]>; + +def REV16 : AI<(ops GPR:$dst, GPR:$src), + "rev16", " $dst, $src", + [(set GPR:$dst, + (or (and (srl GPR:$src, 8), 0xFF), + (or (and (shl GPR:$src, 8), 0xFF00), + (or (and (srl GPR:$src, 8), 0xFF0000), + (and (shl GPR:$src, 8), 0xFF000000)))))]>, + Requires<[IsARM, HasV6]>; + +def REVSH : AI<(ops GPR:$dst, GPR:$src), + "revsh", " $dst, $src", + [(set GPR:$dst, + (sext_inreg + (or (srl (and GPR:$src, 0xFF00), 8), + (shl GPR:$src, 8)), i16))]>, + Requires<[IsARM, HasV6]>; + +def PKHBT : AI<(ops GPR:$dst, GPR:$src1, GPR:$src2, i32imm:$shamt), + "pkhbt", " $dst, $src1, $src2, LSL $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF), + (and (shl GPR:$src2, (i32 imm:$shamt)), + 0xFFFF0000)))]>, + Requires<[IsARM, HasV6]>; + +// Alternate cases for PKHBT where identities eliminate some nodes. +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)), + (PKHBT GPR:$src1, GPR:$src2, 0)>; +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)), + (PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>; + + +def PKHTB : AI<(ops GPR:$dst, GPR:$src1, GPR:$src2, i32imm:$shamt), + "pkhtb", " $dst, $src1, $src2, ASR $shamt", + [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000), + (and (sra GPR:$src2, imm16_31:$shamt), + 0xFFFF)))]>, Requires<[IsARM, HasV6]>; + +// Alternate cases for PKHTB where identities eliminate some nodes. Note that +// a shift amount of 0 is *not legal* here, it is PKHBT instead. 
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, 16)), + (PKHTB GPR:$src1, GPR:$src2, 16)>; +def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), + (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)), + (PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>; + + +//===----------------------------------------------------------------------===// +// Comparison Instructions... +// + +defm CMP : AI1_cmp_irs<"cmp", BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>; +defm CMN : AI1_cmp_irs<"cmn", BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>; + +// Note that TST/TEQ don't set all the same flags that CMP does! +defm TST : AI1_cmp_irs<"tst", BinOpFrag<(ARMcmpNZ (and node:$LHS, node:$RHS), 0)>>; +defm TEQ : AI1_cmp_irs<"teq", BinOpFrag<(ARMcmpNZ (xor node:$LHS, node:$RHS), 0)>>; + +defm CMPnz : AI1_cmp_irs<"cmp", BinOpFrag<(ARMcmpNZ node:$LHS, node:$RHS)>>; +defm CMNnz : AI1_cmp_irs<"cmn", BinOpFrag<(ARMcmpNZ node:$LHS,(ineg node:$RHS))>>; + +def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm), + (CMNri GPR:$src, so_imm_neg:$imm)>; + +def : ARMPat<(ARMcmpNZ GPR:$src, so_imm_neg:$imm), + (CMNri GPR:$src, so_imm_neg:$imm)>; + + +// Conditional moves +// FIXME: should be able to write a pattern for ARMcmov, but can't use +// a two-value operand where a dag node expects two operands. :( +def MOVCCr : AI<(ops GPR:$dst, GPR:$false, GPR:$true), + "mov", " $dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">; + +def MOVCCs : AI<(ops GPR:$dst, GPR:$false, so_reg:$true), + "mov", " $dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">; + +def MOVCCi : AI<(ops GPR:$dst, GPR:$false, so_imm:$true), + "mov", " $dst, $true", + [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>, + RegConstraint<"$false = $dst">; + + +// LEApcrel - Load a pc-relative address into a register without offending the +// assembler. 
+def LEApcrel : AXI1<(ops GPR:$dst, i32imm:$label, pred:$p), + !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(", + "${:private}PCRELL${:uid}+8))\n"), + !strconcat("${:private}PCRELL${:uid}:\n\t", + "add$p $dst, pc, #PCRELV${:uid}")), + []>; + +def LEApcrelJT : AXI1<(ops GPR:$dst, i32imm:$label, i32imm:$id, pred:$p), + !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(", + "${:private}PCRELL${:uid}+8))\n"), + !strconcat("${:private}PCRELL${:uid}:\n\t", + "add$p $dst, pc, #PCRELV${:uid}")), + []>; + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +let isCall = 1, + Defs = [R0, R12, LR, CPSR] in { + def TPsoft : AXI<(ops), + "bl __aeabi_read_tp", + [(set R0, ARMthread_pointer)]>; +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// ConstantPool, GlobalAddress, and JumpTable +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>; +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (LEApcrelJT tjumptable:$dst, imm:$id)>; + +// Large immediate handling. + +// Two piece so_imms. +let isReMaterializable = 1 in +def MOVi2pieces : AI1x2<(ops GPR:$dst, so_imm2part:$src), + "mov", " $dst, $src", + [(set GPR:$dst, so_imm2part:$src)]>; + +def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS), + (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)), + (so_imm2part_2 imm:$RHS))>; +def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS), + (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)), + (so_imm2part_2 imm:$RHS))>; + +// TODO: add,sub,and, 3-instr forms? 
+ + +// Direct calls +def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>; + +// zextload i1 -> zextload i8 +def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>; + +// extload -> zextload +def : ARMPat<(extloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>; +def : ARMPat<(extloadi8 addrmode2:$addr), (LDRB addrmode2:$addr)>; +def : ARMPat<(extloadi16 addrmode3:$addr), (LDRH addrmode3:$addr)>; + +// truncstore i1 -> truncstore i8 +def : ARMPat<(truncstorei1 GPR:$src, addrmode2:$dst), + (STRB GPR:$src, addrmode2:$dst)>; +def : ARMPat<(pre_truncsti1 GPR:$src, GPR:$base, am2offset:$offset), + (STRB_PRE GPR:$src, GPR:$base, am2offset:$offset)>; +def : ARMPat<(post_truncsti1 GPR:$src, GPR:$base, am2offset:$offset), + (STRB_POST GPR:$src, GPR:$base, am2offset:$offset)>; + +// smul* and smla* +def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra (shl GPR:$b, 16), 16)), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b), + (SMULBB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16)), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, 16)), + (SMULBT GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16)), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(mul (sra GPR:$a, 16), sext_16_node:$b), + (SMULTB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16), + (SMULWB GPR:$a, GPR:$b)>; +def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), 16), + (SMULWB GPR:$a, GPR:$b)>; + +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, 16), 16), + (sra (shl GPR:$b, 16), 16))), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul sext_16_node:$a, sext_16_node:$b)), + (SMLABB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul 
sext_16_node:$a, (sra GPR:$b, 16))), + (SMLABT GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16))), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (mul (sra GPR:$a, 16), sext_16_node:$b)), + (SMLATB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16)), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; +def : ARMV5TEPat<(add GPR:$acc, + (sra (mul GPR:$a, sext_16_node:$b), 16)), + (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>; + +//===----------------------------------------------------------------------===// +// Thumb Support +// + +include "ARMInstrThumb.td" + +//===----------------------------------------------------------------------===// +// Floating Point Support +// + +include "ARMInstrVFP.td" diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td new file mode 100644 index 0000000..27231da --- /dev/null +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -0,0 +1,596 @@ +//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Thumb instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Thumb specific DAG Nodes. +// + +def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + +// TI - Thumb instruction. + +// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode. 
+class ThumbPat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb]; +} + +class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [IsThumb, HasV5T]; +} + +class ThumbI<dag ops, AddrMode am, SizeFlagVal sz, + string asm, string cstr, list<dag> pattern> + // FIXME: Set all opcodes to 0 for now. + : InstARM<0, am, sz, IndexModeNone, cstr> { + let OperandList = ops; + let AsmString = asm; + let Pattern = pattern; + list<Predicate> Predicates = [IsThumb]; +} + +class TI<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeNone, Size2Bytes, asm, "", pattern>; +class TI1<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeT1, Size2Bytes, asm, "", pattern>; +class TI2<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeT2, Size2Bytes, asm, "", pattern>; +class TI4<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeT4, Size2Bytes, asm, "", pattern>; +class TIs<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeTs, Size2Bytes, asm, "", pattern>; + +// Two-address instructions +class TIt<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeNone, Size2Bytes, asm, "$lhs = $dst", pattern>; + +// BL, BLX(1) are translated by assembler into two instructions +class TIx2<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeNone, Size4Bytes, asm, "", pattern>; + +// BR_JT instructions +class TJTI<dag ops, string asm, list<dag> pattern> + : ThumbI<ops, AddrModeNone, SizeSpecial, asm, "", pattern>; + +def imm_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-(int)N->getValue(), MVT::i32); +}]>; +def imm_comp_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(~((uint32_t)N->getValue()), MVT::i32); +}]>; + + +/// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7]. 
+def imm0_7 : PatLeaf<(i32 imm), [{ + return (uint32_t)N->getValue() < 8; +}]>; +def imm0_7_neg : PatLeaf<(i32 imm), [{ + return (uint32_t)-N->getValue() < 8; +}], imm_neg_XFORM>; + +def imm0_255 : PatLeaf<(i32 imm), [{ + return (uint32_t)N->getValue() < 256; +}]>; +def imm0_255_comp : PatLeaf<(i32 imm), [{ + return ~((uint32_t)N->getValue()) < 256; +}]>; + +def imm8_255 : PatLeaf<(i32 imm), [{ + return (uint32_t)N->getValue() >= 8 && (uint32_t)N->getValue() < 256; +}]>; +def imm8_255_neg : PatLeaf<(i32 imm), [{ + unsigned Val = -N->getValue(); + return Val >= 8 && Val < 256; +}], imm_neg_XFORM>; + +// Break imm's up into two pieces: an immediate + a left shift. +// This uses thumb_immshifted to match and thumb_immshifted_val and +// thumb_immshifted_shamt to get the val/shift pieces. +def thumb_immshifted : PatLeaf<(imm), [{ + return ARM_AM::isThumbImmShiftedVal((unsigned)N->getValue()); +}]>; + +def thumb_immshifted_val : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +def thumb_immshifted_shamt : SDNodeXForm<imm, [{ + unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getValue()); + return CurDAG->getTargetConstant(V, MVT::i32); +}]>; + +// Define Thumb specific addressing modes. 
+ +// t_addrmode_rr := reg + reg +// +def t_addrmode_rr : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> { + let PrintMethod = "printThumbAddrModeRROperand"; + let MIOperandInfo = (ops GPR:$base, GPR:$offsreg); +} + +// t_addrmode_s4 := reg + reg +// reg + imm5 * 4 +// +def t_addrmode_s4 : Operand<i32>, + ComplexPattern<i32, 3, "SelectThumbAddrModeS4", []> { + let PrintMethod = "printThumbAddrModeS4Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm, GPR:$offsreg); +} + +// t_addrmode_s2 := reg + reg +// reg + imm5 * 2 +// +def t_addrmode_s2 : Operand<i32>, + ComplexPattern<i32, 3, "SelectThumbAddrModeS2", []> { + let PrintMethod = "printThumbAddrModeS2Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm, GPR:$offsreg); +} + +// t_addrmode_s1 := reg + reg +// reg + imm5 +// +def t_addrmode_s1 : Operand<i32>, + ComplexPattern<i32, 3, "SelectThumbAddrModeS1", []> { + let PrintMethod = "printThumbAddrModeS1Operand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm, GPR:$offsreg); +} + +// t_addrmode_sp := sp + imm8 * 4 +// +def t_addrmode_sp : Operand<i32>, + ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> { + let PrintMethod = "printThumbAddrModeSPOperand"; + let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm); +} + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +def tADJCALLSTACKUP : +PseudoInst<(ops i32imm:$amt), + "@ tADJCALLSTACKUP $amt", + [(ARMcallseq_end imm:$amt)]>, Imp<[SP],[SP]>, Requires<[IsThumb]>; + +def tADJCALLSTACKDOWN : +PseudoInst<(ops i32imm:$amt), + "@ tADJCALLSTACKDOWN $amt", + [(ARMcallseq_start imm:$amt)]>, Imp<[SP],[SP]>, Requires<[IsThumb]>; + +let isNotDuplicable = 1 in +def tPICADD : TIt<(ops GPR:$dst, GPR:$lhs, pclabel:$cp), + "$cp:\n\tadd $dst, pc", + [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>; + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. 
+// + +let isReturn = 1, isTerminator = 1 in { + def tBX_RET : TI<(ops), "bx lr", [(ARMretflag)]>; + // Alternative return instruction used by vararg functions. + def tBX_RET_vararg : TI<(ops GPR:$dst), "bx $dst", []>; +} + +// FIXME: remove when we have a way to marking a MI with these properties. +let isLoad = 1, isReturn = 1, isTerminator = 1 in +def tPOP_RET : TI<(ops reglist:$dst1, variable_ops), + "pop $dst1", []>; + +let isCall = 1, noResults = 1, + Defs = [R0, R1, R2, R3, LR, + D0, D1, D2, D3, D4, D5, D6, D7] in { + def tBL : TIx2<(ops i32imm:$func, variable_ops), + "bl ${func:call}", + [(ARMtcall tglobaladdr:$func)]>; + // ARMv5T and above + def tBLXi : TIx2<(ops i32imm:$func, variable_ops), + "blx ${func:call}", + [(ARMcall tglobaladdr:$func)]>, Requires<[HasV5T]>; + def tBLXr : TI<(ops GPR:$dst, variable_ops), + "blx $dst", + [(ARMtcall GPR:$dst)]>, Requires<[HasV5T]>; + // ARMv4T + def tBX : TIx2<(ops GPR:$dst, variable_ops), + "cpy lr, pc\n\tbx $dst", + [(ARMcall_nolink GPR:$dst)]>; +} + +let isBranch = 1, isTerminator = 1, noResults = 1 in { + let isBarrier = 1 in { + let isPredicable = 1 in + def tB : TI<(ops brtarget:$dst), "b $dst", [(br bb:$dst)]>; + + // Far jump + def tBfar : TIx2<(ops brtarget:$dst), "bl $dst\t@ far jump", []>; + + def tBR_JTr : TJTI<(ops GPR:$dst, jtblock_operand:$jt, i32imm:$id), + "cpy pc, $dst \n\t.align\t2\n$jt", + [(ARMbrjt GPR:$dst, tjumptable:$jt, imm:$id)]>; + } +} + +// FIXME: should be able to write a pattern for ARMBrcond, but can't use +// a two-value operand where a dag node expects two operands. :( +let isBranch = 1, isTerminator = 1, noResults = 1 in + def tBcc : TI<(ops brtarget:$dst, pred:$cc), "b$cc $dst", + [/*(ARMbrcond bb:$dst, imm:$cc)*/]>; + +//===----------------------------------------------------------------------===// +// Load Store Instructions. 
+// + +let isLoad = 1 in { +def tLDR : TI4<(ops GPR:$dst, t_addrmode_s4:$addr), + "ldr $dst, $addr", + [(set GPR:$dst, (load t_addrmode_s4:$addr))]>; + +def tLDRB : TI1<(ops GPR:$dst, t_addrmode_s1:$addr), + "ldrb $dst, $addr", + [(set GPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>; + +def tLDRH : TI2<(ops GPR:$dst, t_addrmode_s2:$addr), + "ldrh $dst, $addr", + [(set GPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>; + +def tLDRSB : TI1<(ops GPR:$dst, t_addrmode_rr:$addr), + "ldrsb $dst, $addr", + [(set GPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>; + +def tLDRSH : TI2<(ops GPR:$dst, t_addrmode_rr:$addr), + "ldrsh $dst, $addr", + [(set GPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>; + +def tLDRspi : TIs<(ops GPR:$dst, t_addrmode_sp:$addr), + "ldr $dst, $addr", + [(set GPR:$dst, (load t_addrmode_sp:$addr))]>; + +// Special instruction for restore. It cannot clobber condition register +// when it's expanded by eliminateCallFramePseudoInstr(). +def tRestore : TIs<(ops GPR:$dst, t_addrmode_sp:$addr), + "ldr $dst, $addr", []>; + +// Load tconstpool +def tLDRpci : TIs<(ops GPR:$dst, i32imm:$addr), + "ldr $dst, $addr", + [(set GPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>; + +// Special LDR for loads from non-pc-relative constpools. +let isReMaterializable = 1 in +def tLDRcp : TIs<(ops GPR:$dst, i32imm:$addr), + "ldr $dst, $addr", []>; +} // isLoad + +let isStore = 1 in { +def tSTR : TI4<(ops GPR:$src, t_addrmode_s4:$addr), + "str $src, $addr", + [(store GPR:$src, t_addrmode_s4:$addr)]>; + +def tSTRB : TI1<(ops GPR:$src, t_addrmode_s1:$addr), + "strb $src, $addr", + [(truncstorei8 GPR:$src, t_addrmode_s1:$addr)]>; + +def tSTRH : TI2<(ops GPR:$src, t_addrmode_s2:$addr), + "strh $src, $addr", + [(truncstorei16 GPR:$src, t_addrmode_s2:$addr)]>; + +def tSTRspi : TIs<(ops GPR:$src, t_addrmode_sp:$addr), + "str $src, $addr", + [(store GPR:$src, t_addrmode_sp:$addr)]>; + +// Special instruction for spill. 
It cannot clobber condition register +// when it's expanded by eliminateCallFramePseudoInstr(). +def tSpill : TIs<(ops GPR:$src, t_addrmode_sp:$addr), + "str $src, $addr", []>; +} + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +// TODO: A7-44: LDMIA - load multiple + +let isLoad = 1 in +def tPOP : TI<(ops reglist:$dst1, variable_ops), + "pop $dst1", []>; + +let isStore = 1 in +def tPUSH : TI<(ops reglist:$src1, variable_ops), + "push $src1", []>; + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions. +// + +// Add with carry +def tADC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "adc $dst, $rhs", + [(set GPR:$dst, (adde GPR:$lhs, GPR:$rhs))]>; + +def tADDS : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "add $dst, $lhs, $rhs", + [(set GPR:$dst, (addc GPR:$lhs, GPR:$rhs))]>; + + +def tADDi3 : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "add $dst, $lhs, $rhs", + [(set GPR:$dst, (add GPR:$lhs, imm0_7:$rhs))]>; + +def tADDi8 : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "add $dst, $rhs", + [(set GPR:$dst, (add GPR:$lhs, imm8_255:$rhs))]>; + +def tADDrr : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "add $dst, $lhs, $rhs", + [(set GPR:$dst, (add GPR:$lhs, GPR:$rhs))]>; + +def tADDhirr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "add $dst, $rhs", []>; + +def tADDrPCi : TI<(ops GPR:$dst, i32imm:$rhs), + "add $dst, pc, $rhs * 4", []>; +def tADDrSPi : TI<(ops GPR:$dst, GPR:$sp, i32imm:$rhs), + "add $dst, $sp, $rhs * 4", []>; +def tADDspi : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "add $dst, $rhs * 4", []>; + +def tAND : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "and $dst, $rhs", + [(set GPR:$dst, (and GPR:$lhs, GPR:$rhs))]>; + +def tASRri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "asr $dst, $lhs, $rhs", + [(set GPR:$dst, (sra GPR:$lhs, imm:$rhs))]>; + +def tASRrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "asr $dst, $rhs", + [(set 
GPR:$dst, (sra GPR:$lhs, GPR:$rhs))]>; + +def tBIC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "bic $dst, $rhs", + [(set GPR:$dst, (and GPR:$lhs, (not GPR:$rhs)))]>; + + +def tCMN : TI<(ops GPR:$lhs, GPR:$rhs), + "cmn $lhs, $rhs", + [(ARMcmp GPR:$lhs, (ineg GPR:$rhs))]>; + +def tCMPi8 : TI<(ops GPR:$lhs, i32imm:$rhs), + "cmp $lhs, $rhs", + [(ARMcmp GPR:$lhs, imm0_255:$rhs)]>; + +def tCMPr : TI<(ops GPR:$lhs, GPR:$rhs), + "cmp $lhs, $rhs", + [(ARMcmp GPR:$lhs, GPR:$rhs)]>; + +def tTST : TI<(ops GPR:$lhs, GPR:$rhs), + "tst $lhs, $rhs", + [(ARMcmpNZ (and GPR:$lhs, GPR:$rhs), 0)]>; + +def tCMNNZ : TI<(ops GPR:$lhs, GPR:$rhs), + "cmn $lhs, $rhs", + [(ARMcmpNZ GPR:$lhs, (ineg GPR:$rhs))]>; + +def tCMPNZi8 : TI<(ops GPR:$lhs, i32imm:$rhs), + "cmp $lhs, $rhs", + [(ARMcmpNZ GPR:$lhs, imm0_255:$rhs)]>; + +def tCMPNZr : TI<(ops GPR:$lhs, GPR:$rhs), + "cmp $lhs, $rhs", + [(ARMcmpNZ GPR:$lhs, GPR:$rhs)]>; + +// TODO: A7-37: CMP(3) - cmp hi regs + +def tEOR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "eor $dst, $rhs", + [(set GPR:$dst, (xor GPR:$lhs, GPR:$rhs))]>; + +def tLSLri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "lsl $dst, $lhs, $rhs", + [(set GPR:$dst, (shl GPR:$lhs, imm:$rhs))]>; + +def tLSLrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "lsl $dst, $rhs", + [(set GPR:$dst, (shl GPR:$lhs, GPR:$rhs))]>; + +def tLSRri : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "lsr $dst, $lhs, $rhs", + [(set GPR:$dst, (srl GPR:$lhs, imm:$rhs))]>; + +def tLSRrr : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "lsr $dst, $rhs", + [(set GPR:$dst, (srl GPR:$lhs, GPR:$rhs))]>; + +// FIXME: This is not rematerializable because mov changes the condition code. +def tMOVi8 : TI<(ops GPR:$dst, i32imm:$src), + "mov $dst, $src", + [(set GPR:$dst, imm0_255:$src)]>; + +// TODO: A7-73: MOV(2) - mov setting flag. + + +// Note: MOV(2) of two low regs updates the flags, so we emit this as 'cpy', +// which is MOV(3). This also supports high registers. 
+def tMOVr : TI<(ops GPR:$dst, GPR:$src), + "cpy $dst, $src", []>; + +def tMUL : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "mul $dst, $rhs", + [(set GPR:$dst, (mul GPR:$lhs, GPR:$rhs))]>; + +def tMVN : TI<(ops GPR:$dst, GPR:$src), + "mvn $dst, $src", + [(set GPR:$dst, (not GPR:$src))]>; + +def tNEG : TI<(ops GPR:$dst, GPR:$src), + "neg $dst, $src", + [(set GPR:$dst, (ineg GPR:$src))]>; + +def tORR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "orr $dst, $rhs", + [(set GPR:$dst, (or GPR:$lhs, GPR:$rhs))]>; + + +def tREV : TI<(ops GPR:$dst, GPR:$src), + "rev $dst, $src", + [(set GPR:$dst, (bswap GPR:$src))]>, + Requires<[IsThumb, HasV6]>; + +def tREV16 : TI<(ops GPR:$dst, GPR:$src), + "rev16 $dst, $src", + [(set GPR:$dst, + (or (and (srl GPR:$src, 8), 0xFF), + (or (and (shl GPR:$src, 8), 0xFF00), + (or (and (srl GPR:$src, 8), 0xFF0000), + (and (shl GPR:$src, 8), 0xFF000000)))))]>, + Requires<[IsThumb, HasV6]>; + +def tREVSH : TI<(ops GPR:$dst, GPR:$src), + "revsh $dst, $src", + [(set GPR:$dst, + (sext_inreg + (or (srl (and GPR:$src, 0xFFFF), 8), + (shl GPR:$src, 8)), i16))]>, + Requires<[IsThumb, HasV6]>; + +def tROR : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "ror $dst, $rhs", + [(set GPR:$dst, (rotr GPR:$lhs, GPR:$rhs))]>; + + +// Subtract with carry +def tSBC : TIt<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "sbc $dst, $rhs", + [(set GPR:$dst, (sube GPR:$lhs, GPR:$rhs))]>; + +def tSUBS : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "sub $dst, $lhs, $rhs", + [(set GPR:$dst, (subc GPR:$lhs, GPR:$rhs))]>; + + +// TODO: A7-96: STMIA - store multiple. 
+ +def tSUBi3 : TI<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "sub $dst, $lhs, $rhs", + [(set GPR:$dst, (add GPR:$lhs, imm0_7_neg:$rhs))]>; + +def tSUBi8 : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "sub $dst, $rhs", + [(set GPR:$dst, (add GPR:$lhs, imm8_255_neg:$rhs))]>; + +def tSUBrr : TI<(ops GPR:$dst, GPR:$lhs, GPR:$rhs), + "sub $dst, $lhs, $rhs", + [(set GPR:$dst, (sub GPR:$lhs, GPR:$rhs))]>; + +def tSUBspi : TIt<(ops GPR:$dst, GPR:$lhs, i32imm:$rhs), + "sub $dst, $rhs * 4", []>; + +def tSXTB : TI<(ops GPR:$dst, GPR:$src), + "sxtb $dst, $src", + [(set GPR:$dst, (sext_inreg GPR:$src, i8))]>, + Requires<[IsThumb, HasV6]>; +def tSXTH : TI<(ops GPR:$dst, GPR:$src), + "sxth $dst, $src", + [(set GPR:$dst, (sext_inreg GPR:$src, i16))]>, + Requires<[IsThumb, HasV6]>; + + +def tUXTB : TI<(ops GPR:$dst, GPR:$src), + "uxtb $dst, $src", + [(set GPR:$dst, (and GPR:$src, 0xFF))]>, + Requires<[IsThumb, HasV6]>; +def tUXTH : TI<(ops GPR:$dst, GPR:$src), + "uxth $dst, $src", + [(set GPR:$dst, (and GPR:$src, 0xFFFF))]>, + Requires<[IsThumb, HasV6]>; + + +// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC DAG operation. +// Expanded by the scheduler into a branch sequence. +let usesCustomDAGSchedInserter = 1 in // Expanded by the scheduler. + def tMOVCCr : + PseudoInst<(ops GPR:$dst, GPR:$false, GPR:$true, pred:$cc), + "@ tMOVCCr $cc", + [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc))*/]>; + +// tLEApcrel - Load a pc-relative address into a register without offending the +// assembler. 
+def tLEApcrel : TIx2<(ops GPR:$dst, i32imm:$label), + !strconcat(!strconcat(".set PCRELV${:uid}, ($label-(", + "${:private}PCRELL${:uid}+4))\n"), + !strconcat("\tmov $dst, #PCRELV${:uid}\n", + "${:private}PCRELL${:uid}:\n\tadd $dst, pc")), + []>; + +def tLEApcrelJT : TIx2<(ops GPR:$dst, i32imm:$label, i32imm:$id), + !strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(", + "${:private}PCRELL${:uid}+4))\n"), + !strconcat("\tmov $dst, #PCRELV${:uid}\n", + "${:private}PCRELL${:uid}:\n\tadd $dst, pc")), + []>; + +//===----------------------------------------------------------------------===// +// TLS Instructions +// + +// __aeabi_read_tp preserves the registers r1-r3. +let isCall = 1, + Defs = [R0, LR] in { + def tTPsoft : TIx2<(ops), + "bl __aeabi_read_tp", + [(set R0, ARMthread_pointer)]>; +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +// + +// ConstantPool, GlobalAddress +def : ThumbPat<(ARMWrapper tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>; +def : ThumbPat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; + +// JumpTable +def : ThumbPat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (tLEApcrelJT tjumptable:$dst, imm:$id)>; + +// Direct calls +def : ThumbPat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>; +def : ThumbV5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>; + +// Indirect calls to ARM routines +def : ThumbV5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>; + +// zextload i1 -> zextload i8 +def : ThumbPat<(zextloadi1 t_addrmode_s1:$addr), + (tLDRB t_addrmode_s1:$addr)>; + +// extload -> zextload +def : ThumbPat<(extloadi1 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; +def : ThumbPat<(extloadi8 t_addrmode_s1:$addr), (tLDRB t_addrmode_s1:$addr)>; +def : ThumbPat<(extloadi16 t_addrmode_s2:$addr), (tLDRH t_addrmode_s2:$addr)>; + +// truncstore i1 -> truncstore i8 +def : ThumbPat<(truncstorei1 GPR:$src, t_addrmode_s1:$dst), + (tSTRB 
GPR:$src, t_addrmode_s1:$dst)>; + +// Large immediate handling. + +// Two piece imms. +def : ThumbPat<(i32 thumb_immshifted:$src), + (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)), + (thumb_immshifted_shamt imm:$src))>; + +def : ThumbPat<(i32 imm0_255_comp:$src), + (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td new file mode 100644 index 0000000..4bb9f04 --- /dev/null +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -0,0 +1,386 @@ +//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the ARM VFP instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM VFP Instruction templates. +// + +// ARM Float Instruction +class ASI<dag ops, string opc, string asm, list<dag> pattern> + : AI<ops, opc, asm, pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + +class ASI5<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrMode5, Size4Bytes, IndexModeNone, opc, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + +// ARM Double Instruction +class ADI<dag ops, string opc, string asm, list<dag> pattern> + : AI<ops, opc, asm, pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + +class ADI5<dag ops, string opc, string asm, list<dag> pattern> + : I<ops, AddrMode5, Size4Bytes, IndexModeNone, opc, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + +// Special cases. 
+class AXSI<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrModeNone, Size4Bytes, IndexModeNone, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + +class AXSI5<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode5, Size4Bytes, IndexModeNone, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + +class AXDI<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrModeNone, Size4Bytes, IndexModeNone, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + +class AXDI5<dag ops, string asm, list<dag> pattern> + : XI<ops, AddrMode5, Size4Bytes, IndexModeNone, asm, "", pattern> { + // TODO: Mark the instructions with the appropriate subtarget info. +} + + +def SDT_FTOI : +SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>; +def SDT_ITOF : +SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>; +def SDT_CMPFP0 : +SDTypeProfile<0, 1, [SDTCisFP<0>]>; +def SDT_FMDRR : +SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>]>; + +def arm_ftoui : SDNode<"ARMISD::FTOUI", SDT_FTOI>; +def arm_ftosi : SDNode<"ARMISD::FTOSI", SDT_FTOI>; +def arm_sitof : SDNode<"ARMISD::SITOF", SDT_ITOF>; +def arm_uitof : SDNode<"ARMISD::UITOF", SDT_ITOF>; +def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTRet, [SDNPInFlag,SDNPOutFlag]>; +def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutFlag]>; +def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutFlag]>; +def arm_fmdrr : SDNode<"ARMISD::FMDRR", SDT_FMDRR>; + +//===----------------------------------------------------------------------===// +// Load / store Instructions. 
+// + +let isLoad = 1 in { +def FLDD : ADI5<(ops DPR:$dst, addrmode5:$addr), + "fldd", " $dst, $addr", + [(set DPR:$dst, (load addrmode5:$addr))]>; + +def FLDS : ASI5<(ops SPR:$dst, addrmode5:$addr), + "flds", " $dst, $addr", + [(set SPR:$dst, (load addrmode5:$addr))]>; +} // isLoad + +let isStore = 1 in { +def FSTD : ADI5<(ops DPR:$src, addrmode5:$addr), + "fstd", " $src, $addr", + [(store DPR:$src, addrmode5:$addr)]>; + +def FSTS : ASI5<(ops SPR:$src, addrmode5:$addr), + "fsts", " $src, $addr", + [(store SPR:$src, addrmode5:$addr)]>; +} // isStore + +//===----------------------------------------------------------------------===// +// Load / store multiple Instructions. +// + +let isLoad = 1 in { +def FLDMD : AXDI5<(ops addrmode5:$addr, pred:$p, reglist:$dst1, variable_ops), + "fldm${addr:submode}d${p} ${addr:base}, $dst1", + []>; + +def FLDMS : AXSI5<(ops addrmode5:$addr, pred:$p, reglist:$dst1, variable_ops), + "fldm${addr:submode}s${p} ${addr:base}, $dst1", + []>; +} // isLoad + +let isStore = 1 in { +def FSTMD : AXDI5<(ops addrmode5:$addr, pred:$p, reglist:$src1, variable_ops), + "fstm${addr:submode}d${p} ${addr:base}, $src1", + []>; + +def FSTMS : AXSI5<(ops addrmode5:$addr, pred:$p, reglist:$src1, variable_ops), + "fstm${addr:submode}s${p} ${addr:base}, $src1", + []>; +} // isStore + +// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores + +//===----------------------------------------------------------------------===// +// FP Binary Operations. 
+// + +def FADDD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b), + "faddd", " $dst, $a, $b", + [(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>; + +def FADDS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b), + "fadds", " $dst, $a, $b", + [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>; + +def FCMPED : ADI<(ops DPR:$a, DPR:$b), + "fcmped", " $a, $b", + [(arm_cmpfp DPR:$a, DPR:$b)]>; + +def FCMPES : ASI<(ops SPR:$a, SPR:$b), + "fcmpes", " $a, $b", + [(arm_cmpfp SPR:$a, SPR:$b)]>; + +def FDIVD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b), + "fdivd", " $dst, $a, $b", + [(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>; + +def FDIVS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b), + "fdivs", " $dst, $a, $b", + [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>; + +def FMULD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b), + "fmuld", " $dst, $a, $b", + [(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>; + +def FMULS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b), + "fmuls", " $dst, $a, $b", + [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>; + +def FNMULD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b), + "fnmuld", " $dst, $a, $b", + [(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]>; + +def FNMULS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b), + "fnmuls", " $dst, $a, $b", + [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]>; + +// Match reassociated forms only if not sign dependent rounding. +def : Pat<(fmul (fneg DPR:$a), DPR:$b), + (FNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>; +def : Pat<(fmul (fneg SPR:$a), SPR:$b), + (FNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; + + +def FSUBD : ADI<(ops DPR:$dst, DPR:$a, DPR:$b), + "fsubd", " $dst, $a, $b", + [(set DPR:$dst, (fsub DPR:$a, DPR:$b))]>; + +def FSUBS : ASI<(ops SPR:$dst, SPR:$a, SPR:$b), + "fsubs", " $dst, $a, $b", + [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]>; + +//===----------------------------------------------------------------------===// +// FP Unary Operations. 
+// + +def FABSD : ADI<(ops DPR:$dst, DPR:$a), + "fabsd", " $dst, $a", + [(set DPR:$dst, (fabs DPR:$a))]>; + +def FABSS : ASI<(ops SPR:$dst, SPR:$a), + "fabss", " $dst, $a", + [(set SPR:$dst, (fabs SPR:$a))]>; + +def FCMPEZD : ADI<(ops DPR:$a), + "fcmpezd", " $a", + [(arm_cmpfp0 DPR:$a)]>; + +def FCMPEZS : ASI<(ops SPR:$a), + "fcmpezs", " $a", + [(arm_cmpfp0 SPR:$a)]>; + +def FCVTDS : ADI<(ops DPR:$dst, SPR:$a), + "fcvtds", " $dst, $a", + [(set DPR:$dst, (fextend SPR:$a))]>; + +def FCVTSD : ADI<(ops SPR:$dst, DPR:$a), + "fcvtsd", " $dst, $a", + [(set SPR:$dst, (fround DPR:$a))]>; + +def FCPYD : ADI<(ops DPR:$dst, DPR:$a), + "fcpyd", " $dst, $a", []>; + +def FCPYS : ASI<(ops SPR:$dst, SPR:$a), + "fcpys", " $dst, $a", []>; + +def FNEGD : ADI<(ops DPR:$dst, DPR:$a), + "fnegd", " $dst, $a", + [(set DPR:$dst, (fneg DPR:$a))]>; + +def FNEGS : ASI<(ops SPR:$dst, SPR:$a), + "fnegs", " $dst, $a", + [(set SPR:$dst, (fneg SPR:$a))]>; + +def FSQRTD : ADI<(ops DPR:$dst, DPR:$a), + "fsqrtd", " $dst, $a", + [(set DPR:$dst, (fsqrt DPR:$a))]>; + +def FSQRTS : ASI<(ops SPR:$dst, SPR:$a), + "fsqrts", " $dst, $a", + [(set SPR:$dst, (fsqrt SPR:$a))]>; + +//===----------------------------------------------------------------------===// +// FP <-> GPR Copies. Int <-> FP Conversions. 
+// + +def IMPLICIT_DEF_SPR : PseudoInst<(ops SPR:$rD, pred:$p), + "@ IMPLICIT_DEF_SPR $rD", + [(set SPR:$rD, (undef))]>; +def IMPLICIT_DEF_DPR : PseudoInst<(ops DPR:$rD, pred:$p), + "@ IMPLICIT_DEF_DPR $rD", + [(set DPR:$rD, (undef))]>; + +def FMRS : ASI<(ops GPR:$dst, SPR:$src), + "fmrs", " $dst, $src", + [(set GPR:$dst, (bitconvert SPR:$src))]>; + +def FMSR : ASI<(ops SPR:$dst, GPR:$src), + "fmsr", " $dst, $src", + [(set SPR:$dst, (bitconvert GPR:$src))]>; + + +def FMRRD : ADI<(ops GPR:$dst1, GPR:$dst2, DPR:$src), + "fmrrd", " $dst1, $dst2, $src", + [/* FIXME: Can't write pattern for multiple result instr*/]>; + +// FMDHR: GPR -> SPR +// FMDLR: GPR -> SPR + +def FMDRR : ADI<(ops DPR:$dst, GPR:$src1, GPR:$src2), + "fmdrr", " $dst, $src1, $src2", + [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]>; + +// FMRDH: SPR -> GPR +// FMRDL: SPR -> GPR +// FMRRS: SPR -> GPR +// FMRX : SPR system reg -> GPR + +// FMSRR: GPR -> SPR + +def FMSTAT : ASI<(ops), "fmstat", "", [(arm_fmstat)]>, Imp<[], [CPSR]>; + +// FMXR: GPR -> VFP System reg + + +// Int to FP: + +def FSITOD : ADI<(ops DPR:$dst, SPR:$a), + "fsitod", " $dst, $a", + [(set DPR:$dst, (arm_sitof SPR:$a))]>; + +def FSITOS : ASI<(ops SPR:$dst, SPR:$a), + "fsitos", " $dst, $a", + [(set SPR:$dst, (arm_sitof SPR:$a))]>; + +def FUITOD : ADI<(ops DPR:$dst, SPR:$a), + "fuitod", " $dst, $a", + [(set DPR:$dst, (arm_uitof SPR:$a))]>; + +def FUITOS : ASI<(ops SPR:$dst, SPR:$a), + "fuitos", " $dst, $a", + [(set SPR:$dst, (arm_uitof SPR:$a))]>; + +// FP to Int: +// Always set Z bit in the instruction, i.e. "round towards zero" variants. 
+ +def FTOSIZD : ADI<(ops SPR:$dst, DPR:$a), + "ftosizd", " $dst, $a", + [(set SPR:$dst, (arm_ftosi DPR:$a))]>; + +def FTOSIZS : ASI<(ops SPR:$dst, SPR:$a), + "ftosizs", " $dst, $a", + [(set SPR:$dst, (arm_ftosi SPR:$a))]>; + +def FTOUIZD : ADI<(ops SPR:$dst, DPR:$a), + "ftouizd", " $dst, $a", + [(set SPR:$dst, (arm_ftoui DPR:$a))]>; + +def FTOUIZS : ASI<(ops SPR:$dst, SPR:$a), + "ftouizs", " $dst, $a", + [(set SPR:$dst, (arm_ftoui SPR:$a))]>; + +//===----------------------------------------------------------------------===// +// FP FMA Operations. +// + +def FMACD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b), + "fmacd", " $dst, $a, $b", + [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def FMACS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b), + "fmacs", " $dst, $a, $b", + [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def FMSCD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b), + "fmscd", " $dst, $a, $b", + [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def FMSCS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b), + "fmscs", " $dst, $a, $b", + [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def FNMACD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b), + "fnmacd", " $dst, $a, $b", + [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def FNMACS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b), + "fnmacs", " $dst, $a, $b", + [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def FNMSCD : ADI<(ops DPR:$dst, DPR:$dstin, DPR:$a, DPR:$b), + "fnmscd", " $dst, $a, $b", + [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +def FNMSCS : ASI<(ops SPR:$dst, SPR:$dstin, SPR:$a, SPR:$b), + "fnmscs", " $dst, $a, $b", + 
 [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>, + RegConstraint<"$dstin = $dst">; + +//===----------------------------------------------------------------------===// +// FP Conditional moves. +// + +def FCPYDcc : ADI<(ops DPR:$dst, DPR:$false, DPR:$true), + "fcpyd", " $dst, $true", + [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; + +def FCPYScc : ASI<(ops SPR:$dst, SPR:$false, SPR:$true), + "fcpys", " $dst, $true", + [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; + +def FNEGDcc : ADI<(ops DPR:$dst, DPR:$false, DPR:$true), + "fnegd", " $dst, $true", + [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; + +def FNEGScc : ASI<(ops SPR:$dst, SPR:$false, SPR:$true), + "fnegs", " $dst, $true", + [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>, + RegConstraint<"$false = $dst">; diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp new file mode 100644 index 0000000..294a12b --- /dev/null +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -0,0 +1,131 @@ +//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Raul Herbster and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the ARM target. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "ARMJITInfo.h" +#include "ARMRelocations.h" +#include "ARMSubtarget.h" +#include "llvm/CodeGen/MachineCodeEmitter.h" +#include "llvm/Config/alloca.h" +#include <cstdlib> +using namespace llvm; + +void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + unsigned char *OldByte = (unsigned char *)Old; + *OldByte++ = 0xEA; // Emit B opcode. + unsigned *OldWord = (unsigned *)OldByte; + unsigned NewAddr = (intptr_t)New; + unsigned OldAddr = (intptr_t)OldWord; + *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code. +} + +/// JITCompilerFunction - This contains the address of the JIT function used to +/// compile a function lazily. +static TargetJITInfo::JITCompilerFn JITCompilerFunction; + +// CompilationCallback stub - We can't use a C function with inline assembly in +// it, because the prolog/epilog inserted by GCC won't work for us. Instead, +// write our own wrapper, which does things our way, so we have complete control +// over register saving and restoring. 
+extern "C" { +#if defined(__arm__) + void ARMCompilationCallback(void); + asm( + ".text\n" + ".align 2\n" + ".globl ARMCompilationCallback\n" + "ARMCompilationCallback:\n" + // save main registers + "mov ip, sp\n" + "stmfd sp!, {fp, ip, lr, pc}\n" + "sub fp, ip, #4\n" + // arguments to Compilation Callback + // r0 - our lr (address of the call instruction in stub plus 4) + // r1 - stub's lr (address of instruction that called the stub plus 4) + "mov r0, fp\n" // stub's frame + "mov r1, lr\n" // stub's lr + "bl ARMCompilationCallbackC\n" + // restore main registers + "ldmfd sp, {fp, sp, pc}\n"); +#else // Not an ARM host + void ARMCompilationCallback() { + assert(0 && "Cannot call ARMCompilationCallback() on a non-ARM arch!\n"); + abort(); + } +#endif +} + +/// ARMCompilationCallbackC - This is the target-specific function invoked by the +/// function stub when we did not know the real target of a call. This function +/// must locate the start of the stub or call site and pass it into the JIT +/// compiler function. +extern "C" void ARMCompilationCallbackC(intptr_t *StackPtr, intptr_t RetAddr) { + intptr_t *RetAddrLoc = &StackPtr[-1]; + + assert(*RetAddrLoc == RetAddr && + "Could not find return address on the stack!"); +#if 0 + DOUT << "In callback! Addr=" << (void*)RetAddr + << " FP=" << (void*)StackPtr + << ": Resolving call to function: " + << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n"; +#endif + + // Sanity check to make sure this really is a branch and link instruction. + assert(((unsigned char*)RetAddr-1)[3] == 0xEB && "Not a branch and link instr!"); + + intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr); + + // Rewrite the call target... so that we don't end up here every time we + // execute the call. + *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4); + + // Change the return address to reexecute the branch and link instruction... 
+ *RetAddrLoc -= 1; +} + +TargetJITInfo::LazyResolverFn +ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) { + JITCompilerFunction = F; + return ARMCompilationCallback; +} + +void *ARMJITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) { + unsigned addr = (intptr_t)Fn-MCE.getCurrentPCValue()-4; + // If this is just a call to an external function, emit a branch instead of a + // call. The code is the same except for one bit of the last instruction. + if (Fn != (void*)(intptr_t)ARMCompilationCallback) { + MCE.startFunctionStub(4, 2); + MCE.emitByte(0xEA); // branch to the corresponding function addr + MCE.emitByte((unsigned char)(addr >> 0)); + MCE.emitByte((unsigned char)(addr >> 8)); + MCE.emitByte((unsigned char)(addr >> 16)); + return MCE.finishFunctionStub(0); + } else { + MCE.startFunctionStub(5, 2); + MCE.emitByte(0xEB); // branch and link to the corresponding function addr + } + MCE.emitByte((unsigned char)(addr >> 0)); + MCE.emitByte((unsigned char)(addr >> 8)); + MCE.emitByte((unsigned char)(addr >> 16)); + + return MCE.finishFunctionStub(0); +} + +/// relocate - Before the JIT can run a block of code that has been emitted, +/// it must rewrite the code to contain the actual addresses of any +/// referenced global symbols. +void ARMJITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + +} diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h new file mode 100644 index 0000000..bd0ea84 --- /dev/null +++ b/lib/Target/ARM/ARMJITInfo.h @@ -0,0 +1,50 @@ +//===- ARMJITInfo.h - ARM implementation of the JIT interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the Raul Herbster and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the ARMJITInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef ARMJITINFO_H +#define ARMJITINFO_H + +#include "llvm/Target/TargetJITInfo.h" + +namespace llvm { + class ARMTargetMachine; + + class ARMJITInfo : public TargetJITInfo { + ARMTargetMachine &TM; + public: + ARMJITInfo(ARMTargetMachine &tm) : TM(tm) {useGOT = 0;} + + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + virtual void replaceMachineCodeForFunction(void *Old, void *New); + + /// emitFunctionStub - Use the specified MachineCodeEmitter object to emit a + /// small native function that simply calls the function at the specified + /// address. + virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE); + + /// getLazyResolverFunction - Expose the lazy resolver to the JIT. + virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + + /// relocate - Before the JIT can run a block of code that has been emitted, + /// it must rewrite the code to contain the actual addresses of any + /// referenced global symbols. + virtual void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase); + }; +} + +#endif diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp new file mode 100644 index 0000000..7562c5b --- /dev/null +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -0,0 +1,750 @@ +//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Evan Cheng and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-ldst-opt"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/MRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumFLDMGened, "Number of fldm instructions generated");
+STATISTIC(NumFSTMGened, "Number of fstm instructions generated");
+
+namespace {
+  /// ARMLoadStoreOpt - Post-register-allocation pass that merges consecutive
+  /// loads / stores off the same base register into load / store multiple
+  /// (LDM / STM / FLDM / FSTM) instructions, and folds base register
+  /// increments / decrements into the memory ops where possible.
+  struct VISIBILITY_HIDDEN ARMLoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    ARMLoadStoreOpt() : MachineFunctionPass((intptr_t)&ID) {}
+
+    const TargetInstrInfo *TII;
+    const MRegisterInfo *MRI;
+    ARMFunctionInfo *AFI;
+    RegScavenger *RS;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM load / store optimization pass";
+    }
+
+  private:
+    /// MemOpQueueEntry - One merge candidate: its immediate offset from the
+    /// common base, its ordinal position within the basic block, an iterator
+    /// to the instruction itself, and whether it has since been merged away.
+    struct MemOpQueueEntry {
+      int Offset;
+      unsigned Position;
+      MachineBasicBlock::iterator MBBI;
+      bool Merged;
+      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
+        : Offset(o), Position(p), MBBI(i), Merged(false) {};
+    };
+    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+    typedef MemOpQueue::iterator MemOpQueueIter;
+
+    SmallVector<MachineBasicBlock::iterator, 4>
+    MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
+                 int Opcode, unsigned Size,
+                 ARMCC::CondCodes Pred, unsigned PredReg,
+                 unsigned Scratch, MemOpQueue &MemOps);
+
+    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+  };
+  char ARMLoadStoreOpt::ID = 0;
+}
+
+/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
+/// optimization pass.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass() {
+  return new ARMLoadStoreOpt();
+}
+
+/// getLoadStoreMultipleOpcode - Map a single load / store opcode to the
+/// corresponding load / store multiple opcode, bumping the matching
+/// statistic as a side effect.
+static int getLoadStoreMultipleOpcode(int Opcode) {
+  switch (Opcode) {
+  case ARM::LDR:
+    NumLDMGened++;
+    return ARM::LDM;
+  case ARM::STR:
+    NumSTMGened++;
+    return ARM::STM;
+  case ARM::FLDS:
+    NumFLDMGened++;
+    return ARM::FLDMS;
+  case ARM::FSTS:
+    NumFSTMGened++;
+    return ARM::FSTMS;
+  case ARM::FLDD:
+    NumFLDMGened++;
+    return ARM::FLDMD;
+  case ARM::FSTD:
+    NumFSTMGened++;
+    return ARM::FSTMD;
+  default: abort();
+  }
+  return 0;
+}
+
+/// mergeOps - Create and insert a LDM or STM with Base as base register and
+/// registers in Regs as the register operands that would be loaded / stored.
+/// It returns true if the transformation is done.
+static bool mergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     int Offset, unsigned Base, bool BaseKill, int Opcode,
+                     ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
+                     SmallVector<std::pair<unsigned, bool>, 8> &Regs,
+                     const TargetInstrInfo *TII) {
+  // Only a single register to load / store. Don't bother.
+  unsigned NumRegs = Regs.size();
+  if (NumRegs <= 1)
+    return false;
+
+  // Pick the addressing sub-mode that makes the starting offset fall out
+  // naturally: ib for +4, da / db for negative multiples of the transfer size.
+  ARM_AM::AMSubMode Mode = ARM_AM::ia;
+  bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  if (isAM4 && Offset == 4)
+    Mode = ARM_AM::ib;
+  else if (isAM4 && Offset == -4 * (int)NumRegs + 4)
+    Mode = ARM_AM::da;
+  else if (isAM4 && Offset == -4 * (int)NumRegs)
+    Mode = ARM_AM::db;
+  else if (Offset != 0) {
+    // If starting offset isn't zero, insert a MI to materialize a new base.
+    // But only do so if it is cost effective, i.e. merging more than two
+    // loads / stores.
+    if (NumRegs <= 2)
+      return false;
+
+    unsigned NewBase;
+    if (Opcode == ARM::LDR)
+      // If it is a load, then just use one of the destination register to
+      // use as the new base.
+      NewBase = Regs[NumRegs-1].first;
+    else {
+      // Use the scratch register to use as a new base.
+      NewBase = Scratch;
+      if (NewBase == 0)
+        return false;
+    }
+    int BaseOpc = ARM::ADDri;
+    if (Offset < 0) {
+      BaseOpc = ARM::SUBri;
+      Offset = - Offset;
+    }
+    // The offset must be encodable as an ARM shifter-immediate.
+    int ImmedOffset = ARM_AM::getSOImmVal(Offset);
+    if (ImmedOffset == -1)
+      return false; // Probably not worth it then.
+
+    BuildMI(MBB, MBBI, TII->get(BaseOpc), NewBase)
+      .addReg(Base, false, false, BaseKill).addImm(ImmedOffset)
+      .addImm(Pred).addReg(PredReg).addReg(0);
+    Base = NewBase;
+    BaseKill = true;  // New base is always killed right after its use.
+  }
+
+  bool isDPR = Opcode == ARM::FLDD || Opcode == ARM::FSTD;
+  bool isDef = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+  Opcode = getLoadStoreMultipleOpcode(Opcode);
+  // AM5 counts the transfer in words, so double registers count twice.
+  MachineInstrBuilder MIB = (isAM4)
+    ? BuildMI(MBB, MBBI, TII->get(Opcode)).addReg(Base, false, false, BaseKill)
+        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
+    : BuildMI(MBB, MBBI, TII->get(Opcode)).addReg(Base, false, false, BaseKill)
+        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
+        .addImm(Pred).addReg(PredReg);
+  for (unsigned i = 0; i != NumRegs; ++i)
+    MIB = MIB.addReg(Regs[i].first, isDef, false, Regs[i].second);
+
+  return true;
+}
+
+/// MergeLDR_STR - Merge a number of load / store instructions into one or more
+/// load / store multiple instructions.
+SmallVector<MachineBasicBlock::iterator, 4>
+ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
+                              unsigned Base, int Opcode, unsigned Size,
+                              ARMCC::CondCodes Pred, unsigned PredReg,
+                              unsigned Scratch, MemOpQueue &MemOps) {
+  SmallVector<MachineBasicBlock::iterator, 4> Merges;
+  bool isAM4 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  int Offset = MemOps[SIndex].Offset;
+  int SOffset = Offset;
+  unsigned Pos = MemOps[SIndex].Position;
+  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
+  unsigned PReg = MemOps[SIndex].MBBI->getOperand(0).getReg();
+  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
+  bool isKill = MemOps[SIndex].MBBI->getOperand(0).isKill();
+
+  SmallVector<std::pair<unsigned,bool>, 8> Regs;
+  Regs.push_back(std::make_pair(PReg, isKill));
+  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
+    int NewOffset = MemOps[i].Offset;
+    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
+    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+    isKill = MemOps[i].MBBI->getOperand(0).isKill();
+    // AM4 - register numbers in ascending order.
+    // AM5 - consecutive register numbers in ascending order.
+    if (NewOffset == Offset + (int)Size &&
+        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
+      Offset += Size;
+      Regs.push_back(std::make_pair(Reg, isKill));
+      PRegNum = RegNum;
+    } else {
+      // Can't merge this in. Try to merge the earlier ones first.
+      if (mergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
+                   Scratch, Regs, TII)) {
+        Merges.push_back(prior(Loc));
+        for (unsigned j = SIndex; j < i; ++j) {
+          MBB.erase(MemOps[j].MBBI);
+          MemOps[j].Merged = true;
+        }
+      }
+      // Recurse to handle the rest of the queue starting at this entry.
+      SmallVector<MachineBasicBlock::iterator, 4> Merges2 =
+        MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,MemOps);
+      Merges.append(Merges2.begin(), Merges2.end());
+      return Merges;
+    }
+
+    // Keep the insertion point at the latest of the merged instructions.
+    if (MemOps[i].Position > Pos) {
+      Pos = MemOps[i].Position;
+      Loc = MemOps[i].MBBI;
+    }
+  }
+
+  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
+  if (mergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
+               Scratch, Regs, TII)) {
+    Merges.push_back(prior(Loc));
+    for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
+      MBB.erase(MemOps[i].MBBI);
+      MemOps[i].Merged = true;
+    }
+  }
+
+  return Merges;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+static ARMCC::CondCodes getInstrPredicate(MachineInstr *MI, unsigned &PredReg) {
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx == -1) {
+    // Instruction carries no predicate operands; treat as unconditional.
+    PredReg = 0;
+    return ARMCC::AL;
+  }
+
+  // The CC register operand immediately follows the condition code operand.
+  PredReg = MI->getOperand(PIdx+1).getReg();
+  return (ARMCC::CondCodes)MI->getOperand(PIdx).getImmedValue();
+}
+
+/// isMatchingDecrement - Return true if MI is "sub Base, Base, #Bytes" with
+/// the same predicate as the memory op being examined.
+static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
+                                       unsigned Bytes, ARMCC::CondCodes Pred,
+                                       unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  return (MI && MI->getOpcode() == ARM::SUBri &&
+          MI->getOperand(0).getReg() == Base &&
+          MI->getOperand(1).getReg() == Base &&
+          ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
+          getInstrPredicate(MI, MyPredReg) == Pred &&
+          MyPredReg == PredReg);
+}
+
+/// isMatchingIncrement - Return true if MI is "add Base, Base, #Bytes" with
+/// the same predicate as the memory op being examined.
+static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
+                                       unsigned Bytes, ARMCC::CondCodes Pred,
+                                       unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  return (MI && MI->getOpcode() == ARM::ADDri &&
+          MI->getOperand(0).getReg() == Base &&
+          MI->getOperand(1).getReg() == Base &&
+          ARM_AM::getAM2Offset(MI->getOperand(2).getImm()) == Bytes &&
+          getInstrPredicate(MI, MyPredReg) == Pred &&
+          MyPredReg == PredReg);
+}
+
+/// getLSMultipleTransferSize - Return the number of bytes this load / store
+/// (single or multiple) transfers, or 0 for opcodes this pass ignores.
+static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDR:
+  case ARM::STR:
+  case ARM::FLDS:
+  case ARM::FSTS:
+    return 4;
+  case ARM::FLDD:
+  case ARM::FSTD:
+    return 8;
+  case ARM::LDM:
+  case ARM::STM:
+    // Register operands start after base, mode imm, and two predicate ops.
+    return (MI->getNumOperands() - 4) * 4;
+  case ARM::FLDMS:
+  case ARM::FSTMS:
+  case ARM::FLDMD:
+  case ARM::FSTMD:
+    // AM5 encodes the transfer count in words in the mode immediate.
+    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
+  }
+}
+
+/// mergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
+/// register into the LDM/STM/FLDM{D|S}/FSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+///
+/// Returns true iff an add / sub was actually folded (and erased).
+static bool mergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI) {
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(0).getReg();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  int Opcode = MI->getOpcode();
+  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::STM;
+
+  if (isAM4) {
+    // Already has writeback; nothing more to fold.
+    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
+      return false;
+
+    // Can't use the updating AM4 sub-mode if the base register is also a dest
+    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
+      if (MI->getOperand(i).getReg() == Base)
+        return false;
+    }
+
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
+        MBB.erase(PrevMBBI);
+        return true;
+      } else if (Mode == ARM_AM::ib &&
+                 isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
+        MBB.erase(PrevMBBI);
+        return true;
+      }
+    }
+
+    if (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator NextMBBI = next(MBBI);
+      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
+          isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MBB.erase(NextMBBI);
+        return true;
+      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
+                 isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MBB.erase(NextMBBI);
+        return true;
+      }
+    }
+  } else {
+    // FLDM{D|S}, FSTM{D|S} addressing mode 5 ops.
+    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
+      return false;
+
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
+    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
+        MBB.erase(PrevMBBI);
+        return true;
+      }
+    }
+
+    if (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator NextMBBI = next(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
+        MBB.erase(NextMBBI);
+        // Note: only report success when the increment was actually folded.
+        // Previously 'return true' was reached even when the condition above
+        // failed, falsely claiming a merge without erasing any add.
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// getPreIndexedLoadStoreOpcode - Map a load / store opcode to its
+/// pre-indexed (writeback-before) counterpart.
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
+  switch (Opc) {
+  case ARM::LDR: return ARM::LDR_PRE;
+  case ARM::STR: return ARM::STR_PRE;
+  case ARM::FLDS: return ARM::FLDMS;
+  case ARM::FLDD: return ARM::FLDMD;
+  case ARM::FSTS: return ARM::FSTMS;
+  case ARM::FSTD: return ARM::FSTMD;
+  default: abort();
+  }
+  return 0;
+}
+
+/// getPostIndexedLoadStoreOpcode - Map a load / store opcode to its
+/// post-indexed (writeback-after) counterpart.
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
+  switch (Opc) {
+  case ARM::LDR: return ARM::LDR_POST;
+  case ARM::STR: return ARM::STR_POST;
+  case ARM::FLDS: return ARM::FLDMS;
+  case ARM::FLDD: return ARM::FLDMD;
+  case ARM::FSTS: return ARM::FSTMS;
+  case ARM::FSTD: return ARM::FSTMD;
+  default: abort();
+  }
+  return 0;
+}
+
+/// mergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
+/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
+static bool mergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     const TargetInstrInfo *TII) {
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(1).getReg();
+  bool BaseKill = MI->getOperand(1).isKill();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  int Opcode = MI->getOpcode();
+  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  // Only handle ops with a zero immediate offset.
+  if ((isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0) ||
+      (!isAM2 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0))
+    return false;
+
+  bool isLd = Opcode == ARM::LDR || Opcode == ARM::FLDS || Opcode == ARM::FLDD;
+  // Can't do the merge if the destination register is the same as the would-be
+  // writeback register.
+  if (isLd && MI->getOperand(0).getReg() == Base)
+    return false;
+
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool DoMerge = false;
+  ARM_AM::AddrOpc AddSub = ARM_AM::add;
+  unsigned NewOpc = 0;
+  if (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
+    } else if (isAM2 && isMatchingIncrement(PrevMBBI, Base, Bytes,
+                                            Pred, PredReg)) {
+      DoMerge = true;
+      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
+    }
+    if (DoMerge)
+      MBB.erase(PrevMBBI);
+  }
+
+  if (!DoMerge && MBBI != MBB.end()) {
+    MachineBasicBlock::iterator NextMBBI = next(MBBI);
+    if (isAM2 && isMatchingDecrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
+    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Pred, PredReg)) {
+      DoMerge = true;
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
+    }
+    if (DoMerge)
+      MBB.erase(NextMBBI);
+  }
+
+  if (!DoMerge)
+    return false;
+
+  bool isDPR = NewOpc == ARM::FLDMD || NewOpc == ARM::FSTMD;
+  unsigned Offset = isAM2 ? ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift)
+    : ARM_AM::getAM5Opc((AddSub == ARM_AM::sub) ? ARM_AM::db : ARM_AM::ia,
+                        true, isDPR ? 2 : 1);
+  if (isLd) {
+    if (isAM2)
+      // LDR_PRE, LDR_POST;
+      BuildMI(MBB, MBBI, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, true)
+        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+    else
+      // FLDMS, FLDMD
+      BuildMI(MBB, MBBI, TII->get(NewOpc)).addReg(Base, false, false, BaseKill)
+        .addImm(Offset).addImm(Pred).addReg(PredReg)
+        .addReg(MI->getOperand(0).getReg(), true);
+  } else {
+    MachineOperand &MO = MI->getOperand(0);
+    if (isAM2)
+      // STR_PRE, STR_POST;
+      BuildMI(MBB, MBBI, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), false, false, MO.isKill())
+        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+    else
+      // FSTMS, FSTMD
+      BuildMI(MBB, MBBI, TII->get(NewOpc)).addReg(Base).addImm(Offset)
+        .addImm(Pred).addReg(PredReg)
+        .addReg(MO.getReg(), false, false, MO.isKill());
+  }
+  MBB.erase(MBBI);
+
+  return true;
+}
+
+/// isMemoryOp - Returns true if instruction is a memory operations (that this
+/// pass is capable of operating on).
+static bool isMemoryOp(MachineInstr *MI) {
+  int Opcode = MI->getOpcode();
+  switch (Opcode) {
+  default: break;
+  case ARM::LDR:
+  case ARM::STR:
+    // Only handle register-base, no-index forms.
+    return MI->getOperand(1).isRegister() && MI->getOperand(2).getReg() == 0;
+  case ARM::FLDS:
+  case ARM::FSTS:
+    return MI->getOperand(1).isRegister();
+  case ARM::FLDD:
+  case ARM::FSTD:
+    return MI->getOperand(1).isRegister();
+  }
+  return false;
+}
+
+/// AdvanceRS - Advance register scavenger to just before the earliest memory
+/// op that is being merged.
+void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
+  // Find the queue entry with the smallest Position (earliest in the block).
+  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
+  unsigned Position = MemOps[0].Position;
+  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
+    if (MemOps[i].Position < Position) {
+      Position = MemOps[i].Position;
+      Loc = MemOps[i].MBBI;
+    }
+  }
+
+  if (Loc != MBB.begin())
+    RS->forward(prior(Loc));
+}
+
+/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
+/// ops of the same base and incrementing offset into LDM / STM ops.
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+  unsigned NumMerges = 0;
+  unsigned NumMemOps = 0;
+  MemOpQueue MemOps;
+  // Current chain state: base register, opcode, per-op size, and predicate
+  // that all queued candidates share. CurrBase == 0 means no open chain.
+  unsigned CurrBase = 0;
+  int CurrOpc = -1;
+  unsigned CurrSize = 0;
+  ARMCC::CondCodes CurrPred = ARMCC::AL;
+  unsigned CurrPredReg = 0;
+  unsigned Position = 0;
+
+  RS->enterBasicBlock(&MBB);
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    bool Advance = false;
+    bool TryMerge = false;
+    bool Clobber = false;
+
+    bool isMemOp = isMemoryOp(MBBI);
+    if (isMemOp) {
+      int Opcode = MBBI->getOpcode();
+      bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
+      unsigned Size = getLSMultipleTransferSize(MBBI);
+      unsigned Base = MBBI->getOperand(1).getReg();
+      unsigned PredReg = 0;
+      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+      // The offset immediate is the third operand from the end.
+      const TargetInstrDescriptor *TID = MBBI->getInstrDescriptor();
+      unsigned OffField = MBBI->getOperand(TID->numOperands-3).getImm();
+      int Offset = isAM2
+        ? ARM_AM::getAM2Offset(OffField) : ARM_AM::getAM5Offset(OffField) * 4;
+      if (isAM2) {
+        if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
+          Offset = -Offset;
+      } else {
+        if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
+          Offset = -Offset;
+      }
+      // Watch out for:
+      // r4 := ldr [r5]
+      // r5 := ldr [r5, #4]
+      // r6 := ldr [r5, #8]
+      //
+      // The second ldr has effectively broken the chain even though it
+      // looks like the later ldr(s) use the same base register. Try to
+      // merge the ldr's so far, including this one. But don't try to
+      // combine the following ldr(s).
+      Clobber = (Opcode == ARM::LDR && Base == MBBI->getOperand(0).getReg());
+      if (CurrBase == 0 && !Clobber) {
+        // Start of a new chain.
+        CurrBase = Base;
+        CurrOpc = Opcode;
+        CurrSize = Size;
+        CurrPred = Pred;
+        CurrPredReg = PredReg;
+        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
+        NumMemOps++;
+        Advance = true;
+      } else {
+        if (Clobber) {
+          TryMerge = true;
+          Advance = true;
+        }
+
+        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+          // No need to match PredReg.
+          // Continue adding to the queue.
+          if (Offset > MemOps.back().Offset) {
+            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
+            NumMemOps++;
+            Advance = true;
+          } else {
+            // Insert in offset order to keep the queue sorted.
+            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
+                 I != E; ++I) {
+              if (Offset < I->Offset) {
+                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
+                NumMemOps++;
+                Advance = true;
+                break;
+              } else if (Offset == I->Offset) {
+                // Collision! This can't be merged!
+                break;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    if (Advance) {
+      ++Position;
+      ++MBBI;
+    } else
+      TryMerge = true;
+
+    if (TryMerge) {
+      if (NumMemOps > 1) {
+        // Try to find a free register to use as a new base in case it's needed.
+        // First advance to the instruction just before the start of the chain.
+        AdvanceRS(MBB, MemOps);
+        // Find a scratch register. Make sure it's a call clobbered register or
+        // a spilled callee-saved register.
+        unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass, true);
+        if (!Scratch)
+          Scratch = RS->FindUnusedReg(&ARM::GPRRegClass,
+                                      AFI->getSpilledCSRegisters());
+        // Process the load / store instructions.
+        RS->forward(prior(MBBI));
+
+        // Merge ops.
+        SmallVector<MachineBasicBlock::iterator,4> MBBII =
+          MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
+                       CurrPred, CurrPredReg, Scratch, MemOps);
+
+        // Try folding preceding/trailing base inc/dec into the generated
+        // LDM/STM ops.
+        for (unsigned i = 0, e = MBBII.size(); i < e; ++i)
+          if (mergeBaseUpdateLSMultiple(MBB, MBBII[i]))
+            NumMerges++;
+        NumMerges += MBBII.size();
+
+        // Try folding preceding/trailing base inc/dec into those load/store
+        // that were not merged to form LDM/STM ops.
+        for (unsigned i = 0; i != NumMemOps; ++i)
+          if (!MemOps[i].Merged)
+            if (mergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII))
+              NumMerges++;
+
+        // RS may be pointing to an instruction that's deleted.
+        RS->skipTo(prior(MBBI));
+      }
+
+      // Reset chain state for the next run of candidates.
+      CurrBase = 0;
+      CurrOpc = -1;
+      CurrSize = 0;
+      CurrPred = ARMCC::AL;
+      CurrPredReg = 0;
+      if (NumMemOps) {
+        MemOps.clear();
+        NumMemOps = 0;
+      }
+
+      // If iterator hasn't been advanced and this is not a memory op, skip it.
+      // It can't start a new chain anyway.
+      if (!Advance && !isMemOp && MBBI != E) {
+        ++Position;
+        ++MBBI;
+      }
+    }
+  }
+  return NumMerges > 0;
+}
+
+/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
+/// (bx lr) into the preceding stack restore so it directly restores the value
+/// of LR into pc.
+/// ldmfd sp!, {r7, lr}
+/// bx lr
+/// =>
+/// ldmfd sp!, {r7, pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  if (MBB.empty()) return false;
+
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  if (MBBI->getOpcode() == ARM::BX_RET && MBBI != MBB.begin()) {
+    MachineInstr *PrevMI = prior(MBBI);
+    if (PrevMI->getOpcode() == ARM::LDM) {
+      // Only fires when LR is the last register popped by the LDM.
+      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+      if (MO.getReg() == ARM::LR) {
+        PrevMI->setInstrDescriptor(TII->get(ARM::LDM_RET));
+        MO.setReg(ARM::PC);
+        MBB.erase(MBBI);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = TM.getInstrInfo();
+  MRI = TM.getRegisterInfo();
+  RS = new RegScavenger();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+    Modified |= MergeReturnIntoLDM(MBB);
+  }
+
+  delete RS;
+  return Modified;
+}
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 0000000..665c5e3
--- /dev/null
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,220 @@
+//====- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMACHINEFUNCTIONINFO_H
+#define ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/MRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM target-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialized Align, so must precede it.
+  bool isThumb;
+
+  /// Align - required alignment. ARM functions and Thumb functions with
+  /// constant pools require 4-byte alignment; other Thumb functions
+  /// require only 2-byte alignment.
+  unsigned Align;
+
+  /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+  ///
+  unsigned VarArgsRegSaveSize;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// processFunctionBeforeCalleeSavedScan().
+  bool HasStackFrame;
+
+  /// LRSpilledForFarJump - True if the LR register has been spilled to
+  /// enable far jump.
+  bool LRSpilledForFarJump;
+
+  /// R3IsLiveIn - True if R3 is live in to this function.
+  /// FIXME: Remove when register scavenger for Thumb is done.
+  bool R3IsLiveIn;
+
+  /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+  /// spill stack offset.
+  unsigned FramePtrSpillOffset;
+
+  /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+  /// register spills areas. For Mac OS X:
+  ///
+  /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+  /// --------------------------------------------
+  /// GPR callee-saved (2) : r8, r10, r11
+  /// --------------------------------------------
+  /// DPR callee-saved : d8 - d15
+  unsigned GPRCS1Offset;
+  unsigned GPRCS2Offset;
+  unsigned DPRCSOffset;
+
+  /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
+  /// areas.
+  unsigned GPRCS1Size;
+  unsigned GPRCS2Size;
+  unsigned DPRCSSize;
+
+  /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
+  /// which belong to these spill areas.
+  BitVector GPRCS1Frames;
+  BitVector GPRCS2Frames;
+  BitVector DPRCSFrames;
+
+  /// SpilledCSRegs - A BitVector mask of all spilled callee-saved registers.
+  ///
+  BitVector SpilledCSRegs;
+
+  /// JumpTableUId - Unique id for jumptables.
+  ///
+  unsigned JumpTableUId;
+
+public:
+  ARMFunctionInfo() :
+    isThumb(false),
+    Align(2U),
+    VarArgsRegSaveSize(0), HasStackFrame(false),
+    LRSpilledForFarJump(false), R3IsLiveIn(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
+    JumpTableUId(0) {}
+
+  ARMFunctionInfo(MachineFunction &MF) :
+    isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
+    Align(isThumb ? 1U : 2U),
+    VarArgsRegSaveSize(0), HasStackFrame(false),
+    LRSpilledForFarJump(false), R3IsLiveIn(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
+    SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()),
+    JumpTableUId(0) {}
+
+  bool isThumbFunction() const { return isThumb; }
+
+  unsigned getAlign() const { return Align; }
+  void setAlign(unsigned a) { Align = a; }
+
+  unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; }
+  void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+  void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+  // FIXME: Remove when register scavenger for Thumb is done.
+  bool isR3LiveIn() const { return R3IsLiveIn; }
+  void setR3IsLiveIn(bool l) { R3IsLiveIn = l; }
+
+  unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+  void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+  unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+  unsigned getDPRCalleeSavedAreaOffset() const { return DPRCSOffset; }
+
+  void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+  void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+  void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+  unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; }
+
+  void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+  void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; }
+
+  bool isGPRCalleeSavedArea1Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS1Frames.size())
+      return false;
+    return GPRCS1Frames[fi];
+  }
+  bool isGPRCalleeSavedArea2Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS2Frames.size())
+      return false;
+    return GPRCS2Frames[fi];
+  }
+  bool isDPRCalleeSavedAreaFrame(int fi) const {
+    if (fi < 0 || fi >= (int)DPRCSFrames.size())
+      return false;
+    return DPRCSFrames[fi];
+  }
+
+  // The add* methods grow the BitVector geometrically (doubling) so that
+  // repeated insertions of increasing frame indices stay cheap.
+  void addGPRCalleeSavedArea1Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS1Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS1Frames.resize(Size);
+      }
+      GPRCS1Frames[fi] = true;
+    }
+  }
+  void addGPRCalleeSavedArea2Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS2Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS2Frames.resize(Size);
+      }
+      GPRCS2Frames[fi] = true;
+    }
+  }
+  void addDPRCalleeSavedAreaFrame(int fi) {
+    if (fi >= 0) {
+      int Size = DPRCSFrames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        DPRCSFrames.resize(Size);
+      }
+      DPRCSFrames[fi] = true;
+    }
+  }
+
+  void setCSRegisterIsSpilled(unsigned Reg) {
+    SpilledCSRegs.set(Reg);
+  }
+
+  bool isCSRegisterSpilled(unsigned Reg) {
+    return SpilledCSRegs[Reg];
+  }
+
+  const BitVector &getSpilledCSRegisters() const {
+    return SpilledCSRegs;
+  }
+
+  unsigned createJumpTableUId() {
+    return JumpTableUId++;
+  }
+};
+} // End llvm namespace
+
+#endif // ARMMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 0000000..f8e10de
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,1566 @@
+//===- ARMRegisterInfo.cpp - ARM Register Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the "Instituto Nokia de Tecnologia" and
+// is distributed under the University of
Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMAddressingModes.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" +#include <algorithm> +using namespace llvm; + +static cl::opt<bool> ThumbRegScavenging("enable-thumb-reg-scavenging", + cl::Hidden, + cl::desc("Enable register scavenging on Thumb")); + +unsigned ARMRegisterInfo::getRegisterNumbering(unsigned RegEnum) { + using namespace ARM; + switch (RegEnum) { + case R0: case S0: case D0: return 0; + case R1: case S1: case D1: return 1; + case R2: case S2: case D2: return 2; + case R3: case S3: case D3: return 3; + case R4: case S4: case D4: return 4; + case R5: case S5: case D5: return 5; + case R6: case S6: case D6: return 6; + case R7: case S7: case D7: return 7; + case R8: case S8: case D8: return 8; + case R9: case S9: case D9: return 9; + case R10: case S10: case D10: return 10; + case R11: case S11: case D11: return 11; + case R12: case S12: case D12: return 12; + case SP: case S13: case D13: return 13; + case LR: case S14: case D14: return 14; + case PC: case 
S15: case D15: return 15; + case S16: return 16; + case S17: return 17; + case S18: return 18; + case S19: return 19; + case S20: return 20; + case S21: return 21; + case S22: return 22; + case S23: return 23; + case S24: return 24; + case S25: return 25; + case S26: return 26; + case S27: return 27; + case S28: return 28; + case S29: return 29; + case S30: return 30; + case S31: return 31; + default: + assert(0 && "Unknown ARM register!"); + abort(); + } +} + +ARMRegisterInfo::ARMRegisterInfo(const TargetInstrInfo &tii, + const ARMSubtarget &sti) + : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), + TII(tii), STI(sti), + FramePtr((STI.useThumbBacktraces() || STI.isThumb()) ? ARM::R7 : ARM::R11) { +} + +bool ARMRegisterInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI) const { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (!AFI->isThumbFunction() || CSI.empty()) + return false; + + MachineInstrBuilder MIB = BuildMI(MBB, MI, TII.get(ARM::tPUSH)); + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + MIB.addReg(Reg, false/*isDef*/,false/*isImp*/,true/*isKill*/); + } + return true; +} + +bool ARMRegisterInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI) const { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (!AFI->isThumbFunction() || CSI.empty()) + return false; + + bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + MachineInstr *PopMI = new MachineInstr(TII.get(ARM::tPOP)); + MBB.insert(MI, PopMI); + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (Reg == ARM::LR) { + // Special epilogue for vararg functions. 
See emitEpilogue + if (isVarArg) + continue; + Reg = ARM::PC; + PopMI->setInstrDescriptor(TII.get(ARM::tPOP_RET)); + MBB.erase(MI); + } + PopMI->addRegOperand(Reg, true); + } + return true; +} + +void ARMRegisterInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, int FI, + const TargetRegisterClass *RC) const { + if (RC == ARM::GPRRegisterClass) { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (AFI->isThumbFunction()) + BuildMI(MBB, I, TII.get(ARM::tSpill)).addReg(SrcReg, false, false, true) + .addFrameIndex(FI).addImm(0); + else + BuildMI(MBB, I, TII.get(ARM::STR)).addReg(SrcReg, false, false, true) + .addFrameIndex(FI).addReg(0).addImm(0).addImm((int64_t)ARMCC::AL) + .addReg(0); + } else if (RC == ARM::DPRRegisterClass) { + BuildMI(MBB, I, TII.get(ARM::FSTD)).addReg(SrcReg, false, false, true) + .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0); + } else { + assert(RC == ARM::SPRRegisterClass && "Unknown regclass!"); + BuildMI(MBB, I, TII.get(ARM::FSTS)).addReg(SrcReg, false, false, true) + .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0); + } +} + +void ARMRegisterInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC) const { + if (RC == ARM::GPRRegisterClass) { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + if (AFI->isThumbFunction()) + BuildMI(MBB, I, TII.get(ARM::tRestore), DestReg) + .addFrameIndex(FI).addImm(0); + else + BuildMI(MBB, I, TII.get(ARM::LDR), DestReg) + .addFrameIndex(FI).addReg(0).addImm(0).addImm((int64_t)ARMCC::AL) + .addReg(0); + } else if (RC == ARM::DPRRegisterClass) { + BuildMI(MBB, I, TII.get(ARM::FLDD), DestReg) + .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0); + } else { + assert(RC == ARM::SPRRegisterClass && "Unknown regclass!"); + 
    BuildMI(MBB, I, TII.get(ARM::FLDS), DestReg)
      .addFrameIndex(FI).addImm(0).addImm((int64_t)ARMCC::AL).addReg(0);
  }
}

/// copyRegToReg - Emit a register-to-register copy within register class RC:
/// tMOVr/MOVr for GPRs (Thumb/ARM), FCPYS for SPR, FCPYD for DPR.  Aborts on
/// any other class.
void ARMRegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   unsigned DestReg, unsigned SrcReg,
                                   const TargetRegisterClass *RC) const {
  if (RC == ARM::GPRRegisterClass) {
    MachineFunction &MF = *MBB.getParent();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    if (AFI->isThumbFunction())
      BuildMI(MBB, I, TII.get(ARM::tMOVr), DestReg).addReg(SrcReg);
    else
      // Trailing operands: AL predicate, no pred-reg, no cc-out reg.
      BuildMI(MBB, I, TII.get(ARM::MOVr), DestReg).addReg(SrcReg)
        .addImm((int64_t)ARMCC::AL).addReg(0).addReg(0);
  } else if (RC == ARM::SPRRegisterClass)
    BuildMI(MBB, I, TII.get(ARM::FCPYS), DestReg).addReg(SrcReg)
      .addImm((int64_t)ARMCC::AL).addReg(0);
  else if (RC == ARM::DPRRegisterClass)
    BuildMI(MBB, I, TII.get(ARM::FCPYD), DestReg).addReg(SrcReg)
      .addImm((int64_t)ARMCC::AL).addReg(0);
  else
    abort();
}

/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.  A new i32 constant-pool entry (4-byte aligned) is
/// created for Val and loaded with tLDRcp (Thumb) or a predicated LDRcp (ARM).
static void emitLoadConstPool(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI,
                              unsigned DestReg, int Val,
                              ARMCC::CondCodes Pred, unsigned PredReg,
                              const TargetInstrInfo &TII, bool isThumb) {
  MachineFunction &MF = *MBB.getParent();
  MachineConstantPool *ConstantPool = MF.getConstantPool();
  Constant *C = ConstantInt::get(Type::Int32Ty, Val);
  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 2);
  if (isThumb)
    BuildMI(MBB, MBBI, TII.get(ARM::tLDRcp), DestReg).addConstantPoolIndex(Idx);
  else
    BuildMI(MBB, MBBI, TII.get(ARM::LDRcp), DestReg).addConstantPoolIndex(Idx)
      .addReg(0).addImm(0).addImm((unsigned)Pred).addReg(PredReg);
}

/// reMaterialize - Re-emit the instruction Orig before I, defining DestReg.
/// MOVi2pieces is special-cased: it is rematerialized as a constant-pool load
/// of its immediate operand rather than being cloned.
void ARMRegisterInfo::reMaterialize(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    unsigned DestReg,
                                    const MachineInstr *Orig) const {
  if (Orig->getOpcode() == ARM::MOVi2pieces) {
    emitLoadConstPool(MBB, I, DestReg,
                      Orig->getOperand(1).getImmedValue(),
                      (ARMCC::CondCodes)Orig->getOperand(2).getImmedValue(),
                      Orig->getOperand(3).getReg(),
                      TII, false);
    return;
  }

  MachineInstr *MI = Orig->clone();
  MI->getOperand(0).setReg(DestReg);
  MBB.insert(I, MI);
}

/// isLowRegister - Returns true if the register is low register r0-r7.
///
static bool isLowRegister(unsigned Reg) {
  using namespace ARM;
  switch (Reg) {
  case R0:  case R1:  case R2:  case R3:
  case R4:  case R5:  case R6:  case R7:
    return true;
  default:
    return false;
  }
}

/// foldMemoryOperand - Attempt to fold the stack slot FI into register-copy
/// instruction MI, producing a direct store (OpNum == 0, source folded) or
/// load (destination folded).  Returns the new instruction, or NULL if the
/// copy cannot be folded.
MachineInstr *ARMRegisterInfo::foldMemoryOperand(MachineInstr *MI,
                                                 unsigned OpNum, int FI) const {
  unsigned Opc = MI->getOpcode();
  MachineInstr *NewMI = NULL;
  switch (Opc) {
  default: break;
  case ARM::MOVr: {
    if (MI->getOperand(4).getReg() == ARM::CPSR)
      // If it is updating CPSR, then it cannot be folded.
      break;
    unsigned Pred = MI->getOperand(2).getImmedValue();
    unsigned PredReg = MI->getOperand(3).getReg();
    if (OpNum == 0) { // move -> store
      unsigned SrcReg = MI->getOperand(1).getReg();
      NewMI = BuildMI(TII.get(ARM::STR)).addReg(SrcReg).addFrameIndex(FI)
        .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
    } else {          // move -> load
      unsigned DstReg = MI->getOperand(0).getReg();
      NewMI = BuildMI(TII.get(ARM::LDR), DstReg).addFrameIndex(FI).addReg(0)
        .addImm(0).addImm(Pred).addReg(PredReg);
    }
    break;
  }
  case ARM::tMOVr: {
    if (OpNum == 0) { // move -> store
      unsigned SrcReg = MI->getOperand(1).getReg();
      if (isPhysicalRegister(SrcReg) && !isLowRegister(SrcReg))
        // tSpill cannot take a high register operand.
        break;
      NewMI = BuildMI(TII.get(ARM::tSpill)).addReg(SrcReg).addFrameIndex(FI)
        .addImm(0);
    } else {          // move -> load
      unsigned DstReg = MI->getOperand(0).getReg();
      if (isPhysicalRegister(DstReg) && !isLowRegister(DstReg))
        // tRestore cannot target a high register operand.
        break;
      NewMI = BuildMI(TII.get(ARM::tRestore), DstReg).addFrameIndex(FI)
        .addImm(0);
    }
    break;
  }
  case ARM::FCPYS: {
    // Single-precision VFP copy -> FSTS / FLDS, preserving the predicate.
    unsigned Pred = MI->getOperand(2).getImmedValue();
    unsigned PredReg = MI->getOperand(3).getReg();
    if (OpNum == 0) { // move -> store
      unsigned SrcReg = MI->getOperand(1).getReg();
      NewMI = BuildMI(TII.get(ARM::FSTS)).addReg(SrcReg).addFrameIndex(FI)
        .addImm(0).addImm(Pred).addReg(PredReg);
    } else {          // move -> load
      unsigned DstReg = MI->getOperand(0).getReg();
      NewMI = BuildMI(TII.get(ARM::FLDS), DstReg).addFrameIndex(FI)
        .addImm(0).addImm(Pred).addReg(PredReg);
    }
    break;
  }
  case ARM::FCPYD: {
    // Double-precision VFP copy -> FSTD / FLDD, preserving the predicate.
    unsigned Pred = MI->getOperand(2).getImmedValue();
    unsigned PredReg = MI->getOperand(3).getReg();
    if (OpNum == 0) { // move -> store
      unsigned SrcReg = MI->getOperand(1).getReg();
      NewMI = BuildMI(TII.get(ARM::FSTD)).addReg(SrcReg).addFrameIndex(FI)
        .addImm(0).addImm(Pred).addReg(PredReg);
    } else {          // move -> load
      unsigned DstReg = MI->getOperand(0).getReg();
      NewMI = BuildMI(TII.get(ARM::FLDD), DstReg).addFrameIndex(FI)
        .addImm(0).addImm(Pred).addReg(PredReg);
    }
    break;
  }
  }

  if (NewMI)
    NewMI->copyKillDeadInfo(MI);
  return NewMI;
}

/// getCalleeSavedRegs - Return the null-terminated list of callee-saved
/// registers.  Darwin orders R7/R6/R5/R4 ahead of R11-R8 (R7 is the Darwin
/// frame pointer); both lists end with the eight callee-saved VFP D-regs.
const unsigned* ARMRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
                                                                        const {
  static const unsigned CalleeSavedRegs[] = {
    ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
    ARM::R7, ARM::R6, ARM::R5, ARM::R4,

    ARM::D15, ARM::D14, ARM::D13, ARM::D12,
    ARM::D11, ARM::D10, ARM::D9,  ARM::D8,
    0
  };

  static const unsigned DarwinCalleeSavedRegs[] = {
    ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
    ARM::R11, ARM::R10, ARM::R9, ARM::R8,

    ARM::D15, ARM::D14, ARM::D13, ARM::D12,
    ARM::D11, ARM::D10, ARM::D9,  ARM::D8,
    0
  };
  return STI.isTargetDarwin() ?
DarwinCalleeSavedRegs : CalleeSavedRegs; +} + +const TargetRegisterClass* const * +ARMRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { + static const TargetRegisterClass * const CalleeSavedRegClasses[] = { + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass, + + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, + 0 + }; + return CalleeSavedRegClasses; +} + +BitVector ARMRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + // FIXME: avoid re-calculating this everytime. + BitVector Reserved(getNumRegs()); + Reserved.set(ARM::SP); + Reserved.set(ARM::PC); + if (STI.isTargetDarwin() || hasFP(MF)) + Reserved.set(FramePtr); + // Some targets reserve R9. + if (STI.isR9Reserved()) + Reserved.set(ARM::R9); + return Reserved; +} + +bool +ARMRegisterInfo::isReservedReg(const MachineFunction &MF, unsigned Reg) const { + switch (Reg) { + default: break; + case ARM::SP: + case ARM::PC: + return true; + case ARM::R7: + case ARM::R11: + if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF))) + return true; + break; + case ARM::R9: + return STI.isR9Reserved(); + } + + return false; +} + +bool +ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { + const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + return ThumbRegScavenging || !AFI->isThumbFunction(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. 
+/// +bool ARMRegisterInfo::hasFP(const MachineFunction &MF) const { + return NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects(); +} + +// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +// not required, we reserve argument space for call sites in the function +// immediately on entry to the current function. This eliminates the need for +// add/sub sp brackets around call sites. Returns true if the call frame is +// included as part of the stack frame. +bool ARMRegisterInfo::hasReservedCallFrame(MachineFunction &MF) const { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + unsigned CFSize = FFI->getMaxCallFrameSize(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + // It's not always a good idea to include the call frame as part of the + // stack frame. ARM (especially Thumb) has small immediate offset to + // address the stack frame. So a large call frame can cause poor codegen + // and may even makes it impossible to scavenge a register. + if (AFI->isThumbFunction()) { + if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4 + return false; + } else { + if (CFSize >= ((1 << 12) - 1) / 2) // Half of imm12 + return false; + } + return !hasFP(MF); +} + +/// emitARMRegPlusImmediate - Emits a series of instructions to materialize +/// a destreg = basereg + immediate in ARM code. +static +void emitARMRegPlusImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, unsigned BaseReg, int NumBytes, + ARMCC::CondCodes Pred, unsigned PredReg, + const TargetInstrInfo &TII) { + bool isSub = NumBytes < 0; + if (isSub) NumBytes = -NumBytes; + + while (NumBytes) { + unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes); + unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt); + assert(ThisVal && "Didn't extract field correctly"); + + // We will handle these bits from offset, clear them. + NumBytes &= ~ThisVal; + + // Get the properly encoded SOImmVal field. 
+ int SOImmVal = ARM_AM::getSOImmVal(ThisVal); + assert(SOImmVal != -1 && "Bit extraction didn't work?"); + + // Build the new ADD / SUB. + BuildMI(MBB, MBBI, TII.get(isSub ? ARM::SUBri : ARM::ADDri), DestReg) + .addReg(BaseReg, false, false, true).addImm(SOImmVal) + .addImm((unsigned)Pred).addReg(PredReg).addReg(0); + BaseReg = DestReg; + } +} + +/// calcNumMI - Returns the number of instructions required to materialize +/// the specific add / sub r, c instruction. +static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes, + unsigned NumBits, unsigned Scale) { + unsigned NumMIs = 0; + unsigned Chunk = ((1 << NumBits) - 1) * Scale; + + if (Opc == ARM::tADDrSPi) { + unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes; + Bytes -= ThisVal; + NumMIs++; + NumBits = 8; + Scale = 1; // Followed by a number of tADDi8. + Chunk = ((1 << NumBits) - 1) * Scale; + } + + NumMIs += Bytes / Chunk; + if ((Bytes % Chunk) != 0) + NumMIs++; + if (ExtraOpc) + NumMIs++; + return NumMIs; +} + +/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize +/// a destreg = basereg + immediate in Thumb code. Materialize the immediate +/// in a register using mov / mvn sequences or load the immediate from a +/// constpool entry. +static +void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned DestReg, unsigned BaseReg, + int NumBytes, bool CanChangeCC, + const TargetInstrInfo &TII) { + bool isHigh = !isLowRegister(DestReg) || + (BaseReg != 0 && !isLowRegister(BaseReg)); + bool isSub = false; + // Subtract doesn't have high register version. Load the negative value + // if either base or dest register is a high register. Also, if do not + // issue sub as part of the sequence if condition register is to be + // preserved. 
  if (NumBytes < 0 && !isHigh && CanChangeCC) {
    isSub = true;
    NumBytes = -NumBytes;
  }
  unsigned LdReg = DestReg;
  if (DestReg == ARM::SP) {
    // SP can't hold the scratch immediate; borrow R3, saving its old value
    // in R12 and restoring it after the add/sub below.
    assert(BaseReg == ARM::SP && "Unexpected!");
    LdReg = ARM::R3;
    BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), ARM::R12)
      .addReg(ARM::R3, false, false, true);
  }

  if (NumBytes <= 255 && NumBytes >= 0)
    BuildMI(MBB, MBBI, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
  else if (NumBytes < 0 && NumBytes >= -255) {
    // mov the magnitude then negate it in place.
    BuildMI(MBB, MBBI, TII.get(ARM::tMOVi8), LdReg).addImm(NumBytes);
    BuildMI(MBB, MBBI, TII.get(ARM::tNEG), LdReg)
      .addReg(LdReg, false, false, true);
  } else
    emitLoadConstPool(MBB, MBBI, LdReg, NumBytes, ARMCC::AL, 0, TII, true);

  // Emit add / sub.
  int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
  const MachineInstrBuilder MIB = BuildMI(MBB, MBBI, TII.get(Opc), DestReg);
  if (DestReg == ARM::SP || isSub)
    MIB.addReg(BaseReg).addReg(LdReg, false, false, true);
  else
    MIB.addReg(LdReg).addReg(BaseReg, false, false, true);
  if (DestReg == ARM::SP)
    BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), ARM::R3)
      .addReg(ARM::R12, false, false, true);
}

/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
/// a destreg = basereg + immediate in Thumb code.  Chooses between
/// tADDspi/tSUBspi (sp += imm), tADDrSPi (reg = sp + imm) and tADDi8/tSUBi8
/// encodings; falls back to emitThumbRegPlusImmInReg when the expansion
/// would exceed a small instruction-count threshold.
static
void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator &MBBI,
                               unsigned DestReg, unsigned BaseReg,
                               int NumBytes, const TargetInstrInfo &TII) {
  bool isSub = NumBytes < 0;
  unsigned Bytes = (unsigned)NumBytes;
  if (isSub) Bytes = -NumBytes;
  bool isMul4 = (Bytes & 3) == 0;
  bool isTwoAddr = false;
  bool DstNotEqBase = false;
  unsigned NumBits = 1;
  unsigned Scale = 1;
  int Opc = 0;
  int ExtraOpc = 0;

  if (DestReg == BaseReg && BaseReg == ARM::SP) {
    assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
    NumBits = 7;
    Scale = 4;
    Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
    isTwoAddr = true;
  } else if (!isSub && BaseReg == ARM::SP) {
    // r1 = add sp, 403
    // =>
    // r1 = add sp, 100 * 4
    // r1 = add r1, 3
    if (!isMul4) {
      Bytes &= ~3;
      ExtraOpc = ARM::tADDi3;
    }
    NumBits = 8;
    Scale = 4;
    Opc = ARM::tADDrSPi;
  } else {
    // sp = sub sp, c
    // r1 = sub sp, c
    // r8 = sub sp, c
    if (DestReg != BaseReg)
      DstNotEqBase = true;
    NumBits = 8;
    Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
    isTwoAddr = true;
  }

  unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
  unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
  if (NumMIs > Threshold) {
    // This will expand into too many instructions. Load the immediate from a
    // constpool entry.
    emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII);
    return;
  }

  if (DstNotEqBase) {
    if (isLowRegister(DestReg) && isLowRegister(BaseReg)) {
      // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
      unsigned Chunk = (1 << 3) - 1;
      unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
      Bytes -= ThisVal;
      BuildMI(MBB, MBBI, TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3), DestReg)
        .addReg(BaseReg, false, false, true).addImm(ThisVal);
    } else {
      BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), DestReg)
        .addReg(BaseReg, false, false, true);
    }
    BaseReg = DestReg;
  }

  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
  while (Bytes) {
    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
    Bytes -= ThisVal;
    ThisVal /= Scale;
    // Build the new tADD / tSUB.
    if (isTwoAddr)
      BuildMI(MBB, MBBI, TII.get(Opc), DestReg).addReg(DestReg).addImm(ThisVal);
    else {
      bool isKill = BaseReg != ARM::SP;
      BuildMI(MBB, MBBI, TII.get(Opc), DestReg)
        .addReg(BaseReg, false, false, isKill).addImm(ThisVal);
      BaseReg = DestReg;

      if (Opc == ARM::tADDrSPi) {
        // r4 = add sp, imm
        // r4 = add r4, imm
        // ...
        NumBits = 8;
        Scale = 1;
        Chunk = ((1 << NumBits) - 1) * Scale;
        Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
        isTwoAddr = true;
      }
    }
  }

  if (ExtraOpc)
    // Fold the bottom two bits (dropped above to keep multiples of 4) back in.
    BuildMI(MBB, MBBI, TII.get(ExtraOpc), DestReg)
      .addReg(DestReg, false, false, true)
      .addImm(((unsigned)NumBytes) & 3);
}

/// emitSPUpdate - Adjust SP by NumBytes, dispatching to the Thumb or ARM
/// materialization helper.  Pred/PredReg only apply in ARM mode.
static
void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
                  int NumBytes, ARMCC::CondCodes Pred, unsigned PredReg,
                  bool isThumb, const TargetInstrInfo &TII) {
  if (isThumb)
    emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII);
  else
    emitARMRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes,
                            Pred, PredReg, TII);
}

/// eliminateCallFramePseudoInstr - Replace (or simply erase, when the call
/// frame is reserved) the ADJCALLSTACKDOWN/UP pseudos with explicit SP
/// adjustments, rounding the amount up to the stack alignment.
void ARMRegisterInfo::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const {
  if (!hasReservedCallFrame(MF)) {
    // If we have alloca, convert as follows:
    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
    // ADJCALLSTACKUP   -> add, sp, sp, amount
    MachineInstr *Old = I;
    unsigned Amount = Old->getOperand(0).getImmedValue();
    if (Amount != 0) {
      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
      // We need to keep the stack aligned properly.  To do this, we round the
      // amount of space needed for the outgoing arguments up to the next
      // alignment boundary.
      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
      Amount = (Amount+Align-1)/Align*Align;

      // Replace the pseudo instruction with a new instruction...
      unsigned Opc = Old->getOpcode();
      bool isThumb = AFI->isThumbFunction();
      ARMCC::CondCodes Pred = isThumb
        ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(1).getImmedValue();
      unsigned PredReg = isThumb ?
        0 : Old->getOperand(2).getReg();
      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
        emitSPUpdate(MBB, I, -Amount, Pred, PredReg, isThumb, TII);
      } else {
        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
        emitSPUpdate(MBB, I, Amount, Pred, PredReg, isThumb, TII);
      }
    }
  }
  MBB.erase(I);
}

/// emitThumbConstant - Emit a series of instructions to materialize a
/// constant: tMOVi8 of the first (up to) 8-bit chunk, an add sequence for any
/// remainder, and a final tNEG for negative values.
static void emitThumbConstant(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI,
                              unsigned DestReg, int Imm,
                              const TargetInstrInfo &TII) {
  bool isSub = Imm < 0;
  if (isSub) Imm = -Imm;

  int Chunk = (1 << 8) - 1;
  int ThisVal = (Imm > Chunk) ? Chunk : Imm;
  Imm -= ThisVal;
  BuildMI(MBB, MBBI, TII.get(ARM::tMOVi8), DestReg).addImm(ThisVal);
  if (Imm > 0)
    emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII);
  if (isSub)
    BuildMI(MBB, MBBI, TII.get(ARM::tNEG), DestReg)
      .addReg(DestReg, false, false, true);
}

/// findScratchRegister - Find a 'free' ARM register. If register scavenger
/// is not being used, R12 is available. Otherwise, try for a call-clobbered
/// register first and then a spilled callee-saved register if that fails.
static
unsigned findScratchRegister(RegScavenger *RS, const TargetRegisterClass *RC,
                             ARMFunctionInfo *AFI) {
  unsigned Reg = RS ? RS->FindUnusedReg(RC, true) : (unsigned) ARM::R12;
  if (Reg == 0)
    // Try an already-spilled callee-saved register.
    Reg = RS->FindUnusedReg(RC, AFI->getSpilledCSRegisters());

  return Reg;
}

/// eliminateFrameIndex - Rewrite the abstract frame-index operand of the
/// instruction at II into a concrete base register (SP or FP) plus immediate
/// offset, emitting extra instructions when the offset doesn't fit the
/// instruction's addressing-mode encoding.
void ARMRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                          int SPAdj, RegScavenger *RS) const{
  unsigned i = 0;
  MachineInstr &MI = *II;
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  bool isThumb = AFI->isThumbFunction();

  // Locate the frame-index operand.
  while (!MI.getOperand(i).isFrameIndex()) {
    ++i;
    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
  }

  unsigned FrameReg = ARM::SP;
  int FrameIndex = MI.getOperand(i).getFrameIndex();
  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
               MF.getFrameInfo()->getStackSize() + SPAdj;

  // Slots inside a callee-saved spill area are addressed relative to that
  // area's own base offset.
  if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
    Offset -= AFI->getGPRCalleeSavedArea1Offset();
  else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
    Offset -= AFI->getGPRCalleeSavedArea2Offset();
  else if (AFI->isDPRCalleeSavedAreaFrame(FrameIndex))
    Offset -= AFI->getDPRCalleeSavedAreaOffset();
  else if (hasFP(MF)) {
    assert(SPAdj == 0 && "Unexpected");
    // There is alloca()'s in this function, must reference off the frame
    // pointer instead.
    FrameReg = getFrameRegister(MF);
    Offset -= AFI->getFramePtrSpillOffset();
  }

  unsigned Opcode = MI.getOpcode();
  const TargetInstrDescriptor &Desc = TII.get(Opcode);
  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
  bool isSub = false;

  if (Opcode == ARM::ADDri) {
    Offset += MI.getOperand(i+1).getImm();
    if (Offset == 0) {
      // Turn it into a move.
      MI.setInstrDescriptor(TII.get(ARM::MOVr));
      MI.getOperand(i).ChangeToRegister(FrameReg, false);
      MI.RemoveOperand(i+1);
      return;
    } else if (Offset < 0) {
      Offset = -Offset;
      isSub = true;
      MI.setInstrDescriptor(TII.get(ARM::SUBri));
    }

    // Common case: small offset, fits into instruction.
    int ImmedOffset = ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset != -1) {
      // Replace the FrameIndex with sp / fp
      MI.getOperand(i).ChangeToRegister(FrameReg, false);
      MI.getOperand(i+1).ChangeToImmediate(ImmedOffset);
      return;
    }

    // Otherwise, we fallback to common code below to form the imm offset with
    // a sequence of ADDri instructions.  First though, pull as much of the imm
    // into this ADDri as possible.
    unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);

    // We will handle these bits from offset, clear them.
    Offset &= ~ThisImmVal;

    // Get the properly encoded SOImmVal field.
    int ThisSOImmVal = ARM_AM::getSOImmVal(ThisImmVal);
    assert(ThisSOImmVal != -1 && "Bit extraction didn't work?");
    MI.getOperand(i+1).ChangeToImmediate(ThisSOImmVal);
  } else if (Opcode == ARM::tADDrSPi) {
    Offset += MI.getOperand(i+1).getImm();

    // Can't use tADDrSPi if it's based off the frame pointer.
    unsigned NumBits = 0;
    unsigned Scale = 1;
    if (FrameReg != ARM::SP) {
      Opcode = ARM::tADDi3;
      MI.setInstrDescriptor(TII.get(ARM::tADDi3));
      NumBits = 3;
    } else {
      NumBits = 8;
      Scale = 4;
      assert((Offset & 3) == 0 &&
             "Thumb add/sub sp, #imm immediate must be multiple of 4!");
    }

    if (Offset == 0) {
      // Turn it into a move.
      MI.setInstrDescriptor(TII.get(ARM::tMOVr));
      MI.getOperand(i).ChangeToRegister(FrameReg, false);
      MI.RemoveOperand(i+1);
      return;
    }

    // Common case: small offset, fits into instruction.
    unsigned Mask = (1 << NumBits) - 1;
    if (((Offset / Scale) & ~Mask) == 0) {
      // Replace the FrameIndex with sp / fp
      MI.getOperand(i).ChangeToRegister(FrameReg, false);
      MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
      return;
    }

    unsigned DestReg = MI.getOperand(0).getReg();
    unsigned Bytes = (Offset > 0) ? Offset : -Offset;
    unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
    // MI would expand into a large number of instructions. Don't try to
    // simplify the immediate.
    if (NumMIs > 2) {
      emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII);
      MBB.erase(II);
      return;
    }

    if (Offset > 0) {
      // Translate r0 = add sp, imm to
      // r0 = add sp, 255*4
      // r0 = add r0, (imm - 255*4)
      MI.getOperand(i).ChangeToRegister(FrameReg, false);
      MI.getOperand(i+1).ChangeToImmediate(Mask);
      Offset = (Offset - Mask * Scale);
      MachineBasicBlock::iterator NII = next(II);
      emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII);
    } else {
      // Translate r0 = add sp, -imm to
      // r0 = -imm (this is then translated into a series of instructons)
      // r0 = add r0, sp
      emitThumbConstant(MBB, II, DestReg, Offset, TII);
      MI.setInstrDescriptor(TII.get(ARM::tADDhirr));
      MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
      MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
    }
    return;
  } else {
    // Generic case: decode the instruction's addressing mode to find the
    // immediate operand, its width, and its scale.
    unsigned ImmIdx = 0;
    int InstrOffs = 0;
    unsigned NumBits = 0;
    unsigned Scale = 1;
    switch (AddrMode) {
    case ARMII::AddrMode2: {
      ImmIdx = i+2;
      InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
      if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
        InstrOffs *= -1;
      NumBits = 12;
      break;
    }
    case ARMII::AddrMode3: {
      ImmIdx = i+2;
      InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
      if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
        InstrOffs *= -1;
      NumBits = 8;
      break;
    }
    case ARMII::AddrMode5: {
      ImmIdx = i+1;
      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
        InstrOffs *= -1;
      NumBits = 8;
      Scale = 4;
      break;
    }
    case ARMII::AddrModeTs: {
      ImmIdx = i+1;
      InstrOffs = MI.getOperand(ImmIdx).getImm();
      NumBits = (FrameReg == ARM::SP) ? 8 : 5;
      Scale = 4;
      break;
    }
    default:
      assert(0 && "Unsupported addressing mode!");
      abort();
      break;
    }

    Offset += InstrOffs * Scale;
    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
    if (Offset < 0 && !isThumb) {
      Offset = -Offset;
      isSub = true;
    }

    // Common case: small offset, fits into instruction.
    MachineOperand &ImmOp = MI.getOperand(ImmIdx);
    int ImmedOffset = Offset / Scale;
    unsigned Mask = (1 << NumBits) - 1;
    if ((unsigned)Offset <= Mask * Scale) {
      // Replace the FrameIndex with sp
      MI.getOperand(i).ChangeToRegister(FrameReg, false);
      if (isSub)
        ImmedOffset |= 1 << NumBits;
      ImmOp.ChangeToImmediate(ImmedOffset);
      return;
    }

    bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
    if (AddrMode == ARMII::AddrModeTs) {
      // Thumb tLDRspi, tSTRspi. These will change to instructions that use
      // a different base register.
      NumBits = 5;
      Mask = (1 << NumBits) - 1;
    }
    // If this is a thumb spill / restore, we will be using a constpool load to
    // materialize the offset.
    if (AddrMode == ARMII::AddrModeTs && isThumSpillRestore)
      ImmOp.ChangeToImmediate(0);
    else {
      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
      ImmedOffset = ImmedOffset & Mask;
      if (isSub)
        ImmedOffset |= 1 << NumBits;
      ImmOp.ChangeToImmediate(ImmedOffset);
      Offset &= ~(Mask*Scale);
    }
  }

  // If we get here, the immediate doesn't fit into the instruction.  We folded
  // as much as possible above, handle the rest, providing a register that is
  // SP+LargeImm.
  assert(Offset && "This code isn't needed if offset already handled!");

  if (isThumb) {
    if (TII.isLoad(Opcode)) {
      // Use the destination register to materialize sp + offset.
      unsigned TmpReg = MI.getOperand(0).getReg();
      bool UseRR = false;
      if (Opcode == ARM::tRestore) {
        if (FrameReg == ARM::SP)
          emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,Offset,false,TII);
        else {
          emitLoadConstPool(MBB, II, TmpReg, Offset, ARMCC::AL, 0, TII, true);
          UseRR = true;
        }
      } else
        emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII);
      MI.setInstrDescriptor(TII.get(ARM::tLDR));
      MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
      if (UseRR)
        MI.addRegOperand(FrameReg, false);  // Use [reg, reg] addrmode.
      else
        MI.addRegOperand(0, false); // tLDR has an extra register operand.
    } else if (TII.isStore(Opcode)) {
      // FIXME! This is horrific!!! We need register scavenging.
      // Our temporary workaround has marked r3 unavailable. Of course, r3 is
      // also an ABI register so it's possible that it is the register being
      // stored here. If that's the case, we do the following:
      // r12 = r2
      // Use r2 to materialize sp + offset
      // str r3, r2
      // r2 = r12
      unsigned ValReg = MI.getOperand(0).getReg();
      unsigned TmpReg = ARM::R3;
      bool UseRR = false;
      if (ValReg == ARM::R3) {
        BuildMI(MBB, II, TII.get(ARM::tMOVr), ARM::R12)
          .addReg(ARM::R2, false, false, true);
        TmpReg = ARM::R2;
      }
      if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
        BuildMI(MBB, II, TII.get(ARM::tMOVr), ARM::R12)
          .addReg(ARM::R3, false, false, true);
      if (Opcode == ARM::tSpill) {
        if (FrameReg == ARM::SP)
          emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,Offset,false,TII);
        else {
          emitLoadConstPool(MBB, II, TmpReg, Offset, ARMCC::AL, 0, TII, true);
          UseRR = true;
        }
      } else
        emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII);
      MI.setInstrDescriptor(TII.get(ARM::tSTR));
      MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
      if (UseRR)
        MI.addRegOperand(FrameReg, false);  // Use [reg, reg] addrmode.
      else
        MI.addRegOperand(0, false); // tSTR has an extra register operand.

      // Restore any registers we borrowed above.
      MachineBasicBlock::iterator NII = next(II);
      if (ValReg == ARM::R3)
        BuildMI(MBB, NII, TII.get(ARM::tMOVr), ARM::R2)
          .addReg(ARM::R12, false, false, true);
      if (TmpReg == ARM::R3 && AFI->isR3LiveIn())
        BuildMI(MBB, NII, TII.get(ARM::tMOVr), ARM::R3)
          .addReg(ARM::R12, false, false, true);
    } else
      assert(false && "Unexpected opcode!");
  } else {
    // Insert a set of r12 with the full address: r12 = sp + offset
    // If the offset we have is too large to fit into the instruction, we need
    // to form it with a series of ADDri's.  Do this by taking 8-bit chunks
    // out of 'Offset'.
    unsigned ScratchReg = findScratchRegister(RS, &ARM::GPRRegClass, AFI);
    if (ScratchReg == 0)
      // No register is "free". Scavenge a register.
      ScratchReg = RS->scavengeRegister(&ARM::GPRRegClass, II, SPAdj);
    int PIdx = MI.findFirstPredOperandIdx();
    ARMCC::CondCodes Pred = (PIdx == -1)
      ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImmedValue();
    unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
    emitARMRegPlusImmediate(MBB, II, ScratchReg, FrameReg,
                            isSub ? -Offset : Offset, Pred, PredReg, TII);
    MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
  }
}

/// estimateStackSize - Conservative estimate of the eventual stack frame
/// size: the largest fixed-object offset plus the aligned sum of all
/// variable-object sizes.  (The MFI parameter is unused; the frame info is
/// re-fetched from MF.)
static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
  const MachineFrameInfo *FFI = MF.getFrameInfo();
  int Offset = 0;
  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
    int FixedOff = -FFI->getObjectOffset(i);
    if (FixedOff > Offset) Offset = FixedOff;
  }
  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
    Offset += FFI->getObjectSize(i);
    unsigned Align = FFI->getObjectAlignment(i);
    // Adjust to alignment boundary
    Offset = (Offset+Align-1)/Align*Align;
  }
  return (unsigned)Offset;
}

void
ARMRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                      RegScavenger *RS) const {
  // This tells PEI to spill the FP as if it is any other callee-save register
  // to take advantage the eliminateFrameIndex machinery. This also ensures it
  // is spilled in the order specified by getCalleeSavedRegs() to make it easier
  // to combine multiple loads / stores.
  bool CanEliminateFrame = true;
  bool CS1Spilled = false;
  bool LRSpilled = false;
  unsigned NumGPRSpills = 0;
  SmallVector<unsigned, 4> UnspilledCS1GPRs;
  SmallVector<unsigned, 4> UnspilledCS2GPRs;
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();

  // Don't spill FP if the frame can be eliminated. This is determined
  // by scanning the callee-save registers to see if any is used.
  const unsigned *CSRegs = getCalleeSavedRegs();
  const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
  for (unsigned i = 0; CSRegs[i]; ++i) {
    unsigned Reg = CSRegs[i];
    bool Spilled = false;
    if (MF.isPhysRegUsed(Reg)) {
      AFI->setCSRegisterIsSpilled(Reg);
      Spilled = true;
      CanEliminateFrame = false;
    } else {
      // Check alias registers too.
      for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
        if (MF.isPhysRegUsed(*Aliases)) {
          Spilled = true;
          CanEliminateFrame = false;
        }
      }
    }

    // Classify each spilled / unspilled GPR into callee-save area 1 or 2.
    // On Darwin, R4-R7 and LR form area 1 and R8-R11 area 2; elsewhere all
    // GPRs live in a single area 1.
    if (CSRegClasses[i] == &ARM::GPRRegClass) {
      if (Spilled) {
        NumGPRSpills++;

        if (!STI.isTargetDarwin()) {
          if (Reg == ARM::LR)
            LRSpilled = true;
          CS1Spilled = true;
          continue;
        }

        // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
        switch (Reg) {
        case ARM::LR:
          LRSpilled = true;
          // Fallthrough
        case ARM::R4:
        case ARM::R5:
        case ARM::R6:
        case ARM::R7:
          CS1Spilled = true;
          break;
        default:
          break;
        }
      } else {
        if (!STI.isTargetDarwin()) {
          UnspilledCS1GPRs.push_back(Reg);
          continue;
        }

        switch (Reg) {
        case ARM::R4:
        case ARM::R5:
        case ARM::R6:
        case ARM::R7:
        case ARM::LR:
          UnspilledCS1GPRs.push_back(Reg);
          break;
        default:
          UnspilledCS2GPRs.push_back(Reg);
          break;
        }
      }
    }
  }

  bool ForceLRSpill = false;
  if (!LRSpilled && AFI->isThumbFunction()) {
    unsigned FnSize = ARM::GetFunctionSize(MF);
    // Force LR to be spilled if the Thumb function size is > 2048. This enables
    // use of BL to implement far jump. If it turns out that it's not needed
    // then the branch fix up path will undo it.
    if (FnSize >= (1 << 11)) {
      CanEliminateFrame = false;
      ForceLRSpill = true;
    }
  }

  bool ExtraCSSpill = false;
  if (!CanEliminateFrame || hasFP(MF)) {
    AFI->setHasStackFrame(true);

    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
    // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
    if (!LRSpilled && CS1Spilled) {
      MF.setPhysRegUsed(ARM::LR);
      AFI->setCSRegisterIsSpilled(ARM::LR);
      NumGPRSpills++;
      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
                                       UnspilledCS1GPRs.end(),
                                       (unsigned)ARM::LR));
      ForceLRSpill = false;
      ExtraCSSpill = true;
    }

    // Darwin ABI requires FP to point to the stack slot that contains the
    // previous FP.
    if (STI.isTargetDarwin() || hasFP(MF)) {
      MF.setPhysRegUsed(FramePtr);
      NumGPRSpills++;
    }

    // If stack and double are 8-byte aligned and we are spilling an odd number
    // of GPRs. Spill one extra callee save GPR so we won't have to pad between
    // the integer and double callee save areas.
    unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
          unsigned Reg = UnspilledCS1GPRs[i];
          // Don't spill a high register if the function is Thumb.
          if (!AFI->isThumbFunction() || isLowRegister(Reg) || Reg == ARM::LR) {
            MF.setPhysRegUsed(Reg);
            AFI->setCSRegisterIsSpilled(Reg);
            if (!isReservedReg(MF, Reg))
              ExtraCSSpill = true;
            break;
          }
        }
      } else if (!UnspilledCS2GPRs.empty() &&
                 !AFI->isThumbFunction()) {
        unsigned Reg = UnspilledCS2GPRs.front();
        MF.setPhysRegUsed(Reg);
        AFI->setCSRegisterIsSpilled(Reg);
        if (!isReservedReg(MF, Reg))
          ExtraCSSpill = true;
      }
    }

    // Estimate if we might need to scavenge a register at some point in order
    // to materialize a stack offset. If so, either spill one additional
    // callee-saved register or reserve a special spill slot to facilitate
    // register scavenging.
    if (RS && !ExtraCSSpill && !AFI->isThumbFunction()) {
      MachineFrameInfo *MFI = MF.getFrameInfo();
      unsigned Size = estimateStackSize(MF, MFI);
      // Default limit is the 12-bit immediate of AddrMode2 (LDR/STR); shrink
      // it if any frame access uses a more restrictive addressing mode.
      unsigned Limit = (1 << 12) - 1;
      for (MachineFunction::iterator BB = MF.begin(),E = MF.end();BB != E; ++BB)
        for (MachineBasicBlock::iterator I= BB->begin(); I != BB->end(); ++I) {
          for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
            if (I->getOperand(i).isFrameIndex()) {
              unsigned Opcode = I->getOpcode();
              const TargetInstrDescriptor &Desc = TII.get(Opcode);
              unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
              if (AddrMode == ARMII::AddrMode3) {
                // 8-bit offset: the tightest mode seen; stop scanning.
                Limit = (1 << 8) - 1;
                goto DoneEstimating;
              } else if (AddrMode == ARMII::AddrMode5) {
                unsigned ThisLimit = ((1 << 8) - 1) * 4;
                if (ThisLimit < Limit)
                  Limit = ThisLimit;
              }
            }
        }
    DoneEstimating:
      if (Size >= Limit) {
        // If any non-reserved CS register isn't spilled, just spill one or two
        // extra. That should take care of it!
        unsigned NumExtras = TargetAlign / 4;
        SmallVector<unsigned, 2> Extras;
        while (NumExtras && !UnspilledCS1GPRs.empty()) {
          unsigned Reg = UnspilledCS1GPRs.back();
          UnspilledCS1GPRs.pop_back();
          if (!isReservedReg(MF, Reg)) {
            Extras.push_back(Reg);
            NumExtras--;
          }
        }
        while (NumExtras && !UnspilledCS2GPRs.empty()) {
          unsigned Reg = UnspilledCS2GPRs.back();
          UnspilledCS2GPRs.pop_back();
          if (!isReservedReg(MF, Reg)) {
            Extras.push_back(Reg);
            NumExtras--;
          }
        }
        if (Extras.size() && NumExtras == 0) {
          for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
            MF.setPhysRegUsed(Extras[i]);
            AFI->setCSRegisterIsSpilled(Extras[i]);
          }
        } else {
          // Reserve a slot closest to SP or frame pointer.
          const TargetRegisterClass *RC = &ARM::GPRRegClass;
          RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
                                                             RC->getAlignment()));
        }
      }
    }
  }

  if (ForceLRSpill) {
    MF.setPhysRegUsed(ARM::LR);
    AFI->setCSRegisterIsSpilled(ARM::LR);
    AFI->setLRIsSpilledForFarJump(true);
  }
}

/// Move iterator past the next bunch of callee save load / store ops for
/// the particular spill area (1: integer area 1, 2: integer area 2,
/// 3: fp area, 0: don't care).
static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator &MBBI,
                                   int Opc, unsigned Area,
                                   const ARMSubtarget &STI) {
  while (MBBI != MBB.end() &&
         MBBI->getOpcode() == Opc && MBBI->getOperand(1).isFrameIndex()) {
    if (Area != 0) {
      bool Done = false;
      unsigned Category = 0;
      switch (MBBI->getOperand(0).getReg()) {
      case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7:
      case ARM::LR:
        Category = 1;
        break;
      case ARM::R8: case ARM::R9: case ARM::R10: case ARM::R11:
        // R8-R11 belong to area 2 only on Darwin; otherwise area 1.
        Category = STI.isTargetDarwin() ? 2 : 1;
        break;
      case ARM::D8: case ARM::D9: case ARM::D10: case ARM::D11:
      case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
        Category = 3;
        break;
      default:
        Done = true;
        break;
      }
      if (Done || Category != Area)
        break;
    }

    ++MBBI;
  }
}

void ARMRegisterInfo::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  bool isThumb = AFI->isThumbFunction();
  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
  unsigned NumBytes = MFI->getStackSize();
  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();

  if (isThumb) {
    // Check if R3 is live in. It might have to be used as a scratch register.
+ for (MachineFunction::livein_iterator I=MF.livein_begin(),E=MF.livein_end(); + I != E; ++I) { + if ((*I).first == ARM::R3) { + AFI->setR3IsLiveIn(true); + break; + } + } + + // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4. + NumBytes = (NumBytes + 3) & ~3; + MFI->setStackSize(NumBytes); + } + + // Determine the sizes of each callee-save spill areas and record which frame + // belongs to which callee-save spill areas. + unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; + int FramePtrSpillFI = 0; + + if (VARegSaveSize) + emitSPUpdate(MBB, MBBI, -VARegSaveSize, ARMCC::AL, 0, isThumb, TII); + + if (!AFI->hasStackFrame()) { + if (NumBytes != 0) + emitSPUpdate(MBB, MBBI, -NumBytes, ARMCC::AL, 0, isThumb, TII); + return; + } + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + int FI = CSI[i].getFrameIdx(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + break; + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + if (Reg == FramePtr) + FramePtrSpillFI = FI; + if (STI.isTargetDarwin()) { + AFI->addGPRCalleeSavedArea2Frame(FI); + GPRCS2Size += 4; + } else { + AFI->addGPRCalleeSavedArea1Frame(FI); + GPRCS1Size += 4; + } + break; + default: + AFI->addDPRCalleeSavedAreaFrame(FI); + DPRCSSize += 8; + } + } + + if (!isThumb) { + // Build the new SUBri to adjust SP for integer callee-save spill area 1. + emitSPUpdate(MBB, MBBI, -GPRCS1Size, ARMCC::AL, 0, isThumb, TII); + movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 1, STI); + } else if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) + ++MBBI; + + // Darwin ABI requires FP to point to the stack slot that contains the + // previous FP. + if (STI.isTargetDarwin() || hasFP(MF)) { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, TII.get(isThumb ? 
ARM::tADDrSPi : ARM::ADDri),FramePtr) + .addFrameIndex(FramePtrSpillFI).addImm(0); + if (!isThumb) MIB.addImm(ARMCC::AL).addReg(0).addReg(0); + } + + if (!isThumb) { + // Build the new SUBri to adjust SP for integer callee-save spill area 2. + emitSPUpdate(MBB, MBBI, -GPRCS2Size, ARMCC::AL, 0, false, TII); + + // Build the new SUBri to adjust SP for FP callee-save spill area. + movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, 2, STI); + emitSPUpdate(MBB, MBBI, -DPRCSSize, ARMCC::AL, 0, false, TII); + } + + // Determine starting offsets of spill areas. + unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); + unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; + unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); + AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); + AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + + NumBytes = DPRCSOffset; + if (NumBytes) { + // Insert it after all the callee-save spills. 
    if (!isThumb)
      movePastCSLoadStoreOps(MBB, MBBI, ARM::FSTD, 3, STI);
    emitSPUpdate(MBB, MBBI, -NumBytes, ARMCC::AL, 0, isThumb, TII);
  }

  if(STI.isTargetELF() && hasFP(MF)) {
    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
                             AFI->getFramePtrSpillOffset());
  }

  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
}

/// isCalleeSavedRegister - Return true if Reg appears in the null-terminated
/// array CSRegs of callee-saved register numbers.
static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
  for (unsigned i = 0; CSRegs[i]; ++i)
    if (Reg == CSRegs[i])
      return true;
  return false;
}

/// isCSRestore - Return true if MI is a load (FLDD / LDR / tRestore) of a
/// callee-saved register from a frame-index slot, i.e. part of the epilogue's
/// callee-save restore sequence.
static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
  return ((MI->getOpcode() == ARM::FLDD ||
           MI->getOpcode() == ARM::LDR ||
           MI->getOpcode() == ARM::tRestore) &&
          MI->getOperand(1).isFrameIndex() &&
          isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs));
}

void ARMRegisterInfo::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  assert((MBBI->getOpcode() == ARM::BX_RET ||
          MBBI->getOpcode() == ARM::tBX_RET ||
          MBBI->getOpcode() == ARM::tPOP_RET) &&
         "Can only insert epilog into returning blocks");

  MachineFrameInfo *MFI = MF.getFrameInfo();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  bool isThumb = AFI->isThumbFunction();
  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
  int NumBytes = (int)MFI->getStackSize();
  if (!AFI->hasStackFrame()) {
    if (NumBytes != 0)
      emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, isThumb, TII);
  } else {
    // Unwind MBBI to point to first LDR / FLDD.
    const unsigned *CSRegs = getCalleeSavedRegs();
    if (MBBI != MBB.begin()) {
      do
        --MBBI;
      while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
      if (!isCSRestore(MBBI, CSRegs))
        ++MBBI;
    }

    // Move SP to start of FP callee save spill area.
+ NumBytes -= (AFI->getGPRCalleeSavedArea1Size() + + AFI->getGPRCalleeSavedArea2Size() + + AFI->getDPRCalleeSavedAreaSize()); + if (isThumb) { + if (hasFP(MF)) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (NumBytes) + emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes, TII); + else + BuildMI(MBB, MBBI, TII.get(ARM::tMOVr), ARM::SP).addReg(FramePtr); + } else { + if (MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && + prior(MBBI)->getOpcode() == ARM::tPOP) { + MachineBasicBlock::iterator PMBBI = prior(MBBI); + emitSPUpdate(MBB, PMBBI, NumBytes, ARMCC::AL, 0, isThumb, TII); + } else + emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, isThumb, TII); + } + } else { + // Darwin ABI requires FP to point to the stack slot that contains the + // previous FP. + if ((STI.isTargetDarwin() && NumBytes) || hasFP(MF)) { + NumBytes = AFI->getFramePtrSpillOffset() - NumBytes; + // Reset SP based on frame pointer only if the stack frame extends beyond + // frame pointer stack slot or target is ELF and the function has FP. + if (AFI->getGPRCalleeSavedArea2Size() || + AFI->getDPRCalleeSavedAreaSize() || + AFI->getDPRCalleeSavedAreaOffset()|| + hasFP(MF)) + if (NumBytes) + BuildMI(MBB, MBBI, TII.get(ARM::SUBri), ARM::SP).addReg(FramePtr) + .addImm(NumBytes) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + else + BuildMI(MBB, MBBI, TII.get(ARM::MOVr), ARM::SP).addReg(FramePtr) + .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); + } else if (NumBytes) { + emitSPUpdate(MBB, MBBI, NumBytes, ARMCC::AL, 0, false, TII); + } + + // Move SP to start of integer callee save spill area 2. + movePastCSLoadStoreOps(MBB, MBBI, ARM::FLDD, 3, STI); + emitSPUpdate(MBB, MBBI, AFI->getDPRCalleeSavedAreaSize(), ARMCC::AL, 0, + false, TII); + + // Move SP to start of integer callee save spill area 1. 
+ movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 2, STI); + emitSPUpdate(MBB, MBBI, AFI->getGPRCalleeSavedArea2Size(), ARMCC::AL, 0, + false, TII); + + // Move SP to SP upon entry to the function. + movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, 1, STI); + emitSPUpdate(MBB, MBBI, AFI->getGPRCalleeSavedArea1Size(), ARMCC::AL, 0, + false, TII); + } + } + + if (VARegSaveSize) { + if (isThumb) + // Epilogue for vararg functions: pop LR to R3 and branch off it. + // FIXME: Verify this is still ok when R3 is no longer being reserved. + BuildMI(MBB, MBBI, TII.get(ARM::tPOP)).addReg(ARM::R3); + + emitSPUpdate(MBB, MBBI, VARegSaveSize, ARMCC::AL, 0, isThumb, TII); + + if (isThumb) { + BuildMI(MBB, MBBI, TII.get(ARM::tBX_RET_vararg)).addReg(ARM::R3); + MBB.erase(MBBI); + } + } +} + +unsigned ARMRegisterInfo::getRARegister() const { + return ARM::LR; +} + +unsigned ARMRegisterInfo::getFrameRegister(MachineFunction &MF) const { + if (STI.isTargetDarwin() || hasFP(MF)) + return (STI.useThumbBacktraces() || STI.isThumb()) ? ARM::R7 : ARM::R11; + else + return ARM::SP; +} + +unsigned ARMRegisterInfo::getEHExceptionRegister() const { + assert(0 && "What is the exception register"); + return 0; +} + +unsigned ARMRegisterInfo::getEHHandlerRegister() const { + assert(0 && "What is the exception handler register"); + return 0; +} + +#include "ARMGenRegisterInfo.inc" + diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h new file mode 100644 index 0000000..3db1d89 --- /dev/null +++ b/lib/Target/ARM/ARMRegisterInfo.h @@ -0,0 +1,108 @@ +//===- ARMRegisterInfo.h - ARM Register Information Impl --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the ARM implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMREGISTERINFO_H +#define ARMREGISTERINFO_H + +#include "llvm/Target/MRegisterInfo.h" +#include "ARMGenRegisterInfo.h.inc" + +namespace llvm { + class ARMSubtarget; + class TargetInstrInfo; + class Type; + +struct ARMRegisterInfo : public ARMGenRegisterInfo { + const TargetInstrInfo &TII; + const ARMSubtarget &STI; +private: + /// FramePtr - ARM physical register used as frame ptr. + unsigned FramePtr; + +public: + ARMRegisterInfo(const TargetInstrInfo &tii, const ARMSubtarget &STI); + + /// getRegisterNumbering - Given the enum value for some register, e.g. + /// ARM::LR, return the number that it corresponds to (e.g. 14). + static unsigned getRegisterNumbering(unsigned RegEnum); + + /// Code Generation virtual methods... + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI) const; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, const MachineInstr *Orig) const; + + MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum, + int FrameIndex) const; + + 
const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + const TargetRegisterClass* const* getCalleeSavedRegClasses( + const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + + bool requiresRegisterScavenging(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + bool hasReservedCallFrame(MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td new file mode 100644 index 0000000..3d2646e --- /dev/null +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -0,0 +1,196 @@ +//===- ARMRegisterInfo.td - ARM Register defs -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the ARM register file +//===----------------------------------------------------------------------===// + +// Registers are identified with 4-bit ID numbers. +class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> { + field bits<4> Num; + let Namespace = "ARM"; + let SubRegs = subregs; +} + +class ARMFReg<bits<5> num, string n> : Register<n> { + field bits<5> Num; + let Namespace = "ARM"; +} + +// Integer registers +def R0 : ARMReg< 0, "r0">, DwarfRegNum<0>; +def R1 : ARMReg< 1, "r1">, DwarfRegNum<1>; +def R2 : ARMReg< 2, "r2">, DwarfRegNum<2>; +def R3 : ARMReg< 3, "r3">, DwarfRegNum<3>; +def R4 : ARMReg< 4, "r4">, DwarfRegNum<4>; +def R5 : ARMReg< 5, "r5">, DwarfRegNum<5>; +def R6 : ARMReg< 6, "r6">, DwarfRegNum<6>; +def R7 : ARMReg< 7, "r7">, DwarfRegNum<7>; +def R8 : ARMReg< 8, "r8">, DwarfRegNum<8>; +def R9 : ARMReg< 9, "r9">, DwarfRegNum<9>; +def R10 : ARMReg<10, "r10">, DwarfRegNum<10>; +def R11 : ARMReg<11, "r11">, DwarfRegNum<11>; +def R12 : ARMReg<12, "r12">, DwarfRegNum<12>; +def SP : ARMReg<13, "sp">, DwarfRegNum<13>; +def LR : ARMReg<14, "lr">, DwarfRegNum<14>; +def PC : ARMReg<15, "pc">, DwarfRegNum<15>; + +// Float registers +def S0 : ARMFReg< 0, "s0">; def S1 : ARMFReg< 1, "s1">; +def S2 : ARMFReg< 2, "s2">; def S3 : ARMFReg< 3, "s3">; +def S4 : ARMFReg< 4, "s4">; def S5 : ARMFReg< 5, "s5">; +def S6 : ARMFReg< 6, "s6">; def S7 : ARMFReg< 7, "s7">; +def S8 : ARMFReg< 8, "s8">; def S9 : ARMFReg< 9, "s9">; +def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">; +def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">; +def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">; +def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">; +def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">; +def S20 : ARMFReg<20, 
"s20">; def S21 : ARMFReg<21, "s21">; +def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">; +def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">; +def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">; +def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">; +def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">; + +// Aliases of the F* registers used to hold 64-bit fp values (doubles) +def D0 : ARMReg< 0, "d0", [S0, S1]>; +def D1 : ARMReg< 1, "d1", [S2, S3]>; +def D2 : ARMReg< 2, "d2", [S4, S5]>; +def D3 : ARMReg< 3, "d3", [S6, S7]>; +def D4 : ARMReg< 4, "d4", [S8, S9]>; +def D5 : ARMReg< 5, "d5", [S10, S11]>; +def D6 : ARMReg< 6, "d6", [S12, S13]>; +def D7 : ARMReg< 7, "d7", [S14, S15]>; +def D8 : ARMReg< 8, "d8", [S16, S17]>; +def D9 : ARMReg< 9, "d9", [S18, S19]>; +def D10 : ARMReg<10, "d10", [S20, S21]>; +def D11 : ARMReg<11, "d11", [S22, S23]>; +def D12 : ARMReg<12, "d12", [S24, S25]>; +def D13 : ARMReg<13, "d13", [S26, S27]>; +def D14 : ARMReg<14, "d14", [S28, S29]>; +def D15 : ARMReg<15, "d15", [S30, S31]>; + +// Current Program Status Register. +def CPSR : ARMReg<0, "cpsr">; + +// Register classes. +// +// pc == Program Counter +// lr == Link Register +// sp == Stack Pointer +// r12 == ip (scratch) +// r7 == Frame Pointer (thumb-style backtraces) +// r11 == Frame Pointer (arm-style backtraces) +// r10 == Stack Limit +// +def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, + R7, R8, R9, R10, R12, R11, + LR, SP, PC]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + // FIXME: We are reserving r12 in case the PEI needs to use it to + // generate large stack offset. Make it available once we have register + // scavenging. Similarly r3 is reserved in Thumb mode for now. + let MethodBodies = [{ + // FP is R11, R9 is available. 
+ static const unsigned ARM_GPR_AO_1[] = { + ARM::R3, ARM::R2, ARM::R1, ARM::R0, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R9, ARM::R10, + ARM::R11 }; + // FP is R11, R9 is not available. + static const unsigned ARM_GPR_AO_2[] = { + ARM::R3, ARM::R2, ARM::R1, ARM::R0, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, ARM::R7, + ARM::R8, ARM::R10, + ARM::R11 }; + // FP is R7, R9 is available. + static const unsigned ARM_GPR_AO_3[] = { + ARM::R3, ARM::R2, ARM::R1, ARM::R0, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, + ARM::R8, ARM::R9, ARM::R10,ARM::R11, + ARM::R7 }; + // FP is R7, R9 is not available. + static const unsigned ARM_GPR_AO_4[] = { + ARM::R3, ARM::R2, ARM::R1, ARM::R0, + ARM::R12,ARM::LR, + ARM::R4, ARM::R5, ARM::R6, + ARM::R8, ARM::R10,ARM::R11, + ARM::R7 }; + + // FP is R7, only low registers available. + static const unsigned THUMB_GPR_AO[] = { + ARM::R2, ARM::R1, ARM::R0, + ARM::R4, ARM::R5, ARM::R6, ARM::R7 }; + + GPRClass::iterator + GPRClass::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); + if (Subtarget.isThumb()) + return THUMB_GPR_AO; + if (Subtarget.useThumbBacktraces()) { + if (Subtarget.isR9Reserved()) + return ARM_GPR_AO_4; + else + return ARM_GPR_AO_3; + } else { + if (Subtarget.isR9Reserved()) + return ARM_GPR_AO_2; + else + return ARM_GPR_AO_1; + } + } + + GPRClass::iterator + GPRClass::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>(); + GPRClass::iterator I; + if (Subtarget.isThumb()) + I = THUMB_GPR_AO + (sizeof(THUMB_GPR_AO)/sizeof(unsigned)); + else if (Subtarget.useThumbBacktraces()) { + if (Subtarget.isR9Reserved()) { + I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned)); + } else { + I = ARM_GPR_AO_3 + 
(sizeof(ARM_GPR_AO_3)/sizeof(unsigned)); + } + } else { + if (Subtarget.isR9Reserved()) { + I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned)); + } else { + I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned)); + } + } + + // Mac OS X requires FP not to be clobbered for backtracing purpose. + return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I; + } + }]; +} + +def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, + S23, S24, S25, S26, S27, S28, S29, S30, S31]>; + +// ARM requires only word alignment for double. It's more performant if it +// is double-word alignment though. +def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8, + D9, D10, D11, D12, D13, D14, D15]>; + +// Condition code registers. +def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>; diff --git a/lib/Target/ARM/ARMRelocations.h b/lib/Target/ARM/ARMRelocations.h new file mode 100644 index 0000000..beea52b --- /dev/null +++ b/lib/Target/ARM/ARMRelocations.h @@ -0,0 +1,28 @@ +//===- ARMRelocations.h - ARM Code Relocations ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the Raul Herbster and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the ARM target-specific relocation types. 
+// +//===----------------------------------------------------------------------===// + +#ifndef ARMRELOCATIONS_H +#define ARMRELOCATIONS_H + +#include "llvm/CodeGen/MachineRelocation.h" + +namespace llvm { + namespace ARM { + enum RelocationType { + + }; + } +} + +#endif + diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp new file mode 100644 index 0000000..6db36df --- /dev/null +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -0,0 +1,57 @@ +//===-- ARMSubtarget.cpp - ARM Subtarget Information ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Evan Cheng and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the ARM specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "ARMSubtarget.h" +#include "ARMGenSubtarget.inc" +#include "llvm/Module.h" +using namespace llvm; + +ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS, bool thumb) + : ARMArchVersion(V4T) + , HasVFP2(false) + , IsThumb(thumb) + , UseThumbBacktraces(false) + , IsR9Reserved(false) + , stackAlignment(4) + , TargetType(isELF) // Default to ELF unless otherwise specified. + , TargetABI(ARM_ABI_APCS) { + + // Determine default and user specified characteristics + std::string CPU = "generic"; + + // Parse features string. + ParseSubtargetFeatures(FS, CPU); + + // Set the boolean corresponding to the current target triple, or the default + // if one cannot be determined, to true. 
  // Infer target type / ABI from the module's target triple; fall back to the
  // host when no triple is present.
  const std::string& TT = M.getTargetTriple();
  if (TT.length() > 5) {
    if (TT.find("-darwin") != std::string::npos)
      TargetType = isDarwin;
  } else if (TT.empty()) {
#if defined(__APPLE__)
    TargetType = isDarwin;
#endif
  }

  if (TT.find("eabi") != std::string::npos)
    TargetABI = ARM_ABI_AAPCS;

  // AAPCS requires 8-byte stack alignment.
  if (isAAPCS_ABI())
    stackAlignment = 8;

  if (isTargetDarwin()) {
    UseThumbBacktraces = true;
    IsR9Reserved = true;
  }
}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 0000000..62367ca
--- /dev/null
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,94 @@
//=====---- ARMSubtarget.h - Define Subtarget for the ARM -----*- C++ -*--====//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the ARM specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef ARMSUBTARGET_H
#define ARMSUBTARGET_H

#include "llvm/Target/TargetSubtarget.h"
#include <string>

namespace llvm {
class Module;

class ARMSubtarget : public TargetSubtarget {
protected:
  enum ARMArchEnum {
    V4T, V5T, V5TE, V6
  };

  /// ARMArchVersion - ARM architecture version: V4T (base), V5T, V5TE,
  /// and V6.
  ARMArchEnum ARMArchVersion;

  /// HasVFP2 - True if the processor supports Vector Floating Point (VFP) V2
  /// instructions.
  bool HasVFP2;

  /// IsThumb - True if we are in thumb mode, false if in ARM mode.
  bool IsThumb;

  /// UseThumbBacktraces - True if we use thumb style backtraces.
  bool UseThumbBacktraces;

  /// IsR9Reserved - True if R9 is not available as a general purpose register.
+ bool IsR9Reserved; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + public: + enum { + isELF, isDarwin + } TargetType; + + enum { + ARM_ABI_APCS, + ARM_ABI_AAPCS // ARM EABI + } TargetABI; + + /// This constructor initializes the data members to match that + /// of the specified module. + /// + ARMSubtarget(const Module &M, const std::string &FS, bool thumb); + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + + bool hasV4TOps() const { return ARMArchVersion >= V4T; } + bool hasV5TOps() const { return ARMArchVersion >= V5T; } + bool hasV5TEOps() const { return ARMArchVersion >= V5TE; } + bool hasV6Ops() const { return ARMArchVersion >= V6; } + + bool hasVFP2() const { return HasVFP2; } + + bool isTargetDarwin() const { return TargetType == isDarwin; } + bool isTargetELF() const { return TargetType == isELF; } + + bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; } + bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; } + + bool isThumb() const { return IsThumb; } + + bool useThumbBacktraces() const { return UseThumbBacktraces; } + bool isR9Reserved() const { return IsR9Reserved; } + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. 
+ unsigned getStackAlignment() const { return stackAlignment; } +}; +} // End llvm namespace + +#endif // ARMSUBTARGET_H diff --git a/lib/Target/ARM/ARMTargetAsmInfo.cpp b/lib/Target/ARM/ARMTargetAsmInfo.cpp new file mode 100644 index 0000000..1dea1c1 --- /dev/null +++ b/lib/Target/ARM/ARMTargetAsmInfo.cpp @@ -0,0 +1,276 @@ + +//===-- ARMTargetAsmInfo.cpp - ARM asm properties ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the ARMTargetAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetAsmInfo.h" +#include "ARMTargetMachine.h" +#include <cstring> +#include <cctype> +using namespace llvm; + +static const char* arm_asm_table[] = {"{r0}", "r0", + "{r1}", "r1", + "{r2}", "r2", + "{r3}", "r3", + "{r4}", "r4", + "{r5}", "r5", + "{r6}", "r6", + "{r7}", "r7", + "{r8}", "r8", + "{r9}", "r9", + "{r10}", "r10", + "{r11}", "r11", + "{r12}", "r12", + "{r13}", "r13", + "{r14}", "r14", + "{lr}", "lr", + "{sp}", "sp", + "{ip}", "ip", + "{fp}", "fp", + "{sl}", "sl", + "{memory}", "memory", + "{cc}", "cc", + 0,0}; + +ARMTargetAsmInfo::ARMTargetAsmInfo(const ARMTargetMachine &TM) { + Subtarget = &TM.getSubtarget<ARMSubtarget>(); + AsmTransCBE = arm_asm_table; + if (Subtarget->isTargetDarwin()) { + GlobalPrefix = "_"; + PrivateGlobalPrefix = "L"; + BSSSection = 0; // no BSS section. 
+ ZeroFillDirective = "\t.zerofill\t"; // Uses .zerofill + SetDirective = "\t.set"; + WeakRefDirective = "\t.weak_reference\t"; + HiddenDirective = "\t.private_extern\t"; + ProtectedDirective = NULL; + JumpTableDataSection = ".const"; + CStringSection = "\t.cstring"; + FourByteConstantSection = "\t.literal4\n"; + EightByteConstantSection = "\t.literal8\n"; + ReadOnlySection = "\t.const\n"; + HasDotTypeDotSizeDirective = false; + if (TM.getRelocationModel() == Reloc::Static) { + StaticCtorsSection = ".constructor"; + StaticDtorsSection = ".destructor"; + } else { + StaticCtorsSection = ".mod_init_func"; + StaticDtorsSection = ".mod_term_func"; + } + + // In non-PIC modes, emit a special label before jump tables so that the + // linker can perform more accurate dead code stripping. + if (TM.getRelocationModel() != Reloc::PIC_) { + // Emit a local label that is preserved until the linker runs. + JumpTableSpecialLabelPrefix = "l"; + } + + NeedsSet = true; + DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug"; + DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug"; + DwarfLineSection = ".section __DWARF,__debug_line,regular,debug"; + DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug"; + DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug"; + DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug"; + DwarfStrSection = ".section __DWARF,__debug_str,regular,debug"; + DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug"; + DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug"; + DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug"; + DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug"; + } else { + NeedsSet = false; + HasLEB128 = true; + AbsoluteDebugSectionOffsets = true; + ReadOnlySection = "\t.section\t.rodata\n"; + PrivateGlobalPrefix = ".L"; + WeakRefDirective = "\t.weak\t"; + SetDirective = "\t.set\t"; + 
DwarfRequiresFrameSection = false; + DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",%progbits"; + DwarfInfoSection = "\t.section\t.debug_info,\"\",%progbits"; + DwarfLineSection = "\t.section\t.debug_line,\"\",%progbits"; + DwarfFrameSection = "\t.section\t.debug_frame,\"\",%progbits"; + DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",%progbits"; + DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",%progbits"; + DwarfStrSection = "\t.section\t.debug_str,\"\",%progbits"; + DwarfLocSection = "\t.section\t.debug_loc,\"\",%progbits"; + DwarfARangesSection = "\t.section\t.debug_aranges,\"\",%progbits"; + DwarfRangesSection = "\t.section\t.debug_ranges,\"\",%progbits"; + DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",%progbits"; + + if (Subtarget->isAAPCS_ABI()) { + StaticCtorsSection = "\t.section .init_array,\"aw\",%init_array"; + StaticDtorsSection = "\t.section .fini_array,\"aw\",%fini_array"; + } else { + StaticCtorsSection = "\t.section .ctors,\"aw\",%progbits"; + StaticDtorsSection = "\t.section .dtors,\"aw\",%progbits"; + } + TLSDataSection = "\t.section .tdata,\"awT\",%progbits"; + TLSBSSSection = "\t.section .tbss,\"awT\",%nobits"; + } + + ZeroDirective = "\t.space\t"; + AlignmentIsInBytes = false; + Data64bitsDirective = 0; + CommentString = "@"; + DataSection = "\t.data"; + ConstantPoolSection = "\t.text\n"; + COMMDirectiveTakesAlignment = false; + InlineAsmStart = "@ InlineAsm Start"; + InlineAsmEnd = "@ InlineAsm End"; + LCOMMDirective = "\t.lcomm\t"; +} + +/// Count the number of comma-separated arguments. +/// Do not try to detect errors. +unsigned ARMTargetAsmInfo::countArguments(const char* p) const { + unsigned count = 0; + while (*p && isspace(*p) && *p != '\n') + p++; + count++; + while (*p && *p!='\n' && + strncmp(p, CommentString, strlen(CommentString))!=0) { + if (*p==',') + count++; + p++; + } + return count; +} + +/// Count the length of a string enclosed in quote characters. +/// Do not try to detect errors. 
+unsigned ARMTargetAsmInfo::countString(const char* p) const {
+  unsigned count = 0;
+  while (*p && isspace(*p) && *p!='\n')
+    p++;
+  if (!*p || *p != '\"')
+    return count;
+  while (*++p && *p != '\"')
+    count++;
+  return count;
+}
+
+/// ARM-specific version of TargetAsmInfo::getInlineAsmLength.
+unsigned ARMTargetAsmInfo::getInlineAsmLength(const char *Str) const {
+  // Count the number of bytes in the asm.
+  bool atInsnStart = true;
+  bool inTextSection = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (atInsnStart) {
+      // Skip whitespace
+      while (*Str && isspace(*Str) && *Str != '\n')
+        Str++;
+      // Skip label
+      for (const char* p = Str; *p && !isspace(*p); p++)
+        if (*p == ':') {
+          Str = p+1;
+          while (*Str && isspace(*Str) && *Str != '\n')
+            Str++;
+          break;
+        }
+      // Ignore everything from comment char(s) to EOL
+      if (strncmp(Str, CommentString, strlen(CommentString))==0)
+        atInsnStart = false;
+      // FIXME do something like the following for non-Darwin
+      else if (*Str == '.' && Subtarget->isTargetDarwin()) {
+        // Directive.
+        atInsnStart = false;
+        // Some change the section, but don't generate code.
+ if (strncasecmp(Str, ".literal4", strlen(".literal4"))==0 || + strncasecmp(Str, ".literal8", strlen(".literal8"))==0 || + strncasecmp(Str, ".const", strlen(".const"))==0 || + strncasecmp(Str, ".constructor", strlen(".constructor"))==0 || + strncasecmp(Str, ".cstring", strlen(".cstring"))==0 || + strncasecmp(Str, ".data", strlen(".data"))==0 || + strncasecmp(Str, ".destructor", strlen(".destructor"))==0 || + strncasecmp(Str, ".fvmlib_init0", strlen(".fvmlib_init0"))==0 || + strncasecmp(Str, ".fvmlib_init1", strlen(".fvmlib_init1"))==0 || + strncasecmp(Str, ".mod_init_func", strlen(".mod_init_func"))==0 || + strncasecmp(Str, ".mod_term_func", strlen(".mod_term_func"))==0 || + strncasecmp(Str, ".picsymbol_stub", strlen(".picsymbol_stub"))==0 || + strncasecmp(Str, ".symbol_stub", strlen(".symbol_stub"))==0 || + strncasecmp(Str, ".static_data", strlen(".static_data"))==0 || + strncasecmp(Str, ".section", strlen(".section"))==0 || + strncasecmp(Str, ".lazy_symbol_pointer", strlen(".lazy_symbol_pointer"))==0 || + strncasecmp(Str, ".non_lazy_symbol_pointer", strlen(".non_lazy_symbol_pointer"))==0 || + strncasecmp(Str, ".dyld", strlen(".dyld"))==0 || + strncasecmp(Str, ".const_data", strlen(".const_data"))==0 || + strncasecmp(Str, ".objc", strlen(".objc"))==0 || //// many directives + strncasecmp(Str, ".static_const", strlen(".static_const"))==0) + inTextSection=false; + else if (strncasecmp(Str, ".text", strlen(".text"))==0) + inTextSection = true; + // Some can't really be handled without implementing significant pieces + // of an assembler. Others require dynamic adjustment of block sizes in + // AdjustBBOffsetsAfter; it's a big compile-time speed hit to check every + // instruction in there, and none of these are currently used in the kernel. 
+ else if (strncasecmp(Str, ".macro", strlen(".macro"))==0 || + strncasecmp(Str, ".if", strlen(".if"))==0 || + strncasecmp(Str, ".align", strlen(".align"))==0 || + strncasecmp(Str, ".fill", strlen(".fill"))==0 || + strncasecmp(Str, ".space", strlen(".space"))==0 || + strncasecmp(Str, ".zerofill", strlen(".zerofill"))==0 || + strncasecmp(Str, ".p2align", strlen(".p2align"))==0 || + strncasecmp(Str, ".p2alignw", strlen(".p2alignw"))==0 || + strncasecmp(Str, ".p2alignl", strlen(".p2alignl"))==0 || + strncasecmp(Str, ".align32", strlen(".p2align32"))==0 || + strncasecmp(Str, ".include", strlen(".include"))==0) + cerr << "Directive " << Str << " in asm may lead to invalid offsets for" << + " constant pools (the assembler will tell you if this happens).\n"; + // Some generate code, but this is only interesting in the text section. + else if (inTextSection) { + if (strncasecmp(Str, ".long", strlen(".long"))==0) + Length += 4*countArguments(Str+strlen(".long")); + else if (strncasecmp(Str, ".short", strlen(".short"))==0) + Length += 2*countArguments(Str+strlen(".short")); + else if (strncasecmp(Str, ".byte", strlen(".byte"))==0) + Length += 1*countArguments(Str+strlen(".byte")); + else if (strncasecmp(Str, ".single", strlen(".single"))==0) + Length += 4*countArguments(Str+strlen(".single")); + else if (strncasecmp(Str, ".double", strlen(".double"))==0) + Length += 8*countArguments(Str+strlen(".double")); + else if (strncasecmp(Str, ".quad", strlen(".quad"))==0) + Length += 16*countArguments(Str+strlen(".quad")); + else if (strncasecmp(Str, ".ascii", strlen(".ascii"))==0) + Length += countString(Str+strlen(".ascii")); + else if (strncasecmp(Str, ".asciz", strlen(".asciz"))==0) + Length += countString(Str+strlen(".asciz"))+1; + } + } else if (inTextSection) { + // An instruction + atInsnStart = false; + if (Subtarget->isThumb()) { + // BL and BLX <non-reg> are 4 bytes, all others 2. 
+ if (strncasecmp(Str, "blx", strlen("blx"))==0) { + const char* p = Str+3; + while (*p && isspace(*p)) + p++; + if (*p == 'r' || *p=='R') + Length += 2; // BLX reg + else + Length += 4; // BLX non-reg + } else if (strncasecmp(Str, "bl", strlen("bl"))==0) + Length += 4; // BL + else + Length += 2; // Thumb anything else + } + else + Length += 4; // ARM + } + } + if (*Str == '\n' || *Str == SeparatorChar) + atInsnStart = true; + } + return Length; +} diff --git a/lib/Target/ARM/ARMTargetAsmInfo.h b/lib/Target/ARM/ARMTargetAsmInfo.h new file mode 100644 index 0000000..9dd45e5 --- /dev/null +++ b/lib/Target/ARM/ARMTargetAsmInfo.h @@ -0,0 +1,38 @@ +//=====-- ARMTargetAsmInfo.h - ARM asm properties -------------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the ARMTargetAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMTARGETASMINFO_H +#define ARMTARGETASMINFO_H + +#include "llvm/Target/TargetAsmInfo.h" +#include "ARMSubtarget.h" + +namespace llvm { + + // Forward declaration. 
+ class ARMTargetMachine; + + struct ARMTargetAsmInfo : public TargetAsmInfo { + ARMTargetAsmInfo(const ARMTargetMachine &TM); + + const ARMSubtarget *Subtarget; + + virtual unsigned getInlineAsmLength(const char *Str) const; + unsigned countArguments(const char *p) const; + unsigned countString(const char *p) const; + }; + + +} // namespace llvm + +#endif diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp new file mode 100644 index 0000000..58b3ab9 --- /dev/null +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -0,0 +1,160 @@ +//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "ARMTargetMachine.h" +#include "ARMTargetAsmInfo.h" +#include "ARMFrameInfo.h" +#include "ARM.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachineRegistry.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static cl::opt<bool> DisableLdStOpti("disable-arm-loadstore-opti", cl::Hidden, + cl::desc("Disable load store optimization pass")); +static cl::opt<bool> EnableIfConversion("enable-arm-if-conversion", cl::Hidden, + cl::desc("Enable if-conversion pass")); + +namespace { + // Register the target. + RegisterTarget<ARMTargetMachine> X("arm", " ARM"); + RegisterTarget<ThumbTargetMachine> Y("thumb", " Thumb"); +} + +/// ThumbTargetMachine - Create an Thumb architecture model. 
+/// +unsigned ThumbTargetMachine::getJITMatchQuality() { +#if defined(__arm__) + return 10; +#endif + return 0; +} + +unsigned ThumbTargetMachine::getModuleMatchQuality(const Module &M) { + std::string TT = M.getTargetTriple(); + if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "thumb-") + return 20; + + // If the target triple is something non-thumb, we don't match. + if (!TT.empty()) return 0; + + if (M.getEndianness() == Module::LittleEndian && + M.getPointerSize() == Module::Pointer32) + return 10; // Weak match + else if (M.getEndianness() != Module::AnyEndianness || + M.getPointerSize() != Module::AnyPointerSize) + return 0; // Match for some other target + + return getJITMatchQuality()/2; +} + +ThumbTargetMachine::ThumbTargetMachine(const Module &M, const std::string &FS) + : ARMTargetMachine(M, FS, true) { +} + +/// TargetMachine ctor - Create an ARM architecture model. +/// +ARMTargetMachine::ARMTargetMachine(const Module &M, const std::string &FS, + bool isThumb) + : Subtarget(M, FS, isThumb), + DataLayout(Subtarget.isAPCS_ABI() ? + // APCS ABI + (isThumb ? + std::string("e-p:32:32-f64:32:32-i64:32:32-" + "i16:16:32-i8:8:32-i1:8:32-a:0:32") : + std::string("e-p:32:32-f64:32:32-i64:32:32")) : + // AAPCS ABI + (isThumb ? + std::string("e-p:32:32-f64:64:64-i64:64:64-" + "i16:16:32-i8:8:32-i1:8:32-a:0:32") : + std::string("e-p:32:32-f64:64:64-i64:64:64"))), + InstrInfo(Subtarget), + FrameInfo(Subtarget), + JITInfo(*this), + TLInfo(*this) {} + +unsigned ARMTargetMachine::getJITMatchQuality() { +#if defined(__thumb__) + return 10; +#endif + return 0; +} + +unsigned ARMTargetMachine::getModuleMatchQuality(const Module &M) { + std::string TT = M.getTargetTriple(); + if (TT.size() >= 4 && std::string(TT.begin(), TT.begin()+4) == "arm-") + return 20; + // If the target triple is something non-arm, we don't match. 
+ if (!TT.empty()) return 0; + + if (M.getEndianness() == Module::LittleEndian && + M.getPointerSize() == Module::Pointer32) + return 10; // Weak match + else if (M.getEndianness() != Module::AnyEndianness || + M.getPointerSize() != Module::AnyPointerSize) + return 0; // Match for some other target + + return getJITMatchQuality()/2; +} + + +const TargetAsmInfo *ARMTargetMachine::createTargetAsmInfo() const { + return new ARMTargetAsmInfo(*this); +} + + +// Pass Pipeline Configuration +bool ARMTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) { + PM.add(createARMISelDag(*this)); + return false; +} + +bool ARMTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) { + // FIXME: temporarily disabling load / store optimization pass for Thumb mode. + if (!Fast && !DisableLdStOpti && !Subtarget.isThumb()) + PM.add(createARMLoadStoreOptimizationPass()); + + if (!Fast && EnableIfConversion && !Subtarget.isThumb()) + PM.add(createIfConverterPass()); + + PM.add(createARMConstantIslandPass()); + return true; +} + +bool ARMTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out) { + // Output assembly language. + PM.add(createARMCodePrinterPass(Out, *this)); + return false; +} + + +bool ARMTargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE) { + // FIXME: Move this to TargetJITInfo! + setRelocationModel(Reloc::Static); + + // Machine code emitter pass for ARM. + PM.add(createARMCodeEmitterPass(*this, MCE)); + return false; +} + +bool ARMTargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE) { + // Machine code emitter pass for ARM. 
+ PM.add(createARMCodeEmitterPass(*this, MCE)); + return false; +} diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h new file mode 100644 index 0000000..183a582 --- /dev/null +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -0,0 +1,81 @@ +//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the "Instituto Nokia de Tecnologia" and +// is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the ARM specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMTARGETMACHINE_H +#define ARMTARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "ARMInstrInfo.h" +#include "ARMFrameInfo.h" +#include "ARMJITInfo.h" +#include "ARMSubtarget.h" +#include "ARMISelLowering.h" + +namespace llvm { + +class Module; + +class ARMTargetMachine : public LLVMTargetMachine { + ARMSubtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + ARMInstrInfo InstrInfo; + ARMFrameInfo FrameInfo; + ARMJITInfo JITInfo; + ARMTargetLowering TLInfo; + +public: + ARMTargetMachine(const Module &M, const std::string &FS, bool isThumb = false); + + virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; } + virtual TargetJITInfo *getJITInfo() { return &JITInfo; } + virtual const MRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } + virtual ARMTargetLowering 
*getTargetLowering() const { + return const_cast<ARMTargetLowering*>(&TLInfo); + } + static unsigned getModuleMatchQuality(const Module &M); + static unsigned getJITMatchQuality(); + + virtual const TargetAsmInfo *createTargetAsmInfo() const; + + // Pass Pipeline Configuration + virtual bool addInstSelector(FunctionPassManager &PM, bool Fast); + virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast); + virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out); + virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); + virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); +}; + +/// ThumbTargetMachine - Thumb target machine. +/// +class ThumbTargetMachine : public ARMTargetMachine { +public: + ThumbTargetMachine(const Module &M, const std::string &FS); + + static unsigned getJITMatchQuality(); + static unsigned getModuleMatchQuality(const Module &M); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile new file mode 100644 index 0000000..77300a1 --- /dev/null +++ b/lib/Target/ARM/Makefile @@ -0,0 +1,21 @@ +##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by the "Instituto Nokia de Tecnologia" and +# is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMARM +TARGET = ARM + +# Make sure that tblgen is run, first thing. 
+BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \ + ARMGenRegisterInfo.inc ARMGenInstrNames.inc \ + ARMGenInstrInfo.inc ARMGenAsmWriter.inc \ + ARMGenDAGISel.inc ARMGenSubtarget.inc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt new file mode 100644 index 0000000..380097d --- /dev/null +++ b/lib/Target/ARM/README-Thumb.txt @@ -0,0 +1,223 @@ +//===---------------------------------------------------------------------===// +// Random ideas for the ARM backend (Thumb specific). +//===---------------------------------------------------------------------===// + +* Add support for compiling functions in both ARM and Thumb mode, then taking + the smallest. +* Add support for compiling individual basic blocks in thumb mode, when in a + larger ARM function. This can be used for presumed cold code, like paths + to abort (failure path of asserts), EH handling code, etc. + +* Thumb doesn't have normal pre/post increment addressing modes, but you can + load/store 32-bit integers with pre/postinc by using load/store multiple + instrs with a single register. + +* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add + and cmp instructions can use high registers. Also, we can use them as + temporaries to spill values into. + +* In thumb mode, short, byte, and bool preferred alignments are currently set + to 4 to accommodate ISA restriction (i.e. add sp, #imm, imm must be multiple + of 4). + +//===---------------------------------------------------------------------===// + +Potential jumptable improvements: + +* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit + jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the + function is even smaller. This also applies to ARM. + +* Thumb jumptable codegen can improve given some help from the assembler. 
This + is what we generate right now: + + .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4)) +LPCRELL0: + mov r1, #PCRELV0 + add r1, pc + ldr r0, [r0, r1] + cpy pc, r0 + .align 2 +LJTI1_0_0: + .long LBB1_3 + ... + +Note there is another pc relative add that we can take advantage of. + add r1, pc, #imm_8 * 4 + +We should be able to generate: + +LPCRELL0: + add r1, LJTI1_0_0 + ldr r0, [r0, r1] + cpy pc, r0 + .align 2 +LJTI1_0_0: + .long LBB1_3 + +if the assembler can translate the add to: + add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc) + +Note the assembler also does something similar to constpool load: +LPCRELL0: + ldr r0, LCPI1_0 +=> + ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc) + + +//===---------------------------------------------------------------------===// + +We compiles the following: + +define i16 @func_entry_2E_ce(i32 %i) { + switch i32 %i, label %bb12.exitStub [ + i32 0, label %bb4.exitStub + i32 1, label %bb9.exitStub + i32 2, label %bb4.exitStub + i32 3, label %bb4.exitStub + i32 7, label %bb9.exitStub + i32 8, label %bb.exitStub + i32 9, label %bb9.exitStub + ] + +bb12.exitStub: + ret i16 0 + +bb4.exitStub: + ret i16 1 + +bb9.exitStub: + ret i16 2 + +bb.exitStub: + ret i16 3 +} + +into: + +_func_entry_2E_ce: + mov r2, #1 + lsl r2, r0 + cmp r0, #9 + bhi LBB1_4 @bb12.exitStub +LBB1_1: @newFuncRoot + mov r1, #13 + tst r2, r1 + bne LBB1_5 @bb4.exitStub +LBB1_2: @newFuncRoot + ldr r1, LCPI1_0 + tst r2, r1 + bne LBB1_6 @bb9.exitStub +LBB1_3: @newFuncRoot + mov r1, #1 + lsl r1, r1, #8 + tst r2, r1 + bne LBB1_7 @bb.exitStub +LBB1_4: @bb12.exitStub + mov r0, #0 + bx lr +LBB1_5: @bb4.exitStub + mov r0, #1 + bx lr +LBB1_6: @bb9.exitStub + mov r0, #2 + bx lr +LBB1_7: @bb.exitStub + mov r0, #3 + bx lr +LBB1_8: + .align 2 +LCPI1_0: + .long 642 + + +gcc compiles to: + + cmp r0, #9 + @ lr needed for prologue + bhi L2 + ldr r3, L11 + mov r2, #1 + mov r1, r2, asl r0 + ands r0, r3, r2, asl r0 + movne r0, #2 + bxne lr + tst r1, #13 + beq L9 +L3: + mov r0, r2 + bx lr +L9: + 
tst r1, #256 + movne r0, #3 + bxne lr +L2: + mov r0, #0 + bx lr +L12: + .align 2 +L11: + .long 642 + + +GCC is doing a couple of clever things here: + 1. It is predicating one of the returns. This isn't a clear win though: in + cases where that return isn't taken, it is replacing one condbranch with + two 'ne' predicated instructions. + 2. It is sinking the shift of "1 << i" into the tst, and using ands instead of + tst. This will probably require whole function isel. + 3. GCC emits: + tst r1, #256 + we emit: + mov r1, #1 + lsl r1, r1, #8 + tst r2, r1 + + +//===---------------------------------------------------------------------===// + +When spilling in thumb mode and the sp offset is too large to fit in the ldr / +str offset field, we load the offset from a constpool entry and add it to sp: + +ldr r2, LCPI +add r2, sp +ldr r2, [r2] + +These instructions preserve the condition code which is important if the spill +is between a cmp and a bcc instruction. However, we can use the (potentially) +cheaper sequnce if we know it's ok to clobber the condition register. + +add r2, sp, #255 * 4 +add r2, #132 +ldr r2, [r2, #7 * 4] + +This is especially bad when dynamic alloca is used. The all fixed size stack +objects are referenced off the frame pointer with negative offsets. See +oggenc for an example. + +//===---------------------------------------------------------------------===// + +We are reserving R3 as a scratch register under thumb mode. So if it is live in +to the function, we save / restore R3 to / from R12. Until register scavenging +is done, we should save R3 to a high callee saved reg at emitPrologue time +(when hasFP is true or stack size is large) and restore R3 from that register +instead. This allows us to at least get rid of the save to r12 everytime it is +used. 
+ +//===---------------------------------------------------------------------===// + +Poor codegen test/CodeGen/ARM/select.ll f7: + + ldr r5, LCPI1_0 +LPC0: + add r5, pc + ldr r6, LCPI1_1 + ldr r2, LCPI1_2 + cpy r3, r6 + cpy lr, pc + bx r5 + +//===---------------------------------------------------------------------===// + +Make register allocator / spiller smarter so we can re-materialize "mov r, imm", +etc. Almost all Thumb instructions clobber condition code. diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt new file mode 100644 index 0000000..3db8f54 --- /dev/null +++ b/lib/Target/ARM/README.txt @@ -0,0 +1,530 @@ +//===---------------------------------------------------------------------===// +// Random ideas for the ARM backend. +//===---------------------------------------------------------------------===// + +Reimplement 'select' in terms of 'SEL'. + +* We would really like to support UXTAB16, but we need to prove that the + add doesn't need to overflow between the two 16-bit chunks. + +* Implement pre/post increment support. (e.g. PR935) +* Coalesce stack slots! +* Implement smarter constant generation for binops with large immediates. + +* Consider materializing FP constants like 0.0f and 1.0f using integer + immediate instructions then copy to FPU. Slower than load into FPU? + +//===---------------------------------------------------------------------===// + +Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the +time regalloc happens, these values are now in a 32-bit register, usually with +the top-bits known to be sign or zero extended. If spilled, we should be able +to spill these to a 8-bit or 16-bit stack slot, zero or sign extending as part +of the reload. + +Doing this reduces the size of the stack frame (important for thumb etc), and +also increases the likelihood that we will be able to reload multiple values +from the stack with a single load. 
+ +//===---------------------------------------------------------------------===// + +The constant island pass is in good shape. Some cleanups might be desirable, +but there is unlikely to be much improvement in the generated code. + +1. There may be some advantage to trying to be smarter about the initial +placement, rather than putting everything at the end. + +2. There might be some compile-time efficiency to be had by representing +consecutive islands as a single block rather than multiple blocks. + +3. Use a priority queue to sort constant pool users in inverse order of + position so we always process the one closed to the end of functions + first. This may simply CreateNewWater. + +//===---------------------------------------------------------------------===// + +Eliminate copysign custom expansion. We are still generating crappy code with +default expansion + if-conversion. + +//===---------------------------------------------------------------------===// + +Eliminate one instruction from: + +define i32 @_Z6slow4bii(i32 %x, i32 %y) { + %tmp = icmp sgt i32 %x, %y + %retval = select i1 %tmp, i32 %x, i32 %y + ret i32 %retval +} + +__Z6slow4bii: + cmp r0, r1 + movgt r1, r0 + mov r0, r1 + bx lr +=> + +__Z6slow4bii: + cmp r0, r1 + movle r0, r1 + bx lr + +//===---------------------------------------------------------------------===// + +Implement long long "X-3" with instructions that fold the immediate in. These +were disabled due to badness with the ARM carry flag on subtracts. + +//===---------------------------------------------------------------------===// + +We currently compile abs: +int foo(int p) { return p < 0 ? -p : p; } + +into: + +_foo: + rsb r1, r0, #0 + cmn r0, #1 + movgt r1, r0 + mov r0, r1 + bx lr + +This is very, uh, literal. This could be a 3 operation sequence: + t = (p sra 31); + res = (p xor t)-t + +Which would be better. This occurs in png decode. 
+ +//===---------------------------------------------------------------------===// + +More load / store optimizations: +1) Look past instructions without side-effects (not load, store, branch, etc.) + when forming the list of loads / stores to optimize. + +2) Smarter register allocation? +We are probably missing some opportunities to use ldm / stm. Consider: + +ldr r5, [r0] +ldr r4, [r0, #4] + +This cannot be merged into a ldm. Perhaps we will need to do the transformation +before register allocation. Then teach the register allocator to allocate a +chunk of consecutive registers. + +3) Better representation for block transfer? This is from Olden/power: + + fldd d0, [r4] + fstd d0, [r4, #+32] + fldd d0, [r4, #+8] + fstd d0, [r4, #+40] + fldd d0, [r4, #+16] + fstd d0, [r4, #+48] + fldd d0, [r4, #+24] + fstd d0, [r4, #+56] + +If we can spare the registers, it would be better to use fldm and fstm here. +Need major register allocator enhancement though. + +4) Can we recognize the relative position of constantpool entries? i.e. Treat + + ldr r0, LCPI17_3 + ldr r1, LCPI17_4 + ldr r2, LCPI17_5 + + as + ldr r0, LCPI17 + ldr r1, LCPI17+4 + ldr r2, LCPI17+8 + + Then the ldr's can be combined into a single ldm. See Olden/power. + +Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a +double 64-bit FP constant: + + adr r0, L6 + ldmia r0, {r0-r1} + + .align 2 +L6: + .long -858993459 + .long 1074318540 + +5) Can we make use of ldrd and strd? Instead of generating ldm / stm, use +ldrd/strd instead if there are only two destination registers that form an +odd/even pair. However, we probably would pay a penalty if the address is not +aligned on 8-byte boundary. This requires more information on load / store +nodes (and MI's?) then we currently carry. 
+ +6) struct copies appear to be done field by field +instead of by words, at least sometimes: + +struct foo { int x; short s; char c1; char c2; }; +void cpy(struct foo*a, struct foo*b) { *a = *b; } + +llvm code (-O2) + ldrb r3, [r1, #+6] + ldr r2, [r1] + ldrb r12, [r1, #+7] + ldrh r1, [r1, #+4] + str r2, [r0] + strh r1, [r0, #+4] + strb r3, [r0, #+6] + strb r12, [r0, #+7] +gcc code (-O2) + ldmia r1, {r1-r2} + stmia r0, {r1-r2} + +In this benchmark poor handling of aggregate copies has shown up as +having a large effect on size, and possibly speed as well (we don't have +a good way to measure on ARM). + +//===---------------------------------------------------------------------===// + +* Consider this silly example: + +double bar(double x) { + double r = foo(3.1); + return x+r; +} + +_bar: + sub sp, sp, #16 + str r4, [sp, #+12] + str r5, [sp, #+8] + str lr, [sp, #+4] + mov r4, r0 + mov r5, r1 + ldr r0, LCPI2_0 + bl _foo + fmsr f0, r0 + fcvtsd d0, f0 + fmdrr d1, r4, r5 + faddd d0, d0, d1 + fmrrd r0, r1, d0 + ldr lr, [sp, #+4] + ldr r5, [sp, #+8] + ldr r4, [sp, #+12] + add sp, sp, #16 + bx lr + +Ignore the prologue and epilogue stuff for a second. Note + mov r4, r0 + mov r5, r1 +the copys to callee-save registers and the fact they are only being used by the +fmdrr instruction. It would have been better had the fmdrr been scheduled +before the call and place the result in a callee-save DPR register. The two +mov ops would not have been necessary. + +//===---------------------------------------------------------------------===// + +Calling convention related stuff: + +* gcc's parameter passing implementation is terrible and we suffer as a result: + +e.g. +struct s { + double d1; + int s1; +}; + +void foo(struct s S) { + printf("%g, %d\n", S.d1, S.s1); +} + +'S' is passed via registers r0, r1, r2. 
But gcc stores them to the stack, and +then reload them to r1, r2, and r3 before issuing the call (r0 contains the +address of the format string): + + stmfd sp!, {r7, lr} + add r7, sp, #0 + sub sp, sp, #12 + stmia sp, {r0, r1, r2} + ldmia sp, {r1-r2} + ldr r0, L5 + ldr r3, [sp, #8] +L2: + add r0, pc, r0 + bl L_printf$stub + +Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves? + +* Return an aggregate type is even worse: + +e.g. +struct s foo(void) { + struct s S = {1.1, 2}; + return S; +} + + mov ip, r0 + ldr r0, L5 + sub sp, sp, #12 +L2: + add r0, pc, r0 + @ lr needed for prologue + ldmia r0, {r0, r1, r2} + stmia sp, {r0, r1, r2} + stmia ip, {r0, r1, r2} + mov r0, ip + add sp, sp, #12 + bx lr + +r0 (and later ip) is the hidden parameter from caller to store the value in. The +first ldmia loads the constants into r0, r1, r2. The last stmia stores r0, r1, +r2 into the address passed in. However, there is one additional stmia that +stores r0, r1, and r2 to some stack location. The store is dead. + +The llvm-gcc generated code looks like this: + +csretcc void %foo(%struct.s* %agg.result) { +entry: + %S = alloca %struct.s, align 4 ; <%struct.s*> [#uses=1] + %memtmp = alloca %struct.s ; <%struct.s*> [#uses=1] + cast %struct.s* %S to sbyte* ; <sbyte*>:0 [#uses=2] + call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 ) + cast %struct.s* %agg.result to sbyte* ; <sbyte*>:1 [#uses=2] + call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 ) + cast %struct.s* %memtmp to sbyte* ; <sbyte*>:2 [#uses=1] + call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 ) + ret void +} + +llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from +constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated +into a number of load and stores, or 2) custom lower memcpy (of small size) to +be ldmia / stmia. 
I think option 2 is better but the current register
+allocator cannot allocate a chunk of registers at a time.
+
+A feasible temporary solution is to use specific physical registers at the
+lowering time for small (<= 4 words?) transfer size.
+
+* ARM CSRet calling convention requires the hidden argument to be returned by
+the callee.
+
+//===---------------------------------------------------------------------===//
+
+We can definitely do a better job on BB placements to eliminate some branches.
+It's very common to see llvm generated assembly code that looks like this:
+
+LBB3:
+ ...
+LBB4:
+...
+ beq LBB3
+ b LBB2
+
+If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
+then eliminate beq and turn the unconditional branch to LBB2 to a bne.
+
+See McCat/18-imp/ComputeBoundingBoxes for an example.
+
+//===---------------------------------------------------------------------===//
+
+Register scavenging is now implemented. The example in the previous version
+of this document produces optimal code at -O2.
+
+//===---------------------------------------------------------------------===//
+
+Pre-/post- indexed load / stores:
+
+1) We should not make the pre/post- indexed load/store transform if the base ptr
+is guaranteed to be live beyond the load/store. This can happen if the base
+ptr is live out of the block we are performing the optimization. e.g.
+
+mov r1, r2
+ldr r3, [r1], #4
+...
+
+vs.
+
+ldr r3, [r2]
+add r1, r2, #4
+...
+
+In most cases, this is just a wasted optimization. However, sometimes it can
+negatively impact the performance because two-address code is more restrictive
+when it comes to scheduling.
+
+Unfortunately, liveout information is currently unavailable during DAG combine
+time.
+
+2) Consider splitting an indexed load / store into a pair of add/sub + load/store
+ to solve #1 (in TwoAddressInstructionPass.cpp).
+
+3) Enhance LSR to generate more opportunities for indexed ops.
+ +4) Once we added support for multiple result patterns, write indexed loads + patterns instead of C++ instruction selection code. + +5) Use FLDM / FSTM to emulate indexed FP load / store. + +//===---------------------------------------------------------------------===// + +We should add i64 support to take advantage of the 64-bit load / stores. +We can add a pseudo i64 register class containing pseudo registers that are +register pairs. All other ops (e.g. add, sub) would be expanded as usual. + +We need to add pseudo instructions (i.e. gethi / getlo) to extract i32 registers +from the i64 register. These are single moves which can be eliminated if the +destination register is a sub-register of the source. We should implement proper +subreg support in the register allocator to coalesce these away. + +There are other minor issues such as multiple instructions for a spill / restore +/ move. + +//===---------------------------------------------------------------------===// + +Implement support for some more tricky ways to materialize immediates. For +example, to get 0xffff8000, we can use: + +mov r9, #&3f8000 +sub r9, r9, #&400000 + +//===---------------------------------------------------------------------===// + +We sometimes generate multiple add / sub instructions to update sp in prologue +and epilogue if the inc / dec value is too large to fit in a single immediate +operand. In some cases, perhaps it might be better to load the value from a +constantpool instead. + +//===---------------------------------------------------------------------===// + +GCC generates significantly better code for this function. + +int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) { + int i = 0; + + if (StackPtr != 0) { + while (StackPtr != 0 && i < (((LineLen) < (32768))? 
(LineLen) : (32768))) + Line[i++] = Stack[--StackPtr]; + if (LineLen > 32768) + { + while (StackPtr != 0 && i < LineLen) + { + i++; + --StackPtr; + } + } + } + return StackPtr; +} + +//===---------------------------------------------------------------------===// + +This should compile to the mlas instruction: +int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; } + +//===---------------------------------------------------------------------===// + +At some point, we should triage these to see if they still apply to us: + +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016 + +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982 + +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702 +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663 + +http://www.inf.u-szeged.hu/gcc-arm/ +http://citeseer.ist.psu.edu/debus04linktime.html + +//===---------------------------------------------------------------------===// + +gcc generates smaller code for this function at -O2 or -Os: + +void foo(signed char* p) { + if (*p == 3) + bar(); + else if (*p == 4) + baz(); + else if (*p == 5) + quux(); +} + +llvm decides it's a good idea to turn the repeated if...else into a +binary tree, as if it were a switch; the resulting code requires -1 +compare-and-branches when *p<=2 or *p==5, the same number if *p==4 +or *p>6, and +1 if *p==3. 
So it should be a speed win
+(on balance). However, the revised code is larger, with 4 conditional
+branches instead of 3.
+
+More seriously, there is a byte->word extend before
+each comparison, where there should be only one, and the condition codes
+are not remembered when the same two values are compared twice.
+
+//===---------------------------------------------------------------------===//
+
+More register scavenging work:
+
+1. Use the register scavenger to track frame index materialized into registers
+ (those that do not fit in addressing modes) to allow reuse in the same BB.
+2. Finish scavenging for Thumb.
+3. We know some spills and restores are unnecessary. The issue is once live
+ intervals are merged, they are never split. So every def is spilled
+ and every use requires a restore if the register allocator decides the
+ resulting live interval is not assigned a physical register. It may be
+ possible (with the help of the scavenger) to turn some spill / restore
+ pairs into register copies.
+
+//===---------------------------------------------------------------------===//
+
+More LSR enhancements possible:
+
+1. Teach LSR about pre- and post- indexed ops to allow the iv increment to be
+ merged in a load / store.
+2. Allow iv reuse even when a type conversion is required. For example, i8
+ and i32 load / store addressing modes are identical.
+
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+int foo(int a, int b, int c, int d) {
+ long long acc = (long long)a * (long long)b;
+ acc += (long long)c * (long long)d;
+ return (int)(acc >> 32);
+}
+
+Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
+two signed 32-bit values to produce a 64-bit value, and accumulates this with
+a 64-bit value.
+ +We currently get this with v6: + +_foo: + mul r12, r1, r0 + smmul r1, r1, r0 + smmul r0, r3, r2 + mul r3, r3, r2 + adds r3, r3, r12 + adc r0, r0, r1 + bx lr + +and this with v4: + +_foo: + stmfd sp!, {r7, lr} + mov r7, sp + mul r12, r1, r0 + smull r0, r1, r1, r0 + smull lr, r0, r3, r2 + mul r3, r3, r2 + adds r3, r3, r12 + adc r0, r0, r1 + ldmfd sp!, {r7, pc} + +This apparently occurs in real code. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/Alpha/Alpha.h b/lib/Target/Alpha/Alpha.h new file mode 100644 index 0000000..a1acde4 --- /dev/null +++ b/lib/Target/Alpha/Alpha.h @@ -0,0 +1,48 @@ +//===-- Alpha.h - Top-level interface for Alpha representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// Alpha back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_ALPHA_H +#define TARGET_ALPHA_H + +#include <iosfwd> + +namespace llvm { + + class AlphaTargetMachine; + class FunctionPass; + class TargetMachine; + class MachineCodeEmitter; + + FunctionPass *createAlphaSimpleInstructionSelector(TargetMachine &TM); + FunctionPass *createAlphaISelDag(TargetMachine &TM); + FunctionPass *createAlphaCodePrinterPass(std::ostream &OS, + TargetMachine &TM); + FunctionPass *createAlphaPatternInstructionSelector(TargetMachine &TM); + FunctionPass *createAlphaCodeEmitterPass(AlphaTargetMachine &TM, + MachineCodeEmitter &MCE); + FunctionPass *createAlphaLLRPPass(AlphaTargetMachine &tm); + FunctionPass *createAlphaBranchSelectionPass(); + +} // end namespace llvm; + +// Defines symbolic names for Alpha registers. 
This defines a mapping from +// register name to register number. +// +#include "AlphaGenRegisterNames.inc" + +// Defines symbolic names for the Alpha instructions. +// +#include "AlphaGenInstrNames.inc" + +#endif diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td new file mode 100644 index 0000000..fbf7ed9 --- /dev/null +++ b/lib/Target/Alpha/Alpha.td @@ -0,0 +1,66 @@ +//===- Alpha.td - Describe the Alpha Target Machine --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... +// +include "../Target.td" + +//Alpha is little endian + +//===----------------------------------------------------------------------===// +// Subtarget Features +//===----------------------------------------------------------------------===// + +def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true", + "Enable CIX extentions">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AlphaRegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Schedule Description +//===----------------------------------------------------------------------===// + +include "AlphaSchedule.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "AlphaInstrInfo.td" + +def AlphaInstrInfo : InstrInfo { + // Define how we want to 
layout our target-specific information field. + // let TSFlagsFields = []; + // let TSFlagsShifts = []; +} + +//===----------------------------------------------------------------------===// +// Alpha Processor Definitions +//===----------------------------------------------------------------------===// + +def : Processor<"generic", Alpha21264Itineraries, []>; +def : Processor<"ev6" , Alpha21264Itineraries, []>; +def : Processor<"ev67" , Alpha21264Itineraries, [FeatureCIX]>; + +//===----------------------------------------------------------------------===// +// The Alpha Target +//===----------------------------------------------------------------------===// + + +def Alpha : Target { + // Pull in Instruction Info: + let InstructionSet = AlphaInstrInfo; +} diff --git a/lib/Target/Alpha/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AlphaAsmPrinter.cpp new file mode 100644 index 0000000..0494777 --- /dev/null +++ b/lib/Target/Alpha/AlphaAsmPrinter.cpp @@ -0,0 +1,297 @@ +//===-- AlphaAsmPrinter.cpp - Alpha LLVM assembly writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format Alpha assembly language. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "Alpha.h" +#include "AlphaInstrInfo.h" +#include "AlphaTargetMachine.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Mangler.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +namespace { + struct VISIBILITY_HIDDEN AlphaAsmPrinter : public AsmPrinter { + + /// Unique incrementer for label values for referencing Global values. + /// + + AlphaAsmPrinter(std::ostream &o, TargetMachine &tm, const TargetAsmInfo *T) + : AsmPrinter(o, tm, T) { + } + + virtual const char *getPassName() const { + return "Alpha Assembly Printer"; + } + bool printInstruction(const MachineInstr *MI); + void printOp(const MachineOperand &MO, bool IsCallOp = false); + void printOperand(const MachineInstr *MI, int opNum); + void printBaseOffsetPair (const MachineInstr *MI, int i, bool brackets=true); + bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + bool PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode); + }; +} // end of anonymous namespace + +/// createAlphaCodePrinterPass - Returns a pass that prints the Alpha +/// assembly code for a MachineFunction to the given output stream, +/// using the given target machine description. This should work +/// regardless of whether the function is in SSA form. 
+/// +FunctionPass *llvm::createAlphaCodePrinterPass(std::ostream &o, + TargetMachine &tm) { + return new AlphaAsmPrinter(o, tm, tm.getTargetAsmInfo()); +} + +#include "AlphaGenAsmWriter.inc" + +void AlphaAsmPrinter::printOperand(const MachineInstr *MI, int opNum) +{ + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.getType() == MachineOperand::MO_Register) { + assert(MRegisterInfo::isPhysicalRegister(MO.getReg())&&"Not physreg??"); + O << TM.getRegisterInfo()->get(MO.getReg()).Name; + } else if (MO.isImmediate()) { + O << MO.getImmedValue(); + assert(MO.getImmedValue() < (1 << 30)); + } else { + printOp(MO); + } +} + + +void AlphaAsmPrinter::printOp(const MachineOperand &MO, bool IsCallOp) { + const MRegisterInfo &RI = *TM.getRegisterInfo(); + + switch (MO.getType()) { + case MachineOperand::MO_Register: + O << RI.get(MO.getReg()).Name; + return; + + case MachineOperand::MO_Immediate: + cerr << "printOp() does not handle immediate values\n"; + abort(); + return; + + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + + case MachineOperand::MO_ConstantPoolIndex: + O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" + << MO.getConstantPoolIndex(); + return; + + case MachineOperand::MO_ExternalSymbol: + O << MO.getSymbolName(); + return; + + case MachineOperand::MO_GlobalAddress: { + GlobalValue *GV = MO.getGlobal(); + O << Mang->getValueName(GV); + if (GV->isDeclaration() && GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + return; + } + + case MachineOperand::MO_JumpTableIndex: + O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << MO.getJumpTableIndex(); + return; + + default: + O << "<unknown operand type: " << MO.getType() << ">"; + return; + } +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. 
+/// +bool AlphaAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SetupMachineFunction(MF); + O << "\n\n"; + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + // Print out jump tables referenced by the function + EmitJumpTableInfo(MF.getJumpTableInfo(), MF); + + // Print out labels for the function. + const Function *F = MF.getFunction(); + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + + EmitAlignment(4, F); + switch (F->getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::InternalLinkage: // Symbols default to internal. + break; + case Function::ExternalLinkage: + O << "\t.globl " << CurrentFnName << "\n"; + break; + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + O << TAI->getWeakRefDirective() << CurrentFnName << "\n"; + break; + } + + O << "\t.ent " << CurrentFnName << "\n"; + + O << CurrentFnName << ":\n"; + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + if (I != MF.begin()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + ++EmittedInsts; + O << "\t"; + if (!printInstruction(II)) { + assert(0 && "Unhandled instruction in asm writer!"); + abort(); + } + } + } + + O << "\t.end " << CurrentFnName << "\n"; + + // We didn't modify anything. 
+ return false; +} + +bool AlphaAsmPrinter::doInitialization(Module &M) +{ + if(TM.getSubtarget<AlphaSubtarget>().hasCT()) + O << "\t.arch ev6\n"; //This might need to be ev67, so leave this test here + else + O << "\t.arch ev6\n"; + O << "\t.set noat\n"; + AsmPrinter::doInitialization(M); + return false; +} + +bool AlphaAsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); I != E; ++I) { + + if (!I->hasInitializer()) continue; // External global require no code + + // Check to see if this is a special global used by LLVM, if so, emit it. + if (EmitSpecialLLVMGlobal(I)) + continue; + + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + unsigned Size = TD->getTypeSize(C->getType()); + unsigned Align = TD->getPreferredAlignmentLog(I); + + //1: hidden? + if (I->hasHiddenVisibility()) + O << TAI->getHiddenDirective() << name << "\n"; + + //2: kind + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + O << TAI->getWeakRefDirective() << name << '\n'; + break; + case GlobalValue::AppendingLinkage: + case GlobalValue::ExternalLinkage: + O << "\t.globl " << name << "\n"; + break; + case GlobalValue::InternalLinkage: + break; + default: + assert(0 && "Unknown linkage type!"); + cerr << "Unknown linkage type!\n"; + abort(); + } + + //3: Section (if changed) + if (I->hasSection() && + (I->getSection() == ".ctors" || + I->getSection() == ".dtors")) { + std::string SectionName = ".section\t" + I->getSection() + + ",\"aw\",@progbits"; + SwitchToDataSection(SectionName.c_str()); + } else { + if (C->isNullValue()) + SwitchToDataSection("\t.section\t.bss", I); + else + SwitchToDataSection("\t.section\t.data", I); + } + + //4: Type, Size, Align + O << "\t.type\t" << name << ", @object\n"; + O << "\t.size\t" << name << ", " << Size << "\n"; + EmitAlignment(Align, I); + + O << name << ":\n"; + + // 
If the initializer is a extern weak symbol, remember to emit the weak + // reference! + if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + + EmitGlobalConstant(C); + O << '\n'; + } + + AsmPrinter::doFinalization(M); + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool AlphaAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + printOperand(MI, OpNo); + return false; +} + +bool AlphaAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + O << "0("; + printOperand(MI, OpNo); + O << ")"; + return false; +} diff --git a/lib/Target/Alpha/AlphaBranchSelector.cpp b/lib/Target/Alpha/AlphaBranchSelector.cpp new file mode 100644 index 0000000..ac789b3 --- /dev/null +++ b/lib/Target/Alpha/AlphaBranchSelector.cpp @@ -0,0 +1,67 @@ +//===-- AlphaBranchSelector.cpp - Convert Pseudo branchs ----------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Andrew Lenharth and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Replace Pseudo COND_BRANCH_* with their appropriate real branch +// Simplified version of the PPC Branch Selector +// +//===----------------------------------------------------------------------===// + +#include "Alpha.h" +#include "AlphaInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetAsmInfo.h" +using namespace llvm; + +namespace { + struct VISIBILITY_HIDDEN AlphaBSel : public MachineFunctionPass { + static char ID; + AlphaBSel() : MachineFunctionPass((intptr_t)&ID) {} + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "Alpha Branch Selection"; + } + }; + char AlphaBSel::ID = 0; +} + +/// createAlphaBranchSelectionPass - returns an instance of the Branch Selection +/// Pass +/// +FunctionPass *llvm::createAlphaBranchSelectionPass() { + return new AlphaBSel(); +} + +bool AlphaBSel::runOnMachineFunction(MachineFunction &Fn) { + + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock *MBB = MFI; + + for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); + MBBI != EE; ++MBBI) { + if (MBBI->getOpcode() == Alpha::COND_BRANCH_I || + MBBI->getOpcode() == Alpha::COND_BRANCH_F) { + + // condbranch operands: + // 0. bc opcode + // 1. reg + // 2. 
target MBB + const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + MBBI->setInstrDescriptor(TII->get(MBBI->getOperand(0).getImm())); + } + } + } + + return true; +} + diff --git a/lib/Target/Alpha/AlphaCodeEmitter.cpp b/lib/Target/Alpha/AlphaCodeEmitter.cpp new file mode 100644 index 0000000..3549551 --- /dev/null +++ b/lib/Target/Alpha/AlphaCodeEmitter.cpp @@ -0,0 +1,222 @@ +//===-- Alpha/AlphaCodeEmitter.cpp - Convert Alpha code to machine code ---===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the pass that transforms the Alpha machine instructions +// into relocatable machine code. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "alpha-emitter" +#include "AlphaTargetMachine.h" +#include "AlphaRelocations.h" +#include "Alpha.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/MachineCodeEmitter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Function.h" +#include "llvm/Support/Debug.h" +using namespace llvm; + +namespace { + class AlphaCodeEmitter : public MachineFunctionPass { + const AlphaInstrInfo *II; + TargetMachine &TM; + MachineCodeEmitter &MCE; + + /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr + /// + int getMachineOpValue(MachineInstr &MI, MachineOperand &MO); + + public: + static char ID; + explicit AlphaCodeEmitter(TargetMachine &tm, MachineCodeEmitter &mce) + : MachineFunctionPass((intptr_t)&ID), II(0), TM(tm), MCE(mce) {} + AlphaCodeEmitter(TargetMachine &tm, MachineCodeEmitter &mce, + const AlphaInstrInfo& ii) + : MachineFunctionPass((intptr_t)&ID), II(&ii), TM(tm), MCE(mce) {} + + 
bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "Alpha Machine Code Emitter"; + } + + void emitInstruction(const MachineInstr &MI); + + /// getBinaryCodeForInstr - This function, generated by the + /// CodeEmitterGenerator using TableGen, produces the binary encoding for + /// machine instructions. + /// + unsigned getBinaryCodeForInstr(MachineInstr &MI); + + private: + void emitBasicBlock(MachineBasicBlock &MBB); + + }; + char AlphaCodeEmitter::ID = 0; +} + +/// createAlphaCodeEmitterPass - Return a pass that emits the collected Alpha code +/// to the specified MCE object. +FunctionPass *llvm::createAlphaCodeEmitterPass(AlphaTargetMachine &TM, + MachineCodeEmitter &MCE) { + return new AlphaCodeEmitter(TM, MCE); +} + +bool AlphaCodeEmitter::runOnMachineFunction(MachineFunction &MF) { + II = ((AlphaTargetMachine&)MF.getTarget()).getInstrInfo(); + + do { + MCE.startFunction(MF); + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + emitBasicBlock(*I); + } while (MCE.finishFunction(MF)); + + return false; +} + +void AlphaCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) { + MCE.StartMachineBasicBlock(&MBB); + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + switch(MI.getOpcode()) { + default: + MCE.emitWordLE(getBinaryCodeForInstr(*I)); + break; + case Alpha::ALTENT: + case Alpha::PCLABEL: + case Alpha::MEMLABEL: + case Alpha::IDEF_I: + case Alpha::IDEF_F32: + case Alpha::IDEF_F64: + break; //skip these + } + } +} + +static unsigned getAlphaRegNumber(unsigned Reg) { + switch (Reg) { + case Alpha::R0 : case Alpha::F0 : return 0; + case Alpha::R1 : case Alpha::F1 : return 1; + case Alpha::R2 : case Alpha::F2 : return 2; + case Alpha::R3 : case Alpha::F3 : return 3; + case Alpha::R4 : case Alpha::F4 : return 4; + case Alpha::R5 : case Alpha::F5 : return 5; + case Alpha::R6 : case Alpha::F6 : return 6; + case Alpha::R7 : case 
Alpha::F7 : return 7; + case Alpha::R8 : case Alpha::F8 : return 8; + case Alpha::R9 : case Alpha::F9 : return 9; + case Alpha::R10 : case Alpha::F10 : return 10; + case Alpha::R11 : case Alpha::F11 : return 11; + case Alpha::R12 : case Alpha::F12 : return 12; + case Alpha::R13 : case Alpha::F13 : return 13; + case Alpha::R14 : case Alpha::F14 : return 14; + case Alpha::R15 : case Alpha::F15 : return 15; + case Alpha::R16 : case Alpha::F16 : return 16; + case Alpha::R17 : case Alpha::F17 : return 17; + case Alpha::R18 : case Alpha::F18 : return 18; + case Alpha::R19 : case Alpha::F19 : return 19; + case Alpha::R20 : case Alpha::F20 : return 20; + case Alpha::R21 : case Alpha::F21 : return 21; + case Alpha::R22 : case Alpha::F22 : return 22; + case Alpha::R23 : case Alpha::F23 : return 23; + case Alpha::R24 : case Alpha::F24 : return 24; + case Alpha::R25 : case Alpha::F25 : return 25; + case Alpha::R26 : case Alpha::F26 : return 26; + case Alpha::R27 : case Alpha::F27 : return 27; + case Alpha::R28 : case Alpha::F28 : return 28; + case Alpha::R29 : case Alpha::F29 : return 29; + case Alpha::R30 : case Alpha::F30 : return 30; + case Alpha::R31 : case Alpha::F31 : return 31; + default: + assert(0 && "Unhandled reg"); + abort(); + } +} + +int AlphaCodeEmitter::getMachineOpValue(MachineInstr &MI, MachineOperand &MO) { + + int rv = 0; // Return value; defaults to 0 for unhandled cases + // or things that get fixed up later by the JIT. 
+ + if (MO.isRegister()) { + rv = getAlphaRegNumber(MO.getReg()); + } else if (MO.isImmediate()) { + rv = MO.getImmedValue(); + } else if (MO.isGlobalAddress() || MO.isExternalSymbol() + || MO.isConstantPoolIndex()) { + DOUT << MO << " is a relocated op for " << MI << "\n"; + unsigned Reloc = 0; + int Offset = 0; + bool useGOT = false; + switch (MI.getOpcode()) { + case Alpha::BSR: + Reloc = Alpha::reloc_bsr; + break; + case Alpha::LDLr: + case Alpha::LDQr: + case Alpha::LDBUr: + case Alpha::LDWUr: + case Alpha::LDSr: + case Alpha::LDTr: + case Alpha::LDAr: + case Alpha::STQr: + case Alpha::STLr: + case Alpha::STWr: + case Alpha::STBr: + case Alpha::STSr: + case Alpha::STTr: + Reloc = Alpha::reloc_gprellow; + break; + case Alpha::LDAHr: + Reloc = Alpha::reloc_gprelhigh; + break; + case Alpha::LDQl: + Reloc = Alpha::reloc_literal; + useGOT = true; + break; + case Alpha::LDAg: + case Alpha::LDAHg: + Reloc = Alpha::reloc_gpdist; + Offset = MI.getOperand(3).getImmedValue(); + break; + default: + assert(0 && "unknown relocatable instruction"); + abort(); + } + if (MO.isGlobalAddress()) + MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), + Reloc, MO.getGlobal(), Offset, + false, useGOT)); + else if (MO.isExternalSymbol()) + MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), + Reloc, MO.getSymbolName(), Offset, + true)); + else + MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), + Reloc, MO.getConstantPoolIndex(), + Offset)); + } else if (MO.isMachineBasicBlock()) { + MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), + Alpha::reloc_bsr, + MO.getMachineBasicBlock())); + }else { + cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n"; + abort(); + } + + return rv; +} + + +#include "AlphaGenCodeEmitter.inc" + diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp new file mode 100644 index 0000000..4f7533c --- /dev/null +++ 
b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp @@ -0,0 +1,563 @@ +//===-- AlphaISelDAGToDAG.cpp - Alpha pattern matching inst selector ------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Andrew Lenharth and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pattern matching instruction selector for Alpha, +// converting from a legalized dag to a Alpha dag. +// +//===----------------------------------------------------------------------===// + +#include "Alpha.h" +#include "AlphaTargetMachine.h" +#include "AlphaISelLowering.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SSARegMap.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalValue.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include <algorithm> +#include <queue> +#include <set> +using namespace llvm; + +namespace { + + //===--------------------------------------------------------------------===// + /// AlphaDAGToDAGISel - Alpha specific code to select Alpha machine + /// instructions for SelectionDAG operations. 
+ class AlphaDAGToDAGISel : public SelectionDAGISel { + AlphaTargetLowering AlphaLowering; + + static const int64_t IMM_LOW = -32768; + static const int64_t IMM_HIGH = 32767; + static const int64_t IMM_MULT = 65536; + static const int64_t IMM_FULLHIGH = IMM_HIGH + IMM_HIGH * IMM_MULT; + static const int64_t IMM_FULLLOW = IMM_LOW + IMM_LOW * IMM_MULT; + + static int64_t get_ldah16(int64_t x) { + int64_t y = x / IMM_MULT; + if (x % IMM_MULT > IMM_HIGH) + ++y; + return y; + } + + static int64_t get_lda16(int64_t x) { + return x - get_ldah16(x) * IMM_MULT; + } + + /// get_zapImm - Return a zap mask if X is a valid immediate for a zapnot + /// instruction (if not, return 0). Note that this code accepts partial + /// zap masks. For example (and LHS, 1) is a valid zap, as long we know + /// that the bits 1-7 of LHS are already zero. If LHS is non-null, we are + /// in checking mode. If LHS is null, we assume that the mask has already + /// been validated before. + uint64_t get_zapImm(SDOperand LHS, uint64_t Constant) { + uint64_t BitsToCheck = 0; + unsigned Result = 0; + for (unsigned i = 0; i != 8; ++i) { + if (((Constant >> 8*i) & 0xFF) == 0) { + // nothing to do. + } else { + Result |= 1 << i; + if (((Constant >> 8*i) & 0xFF) == 0xFF) { + // If the entire byte is set, zapnot the byte. + } else if (LHS.Val == 0) { + // Otherwise, if the mask was previously validated, we know its okay + // to zapnot this entire byte even though all the bits aren't set. + } else { + // Otherwise we don't know that the it's okay to zapnot this entire + // byte. Only do this iff we can prove that the missing bits are + // already null, so the bytezap doesn't need to really null them. + BitsToCheck |= ~Constant & (0xFF << 8*i); + } + } + } + + // If there are missing bits in a byte (for example, X & 0xEF00), check to + // see if the missing bits (0x1000) are already known zero if not, the zap + // isn't okay to do, as it won't clear all the required bits. 
+ if (BitsToCheck && + !CurDAG->MaskedValueIsZero(LHS, BitsToCheck)) + return 0; + + return Result; + } + + static uint64_t get_zapImm(uint64_t x) { + unsigned build = 0; + for(int i = 0; i != 8; ++i) { + if ((x & 0x00FF) == 0x00FF) + build |= 1 << i; + else if ((x & 0x00FF) != 0) + return 0; + x >>= 8; + } + return build; + } + + + static uint64_t getNearPower2(uint64_t x) { + if (!x) return 0; + unsigned at = CountLeadingZeros_64(x); + uint64_t complow = 1 << (63 - at); + uint64_t comphigh = 1 << (64 - at); + //cerr << x << ":" << complow << ":" << comphigh << "\n"; + if (abs(complow - x) <= abs(comphigh - x)) + return complow; + else + return comphigh; + } + + static bool chkRemNearPower2(uint64_t x, uint64_t r, bool swap) { + uint64_t y = getNearPower2(x); + if (swap) + return (y - x) == r; + else + return (x - y) == r; + } + + static bool isFPZ(SDOperand N) { + ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N); + return (CN && (CN->isExactlyValue(+0.0) || CN->isExactlyValue(-0.0))); + } + static bool isFPZn(SDOperand N) { + ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N); + return (CN && CN->isExactlyValue(-0.0)); + } + static bool isFPZp(SDOperand N) { + ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N); + return (CN && CN->isExactlyValue(+0.0)); + } + + public: + AlphaDAGToDAGISel(TargetMachine &TM) + : SelectionDAGISel(AlphaLowering), + AlphaLowering(*(AlphaTargetLowering*)(TM.getTargetLowering())) + {} + + /// getI64Imm - Return a target constant with the specified value, of type + /// i64. + inline SDOperand getI64Imm(int64_t Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i64); + } + + // Select - Convert the specified operand from a target-independent to a + // target-specific node if it hasn't already been changed. + SDNode *Select(SDOperand Op); + + /// InstructionSelectBasicBlock - This callback is invoked by + /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. 
+ virtual void InstructionSelectBasicBlock(SelectionDAG &DAG); + + virtual const char *getPassName() const { + return "Alpha DAG->DAG Pattern Instruction Selection"; + } + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op, + char ConstraintCode, + std::vector<SDOperand> &OutOps, + SelectionDAG &DAG) { + SDOperand Op0; + switch (ConstraintCode) { + default: return true; + case 'm': // memory + Op0 = Op; + AddToISelQueue(Op0); + break; + } + + OutOps.push_back(Op0); + return false; + } + +// Include the pieces autogenerated from the target description. +#include "AlphaGenDAGISel.inc" + +private: + SDOperand getGlobalBaseReg(); + SDOperand getGlobalRetAddr(); + void SelectCALL(SDOperand Op); + + }; +} + +/// getGlobalBaseReg - Output the instructions required to put the +/// GOT address into a register. +/// +SDOperand AlphaDAGToDAGISel::getGlobalBaseReg() { + MachineFunction* MF = BB->getParent(); + unsigned GP = 0; + for(MachineFunction::livein_iterator ii = MF->livein_begin(), + ee = MF->livein_end(); ii != ee; ++ii) + if (ii->first == Alpha::R29) { + GP = ii->second; + break; + } + assert(GP && "GOT PTR not in liveins"); + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + GP, MVT::i64); +} + +/// getRASaveReg - Grab the return address +/// +SDOperand AlphaDAGToDAGISel::getGlobalRetAddr() { + MachineFunction* MF = BB->getParent(); + unsigned RA = 0; + for(MachineFunction::livein_iterator ii = MF->livein_begin(), + ee = MF->livein_end(); ii != ee; ++ii) + if (ii->first == Alpha::R26) { + RA = ii->second; + break; + } + assert(RA && "RA PTR not in liveins"); + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + RA, MVT::i64); +} + +/// InstructionSelectBasicBlock - This callback is invoked by +/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. 
/// InstructionSelectBasicBlock - Run instruction selection over the DAG for
/// one basic block, then schedule and emit the resulting machine code.
void AlphaDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
  DEBUG(BB->dump());

  // Select target instructions for the DAG.
  DAG.setRoot(SelectRoot(DAG.getRoot()));
  DAG.RemoveDeadNodes();

  // Emit machine code to BB.
  ScheduleAndEmitDAG(DAG);
}

// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
SDNode *AlphaDAGToDAGISel::Select(SDOperand Op) {
  SDNode *N = Op.Val;
  // Opcodes in [BUILTIN_OP_END, AlphaISD::FIRST_NUMBER) are already
  // target nodes produced by a previous selection pass.
  if (N->getOpcode() >= ISD::BUILTIN_OP_END &&
      N->getOpcode() < AlphaISD::FIRST_NUMBER) {
    return NULL;   // Already selected.
  }

  switch (N->getOpcode()) {
  default: break;
  case AlphaISD::CALL:
    SelectCALL(Op);
    return NULL;

  case ISD::FrameIndex: {
    // Materialize a frame address as LDA fi, 0.
    // NOTE(review): the target frame index is created with MVT::i32 on a
    // 64-bit target — presumably historical; verify against LDA's pattern.
    int FI = cast<FrameIndexSDNode>(N)->getIndex();
    return CurDAG->SelectNodeTo(N, Alpha::LDA, MVT::i64,
                                CurDAG->getTargetFrameIndex(FI, MVT::i32),
                                getI64Imm(0));
  }
  case ISD::GLOBAL_OFFSET_TABLE: {
    // The GOT pointer is the live-in copy of R29 ($gp).
    SDOperand Result = getGlobalBaseReg();
    ReplaceUses(Op, Result);
    return NULL;
  }
  case AlphaISD::GlobalRetAddr: {
    // The return address is the live-in copy of R26 ($ra).
    SDOperand Result = getGlobalRetAddr();
    ReplaceUses(Op, Result);
    return NULL;
  }

  case AlphaISD::DivCall: {
    // Call the Alpha division helper: dividend in R24, divisor in R25,
    // routine address in R27; JSRs performs the call and the result comes
    // back in R27, copied out via BISr (register move).
    SDOperand Chain = CurDAG->getEntryNode();
    SDOperand N0 = Op.getOperand(0);
    SDOperand N1 = Op.getOperand(1);
    SDOperand N2 = Op.getOperand(2);
    AddToISelQueue(N0);
    AddToISelQueue(N1);
    AddToISelQueue(N2);
    Chain = CurDAG->getCopyToReg(Chain, Alpha::R24, N1,
                                 SDOperand(0,0));
    Chain = CurDAG->getCopyToReg(Chain, Alpha::R25, N2,
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, Alpha::R27, N0,
                                 Chain.getValue(1));
    SDNode *CNode =
      CurDAG->getTargetNode(Alpha::JSRs, MVT::Other, MVT::Flag,
                            Chain, Chain.getValue(1));
    Chain = CurDAG->getCopyFromReg(Chain, Alpha::R27, MVT::i64,
                                   SDOperand(CNode, 1));
    return CurDAG->SelectNodeTo(N, Alpha::BISr, MVT::i64, Chain, Chain);
  }

  case ISD::READCYCLECOUNTER: {
    SDOperand Chain = N->getOperand(0);
    AddToISelQueue(Chain); //Select chain
    return CurDAG->getTargetNode(Alpha::RPCC, MVT::i64, MVT::Other,
                                 Chain);
  }

  case ISD::Constant: {
    uint64_t uval = cast<ConstantSDNode>(N)->getValue();

    // Zero is just a read of R31, the hardwired zero register.
    if (uval == 0) {
      SDOperand Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
                                                Alpha::R31, MVT::i64);
      ReplaceUses(Op, Result);
      return NULL;
    }

    int64_t val = (int64_t)uval;
    int32_t val32 = (int32_t)val;
    // Constants reachable by an LDAH/LDA pair are left for the
    // autogenerated patterns.
    if (val <= IMM_HIGH + IMM_HIGH * IMM_MULT &&
        val >= IMM_LOW  + IMM_LOW  * IMM_MULT)
      break; //(LDAH (LDA))
    if ((uval >> 32) == 0 && //empty upper bits
        val32 <= IMM_HIGH + IMM_HIGH * IMM_MULT)
      // val32 >= IMM_LOW  + IMM_LOW  * IMM_MULT) //always true
      break; //(zext (LDAH (LDA)))
    //Else use the constant pool
    // Address the pool entry GP-relative (LDAHr high half) and load it with
    // LDQr.
    ConstantInt *C = ConstantInt::get(Type::Int64Ty, uval);
    SDOperand CPI = CurDAG->getTargetConstantPool(C, MVT::i64);
    SDNode *Tmp = CurDAG->getTargetNode(Alpha::LDAHr, MVT::i64, CPI,
                                        getGlobalBaseReg());
    return CurDAG->SelectNodeTo(N, Alpha::LDQr, MVT::i64, MVT::Other,
                                CPI, SDOperand(Tmp, 0), CurDAG->getEntryNode());
  }
  case ISD::TargetConstantFP: {
    // Only +-0.0 are legal FP immediates: synthesize them by copying the
    // sign of F31 (always zero) onto itself.
    ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
    bool isDouble = N->getValueType(0) == MVT::f64;
    MVT::ValueType T = isDouble ? MVT::f64 : MVT::f32;
    if (CN->isExactlyValue(+0.0)) {
      return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYST : Alpha::CPYSS,
                                  T, CurDAG->getRegister(Alpha::F31, T),
                                  CurDAG->getRegister(Alpha::F31, T));
    } else if ( CN->isExactlyValue(-0.0)) {
      return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYSNT : Alpha::CPYSNS,
                                  T, CurDAG->getRegister(Alpha::F31, T),
                                  CurDAG->getRegister(Alpha::F31, T));
    } else {
      abort();
    }
    break;
  }

  case ISD::SETCC:
    // Custom-select FP comparisons: map the condition code onto one of
    // CMPTEQ/CMPTLT/CMPTLE/CMPTUN, possibly swapping operands (rev) or
    // inverting the result (inv).
    if (MVT::isFloatingPoint(N->getOperand(0).Val->getValueType(0))) {
      ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

      unsigned Opc = Alpha::WTF;
      bool rev = false;
      bool inv = false;
      switch(CC) {
      default: DEBUG(N->dump(CurDAG)); assert(0 && "Unknown FP comparison!");
      case ISD::SETEQ: case ISD::SETOEQ: case ISD::SETUEQ:
        Opc = Alpha::CMPTEQ; break;
      case ISD::SETLT: case ISD::SETOLT: case ISD::SETULT:
        Opc = Alpha::CMPTLT; break;
      case ISD::SETLE: case ISD::SETOLE: case ISD::SETULE:
        Opc = Alpha::CMPTLE; break;
      case ISD::SETGT: case ISD::SETOGT: case ISD::SETUGT:
        Opc = Alpha::CMPTLT; rev = true; break;
      case ISD::SETGE: case ISD::SETOGE: case ISD::SETUGE:
        Opc = Alpha::CMPTLE; rev = true; break;
      case ISD::SETNE: case ISD::SETONE: case ISD::SETUNE:
        Opc = Alpha::CMPTEQ; inv = true; break;
      case ISD::SETO:
        Opc = Alpha::CMPTUN; inv = true; break;
      case ISD::SETUO:
        Opc = Alpha::CMPTUN; break;
      };
      SDOperand tmp1 = N->getOperand(rev?1:0);
      SDOperand tmp2 = N->getOperand(rev?0:1);
      AddToISelQueue(tmp1);
      AddToISelQueue(tmp2);
      SDNode *cmp = CurDAG->getTargetNode(Opc, MVT::f64, tmp1, tmp2);
      if (inv)
        cmp = CurDAG->getTargetNode(Alpha::CMPTEQ, MVT::f64, SDOperand(cmp, 0),
                                    CurDAG->getRegister(Alpha::F31, MVT::f64));
      // Unordered comparisons additionally OR in the "unordered" result by
      // adding the CMPTUN output to the primary compare.
      switch(CC) {
      case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE:
      case ISD::SETUNE: case ISD::SETUGT: case ISD::SETUGE:
       {
         SDNode* cmp2 = CurDAG->getTargetNode(Alpha::CMPTUN, MVT::f64,
                                              tmp1, tmp2);
         cmp = CurDAG->getTargetNode(Alpha::ADDT, MVT::f64,
                                     SDOperand(cmp2, 0), SDOperand(cmp, 0));
         break;
       }
      default: break;
      }

      // Move the FP compare result into an integer register (FTOIT) and
      // collapse its nonzero bit pattern to 0/1 via CMPULT R31, x.
      SDNode* LD = CurDAG->getTargetNode(Alpha::FTOIT, MVT::i64, SDOperand(cmp, 0));
      return CurDAG->getTargetNode(Alpha::CMPULT, MVT::i64,
                                   CurDAG->getRegister(Alpha::R31, MVT::i64),
                                   SDOperand(LD,0));
    }
    break;

  case ISD::SELECT:
    // FP-typed select with an integer condition (not an FP setcc): move the
    // condition into an FP register and use FCMOVNE.
    if (MVT::isFloatingPoint(N->getValueType(0)) &&
        (N->getOperand(0).getOpcode() != ISD::SETCC ||
         !MVT::isFloatingPoint(N->getOperand(0).getOperand(1).getValueType()))) {
      //This should be the condition not covered by the Patterns
      //FIXME: Don't have SelectCode die, but rather return something testable
      // so that things like this can be caught in fall though code
      //move int to fp
      bool isDouble = N->getValueType(0) == MVT::f64;
      SDOperand cond = N->getOperand(0);
      SDOperand TV = N->getOperand(1);
      SDOperand FV = N->getOperand(2);
      AddToISelQueue(cond);
      AddToISelQueue(TV);
      AddToISelQueue(FV);

      SDNode* LD = CurDAG->getTargetNode(Alpha::ITOFT, MVT::f64, cond);
      return CurDAG->getTargetNode(isDouble?Alpha::FCMOVNET:Alpha::FCMOVNES,
                                   MVT::f64, FV, TV, SDOperand(LD,0));
    }
    break;

  case ISD::AND: {
    // Try to turn (and (srl X, S), M) into (srl (zapnot X, mask), S) by
    // widening the mask with the low S "don't care" bits.
    ConstantSDNode* SC = NULL;
    ConstantSDNode* MC = NULL;
    if (N->getOperand(0).getOpcode() == ISD::SRL &&
        (MC = dyn_cast<ConstantSDNode>(N->getOperand(1))) &&
        (SC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)))) {
      uint64_t sval = SC->getValue();
      uint64_t mval = MC->getValue();
      // If the result is a zap, let the autogened stuff handle it.
      if (get_zapImm(N->getOperand(0), mval))
        break;
      // given mask X, and shift S, we want to see if there is any zap in the
      // mask if we play around with the botton S bits
      // NOTE(review): assumes sval > 0 — "64 - sval" would be a 64-bit shift
      // (UB) for a shift-by-zero SRL; presumably such SRLs are folded before
      // selection — verify.
      uint64_t dontcare = (~0ULL) >> (64 - sval);
      uint64_t mask = mval << sval;

      if (get_zapImm(mask | dontcare))
        mask = mask | dontcare;

      if (get_zapImm(mask)) {
        AddToISelQueue(N->getOperand(0).getOperand(0));
        SDOperand Z =
          SDOperand(CurDAG->getTargetNode(Alpha::ZAPNOTi, MVT::i64,
                                          N->getOperand(0).getOperand(0),
                                          getI64Imm(get_zapImm(mask))), 0);
        return CurDAG->getTargetNode(Alpha::SRLr, MVT::i64, Z,
                                     getI64Imm(sval));
      }
    }
    break;
  }

  }

  // Everything else is handled by the autogenerated matcher.
  return SelectCode(Op);
}

/// SelectCALL - Custom-select AlphaISD::CALL: stores arguments 7..n to the
/// stack at 0(SP).., copies the first six into R16-R21/F16-F21, emits BSR
/// (direct, GP-relative) or JSR (indirect through R27), and copies the
/// return value out of R0/F0.
void AlphaDAGToDAGISel::SelectCALL(SDOperand Op) {
  //TODO: add flag stuff to prevent nondeturministic breakage!

  SDNode *N = Op.Val;
  SDOperand Chain = N->getOperand(0);
  SDOperand Addr = N->getOperand(1);
  SDOperand InFlag(0,0);  // Null incoming flag value.
  AddToISelQueue(Chain);

  std::vector<SDOperand> CallOperands;
  std::vector<MVT::ValueType> TypeOperands;

  //grab the arguments
  for(int i = 2, e = N->getNumOperands(); i < e; ++i) {
    TypeOperands.push_back(N->getOperand(i).getValueType());
    AddToISelQueue(N->getOperand(i));
    CallOperands.push_back(N->getOperand(i));
  }
  int count = N->getNumOperands() - 2;

  // Calling-convention register sequences for the first six arguments.
  static const unsigned args_int[] = {Alpha::R16, Alpha::R17, Alpha::R18,
                                      Alpha::R19, Alpha::R20, Alpha::R21};
  static const unsigned args_float[] = {Alpha::F16, Alpha::F17, Alpha::F18,
                                        Alpha::F19, Alpha::F20, Alpha::F21};

  // Arguments beyond the sixth go on the stack, quadword-aligned at
  // (i-6)*8 off the stack pointer (R30).
  for (int i = 6; i < count; ++i) {
    unsigned Opc = Alpha::WTF;
    if (MVT::isInteger(TypeOperands[i])) {
      Opc = Alpha::STQ;
    } else if (TypeOperands[i] == MVT::f32) {
      Opc = Alpha::STS;
    } else if (TypeOperands[i] == MVT::f64) {
      Opc = Alpha::STT;
    } else
      assert(0 && "Unknown operand");

    SDOperand Ops[] = { CallOperands[i],  getI64Imm((i - 6) * 8),
                        CurDAG->getCopyFromReg(Chain, Alpha::R30, MVT::i64),
                        Chain };
    Chain = SDOperand(CurDAG->getTargetNode(Opc, MVT::Other, Ops, 4), 0);
  }
  // First six arguments ride in registers, glued together so the
  // scheduler keeps the copies adjacent to the call.
  for (int i = 0; i < std::min(6, count); ++i) {
    if (MVT::isInteger(TypeOperands[i])) {
      Chain = CurDAG->getCopyToReg(Chain, args_int[i], CallOperands[i], InFlag);
      InFlag = Chain.getValue(1);
    } else if (TypeOperands[i] == MVT::f32 || TypeOperands[i] == MVT::f64) {
      Chain = CurDAG->getCopyToReg(Chain, args_float[i], CallOperands[i], InFlag);
      InFlag = Chain.getValue(1);
    } else
      assert(0 && "Unknown operand");
  }

  // Finally, once everything is in registers to pass to the call, emit the
  // call itself.
  if (Addr.getOpcode() == AlphaISD::GPRelLo) {
    // Direct call: restore GP into R29 and branch with BSR.
    SDOperand GOT = getGlobalBaseReg();
    Chain = CurDAG->getCopyToReg(Chain, Alpha::R29, GOT, InFlag);
    InFlag = Chain.getValue(1);
    Chain = SDOperand(CurDAG->getTargetNode(Alpha::BSR, MVT::Other, MVT::Flag,
                                            Addr.getOperand(0), Chain, InFlag), 0);
  } else {
    // Indirect call: the procedure value goes in R27, then JSR.
    AddToISelQueue(Addr);
    Chain = CurDAG->getCopyToReg(Chain, Alpha::R27, Addr, InFlag);
    InFlag = Chain.getValue(1);
    Chain = SDOperand(CurDAG->getTargetNode(Alpha::JSR, MVT::Other, MVT::Flag,
                                            Chain, InFlag), 0);
  }
  InFlag = Chain.getValue(1);

  std::vector<SDOperand> CallResults;

  // Copy the return value out of the convention's result register.
  switch (N->getValueType(0)) {
  default: assert(0 && "Unexpected ret value!");
  case MVT::Other: break;
  case MVT::i64:
    Chain = CurDAG->getCopyFromReg(Chain, Alpha::R0, MVT::i64, InFlag).getValue(1);
    CallResults.push_back(Chain.getValue(0));
    break;
  case MVT::f32:
    Chain = CurDAG->getCopyFromReg(Chain, Alpha::F0, MVT::f32, InFlag).getValue(1);
    CallResults.push_back(Chain.getValue(0));
    break;
  case MVT::f64:
    Chain = CurDAG->getCopyFromReg(Chain, Alpha::F0, MVT::f64, InFlag).getValue(1);
    CallResults.push_back(Chain.getValue(0));
    break;
  }

  CallResults.push_back(Chain);
  for (unsigned i = 0, e = CallResults.size(); i != e; ++i)
    ReplaceUses(Op.getValue(i), CallResults[i]);
}


/// createAlphaISelDag - This pass converts a legalized DAG into a
/// Alpha-specific
/// DAG, ready for instruction scheduling.
///
FunctionPass *llvm::createAlphaISelDag(TargetMachine &TM) {
  return new AlphaDAGToDAGISel(TM);
}
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
new file mode 100644
index 0000000..d4777b2
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -0,0 +1,623 @@
//===-- AlphaISelLowering.cpp - Alpha DAG Lowering Implementation ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Andrew Lenharth and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AlphaISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "AlphaISelLowering.h"
#include "AlphaTargetMachine.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Module.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

/// AddLiveIn - This helper function adds the specified physical register to the
/// MachineFunction as a live in value.  It also creates a corresponding virtual
/// register for it.
static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
                          TargetRegisterClass *RC) {
  assert(RC->contains(PReg) && "Not the correct regclass!");
  unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC);
  MF.addLiveIn(PReg, VReg);
  return VReg;
}

/// AlphaTargetLowering - Declare how every illegal type and operation is to be
/// handled: expanded, promoted, or custom-lowered by LowerOperation below.
AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM) {
  // Set up the TargetLowering object.
  //I am having problems with shr n ubyte 1
  setShiftAmountType(MVT::i64);
  setSetCCResultType(MVT::i64);
  setSetCCResultContents(ZeroOrOneSetCCResult);

  setUsesGlobalOffsetTable(true);

  // The only legal value types: i64 in GPRs, f64/f32 in FPRs.
  addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
  addRegisterClass(MVT::f64, Alpha::F8RCRegisterClass);
  addRegisterClass(MVT::f32, Alpha::F4RCRegisterClass);

  setLoadXAction(ISD::EXTLOAD, MVT::i1,  Promote);
  setLoadXAction(ISD::EXTLOAD, MVT::f32, Expand);

  setLoadXAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
  setLoadXAction(ISD::ZEXTLOAD, MVT::i32, Expand);

  setLoadXAction(ISD::SEXTLOAD, MVT::i1,  Promote);
  setLoadXAction(ISD::SEXTLOAD, MVT::i8,  Expand);
  setLoadXAction(ISD::SEXTLOAD, MVT::i16, Expand);

  setStoreXAction(MVT::i1, Promote);

  //  setOperationAction(ISD::BRIND,        MVT::Other,   Expand);
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);

  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);

  // Count instructions (CTPOP/CTTZ/CTLZ) are only available with the CIX
  // extension; expand them otherwise.
  if (!TM.getSubtarget<AlphaSubtarget>().hasCT()) {
    setOperationAction(ISD::CTPOP    , MVT::i64  , Expand);
    setOperationAction(ISD::CTTZ     , MVT::i64  , Expand);
    setOperationAction(ISD::CTLZ     , MVT::i64  , Expand);
  }
  setOperationAction(ISD::BSWAP    , MVT::i64, Expand);
  setOperationAction(ISD::ROTL     , MVT::i64, Expand);
  setOperationAction(ISD::ROTR     , MVT::i64, Expand);

  // Alpha has no integer divide instruction: custom-lowered below to either
  // a constant-divisor expansion or an AlphaISD::DivCall helper call.
  setOperationAction(ISD::SREM     , MVT::i64, Custom);
  setOperationAction(ISD::UREM     , MVT::i64, Custom);
  setOperationAction(ISD::SDIV     , MVT::i64, Custom);
  setOperationAction(ISD::UDIV     , MVT::i64, Custom);

  setOperationAction(ISD::MEMMOVE  , MVT::Other, Expand);
  setOperationAction(ISD::MEMSET   , MVT::Other, Expand);
  setOperationAction(ISD::MEMCPY   , MVT::Other, Expand);

  // We don't support sin/cos/sqrt
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);

  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::SETCC, MVT::f32, Promote);

  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Promote);

  // We don't have line number support yet.
  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
  setOperationAction(ISD::LABEL, MVT::Other, Expand);

  // Not implemented yet.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // We want to legalize GlobalAddress and ConstantPool and
  // ExternalSymbols nodes into the appropriate instructions to
  // materialize the address.
  setOperationAction(ISD::GlobalAddress,  MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool,   MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);

  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
  setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
  setOperationAction(ISD::VAARG,   MVT::Other, Custom);
  setOperationAction(ISD::VAARG,   MVT::i32,   Custom);

  setOperationAction(ISD::RET, MVT::Other, Custom);

  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);

  setStackPointerRegisterToSaveRestore(Alpha::R30);

  // Only +-0.0 are legal FP immediates (materialized from F31).
  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
  addLegalFPImmediate(+0.0); //F31
  addLegalFPImmediate(-0.0); //-F31

  setJumpBufSize(272);
  setJumpBufAlignment(16);

  computeRegisterProperties();
}

/// getTargetNodeName - Return the human-readable name for a target-specific
/// DAG node opcode, or null if the opcode is not one of ours.
const char *AlphaTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return 0;
  case AlphaISD::CVTQT_: return "Alpha::CVTQT_";
  case AlphaISD::CVTQS_: return "Alpha::CVTQS_";
  case AlphaISD::CVTTQ_: return "Alpha::CVTTQ_";
  case AlphaISD::GPRelHi: return "Alpha::GPRelHi";
  case AlphaISD::GPRelLo: return "Alpha::GPRelLo";
  case AlphaISD::RelLit: return "Alpha::RelLit";
  case AlphaISD::GlobalRetAddr: return "Alpha::GlobalRetAddr";
  case AlphaISD::CALL:   return "Alpha::CALL";
  case AlphaISD::DivCall: return "Alpha::DivCall";
  case AlphaISD::RET_FLAG: return "Alpha::RET_FLAG";
  case AlphaISD::COND_BRANCH_I: return "Alpha::COND_BRANCH_I";
  case AlphaISD::COND_BRANCH_F: return "Alpha::COND_BRANCH_F";
  }
}

/// LowerJumpTable - Address a jump table GP-relative: GPRelHi of the table
/// index off the GOT, then GPRelLo to finish the address.
static SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType PtrVT = Op.getValueType();
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  SDOperand JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
  // NOTE(review): Zero is computed but unused — presumably leftover; verify
  // before removing.
  SDOperand Zero = DAG.getConstant(0, PtrVT);

  SDOperand Hi = DAG.getNode(AlphaISD::GPRelHi,  MVT::i64, JTI,
                             DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
  SDOperand Lo = DAG.getNode(AlphaISD::GPRelLo, MVT::i64, JTI, Hi);
  return Lo;
}

//http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/
//AA-PY8AC-TET1_html/callCH3.html#BLOCK21

//For now, just use variable size stack frame format

//In a standard call, the first six items are passed in registers $16
//- $21 and/or registers $f16 - $f21. (See Section 4.1.2 for details
//of argument-to-register correspondence.) The remaining items are
//collected in a memory argument list that is a naturally aligned
//array of quadwords. In a standard call, this list, if present, must
//be passed at 0(SP).
//7 ... n         0(SP) ... (n-7)*8(SP)

// //#define FP    $15
// //#define RA    $26
// //#define PV    $27
// //#define GP    $29
// //#define SP    $30

/// LowerFORMAL_ARGUMENTS - Copy the first six incoming arguments out of
/// R16-R21/F16-F21 and load the rest from fixed stack slots; for varargs
/// functions, additionally spill all six register pairs so va_arg can walk
/// them in memory.
static SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG,
                                       int &VarArgsBase,
                                       int &VarArgsOffset) {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  std::vector<SDOperand> ArgValues;
  SDOperand Root = Op.getOperand(0);

  AddLiveIn(MF, Alpha::R29, &Alpha::GPRCRegClass); //GP
  AddLiveIn(MF, Alpha::R26, &Alpha::GPRCRegClass); //RA

  // Physical argument registers; each slot is overwritten with the vreg it
  // is copied into as the argument is claimed.
  unsigned args_int[] = {
    Alpha::R16, Alpha::R17, Alpha::R18, Alpha::R19, Alpha::R20, Alpha::R21};
  unsigned args_float[] = {
    Alpha::F16, Alpha::F17, Alpha::F18, Alpha::F19, Alpha::F20, Alpha::F21};

  for (unsigned ArgNo = 0, e = Op.Val->getNumValues()-1; ArgNo != e; ++ArgNo) {
    SDOperand argt;
    MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType();
    SDOperand ArgVal;

    if (ArgNo  < 6) {
      switch (ObjectVT) {
      default:
        cerr << "Unknown Type " << ObjectVT << "\n";
        abort();
      case MVT::f64:
        args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
                                      &Alpha::F8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Root, args_float[ArgNo], ObjectVT);
        break;
      case MVT::f32:
        args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
                                      &Alpha::F4RCRegClass);
        ArgVal = DAG.getCopyFromReg(Root, args_float[ArgNo], ObjectVT);
        break;
      case MVT::i64:
        args_int[ArgNo] = AddLiveIn(MF, args_int[ArgNo],
                                    &Alpha::GPRCRegClass);
        ArgVal = DAG.getCopyFromReg(Root, args_int[ArgNo], MVT::i64);
        break;
      }
    } else { //more args
      // Create the frame index object for this incoming parameter...
      int FI = MFI->CreateFixedObject(8, 8 * (ArgNo - 6));

      // Create the SelectionDAG nodes corresponding to a load
      //from this parameter
      SDOperand FIN = DAG.getFrameIndex(FI, MVT::i64);
      ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0);
    }
    ArgValues.push_back(ArgVal);
  }

  // If the functions takes variable number of arguments, copy all regs to stack
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
  if (isVarArg) {
    VarArgsOffset = (Op.Val->getNumValues()-1) * 8;
    std::vector<SDOperand> LS;
    for (int i = 0; i < 6; ++i) {
      // Registers not claimed above are still physical; make them live-in
      // now so their incoming values can be spilled.
      if (MRegisterInfo::isPhysicalRegister(args_int[i]))
        args_int[i] = AddLiveIn(MF, args_int[i], &Alpha::GPRCRegClass);
      SDOperand argt = DAG.getCopyFromReg(Root, args_int[i], MVT::i64);
      int FI = MFI->CreateFixedObject(8, -8 * (6 - i));
      if (i == 0) VarArgsBase = FI;
      SDOperand SDFI = DAG.getFrameIndex(FI, MVT::i64);
      LS.push_back(DAG.getStore(Root, argt, SDFI, NULL, 0));

      if (MRegisterInfo::isPhysicalRegister(args_float[i]))
        args_float[i] = AddLiveIn(MF, args_float[i], &Alpha::F8RCRegClass);
      argt = DAG.getCopyFromReg(Root, args_float[i], MVT::f64);
      // FP registers are spilled 6 slots (48 bytes) below the int ones.
      FI = MFI->CreateFixedObject(8, - 8 * (12 - i));
      SDFI = DAG.getFrameIndex(FI, MVT::i64);
      LS.push_back(DAG.getStore(Root, argt, SDFI, NULL, 0));
    }

    //Set up a token factor with all the stack traffic
    Root = DAG.getNode(ISD::TokenFactor, MVT::Other, &LS[0], LS.size());
  }

  ArgValues.push_back(Root);

  // Return the new list of results.
  std::vector<MVT::ValueType> RetVT(Op.Val->value_begin(),
                                    Op.Val->value_end());
  return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size());
}

/// LowerRET - Restore the return address into R26, copy an optional single
/// return value into R0 (integer) or F0 (floating point), and emit RET_FLAG.
static SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Copy = DAG.getCopyToReg(Op.getOperand(0), Alpha::R26,
                                    DAG.getNode(AlphaISD::GlobalRetAddr,
                                                MVT::i64),
                                    SDOperand());
  switch (Op.getNumOperands()) {
  default:
    assert(0 && "Do not know how to return this many arguments!");
    abort();
  case 1:
    break;
    //return SDOperand(); // ret void is legal
  case 3: {
    MVT::ValueType ArgVT = Op.getOperand(1).getValueType();
    unsigned ArgReg;
    if (MVT::isInteger(ArgVT))
      ArgReg = Alpha::R0;
    else {
      assert(MVT::isFloatingPoint(ArgVT));
      ArgReg = Alpha::F0;
    }
    Copy = DAG.getCopyToReg(Copy, ArgReg, Op.getOperand(1), Copy.getValue(1));
    if (DAG.getMachineFunction().liveout_empty())
      DAG.getMachineFunction().addLiveOut(ArgReg);
    break;
  }
  }
  return DAG.getNode(AlphaISD::RET_FLAG, MVT::Other, Copy, Copy.getValue(1));
}

/// LowerCallTo - Lower an outgoing call: promote sub-64-bit integer
/// arguments to i64, wrap everything in an AlphaISD::CALL node (selected by
/// SelectCALL), and narrow the promoted return value back to its real type.
/// Returns the (result, chain) pair.
std::pair<SDOperand, SDOperand>
AlphaTargetLowering::LowerCallTo(SDOperand Chain, const Type *RetTy,
                                 bool RetTyIsSigned, bool isVarArg,
                                 unsigned CallingConv, bool isTailCall,
                                 SDOperand Callee, ArgListTy &Args,
                                 SelectionDAG &DAG) {
  // Only arguments past the sixth occupy stack space (8 bytes each).
  int NumBytes = 0;
  if (Args.size() > 6)
    NumBytes = (Args.size() - 6) * 8;

  Chain = DAG.getCALLSEQ_START(Chain,
                               DAG.getConstant(NumBytes, getPointerTy()));
  std::vector<SDOperand> args_to_use;
  for (unsigned i = 0, e = Args.size(); i != e; ++i)
  {
    switch (getValueType(Args[i].Ty)) {
    default: assert(0 && "Unexpected ValueType for argument!");
    case MVT::i1:
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
      // Promote the integer to 64 bits.  If the input type is signed use a
      // sign extend, otherwise use a zero extend.
      if (Args[i].isSExt)
        Args[i].Node = DAG.getNode(ISD::SIGN_EXTEND, MVT::i64, Args[i].Node);
      else if (Args[i].isZExt)
        Args[i].Node = DAG.getNode(ISD::ZERO_EXTEND, MVT::i64, Args[i].Node);
      else
        Args[i].Node = DAG.getNode(ISD::ANY_EXTEND, MVT::i64, Args[i].Node);
      break;
    case MVT::i64:
    case MVT::f64:
    case MVT::f32:
      break;
    }
    args_to_use.push_back(Args[i].Node);
  }

  // Sub-64-bit integer results are produced as i64 and truncated below.
  std::vector<MVT::ValueType> RetVals;
  MVT::ValueType RetTyVT = getValueType(RetTy);
  MVT::ValueType ActualRetTyVT = RetTyVT;
  if (RetTyVT >= MVT::i1 && RetTyVT <= MVT::i32)
    ActualRetTyVT = MVT::i64;

  if (RetTyVT != MVT::isVoid)
    RetVals.push_back(ActualRetTyVT);
  RetVals.push_back(MVT::Other);

  std::vector<SDOperand> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);
  Ops.insert(Ops.end(), args_to_use.begin(), args_to_use.end());
  SDOperand TheCall = DAG.getNode(AlphaISD::CALL, RetVals, &Ops[0], Ops.size());
  // The chain is result 1 when there is a value result, else result 0.
  Chain = TheCall.getValue(RetTyVT != MVT::isVoid);
  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
                      DAG.getConstant(NumBytes, getPointerTy()));
  SDOperand RetVal = TheCall;

  if (RetTyVT != ActualRetTyVT) {
    // Assert the extension kind the callee used, then truncate to the
    // caller-visible type.
    RetVal = DAG.getNode(RetTyIsSigned ? ISD::AssertSext : ISD::AssertZext,
                         MVT::i64, RetVal, DAG.getValueType(RetTyVT));
    RetVal = DAG.getNode(ISD::TRUNCATE, RetTyVT, RetVal);
  }

  return std::make_pair(RetVal, Chain);
}

/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+/// Custom-lowers the operations that AlphaTargetLowering's constructor marked
+/// Custom: formal args / ret / jump tables are delegated to helpers; int<->fp
+/// conversions go through Alpha's CVTQ* instructions; constant-pool, global,
+/// and external-symbol addresses are materialized GP-relative or via literal
+/// relocation; integer div/rem become magic-number expansions or libcalls;
+/// and the varargs family is expanded against Alpha's two-word va_list.
+SDOperand AlphaTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Wasn't expecting to be able to lower this!");
+  case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG,
+                                                           VarArgsBase,
+                                                           VarArgsOffset);
+
+  case ISD::RET: return LowerRET(Op,DAG);
+  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+
+  case ISD::SINT_TO_FP: {
+    // Move the i64 bit pattern into an FP register (BIT_CONVERT to f64),
+    // then use the integer-to-float convert that matches the result type.
+    assert(MVT::i64 == Op.getOperand(0).getValueType() &&
+           "Unhandled SINT_TO_FP type in custom expander!");
+    SDOperand LD;
+    bool isDouble = MVT::f64 == Op.getValueType();
+    LD = DAG.getNode(ISD::BIT_CONVERT, MVT::f64, Op.getOperand(0));
+    SDOperand FP = DAG.getNode(isDouble?AlphaISD::CVTQT_:AlphaISD::CVTQS_,
+                               isDouble?MVT::f64:MVT::f32, LD);
+    return FP;
+  }
+  case ISD::FP_TO_SINT: {
+    // Widen f32 sources to f64, convert to a quadword integer in an FP
+    // register (CVTTQ_), then move the bit pattern back to an i64.
+    bool isDouble = MVT::f64 == Op.getOperand(0).getValueType();
+    SDOperand src = Op.getOperand(0);
+
+    if (!isDouble) //Promote
+      src = DAG.getNode(ISD::FP_EXTEND, MVT::f64, src);
+
+    src = DAG.getNode(AlphaISD::CVTTQ_, MVT::f64, src);
+
+    return DAG.getNode(ISD::BIT_CONVERT, MVT::i64, src);
+  }
+  case ISD::ConstantPool: {
+    // Constant-pool entries are addressed GP-relative: high 16 bits via
+    // GPRelHi off the GOT pointer, low 16 bits folded in by GPRelLo.
+    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+    Constant *C = CP->getConstVal();
+    SDOperand CPI = DAG.getTargetConstantPool(C, MVT::i64, CP->getAlignment());
+
+    SDOperand Hi = DAG.getNode(AlphaISD::GPRelHi,  MVT::i64, CPI,
+                               DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+    SDOperand Lo = DAG.getNode(AlphaISD::GPRelLo, MVT::i64, CPI, Hi);
+    return Lo;
+  }
+  case ISD::GlobalTLSAddress:
+    // NOTE(review): in NDEBUG builds this assert compiles away and control
+    // falls through into the GlobalAddress case below, silently treating a
+    // TLS address as a plain global -- confirm that is the intended fallback.
+    assert(0 && "TLS not implemented for Alpha.");
+  case ISD::GlobalAddress: {
+    GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+    GlobalValue *GV = GSDN->getGlobal();
+    SDOperand GA = DAG.getTargetGlobalAddress(GV, MVT::i64, GSDN->getOffset());
+
+    // Internal globals can be addressed GP-relative; everything else goes
+    // through a literal relocation (RelLit) against the GOT.
+    //    if (!GV->hasWeakLinkage() && !GV->isDeclaration() && !GV->hasLinkOnceLinkage()) {
+    if (GV->hasInternalLinkage()) {
+      SDOperand Hi = DAG.getNode(AlphaISD::GPRelHi,  MVT::i64, GA,
+                                 DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+      SDOperand Lo = DAG.getNode(AlphaISD::GPRelLo, MVT::i64, GA, Hi);
+      return Lo;
+    } else
+      return DAG.getNode(AlphaISD::RelLit, MVT::i64, GA,
+                         DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+  }
+  case ISD::ExternalSymbol: {
+    // External symbols always use the literal-relocation path.
+    return DAG.getNode(AlphaISD::RelLit, MVT::i64,
+                       DAG.getTargetExternalSymbol(cast<ExternalSymbolSDNode>(Op)
+                                                   ->getSymbol(), MVT::i64),
+                       DAG.getNode(ISD::GLOBAL_OFFSET_TABLE, MVT::i64));
+  }
+
+  case ISD::UREM:
+  case ISD::SREM:
+    //Expand only on constant case: rem = a - (a/const)*const, with the
+    //division strength-reduced by BuildUDIV/BuildSDIV.
+    if (Op.getOperand(1).getOpcode() == ISD::Constant) {
+      MVT::ValueType VT = Op.Val->getValueType(0);
+      SDOperand Tmp1 = Op.Val->getOpcode() == ISD::UREM ?
+        BuildUDIV(Op.Val, DAG, NULL) :
+        BuildSDIV(Op.Val, DAG, NULL);
+      Tmp1 = DAG.getNode(ISD::MUL, VT, Tmp1, Op.getOperand(1));
+      Tmp1 = DAG.getNode(ISD::SUB, VT, Op.getOperand(0), Tmp1);
+      return Tmp1;
+    }
+    //fall through: non-constant rem shares the libcall path below.
+  case ISD::SDIV:
+  case ISD::UDIV:
+    if (MVT::isInteger(Op.getValueType())) {
+      if (Op.getOperand(1).getOpcode() == ISD::Constant)
+        return Op.getOpcode() == ISD::SDIV ? BuildSDIV(Op.Val, DAG, NULL)
+                                           : BuildUDIV(Op.Val, DAG, NULL);
+      // Alpha has no integer divide instruction; call the compiler support
+      // routines via the special DivCall node.
+      const char* opstr = 0;
+      switch (Op.getOpcode()) {
+      case ISD::UREM: opstr = "__remqu"; break;
+      case ISD::SREM: opstr = "__remq";  break;
+      case ISD::UDIV: opstr = "__divqu"; break;
+      case ISD::SDIV: opstr = "__divq";  break;
+      }
+      SDOperand Tmp1 = Op.getOperand(0),
+        Tmp2 = Op.getOperand(1),
+        Addr = DAG.getExternalSymbol(opstr, MVT::i64);
+      return DAG.getNode(AlphaISD::DivCall, MVT::i64, Addr, Tmp1, Tmp2);
+    }
+    break;
+
+  case ISD::VAARG: {
+    // The va_list is two words: an i64 base pointer at the va_list address
+    // and an i32 offset at +8 (see VASTART below).
+    SDOperand Chain = Op.getOperand(0);
+    SDOperand VAListP = Op.getOperand(1);
+    SrcValueSDNode *VAListS = cast<SrcValueSDNode>(Op.getOperand(2));
+
+    SDOperand Base = DAG.getLoad(MVT::i64, Chain, VAListP, VAListS->getValue(),
+                                 VAListS->getOffset());
+    SDOperand Tmp = DAG.getNode(ISD::ADD, MVT::i64, VAListP,
+                                DAG.getConstant(8, MVT::i64));
+    SDOperand Offset = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, Base.getValue(1),
+                                      Tmp, NULL, 0, MVT::i32);
+    SDOperand DataPtr = DAG.getNode(ISD::ADD, MVT::i64, Base, Offset);
+    if (MVT::isFloatingPoint(Op.getValueType()))
+    {
+      //if fp && Offset < 6*8, then subtract 6*8 from DataPtr
+      //(presumably the FP register save area sits 48 bytes below the
+      //integer one -- verify against the Alpha calling convention).
+      SDOperand FPDataPtr = DAG.getNode(ISD::SUB, MVT::i64, DataPtr,
+                                        DAG.getConstant(8*6, MVT::i64));
+      SDOperand CC = DAG.getSetCC(MVT::i64, Offset,
+                                  DAG.getConstant(8*6, MVT::i64), ISD::SETLT);
+      DataPtr = DAG.getNode(ISD::SELECT, MVT::i64, CC, FPDataPtr, DataPtr);
+    }
+
+    // Advance the stored offset by one 8-byte slot.
+    SDOperand NewOffset = DAG.getNode(ISD::ADD, MVT::i64, Offset,
+                                      DAG.getConstant(8, MVT::i64));
+    SDOperand Update = DAG.getTruncStore(Offset.getValue(1), NewOffset,
+                                         Tmp, NULL, 0, MVT::i32);
+
+    SDOperand Result;
+    if (Op.getValueType() == MVT::i32)
+      Result = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, Update, DataPtr,
+                              NULL, 0, MVT::i32);
+    else
+      Result = DAG.getLoad(Op.getValueType(), Update, DataPtr, NULL, 0);
+    return Result;
+  }
+  case ISD::VACOPY: {
+    // Copy both words of the va_list: the i64 base and the i32 offset at +8.
+    SDOperand Chain = Op.getOperand(0);
+    SDOperand DestP = Op.getOperand(1);
+    SDOperand SrcP = Op.getOperand(2);
+    SrcValueSDNode *DestS = cast<SrcValueSDNode>(Op.getOperand(3));
+    SrcValueSDNode *SrcS = cast<SrcValueSDNode>(Op.getOperand(4));
+
+    SDOperand Val = DAG.getLoad(getPointerTy(), Chain, SrcP,
+                                SrcS->getValue(), SrcS->getOffset());
+    SDOperand Result = DAG.getStore(Val.getValue(1), Val, DestP, DestS->getValue(),
+                                    DestS->getOffset());
+    SDOperand NP = DAG.getNode(ISD::ADD, MVT::i64, SrcP,
+                               DAG.getConstant(8, MVT::i64));
+    Val = DAG.getExtLoad(ISD::SEXTLOAD, MVT::i64, Result, NP, NULL,0, MVT::i32);
+    SDOperand NPD = DAG.getNode(ISD::ADD, MVT::i64, DestP,
+                                DAG.getConstant(8, MVT::i64));
+    return DAG.getTruncStore(Val.getValue(1), Val, NPD, NULL, 0, MVT::i32);
+  }
+  case ISD::VASTART: {
+    SDOperand Chain = Op.getOperand(0);
+    SDOperand VAListP = Op.getOperand(1);
+    SrcValueSDNode *VAListS = cast<SrcValueSDNode>(Op.getOperand(2));
+
+    // vastart stores the address of the VarArgsBase and VarArgsOffset
+    // (the frame index captured by LowerFORMAL_ARGUMENTS) into the two
+    // words of the va_list.
+    SDOperand FR  = DAG.getFrameIndex(VarArgsBase, MVT::i64);
+    SDOperand S1  = DAG.getStore(Chain, FR, VAListP, VAListS->getValue(),
+                                 VAListS->getOffset());
+    SDOperand SA2 = DAG.getNode(ISD::ADD, MVT::i64, VAListP,
+                                DAG.getConstant(8, MVT::i64));
+    return DAG.getTruncStore(S1, DAG.getConstant(VarArgsOffset, MVT::i64),
+                             SA2, NULL, 0, MVT::i32);
+  }
+  case ISD::RETURNADDR:
+    return DAG.getNode(AlphaISD::GlobalRetAddr, MVT::i64);
+    //FIXME: implement
+  case ISD::FRAMEADDR:          break;
+  }
+
+  // Unhandled (e.g. FRAMEADDR): return a null SDOperand so the legalizer
+  // falls back to its default expansion.
+  return SDOperand();
+}
+
+/// CustomPromoteOperation - Called for i32 VAARG, which was marked Promote
+/// with a custom hook; the i32 path inside LowerOperation's VAARG case
+/// already produces a sign-extended i64, so just forward to it.
+SDOperand AlphaTargetLowering::CustomPromoteOperation(SDOperand Op,
+                                                      SelectionDAG &DAG) {
+  assert(Op.getValueType() == MVT::i32 &&
+         Op.getOpcode() == ISD::VAARG &&
+         "Unknown node to custom promote!");
+
+  // The code in LowerOperation already handles i32 vaarg
+  return LowerOperation(Op, DAG);
+}
+
+
+//Inline Asm
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+AlphaTargetLowering::ConstraintType
+AlphaTargetLowering::getConstraintType(const std::string &Constraint) const {
+  // 'f' (any FP register) and 'r' (any integer register) are register-class
+  // constraints; everything else defers to the target-independent handling.
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'f':
+    case 'r':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegClassForInlineAsmConstraint - Return the full set of registers an
+/// inline-asm operand with the given single-letter constraint may use:
+/// F0-F31 for 'f', R0-R31 for 'r', and an empty vector for anything else.
+std::vector<unsigned> AlphaTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  MVT::ValueType VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;     // Unknown constraint letter
+    case 'f':
+      return make_vector<unsigned>(Alpha::F0 , Alpha::F1 , Alpha::F2 ,
+                                   Alpha::F3 , Alpha::F4 , Alpha::F5 ,
+                                   Alpha::F6 , Alpha::F7 , Alpha::F8 ,
+                                   Alpha::F9 , Alpha::F10, Alpha::F11,
+                                   Alpha::F12, Alpha::F13, Alpha::F14,
+                                   Alpha::F15, Alpha::F16, Alpha::F17,
+                                   Alpha::F18, Alpha::F19, Alpha::F20,
+                                   Alpha::F21, Alpha::F22, Alpha::F23,
+                                   Alpha::F24, Alpha::F25, Alpha::F26,
+                                   Alpha::F27, Alpha::F28, Alpha::F29,
+                                   Alpha::F30, Alpha::F31, 0);
+    case 'r':
+      return make_vector<unsigned>(Alpha::R0 , Alpha::R1 , Alpha::R2 ,
+                                   Alpha::R3 , Alpha::R4 , Alpha::R5 ,
+                                   Alpha::R6 , Alpha::R7 , Alpha::R8 ,
+                                   Alpha::R9 , Alpha::R10, Alpha::R11,
+                                   Alpha::R12, Alpha::R13, Alpha::R14,
+                                   Alpha::R15, Alpha::R16, Alpha::R17,
+                                   Alpha::R18, Alpha::R19, Alpha::R20,
+                                   Alpha::R21, Alpha::R22, Alpha::R23,
+                                   Alpha::R24, Alpha::R25, Alpha::R26,
+                                   Alpha::R27, Alpha::R28, Alpha::R29,
+                                   Alpha::R30, Alpha::R31, 0);
+    }
+  }
+
+  return std::vector<unsigned>();
+}
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
new file mode 100644
index 0000000..24e40a5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -0,0 +1,94 @@
+//===-- AlphaISelLowering.h - Alpha DAG Lowering Interface ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Alpha uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+#define LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "Alpha.h"
+
+namespace llvm {
+
+  namespace AlphaISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+Alpha::INSTRUCTION_LIST_END,
+      //These correspond to the identically-named Alpha convert instructions:
+      //quadword->T-float, quadword->S-float, T-float->quadword.
+      CVTQT_, CVTQS_, CVTTQ_,
+
+      /// GPRelHi/GPRelLo - These represent the high and low 16-bit
+      /// parts of a global address respectively.
+      GPRelHi, GPRelLo,
+
+      /// RelLit - Literal Relocation of a Global
+      RelLit,
+
+      /// GlobalRetAddr - used to restore the return address
+      GlobalRetAddr,
+
+      /// CALL - Normal call.
+      CALL,
+
+      /// DivCall - used for special library calls for div and rem
+      DivCall,
+
+      /// return flag operand
+      RET_FLAG,
+
+      /// CHAIN = COND_BRANCH CHAIN, OPC, (G|F)PRC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.
+      /// *PRC is the input register to compare to zero,
+      /// OPC is the branch opcode to use (e.g. Alpha::BEQ),
+      /// DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH_I, COND_BRANCH_F
+
+    };
+  }
+
+  class AlphaTargetLowering : public TargetLowering {
+    int VarArgsOffset;  // What is the offset to the first vaarg
+    int VarArgsBase;    // What is the base FrameIndex
+    bool useITOF;       // Does the subtarget have the ITOF/FTOI extension?
+  public:
+    AlphaTargetLowering(TargetMachine &TM);
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+    virtual SDOperand CustomPromoteOperation(SDOperand Op, SelectionDAG &DAG);
+
+    //Friendly names for dumps
+    const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// LowerCallTo - This hook lowers an abstract call to a function into an
+    /// actual call.
+    virtual std::pair<SDOperand, SDOperand>
+    LowerCallTo(SDOperand Chain, const Type *RetTy, bool RetTyIsSigned,
+                bool isVarArg, unsigned CC, bool isTailCall, SDOperand Callee,
+                ArgListTy &Args, SelectionDAG &DAG);
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    std::vector<unsigned>
+    getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                      MVT::ValueType VT) const;
+
+    bool hasITOF() { return useITOF; }
+  };
+}
+
+#endif   // LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
diff --git a/lib/Target/Alpha/AlphaInstrFormats.td b/lib/Target/Alpha/AlphaInstrFormats.td
new file mode 100644
index 0000000..259e9af
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrFormats.td
@@ -0,0 +1,249 @@
+//===- AlphaInstrFormats.td - Alpha Instruction Formats ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//3.3: +//Memory +//Branch +//Operate +//Floating-point +//PALcode + +def u8imm : Operand<i64>; +def s14imm : Operand<i64>; +def s16imm : Operand<i64>; +def s21imm : Operand<i64>; +def s64imm : Operand<i64>; +def u64imm : Operand<i64>; + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// +// Alpha instruction baseline +class InstAlpha<bits<6> op, string asmstr, InstrItinClass itin> : Instruction { + field bits<32> Inst; + let Namespace = "Alpha"; + let AsmString = asmstr; + let Inst{31-26} = op; + let Itinerary = itin; +} + + +//3.3.1 +class MForm<bits<6> opcode, bit store, bit load, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + let isStore = store; + let isLoad = load; + let Defs = [R28]; //We may use this for frame index calculations, so reserve it here + + bits<5> Ra; + bits<16> disp; + bits<5> Rb; + + let Inst{25-21} = Ra; + let Inst{20-16} = Rb; + let Inst{15-0} = disp; +} +class MfcForm<bits<6> opcode, bits<16> fc, string asmstr, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + bits<5> Ra; + + let OperandList = (ops GPRC:$RA); + let Inst{25-21} = Ra; + let Inst{20-16} = 0; + let Inst{15-0} = fc; +} + +class MbrForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + bits<5> Ra; + bits<5> Rb; + bits<14> disp; + + let OperandList = OL; + + let Inst{25-21} = Ra; + let Inst{20-16} = Rb; + let Inst{15-14} = TB; + let Inst{13-0} = disp; +} +class MbrpForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let 
Pattern=pattern; + bits<5> Ra; + bits<5> Rb; + bits<14> disp; + + let OperandList = OL; + + let Inst{25-21} = Ra; + let Inst{20-16} = Rb; + let Inst{15-14} = TB; + let Inst{13-0} = disp; +} + +//3.3.2 +def target : Operand<OtherVT> {} + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, noResults = 1 in { +class BFormN<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let OperandList = OL; + bits<64> Opc; //dummy + bits<5> Ra; + bits<21> disp; + + let Inst{25-21} = Ra; + let Inst{20-0} = disp; +} +} + +let isBranch = 1, isTerminator = 1 in +class BFormD<bits<6> opcode, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + let OperandList = (ops target:$DISP); + bits<5> Ra; + bits<21> disp; + + let Inst{25-21} = Ra; + let Inst{20-0} = disp; +} + +//3.3.3 +class OForm<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + let OperandList = (ops GPRC:$RC, GPRC:$RA, GPRC:$RB); + + bits<5> Rc; + bits<5> Ra; + bits<5> Rb; + bits<7> Function = fun; + + let Inst{25-21} = Ra; + let Inst{20-16} = Rb; + let Inst{15-13} = 0; + let Inst{12} = 0; + let Inst{11-5} = Function; + let Inst{4-0} = Rc; +} + +class OForm2<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + let OperandList = (ops GPRC:$RC, GPRC:$RB); + + bits<5> Rc; + bits<5> Rb; + bits<7> Function = fun; + + let Inst{25-21} = 31; + let Inst{20-16} = Rb; + let Inst{15-13} = 0; + let Inst{12} = 0; + let Inst{11-5} = Function; + let Inst{4-0} = Rc; +} + +class OForm4<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + let OperandList = (ops GPRC:$RDEST, GPRC:$RCOND, GPRC:$RTRUE, GPRC:$RFALSE); + let Constraints = "$RFALSE = 
$RDEST"; + let DisableEncoding = "$RFALSE"; + + bits<5> Rc; + bits<5> Ra; + bits<5> Rb; + bits<7> Function = fun; + +// let isTwoAddress = 1; + let Inst{25-21} = Ra; + let Inst{20-16} = Rb; + let Inst{15-13} = 0; + let Inst{12} = 0; + let Inst{11-5} = Function; + let Inst{4-0} = Rc; +} + + +class OFormL<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + let OperandList = (ops GPRC:$RC, GPRC:$RA, u8imm:$L); + + bits<5> Rc; + bits<5> Ra; + bits<8> LIT; + bits<7> Function = fun; + + let Inst{25-21} = Ra; + let Inst{20-13} = LIT; + let Inst{12} = 1; + let Inst{11-5} = Function; + let Inst{4-0} = Rc; +} + +class OForm4L<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + let OperandList = (ops GPRC:$RDEST, GPRC:$RCOND, s64imm:$RTRUE, GPRC:$RFALSE); + let Constraints = "$RFALSE = $RDEST"; + let DisableEncoding = "$RFALSE"; + + bits<5> Rc; + bits<5> Ra; + bits<8> LIT; + bits<7> Function = fun; + +// let isTwoAddress = 1; + let Inst{25-21} = Ra; + let Inst{20-13} = LIT; + let Inst{12} = 1; + let Inst{11-5} = Function; + let Inst{4-0} = Rc; +} + +//3.3.4 +class FPForm<bits<6> opcode, bits<11> fun, string asmstr, list<dag> pattern, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let Pattern = pattern; + + bits<5> Fc; + bits<5> Fa; + bits<5> Fb; + bits<11> Function = fun; + + let Inst{25-21} = Fa; + let Inst{20-16} = Fb; + let Inst{15-5} = Function; + let Inst{4-0} = Fc; +} + +//3.3.5 +class PALForm<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin> + : InstAlpha<opcode, asmstr, itin> { + let OperandList = OL; + bits<26> Function; + + let Inst{25-0} = Function; +} + + +// Pseudo instructions. 
+class PseudoInstAlpha<dag OL, string nm, list<dag> pattern, InstrItinClass itin> + : InstAlpha<0, nm, itin> { + let OperandList = OL; + let Pattern = pattern; + +} diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp new file mode 100644 index 0000000..718587d --- /dev/null +++ b/lib/Target/Alpha/AlphaInstrInfo.cpp @@ -0,0 +1,266 @@ +//===- AlphaInstrInfo.cpp - Alpha Instruction Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Alpha implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "Alpha.h" +#include "AlphaInstrInfo.h" +#include "AlphaGenInstrInfo.inc" +#include "llvm/CodeGen/MachineInstrBuilder.h" +using namespace llvm; + +AlphaInstrInfo::AlphaInstrInfo() + : TargetInstrInfo(AlphaInsts, sizeof(AlphaInsts)/sizeof(AlphaInsts[0])), + RI(*this) { } + + +bool AlphaInstrInfo::isMoveInstr(const MachineInstr& MI, + unsigned& sourceReg, + unsigned& destReg) const { + MachineOpCode oc = MI.getOpcode(); + if (oc == Alpha::BISr || + oc == Alpha::CPYSS || + oc == Alpha::CPYST || + oc == Alpha::CPYSSt || + oc == Alpha::CPYSTs) { + // or r1, r2, r2 + // cpys(s|t) r1 r2 r2 + assert(MI.getNumOperands() >= 3 && + MI.getOperand(0).isRegister() && + MI.getOperand(1).isRegister() && + MI.getOperand(2).isRegister() && + "invalid Alpha BIS instruction!"); + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + sourceReg = MI.getOperand(1).getReg(); + destReg = MI.getOperand(0).getReg(); + return true; + } + } + return false; +} + +unsigned +AlphaInstrInfo::isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const { + switch (MI->getOpcode()) { 
+ case Alpha::LDL: + case Alpha::LDQ: + case Alpha::LDBU: + case Alpha::LDWU: + case Alpha::LDS: + case Alpha::LDT: + if (MI->getOperand(1).isFrameIndex()) { + FrameIndex = MI->getOperand(1).getFrameIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +unsigned +AlphaInstrInfo::isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const { + switch (MI->getOpcode()) { + case Alpha::STL: + case Alpha::STQ: + case Alpha::STB: + case Alpha::STW: + case Alpha::STS: + case Alpha::STT: + if (MI->getOperand(1).isFrameIndex()) { + FrameIndex = MI->getOperand(1).getFrameIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +static bool isAlphaIntCondCode(unsigned Opcode) { + switch (Opcode) { + case Alpha::BEQ: + case Alpha::BNE: + case Alpha::BGE: + case Alpha::BGT: + case Alpha::BLE: + case Alpha::BLT: + case Alpha::BLBC: + case Alpha::BLBS: + return true; + default: + return false; + } +} + +unsigned AlphaInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond)const{ + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "Alpha branch conditions have two components!"); + + // One-way branch. + if (FBB == 0) { + if (Cond.empty()) // Unconditional branch + BuildMI(&MBB, get(Alpha::BR)).addMBB(TBB); + else // Conditional branch + if (isAlphaIntCondCode(Cond[0].getImm())) + BuildMI(&MBB, get(Alpha::COND_BRANCH_I)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + else + BuildMI(&MBB, get(Alpha::COND_BRANCH_F)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + return 1; + } + + // Two-way Conditional Branch. 
+ if (isAlphaIntCondCode(Cond[0].getImm())) + BuildMI(&MBB, get(Alpha::COND_BRANCH_I)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + else + BuildMI(&MBB, get(Alpha::COND_BRANCH_F)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + BuildMI(&MBB, get(Alpha::BR)).addMBB(FBB); + return 2; +} + +static unsigned AlphaRevCondCode(unsigned Opcode) { + switch (Opcode) { + case Alpha::BEQ: return Alpha::BNE; + case Alpha::BNE: return Alpha::BEQ; + case Alpha::BGE: return Alpha::BLT; + case Alpha::BGT: return Alpha::BLE; + case Alpha::BLE: return Alpha::BGT; + case Alpha::BLT: return Alpha::BGE; + case Alpha::BLBC: return Alpha::BLBS; + case Alpha::BLBS: return Alpha::BLBC; + case Alpha::FBEQ: return Alpha::FBNE; + case Alpha::FBNE: return Alpha::FBEQ; + case Alpha::FBGE: return Alpha::FBLT; + case Alpha::FBGT: return Alpha::FBLE; + case Alpha::FBLE: return Alpha::FBGT; + case Alpha::FBLT: return Alpha::FBGE; + default: + assert(0 && "Unknown opcode"); + } +} + +// Branch analysis. +bool AlphaInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + std::vector<MachineOperand> &Cond) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (LastInst->getOpcode() == Alpha::BR) { + TBB = LastInst->getOperand(0).getMachineBasicBlock(); + return false; + } else if (LastInst->getOpcode() == Alpha::COND_BRANCH_I || + LastInst->getOpcode() == Alpha::COND_BRANCH_F) { + // Block ends with fall-through condbranch. 
+ TBB = LastInst->getOperand(2).getMachineBasicBlock(); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); + return false; + } + // Otherwise, don't know what this is. + return true; + } + + // Get the instruction before it if it's a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && + isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with Alpha::BR and Alpha::COND_BRANCH_*, handle it. + if ((SecondLastInst->getOpcode() == Alpha::COND_BRANCH_I || + SecondLastInst->getOpcode() == Alpha::COND_BRANCH_F) && + LastInst->getOpcode() == Alpha::BR) { + TBB = SecondLastInst->getOperand(2).getMachineBasicBlock(); + Cond.push_back(SecondLastInst->getOperand(0)); + Cond.push_back(SecondLastInst->getOperand(1)); + FBB = LastInst->getOperand(0).getMachineBasicBlock(); + return false; + } + + // If the block ends with two Alpha::BRs, handle it. The second one is not + // executed, so remove it. + if (SecondLastInst->getOpcode() == Alpha::BR && + LastInst->getOpcode() == Alpha::BR) { + TBB = SecondLastInst->getOperand(0).getMachineBasicBlock(); + I = LastInst; + I->eraseFromParent(); + return false; + } + + // Otherwise, can't handle this. + return true; +} + +unsigned AlphaInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + if (I->getOpcode() != Alpha::BR && + I->getOpcode() != Alpha::COND_BRANCH_I && + I->getOpcode() != Alpha::COND_BRANCH_F) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (I->getOpcode() != Alpha::COND_BRANCH_I && + I->getOpcode() != Alpha::COND_BRANCH_F) + return 1; + + // Remove the branch. 
+ I->eraseFromParent(); + return 2; +} + +void AlphaInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + BuildMI(MBB, MI, get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31) + .addReg(Alpha::R31); +} + +bool AlphaInstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const { + if (MBB.empty()) return false; + + switch (MBB.back().getOpcode()) { + case Alpha::RETDAG: // Return. + case Alpha::RETDAGp: + case Alpha::BR: // Uncond branch. + case Alpha::JMP: // Indirect branch. + return true; + default: return false; + } +} +bool AlphaInstrInfo:: +ReverseBranchCondition(std::vector<MachineOperand> &Cond) const { + assert(Cond.size() == 2 && "Invalid Alpha branch opcode!"); + Cond[0].setImm(AlphaRevCondCode(Cond[0].getImm())); + return false; +} + diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h new file mode 100644 index 0000000..84009be --- /dev/null +++ b/lib/Target/Alpha/AlphaInstrInfo.h @@ -0,0 +1,57 @@ +//===- AlphaInstrInfo.h - Alpha Instruction Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Alpha implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ALPHAINSTRUCTIONINFO_H +#define ALPHAINSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "AlphaRegisterInfo.h" + +namespace llvm { + +class AlphaInstrInfo : public TargetInstrInfo { + const AlphaRegisterInfo RI; +public: + AlphaInstrInfo(); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const MRegisterInfo &getRegisterInfo() const { return RI; } + + /// Return true if the instruction is a register to register move and + /// leave the source and dest operands in the passed parameters. + /// + virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg) const; + + virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const; + virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const; + + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond) const; + bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + std::vector<MachineOperand> &Cond) const; + unsigned RemoveBranch(MachineBasicBlock &MBB) const; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const; + bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const; +}; + +} + +#endif diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td new file mode 100644 index 0000000..4a834da --- /dev/null +++ b/lib/Target/Alpha/AlphaInstrInfo.td @@ -0,0 +1,1088 @@ +//===- AlphaInstrInfo.td - The Alpha Instruction Set -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "AlphaInstrFormats.td" + +//******************** +//Custom DAG Nodes +//******************** + +def SDTFPUnaryOpUnC : SDTypeProfile<1, 1, [ + SDTCisFP<1>, SDTCisFP<0> +]>; +def Alpha_cvtqt : SDNode<"AlphaISD::CVTQT_", SDTFPUnaryOpUnC, []>; +def Alpha_cvtqs : SDNode<"AlphaISD::CVTQS_", SDTFPUnaryOpUnC, []>; +def Alpha_cvttq : SDNode<"AlphaISD::CVTTQ_" , SDTFPUnaryOp, []>; +def Alpha_gprello : SDNode<"AlphaISD::GPRelLo", SDTIntBinOp, []>; +def Alpha_gprelhi : SDNode<"AlphaISD::GPRelHi", SDTIntBinOp, []>; +def Alpha_rellit : SDNode<"AlphaISD::RelLit", SDTIntBinOp, []>; + +def retflag : SDNode<"AlphaISD::RET_FLAG", SDTRet, + [SDNPHasChain, SDNPOptInFlag]>; + +// These are target-independent nodes, but have target-specific formats. +def SDT_AlphaCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i64> ]>; +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AlphaCallSeq, + [SDNPHasChain, SDNPOutFlag]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AlphaCallSeq, + [SDNPHasChain, SDNPOutFlag]>; + +//******************** +//Paterns for matching +//******************** +def invX : SDNodeXForm<imm, [{ //invert + return getI64Imm(~N->getValue()); +}]>; +def negX : SDNodeXForm<imm, [{ //negate + return getI64Imm(~N->getValue() + 1); +}]>; +def SExt32 : SDNodeXForm<imm, [{ //signed extend int to long + return getI64Imm(((int64_t)N->getValue() << 32) >> 32); +}]>; +def SExt16 : SDNodeXForm<imm, [{ //signed extend int to long + return getI64Imm(((int64_t)N->getValue() << 48) >> 48); +}]>; +def LL16 : SDNodeXForm<imm, [{ //lda part of constant + return getI64Imm(get_lda16(N->getValue())); +}]>; +def LH16 : SDNodeXForm<imm, [{ //ldah part of constant (or more if too big) + return getI64Imm(get_ldah16(N->getValue())); +}]>; +def iZAPX : SDNodeXForm<and, [{ // get imm to ZAPi + ConstantSDNode *RHS = 
cast<ConstantSDNode>(N->getOperand(1)); + return getI64Imm(get_zapImm(SDOperand(), RHS->getValue())); +}]>; +def nearP2X : SDNodeXForm<imm, [{ + return getI64Imm(Log2_64(getNearPower2((uint64_t)N->getValue()))); +}]>; +def nearP2RemX : SDNodeXForm<imm, [{ + uint64_t x = abs(N->getValue() - getNearPower2((uint64_t)N->getValue())); + return getI64Imm(Log2_64(x)); +}]>; + +def immUExt8 : PatLeaf<(imm), [{ //imm fits in 8 bit zero extended field + return (uint64_t)N->getValue() == (uint8_t)N->getValue(); +}]>; +def immUExt8inv : PatLeaf<(imm), [{ //inverted imm fits in 8 bit zero extended field + return (uint64_t)~N->getValue() == (uint8_t)~N->getValue(); +}], invX>; +def immUExt8neg : PatLeaf<(imm), [{ //negated imm fits in 8 bit zero extended field + return ((uint64_t)~N->getValue() + 1) == (uint8_t)((uint64_t)~N->getValue() + 1); +}], negX>; +def immSExt16 : PatLeaf<(imm), [{ //imm fits in 16 bit sign extended field + return ((int64_t)N->getValue() << 48) >> 48 == (int64_t)N->getValue(); +}]>; +def immSExt16int : PatLeaf<(imm), [{ //(int)imm fits in a 16 bit sign extended field + return ((int64_t)N->getValue() << 48) >> 48 == ((int64_t)N->getValue() << 32) >> 32; +}], SExt16>; + +def zappat : PatFrag<(ops node:$LHS), (and node:$LHS, imm:$L), [{ + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1))) { + uint64_t build = get_zapImm(N->getOperand(0), (uint64_t)RHS->getValue()); + return build != 0; + } + return false; +}]>; + +def immFPZ : PatLeaf<(fpimm), [{ //the only fpconstant nodes are +/- 0.0 + (void)N; // silence warning. 
+ return true; +}]>; + +def immRem1 : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),1, 0);}]>; +def immRem2 : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),2, 0);}]>; +def immRem3 : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),3, 0);}]>; +def immRem4 : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),4, 0);}]>; +def immRem5 : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),5, 0);}]>; +def immRem1n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),1, 1);}]>; +def immRem2n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),2, 1);}]>; +def immRem3n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),3, 1);}]>; +def immRem4n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),4, 1);}]>; +def immRem5n : PatLeaf<(imm), [{return chkRemNearPower2(N->getValue(),5, 1);}]>; + +def immRemP2n : PatLeaf<(imm), [{ + return isPowerOf2_64(getNearPower2((uint64_t)N->getValue()) - N->getValue()); +}]>; +def immRemP2 : PatLeaf<(imm), [{ + return isPowerOf2_64(N->getValue() - getNearPower2((uint64_t)N->getValue())); +}]>; +def immUExt8ME : PatLeaf<(imm), [{ //use this imm for mulqi + int64_t d = abs((int64_t)N->getValue() - (int64_t)getNearPower2((uint64_t)N->getValue())); + if (isPowerOf2_64(d)) return false; + switch (d) { + case 1: case 3: case 5: return false; + default: return (uint64_t)N->getValue() == (uint8_t)N->getValue(); + }; +}]>; + +def intop : PatFrag<(ops node:$op), (sext_inreg node:$op, i32)>; +def add4 : PatFrag<(ops node:$op1, node:$op2), + (add (shl node:$op1, 2), node:$op2)>; +def sub4 : PatFrag<(ops node:$op1, node:$op2), + (sub (shl node:$op1, 2), node:$op2)>; +def add8 : PatFrag<(ops node:$op1, node:$op2), + (add (shl node:$op1, 3), node:$op2)>; +def sub8 : PatFrag<(ops node:$op1, node:$op2), + (sub (shl node:$op1, 3), node:$op2)>; +class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>; +class CmpOpFrag<dag res> : PatFrag<(ops node:$R), res>; + +//Pseudo ops for selection + +def IDEF_I 
: PseudoInstAlpha<(ops GPRC:$RA), ";#idef $RA", + [(set GPRC:$RA, (undef))], s_pseudo>; +def IDEF_F32 : PseudoInstAlpha<(ops F4RC:$RA), ";#idef $RA", + [(set F4RC:$RA, (undef))], s_pseudo>; +def IDEF_F64 : PseudoInstAlpha<(ops F8RC:$RA), ";#idef $RA", + [(set F8RC:$RA, (undef))], s_pseudo>; + +def WTF : PseudoInstAlpha<(ops variable_ops), "#wtf", [], s_pseudo>; + +let isLoad = 1, hasCtrlDep = 1 in { +def ADJUSTSTACKUP : PseudoInstAlpha<(ops s64imm:$amt), "; ADJUP $amt", + [(callseq_start imm:$amt)], s_pseudo>, Imp<[R30],[R30]>; +def ADJUSTSTACKDOWN : PseudoInstAlpha<(ops s64imm:$amt), "; ADJDOWN $amt", + [(callseq_end imm:$amt)], s_pseudo>, Imp<[R30],[R30]>; +} +def ALTENT : PseudoInstAlpha<(ops s64imm:$TARGET), "$$$TARGET..ng:\n", [], s_pseudo>; +def PCLABEL : PseudoInstAlpha<(ops s64imm:$num), "PCMARKER_$num:\n",[], s_pseudo>; +def MEMLABEL : PseudoInstAlpha<(ops s64imm:$i, s64imm:$j, s64imm:$k, s64imm:$m), + "LSMARKER$$$i$$$j$$$k$$$m:", [], s_pseudo>; + + +//*********************** +//Real instructions +//*********************** + +//Operation Form: + +//conditional moves, int + +multiclass cmov_inst<bits<7> fun, string asmstr, PatFrag OpNode> { +def r : OForm4<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"), + [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), GPRC:$RTRUE, GPRC:$RFALSE))], s_cmov>; +def i : OForm4L<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"), + [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), immUExt8:$RTRUE, GPRC:$RFALSE))], s_cmov>; +} + +defm CMOVEQ : cmov_inst<0x24, "cmoveq", CmpOpFrag<(seteq node:$R, 0)>>; +defm CMOVNE : cmov_inst<0x26, "cmovne", CmpOpFrag<(setne node:$R, 0)>>; +defm CMOVLT : cmov_inst<0x44, "cmovlt", CmpOpFrag<(setlt node:$R, 0)>>; +defm CMOVLE : cmov_inst<0x64, "cmovle", CmpOpFrag<(setle node:$R, 0)>>; +defm CMOVGT : cmov_inst<0x66, "cmovgt", CmpOpFrag<(setgt node:$R, 0)>>; +defm CMOVGE : cmov_inst<0x46, "cmovge", CmpOpFrag<(setge node:$R, 0)>>; +defm CMOVLBC : cmov_inst<0x16, "cmovlbc", CmpOpFrag<(xor 
node:$R, 1)>>; +defm CMOVLBS : cmov_inst<0x14, "cmovlbs", CmpOpFrag<(and node:$R, 1)>>; + +//General pattern for cmov +def : Pat<(select GPRC:$which, GPRC:$src1, GPRC:$src2), + (CMOVNEr GPRC:$src2, GPRC:$src1, GPRC:$which)>; +def : Pat<(select GPRC:$which, GPRC:$src1, immUExt8:$src2), + (CMOVEQi GPRC:$src1, immUExt8:$src2, GPRC:$which)>; + +//Invert sense when we can for constants: +def : Pat<(select (setne GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE), + (CMOVEQi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>; +def : Pat<(select (setgt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE), + (CMOVLEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>; +def : Pat<(select (setge GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE), + (CMOVLTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>; +def : Pat<(select (setlt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE), + (CMOVGEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>; +def : Pat<(select (setle GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE), + (CMOVGTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>; + +multiclass all_inst<bits<6> opc, bits<7> funl, bits<7> funq, + string asmstr, PatFrag OpNode, InstrItinClass itin> { + def Lr : OForm< opc, funl, !strconcat(asmstr, "l $RA,$RB,$RC"), + [(set GPRC:$RC, (intop (OpNode GPRC:$RA, GPRC:$RB)))], itin>; + def Li : OFormL<opc, funl, !strconcat(asmstr, "l $RA,$L,$RC"), + [(set GPRC:$RC, (intop (OpNode GPRC:$RA, immUExt8:$L)))], itin>; + def Qr : OForm< opc, funq, !strconcat(asmstr, "q $RA,$RB,$RC"), + [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>; + def Qi : OFormL<opc, funq, !strconcat(asmstr, "q $RA,$L,$RC"), + [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>; +} + +defm MUL : all_inst<0x13, 0x00, 0x20, "mul", BinOpFrag<(mul node:$LHS, node:$RHS)>, s_imul>; +defm ADD : all_inst<0x10, 0x00, 0x20, "add", BinOpFrag<(add node:$LHS, node:$RHS)>, s_iadd>; +defm S4ADD : all_inst<0x10, 0x02, 0x22, "s4add", add4, s_iadd>; +defm S8ADD : all_inst<0x10, 0x12, 0x32, "s8add", add8, s_iadd>; 
+defm S4SUB : all_inst<0x10, 0x0B, 0x2B, "s4sub", sub4, s_iadd>; +defm S8SUB : all_inst<0x10, 0x1B, 0x3B, "s8sub", sub8, s_iadd>; +defm SUB : all_inst<0x10, 0x09, 0x29, "sub", BinOpFrag<(sub node:$LHS, node:$RHS)>, s_iadd>; +//Const cases since legalize does sub x, int -> add x, inv(int) + 1 +def : Pat<(intop (add GPRC:$RA, immUExt8neg:$L)), (SUBLi GPRC:$RA, immUExt8neg:$L)>; +def : Pat<(add GPRC:$RA, immUExt8neg:$L), (SUBQi GPRC:$RA, immUExt8neg:$L)>; +def : Pat<(intop (add4 GPRC:$RA, immUExt8neg:$L)), (S4SUBLi GPRC:$RA, immUExt8neg:$L)>; +def : Pat<(add4 GPRC:$RA, immUExt8neg:$L), (S4SUBQi GPRC:$RA, immUExt8neg:$L)>; +def : Pat<(intop (add8 GPRC:$RA, immUExt8neg:$L)), (S8SUBLi GPRC:$RA, immUExt8neg:$L)>; +def : Pat<(add8 GPRC:$RA, immUExt8neg:$L), (S8SUBQi GPRC:$RA, immUExt8neg:$L)>; + +multiclass log_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> { +def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"), + [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>; +def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"), + [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>; +} +multiclass inv_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> { +def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"), + [(set GPRC:$RC, (OpNode GPRC:$RA, (not GPRC:$RB)))], itin>; +def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"), + [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8inv:$L))], itin>; +} + +defm AND : log_inst<0x11, 0x00, "and", and, s_ilog>; +defm BIC : inv_inst<0x11, 0x08, "bic", and, s_ilog>; +defm BIS : log_inst<0x11, 0x20, "bis", or, s_ilog>; +defm ORNOT : inv_inst<0x11, 0x28, "ornot", or, s_ilog>; +defm XOR : log_inst<0x11, 0x40, "xor", xor, s_ilog>; +defm EQV : inv_inst<0x11, 0x48, "eqv", xor, s_ilog>; + +defm SL : log_inst<0x12, 0x39, "sll", shl, s_ishf>; +defm SRA : log_inst<0x12, 0x3c, "sra", sra, s_ishf>; +defm SRL : log_inst<0x12, 0x34, "srl", srl, s_ishf>; +defm UMULH : 
log_inst<0x13, 0x30, "umulh", mulhu, s_imul>; + +def CTLZ : OForm2<0x1C, 0x32, "CTLZ $RB,$RC", + [(set GPRC:$RC, (ctlz GPRC:$RB))], s_imisc>; +def CTPOP : OForm2<0x1C, 0x30, "CTPOP $RB,$RC", + [(set GPRC:$RC, (ctpop GPRC:$RB))], s_imisc>; +def CTTZ : OForm2<0x1C, 0x33, "CTTZ $RB,$RC", + [(set GPRC:$RC, (cttz GPRC:$RB))], s_imisc>; +def EXTBL : OForm< 0x12, 0x06, "EXTBL $RA,$RB,$RC", + [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 255))], s_ishf>; +def EXTWL : OForm< 0x12, 0x16, "EXTWL $RA,$RB,$RC", + [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 65535))], s_ishf>; +def EXTLL : OForm< 0x12, 0x26, "EXTLL $RA,$RB,$RC", + [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 4294967295))], s_ishf>; +def SEXTB : OForm2<0x1C, 0x00, "sextb $RB,$RC", + [(set GPRC:$RC, (sext_inreg GPRC:$RB, i8))], s_ishf>; +def SEXTW : OForm2<0x1C, 0x01, "sextw $RB,$RC", + [(set GPRC:$RC, (sext_inreg GPRC:$RB, i16))], s_ishf>; + +//def EXTBLi : OFormL<0x12, 0x06, "EXTBL $RA,$L,$RC", []>; //Extract byte low +//def EXTLH : OForm< 0x12, 0x6A, "EXTLH $RA,$RB,$RC", []>; //Extract longword high +//def EXTLHi : OFormL<0x12, 0x6A, "EXTLH $RA,$L,$RC", []>; //Extract longword high +//def EXTLLi : OFormL<0x12, 0x26, "EXTLL $RA,$L,$RC", []>; //Extract longword low +//def EXTQH : OForm< 0x12, 0x7A, "EXTQH $RA,$RB,$RC", []>; //Extract quadword high +//def EXTQHi : OFormL<0x12, 0x7A, "EXTQH $RA,$L,$RC", []>; //Extract quadword high +//def EXTQ : OForm< 0x12, 0x36, "EXTQ $RA,$RB,$RC", []>; //Extract quadword low +//def EXTQi : OFormL<0x12, 0x36, "EXTQ $RA,$L,$RC", []>; //Extract quadword low +//def EXTWH : OForm< 0x12, 0x5A, "EXTWH $RA,$RB,$RC", []>; //Extract word high +//def EXTWHi : OFormL<0x12, 0x5A, "EXTWH $RA,$L,$RC", []>; //Extract word high +//def EXTWLi : OFormL<0x12, 0x16, "EXTWL $RA,$L,$RC", []>; //Extract word low + +//def INSBL : OForm< 0x12, 0x0B, "INSBL $RA,$RB,$RC", []>; //Insert byte low +//def INSBLi : OFormL<0x12, 0x0B, "INSBL $RA,$L,$RC", []>; //Insert byte low 
+//def INSLH : OForm< 0x12, 0x67, "INSLH $RA,$RB,$RC", []>; //Insert longword high +//def INSLHi : OFormL<0x12, 0x67, "INSLH $RA,$L,$RC", []>; //Insert longword high +//def INSLL : OForm< 0x12, 0x2B, "INSLL $RA,$RB,$RC", []>; //Insert longword low +//def INSLLi : OFormL<0x12, 0x2B, "INSLL $RA,$L,$RC", []>; //Insert longword low +//def INSQH : OForm< 0x12, 0x77, "INSQH $RA,$RB,$RC", []>; //Insert quadword high +//def INSQHi : OFormL<0x12, 0x77, "INSQH $RA,$L,$RC", []>; //Insert quadword high +//def INSQL : OForm< 0x12, 0x3B, "INSQL $RA,$RB,$RC", []>; //Insert quadword low +//def INSQLi : OFormL<0x12, 0x3B, "INSQL $RA,$L,$RC", []>; //Insert quadword low +//def INSWH : OForm< 0x12, 0x57, "INSWH $RA,$RB,$RC", []>; //Insert word high +//def INSWHi : OFormL<0x12, 0x57, "INSWH $RA,$L,$RC", []>; //Insert word high +//def INSWL : OForm< 0x12, 0x1B, "INSWL $RA,$RB,$RC", []>; //Insert word low +//def INSWLi : OFormL<0x12, 0x1B, "INSWL $RA,$L,$RC", []>; //Insert word low + +//def MSKBL : OForm< 0x12, 0x02, "MSKBL $RA,$RB,$RC", []>; //Mask byte low +//def MSKBLi : OFormL<0x12, 0x02, "MSKBL $RA,$L,$RC", []>; //Mask byte low +//def MSKLH : OForm< 0x12, 0x62, "MSKLH $RA,$RB,$RC", []>; //Mask longword high +//def MSKLHi : OFormL<0x12, 0x62, "MSKLH $RA,$L,$RC", []>; //Mask longword high +//def MSKLL : OForm< 0x12, 0x22, "MSKLL $RA,$RB,$RC", []>; //Mask longword low +//def MSKLLi : OFormL<0x12, 0x22, "MSKLL $RA,$L,$RC", []>; //Mask longword low +//def MSKQH : OForm< 0x12, 0x72, "MSKQH $RA,$RB,$RC", []>; //Mask quadword high +//def MSKQHi : OFormL<0x12, 0x72, "MSKQH $RA,$L,$RC", []>; //Mask quadword high +//def MSKQL : OForm< 0x12, 0x32, "MSKQL $RA,$RB,$RC", []>; //Mask quadword low +//def MSKQLi : OFormL<0x12, 0x32, "MSKQL $RA,$L,$RC", []>; //Mask quadword low +//def MSKWH : OForm< 0x12, 0x52, "MSKWH $RA,$RB,$RC", []>; //Mask word high +//def MSKWHi : OFormL<0x12, 0x52, "MSKWH $RA,$L,$RC", []>; //Mask word high +//def MSKWL : OForm< 0x12, 0x12, "MSKWL $RA,$RB,$RC", []>; //Mask word 
low +//def MSKWLi : OFormL<0x12, 0x12, "MSKWL $RA,$L,$RC", []>; //Mask word low + +def ZAPNOTi : OFormL<0x12, 0x31, "zapnot $RA,$L,$RC", [], s_ishf>; + +// Define the pattern that produces ZAPNOTi. +def : Pat<(i64 (zappat GPRC:$RA):$imm), + (ZAPNOTi GPRC:$RA, (iZAPX GPRC:$imm))>; + + +//Comparison, int +//So this is a waste of what this instruction can do, but it still saves something +def CMPBGE : OForm< 0x10, 0x0F, "cmpbge $RA,$RB,$RC", + [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), (and GPRC:$RB, 255)))], s_ilog>; +def CMPBGEi : OFormL<0x10, 0x0F, "cmpbge $RA,$L,$RC", + [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), immUExt8:$L))], s_ilog>; +def CMPEQ : OForm< 0x10, 0x2D, "cmpeq $RA,$RB,$RC", + [(set GPRC:$RC, (seteq GPRC:$RA, GPRC:$RB))], s_iadd>; +def CMPEQi : OFormL<0x10, 0x2D, "cmpeq $RA,$L,$RC", + [(set GPRC:$RC, (seteq GPRC:$RA, immUExt8:$L))], s_iadd>; +def CMPLE : OForm< 0x10, 0x6D, "cmple $RA,$RB,$RC", + [(set GPRC:$RC, (setle GPRC:$RA, GPRC:$RB))], s_iadd>; +def CMPLEi : OFormL<0x10, 0x6D, "cmple $RA,$L,$RC", + [(set GPRC:$RC, (setle GPRC:$RA, immUExt8:$L))], s_iadd>; +def CMPLT : OForm< 0x10, 0x4D, "cmplt $RA,$RB,$RC", + [(set GPRC:$RC, (setlt GPRC:$RA, GPRC:$RB))], s_iadd>; +def CMPLTi : OFormL<0x10, 0x4D, "cmplt $RA,$L,$RC", + [(set GPRC:$RC, (setlt GPRC:$RA, immUExt8:$L))], s_iadd>; +def CMPULE : OForm< 0x10, 0x3D, "cmpule $RA,$RB,$RC", + [(set GPRC:$RC, (setule GPRC:$RA, GPRC:$RB))], s_iadd>; +def CMPULEi : OFormL<0x10, 0x3D, "cmpule $RA,$L,$RC", + [(set GPRC:$RC, (setule GPRC:$RA, immUExt8:$L))], s_iadd>; +def CMPULT : OForm< 0x10, 0x1D, "cmpult $RA,$RB,$RC", + [(set GPRC:$RC, (setult GPRC:$RA, GPRC:$RB))], s_iadd>; +def CMPULTi : OFormL<0x10, 0x1D, "cmpult $RA,$L,$RC", + [(set GPRC:$RC, (setult GPRC:$RA, immUExt8:$L))], s_iadd>; + +//Patterns for unsupported int comparisons +def : Pat<(setueq GPRC:$X, GPRC:$Y), (CMPEQ GPRC:$X, GPRC:$Y)>; +def : Pat<(setueq GPRC:$X, immUExt8:$Y), (CMPEQi GPRC:$X, immUExt8:$Y)>; + +def : Pat<(setugt GPRC:$X, 
GPRC:$Y), (CMPULT GPRC:$Y, GPRC:$X)>;
def : Pat<(setugt immUExt8:$X, GPRC:$Y), (CMPULTi GPRC:$Y, immUExt8:$X)>;

def : Pat<(setuge GPRC:$X, GPRC:$Y), (CMPULE GPRC:$Y, GPRC:$X)>;
def : Pat<(setuge immUExt8:$X, GPRC:$Y), (CMPULEi GPRC:$Y, immUExt8:$X)>;

def : Pat<(setgt GPRC:$X, GPRC:$Y), (CMPLT GPRC:$Y, GPRC:$X)>;
def : Pat<(setgt immUExt8:$X, GPRC:$Y), (CMPLTi GPRC:$Y, immUExt8:$X)>;

def : Pat<(setge GPRC:$X, GPRC:$Y), (CMPLE GPRC:$Y, GPRC:$X)>;
def : Pat<(setge immUExt8:$X, GPRC:$Y), (CMPLEi GPRC:$Y, immUExt8:$X)>;

//setne has no native compare: compute eq, then compare that result against 0.
def : Pat<(setne GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
def : Pat<(setne GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;

def : Pat<(setune GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
//FIX: inner compare must be the immediate form (CMPEQi) when the RHS is an
//immUExt8 literal — the register form CMPEQ cannot take an immediate operand.
//This now matches the setne immediate pattern above.
def : Pat<(setune GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;


//Return from subroutine: ret $31,($26),1 — link register R26 holds the return address.
let isReturn = 1, isTerminator = 1, noResults = 1, Ra = 31, Rb = 26, disp = 1, Uses = [R26] in {
  def RETDAG  : MbrForm<  0x1A, 0x02, (ops), "ret $$31,($$26),1", s_jsr>; //Return from subroutine
  def RETDAGp : MbrpForm< 0x1A, 0x02, (ops), "ret $$31,($$26),1", [(retflag)], s_jsr>; //Return from subroutine
}

//Indirect jump (computed goto / jump tables).
let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1,
    Ra = 31, disp = 0 in
def JMP : MbrpForm< 0x1A, 0x00, (ops GPRC:$RS), "jmp $$31,($RS),0",
                    [(brind GPRC:$RS)], s_jsr>; //Jump

//Direct call: clobbers all caller-saved int and FP registers; reads GP (R29).
let isCall = 1, noResults = 1, Ra = 26,
    Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
            R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
            F0, F1,
            F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
            F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R29] in {
  def BSR : BFormD<0x34, "bsr $$26,$$$DISP..ng", [], s_jsr>; //Branch to subroutine
}
//Indirect call through the procedure value in R27.
let isCall = 1, noResults = 1, Ra = 26, Rb = 27, disp = 0,
    Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
            R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
            F0, F1,
            F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R27, R29] in { + def JSR : MbrForm< 0x1A, 0x01, (ops ), "jsr $$26,($$27),0", s_jsr>; //Jump to subroutine +} + +let isCall = 1, noResults = 1, Ra = 23, Rb = 27, disp = 0, + Defs = [R23, R24, R25, R27, R28], Uses = [R24, R25, R27] in + def JSRs : MbrForm< 0x1A, 0x01, (ops ), "jsr $$23,($$27),0", s_jsr>; //Jump to div or rem + + +def JSR_COROUTINE : MbrForm< 0x1A, 0x03, (ops GPRC:$RD, GPRC:$RS, s14imm:$DISP), "jsr_coroutine $RD,($RS),$DISP", s_jsr>; //Jump to subroutine return + + +let OperandList = (ops GPRC:$RA, s64imm:$DISP, GPRC:$RB) in { +def LDQ : MForm<0x29, 0, 1, "ldq $RA,$DISP($RB)", + [(set GPRC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_ild>; +def LDQr : MForm<0x29, 0, 1, "ldq $RA,$DISP($RB)\t\t!gprellow", + [(set GPRC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>; +def LDL : MForm<0x28, 0, 1, "ldl $RA,$DISP($RB)", + [(set GPRC:$RA, (sextloadi32 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>; +def LDLr : MForm<0x28, 0, 1, "ldl $RA,$DISP($RB)\t\t!gprellow", + [(set GPRC:$RA, (sextloadi32 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>; +def LDBU : MForm<0x0A, 0, 1, "ldbu $RA,$DISP($RB)", + [(set GPRC:$RA, (zextloadi8 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>; +def LDBUr : MForm<0x0A, 0, 1, "ldbu $RA,$DISP($RB)\t\t!gprellow", + [(set GPRC:$RA, (zextloadi8 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>; +def LDWU : MForm<0x0C, 0, 1, "ldwu $RA,$DISP($RB)", + [(set GPRC:$RA, (zextloadi16 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>; +def LDWUr : MForm<0x0C, 0, 1, "ldwu $RA,$DISP($RB)\t\t!gprellow", + [(set GPRC:$RA, (zextloadi16 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>; + + +def STB : MForm<0x0E, 1, 0, "stb $RA,$DISP($RB)", + [(truncstorei8 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>; +def STBr : MForm<0x0E, 1, 0, "stb $RA,$DISP($RB)\t\t!gprellow", + [(truncstorei8 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>; +def STW : 
MForm<0x0D, 1, 0, "stw $RA,$DISP($RB)", + [(truncstorei16 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>; +def STWr : MForm<0x0D, 1, 0, "stw $RA,$DISP($RB)\t\t!gprellow", + [(truncstorei16 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>; +def STL : MForm<0x2C, 1, 0, "stl $RA,$DISP($RB)", + [(truncstorei32 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>; +def STLr : MForm<0x2C, 1, 0, "stl $RA,$DISP($RB)\t\t!gprellow", + [(truncstorei32 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>; +def STQ : MForm<0x2D, 1, 0, "stq $RA,$DISP($RB)", + [(store GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>; +def STQr : MForm<0x2D, 1, 0, "stq $RA,$DISP($RB)\t\t!gprellow", + [(store GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>; + +//Load address +def LDA : MForm<0x08, 0, 0, "lda $RA,$DISP($RB)", + [(set GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_lda>; +def LDAr : MForm<0x08, 0, 0, "lda $RA,$DISP($RB)\t\t!gprellow", + [(set GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_lda>; //Load address +def LDAH : MForm<0x09, 0, 0, "ldah $RA,$DISP($RB)", + [], s_lda>; //Load address high +def LDAHr : MForm<0x09, 0, 0, "ldah $RA,$DISP($RB)\t\t!gprelhigh", + [(set GPRC:$RA, (Alpha_gprelhi tglobaladdr:$DISP, GPRC:$RB))], s_lda>; //Load address high +} + +let OperandList = (ops F4RC:$RA, s64imm:$DISP, GPRC:$RB) in { +def STS : MForm<0x26, 1, 0, "sts $RA,$DISP($RB)", + [(store F4RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>; +def STSr : MForm<0x26, 1, 0, "sts $RA,$DISP($RB)\t\t!gprellow", + [(store F4RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>; +def LDS : MForm<0x22, 0, 1, "lds $RA,$DISP($RB)", + [(set F4RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>; +def LDSr : MForm<0x22, 0, 1, "lds $RA,$DISP($RB)\t\t!gprellow", + [(set F4RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>; +} +let OperandList = (ops F8RC:$RA, s64imm:$DISP, GPRC:$RB) in { +def STT : MForm<0x27, 1, 0, "stt 
$RA,$DISP($RB)", + [(store F8RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>; +def STTr : MForm<0x27, 1, 0, "stt $RA,$DISP($RB)\t\t!gprellow", + [(store F8RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>; +def LDT : MForm<0x23, 0, 1, "ldt $RA,$DISP($RB)", + [(set F8RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>; +def LDTr : MForm<0x23, 0, 1, "ldt $RA,$DISP($RB)\t\t!gprellow", + [(set F8RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>; +} + + +//constpool rels +def : Pat<(i64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))), + (LDQr tconstpool:$DISP, GPRC:$RB)>; +def : Pat<(i64 (sextloadi32 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))), + (LDLr tconstpool:$DISP, GPRC:$RB)>; +def : Pat<(i64 (zextloadi8 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))), + (LDBUr tconstpool:$DISP, GPRC:$RB)>; +def : Pat<(i64 (zextloadi16 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))), + (LDWUr tconstpool:$DISP, GPRC:$RB)>; +def : Pat<(i64 (Alpha_gprello tconstpool:$DISP, GPRC:$RB)), + (LDAr tconstpool:$DISP, GPRC:$RB)>; +def : Pat<(i64 (Alpha_gprelhi tconstpool:$DISP, GPRC:$RB)), + (LDAHr tconstpool:$DISP, GPRC:$RB)>; +def : Pat<(f32 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))), + (LDSr tconstpool:$DISP, GPRC:$RB)>; +def : Pat<(f64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))), + (LDTr tconstpool:$DISP, GPRC:$RB)>; + +//jumptable rels +def : Pat<(i64 (Alpha_gprelhi tjumptable:$DISP, GPRC:$RB)), + (LDAHr tjumptable:$DISP, GPRC:$RB)>; +def : Pat<(i64 (Alpha_gprello tjumptable:$DISP, GPRC:$RB)), + (LDAr tjumptable:$DISP, GPRC:$RB)>; + + +//misc ext patterns +def : Pat<(i64 (extloadi8 (add GPRC:$RB, immSExt16:$DISP))), + (LDBU immSExt16:$DISP, GPRC:$RB)>; +def : Pat<(i64 (extloadi16 (add GPRC:$RB, immSExt16:$DISP))), + (LDWU immSExt16:$DISP, GPRC:$RB)>; +def : Pat<(i64 (extloadi32 (add GPRC:$RB, immSExt16:$DISP))), + (LDL immSExt16:$DISP, GPRC:$RB)>; + +//0 disp patterns +def : Pat<(i64 (load GPRC:$addr)), + (LDQ 0, GPRC:$addr)>; +def : 
Pat<(f64 (load GPRC:$addr)), + (LDT 0, GPRC:$addr)>; +def : Pat<(f32 (load GPRC:$addr)), + (LDS 0, GPRC:$addr)>; +def : Pat<(i64 (sextloadi32 GPRC:$addr)), + (LDL 0, GPRC:$addr)>; +def : Pat<(i64 (zextloadi16 GPRC:$addr)), + (LDWU 0, GPRC:$addr)>; +def : Pat<(i64 (zextloadi8 GPRC:$addr)), + (LDBU 0, GPRC:$addr)>; +def : Pat<(i64 (extloadi8 GPRC:$addr)), + (LDBU 0, GPRC:$addr)>; +def : Pat<(i64 (extloadi16 GPRC:$addr)), + (LDWU 0, GPRC:$addr)>; +def : Pat<(i64 (extloadi32 GPRC:$addr)), + (LDL 0, GPRC:$addr)>; + +def : Pat<(store GPRC:$DATA, GPRC:$addr), + (STQ GPRC:$DATA, 0, GPRC:$addr)>; +def : Pat<(store F8RC:$DATA, GPRC:$addr), + (STT F8RC:$DATA, 0, GPRC:$addr)>; +def : Pat<(store F4RC:$DATA, GPRC:$addr), + (STS F4RC:$DATA, 0, GPRC:$addr)>; +def : Pat<(truncstorei32 GPRC:$DATA, GPRC:$addr), + (STL GPRC:$DATA, 0, GPRC:$addr)>; +def : Pat<(truncstorei16 GPRC:$DATA, GPRC:$addr), + (STW GPRC:$DATA, 0, GPRC:$addr)>; +def : Pat<(truncstorei8 GPRC:$DATA, GPRC:$addr), + (STB GPRC:$DATA, 0, GPRC:$addr)>; + + +//load address, rellocated gpdist form +let OperandList = (ops GPRC:$RA, s16imm:$DISP, GPRC:$RB, s16imm:$NUM) in { +def LDAg : MForm<0x08, 0, 1, "lda $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>; //Load address +def LDAHg : MForm<0x09, 0, 1, "ldah $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>; //Load address +} + +//Load quad, rellocated literal form +let OperandList = (ops GPRC:$RA, s64imm:$DISP, GPRC:$RB) in +def LDQl : MForm<0x29, 0, 1, "ldq $RA,$DISP($RB)\t\t!literal", + [(set GPRC:$RA, (Alpha_rellit tglobaladdr:$DISP, GPRC:$RB))], s_ild>; +def : Pat<(Alpha_rellit texternalsym:$ext, GPRC:$RB), + (LDQl texternalsym:$ext, GPRC:$RB)>; + + +def RPCC : MfcForm<0x18, 0xC000, "rpcc $RA", s_rpcc>; //Read process cycle counter + +//Basic Floating point ops + +//Floats + +let OperandList = (ops F4RC:$RC, F4RC:$RB), Fa = 31 in +def SQRTS : FPForm<0x14, 0x58B, "sqrts/su $RB,$RC", + [(set F4RC:$RC, (fsqrt F4RC:$RB))], s_fsqrts>; + +let OperandList = (ops F4RC:$RC, F4RC:$RA, F4RC:$RB) in 
{ +def ADDS : FPForm<0x16, 0x580, "adds/su $RA,$RB,$RC", + [(set F4RC:$RC, (fadd F4RC:$RA, F4RC:$RB))], s_fadd>; +def SUBS : FPForm<0x16, 0x581, "subs/su $RA,$RB,$RC", + [(set F4RC:$RC, (fsub F4RC:$RA, F4RC:$RB))], s_fadd>; +def DIVS : FPForm<0x16, 0x583, "divs/su $RA,$RB,$RC", + [(set F4RC:$RC, (fdiv F4RC:$RA, F4RC:$RB))], s_fdivs>; +def MULS : FPForm<0x16, 0x582, "muls/su $RA,$RB,$RC", + [(set F4RC:$RC, (fmul F4RC:$RA, F4RC:$RB))], s_fmul>; + +def CPYSS : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC", + [(set F4RC:$RC, (fcopysign F4RC:$RB, F4RC:$RA))], s_fadd>; +def CPYSES : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent +def CPYSNS : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC", + [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F4RC:$RA)))], s_fadd>; +} + +//Doubles + +let OperandList = (ops F8RC:$RC, F8RC:$RB), Fa = 31 in +def SQRTT : FPForm<0x14, 0x5AB, "sqrtt/su $RB,$RC", + [(set F8RC:$RC, (fsqrt F8RC:$RB))], s_fsqrtt>; + +let OperandList = (ops F8RC:$RC, F8RC:$RA, F8RC:$RB) in { +def ADDT : FPForm<0x16, 0x5A0, "addt/su $RA,$RB,$RC", + [(set F8RC:$RC, (fadd F8RC:$RA, F8RC:$RB))], s_fadd>; +def SUBT : FPForm<0x16, 0x5A1, "subt/su $RA,$RB,$RC", + [(set F8RC:$RC, (fsub F8RC:$RA, F8RC:$RB))], s_fadd>; +def DIVT : FPForm<0x16, 0x5A3, "divt/su $RA,$RB,$RC", + [(set F8RC:$RC, (fdiv F8RC:$RA, F8RC:$RB))], s_fdivt>; +def MULT : FPForm<0x16, 0x5A2, "mult/su $RA,$RB,$RC", + [(set F8RC:$RC, (fmul F8RC:$RA, F8RC:$RB))], s_fmul>; + +def CPYST : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC", + [(set F8RC:$RC, (fcopysign F8RC:$RB, F8RC:$RA))], s_fadd>; +def CPYSET : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent +def CPYSNT : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC", + [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F8RC:$RA)))], s_fadd>; + +def CMPTEQ : FPForm<0x16, 0x5A5, "cmpteq/su $RA,$RB,$RC", [], s_fadd>; +// [(set F8RC:$RC, (seteq F8RC:$RA, F8RC:$RB))]>; +def CMPTLE : FPForm<0x16, 0x5A7, "cmptle/su $RA,$RB,$RC", [], s_fadd>; +// [(set 
F8RC:$RC, (setle F8RC:$RA, F8RC:$RB))]>; +def CMPTLT : FPForm<0x16, 0x5A6, "cmptlt/su $RA,$RB,$RC", [], s_fadd>; +// [(set F8RC:$RC, (setlt F8RC:$RA, F8RC:$RB))]>; +def CMPTUN : FPForm<0x16, 0x5A4, "cmptun/su $RA,$RB,$RC", [], s_fadd>; +// [(set F8RC:$RC, (setuo F8RC:$RA, F8RC:$RB))]>; +} + +//More CPYS forms: +let OperandList = (ops F8RC:$RC, F4RC:$RA, F8RC:$RB) in { +def CPYSTs : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC", + [(set F8RC:$RC, (fcopysign F8RC:$RB, F4RC:$RA))], s_fadd>; +def CPYSNTs : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC", + [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F4RC:$RA)))], s_fadd>; +} +let OperandList = (ops F4RC:$RC, F8RC:$RA, F4RC:$RB) in { +def CPYSSt : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC", + [(set F4RC:$RC, (fcopysign F4RC:$RB, F8RC:$RA))], s_fadd>; +def CPYSESt : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent +def CPYSNSt : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC", + [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F8RC:$RA)))], s_fadd>; +} + +//conditional moves, floats +let OperandList = (ops F4RC:$RDEST, F4RC:$RFALSE, F4RC:$RTRUE, F8RC:$RCOND), + isTwoAddress = 1 in { +def FCMOVEQS : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if = zero +def FCMOVGES : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if >= zero +def FCMOVGTS : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if > zero +def FCMOVLES : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if <= zero +def FCMOVLTS : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; // FCMOVE if < zero +def FCMOVNES : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if != zero +} +//conditional moves, doubles +let OperandList = (ops F8RC:$RDEST, F8RC:$RFALSE, F8RC:$RTRUE, F8RC:$RCOND), + isTwoAddress = 1 in { +def FCMOVEQT : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST", [], s_fcmov>; +def FCMOVGET : FPForm<0x17, 0x02D, 
"fcmovge $RCOND,$RTRUE,$RDEST", [], s_fcmov>; +def FCMOVGTT : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST", [], s_fcmov>; +def FCMOVLET : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST", [], s_fcmov>; +def FCMOVLTT : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST", [], s_fcmov>; +def FCMOVNET : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST", [], s_fcmov>; +} + +//misc FP selects +//Select double + +def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; + +def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; + +def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>; + +def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, 
F8RC:$RA))>; + +def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>; + +def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf), + (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>; + +//Select single +def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; + +def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>; + +def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT 
F8RC:$RB, F8RC:$RA))>; + +def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>; +def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>; + +def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>; + +def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>; +def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf), + (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>; + + + +let OperandList = (ops GPRC:$RC, F4RC:$RA), Fb = 31 in +def FTOIS : FPForm<0x1C, 0x078, "ftois $RA,$RC",[], s_ftoi>; //Floating to integer move, S_floating +let OperandList = (ops GPRC:$RC, F8RC:$RA), Fb = 31 in +def FTOIT : FPForm<0x1C, 0x070, "ftoit $RA,$RC", + [(set GPRC:$RC, (bitconvert F8RC:$RA))], s_ftoi>; //Floating to integer move +let OperandList = (ops F4RC:$RC, GPRC:$RA), Fb = 31 in +def ITOFS : FPForm<0x14, 0x004, "itofs $RA,$RC",[], s_itof>; //Integer to floating move, S_floating +let OperandList = (ops F8RC:$RC, GPRC:$RA), Fb = 31 in +def ITOFT : FPForm<0x14, 0x024, "itoft $RA,$RC", + [(set F8RC:$RC, (bitconvert GPRC:$RA))], s_itof>; //Integer to floating move + + +let OperandList = (ops F4RC:$RC, F8RC:$RB), Fa = 31 in +def CVTQS : FPForm<0x16, 0x7BC, 
"cvtqs/sui $RB,$RC", + [(set F4RC:$RC, (Alpha_cvtqs F8RC:$RB))], s_fadd>; +let OperandList = (ops F8RC:$RC, F8RC:$RB), Fa = 31 in +def CVTQT : FPForm<0x16, 0x7BE, "cvtqt/sui $RB,$RC", + [(set F8RC:$RC, (Alpha_cvtqt F8RC:$RB))], s_fadd>; +let OperandList = (ops F8RC:$RC, F8RC:$RB), Fa = 31 in +def CVTTQ : FPForm<0x16, 0x52F, "cvttq/svc $RB,$RC", + [(set F8RC:$RC, (Alpha_cvttq F8RC:$RB))], s_fadd>; +let OperandList = (ops F8RC:$RC, F4RC:$RB), Fa = 31 in +def CVTST : FPForm<0x16, 0x6AC, "cvtst/s $RB,$RC", + [(set F8RC:$RC, (fextend F4RC:$RB))], s_fadd>; +let OperandList = (ops F4RC:$RC, F8RC:$RB), Fa = 31 in +def CVTTS : FPForm<0x16, 0x7AC, "cvtts/sui $RB,$RC", + [(set F4RC:$RC, (fround F8RC:$RB))], s_fadd>; + + +///////////////////////////////////////////////////////// +//Branching +///////////////////////////////////////////////////////// +class br_icc<bits<6> opc, string asmstr> + : BFormN<opc, (ops u64imm:$opc, GPRC:$R, target:$dst), + !strconcat(asmstr, " $R,$dst"), s_icbr>; +class br_fcc<bits<6> opc, string asmstr> + : BFormN<opc, (ops u64imm:$opc, F8RC:$R, target:$dst), + !strconcat(asmstr, " $R,$dst"), s_fbr>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, noResults = 1 in { +let Ra = 31 in +def BR : BFormD<0x30, "br $$31,$DISP", [(br bb:$DISP)], s_ubr>; + +def COND_BRANCH_I : BFormN<0, (ops u64imm:$opc, GPRC:$R, target:$dst), + "{:comment} COND_BRANCH imm:$opc, GPRC:$R, bb:$dst", + s_icbr>; +def COND_BRANCH_F : BFormN<0, (ops u64imm:$opc, F8RC:$R, target:$dst), + "{:comment} COND_BRANCH imm:$opc, F8RC:$R, bb:$dst", + s_fbr>; +//Branches, int +def BEQ : br_icc<0x39, "beq">; +def BGE : br_icc<0x3E, "bge">; +def BGT : br_icc<0x3F, "bgt">; +def BLBC : br_icc<0x38, "blbc">; +def BLBS : br_icc<0x3C, "blbs">; +def BLE : br_icc<0x3B, "ble">; +def BLT : br_icc<0x3A, "blt">; +def BNE : br_icc<0x3D, "bne">; + +//Branches, float +def FBEQ : br_fcc<0x31, "fbeq">; +def FBGE : br_fcc<0x36, "fbge">; +def FBGT : br_fcc<0x37, "fbgt">; +def FBLE : br_fcc<0x33, "fble">; 
+def FBLT : br_fcc<0x32, "fblt">;
+//FBNE is opcode 0x35.  It was previously given as 0x36, which is FBGE's
+//opcode, so "fbne" would have encoded (and behaved) as "fbge".
+def FBNE : br_fcc<0x35, "fbne">;
+}
+
+//An ugly trick to get the opcode as an imm I can use
+//(COND_BRANCH_I/COND_BRANCH_F carry the real branch opcode as an immediate;
+// 0-7 select the integer branches, 20-25 the floating-point branches)
+def immBRCond : SDNodeXForm<imm, [{
+  switch((uint64_t)N->getValue()) {
+  case 0:  return getI64Imm(Alpha::BEQ);
+  case 1:  return getI64Imm(Alpha::BNE);
+  case 2:  return getI64Imm(Alpha::BGE);
+  case 3:  return getI64Imm(Alpha::BGT);
+  case 4:  return getI64Imm(Alpha::BLE);
+  case 5:  return getI64Imm(Alpha::BLT);
+  case 6:  return getI64Imm(Alpha::BLBS);
+  case 7:  return getI64Imm(Alpha::BLBC);
+  case 20: return getI64Imm(Alpha::FBEQ);
+  case 21: return getI64Imm(Alpha::FBNE);
+  case 22: return getI64Imm(Alpha::FBGE);
+  case 23: return getI64Imm(Alpha::FBGT);
+  case 24: return getI64Imm(Alpha::FBLE);
+  case 25: return getI64Imm(Alpha::FBLT);
+  default: assert(0 && "Unknown branch type");
+  }
+}]>;
+
+//Int cond patterns: compares against zero map directly onto a branch
+def : Pat<(brcond (seteq GPRC:$RA, 0), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 0), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge GPRC:$RA, 0), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 2), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt GPRC:$RA, 0), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 3), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (and GPRC:$RA, 1), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 6), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle GPRC:$RA, 0), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 4), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt GPRC:$RA, 0), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 5), GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, 0), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 1), GPRC:$RA, bb:$DISP)>;
+
+def : Pat<(brcond GPRC:$RA, bb:$DISP),
+          (COND_BRANCH_I (immBRCond 1), GPRC:$RA, bb:$DISP)>;
+//reg-reg compares go through CMPEQ, then branch on its zero/nonzero result
+def : Pat<(brcond (setne GPRC:$RA, GPRC:$RB), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 0), (CMPEQ GPRC:$RA, GPRC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, immUExt8:$L), bb:$DISP),
+          (COND_BRANCH_I (immBRCond 0), (CMPEQi GPRC:$RA, immUExt8:$L), bb:$DISP)>;
+
+//FP cond patterns: compares against +0.0 map directly onto a branch
+def : Pat<(brcond (seteq F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 20), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 22), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 23), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 24), F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 25), F8RC:$RA, bb:$DISP)>;
+
+//reg-reg FP compares go through CMPTxx (nonzero iff true), then FBNE/FBEQ
+def : Pat<(brcond (seteq F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setoeq F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setlt F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setolt F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setle F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setole F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+//gt/ge have no CMPTxx instruction; swap the operands of lt/le instead
+def : Pat<(brcond (setgt F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setogt F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+def : Pat<(brcond (setge F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setoge F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+//ne: branch if the CMPTEQ result is zero (FBEQ on the compare result)
+def : Pat<(brcond (setne F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setone F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, F8RC:$RB), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+//ordered/unordered variants of the compare-against-zero branches
+def : Pat<(brcond (setoeq F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setoge F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setogt F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setole F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setolt F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setone F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP),
+          (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+
+//End Branches
+
+//S_floating : IEEE Single
+//T_floating : IEEE Double
+
+//Unused instructions
+//Mnemonic     Format Opcode Description
+//CALL_PAL     Pcd    00     Trap to PALcode
+//ECB          Mfc    18.E800 Evict cache block
+//EXCB         Mfc    18.0400 Exception barrier
+//FETCH        Mfc    18.8000 Prefetch data
+//FETCH_M      Mfc    18.A000 Prefetch data, modify intent
+//LDL_L        Mem    2A     Load sign-extended longword locked
+//LDQ_L        Mem    2B     Load quadword locked
+//LDQ_U        Mem    0B     Load unaligned quadword
+//MB           Mfc    18.4000 Memory barrier
+//STL_C        Mem    2E     Store longword conditional
+//STQ_C        Mem    2F     Store quadword conditional
+//STQ_U        Mem    0F     Store unaligned quadword
+//TRAPB        Mfc    18.0000 Trap barrier
+//WH64         Mfc    18.F800 Write hint 64 bytes
+//WMB          Mfc    18.4400 Write memory barrier
+//MF_FPCR      F-P    17.025 Move from FPCR
+//MT_FPCR      F-P    17.024 Move to FPCR
+//These are in the Multimedia extensions, so let's not use them yet
+//def MAXSB8  : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum
+//def MAXSW4  : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum
+//def MAXUB8  : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum
+//def MAXUW4  : OForm< 0x1C, 0x3D, "MAXUW4 $RA,$RB,$RC">; //Vector unsigned word maximum
+//def MINSB8  : OForm< 0x1C, 0x38, "MINSB8 $RA,$RB,$RC">; //Vector signed byte minimum
+//def MINSW4  : OForm< 0x1C, 0x39, "MINSW4 $RA,$RB,$RC">; //Vector signed word minimum
+//def MINUB8  : OForm< 0x1C, 0x3A, "MINUB8 $RA,$RB,$RC">; //Vector unsigned byte minimum
+//def MINUW4  : OForm< 0x1C, 0x3B, "MINUW4 $RA,$RB,$RC">; //Vector unsigned word minimum
+//def PERR    : OForm< 0x1C, 0x31, "PERR $RA,$RB,$RC">; //Pixel error
+//def PKLB    : OForm< 0x1C, 0x37, "PKLB $RA,$RB,$RC">; //Pack longwords to bytes
+//def PKWB    : OForm<0x1C, 0x36, "PKWB $RA,$RB,$RC">; //Pack words to bytes
+//def UNPKBL  : OForm< 0x1C, 0x35, "UNPKBL $RA,$RB,$RC">; //Unpack bytes to longwords
+//def UNPKBW  : OForm< 0x1C, 0x34, "UNPKBW $RA,$RB,$RC">; //Unpack bytes to words
+//CVTLQ        F-P    17.010 Convert longword to quadword
+//CVTQL        F-P    17.030 Convert quadword to longword
+
+
+//Constant handling
+
+def immConst2Part : PatLeaf<(imm), [{
+  //true if imm fits in a LDAH LDA pair
+  int64_t val = (int64_t)N->getValue();
+  return (val <= IMM_FULLHIGH && val >= IMM_FULLLOW);
+}]>;
+def immConst2PartInt : PatLeaf<(imm), [{
+  //true if imm fits in a LDAH LDA pair with zeroext
+  uint64_t uval = N->getValue();
+  int32_t val32 = (int32_t)uval;
+  return ((uval >> 32) == 0 && //empty upper bits
+          val32 <= IMM_FULLHIGH);
+//        val32 >= IMM_FULLLOW + IMM_LOW * IMM_MULT); //Always True
+}], SExt32>;
+
+//materialize a 32-bit-ish constant as LDAH (high 16) + LDA (low 16)
+def : Pat<(i64 immConst2Part:$imm),
+          (LDA (LL16 immConst2Part:$imm), (LDAH (LH16 immConst2Part:$imm), R31))>;
+
+def : Pat<(i64 immSExt16:$imm),
+          (LDA immSExt16:$imm, R31)>;
+
+//zero-extended 32-bit constants: build sign-extended, then ZAPNOT clears
+//the high 32 bits (mask 15 keeps the low 4 bytes)
+def : Pat<(i64 immSExt16int:$imm),
+          (ZAPNOTi (LDA (SExt16 immSExt16int:$imm), R31), 15)>;
+def : Pat<(i64 immConst2PartInt:$imm),
+          (ZAPNOTi (LDA (LL16 (SExt32 immConst2PartInt:$imm)),
+                        (LDAH (LH16 (SExt32 immConst2PartInt:$imm)), R31)), 15)>;
+
+
+//TODO: I want to just define these like this!
+//def : Pat<(i64 0),
+//          (R31)>;
+//def : Pat<(f64 0.0),
+//          (F31)>;
+//def : Pat<(f64 -0.0),
+//          (CPYSNT F31, F31)>;
+//def : Pat<(f32 0.0),
+//          (F31)>;
+//def : Pat<(f32 -0.0),
+//          (CPYSNS F31, F31)>;
+
+//Misc Patterns:
+
+//sign-extend the low 32 bits in place via ADDL with 0
+def : Pat<(sext_inreg GPRC:$RB, i32),
+          (ADDLi GPRC:$RB, 0)>;
+
+//fabs/fneg/fcopysign all lower to the CPYS (copy-sign) family; copying
+//F31's sign clears the sign bit, CPYSN copies the negated sign
+def : Pat<(fabs F8RC:$RB),
+          (CPYST F31, F8RC:$RB)>;
+def : Pat<(fabs F4RC:$RB),
+          (CPYSS F31, F4RC:$RB)>;
+def : Pat<(fneg F8RC:$RB),
+          (CPYSNT F8RC:$RB, F8RC:$RB)>;
+def : Pat<(fneg F4RC:$RB),
+          (CPYSNS F4RC:$RB, F4RC:$RB)>;
+
+//copysign-of-negated folds to a single CPYSN (mixed-width forms use
+//the CPYSNSt/CPYSNTs variants)
+def : Pat<(fcopysign F4RC:$A, (fneg F4RC:$B)),
+          (CPYSNS F4RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F8RC:$B)),
+          (CPYSNT F8RC:$B, F8RC:$A)>;
+def : Pat<(fcopysign F4RC:$A, (fneg F8RC:$B)),
+          (CPYSNSt F8RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F4RC:$B)),
+          (CPYSNTs F4RC:$B, F8RC:$A)>;
+
+//Yes, signed multiply high is ugly
+//(UMULH plus a CMOVGE-built correction term for negative operands --
+// the usual unsigned->signed high-multiply fixup)
+def : Pat<(mulhs GPRC:$RA, GPRC:$RB),
+          (SUBQr (UMULHr GPRC:$RA, GPRC:$RB), (ADDQr (CMOVGEr GPRC:$RB, R31, GPRC:$RA),
+                                                     (CMOVGEr GPRC:$RA, R31, GPRC:$RB)))>;
+
+//Stupid crazy arithmetic stuff:
+//strength-reduce small constant multiplies to scaled add/sub
+//(S4ADDQ a,a = 5a; S8ADDQ a,a = 9a; S4SUBQ a,a = 3a; S8SUBQ a,a = 7a)
+let AddedComplexity = 1 in {
+def : Pat<(mul GPRC:$RA, 5), (S4ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 9), (S8ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 3), (S4SUBQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 7), (S8SUBQr GPRC:$RA, GPRC:$RA)>;
+
+//slight tree expansion if we are multiplying near to a power of 2
+//n is above a power of 2: n*a = (a << log2) + remainder*a
+def : Pat<(mul GPRC:$RA, immRem1:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem1:$imm)), GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, immRem2:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem2:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem3:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem3:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem4:$imm),
+          (S4ADDQr GPRC:$RA, (SLr GPRC:$RA, (nearP2X immRem4:$imm)))>;
+def : Pat<(mul GPRC:$RA, immRem5:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem5:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRemP2:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRemP2:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2:$imm)))>;
+
+//n is below a power of 2: n*a = (a << log2) - remainder*a
+def : Pat<(mul GPRC:$RA, immRem1n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem1n:$imm)), GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, immRem2n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem2n:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem3n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem3n:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem4n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem4n:$imm)), (SLi GPRC:$RA, 2))>;
+def : Pat<(mul GPRC:$RA, immRem5n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRem5n:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRemP2n:$imm),
+          (SUBQr (SLr GPRC:$RA, (nearP2X immRemP2n:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2n:$imm)))>;
+} //Added complexity
diff --git a/lib/Target/Alpha/AlphaJITInfo.cpp b/lib/Target/Alpha/AlphaJITInfo.cpp
new file mode 100644
index 0000000..669a2d5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.cpp
@@ -0,0 +1,305 @@
+//===-- AlphaJITInfo.cpp - Implement the JIT interfaces for the Alpha ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Alpha target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "AlphaJITInfo.h"
+#include "AlphaRelocations.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include "llvm/Support/Debug.h"
+#include <cstdlib>
+#include <map>
+using namespace llvm;
+
+// Helpers that assemble raw 32-bit Alpha instruction words by or-ing the
+// shifted fields together (operate-format with literal / register forms,
+// memory format, and jump format).
+#define BUILD_OFormatI(Op, RA, LIT, FUN, RC) \
+  ((Op << 26) | (RA << 21) | (LIT << 13) | (1 << 12) | (FUN << 5) | (RC))
+#define BUILD_OFormat(Op, RA, RB, FUN, RC) \
+  ((Op << 26) | (RA << 21) | (RB << 16) | (FUN << 5) | (RC))
+
+#define BUILD_LDA(RD, RS, IMM16) \
+  ((0x08 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_LDAH(RD, RS, IMM16) \
+  ((0x09 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+
+#define BUILD_LDQ(RD, RS, IMM16) \
+  ((0x29 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 0xFFFF))
+
+#define BUILD_JMP(RD, RS, IMM16) \
+  ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x00 << 14) | ((IMM16) & 0x3FFF))
+#define BUILD_JSR(RD, RS, IMM16) \
+  ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x01 << 14) | ((IMM16) & 0x3FFF))
+
+#define BUILD_SLLi(RD, RS, IMM8) \
+  (BUILD_OFormatI(0x12, RS, IMM8, 0x39, RD))
+
+#define BUILD_ORi(RD, RS, IMM8) \
+  (BUILD_OFormatI(0x11, RS, IMM8, 0x20, RD))
+
+#define BUILD_OR(RD, RS, RT) \
+  (BUILD_OFormat(0x11, RS, RT, 0x20, RD))
+
+
+
+/// EmitBranchToAt - Overwrite the 19-word region at At with a sequence that
+/// materializes the 64-bit address of To into $27 one byte at a time
+/// (alternating sll-by-8 / or-immediate), then jumps through $27.  Word 18
+/// is set to the magic value 0x00FFFFFF so the callback can recognize the
+/// region as a stub.
+static void EmitBranchToAt(void *At, void *To) {
+  unsigned long Fn = (unsigned long)To;
+
+  unsigned *AtI = (unsigned*)At;
+
+  // $27 = $27 | $27 -- effectively a nop that leaves pv intact
+  AtI[0] = BUILD_OR(0, 27, 27);
+
+  DOUT << "Stub targeting " << To << "\n";
+
+  // Feed the target address into $27 a byte at a time, MSB first.
+  for (int x = 1; x <= 8; ++x) {
+    AtI[2*x - 1] = BUILD_SLLi(27,27,8);
+    unsigned d = (Fn >> (64 - 8 * x)) & 0x00FF;
+    //DOUT << "outputing " << hex << d << dec << "\n";
+    AtI[2*x] = BUILD_ORi(27, 27, d);
+  }
+  AtI[17] = BUILD_JMP(31,27,0); //jump, preserving ra, and setting pv
+  AtI[18] = 0x00FFFFFF; //mark this as a stub
+}
+
+// Not implemented for Alpha: unconditionally asserts.
+void AlphaJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  //FIXME
+  assert(0);
+}
+
+// Set by getLazyResolverFunction; called from the compilation callback to
+// JIT-compile the function a stub was standing in for.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+//static AlphaJITInfo* AlphaJTI;
+
+extern "C" {
+#ifdef __alpha
+
+  // C-level half of the lazy-compilation callback.  oldpv points at the
+  // saved $27 (pv) slot on the trampoline's frame; CameFromStub is the
+  // (approximate) address the stub jumped from.
+  void AlphaCompilationCallbackC(long* oldpv, void* CameFromStub)
+  {
+    void* Target = JITCompilerFunction(CameFromStub);
+
+    //rewrite the stub to an unconditional branch
+    if (((unsigned*)CameFromStub)[18] == 0x00FFFFFF) {
+      DOUT << "Came from a stub, rewriting\n";
+      EmitBranchToAt(CameFromStub, Target);
+    } else {
+      DOUT << "confused, didn't come from stub at " << CameFromStub
+           << " old jump vector " << oldpv
+           << " new jump vector " << Target << "\n";
+    }
+
+    //Change pv to new Target
+    *oldpv = (long)Target;
+  }
+
+  void AlphaCompilationCallback(void);
+
+  // Hand-written trampoline: saves arg/callee-saved/fp/ra/pv registers,
+  // calls AlphaCompilationCallbackC, restores everything (pv now holds the
+  // freshly compiled target), and jumps to it.
+  // NOTE(review): ".globl AlphaComilationCallbackC" below is misspelled
+  // (missing 'p'), so it exports a nonexistent symbol rather than
+  // AlphaCompilationCallbackC -- confirm whether this directive is needed
+  // at all, since that function is compiled (and exported) as C code.
+  asm(
+     ".text\n"
+     ".globl AlphaComilationCallbackC\n"
+     ".align 4\n"
+     ".globl AlphaCompilationCallback\n"
+     ".ent AlphaCompilationCallback\n"
+"AlphaCompilationCallback:\n"
+     //      //get JIT's GOT
+     "ldgp $29, 0($27)\n"
+     //Save args, callee saved, and perhaps others?
+     //args: $16-$21 $f16-$f21   (12)
+     //callee: $9-$14 $f2-$f9    (14)
+     //others: fp:$15 ra:$26 pv:$27 (3)
+     "lda $30, -232($30)\n"
+     "stq $16, 0($30)\n"
+     "stq $17, 8($30)\n"
+     "stq $18, 16($30)\n"
+     "stq $19, 24($30)\n"
+     "stq $20, 32($30)\n"
+     "stq $21, 40($30)\n"
+     "stt $f16, 48($30)\n"
+     "stt $f17, 56($30)\n"
+     "stt $f18, 64($30)\n"
+     "stt $f19, 72($30)\n"
+     "stt $f20, 80($30)\n"
+     "stt $f21, 88($30)\n"
+     "stq $9, 96($30)\n"
+     "stq $10, 104($30)\n"
+     "stq $11, 112($30)\n"
+     "stq $12, 120($30)\n"
+     "stq $13, 128($30)\n"
+     "stq $14, 136($30)\n"
+     "stt $f2, 144($30)\n"
+     "stt $f3, 152($30)\n"
+     "stt $f4, 160($30)\n"
+     "stt $f5, 168($30)\n"
+     "stt $f6, 176($30)\n"
+     "stt $f7, 184($30)\n"
+     "stt $f8, 192($30)\n"
+     "stt $f9, 200($30)\n"
+     "stq $15, 208($30)\n"
+     "stq $26, 216($30)\n"
+     "stq $27, 224($30)\n"
+
+     "addq $30, 224, $16\n" //pass the addr of saved pv as the first arg
+     "bis $0, $0, $17\n" //pass the roughly stub addr in second arg
+     "jsr $26, AlphaCompilationCallbackC\n" //call without saving ra
+
+     "ldq $16, 0($30)\n"
+     "ldq $17, 8($30)\n"
+     "ldq $18, 16($30)\n"
+     "ldq $19, 24($30)\n"
+     "ldq $20, 32($30)\n"
+     "ldq $21, 40($30)\n"
+     "ldt $f16, 48($30)\n"
+     "ldt $f17, 56($30)\n"
+     "ldt $f18, 64($30)\n"
+     "ldt $f19, 72($30)\n"
+     "ldt $f20, 80($30)\n"
+     "ldt $f21, 88($30)\n"
+     "ldq $9, 96($30)\n"
+     "ldq $10, 104($30)\n"
+     "ldq $11, 112($30)\n"
+     "ldq $12, 120($30)\n"
+     "ldq $13, 128($30)\n"
+     "ldq $14, 136($30)\n"
+     "ldt $f2, 144($30)\n"
+     "ldt $f3, 152($30)\n"
+     "ldt $f4, 160($30)\n"
+     "ldt $f5, 168($30)\n"
+     "ldt $f6, 176($30)\n"
+     "ldt $f7, 184($30)\n"
+     "ldt $f8, 192($30)\n"
+     "ldt $f9, 200($30)\n"
+     "ldq $15, 208($30)\n"
+     "ldq $26, 216($30)\n"
+     "ldq $27, 224($30)\n" //this was updated in the callback with the target
+
+     "lda $30, 232($30)\n" //restore sp
+     "jmp $31, ($27)\n" //jump to the new function
+     ".end AlphaCompilationCallback\n"
+     );
+#else
+  // Host is not Alpha: the callback can never legitimately run.
+  void AlphaCompilationCallback() {
+    cerr << "Cannot call AlphaCompilationCallback() on a non-Alpha arch!\n";
+    abort();
+  }
+#endif
+}
+
+/// emitFunctionStub - Allocate a 19-word (76-byte) stub, zero-fill it, then
+/// rewrite it in place with EmitBranchToAt so it jumps to Fn.
+void *AlphaJITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
+  //assert(Fn == AlphaCompilationCallback && "Where are you going?\n");
+  //Do things in a stupid slow way!
+  MCE.startFunctionStub(19*4);
+  void* Addr = (void*)(intptr_t)MCE.getCurrentPCValue();
+  for (int x = 0; x < 19; ++ x)
+    MCE.emitWordLE(0);
+  EmitBranchToAt(Addr, Fn);
+  DOUT << "Emitting Stub to " << Fn << " at [" << Addr << "]\n";
+  return MCE.finishFunctionStub(0);
+}
+
+/// getLazyResolverFunction - Record the JIT's compile callback and hand back
+/// the asm trampoline as the lazy-resolution entry point.
+TargetJITInfo::LazyResolverFn
+AlphaJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  //  setZerothGOTEntry((void*)AlphaCompilationCallback);
+  return AlphaCompilationCallback;
+}
+
+//These describe LDAx
+static const int IMM_LOW = -32768;
+static const int IMM_HIGH = 32767;
+static const int IMM_MULT = 65536;
+
+// Split a displacement into the LDAH (high) half, rounding so that the
+// remaining low half fits in a signed 16-bit LDA immediate.
+static long getUpper16(long l)
+{
+  long y = l / IMM_MULT;
+  if (l % IMM_MULT > IMM_HIGH)
+    ++y;
+  if (l % IMM_MULT < IMM_LOW)
+    --y;
+  assert((short)y == y && "displacement out of range");
+  return y;
+}
+
+// The LDA (low) half: whatever remains after the LDAH contribution.
+static long getLower16(long l)
+{
+  long h = getUpper16(l);
+  long y = l - h * IMM_MULT;
+  assert(y == (short)y && "Displacement out of range");
+  return y;
+}
+
+/// relocate - Resolve Alpha relocations in the emitted code.  gprel
+/// relocations are GOT-pointer-relative; gpdist relocations come in
+/// LDAH/LDA pairs keyed by (Function, constant), which is why state is
+/// kept in gpdistmap across iterations.
+void AlphaJITInfo::relocate(void *Function, MachineRelocation *MR,
+                            unsigned NumRelocs, unsigned char* GOTBase) {
+  //because gpdist are paired and relative to the pc of the first inst,
+  //we need to have some state
+  static std::map<std::pair<void*, int>, void*> gpdistmap;
+
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+    long idx = 0;
+    bool doCommon = true;
+    switch ((Alpha::RelocationType)MR->getRelocationType()) {
+    default: assert(0 && "Unknown relocation type!");
+    case Alpha::reloc_literal:
+      //This is a LDQl
+      idx = MR->getGOTIndex();
+      DOUT << "Literal relocation to slot " << idx;
+      idx = (idx - GOToffset) * 8;
+      DOUT << " offset " << idx << "\n";
+      break;
+    case Alpha::reloc_gprellow:
+      idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
+      idx = getLower16(idx);
+      DOUT << "gprellow relocation offset " << idx << "\n";
+      DOUT << " Pointer is " << (void*)MR->getResultPointer()
+           << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+      break;
+    case Alpha::reloc_gprelhigh:
+      idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
+      idx = getUpper16(idx);
+      DOUT << "gprelhigh relocation offset " << idx << "\n";
+      DOUT << " Pointer is " << (void*)MR->getResultPointer()
+           << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n";
+      break;
+    case Alpha::reloc_gpdist:
+      // Dispatch on the opcode field of the instruction being patched.
+      switch (*RelocPos >> 26) {
+      case 0x09: //LDAH
+        idx = &GOTBase[GOToffset * 8] - (unsigned char*)RelocPos;
+        idx = getUpper16(idx);
+        DOUT << "LDAH: " << idx << "\n";
+        //add the relocation to the map
+        gpdistmap[std::make_pair(Function, MR->getConstantVal())] = RelocPos;
+        break;
+      case 0x08: //LDA
+        // The low half is relative to the pc of the paired LDAH recorded
+        // above, not to this LDA.
+        assert(gpdistmap[std::make_pair(Function, MR->getConstantVal())] &&
+               "LDAg without seeing LDAHg");
+        idx = &GOTBase[GOToffset * 8] -
+          (unsigned char*)gpdistmap[std::make_pair(Function, MR->getConstantVal())];
+        idx = getLower16(idx);
+        DOUT << "LDA: " << idx << "\n";
+        break;
+      default:
+        assert(0 && "Cannot handle gpdist yet");
+      }
+      break;
+    case Alpha::reloc_bsr: {
+      // Branch displacement is patched directly into the instruction word,
+      // so skip the common 16-bit immediate store below.
+      idx = (((unsigned char*)MR->getResultPointer() -
+             (unsigned char*)RelocPos) >> 2) + 1; //skip first 2 inst of fun
+      *RelocPos |= (idx & ((1 << 21)-1));
+      doCommon = false;
+      break;
+    }
+    }
+    if (doCommon) {
+      // All non-bsr cases store a signed 16-bit displacement into the
+      // instruction's immediate field.
+      short x = (short)idx;
+      assert(x == idx);
+      *(short*)RelocPos = x;
+    }
+  }
+}
diff --git a/lib/Target/Alpha/AlphaJITInfo.h b/lib/Target/Alpha/AlphaJITInfo.h
new file mode 100644
index 0000000..26c45b1
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.h
@@ -0,0 +1,49 @@
+//===- AlphaJITInfo.h - Alpha impl. of the JIT interface ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHA_JITINFO_H
+#define ALPHA_JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/GlobalValue.h"
+#include <string>
+#include <map>
+
+namespace llvm {
+  class TargetMachine;
+
+  class AlphaJITInfo : public TargetJITInfo {
+  protected:
+    TargetMachine &TM;
+  public:
+    // The Alpha JIT always uses a GOT (useGOT set in the ctor).
+    AlphaJITInfo(TargetMachine &tm) : TM(tm)
+    { useGOT = true; }
+
+    /// emitFunctionStub - Emit a small stub that transfers control to Fn.
+    virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);
+    /// getLazyResolverFunction - Record the compile callback and return the
+    /// lazy-resolution entry point.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+    /// relocate - Resolve Alpha relocations in freshly emitted code.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW. This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+  private:
+    // Offset (in GOT slots) used when computing GOT-relative displacements
+    // in relocate().
+    static const unsigned GOToffset = 4096;
+
+  };
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaLLRP.cpp b/lib/Target/Alpha/AlphaLLRP.cpp
new file mode 100644
index 0000000..27c2738
--- /dev/null
+++ b/lib/Target/Alpha/AlphaLLRP.cpp
@@ -0,0 +1,162 @@
+//===-- AlphaLLRP.cpp - Alpha Load Load Replay Trap elimination pass. -- --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Andrew Lenharth and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Here we check for potential replay traps introduced by the spiller
+// We also align some branch targets if we can do so for free.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "alpha-nops" +#include "Alpha.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +STATISTIC(nopintro, "Number of nops inserted"); +STATISTIC(nopalign, "Number of nops inserted for alignment"); + +namespace { + cl::opt<bool> + AlignAll("alpha-align-all", cl::Hidden, + cl::desc("Align all blocks")); + + struct AlphaLLRPPass : public MachineFunctionPass { + /// Target machine description which we query for reg. names, data + /// layout, etc. + /// + AlphaTargetMachine &TM; + + static char ID; + AlphaLLRPPass(AlphaTargetMachine &tm) + : MachineFunctionPass((intptr_t)&ID), TM(tm) { } + + virtual const char *getPassName() const { + return "Alpha NOP inserter"; + } + + bool runOnMachineFunction(MachineFunction &F) { + const TargetInstrInfo *TII = F.getTarget().getInstrInfo(); + bool Changed = false; + MachineInstr* prev[3] = {0,0,0}; + unsigned count = 0; + for (MachineFunction::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) { + MachineBasicBlock& MBB = *FI; + bool ub = false; + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { + if (count%4 == 0) + prev[0] = prev[1] = prev[2] = 0; //Slots cleared at fetch boundary + ++count; + MachineInstr *MI = I++; + switch (MI->getOpcode()) { + case Alpha::LDQ: case Alpha::LDL: + case Alpha::LDWU: case Alpha::LDBU: + case Alpha::LDT: case Alpha::LDS: + case Alpha::STQ: case Alpha::STL: + case Alpha::STW: case Alpha::STB: + case Alpha::STT: case Alpha::STS: + if (MI->getOperand(2).getReg() == Alpha::R30) { + if (prev[0] + && prev[0]->getOperand(2).getReg() == + MI->getOperand(2).getReg() + && prev[0]->getOperand(1).getImmedValue() == + 
MI->getOperand(1).getImmedValue()) { + prev[0] = prev[1]; + prev[1] = prev[2]; + prev[2] = 0; + BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31) + .addReg(Alpha::R31) + .addReg(Alpha::R31); + Changed = true; nopintro += 1; + count += 1; + } else if (prev[1] + && prev[1]->getOperand(2).getReg() == + MI->getOperand(2).getReg() + && prev[1]->getOperand(1).getImmedValue() == + MI->getOperand(1).getImmedValue()) { + prev[0] = prev[2]; + prev[1] = prev[2] = 0; + BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31) + .addReg(Alpha::R31) + .addReg(Alpha::R31); + BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31) + .addReg(Alpha::R31) + .addReg(Alpha::R31); + Changed = true; nopintro += 2; + count += 2; + } else if (prev[2] + && prev[2]->getOperand(2).getReg() == + MI->getOperand(2).getReg() + && prev[2]->getOperand(1).getImmedValue() == + MI->getOperand(1).getImmedValue()) { + prev[0] = prev[1] = prev[2] = 0; + BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31) + .addReg(Alpha::R31); + BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31) + .addReg(Alpha::R31); + BuildMI(MBB, MI, TII->get(Alpha::BISr), Alpha::R31).addReg(Alpha::R31) + .addReg(Alpha::R31); + Changed = true; nopintro += 3; + count += 3; + } + prev[0] = prev[1]; + prev[1] = prev[2]; + prev[2] = MI; + break; + } + prev[0] = prev[1]; + prev[1] = prev[2]; + prev[2] = 0; + break; + case Alpha::ALTENT: + case Alpha::MEMLABEL: + case Alpha::PCLABEL: + case Alpha::IDEF_I: + case Alpha::IDEF_F32: + case Alpha::IDEF_F64: + --count; + break; + case Alpha::BR: + case Alpha::JMP: + ub = true; + //fall through + default: + prev[0] = prev[1]; + prev[1] = prev[2]; + prev[2] = 0; + break; + } + } + if (ub || AlignAll) { + //we can align stuff for free at this point + while (count % 4) { + BuildMI(MBB, MBB.end(), TII->get(Alpha::BISr), Alpha::R31) + .addReg(Alpha::R31).addReg(Alpha::R31); + ++count; + ++nopalign; + prev[0] = prev[1]; + prev[1] = prev[2]; + prev[2] = 0; + } + } + } + 
return Changed; + } + }; + char AlphaLLRPPass::ID = 0; +} // end of anonymous namespace + +FunctionPass *llvm::createAlphaLLRPPass(AlphaTargetMachine &tm) { + return new AlphaLLRPPass(tm); +} diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp new file mode 100644 index 0000000..59d3e81 --- /dev/null +++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp @@ -0,0 +1,433 @@ +//===- AlphaRegisterInfo.cpp - Alpha Register Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Alpha implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "reginfo" +#include "Alpha.h" +#include "AlphaRegisterInfo.h" +#include "llvm/Constants.h" +#include "llvm/Type.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include <cstdlib> +using namespace llvm; + +//These describe LDAx +static const int IMM_LOW = -32768; +static const int IMM_HIGH = 32767; +static const int IMM_MULT = 65536; + +static long getUpper16(long l) +{ + long y = l / IMM_MULT; + if (l % IMM_MULT > IMM_HIGH) + ++y; + return y; +} + +static long getLower16(long l) +{ + long h = getUpper16(l); + return l - h * 
IMM_MULT; +} + +AlphaRegisterInfo::AlphaRegisterInfo(const TargetInstrInfo &tii) + : AlphaGenRegisterInfo(Alpha::ADJUSTSTACKDOWN, Alpha::ADJUSTSTACKUP), + TII(tii) +{ +} + +void +AlphaRegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, int FrameIdx, + const TargetRegisterClass *RC) const { + //cerr << "Trying to store " << getPrettyName(SrcReg) << " to " + // << FrameIdx << "\n"; + //BuildMI(MBB, MI, Alpha::WTF, 0).addReg(SrcReg); + if (RC == Alpha::F4RCRegisterClass) + BuildMI(MBB, MI, TII.get(Alpha::STS)) + .addReg(SrcReg, false, false, true) + .addFrameIndex(FrameIdx).addReg(Alpha::F31); + else if (RC == Alpha::F8RCRegisterClass) + BuildMI(MBB, MI, TII.get(Alpha::STT)) + .addReg(SrcReg, false, false, true) + .addFrameIndex(FrameIdx).addReg(Alpha::F31); + else if (RC == Alpha::GPRCRegisterClass) + BuildMI(MBB, MI, TII.get(Alpha::STQ)) + .addReg(SrcReg, false, false, true) + .addFrameIndex(FrameIdx).addReg(Alpha::F31); + else + abort(); +} + +void +AlphaRegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC) const { + //cerr << "Trying to load " << getPrettyName(DestReg) << " to " + // << FrameIdx << "\n"; + if (RC == Alpha::F4RCRegisterClass) + BuildMI(MBB, MI, TII.get(Alpha::LDS), DestReg) + .addFrameIndex(FrameIdx).addReg(Alpha::F31); + else if (RC == Alpha::F8RCRegisterClass) + BuildMI(MBB, MI, TII.get(Alpha::LDT), DestReg) + .addFrameIndex(FrameIdx).addReg(Alpha::F31); + else if (RC == Alpha::GPRCRegisterClass) + BuildMI(MBB, MI, TII.get(Alpha::LDQ), DestReg) + .addFrameIndex(FrameIdx).addReg(Alpha::F31); + else + abort(); +} + +MachineInstr *AlphaRegisterInfo::foldMemoryOperand(MachineInstr *MI, + unsigned OpNum, + int FrameIndex) const { + // Make sure this is a reg-reg copy. 
+ unsigned Opc = MI->getOpcode(); + + MachineInstr *NewMI = NULL; + switch(Opc) { + default: + break; + case Alpha::BISr: + case Alpha::CPYSS: + case Alpha::CPYST: + if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + if (OpNum == 0) { // move -> store + unsigned InReg = MI->getOperand(1).getReg(); + Opc = (Opc == Alpha::BISr) ? Alpha::STQ : + ((Opc == Alpha::CPYSS) ? Alpha::STS : Alpha::STT); + NewMI = BuildMI(TII.get(Opc)).addReg(InReg).addFrameIndex(FrameIndex) + .addReg(Alpha::F31); + } else { // load -> move + unsigned OutReg = MI->getOperand(0).getReg(); + Opc = (Opc == Alpha::BISr) ? Alpha::LDQ : + ((Opc == Alpha::CPYSS) ? Alpha::LDS : Alpha::LDT); + NewMI = BuildMI(TII.get(Opc), OutReg).addFrameIndex(FrameIndex) + .addReg(Alpha::F31); + } + } + break; + } + if (NewMI) + NewMI->copyKillDeadInfo(MI); + return 0; +} + + +void AlphaRegisterInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const { + //cerr << "copyRegToReg " << DestReg << " <- " << SrcReg << "\n"; + if (RC == Alpha::GPRCRegisterClass) { + BuildMI(MBB, MI, TII.get(Alpha::BISr), DestReg).addReg(SrcReg).addReg(SrcReg); + } else if (RC == Alpha::F4RCRegisterClass) { + BuildMI(MBB, MI, TII.get(Alpha::CPYSS), DestReg).addReg(SrcReg).addReg(SrcReg); + } else if (RC == Alpha::F8RCRegisterClass) { + BuildMI(MBB, MI, TII.get(Alpha::CPYST), DestReg).addReg(SrcReg).addReg(SrcReg); + } else { + cerr << "Attempt to copy register that is not GPR or FPR"; + abort(); + } +} + +void AlphaRegisterInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, + const MachineInstr *Orig) const { + MachineInstr *MI = Orig->clone(); + MI->getOperand(0).setReg(DestReg); + MBB.insert(I, MI); +} + +const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) + const { + static const unsigned CalleeSavedRegs[] = { + Alpha::R9, Alpha::R10, + Alpha::R11, 
Alpha::R12,
    Alpha::R13, Alpha::R14,
    Alpha::F2, Alpha::F3,
    Alpha::F4, Alpha::F5,
    Alpha::F6, Alpha::F7,
    Alpha::F8, Alpha::F9, 0   // null terminator expected by callers
  };
  return CalleeSavedRegs;
}

// Register classes for the callee-saved list above, one entry per register
// in the same order (six GPRC entries for R9-R14, eight F8RC entries for
// F2-F9), null-terminated.
const TargetRegisterClass* const*
AlphaRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass, 0
  };
  return CalleeSavedRegClasses;
}

// Registers the allocator must never use: R15 (frame pointer), R30 (stack
// pointer) and R31 (architectural zero register).
BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(Alpha::R15);
  Reserved.set(Alpha::R30);
  Reserved.set(Alpha::R31);
  return Reserved;
}

//===----------------------------------------------------------------------===//
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//

// hasFP - Return true if the specified function should have a dedicated frame
// pointer register. This is true if the function has variable sized allocas or
// if frame pointer elimination is disabled.
// NOTE(review): the code only checks for variable-sized objects; the
// "frame pointer elimination is disabled" case mentioned above is NOT
// handled -- confirm whether that is intentional.
bool AlphaRegisterInfo::hasFP(const MachineFunction &MF) const {
  MachineFrameInfo *MFI = MF.getFrameInfo();
  return MFI->hasVarSizedObjects();
}

// Lower the ADJUSTSTACKDOWN/ADJUSTSTACKUP pseudos.  With a frame pointer the
// stack must really be adjusted around each call; without one the pseudos
// are simply deleted.
void AlphaRegisterInfo::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const {
  if (hasFP(MF)) {
    // If we have a frame pointer, turn the adjcallstackup instruction into a
    // 'sub ESP, <amt>' and the adjcallstackdown instruction into 'add ESP,
    // <amt>'
    MachineInstr *Old = I;
    uint64_t Amount = Old->getOperand(0).getImmedValue();
    if (Amount != 0) {
      // We need to keep the stack aligned properly.
To do this, we round the
      // amount of space needed for the outgoing arguments up to the next
      // alignment boundary.
      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
      Amount = (Amount+Align-1)/Align*Align;

      MachineInstr *New;
      if (Old->getOpcode() == Alpha::ADJUSTSTACKDOWN) {
        // LDA r30, -Amount(r30): grow the stack for the call frame.
        New=BuildMI(TII.get(Alpha::LDA), Alpha::R30)
          .addImm(-Amount).addReg(Alpha::R30);
      } else {
        assert(Old->getOpcode() == Alpha::ADJUSTSTACKUP);
        // LDA r30, +Amount(r30): release the call-frame space.
        New=BuildMI(TII.get(Alpha::LDA), Alpha::R30)
          .addImm(Amount).addReg(Alpha::R30);
      }

      // Replace the pseudo instruction with a new instruction...
      MBB.insert(I, New);
    }
  }

  // The pseudo itself is always removed.
  MBB.erase(I);
}

//Alpha has a slightly funny stack:
//Args
//<- incoming SP
//fixed locals (and spills, callee saved, etc)
//<- FP
//variable locals
//<- SP

// Rewrite an abstract frame-index operand into a concrete base register plus
// immediate offset.  The operand following the frame index is assumed to be
// the base-register placeholder emitted by storeRegToStackSlot et al.
void AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                            int SPAdj, RegScavenger *RS) const {
  assert(SPAdj == 0 && "Unexpected");

  unsigned i = 0;
  MachineInstr &MI = *II;
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  bool FP = hasFP(MF);

  // Locate the frame-index operand; asserts if the instruction has none.
  while (!MI.getOperand(i).isFrameIndex()) {
    ++i;
    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
  }

  int FrameIndex = MI.getOperand(i).getFrameIndex();

  // Add the base register of R30 (SP) or R15 (FP).
  MI.getOperand(i + 1).ChangeToRegister(FP ? Alpha::R15 : Alpha::R30, false);

  // Now add the frame object offset to the offset from the virtual frame index.
  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);

  DOUT << "FI: " << FrameIndex << " Offset: " << Offset << "\n";

  // Object offsets are recorded relative to the incoming SP; bias by the
  // frame size so they are relative to the adjusted SP/FP.
  Offset += MF.getFrameInfo()->getStackSize();

  DOUT << "Corrected Offset " << Offset
       << " for stack size: " << MF.getFrameInfo()->getStackSize() << "\n";

  if (Offset > IMM_HIGH || Offset < IMM_LOW) {
    // Offset does not fit in a signed 16-bit displacement: materialize the
    // high half into scratch register R28 with an LDAH and leave only the
    // low half on the instruction itself.
    DOUT << "Unconditionally using R28 for evil purposes Offset: "
         << Offset << "\n";
    //so in this case, we need to use a temporary register, and move the
    //original inst off the SP/FP
    //fix up the old:
    MI.getOperand(i + 1).ChangeToRegister(Alpha::R28, false);
    MI.getOperand(i).ChangeToImmediate(getLower16(Offset));
    //insert the new
    MachineInstr* nMI=BuildMI(TII.get(Alpha::LDAH), Alpha::R28)
      .addImm(getUpper16(Offset)).addReg(FP ? Alpha::R15 : Alpha::R30);
    MBB.insert(II, nMI);
  } else {
    MI.getOperand(i).ChangeToImmediate(Offset);
  }
}


// Emit the function prologue: establish the GP in R29 (LDAHg/LDAg pair tied
// together by a per-function gp-distance cookie), emit the ALTENT marker for
// the function, then allocate the frame (continued below) and, when needed,
// save and establish the FP.
void AlphaRegisterInfo::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool FP = hasFP(MF);

  // NOTE(review): function-local static makes codegen stateful -- each
  // prologue draws a fresh id pairing its LDAHg/LDAg relocations.  Not
  // reentrant; confirm this is acceptable for concurrent/JIT use.
  static int curgpdist = 0;

  //handle GP (global pointer) offset
  BuildMI(MBB, MBBI, TII.get(Alpha::LDAHg), Alpha::R29)
    .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
    .addReg(Alpha::R27).addImm(++curgpdist);
  BuildMI(MBB, MBBI, TII.get(Alpha::LDAg), Alpha::R29)
    .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
    .addReg(Alpha::R29).addImm(curgpdist);

  //evil const_cast until MO stuff setup to handle const
  BuildMI(MBB, MBBI, TII.get(Alpha::ALTENT))
    .addGlobalAddress(const_cast<Function*>(MF.getFunction()));

  // Get the number of bytes to allocate from the FrameInfo
  long NumBytes = MFI->getStackSize();

  if (FP)
    NumBytes += 8;   //reserve space for the old FP

  // Do we need to allocate space on the stack?
+ if (NumBytes == 0) return; + + unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + NumBytes = (NumBytes+Align-1)/Align*Align; + + // Update frame info to pretend that this is part of the stack... + MFI->setStackSize(NumBytes); + + // adjust stack pointer: r30 -= numbytes + NumBytes = -NumBytes; + if (NumBytes >= IMM_LOW) { + BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes) + .addReg(Alpha::R30); + } else if (getUpper16(NumBytes) >= IMM_LOW) { + BuildMI(MBB, MBBI, TII.get(Alpha::LDAH), Alpha::R30).addImm(getUpper16(NumBytes)) + .addReg(Alpha::R30); + BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30).addImm(getLower16(NumBytes)) + .addReg(Alpha::R30); + } else { + cerr << "Too big a stack frame at " << NumBytes << "\n"; + abort(); + } + + //now if we need to, save the old FP and set the new + if (FP) + { + BuildMI(MBB, MBBI, TII.get(Alpha::STQ)) + .addReg(Alpha::R15).addImm(0).addReg(Alpha::R30); + //this must be the last instr in the prolog + BuildMI(MBB, MBBI, TII.get(Alpha::BISr), Alpha::R15) + .addReg(Alpha::R30).addReg(Alpha::R30); + } + +} + +void AlphaRegisterInfo::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + assert(MBBI->getOpcode() == Alpha::RETDAG || + MBBI->getOpcode() == Alpha::RETDAGp + && "Can only insert epilog into returning blocks"); + + bool FP = hasFP(MF); + + // Get the number of bytes allocated from the FrameInfo... 
  long NumBytes = MFI->getStackSize();

  //now if we need to, restore the old FP
  if (FP)
  {
    //copy the FP into the SP (discards allocas)
    BuildMI(MBB, MBBI, TII.get(Alpha::BISr), Alpha::R30).addReg(Alpha::R15)
      .addReg(Alpha::R15);
    //restore the FP
    BuildMI(MBB, MBBI, TII.get(Alpha::LDQ), Alpha::R15).addImm(0).addReg(Alpha::R15);
  }

  // Deallocate the frame: mirror of the prologue's LDA / LDAH+LDA sequence,
  // with the amount positive this time.
  if (NumBytes != 0)
  {
    if (NumBytes <= IMM_HIGH) {
      BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes)
        .addReg(Alpha::R30);
    } else if (getUpper16(NumBytes) <= IMM_HIGH) {
      BuildMI(MBB, MBBI, TII.get(Alpha::LDAH), Alpha::R30)
        .addImm(getUpper16(NumBytes)).addReg(Alpha::R30);
      BuildMI(MBB, MBBI, TII.get(Alpha::LDA), Alpha::R30)
        .addImm(getLower16(NumBytes)).addReg(Alpha::R30);
    } else {
      cerr << "Too big a stack frame at " << NumBytes << "\n";
      abort();
    }
  }
}

// NOTE(review): per the register comments in AlphaRegisterInfo.td, $26 is
// the return-address register, so this could plausibly return Alpha::R26
// instead of asserting -- confirm before wiring up debug info.
unsigned AlphaRegisterInfo::getRARegister() const {
  assert(0 && "What is the return address register");
  return 0;
}

// Frame base register for debug info: FP (R15) when the function keeps one,
// otherwise SP (R30).
unsigned AlphaRegisterInfo::getFrameRegister(MachineFunction &MF) const {
  return hasFP(MF) ? Alpha::R15 : Alpha::R30;
}

// Exception-handling registers are not defined for Alpha yet.
unsigned AlphaRegisterInfo::getEHExceptionRegister() const {
  assert(0 && "What is the exception register");
  return 0;
}

unsigned AlphaRegisterInfo::getEHHandlerRegister() const {
  assert(0 && "What is the exception handler register");
  return 0;
}

#include "AlphaGenRegisterInfo.inc"

// Human-readable register name, read from the tablegen-generated
// RegisterDescriptors table.
std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
{
  std::string s(RegisterDescriptors[reg].Name);
  return s;
}
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
new file mode 100644
index 0000000..2872e59
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -0,0 +1,85 @@
//===- AlphaRegisterInfo.h - Alpha Register Information Impl ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by the LLVM research group and is distributed under
// the University of Illinois Open Source License.
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Alpha implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef ALPHAREGISTERINFO_H +#define ALPHAREGISTERINFO_H + +#include "llvm/Target/MRegisterInfo.h" +#include "AlphaGenRegisterInfo.h.inc" + +namespace llvm { + +class TargetInstrInfo; +class Type; + +struct AlphaRegisterInfo : public AlphaGenRegisterInfo { + const TargetInstrInfo &TII; + + AlphaRegisterInfo(const TargetInstrInfo &tii); + + /// Code Generation virtual methods... + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + MachineInstr* foldMemoryOperand(MachineInstr *MI, unsigned OpNum, + int FrameIndex) const; + + void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, const MachineInstr *Orig) const; + + const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + const TargetRegisterClass* const* getCalleeSavedRegClasses( + const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; + + //void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; + + void 
emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; + + static std::string getPrettyName(unsigned reg); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Alpha/AlphaRegisterInfo.td b/lib/Target/Alpha/AlphaRegisterInfo.td new file mode 100644 index 0000000..9855ce2 --- /dev/null +++ b/lib/Target/Alpha/AlphaRegisterInfo.td @@ -0,0 +1,171 @@ +//===- AlphaRegisterInfo.td - The Alpha Register File ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Alpha register set. +// +//===----------------------------------------------------------------------===// + +class AlphaReg<string n> : Register<n> { + field bits<5> Num; + let Namespace = "Alpha"; +} + +// We identify all our registers with a 5-bit ID, for consistency's sake. 
+ +// GPR - One of the 32 32-bit general-purpose registers +class GPR<bits<5> num, string n> : AlphaReg<n> { + let Num = num; +} + +// FPR - One of the 32 64-bit floating-point registers +class FPR<bits<5> num, string n> : AlphaReg<n> { + let Num = num; +} + +//#define FP $15 +//#define RA $26 +//#define PV $27 +//#define GP $29 +//#define SP $30 + +// General-purpose registers +def R0 : GPR< 0, "$0">, DwarfRegNum<0>; +def R1 : GPR< 1, "$1">, DwarfRegNum<1>; +def R2 : GPR< 2, "$2">, DwarfRegNum<2>; +def R3 : GPR< 3, "$3">, DwarfRegNum<3>; +def R4 : GPR< 4, "$4">, DwarfRegNum<4>; +def R5 : GPR< 5, "$5">, DwarfRegNum<5>; +def R6 : GPR< 6, "$6">, DwarfRegNum<6>; +def R7 : GPR< 7, "$7">, DwarfRegNum<7>; +def R8 : GPR< 8, "$8">, DwarfRegNum<8>; +def R9 : GPR< 9, "$9">, DwarfRegNum<9>; +def R10 : GPR<10, "$10">, DwarfRegNum<10>; +def R11 : GPR<11, "$11">, DwarfRegNum<11>; +def R12 : GPR<12, "$12">, DwarfRegNum<12>; +def R13 : GPR<13, "$13">, DwarfRegNum<13>; +def R14 : GPR<14, "$14">, DwarfRegNum<14>; +def R15 : GPR<15, "$15">, DwarfRegNum<15>; +def R16 : GPR<16, "$16">, DwarfRegNum<16>; +def R17 : GPR<17, "$17">, DwarfRegNum<17>; +def R18 : GPR<18, "$18">, DwarfRegNum<18>; +def R19 : GPR<19, "$19">, DwarfRegNum<19>; +def R20 : GPR<20, "$20">, DwarfRegNum<20>; +def R21 : GPR<21, "$21">, DwarfRegNum<21>; +def R22 : GPR<22, "$22">, DwarfRegNum<22>; +def R23 : GPR<23, "$23">, DwarfRegNum<23>; +def R24 : GPR<24, "$24">, DwarfRegNum<24>; +def R25 : GPR<25, "$25">, DwarfRegNum<25>; +def R26 : GPR<26, "$26">, DwarfRegNum<26>; +def R27 : GPR<27, "$27">, DwarfRegNum<27>; +def R28 : GPR<28, "$28">, DwarfRegNum<28>; +def R29 : GPR<29, "$29">, DwarfRegNum<29>; +def R30 : GPR<30, "$30">, DwarfRegNum<30>; +def R31 : GPR<31, "$31">, DwarfRegNum<31>; + +// Floating-point registers +def F0 : FPR< 0, "$f0">, DwarfRegNum<33>; +def F1 : FPR< 1, "$f1">, DwarfRegNum<34>; +def F2 : FPR< 2, "$f2">, DwarfRegNum<35>; +def F3 : FPR< 3, "$f3">, DwarfRegNum<36>; +def F4 : FPR< 4, "$f4">, 
DwarfRegNum<37>; +def F5 : FPR< 5, "$f5">, DwarfRegNum<38>; +def F6 : FPR< 6, "$f6">, DwarfRegNum<39>; +def F7 : FPR< 7, "$f7">, DwarfRegNum<40>; +def F8 : FPR< 8, "$f8">, DwarfRegNum<41>; +def F9 : FPR< 9, "$f9">, DwarfRegNum<42>; +def F10 : FPR<10, "$f10">, DwarfRegNum<43>; +def F11 : FPR<11, "$f11">, DwarfRegNum<44>; +def F12 : FPR<12, "$f12">, DwarfRegNum<45>; +def F13 : FPR<13, "$f13">, DwarfRegNum<46>; +def F14 : FPR<14, "$f14">, DwarfRegNum<47>; +def F15 : FPR<15, "$f15">, DwarfRegNum<48>; +def F16 : FPR<16, "$f16">, DwarfRegNum<49>; +def F17 : FPR<17, "$f17">, DwarfRegNum<50>; +def F18 : FPR<18, "$f18">, DwarfRegNum<51>; +def F19 : FPR<19, "$f19">, DwarfRegNum<52>; +def F20 : FPR<20, "$f20">, DwarfRegNum<53>; +def F21 : FPR<21, "$f21">, DwarfRegNum<54>; +def F22 : FPR<22, "$f22">, DwarfRegNum<55>; +def F23 : FPR<23, "$f23">, DwarfRegNum<56>; +def F24 : FPR<24, "$f24">, DwarfRegNum<57>; +def F25 : FPR<25, "$f25">, DwarfRegNum<58>; +def F26 : FPR<26, "$f26">, DwarfRegNum<59>; +def F27 : FPR<27, "$f27">, DwarfRegNum<60>; +def F28 : FPR<28, "$f28">, DwarfRegNum<61>; +def F29 : FPR<29, "$f29">, DwarfRegNum<62>; +def F30 : FPR<30, "$f30">, DwarfRegNum<63>; +def F31 : FPR<31, "$f31">, DwarfRegNum<64>; + + // //#define FP $15 + // //#define RA $26 + // //#define PV $27 + // //#define GP $29 + // //#define SP $30 + // $28 is undefined after any and all calls + +/// Register classes +def GPRC : RegisterClass<"Alpha", [i64], 64, + // Volatile + [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22, + R23, R24, R25, R28, + //Special meaning, but volatile + R27, //procedure address + R26, //return address + R29, //global offset table address + // Non-volatile + R9, R10, R11, R12, R13, R14, +// Don't allocate 15, 30, 31 + R15, R30, R31 ]> //zero +{ + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + GPRCClass::iterator + GPRCClass::allocation_order_end(const MachineFunction &MF) const 
{ + return end()-3; + } + }]; +} + +def F4RC : RegisterClass<"Alpha", [f32], 64, [F0, F1, + F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, + F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, + // Saved: + F2, F3, F4, F5, F6, F7, F8, F9, + F31 ]> //zero +{ + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + F4RCClass::iterator + F4RCClass::allocation_order_end(const MachineFunction &MF) const { + return end()-1; + } + }]; +} + +def F8RC : RegisterClass<"Alpha", [f64], 64, [F0, F1, + F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, + F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, + // Saved: + F2, F3, F4, F5, F6, F7, F8, F9, + F31 ]> //zero +{ + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + F8RCClass::iterator + F8RCClass::allocation_order_end(const MachineFunction &MF) const { + return end()-1; + } + }]; +} diff --git a/lib/Target/Alpha/AlphaRelocations.h b/lib/Target/Alpha/AlphaRelocations.h new file mode 100644 index 0000000..c532f21 --- /dev/null +++ b/lib/Target/Alpha/AlphaRelocations.h @@ -0,0 +1,31 @@ +//===- AlphaRelocations.h - Alpha Code Relocations --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the Alpha target-specific relocation types. 
//
//===----------------------------------------------------------------------===//

#ifndef ALPHARELOCATIONS_H
#define ALPHARELOCATIONS_H

#include "llvm/CodeGen/MachineRelocation.h"

namespace llvm {
  namespace Alpha {
    // Relocation kinds used by the Alpha code emitter.  The names appear to
    // follow the Alpha assembler relocation operators (GOT literal,
    // gp-relative low/high halves, gp-displacement pair, branch
    // displacement) -- TODO confirm against AlphaCodeEmitter/AlphaJITInfo.
    enum RelocationType {
      reloc_literal,
      reloc_gprellow,
      reloc_gprelhigh,
      reloc_gpdist,
      reloc_bsr
    };
  }
}

#endif
diff --git a/lib/Target/Alpha/AlphaSchedule.td b/lib/Target/Alpha/AlphaSchedule.td
new file mode 100644
index 0000000..b3aab97
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSchedule.td
@@ -0,0 +1,84 @@
//===- AlphaSchedule.td - Alpha Scheduling Definitions -----*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Andrew Lenharth and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

//This is table 2-2 from the 21264 compiler writers guide
//modified some

//Pipelines (functional units of the 21264)

def L0 : FuncUnit;
def L1 : FuncUnit;
def FST0 : FuncUnit;
def FST1 : FuncUnit;
def U0 : FuncUnit;
def U1 : FuncUnit;
def FA : FuncUnit;
def FM : FuncUnit;

// Itinerary classes, one per broad instruction category.
def s_ild   : InstrItinClass;
def s_fld   : InstrItinClass;
def s_ist   : InstrItinClass;
def s_fst   : InstrItinClass;
def s_lda   : InstrItinClass;
def s_rpcc  : InstrItinClass;
def s_rx    : InstrItinClass;
def s_mxpr  : InstrItinClass;
def s_icbr  : InstrItinClass;
def s_ubr   : InstrItinClass;
def s_jsr   : InstrItinClass;
def s_iadd  : InstrItinClass;
def s_ilog  : InstrItinClass;
def s_ishf  : InstrItinClass;
def s_cmov  : InstrItinClass;
def s_imul  : InstrItinClass;
def s_imisc : InstrItinClass;
def s_fbr   : InstrItinClass;
def s_fadd  : InstrItinClass;
def s_fmul  : InstrItinClass;
def s_fcmov : InstrItinClass;
def s_fdivt : InstrItinClass;
def s_fdivs : InstrItinClass;
def s_fsqrts: InstrItinClass;
def s_fsqrtt: InstrItinClass;
def s_ftoi  : InstrItinClass;
def s_itof  : InstrItinClass;
def
s_pseudo : InstrItinClass; + +//Table 24 Instruction Class Latency in Cycles +//modified some + +def Alpha21264Itineraries : ProcessorItineraries<[ + InstrItinData<s_ild , [InstrStage<3, [L0, L1]>]>, + InstrItinData<s_fld , [InstrStage<4, [L0, L1]>]>, + InstrItinData<s_ist , [InstrStage<0, [L0, L1]>]>, + InstrItinData<s_fst , [InstrStage<0, [FST0, FST1, L0, L1]>]>, + InstrItinData<s_lda , [InstrStage<1, [L0, L1, U0, U1]>]>, + InstrItinData<s_rpcc , [InstrStage<1, [L1]>]>, + InstrItinData<s_rx , [InstrStage<1, [L1]>]>, + InstrItinData<s_mxpr , [InstrStage<1, [L0, L1]>]>, + InstrItinData<s_icbr , [InstrStage<0, [U0, U1]>]>, + InstrItinData<s_ubr , [InstrStage<3, [U0, U1]>]>, + InstrItinData<s_jsr , [InstrStage<3, [L0]>]>, + InstrItinData<s_iadd , [InstrStage<1, [L0, U0, L1, U1]>]>, + InstrItinData<s_ilog , [InstrStage<1, [L0, U0, L1, U1]>]>, + InstrItinData<s_ishf , [InstrStage<1, [U0, U1]>]>, + InstrItinData<s_cmov , [InstrStage<1, [L0, U0, L1, U1]>]>, + InstrItinData<s_imul , [InstrStage<7, [U1]>]>, + InstrItinData<s_imisc , [InstrStage<3, [U0]>]>, + InstrItinData<s_fbr , [InstrStage<0, [FA]>]>, + InstrItinData<s_fadd , [InstrStage<6, [FA]>]>, + InstrItinData<s_fmul , [InstrStage<6, [FM]>]>, + InstrItinData<s_fcmov , [InstrStage<6, [FA]>]>, + InstrItinData<s_fdivs , [InstrStage<12, [FA]>]>, + InstrItinData<s_fdivt , [InstrStage<15, [FA]>]>, + InstrItinData<s_fsqrts , [InstrStage<18, [FA]>]>, + InstrItinData<s_fsqrtt , [InstrStage<33, [FA]>]>, + InstrItinData<s_ftoi , [InstrStage<3, [FST0, FST1, L0, L1]>]>, + InstrItinData<s_itof , [InstrStage<4, [L0, L1]>]> +]>; diff --git a/lib/Target/Alpha/AlphaSubtarget.cpp b/lib/Target/Alpha/AlphaSubtarget.cpp new file mode 100644 index 0000000..4b7d612 --- /dev/null +++ b/lib/Target/Alpha/AlphaSubtarget.cpp @@ -0,0 +1,25 @@ +//===- AlphaSubtarget.cpp - Alpha Subtarget Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Andrew Lenharth and is distributed under the +// 
University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Alpha specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AlphaSubtarget.h" +#include "Alpha.h" +#include "AlphaGenSubtarget.inc" +using namespace llvm; + +AlphaSubtarget::AlphaSubtarget(const Module &M, const std::string &FS) + : HasCT(false) { + std::string CPU = "generic"; + + // Parse features string. + ParseSubtargetFeatures(FS, CPU); +} diff --git a/lib/Target/Alpha/AlphaSubtarget.h b/lib/Target/Alpha/AlphaSubtarget.h new file mode 100644 index 0000000..3fb95ad --- /dev/null +++ b/lib/Target/Alpha/AlphaSubtarget.h @@ -0,0 +1,46 @@ +//=====-- AlphaSubtarget.h - Define Subtarget for the Alpha --*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Andrew Lenharth and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Alpha specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef ALPHASUBTARGET_H +#define ALPHASUBTARGET_H + +#include "llvm/Target/TargetInstrItineraries.h" +#include "llvm/Target/TargetSubtarget.h" + +#include <string> + +namespace llvm { +class Module; + +class AlphaSubtarget : public TargetSubtarget { +protected: + + bool HasCT; + + InstrItineraryData InstrItins; + +public: + /// This constructor initializes the data members to match that + /// of the specified module. + /// + AlphaSubtarget(const Module &M, const std::string &FS); + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. 
+ void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + + bool hasCT() const { return HasCT; } +}; +} // End llvm namespace + +#endif diff --git a/lib/Target/Alpha/AlphaTargetAsmInfo.cpp b/lib/Target/Alpha/AlphaTargetAsmInfo.cpp new file mode 100644 index 0000000..233d2c7 --- /dev/null +++ b/lib/Target/Alpha/AlphaTargetAsmInfo.cpp @@ -0,0 +1,24 @@ +//===-- AlphaTargetAsmInfo.cpp - Alpha asm properties -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the AlphaTargetAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "AlphaTargetAsmInfo.h" + +using namespace llvm; + +AlphaTargetAsmInfo::AlphaTargetAsmInfo(const AlphaTargetMachine &TM) { + AlignmentIsInBytes = false; + PrivateGlobalPrefix = "$"; + JumpTableDirective = ".gprel32"; + JumpTableDataSection = "\t.section .rodata\n"; + WeakRefDirective = "\t.weak\t"; +} diff --git a/lib/Target/Alpha/AlphaTargetAsmInfo.h b/lib/Target/Alpha/AlphaTargetAsmInfo.h new file mode 100644 index 0000000..c8b4fd5 --- /dev/null +++ b/lib/Target/Alpha/AlphaTargetAsmInfo.h @@ -0,0 +1,30 @@ +//=====-- AlphaTargetAsmInfo.h - Alpha asm properties ---------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the AlphaTargetAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef ALPHATARGETASMINFO_H +#define ALPHATARGETASMINFO_H + +#include "llvm/Target/TargetAsmInfo.h" + +namespace llvm { + + // Forward declaration. + class AlphaTargetMachine; + + struct AlphaTargetAsmInfo : public TargetAsmInfo { + AlphaTargetAsmInfo(const AlphaTargetMachine &TM); + }; + +} // namespace llvm + +#endif diff --git a/lib/Target/Alpha/AlphaTargetMachine.cpp b/lib/Target/Alpha/AlphaTargetMachine.cpp new file mode 100644 index 0000000..d4137a5 --- /dev/null +++ b/lib/Target/Alpha/AlphaTargetMachine.cpp @@ -0,0 +1,97 @@ +//===-- AlphaTargetMachine.cpp - Define TargetMachine for Alpha -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Alpha.h" +#include "AlphaJITInfo.h" +#include "AlphaTargetAsmInfo.h" +#include "AlphaTargetMachine.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/Target/TargetMachineRegistry.h" + +using namespace llvm; + +namespace { + // Register the targets + RegisterTarget<AlphaTargetMachine> X("alpha", " Alpha (incomplete)"); +} + +const TargetAsmInfo *AlphaTargetMachine::createTargetAsmInfo() const { + return new AlphaTargetAsmInfo(*this); +} + +unsigned AlphaTargetMachine::getModuleMatchQuality(const Module &M) { + // We strongly match "alpha*". + std::string TT = M.getTargetTriple(); + if (TT.size() >= 5 && TT[0] == 'a' && TT[1] == 'l' && TT[2] == 'p' && + TT[3] == 'h' && TT[4] == 'a') + return 20; + // If the target triple is something non-alpha, we don't match. 
  if (!TT.empty()) return 0;

  // No triple: fall back to module-level hints.  64-bit little-endian looks
  // like Alpha; any other explicit endianness/pointer size is some other
  // target.
  if (M.getEndianness()  == Module::LittleEndian &&
      M.getPointerSize() == Module::Pointer64)
    return 10;                                   // Weak match
  else if (M.getEndianness() != Module::AnyEndianness ||
           M.getPointerSize() != Module::AnyPointerSize)
    return 0;                                    // Match for some other target

  return getJITMatchQuality()/2;
}

// The JIT matches only when the host itself is an Alpha.
unsigned AlphaTargetMachine::getJITMatchQuality() {
#ifdef __alpha
  return 10;
#else
  return 0;
#endif
}

// "e" = little-endian data layout; the stack grows down with 16-byte
// alignment.  Alpha code is always generated position-independent here.
AlphaTargetMachine::AlphaTargetMachine(const Module &M, const std::string &FS)
  : DataLayout("e"),
    FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
    JITInfo(*this),
    Subtarget(M, FS),
    TLInfo(*this) {
  setRelocationModel(Reloc::PIC_);
}


//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//

bool AlphaTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
  PM.add(createAlphaISelDag(*this));
  return false;
}
bool AlphaTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) {
  // Must run branch selection immediately preceding the asm printer
  PM.add(createAlphaBranchSelectionPass());
  return false;
}
bool AlphaTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast,
                                            std::ostream &Out) {
  PM.add(createAlphaLLRPPass(*this));
  PM.add(createAlphaCodePrinterPass(Out, *this));
  return false;
}
bool AlphaTargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast,
                                        MachineCodeEmitter &MCE) {
  PM.add(createAlphaCodeEmitterPass(*this, MCE));
  return false;
}
// The "simple" emitter path is identical to the normal one for Alpha.
bool AlphaTargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM,
                                              bool Fast,
                                              MachineCodeEmitter &MCE) {
  return addCodeEmitter(PM, Fast, MCE);
}
diff --git a/lib/Target/Alpha/AlphaTargetMachine.h b/lib/Target/Alpha/AlphaTargetMachine.h
new file mode 100644
index 0000000..5a57f63
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetMachine.h
@@ -0,0
+1,73 @@ +//===-- AlphaTargetMachine.h - Define TargetMachine for Alpha ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Alpha-specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef ALPHA_TARGETMACHINE_H +#define ALPHA_TARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "AlphaInstrInfo.h" +#include "AlphaJITInfo.h" +#include "AlphaISelLowering.h" +#include "AlphaSubtarget.h" + +namespace llvm { + +class GlobalValue; + +class AlphaTargetMachine : public LLVMTargetMachine { + const TargetData DataLayout; // Calculates type size & alignment + AlphaInstrInfo InstrInfo; + TargetFrameInfo FrameInfo; + AlphaJITInfo JITInfo; + AlphaSubtarget Subtarget; + AlphaTargetLowering TLInfo; + +protected: + virtual const TargetAsmInfo *createTargetAsmInfo() const; + +public: + AlphaTargetMachine(const Module &M, const std::string &FS); + + virtual const AlphaInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; } + virtual const TargetSubtarget *getSubtargetImpl() const{ return &Subtarget; } + virtual const MRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + virtual TargetLowering* getTargetLowering() const { + return const_cast<AlphaTargetLowering*>(&TLInfo); + } + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual TargetJITInfo* getJITInfo() { + return &JITInfo; + } + + static unsigned getJITMatchQuality(); + static unsigned getModuleMatchQuality(const Module &M); + + // Pass Pipeline 
Configuration + virtual bool addInstSelector(FunctionPassManager &PM, bool Fast); + virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast); + virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out); + virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); + virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Alpha/Makefile b/lib/Target/Alpha/Makefile new file mode 100644 index 0000000..bb9895a --- /dev/null +++ b/lib/Target/Alpha/Makefile @@ -0,0 +1,20 @@ +##===- lib/Target/Alpha/Makefile -------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by the LLVM research group and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMAlpha +TARGET = Alpha + +# Make sure that tblgen is run, first thing. 
BUILT_SOURCES = AlphaGenRegisterInfo.h.inc AlphaGenRegisterNames.inc \
                AlphaGenRegisterInfo.inc AlphaGenInstrNames.inc \
                AlphaGenInstrInfo.inc AlphaGenCodeEmitter.inc \
                AlphaGenAsmWriter.inc AlphaGenDAGISel.inc \
                AlphaGenSubtarget.inc

include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt
new file mode 100644
index 0000000..9ae1517
--- /dev/null
+++ b/lib/Target/Alpha/README.txt
@@ -0,0 +1,42 @@
***

add gcc builtins for alpha instructions


***

custom expand byteswap into nifty
extract/insert/mask byte/word/longword/quadword low/high
sequences

***

see if any of the extract/insert/mask operations can be added

***

match more interesting things for cmovlbc cmovlbs (move if low bit clear/set)

***

lower srem and urem

remq(i,j):  i - (j * divq(i,j)) if j != 0
remqu(i,j): i - (j * divqu(i,j)) if j != 0
reml(i,j):  i - (j * divl(i,j)) if j != 0
remlu(i,j): i - (j * divlu(i,j)) if j != 0

***

add crazy vector instructions (MVI):

(MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word
PKWB, UNPKBW pack/unpack word to byte
PKLB UNPKBL pack/unpack long to byte
PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b))

cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extensions)

this has some good examples for other operations that can be synthesized well
from these rather meager vector ops (such as saturating add).
http://www.alphalinux.org/docs/MVI-full.html
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
new file mode 100644
index 0000000..b0c76c8
--- /dev/null
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -0,0 +1,2930 @@
//===-- CBackend.cpp - Library for converting LLVM code to C --------------===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by the LLVM research group and is distributed under
// the University of Illinois Open Source License.
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This library converts LLVM code to C code, compilable by GCC and other C +// compilers. +// +//===----------------------------------------------------------------------===// + +#include "CTargetMachine.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/Instructions.h" +#include "llvm/ParameterAttributes.h" +#include "llvm/Pass.h" +#include "llvm/PassManager.h" +#include "llvm/TypeSymbolTable.h" +#include "llvm/Intrinsics.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/InlineAsm.h" +#include "llvm/Analysis/ConstantsScanner.h" +#include "llvm/Analysis/FindUsedTypes.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Target/TargetMachineRegistry.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/Mangler.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Config/config.h" +#include <algorithm> +#include <sstream> +using namespace llvm; + +namespace { + // Register the target. + RegisterTarget<CTargetMachine> X("c", " C backend"); + + /// CBackendNameAllUsedStructsAndMergeFunctions - This pass inserts names for + /// any unnamed structure types that are used by the program, and merges + /// external functions with the same name. 
+ /// + class CBackendNameAllUsedStructsAndMergeFunctions : public ModulePass { + public: + static char ID; + CBackendNameAllUsedStructsAndMergeFunctions() + : ModulePass((intptr_t)&ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<FindUsedTypes>(); + } + + virtual const char *getPassName() const { + return "C backend type canonicalizer"; + } + + virtual bool runOnModule(Module &M); + }; + + char CBackendNameAllUsedStructsAndMergeFunctions::ID = 0; + + /// CWriter - This class is the main chunk of code that converts an LLVM + /// module to a C translation unit. + class CWriter : public FunctionPass, public InstVisitor<CWriter> { + std::ostream &Out; + IntrinsicLowering *IL; + Mangler *Mang; + LoopInfo *LI; + const Module *TheModule; + const TargetAsmInfo* TAsm; + const TargetData* TD; + std::map<const Type *, std::string> TypeNames; + std::map<const ConstantFP *, unsigned> FPConstantMap; + std::set<Function*> intrinsicPrototypesAlreadyGenerated; + + public: + static char ID; + CWriter(std::ostream &o) + : FunctionPass((intptr_t)&ID), Out(o), IL(0), Mang(0), LI(0), + TheModule(0), TAsm(0), TD(0) {} + + virtual const char *getPassName() const { return "C backend"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.setPreservesAll(); + } + + virtual bool doInitialization(Module &M); + + bool runOnFunction(Function &F) { + LI = &getAnalysis<LoopInfo>(); + + // Get rid of intrinsics we can't handle. + lowerIntrinsics(F); + + // Output all floating point constants that cannot be printed accurately. + printFloatingPointConstants(F); + + printFunction(F); + FPConstantMap.clear(); + return false; + } + + virtual bool doFinalization(Module &M) { + // Free memory... 
+ delete Mang; + TypeNames.clear(); + return false; + } + + std::ostream &printType(std::ostream &Out, const Type *Ty, + bool isSigned = false, + const std::string &VariableName = "", + bool IgnoreName = false); + std::ostream &printSimpleType(std::ostream &Out, const Type *Ty, + bool isSigned, + const std::string &NameSoFar = ""); + + void printStructReturnPointerFunctionType(std::ostream &Out, + const PointerType *Ty); + + void writeOperand(Value *Operand); + void writeOperandRaw(Value *Operand); + void writeOperandInternal(Value *Operand); + void writeOperandWithCast(Value* Operand, unsigned Opcode); + void writeOperandWithCast(Value* Operand, ICmpInst::Predicate predicate); + bool writeInstructionCast(const Instruction &I); + + private : + std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c); + + void lowerIntrinsics(Function &F); + + void printModule(Module *M); + void printModuleTypes(const TypeSymbolTable &ST); + void printContainedStructs(const Type *Ty, std::set<const StructType *> &); + void printFloatingPointConstants(Function &F); + void printFunctionSignature(const Function *F, bool Prototype); + + void printFunction(Function &); + void printBasicBlock(BasicBlock *BB); + void printLoop(Loop *L); + + void printCast(unsigned opcode, const Type *SrcTy, const Type *DstTy); + void printConstant(Constant *CPV); + void printConstantWithCast(Constant *CPV, unsigned Opcode); + bool printConstExprCast(const ConstantExpr *CE); + void printConstantArray(ConstantArray *CPA); + void printConstantVector(ConstantVector *CP); + + // isInlinableInst - Attempt to inline instructions into their uses to build + // trees as much as possible. To do this, we have to consistently decide + // what is acceptable to inline, so that variable declarations don't get + // printed and an extra copy of the expr is not emitted. + // + static bool isInlinableInst(const Instruction &I) { + // Always inline cmp instructions, even if they are shared by multiple + // expressions. 
GCC generates horrible code if we don't. + if (isa<CmpInst>(I)) + return true; + + // Must be an expression, must be used exactly once. If it is dead, we + // emit it inline where it would go. + if (I.getType() == Type::VoidTy || !I.hasOneUse() || + isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) || + isa<LoadInst>(I) || isa<VAArgInst>(I)) + // Don't inline a load across a store or other bad things! + return false; + + // Must not be used in inline asm + if (I.hasOneUse() && isInlineAsm(*I.use_back())) return false; + + // Only inline instruction it if it's use is in the same BB as the inst. + return I.getParent() == cast<Instruction>(I.use_back())->getParent(); + } + + // isDirectAlloca - Define fixed sized allocas in the entry block as direct + // variables which are accessed with the & operator. This causes GCC to + // generate significantly better code than to emit alloca calls directly. + // + static const AllocaInst *isDirectAlloca(const Value *V) { + const AllocaInst *AI = dyn_cast<AllocaInst>(V); + if (!AI) return false; + if (AI->isArrayAllocation()) + return 0; // FIXME: we can also inline fixed size array allocas! 
+ if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock()) + return 0; + return AI; + } + + // isInlineAsm - Check if the instruction is a call to an inline asm chunk + static bool isInlineAsm(const Instruction& I) { + if (isa<CallInst>(&I) && isa<InlineAsm>(I.getOperand(0))) + return true; + return false; + } + + // Instruction visitation functions + friend class InstVisitor<CWriter>; + + void visitReturnInst(ReturnInst &I); + void visitBranchInst(BranchInst &I); + void visitSwitchInst(SwitchInst &I); + void visitInvokeInst(InvokeInst &I) { + assert(0 && "Lowerinvoke pass didn't work!"); + } + + void visitUnwindInst(UnwindInst &I) { + assert(0 && "Lowerinvoke pass didn't work!"); + } + void visitUnreachableInst(UnreachableInst &I); + + void visitPHINode(PHINode &I); + void visitBinaryOperator(Instruction &I); + void visitICmpInst(ICmpInst &I); + void visitFCmpInst(FCmpInst &I); + + void visitCastInst (CastInst &I); + void visitSelectInst(SelectInst &I); + void visitCallInst (CallInst &I); + void visitInlineAsm(CallInst &I); + + void visitMallocInst(MallocInst &I); + void visitAllocaInst(AllocaInst &I); + void visitFreeInst (FreeInst &I); + void visitLoadInst (LoadInst &I); + void visitStoreInst (StoreInst &I); + void visitGetElementPtrInst(GetElementPtrInst &I); + void visitVAArgInst (VAArgInst &I); + + void visitInstruction(Instruction &I) { + cerr << "C Writer does not know about " << I; + abort(); + } + + void outputLValue(Instruction *I) { + Out << " " << GetValueName(I) << " = "; + } + + bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To); + void printPHICopiesForSuccessor(BasicBlock *CurBlock, + BasicBlock *Successor, unsigned Indent); + void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock, + unsigned Indent); + void printIndexingExpression(Value *Ptr, gep_type_iterator I, + gep_type_iterator E); + + std::string GetValueName(const Value *Operand); + }; +} + +char CWriter::ID = 0; + +/// This method inserts names for any 
/// unnamed structure types that are used by
/// the program, and removes names from structure types that are not used by the
/// program.
///
bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
  // Get a set of types that are used by the program...
  std::set<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();

  // Loop over the module symbol table, removing types from UT that are
  // already named, and removing names for types that are not used.
  //
  TypeSymbolTable &TST = M.getTypeSymbolTable();
  for (TypeSymbolTable::iterator TI = TST.begin(), TE = TST.end();
       TI != TE; ) {
    // Advance TI before any remove() so erasure doesn't invalidate the
    // iterator we're walking with.
    TypeSymbolTable::iterator I = TI++;

    // If this isn't a struct type, remove it from our set of types to name.
    // This simplifies emission later.
    if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second)) {
      TST.remove(I);
    } else {
      // If this is not used, remove it from the symbol table.
      std::set<const Type *>::iterator UTI = UT.find(I->second);
      if (UTI == UT.end())
        TST.remove(I);
      else
        UT.erase(UTI); // Only keep one name for this type.
    }
  }

  // UT now contains types that are not named.  Loop over it, naming
  // structure types.
  //
  bool Changed = false;
  unsigned RenameCounter = 0;
  for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end();
       I != E; ++I)
    if (const StructType *ST = dyn_cast<StructType>(*I)) {
      // Keep bumping the counter until a fresh "unnamedN" name is found
      // (addTypeName returns true while the name is already taken).
      while (M.addTypeName("unnamed"+utostr(RenameCounter), ST))
        ++RenameCounter;
      Changed = true;
    }


  // Loop over all external functions and globals.  If we have two with
  // identical names, merge them.
  // FIXME: This code should disappear when we don't allow values with the same
  // names when they have different types!
  std::map<std::string, GlobalValue*> ExtSymbols;
  for (Module::iterator I = M.begin(), E = M.end(); I != E;) {
    // Grab and advance before a possible eraseFromParent() below.
    Function *GV = I++;
    if (GV->isDeclaration() && GV->hasName()) {
      std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
        = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
      if (!X.second) {
        // Found a conflict, replace this global with the previous one.
        GlobalValue *OldGV = X.first->second;
        GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
        GV->eraseFromParent();
        Changed = true;
      }
    }
  }
  // Do the same for globals.
  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
       I != E;) {
    GlobalVariable *GV = I++;
    if (GV->isDeclaration() && GV->hasName()) {
      std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
        = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
      if (!X.second) {
        // Found a conflict, replace this global with the previous one.
        GlobalValue *OldGV = X.first->second;
        GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
        GV->eraseFromParent();
        Changed = true;
      }
    }
  }

  return Changed;
}

/// printStructReturnPointerFunctionType - This is like printType for a struct
/// return type, except, instead of printing the type as void (*)(Struct*, ...)
/// print it as "Struct (*)(...)", for struct return functions.
+void CWriter::printStructReturnPointerFunctionType(std::ostream &Out, + const PointerType *TheTy) { + const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType()); + std::stringstream FunctionInnards; + FunctionInnards << " (*) ("; + bool PrintedType = false; + + FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); + const Type *RetTy = cast<PointerType>(I->get())->getElementType(); + unsigned Idx = 1; + const ParamAttrsList *Attrs = FTy->getParamAttrs(); + for (++I; I != E; ++I) { + if (PrintedType) + FunctionInnards << ", "; + printType(FunctionInnards, *I, + /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt), ""); + PrintedType = true; + } + if (FTy->isVarArg()) { + if (PrintedType) + FunctionInnards << ", ..."; + } else if (!PrintedType) { + FunctionInnards << "void"; + } + FunctionInnards << ')'; + std::string tstr = FunctionInnards.str(); + printType(Out, RetTy, + /*isSigned=*/Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt), tstr); +} + +std::ostream & +CWriter::printSimpleType(std::ostream &Out, const Type *Ty, bool isSigned, + const std::string &NameSoFar) { + assert((Ty->isPrimitiveType() || Ty->isInteger()) && + "Invalid type for printSimpleType"); + switch (Ty->getTypeID()) { + case Type::VoidTyID: return Out << "void " << NameSoFar; + case Type::IntegerTyID: { + unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); + if (NumBits == 1) + return Out << "bool " << NameSoFar; + else if (NumBits <= 8) + return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar; + else if (NumBits <= 16) + return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar; + else if (NumBits <= 32) + return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar; + else { + assert(NumBits <= 64 && "Bit widths > 64 not implemented yet"); + return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar; + } + } + case Type::FloatTyID: return Out << "float " << NameSoFar; + case Type::DoubleTyID: 
return Out << "double " << NameSoFar; + default : + cerr << "Unknown primitive type: " << *Ty << "\n"; + abort(); + } +} + +// Pass the Type* and the variable name and this prints out the variable +// declaration. +// +std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty, + bool isSigned, const std::string &NameSoFar, + bool IgnoreName) { + if (Ty->isPrimitiveType() || Ty->isInteger()) { + printSimpleType(Out, Ty, isSigned, NameSoFar); + return Out; + } + + // Check to see if the type is named. + if (!IgnoreName || isa<OpaqueType>(Ty)) { + std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty); + if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar; + } + + switch (Ty->getTypeID()) { + case Type::FunctionTyID: { + const FunctionType *FTy = cast<FunctionType>(Ty); + std::stringstream FunctionInnards; + FunctionInnards << " (" << NameSoFar << ") ("; + const ParamAttrsList *Attrs = FTy->getParamAttrs(); + unsigned Idx = 1; + for (FunctionType::param_iterator I = FTy->param_begin(), + E = FTy->param_end(); I != E; ++I) { + if (I != FTy->param_begin()) + FunctionInnards << ", "; + printType(FunctionInnards, *I, + /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt), ""); + ++Idx; + } + if (FTy->isVarArg()) { + if (FTy->getNumParams()) + FunctionInnards << ", ..."; + } else if (!FTy->getNumParams()) { + FunctionInnards << "void"; + } + FunctionInnards << ')'; + std::string tstr = FunctionInnards.str(); + printType(Out, FTy->getReturnType(), + /*isSigned=*/Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt), tstr); + return Out; + } + case Type::StructTyID: { + const StructType *STy = cast<StructType>(Ty); + Out << NameSoFar + " {\n"; + unsigned Idx = 0; + for (StructType::element_iterator I = STy->element_begin(), + E = STy->element_end(); I != E; ++I) { + Out << " "; + printType(Out, *I, false, "field" + utostr(Idx++)); + Out << ";\n"; + } + Out << '}'; + if (STy->isPacked()) + Out << " __attribute__ ((packed))"; + 
return Out; + } + + case Type::PointerTyID: { + const PointerType *PTy = cast<PointerType>(Ty); + std::string ptrName = "*" + NameSoFar; + + if (isa<ArrayType>(PTy->getElementType()) || + isa<VectorType>(PTy->getElementType())) + ptrName = "(" + ptrName + ")"; + + return printType(Out, PTy->getElementType(), false, ptrName); + } + + case Type::ArrayTyID: { + const ArrayType *ATy = cast<ArrayType>(Ty); + unsigned NumElements = ATy->getNumElements(); + if (NumElements == 0) NumElements = 1; + return printType(Out, ATy->getElementType(), false, + NameSoFar + "[" + utostr(NumElements) + "]"); + } + + case Type::VectorTyID: { + const VectorType *PTy = cast<VectorType>(Ty); + unsigned NumElements = PTy->getNumElements(); + if (NumElements == 0) NumElements = 1; + return printType(Out, PTy->getElementType(), false, + NameSoFar + "[" + utostr(NumElements) + "]"); + } + + case Type::OpaqueTyID: { + static int Count = 0; + std::string TyName = "struct opaque_" + itostr(Count++); + assert(TypeNames.find(Ty) == TypeNames.end()); + TypeNames[Ty] = TyName; + return Out << TyName << ' ' << NameSoFar; + } + default: + assert(0 && "Unhandled case in getTypeProps!"); + abort(); + } + + return Out; +} + +void CWriter::printConstantArray(ConstantArray *CPA) { + + // As a special case, print the array as a string if it is an array of + // ubytes or an array of sbytes with positive values. 
+ // + const Type *ETy = CPA->getType()->getElementType(); + bool isString = (ETy == Type::Int8Ty || ETy == Type::Int8Ty); + + // Make sure the last character is a null char, as automatically added by C + if (isString && (CPA->getNumOperands() == 0 || + !cast<Constant>(*(CPA->op_end()-1))->isNullValue())) + isString = false; + + if (isString) { + Out << '\"'; + // Keep track of whether the last number was a hexadecimal escape + bool LastWasHex = false; + + // Do not include the last character, which we know is null + for (unsigned i = 0, e = CPA->getNumOperands()-1; i != e; ++i) { + unsigned char C = cast<ConstantInt>(CPA->getOperand(i))->getZExtValue(); + + // Print it out literally if it is a printable character. The only thing + // to be careful about is when the last letter output was a hex escape + // code, in which case we have to be careful not to print out hex digits + // explicitly (the C compiler thinks it is a continuation of the previous + // character, sheesh...) + // + if (isprint(C) && (!LastWasHex || !isxdigit(C))) { + LastWasHex = false; + if (C == '"' || C == '\\') + Out << "\\" << C; + else + Out << C; + } else { + LastWasHex = false; + switch (C) { + case '\n': Out << "\\n"; break; + case '\t': Out << "\\t"; break; + case '\r': Out << "\\r"; break; + case '\v': Out << "\\v"; break; + case '\a': Out << "\\a"; break; + case '\"': Out << "\\\""; break; + case '\'': Out << "\\\'"; break; + default: + Out << "\\x"; + Out << (char)(( C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A')); + Out << (char)(((C&15) < 10) ? 
((C&15)+'0') : ((C&15)-10+'A')); + LastWasHex = true; + break; + } + } + } + Out << '\"'; + } else { + Out << '{'; + if (CPA->getNumOperands()) { + Out << ' '; + printConstant(cast<Constant>(CPA->getOperand(0))); + for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) { + Out << ", "; + printConstant(cast<Constant>(CPA->getOperand(i))); + } + } + Out << " }"; + } +} + +void CWriter::printConstantVector(ConstantVector *CP) { + Out << '{'; + if (CP->getNumOperands()) { + Out << ' '; + printConstant(cast<Constant>(CP->getOperand(0))); + for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) { + Out << ", "; + printConstant(cast<Constant>(CP->getOperand(i))); + } + } + Out << " }"; +} + +// isFPCSafeToPrint - Returns true if we may assume that CFP may be written out +// textually as a double (rather than as a reference to a stack-allocated +// variable). We decide this by converting CFP to a string and back into a +// double, and then checking whether the conversion results in a bit-equal +// double to the original value of CFP. This depends on us and the target C +// compiler agreeing on the conversion process (which is pretty likely since we +// only deal in IEEE FP). +// +static bool isFPCSafeToPrint(const ConstantFP *CFP) { +#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A + char Buffer[100]; + sprintf(Buffer, "%a", CFP->getValue()); + + if (!strncmp(Buffer, "0x", 2) || + !strncmp(Buffer, "-0x", 3) || + !strncmp(Buffer, "+0x", 3)) + return atof(Buffer) == CFP->getValue(); + return false; +#else + std::string StrVal = ftostr(CFP->getValue()); + + while (StrVal[0] == ' ') + StrVal.erase(StrVal.begin()); + + // Check to make sure that the stringized number is not some string like "Inf" + // or NaN. Check that the string matches the "[-+]?[0-9]" regex. + if ((StrVal[0] >= '0' && StrVal[0] <= '9') || + ((StrVal[0] == '-' || StrVal[0] == '+') && + (StrVal[1] >= '0' && StrVal[1] <= '9'))) + // Reparse stringized version! 
+ return atof(StrVal.c_str()) == CFP->getValue(); + return false; +#endif +} + +/// Print out the casting for a cast operation. This does the double casting +/// necessary for conversion to the destination type, if necessary. +/// @brief Print a cast +void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) { + // Print the destination type cast + switch (opc) { + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::IntToPtr: + case Instruction::Trunc: + case Instruction::BitCast: + case Instruction::FPExt: + case Instruction::FPTrunc: // For these the DstTy sign doesn't matter + Out << '('; + printType(Out, DstTy); + Out << ')'; + break; + case Instruction::ZExt: + case Instruction::PtrToInt: + case Instruction::FPToUI: // For these, make sure we get an unsigned dest + Out << '('; + printSimpleType(Out, DstTy, false); + Out << ')'; + break; + case Instruction::SExt: + case Instruction::FPToSI: // For these, make sure we get a signed dest + Out << '('; + printSimpleType(Out, DstTy, true); + Out << ')'; + break; + default: + assert(0 && "Invalid cast opcode"); + } + + // Print the source type cast + switch (opc) { + case Instruction::UIToFP: + case Instruction::ZExt: + Out << '('; + printSimpleType(Out, SrcTy, false); + Out << ')'; + break; + case Instruction::SIToFP: + case Instruction::SExt: + Out << '('; + printSimpleType(Out, SrcTy, true); + Out << ')'; + break; + case Instruction::IntToPtr: + case Instruction::PtrToInt: + // Avoid "cast to pointer from integer of different size" warnings + Out << "(unsigned long)"; + break; + case Instruction::Trunc: + case Instruction::BitCast: + case Instruction::FPExt: + case Instruction::FPTrunc: + case Instruction::FPToSI: + case Instruction::FPToUI: + break; // These don't need a source cast. + default: + assert(0 && "Invalid cast opcode"); + break; + } +} + +// printConstant - The LLVM Constant to C Constant converter. 
+void CWriter::printConstant(Constant *CPV) { + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) { + switch (CE->getOpcode()) { + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + Out << "("; + printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType()); + if (CE->getOpcode() == Instruction::SExt && + CE->getOperand(0)->getType() == Type::Int1Ty) { + // Make sure we really sext from bool here by subtracting from 0 + Out << "0-"; + } + printConstant(CE->getOperand(0)); + if (CE->getType() == Type::Int1Ty && + (CE->getOpcode() == Instruction::Trunc || + CE->getOpcode() == Instruction::FPToUI || + CE->getOpcode() == Instruction::FPToSI || + CE->getOpcode() == Instruction::PtrToInt)) { + // Make sure we really truncate to bool here by anding with 1 + Out << "&1u"; + } + Out << ')'; + return; + + case Instruction::GetElementPtr: + Out << "(&("; + printIndexingExpression(CE->getOperand(0), gep_type_begin(CPV), + gep_type_end(CPV)); + Out << "))"; + return; + case Instruction::Select: + Out << '('; + printConstant(CE->getOperand(0)); + Out << '?'; + printConstant(CE->getOperand(1)); + Out << ':'; + printConstant(CE->getOperand(2)); + Out << ')'; + return; + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + { + Out << '('; + bool NeedsClosingParens = printConstExprCast(CE); + printConstantWithCast(CE->getOperand(0), 
CE->getOpcode()); + switch (CE->getOpcode()) { + case Instruction::Add: Out << " + "; break; + case Instruction::Sub: Out << " - "; break; + case Instruction::Mul: Out << " * "; break; + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: Out << " % "; break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: Out << " / "; break; + case Instruction::And: Out << " & "; break; + case Instruction::Or: Out << " | "; break; + case Instruction::Xor: Out << " ^ "; break; + case Instruction::Shl: Out << " << "; break; + case Instruction::LShr: + case Instruction::AShr: Out << " >> "; break; + case Instruction::ICmp: + switch (CE->getPredicate()) { + case ICmpInst::ICMP_EQ: Out << " == "; break; + case ICmpInst::ICMP_NE: Out << " != "; break; + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_ULT: Out << " < "; break; + case ICmpInst::ICMP_SLE: + case ICmpInst::ICMP_ULE: Out << " <= "; break; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_UGT: Out << " > "; break; + case ICmpInst::ICMP_SGE: + case ICmpInst::ICMP_UGE: Out << " >= "; break; + default: assert(0 && "Illegal ICmp predicate"); + } + break; + default: assert(0 && "Illegal opcode here!"); + } + printConstantWithCast(CE->getOperand(1), CE->getOpcode()); + if (NeedsClosingParens) + Out << "))"; + Out << ')'; + return; + } + case Instruction::FCmp: { + Out << '('; + bool NeedsClosingParens = printConstExprCast(CE); + if (CE->getPredicate() == FCmpInst::FCMP_FALSE) + Out << "0"; + else if (CE->getPredicate() == FCmpInst::FCMP_TRUE) + Out << "1"; + else { + const char* op = 0; + switch (CE->getPredicate()) { + default: assert(0 && "Illegal FCmp predicate"); + case FCmpInst::FCMP_ORD: op = "ord"; break; + case FCmpInst::FCMP_UNO: op = "uno"; break; + case FCmpInst::FCMP_UEQ: op = "ueq"; break; + case FCmpInst::FCMP_UNE: op = "une"; break; + case FCmpInst::FCMP_ULT: op = "ult"; break; + case FCmpInst::FCMP_ULE: op = "ule"; break; + case FCmpInst::FCMP_UGT: op = "ugt"; 
break; + case FCmpInst::FCMP_UGE: op = "uge"; break; + case FCmpInst::FCMP_OEQ: op = "oeq"; break; + case FCmpInst::FCMP_ONE: op = "one"; break; + case FCmpInst::FCMP_OLT: op = "olt"; break; + case FCmpInst::FCMP_OLE: op = "ole"; break; + case FCmpInst::FCMP_OGT: op = "ogt"; break; + case FCmpInst::FCMP_OGE: op = "oge"; break; + } + Out << "llvm_fcmp_" << op << "("; + printConstantWithCast(CE->getOperand(0), CE->getOpcode()); + Out << ", "; + printConstantWithCast(CE->getOperand(1), CE->getOpcode()); + Out << ")"; + } + if (NeedsClosingParens) + Out << "))"; + Out << ')'; + } + default: + cerr << "CWriter Error: Unhandled constant expression: " + << *CE << "\n"; + abort(); + } + } else if (isa<UndefValue>(CPV) && CPV->getType()->isFirstClassType()) { + Out << "(("; + printType(Out, CPV->getType()); // sign doesn't matter + Out << ")/*UNDEF*/0)"; + return; + } + + if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { + const Type* Ty = CI->getType(); + if (Ty == Type::Int1Ty) + Out << (CI->getZExtValue() ? '1' : '0') ; + else { + Out << "(("; + printSimpleType(Out, Ty, false) << ')'; + if (CI->isMinValue(true)) + Out << CI->getZExtValue() << 'u'; + else + Out << CI->getSExtValue(); + if (Ty->getPrimitiveSizeInBits() > 32) + Out << "ll"; + Out << ')'; + } + return; + } + + switch (CPV->getType()->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: { + ConstantFP *FPC = cast<ConstantFP>(CPV); + std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC); + if (I != FPConstantMap.end()) { + // Because of FP precision problems we must load from a stack allocated + // value that holds the value in hex. + Out << "(*(" << (FPC->getType() == Type::FloatTy ? "float" : "double") + << "*)&FPConstant" << I->second << ')'; + } else { + if (IsNAN(FPC->getValue())) { + // The value is NaN + + // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN, + // it's 0x7ff4. 
+ const unsigned long QuietNaN = 0x7ff8UL; + //const unsigned long SignalNaN = 0x7ff4UL; + + // We need to grab the first part of the FP # + char Buffer[100]; + + uint64_t ll = DoubleToBits(FPC->getValue()); + sprintf(Buffer, "0x%llx", static_cast<long long>(ll)); + + std::string Num(&Buffer[0], &Buffer[6]); + unsigned long Val = strtoul(Num.c_str(), 0, 16); + + if (FPC->getType() == Type::FloatTy) + Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\"" + << Buffer << "\") /*nan*/ "; + else + Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\"" + << Buffer << "\") /*nan*/ "; + } else if (IsInf(FPC->getValue())) { + // The value is Inf + if (FPC->getValue() < 0) Out << '-'; + Out << "LLVM_INF" << (FPC->getType() == Type::FloatTy ? "F" : "") + << " /*inf*/ "; + } else { + std::string Num; +#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A + // Print out the constant as a floating point number. + char Buffer[100]; + sprintf(Buffer, "%a", FPC->getValue()); + Num = Buffer; +#else + Num = ftostr(FPC->getValue()); +#endif + Out << Num; + } + } + break; + } + + case Type::ArrayTyID: + if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) { + const ArrayType *AT = cast<ArrayType>(CPV->getType()); + Out << '{'; + if (AT->getNumElements()) { + Out << ' '; + Constant *CZ = Constant::getNullValue(AT->getElementType()); + printConstant(CZ); + for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(CZ); + } + } + Out << " }"; + } else { + printConstantArray(cast<ConstantArray>(CPV)); + } + break; + + case Type::VectorTyID: + if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) { + const VectorType *AT = cast<VectorType>(CPV->getType()); + Out << '{'; + if (AT->getNumElements()) { + Out << ' '; + Constant *CZ = Constant::getNullValue(AT->getElementType()); + printConstant(CZ); + for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(CZ); + } + } + Out << " }"; + } else { + 
printConstantVector(cast<ConstantVector>(CPV)); + } + break; + + case Type::StructTyID: + if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) { + const StructType *ST = cast<StructType>(CPV->getType()); + Out << '{'; + if (ST->getNumElements()) { + Out << ' '; + printConstant(Constant::getNullValue(ST->getElementType(0))); + for (unsigned i = 1, e = ST->getNumElements(); i != e; ++i) { + Out << ", "; + printConstant(Constant::getNullValue(ST->getElementType(i))); + } + } + Out << " }"; + } else { + Out << '{'; + if (CPV->getNumOperands()) { + Out << ' '; + printConstant(cast<Constant>(CPV->getOperand(0))); + for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) { + Out << ", "; + printConstant(cast<Constant>(CPV->getOperand(i))); + } + } + Out << " }"; + } + break; + + case Type::PointerTyID: + if (isa<ConstantPointerNull>(CPV)) { + Out << "(("; + printType(Out, CPV->getType()); // sign doesn't matter + Out << ")/*NULL*/0)"; + break; + } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) { + writeOperand(GV); + break; + } + // FALL THROUGH + default: + cerr << "Unknown constant type: " << *CPV << "\n"; + abort(); + } +} + +// Some constant expressions need to be casted back to the original types +// because their operands were casted to the expected type. This function takes +// care of detecting that case and printing the cast for the ConstantExpr. 
+bool CWriter::printConstExprCast(const ConstantExpr* CE) { + bool NeedsExplicitCast = false; + const Type *Ty = CE->getOperand(0)->getType(); + bool TypeIsSigned = false; + switch (CE->getOpcode()) { + case Instruction::LShr: + case Instruction::URem: + case Instruction::UDiv: NeedsExplicitCast = true; break; + case Instruction::AShr: + case Instruction::SRem: + case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break; + case Instruction::SExt: + Ty = CE->getType(); + NeedsExplicitCast = true; + TypeIsSigned = true; + break; + case Instruction::ZExt: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + Ty = CE->getType(); + NeedsExplicitCast = true; + break; + default: break; + } + if (NeedsExplicitCast) { + Out << "(("; + if (Ty->isInteger() && Ty != Type::Int1Ty) + printSimpleType(Out, Ty, TypeIsSigned); + else + printType(Out, Ty); // not integer, sign doesn't matter + Out << ")("; + } + return NeedsExplicitCast; +} + +// Print a constant assuming that it is the operand for a given Opcode. The +// opcodes that care about sign need to cast their operands to the expected +// type before the operation proceeds. This function does the casting. +void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) { + + // Extract the operand's type, we'll need it. + const Type* OpTy = CPV->getType(); + + // Indicate whether to do the cast or not. + bool shouldCast = false; + bool typeIsSigned = false; + + // Based on the Opcode for which this Constant is being written, determine + // the new type to which the operand should be casted by setting the value + // of OpTy. If we change OpTy, also set shouldCast to true so it gets + // casted below. 
+ switch (Opcode) { + default: + // for most instructions, it doesn't matter + break; + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::URem: + shouldCast = true; + break; + case Instruction::AShr: + case Instruction::SDiv: + case Instruction::SRem: + shouldCast = true; + typeIsSigned = true; + break; + } + + // Write out the casted constant if we should, otherwise just write the + // operand. + if (shouldCast) { + Out << "(("; + printSimpleType(Out, OpTy, typeIsSigned); + Out << ")"; + printConstant(CPV); + Out << ")"; + } else + printConstant(CPV); +} + +std::string CWriter::GetValueName(const Value *Operand) { + std::string Name; + + if (!isa<GlobalValue>(Operand) && Operand->getName() != "") { + std::string VarName; + + Name = Operand->getName(); + VarName.reserve(Name.capacity()); + + for (std::string::iterator I = Name.begin(), E = Name.end(); + I != E; ++I) { + char ch = *I; + + if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || + (ch >= '0' && ch <= '9') || ch == '_')) + VarName += '_'; + else + VarName += ch; + } + + Name = "llvm_cbe_" + VarName; + } else { + Name = Mang->getValueName(Operand); + } + + return Name; +} + +void CWriter::writeOperandInternal(Value *Operand) { + if (Instruction *I = dyn_cast<Instruction>(Operand)) + if (isInlinableInst(*I) && !isDirectAlloca(I)) { + // Should we inline this instruction to build a tree? 
+ Out << '('; + visit(*I); + Out << ')'; + return; + } + + Constant* CPV = dyn_cast<Constant>(Operand); + + if (CPV && !isa<GlobalValue>(CPV)) + printConstant(CPV); + else + Out << GetValueName(Operand); +} + +void CWriter::writeOperandRaw(Value *Operand) { + Constant* CPV = dyn_cast<Constant>(Operand); + if (CPV && !isa<GlobalValue>(CPV)) { + printConstant(CPV); + } else { + Out << GetValueName(Operand); + } +} + +void CWriter::writeOperand(Value *Operand) { + if (isa<GlobalVariable>(Operand) || isDirectAlloca(Operand)) + Out << "(&"; // Global variables are referenced as their addresses by llvm + + writeOperandInternal(Operand); + + if (isa<GlobalVariable>(Operand) || isDirectAlloca(Operand)) + Out << ')'; +} + +// Some instructions need to have their result value casted back to the +// original types because their operands were casted to the expected type. +// This function takes care of detecting that case and printing the cast +// for the Instruction. +bool CWriter::writeInstructionCast(const Instruction &I) { + const Type *Ty = I.getOperand(0)->getType(); + switch (I.getOpcode()) { + case Instruction::LShr: + case Instruction::URem: + case Instruction::UDiv: + Out << "(("; + printSimpleType(Out, Ty, false); + Out << ")("; + return true; + case Instruction::AShr: + case Instruction::SRem: + case Instruction::SDiv: + Out << "(("; + printSimpleType(Out, Ty, true); + Out << ")("; + return true; + default: break; + } + return false; +} + +// Write the operand with a cast to another type based on the Opcode being used. +// This will be used in cases where an instruction has specific type +// requirements (usually signedness) for its operands. +void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) { + + // Extract the operand's type, we'll need it. + const Type* OpTy = Operand->getType(); + + // Indicate whether to do the cast or not. + bool shouldCast = false; + + // Indicate whether the cast should be to a signed type or not. 
+ bool castIsSigned = false; + + // Based on the Opcode for which this Operand is being written, determine + // the new type to which the operand should be casted by setting the value + // of OpTy. If we change OpTy, also set shouldCast to true. + switch (Opcode) { + default: + // for most instructions, it doesn't matter + break; + case Instruction::LShr: + case Instruction::UDiv: + case Instruction::URem: // Cast to unsigned first + shouldCast = true; + castIsSigned = false; + break; + case Instruction::AShr: + case Instruction::SDiv: + case Instruction::SRem: // Cast to signed first + shouldCast = true; + castIsSigned = true; + break; + } + + // Write out the casted operand if we should, otherwise just write the + // operand. + if (shouldCast) { + Out << "(("; + printSimpleType(Out, OpTy, castIsSigned); + Out << ")"; + writeOperand(Operand); + Out << ")"; + } else + writeOperand(Operand); +} + +// Write the operand with a cast to another type based on the icmp predicate +// being used. +void CWriter::writeOperandWithCast(Value* Operand, ICmpInst::Predicate predicate) { + + // Extract the operand's type, we'll need it. + const Type* OpTy = Operand->getType(); + + // Indicate whether to do the cast or not. + bool shouldCast = false; + + // Indicate whether the cast should be to a signed type or not. + bool castIsSigned = false; + + // Based on the Opcode for which this Operand is being written, determine + // the new type to which the operand should be casted by setting the value + // of OpTy. If we change OpTy, also set shouldCast to true. 
+ switch (predicate) { + default: + // for eq and ne, it doesn't matter + break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_ULE: + shouldCast = true; + break; + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_SGE: + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_SLE: + shouldCast = true; + castIsSigned = true; + break; + } + + // Write out the casted operand if we should, otherwise just write the + // operand. + if (shouldCast) { + Out << "(("; + if (OpTy->isInteger() && OpTy != Type::Int1Ty) + printSimpleType(Out, OpTy, castIsSigned); + else + printType(Out, OpTy); // not integer, sign doesn't matter + Out << ")"; + writeOperand(Operand); + Out << ")"; + } else + writeOperand(Operand); +} + +// generateCompilerSpecificCode - This is where we add conditional compilation +// directives to cater to specific compilers as need be. +// +static void generateCompilerSpecificCode(std::ostream& Out) { + // Alloca is hard to get, and we don't want to include stdlib.h here. 
+ Out << "/* get a declaration for alloca */\n" + << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n" + << "#define alloca(x) __builtin_alloca((x))\n" + << "#define _alloca(x) __builtin_alloca((x))\n" + << "#elif defined(__APPLE__)\n" + << "extern void *__builtin_alloca(unsigned long);\n" + << "#define alloca(x) __builtin_alloca(x)\n" + << "#define longjmp _longjmp\n" + << "#define setjmp _setjmp\n" + << "#elif defined(__sun__)\n" + << "#if defined(__sparcv9)\n" + << "extern void *__builtin_alloca(unsigned long);\n" + << "#else\n" + << "extern void *__builtin_alloca(unsigned int);\n" + << "#endif\n" + << "#define alloca(x) __builtin_alloca(x)\n" + << "#elif defined(__FreeBSD__) || defined(__OpenBSD__)\n" + << "#define alloca(x) __builtin_alloca(x)\n" + << "#elif defined(_MSC_VER)\n" + << "#define inline _inline\n" + << "#define alloca(x) _alloca(x)\n" + << "#else\n" + << "#include <alloca.h>\n" + << "#endif\n\n"; + + // We output GCC specific attributes to preserve 'linkonce'ness on globals. + // If we aren't being compiled with GCC, just drop these attributes. + Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" + << "#define __attribute__(X)\n" + << "#endif\n\n"; + + // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))". + Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" + << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n" + << "#elif defined(__GNUC__)\n" + << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n" + << "#else\n" + << "#define __EXTERNAL_WEAK__\n" + << "#endif\n\n"; + + // For now, turn off the weak linkage attribute on Mac OS X. (See above.) + Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" + << "#define __ATTRIBUTE_WEAK__\n" + << "#elif defined(__GNUC__)\n" + << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n" + << "#else\n" + << "#define __ATTRIBUTE_WEAK__\n" + << "#endif\n\n"; + + // Add hidden visibility support. FIXME: APPLE_CC? 
+ Out << "#if defined(__GNUC__)\n" + << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n" + << "#endif\n\n"; + + // Define NaN and Inf as GCC builtins if using GCC, as 0 otherwise + // From the GCC documentation: + // + // double __builtin_nan (const char *str) + // + // This is an implementation of the ISO C99 function nan. + // + // Since ISO C99 defines this function in terms of strtod, which we do + // not implement, a description of the parsing is in order. The string is + // parsed as by strtol; that is, the base is recognized by leading 0 or + // 0x prefixes. The number parsed is placed in the significand such that + // the least significant bit of the number is at the least significant + // bit of the significand. The number is truncated to fit the significand + // field provided. The significand is forced to be a quiet NaN. + // + // This function, if given a string literal, is evaluated early enough + // that it is considered a compile-time constant. + // + // float __builtin_nanf (const char *str) + // + // Similar to __builtin_nan, except the return type is float. + // + // double __builtin_inf (void) + // + // Similar to __builtin_huge_val, except a warning is generated if the + // target floating-point format does not support infinities. This + // function is suitable for implementing the ISO C99 macro INFINITY. + // + // float __builtin_inff (void) + // + // Similar to __builtin_inf, except the return type is float. 
+ Out << "#ifdef __GNUC__\n" + << "#define LLVM_NAN(NanStr) __builtin_nan(NanStr) /* Double */\n" + << "#define LLVM_NANF(NanStr) __builtin_nanf(NanStr) /* Float */\n" + << "#define LLVM_NANS(NanStr) __builtin_nans(NanStr) /* Double */\n" + << "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n" + << "#define LLVM_INF __builtin_inf() /* Double */\n" + << "#define LLVM_INFF __builtin_inff() /* Float */\n" + << "#define LLVM_PREFETCH(addr,rw,locality) " + "__builtin_prefetch(addr,rw,locality)\n" + << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n" + << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n" + << "#define LLVM_ASM __asm__\n" + << "#else\n" + << "#define LLVM_NAN(NanStr) ((double)0.0) /* Double */\n" + << "#define LLVM_NANF(NanStr) 0.0F /* Float */\n" + << "#define LLVM_NANS(NanStr) ((double)0.0) /* Double */\n" + << "#define LLVM_NANSF(NanStr) 0.0F /* Float */\n" + << "#define LLVM_INF ((double)0.0) /* Double */\n" + << "#define LLVM_INFF 0.0F /* Float */\n" + << "#define LLVM_PREFETCH(addr,rw,locality) /* PREFETCH */\n" + << "#define __ATTRIBUTE_CTOR__\n" + << "#define __ATTRIBUTE_DTOR__\n" + << "#define LLVM_ASM(X)\n" + << "#endif\n\n"; + + Out << "#if __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ \n" + << "#define __builtin_stack_save() 0 /* not implemented */\n" + << "#define __builtin_stack_restore(X) /* noop */\n" + << "#endif\n\n"; + + // Output target-specific code that should be inserted into main. + Out << "#define CODE_FOR_MAIN() /* Any target-specific code for main()*/\n"; + // On X86, set the FP control word to 64-bits of precision instead of 80 bits. 
+ Out << "#if defined(__GNUC__) && !defined(__llvm__)\n" + << "#if defined(i386) || defined(__i386__) || defined(__i386) || " + << "defined(__x86_64__)\n" + << "#undef CODE_FOR_MAIN\n" + << "#define CODE_FOR_MAIN() \\\n" + << " {short F;__asm__ (\"fnstcw %0\" : \"=m\" (*&F)); \\\n" + << " F=(F&~0x300)|0x200;__asm__(\"fldcw %0\"::\"m\"(*&F));}\n" + << "#endif\n#endif\n"; + +} + +/// FindStaticTors - Given a static ctor/dtor list, unpack its contents into +/// the StaticTors set. +static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){ + ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer()); + if (!InitList) return; + + for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) + if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){ + if (CS->getNumOperands() != 2) return; // Not array of 2-element structs. + + if (CS->getOperand(1)->isNullValue()) + return; // Found a null terminator, exit printing. + Constant *FP = CS->getOperand(1); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP)) + if (CE->isCast()) + FP = CE->getOperand(0); + if (Function *F = dyn_cast<Function>(FP)) + StaticTors.insert(F); + } +} + +enum SpecialGlobalClass { + NotSpecial = 0, + GlobalCtors, GlobalDtors, + NotPrinted +}; + +/// getGlobalVariableClass - If this is a global that is specially recognized +/// by LLVM, return a code that indicates how we should handle it. +static SpecialGlobalClass getGlobalVariableClass(const GlobalVariable *GV) { + // If this is a global ctors/dtors list, handle it now. + if (GV->hasAppendingLinkage() && GV->use_empty()) { + if (GV->getName() == "llvm.global_ctors") + return GlobalCtors; + else if (GV->getName() == "llvm.global_dtors") + return GlobalDtors; + } + + // Otherwise, it it is other metadata, don't print it. This catches things + // like debug information. 
+ if (GV->getSection() == "llvm.metadata") + return NotPrinted; + + return NotSpecial; +} + + +bool CWriter::doInitialization(Module &M) { + // Initialize + TheModule = &M; + + TD = new TargetData(&M); + IL = new IntrinsicLowering(*TD); + IL->AddPrototypes(M); + + // Ensure that all structure types have names... + Mang = new Mangler(M); + Mang->markCharUnacceptable('.'); + + // Keep track of which functions are static ctors/dtors so they can have + // an attribute added to their prototypes. + std::set<Function*> StaticCtors, StaticDtors; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + switch (getGlobalVariableClass(I)) { + default: break; + case GlobalCtors: + FindStaticTors(I, StaticCtors); + break; + case GlobalDtors: + FindStaticTors(I, StaticDtors); + break; + } + } + + // get declaration for alloca + Out << "/* Provide Declarations */\n"; + Out << "#include <stdarg.h>\n"; // Varargs support + Out << "#include <setjmp.h>\n"; // Unwind support + generateCompilerSpecificCode(Out); + + // Provide a definition for `bool' if not compiling with a C++ compiler. + Out << "\n" + << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n" + + << "\n\n/* Support for floating point constants */\n" + << "typedef unsigned long long ConstantDoubleTy;\n" + << "typedef unsigned int ConstantFloatTy;\n" + + << "\n\n/* Global Declarations */\n"; + + // First output all the declarations for the program, because C requires + // Functions & globals to be declared before they are used. + // + + // Loop over the symbol table, emitting all named constants... + printModuleTypes(M.getTypeSymbolTable()); + + // Global variable declarations... 
+ if (!M.global_empty()) { + Out << "\n/* External Global Variable Declarations */\n"; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + + if (I->hasExternalLinkage() || I->hasExternalWeakLinkage()) + Out << "extern "; + else if (I->hasDLLImportLinkage()) + Out << "__declspec(dllimport) "; + else + continue; // Internal Global + + // Thread Local Storage + if (I->isThreadLocal()) + Out << "__thread "; + + printType(Out, I->getType()->getElementType(), false, GetValueName(I)); + + if (I->hasExternalWeakLinkage()) + Out << " __EXTERNAL_WEAK__"; + Out << ";\n"; + } + } + + // Function declarations + Out << "\n/* Function Declarations */\n"; + Out << "double fmod(double, double);\n"; // Support for FP rem + Out << "float fmodf(float, float);\n"; + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + // Don't print declarations for intrinsic functions. + if (!I->getIntrinsicID() && I->getName() != "setjmp" && + I->getName() != "longjmp" && I->getName() != "_setjmp") { + if (I->hasExternalWeakLinkage()) + Out << "extern "; + printFunctionSignature(I, true); + if (I->hasWeakLinkage() || I->hasLinkOnceLinkage()) + Out << " __ATTRIBUTE_WEAK__"; + if (I->hasExternalWeakLinkage()) + Out << " __EXTERNAL_WEAK__"; + if (StaticCtors.count(I)) + Out << " __ATTRIBUTE_CTOR__"; + if (StaticDtors.count(I)) + Out << " __ATTRIBUTE_DTOR__"; + if (I->hasHiddenVisibility()) + Out << " __HIDDEN__"; + + if (I->hasName() && I->getName()[0] == 1) + Out << " LLVM_ASM(\"" << I->getName().c_str()+1 << "\")"; + + Out << ";\n"; + } + } + + // Output the global variable declarations + if (!M.global_empty()) { + Out << "\n\n/* Global Variable Declarations */\n"; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + if (!I->isDeclaration()) { + // Ignore special globals, such as debug info. 
+ if (getGlobalVariableClass(I)) + continue; + + if (I->hasInternalLinkage()) + Out << "static "; + else + Out << "extern "; + + // Thread Local Storage + if (I->isThreadLocal()) + Out << "__thread "; + + printType(Out, I->getType()->getElementType(), false, + GetValueName(I)); + + if (I->hasLinkOnceLinkage()) + Out << " __attribute__((common))"; + else if (I->hasWeakLinkage()) + Out << " __ATTRIBUTE_WEAK__"; + else if (I->hasExternalWeakLinkage()) + Out << " __EXTERNAL_WEAK__"; + if (I->hasHiddenVisibility()) + Out << " __HIDDEN__"; + Out << ";\n"; + } + } + + // Output the global variable definitions and contents... + if (!M.global_empty()) { + Out << "\n\n/* Global Variable Definitions and Initialization */\n"; + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + if (!I->isDeclaration()) { + // Ignore special globals, such as debug info. + if (getGlobalVariableClass(I)) + continue; + + if (I->hasInternalLinkage()) + Out << "static "; + else if (I->hasDLLImportLinkage()) + Out << "__declspec(dllimport) "; + else if (I->hasDLLExportLinkage()) + Out << "__declspec(dllexport) "; + + // Thread Local Storage + if (I->isThreadLocal()) + Out << "__thread "; + + printType(Out, I->getType()->getElementType(), false, + GetValueName(I)); + if (I->hasLinkOnceLinkage()) + Out << " __attribute__((common))"; + else if (I->hasWeakLinkage()) + Out << " __ATTRIBUTE_WEAK__"; + + if (I->hasHiddenVisibility()) + Out << " __HIDDEN__"; + + // If the initializer is not null, emit the initializer. If it is null, + // we try to avoid emitting large amounts of zeros. The problem with + // this, however, occurs when the variable has weak linkage. In this + // case, the assembler will complain about the variable being both weak + // and common, so we disable this optimization. 
+ if (!I->getInitializer()->isNullValue()) { + Out << " = " ; + writeOperand(I->getInitializer()); + } else if (I->hasWeakLinkage()) { + // We have to specify an initializer, but it doesn't have to be + // complete. If the value is an aggregate, print out { 0 }, and let + // the compiler figure out the rest of the zeros. + Out << " = " ; + if (isa<StructType>(I->getInitializer()->getType()) || + isa<ArrayType>(I->getInitializer()->getType()) || + isa<VectorType>(I->getInitializer()->getType())) { + Out << "{ 0 }"; + } else { + // Just print it out normally. + writeOperand(I->getInitializer()); + } + } + Out << ";\n"; + } + } + + if (!M.empty()) + Out << "\n\n/* Function Bodies */\n"; + + // Emit some helper functions for dealing with FCMP instruction's + // predicates + Out << "static inline int llvm_fcmp_ord(double X, double Y) { "; + Out << "return X == X && Y == Y; }\n"; + Out << "static inline int llvm_fcmp_uno(double X, double Y) { "; + Out << "return X != X || Y != Y; }\n"; + Out << "static inline int llvm_fcmp_ueq(double X, double Y) { "; + Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "static inline int llvm_fcmp_une(double X, double Y) { "; + Out << "return X != Y; }\n"; + Out << "static inline int llvm_fcmp_ult(double X, double Y) { "; + Out << "return X < Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "static inline int llvm_fcmp_ugt(double X, double Y) { "; + Out << "return X > Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "static inline int llvm_fcmp_ule(double X, double Y) { "; + Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "static inline int llvm_fcmp_uge(double X, double Y) { "; + Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n"; + Out << "static inline int llvm_fcmp_oeq(double X, double Y) { "; + Out << "return X == Y ; }\n"; + Out << "static inline int llvm_fcmp_one(double X, double Y) { "; + Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n"; + Out << "static inline int llvm_fcmp_olt(double X, double Y) { "; + Out << "return 
X < Y ; }\n"; + Out << "static inline int llvm_fcmp_ogt(double X, double Y) { "; + Out << "return X > Y ; }\n"; + Out << "static inline int llvm_fcmp_ole(double X, double Y) { "; + Out << "return X <= Y ; }\n"; + Out << "static inline int llvm_fcmp_oge(double X, double Y) { "; + Out << "return X >= Y ; }\n"; + return false; +} + + +/// Output all floating point constants that cannot be printed accurately... +void CWriter::printFloatingPointConstants(Function &F) { + // Scan the module for floating point constants. If any FP constant is used + // in the function, we want to redirect it here so that we do not depend on + // the precision of the printed form, unless the printed form preserves + // precision. + // + static unsigned FPCounter = 0; + for (constant_iterator I = constant_begin(&F), E = constant_end(&F); + I != E; ++I) + if (const ConstantFP *FPC = dyn_cast<ConstantFP>(*I)) + if (!isFPCSafeToPrint(FPC) && // Do not put in FPConstantMap if safe. + !FPConstantMap.count(FPC)) { + double Val = FPC->getValue(); + + FPConstantMap[FPC] = FPCounter; // Number the FP constants + + if (FPC->getType() == Type::DoubleTy) { + Out << "static const ConstantDoubleTy FPConstant" << FPCounter++ + << " = 0x" << std::hex << DoubleToBits(Val) << std::dec + << "ULL; /* " << Val << " */\n"; + } else if (FPC->getType() == Type::FloatTy) { + Out << "static const ConstantFloatTy FPConstant" << FPCounter++ + << " = 0x" << std::hex << FloatToBits(Val) << std::dec + << "U; /* " << Val << " */\n"; + } else + assert(0 && "Unknown float type!"); + } + + Out << '\n'; +} + + +/// printSymbolTable - Run through symbol table looking for type names. If a +/// type name is found, emit its declaration... 
+/// +void CWriter::printModuleTypes(const TypeSymbolTable &TST) { + Out << "/* Helper union for bitcasts */\n"; + Out << "typedef union {\n"; + Out << " unsigned int Int32;\n"; + Out << " unsigned long long Int64;\n"; + Out << " float Float;\n"; + Out << " double Double;\n"; + Out << "} llvmBitCastUnion;\n"; + + // We are only interested in the type plane of the symbol table. + TypeSymbolTable::const_iterator I = TST.begin(); + TypeSymbolTable::const_iterator End = TST.end(); + + // If there are no type names, exit early. + if (I == End) return; + + // Print out forward declarations for structure types before anything else! + Out << "/* Structure forward decls */\n"; + for (; I != End; ++I) { + std::string Name = "struct l_" + Mang->makeNameProper(I->first); + Out << Name << ";\n"; + TypeNames.insert(std::make_pair(I->second, Name)); + } + + Out << '\n'; + + // Now we can print out typedefs. Above, we guaranteed that this can only be + // for struct or opaque types. + Out << "/* Typedefs */\n"; + for (I = TST.begin(); I != End; ++I) { + std::string Name = "l_" + Mang->makeNameProper(I->first); + Out << "typedef "; + printType(Out, I->second, false, Name); + Out << ";\n"; + } + + Out << '\n'; + + // Keep track of which structures have been printed so far... + std::set<const StructType *> StructPrinted; + + // Loop over all structures then push them into the stack so they are + // printed in the correct order. + // + Out << "/* Structure contents */\n"; + for (I = TST.begin(); I != End; ++I) + if (const StructType *STy = dyn_cast<StructType>(I->second)) + // Only print out used types! + printContainedStructs(STy, StructPrinted); +} + +// Push the struct onto the stack and recursively push all structs +// this one depends on. +// +// TODO: Make this work properly with vector types +// +void CWriter::printContainedStructs(const Type *Ty, + std::set<const StructType*> &StructPrinted){ + // Don't walk through pointers. 
+ if (isa<PointerType>(Ty) || Ty->isPrimitiveType() || Ty->isInteger()) return; + + // Print all contained types first. + for (Type::subtype_iterator I = Ty->subtype_begin(), + E = Ty->subtype_end(); I != E; ++I) + printContainedStructs(*I, StructPrinted); + + if (const StructType *STy = dyn_cast<StructType>(Ty)) { + // Check to see if we have already printed this struct. + if (StructPrinted.insert(STy).second) { + // Print structure type out. + std::string Name = TypeNames[STy]; + printType(Out, STy, false, Name, true); + Out << ";\n\n"; + } + } +} + +void CWriter::printFunctionSignature(const Function *F, bool Prototype) { + /// isStructReturn - Should this function actually return a struct by-value? + bool isStructReturn = F->getFunctionType()->isStructReturn(); + + if (F->hasInternalLinkage()) Out << "static "; + if (F->hasDLLImportLinkage()) Out << "__declspec(dllimport) "; + if (F->hasDLLExportLinkage()) Out << "__declspec(dllexport) "; + switch (F->getCallingConv()) { + case CallingConv::X86_StdCall: + Out << "__stdcall "; + break; + case CallingConv::X86_FastCall: + Out << "__fastcall "; + break; + } + + // Loop over the arguments, printing them... + const FunctionType *FT = cast<FunctionType>(F->getFunctionType()); + const ParamAttrsList *Attrs = FT->getParamAttrs(); + + std::stringstream FunctionInnards; + + // Print out the name... + FunctionInnards << GetValueName(F) << '('; + + bool PrintedArg = false; + if (!F->isDeclaration()) { + if (!F->arg_empty()) { + Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + + // If this is a struct-return function, don't print the hidden + // struct-return argument. 
+ if (isStructReturn) { + assert(I != E && "Invalid struct return function!"); + ++I; + } + + std::string ArgName; + unsigned Idx = 1; + for (; I != E; ++I) { + if (PrintedArg) FunctionInnards << ", "; + if (I->hasName() || !Prototype) + ArgName = GetValueName(I); + else + ArgName = ""; + printType(FunctionInnards, I->getType(), + /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt), + ArgName); + PrintedArg = true; + ++Idx; + } + } + } else { + // Loop over the arguments, printing them. + FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end(); + + // If this is a struct-return function, don't print the hidden + // struct-return argument. + if (isStructReturn) { + assert(I != E && "Invalid struct return function!"); + ++I; + } + + unsigned Idx = 1; + for (; I != E; ++I) { + if (PrintedArg) FunctionInnards << ", "; + printType(FunctionInnards, *I, + /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt)); + PrintedArg = true; + ++Idx; + } + } + + // Finish printing arguments... if this is a vararg function, print the ..., + // unless there are no known types, in which case, we just emit (). + // + if (FT->isVarArg() && PrintedArg) { + if (PrintedArg) FunctionInnards << ", "; + FunctionInnards << "..."; // Output varargs portion of signature! + } else if (!FT->isVarArg() && !PrintedArg) { + FunctionInnards << "void"; // ret() -> ret(void) in C. + } + FunctionInnards << ')'; + + // Get the return tpe for the function. + const Type *RetTy; + if (!isStructReturn) + RetTy = F->getReturnType(); + else { + // If this is a struct-return function, print the struct-return type. + RetTy = cast<PointerType>(FT->getParamType(0))->getElementType(); + } + + // Print out the return type and the signature built above. 
+ printType(Out, RetTy, + /*isSigned=*/ Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt), + FunctionInnards.str()); +} + +static inline bool isFPIntBitCast(const Instruction &I) { + if (!isa<BitCastInst>(I)) + return false; + const Type *SrcTy = I.getOperand(0)->getType(); + const Type *DstTy = I.getType(); + return (SrcTy->isFloatingPoint() && DstTy->isInteger()) || + (DstTy->isFloatingPoint() && SrcTy->isInteger()); +} + +void CWriter::printFunction(Function &F) { + /// isStructReturn - Should this function actually return a struct by-value? + bool isStructReturn = F.getFunctionType()->isStructReturn(); + + printFunctionSignature(&F, false); + Out << " {\n"; + + // If this is a struct return function, handle the result with magic. + if (isStructReturn) { + const Type *StructTy = + cast<PointerType>(F.arg_begin()->getType())->getElementType(); + Out << " "; + printType(Out, StructTy, false, "StructReturn"); + Out << "; /* Struct return temporary */\n"; + + Out << " "; + printType(Out, F.arg_begin()->getType(), false, + GetValueName(F.arg_begin())); + Out << " = &StructReturn;\n"; + } + + bool PrintedVar = false; + + // print local variable information for the function + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + if (const AllocaInst *AI = isDirectAlloca(&*I)) { + Out << " "; + printType(Out, AI->getAllocatedType(), false, GetValueName(AI)); + Out << "; /* Address-exposed local */\n"; + PrintedVar = true; + } else if (I->getType() != Type::VoidTy && !isInlinableInst(*I)) { + Out << " "; + printType(Out, I->getType(), false, GetValueName(&*I)); + Out << ";\n"; + + if (isa<PHINode>(*I)) { // Print out PHI node temporaries as well... + Out << " "; + printType(Out, I->getType(), false, + GetValueName(&*I)+"__PHI_TEMPORARY"); + Out << ";\n"; + } + PrintedVar = true; + } + // We need a temporary for the BitCast to use so it can pluck a value out + // of a union to do the BitCast. 
This is separate from the need for a + // variable to hold the result of the BitCast. + if (isFPIntBitCast(*I)) { + Out << " llvmBitCastUnion " << GetValueName(&*I) + << "__BITCAST_TEMPORARY;\n"; + PrintedVar = true; + } + } + + if (PrintedVar) + Out << '\n'; + + if (F.hasExternalLinkage() && F.getName() == "main") + Out << " CODE_FOR_MAIN();\n"; + + // print the basic blocks + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (Loop *L = LI->getLoopFor(BB)) { + if (L->getHeader() == BB && L->getParentLoop() == 0) + printLoop(L); + } else { + printBasicBlock(BB); + } + } + + Out << "}\n\n"; +} + +void CWriter::printLoop(Loop *L) { + Out << " do { /* Syntactic loop '" << L->getHeader()->getName() + << "' to make GCC happy */\n"; + for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) { + BasicBlock *BB = L->getBlocks()[i]; + Loop *BBLoop = LI->getLoopFor(BB); + if (BBLoop == L) + printBasicBlock(BB); + else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L) + printLoop(BBLoop); + } + Out << " } while (1); /* end of syntactic loop '" + << L->getHeader()->getName() << "' */\n"; +} + +void CWriter::printBasicBlock(BasicBlock *BB) { + + // Don't print the label for the basic block if there are no uses, or if + // the only terminator use is the predecessor basic block's terminator. + // We have to scan the use list because PHI nodes use basic blocks too but + // do not require a label to be generated. + // + bool NeedsLabel = false; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + if (isGotoCodeNecessary(*PI, BB)) { + NeedsLabel = true; + break; + } + + if (NeedsLabel) Out << GetValueName(BB) << ":\n"; + + // Output all of the instructions in the basic block... 
+ for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; + ++II) { + if (!isInlinableInst(*II) && !isDirectAlloca(II)) { + if (II->getType() != Type::VoidTy && !isInlineAsm(*II)) + outputLValue(II); + else + Out << " "; + visit(*II); + Out << ";\n"; + } + } + + // Don't emit prefix or suffix for the terminator... + visit(*BB->getTerminator()); +} + + +// Specific Instruction type classes... note that all of the casts are +// necessary because we use the instruction classes as opaque types... +// +void CWriter::visitReturnInst(ReturnInst &I) { + // If this is a struct return function, return the temporary struct. + bool isStructReturn = I.getParent()->getParent()-> + getFunctionType()->isStructReturn(); + + if (isStructReturn) { + Out << " return StructReturn;\n"; + return; + } + + // Don't output a void return if this is the last basic block in the function + if (I.getNumOperands() == 0 && + &*--I.getParent()->getParent()->end() == I.getParent() && + !I.getParent()->size() == 1) { + return; + } + + Out << " return"; + if (I.getNumOperands()) { + Out << ' '; + writeOperand(I.getOperand(0)); + } + Out << ";\n"; +} + +void CWriter::visitSwitchInst(SwitchInst &SI) { + + Out << " switch ("; + writeOperand(SI.getOperand(0)); + Out << ") {\n default:\n"; + printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2); + printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2); + Out << ";\n"; + for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2) { + Out << " case "; + writeOperand(SI.getOperand(i)); + Out << ":\n"; + BasicBlock *Succ = cast<BasicBlock>(SI.getOperand(i+1)); + printPHICopiesForSuccessor (SI.getParent(), Succ, 2); + printBranchToBlock(SI.getParent(), Succ, 2); + if (Function::iterator(Succ) == next(Function::iterator(SI.getParent()))) + Out << " break;\n"; + } + Out << " }\n"; +} + +void CWriter::visitUnreachableInst(UnreachableInst &I) { + Out << " /*UNREACHABLE*/;\n"; +} + +bool CWriter::isGotoCodeNecessary(BasicBlock 
*From, BasicBlock *To) { + /// FIXME: This should be reenabled, but loop reordering safe!! + return true; + + if (next(Function::iterator(From)) != Function::iterator(To)) + return true; // Not the direct successor, we need a goto. + + //isa<SwitchInst>(From->getTerminator()) + + if (LI->getLoopFor(From) != LI->getLoopFor(To)) + return true; + return false; +} + +void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock, + BasicBlock *Successor, + unsigned Indent) { + for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) { + PHINode *PN = cast<PHINode>(I); + // Now we have to do the printing. + Value *IV = PN->getIncomingValueForBlock(CurBlock); + if (!isa<UndefValue>(IV)) { + Out << std::string(Indent, ' '); + Out << " " << GetValueName(I) << "__PHI_TEMPORARY = "; + writeOperand(IV); + Out << "; /* for PHI node */\n"; + } + } +} + +void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ, + unsigned Indent) { + if (isGotoCodeNecessary(CurBB, Succ)) { + Out << std::string(Indent, ' ') << " goto "; + writeOperand(Succ); + Out << ";\n"; + } +} + +// Branch instruction printing - Avoid printing out a branch to a basic block +// that immediately succeeds the current one. +// +void CWriter::visitBranchInst(BranchInst &I) { + + if (I.isConditional()) { + if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(0))) { + Out << " if ("; + writeOperand(I.getCondition()); + Out << ") {\n"; + + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 2); + printBranchToBlock(I.getParent(), I.getSuccessor(0), 2); + + if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(1))) { + Out << " } else {\n"; + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2); + printBranchToBlock(I.getParent(), I.getSuccessor(1), 2); + } + } else { + // First goto not necessary, assume second one is... 
+ Out << " if (!"; + writeOperand(I.getCondition()); + Out << ") {\n"; + + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2); + printBranchToBlock(I.getParent(), I.getSuccessor(1), 2); + } + + Out << " }\n"; + } else { + printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 0); + printBranchToBlock(I.getParent(), I.getSuccessor(0), 0); + } + Out << "\n"; +} + +// PHI nodes get copied into temporary values at the end of predecessor basic +// blocks. We now need to copy these temporary values into the REAL value for +// the PHI. +void CWriter::visitPHINode(PHINode &I) { + writeOperand(&I); + Out << "__PHI_TEMPORARY"; +} + + +void CWriter::visitBinaryOperator(Instruction &I) { + // binary instructions, shift instructions, setCond instructions. + assert(!isa<PointerType>(I.getType())); + + // We must cast the results of binary operations which might be promoted. + bool needsCast = false; + if ((I.getType() == Type::Int8Ty) || (I.getType() == Type::Int16Ty) + || (I.getType() == Type::FloatTy)) { + needsCast = true; + Out << "(("; + printType(Out, I.getType(), false); + Out << ")("; + } + + // If this is a negation operation, print it out as such. For FP, we don't + // want to print "-0.0 - X". + if (BinaryOperator::isNeg(&I)) { + Out << "-("; + writeOperand(BinaryOperator::getNegArgument(cast<BinaryOperator>(&I))); + Out << ")"; + } else if (I.getOpcode() == Instruction::FRem) { + // Output a call to fmod/fmodf instead of emitting a%b + if (I.getType() == Type::FloatTy) + Out << "fmodf("; + else + Out << "fmod("; + writeOperand(I.getOperand(0)); + Out << ", "; + writeOperand(I.getOperand(1)); + Out << ")"; + } else { + + // Write out the cast of the instruction's value back to the proper type + // if necessary. + bool NeedsClosingParens = writeInstructionCast(I); + + // Certain instructions require the operand to be forced to a specific type + // so we use writeOperandWithCast here instead of writeOperand. 
Similarly + // below for operand 1 + writeOperandWithCast(I.getOperand(0), I.getOpcode()); + + switch (I.getOpcode()) { + case Instruction::Add: Out << " + "; break; + case Instruction::Sub: Out << " - "; break; + case Instruction::Mul: Out << " * "; break; + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: Out << " % "; break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: Out << " / "; break; + case Instruction::And: Out << " & "; break; + case Instruction::Or: Out << " | "; break; + case Instruction::Xor: Out << " ^ "; break; + case Instruction::Shl : Out << " << "; break; + case Instruction::LShr: + case Instruction::AShr: Out << " >> "; break; + default: cerr << "Invalid operator type!" << I; abort(); + } + + writeOperandWithCast(I.getOperand(1), I.getOpcode()); + if (NeedsClosingParens) + Out << "))"; + } + + if (needsCast) { + Out << "))"; + } +} + +void CWriter::visitICmpInst(ICmpInst &I) { + // We must cast the results of icmp which might be promoted. + bool needsCast = false; + + // Write out the cast of the instruction's value back to the proper type + // if necessary. + bool NeedsClosingParens = writeInstructionCast(I); + + // Certain icmp predicate require the operand to be forced to a specific type + // so we use writeOperandWithCast here instead of writeOperand. Similarly + // below for operand 1 + writeOperandWithCast(I.getOperand(0), I.getPredicate()); + + switch (I.getPredicate()) { + case ICmpInst::ICMP_EQ: Out << " == "; break; + case ICmpInst::ICMP_NE: Out << " != "; break; + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SLE: Out << " <= "; break; + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_SGE: Out << " >= "; break; + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SLT: Out << " < "; break; + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_SGT: Out << " > "; break; + default: cerr << "Invalid icmp predicate!" 
<< I; abort(); + } + + writeOperandWithCast(I.getOperand(1), I.getPredicate()); + if (NeedsClosingParens) + Out << "))"; + + if (needsCast) { + Out << "))"; + } +} + +void CWriter::visitFCmpInst(FCmpInst &I) { + if (I.getPredicate() == FCmpInst::FCMP_FALSE) { + Out << "0"; + return; + } + if (I.getPredicate() == FCmpInst::FCMP_TRUE) { + Out << "1"; + return; + } + + const char* op = 0; + switch (I.getPredicate()) { + default: assert(0 && "Illegal FCmp predicate"); + case FCmpInst::FCMP_ORD: op = "ord"; break; + case FCmpInst::FCMP_UNO: op = "uno"; break; + case FCmpInst::FCMP_UEQ: op = "ueq"; break; + case FCmpInst::FCMP_UNE: op = "une"; break; + case FCmpInst::FCMP_ULT: op = "ult"; break; + case FCmpInst::FCMP_ULE: op = "ule"; break; + case FCmpInst::FCMP_UGT: op = "ugt"; break; + case FCmpInst::FCMP_UGE: op = "uge"; break; + case FCmpInst::FCMP_OEQ: op = "oeq"; break; + case FCmpInst::FCMP_ONE: op = "one"; break; + case FCmpInst::FCMP_OLT: op = "olt"; break; + case FCmpInst::FCMP_OLE: op = "ole"; break; + case FCmpInst::FCMP_OGT: op = "ogt"; break; + case FCmpInst::FCMP_OGE: op = "oge"; break; + } + + Out << "llvm_fcmp_" << op << "("; + // Write the first operand + writeOperand(I.getOperand(0)); + Out << ", "; + // Write the second operand + writeOperand(I.getOperand(1)); + Out << ")"; +} + +static const char * getFloatBitCastField(const Type *Ty) { + switch (Ty->getTypeID()) { + default: assert(0 && "Invalid Type"); + case Type::FloatTyID: return "Float"; + case Type::DoubleTyID: return "Double"; + case Type::IntegerTyID: { + unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); + if (NumBits <= 32) + return "Int32"; + else + return "Int64"; + } + } +} + +void CWriter::visitCastInst(CastInst &I) { + const Type *DstTy = I.getType(); + const Type *SrcTy = I.getOperand(0)->getType(); + Out << '('; + if (isFPIntBitCast(I)) { + // These int<->float and long<->double casts need to be handled specially + Out << GetValueName(&I) << "__BITCAST_TEMPORARY." 
+ << getFloatBitCastField(I.getOperand(0)->getType()) << " = "; + writeOperand(I.getOperand(0)); + Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY." + << getFloatBitCastField(I.getType()); + } else { + printCast(I.getOpcode(), SrcTy, DstTy); + if (I.getOpcode() == Instruction::SExt && SrcTy == Type::Int1Ty) { + // Make sure we really get a sext from bool by subtracing the bool from 0 + Out << "0-"; + } + writeOperand(I.getOperand(0)); + if (DstTy == Type::Int1Ty && + (I.getOpcode() == Instruction::Trunc || + I.getOpcode() == Instruction::FPToUI || + I.getOpcode() == Instruction::FPToSI || + I.getOpcode() == Instruction::PtrToInt)) { + // Make sure we really get a trunc to bool by anding the operand with 1 + Out << "&1u"; + } + } + Out << ')'; +} + +void CWriter::visitSelectInst(SelectInst &I) { + Out << "(("; + writeOperand(I.getCondition()); + Out << ") ? ("; + writeOperand(I.getTrueValue()); + Out << ") : ("; + writeOperand(I.getFalseValue()); + Out << "))"; +} + + +void CWriter::lowerIntrinsics(Function &F) { + // This is used to keep track of intrinsics that get generated to a lowered + // function. We must generate the prototypes before the function body which + // will only be expanded on first use (by the loop below). + std::vector<Function*> prototypesToGen; + + // Examine all the instructions in this function to find the intrinsics that + // need to be lowered. 
+ for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) + if (CallInst *CI = dyn_cast<CallInst>(I++)) + if (Function *F = CI->getCalledFunction()) + switch (F->getIntrinsicID()) { + case Intrinsic::not_intrinsic: + case Intrinsic::vastart: + case Intrinsic::vacopy: + case Intrinsic::vaend: + case Intrinsic::returnaddress: + case Intrinsic::frameaddress: + case Intrinsic::setjmp: + case Intrinsic::longjmp: + case Intrinsic::prefetch: + case Intrinsic::dbg_stoppoint: + case Intrinsic::powi_f32: + case Intrinsic::powi_f64: + // We directly implement these intrinsics + break; + default: + // If this is an intrinsic that directly corresponds to a GCC + // builtin, we handle it. + const char *BuiltinName = ""; +#define GET_GCC_BUILTIN_NAME +#include "llvm/Intrinsics.gen" +#undef GET_GCC_BUILTIN_NAME + // If we handle it, don't lower it. + if (BuiltinName[0]) break; + + // All other intrinsic calls we must lower. + Instruction *Before = 0; + if (CI != &BB->front()) + Before = prior(BasicBlock::iterator(CI)); + + IL->LowerIntrinsicCall(CI); + if (Before) { // Move iterator to instruction after call + I = Before; ++I; + } else { + I = BB->begin(); + } + // If the intrinsic got lowered to another call, and that call has + // a definition then we need to make sure its prototype is emitted + // before any calls to it. + if (CallInst *Call = dyn_cast<CallInst>(I)) + if (Function *NewF = Call->getCalledFunction()) + if (!NewF->isDeclaration()) + prototypesToGen.push_back(NewF); + + break; + } + + // We may have collected some prototypes to emit in the loop above. + // Emit them now, before the function that uses them is emitted. But, + // be careful not to emit them twice. 
+ std::vector<Function*>::iterator I = prototypesToGen.begin(); + std::vector<Function*>::iterator E = prototypesToGen.end(); + for ( ; I != E; ++I) { + if (intrinsicPrototypesAlreadyGenerated.insert(*I).second) { + Out << '\n'; + printFunctionSignature(*I, true); + Out << ";\n"; + } + } +} + + +void CWriter::visitCallInst(CallInst &I) { + //check if we have inline asm + if (isInlineAsm(I)) { + visitInlineAsm(I); + return; + } + + bool WroteCallee = false; + + // Handle intrinsic function calls first... + if (Function *F = I.getCalledFunction()) + if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) { + switch (ID) { + default: { + // If this is an intrinsic that directly corresponds to a GCC + // builtin, we emit it here. + const char *BuiltinName = ""; +#define GET_GCC_BUILTIN_NAME +#include "llvm/Intrinsics.gen" +#undef GET_GCC_BUILTIN_NAME + assert(BuiltinName[0] && "Unknown LLVM intrinsic!"); + + Out << BuiltinName; + WroteCallee = true; + break; + } + case Intrinsic::vastart: + Out << "0; "; + + Out << "va_start(*(va_list*)"; + writeOperand(I.getOperand(1)); + Out << ", "; + // Output the last argument to the enclosing function... 
+ if (I.getParent()->getParent()->arg_empty()) { + cerr << "The C backend does not currently support zero " + << "argument varargs functions, such as '" + << I.getParent()->getParent()->getName() << "'!\n"; + abort(); + } + writeOperand(--I.getParent()->getParent()->arg_end()); + Out << ')'; + return; + case Intrinsic::vaend: + if (!isa<ConstantPointerNull>(I.getOperand(1))) { + Out << "0; va_end(*(va_list*)"; + writeOperand(I.getOperand(1)); + Out << ')'; + } else { + Out << "va_end(*(va_list*)0)"; + } + return; + case Intrinsic::vacopy: + Out << "0; "; + Out << "va_copy(*(va_list*)"; + writeOperand(I.getOperand(1)); + Out << ", *(va_list*)"; + writeOperand(I.getOperand(2)); + Out << ')'; + return; + case Intrinsic::returnaddress: + Out << "__builtin_return_address("; + writeOperand(I.getOperand(1)); + Out << ')'; + return; + case Intrinsic::frameaddress: + Out << "__builtin_frame_address("; + writeOperand(I.getOperand(1)); + Out << ')'; + return; + case Intrinsic::powi_f32: + case Intrinsic::powi_f64: + Out << "__builtin_powi("; + writeOperand(I.getOperand(1)); + Out << ", "; + writeOperand(I.getOperand(2)); + Out << ')'; + return; + case Intrinsic::setjmp: + Out << "setjmp(*(jmp_buf*)"; + writeOperand(I.getOperand(1)); + Out << ')'; + return; + case Intrinsic::longjmp: + Out << "longjmp(*(jmp_buf*)"; + writeOperand(I.getOperand(1)); + Out << ", "; + writeOperand(I.getOperand(2)); + Out << ')'; + return; + case Intrinsic::prefetch: + Out << "LLVM_PREFETCH((const void *)"; + writeOperand(I.getOperand(1)); + Out << ", "; + writeOperand(I.getOperand(2)); + Out << ", "; + writeOperand(I.getOperand(3)); + Out << ")"; + return; + case Intrinsic::dbg_stoppoint: { + // If we use writeOperand directly we get a "u" suffix which is rejected + // by gcc. 
+ DbgStopPointInst &SPI = cast<DbgStopPointInst>(I); + + Out << "\n#line " + << SPI.getLine() + << " \"" << SPI.getDirectory() + << SPI.getFileName() << "\"\n"; + return; + } + } + } + + Value *Callee = I.getCalledValue(); + + const PointerType *PTy = cast<PointerType>(Callee->getType()); + const FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); + + // If this is a call to a struct-return function, assign to the first + // parameter instead of passing it to the call. + bool isStructRet = FTy->isStructReturn(); + if (isStructRet) { + Out << "*("; + writeOperand(I.getOperand(1)); + Out << ") = "; + } + + if (I.isTailCall()) Out << " /*tail*/ "; + + if (!WroteCallee) { + // If this is an indirect call to a struct return function, we need to cast + // the pointer. + bool NeedsCast = isStructRet && !isa<Function>(Callee); + + // GCC is a real PITA. It does not permit codegening casts of functions to + // function pointers if they are in a call (it generates a trap instruction + // instead!). We work around this by inserting a cast to void* in between + // the function and the function pointer cast. Unfortunately, we can't just + // form the constant expression here, because the folder will immediately + // nuke it. + // + // Note finally, that this is completely unsafe. ANSI C does not guarantee + // that void* and function pointers have the same size. :( To deal with this + // in the common case, we handle casts where the number of arguments passed + // match exactly. + // + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee)) + if (CE->isCast()) + if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) { + NeedsCast = true; + Callee = RF; + } + + if (NeedsCast) { + // Ok, just cast the pointer type. 
+ Out << "(("; + if (!isStructRet) + printType(Out, I.getCalledValue()->getType()); + else + printStructReturnPointerFunctionType(Out, + cast<PointerType>(I.getCalledValue()->getType())); + Out << ")(void*)"; + } + writeOperand(Callee); + if (NeedsCast) Out << ')'; + } + + Out << '('; + + unsigned NumDeclaredParams = FTy->getNumParams(); + + CallSite::arg_iterator AI = I.op_begin()+1, AE = I.op_end(); + unsigned ArgNo = 0; + if (isStructRet) { // Skip struct return argument. + ++AI; + ++ArgNo; + } + + const ParamAttrsList *Attrs = FTy->getParamAttrs(); + bool PrintedArg = false; + unsigned Idx = 1; + for (; AI != AE; ++AI, ++ArgNo, ++Idx) { + if (PrintedArg) Out << ", "; + if (ArgNo < NumDeclaredParams && + (*AI)->getType() != FTy->getParamType(ArgNo)) { + Out << '('; + printType(Out, FTy->getParamType(ArgNo), + /*isSigned=*/Attrs && Attrs->paramHasAttr(Idx, ParamAttr::SExt)); + Out << ')'; + } + writeOperand(*AI); + PrintedArg = true; + } + Out << ')'; +} + + +//This converts the llvm constraint string to something gcc is expecting. 
+//TODO: work out platform independent constraints and factor those out +// of the per target tables +// handle multiple constraint codes +std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) { + + assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle"); + + const char** table = 0; + + //Grab the translation table from TargetAsmInfo if it exists + if (!TAsm) { + std::string E; + const TargetMachineRegistry::Entry* Match = + TargetMachineRegistry::getClosestStaticTargetForModule(*TheModule, E); + if (Match) { + //Per platform Target Machines don't exist, so create it + // this must be done only once + const TargetMachine* TM = Match->CtorFn(*TheModule, ""); + TAsm = TM->getTargetAsmInfo(); + } + } + if (TAsm) + table = TAsm->getAsmCBE(); + + //Search the translation table if it exists + for (int i = 0; table && table[i]; i += 2) + if (c.Codes[0] == table[i]) + return table[i+1]; + + //default is identity + return c.Codes[0]; +} + +//TODO: import logic from AsmPrinter.cpp +static std::string gccifyAsm(std::string asmstr) { + for (std::string::size_type i = 0; i != asmstr.size(); ++i) + if (asmstr[i] == '\n') + asmstr.replace(i, 1, "\\n"); + else if (asmstr[i] == '\t') + asmstr.replace(i, 1, "\\t"); + else if (asmstr[i] == '$') { + if (asmstr[i + 1] == '{') { + std::string::size_type a = asmstr.find_first_of(':', i + 1); + std::string::size_type b = asmstr.find_first_of('}', i + 1); + std::string n = "%" + + asmstr.substr(a + 1, b - a - 1) + + asmstr.substr(i + 2, a - i - 2); + asmstr.replace(i, b - i + 1, n); + i += n.size() - 1; + } else + asmstr.replace(i, 1, "%"); + } + else if (asmstr[i] == '%')//grr + { asmstr.replace(i, 1, "%%"); ++i;} + + return asmstr; +} + +//TODO: assumptions about what consume arguments from the call are likely wrong +// handle communitivity +void CWriter::visitInlineAsm(CallInst &CI) { + InlineAsm* as = cast<InlineAsm>(CI.getOperand(0)); + std::vector<InlineAsm::ConstraintInfo> Constraints = 
as->ParseConstraints(); + std::vector<std::pair<std::string, Value*> > Input; + std::vector<std::pair<std::string, Value*> > Output; + std::string Clobber; + int count = CI.getType() == Type::VoidTy ? 1 : 0; + for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(), + E = Constraints.end(); I != E; ++I) { + assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); + std::string c = + InterpretASMConstraint(*I); + switch(I->Type) { + default: + assert(0 && "Unknown asm constraint"); + break; + case InlineAsm::isInput: { + if (c.size()) { + Input.push_back(std::make_pair(c, count ? CI.getOperand(count) : &CI)); + ++count; //consume arg + } + break; + } + case InlineAsm::isOutput: { + if (c.size()) { + Output.push_back(std::make_pair("="+((I->isEarlyClobber ? "&" : "")+c), + count ? CI.getOperand(count) : &CI)); + ++count; //consume arg + } + break; + } + case InlineAsm::isClobber: { + if (c.size()) + Clobber += ",\"" + c + "\""; + break; + } + } + } + + //fix up the asm string for gcc + std::string asmstr = gccifyAsm(as->getAsmString()); + + Out << "__asm__ volatile (\"" << asmstr << "\"\n"; + Out << " :"; + for (std::vector<std::pair<std::string, Value*> >::iterator I = Output.begin(), + E = Output.end(); I != E; ++I) { + Out << "\"" << I->first << "\"("; + writeOperandRaw(I->second); + Out << ")"; + if (I + 1 != E) + Out << ","; + } + Out << "\n :"; + for (std::vector<std::pair<std::string, Value*> >::iterator I = Input.begin(), + E = Input.end(); I != E; ++I) { + Out << "\"" << I->first << "\"("; + writeOperandRaw(I->second); + Out << ")"; + if (I + 1 != E) + Out << ","; + } + if (Clobber.size()) + Out << "\n :" << Clobber.substr(1); + Out << ")"; +} + +void CWriter::visitMallocInst(MallocInst &I) { + assert(0 && "lowerallocations pass didn't work!"); +} + +void CWriter::visitAllocaInst(AllocaInst &I) { + Out << '('; + printType(Out, I.getType()); + Out << ") alloca(sizeof("; + printType(Out, I.getType()->getElementType()); + 
Out << ')'; + if (I.isArrayAllocation()) { + Out << " * " ; + writeOperand(I.getOperand(0)); + } + Out << ')'; +} + +void CWriter::visitFreeInst(FreeInst &I) { + assert(0 && "lowerallocations pass didn't work!"); +} + +void CWriter::printIndexingExpression(Value *Ptr, gep_type_iterator I, + gep_type_iterator E) { + bool HasImplicitAddress = false; + // If accessing a global value with no indexing, avoid *(&GV) syndrome + if (isa<GlobalValue>(Ptr)) { + HasImplicitAddress = true; + } else if (isDirectAlloca(Ptr)) { + HasImplicitAddress = true; + } + + if (I == E) { + if (!HasImplicitAddress) + Out << '*'; // Implicit zero first argument: '*x' is equivalent to 'x[0]' + + writeOperandInternal(Ptr); + return; + } + + const Constant *CI = dyn_cast<Constant>(I.getOperand()); + if (HasImplicitAddress && (!CI || !CI->isNullValue())) + Out << "(&"; + + writeOperandInternal(Ptr); + + if (HasImplicitAddress && (!CI || !CI->isNullValue())) { + Out << ')'; + HasImplicitAddress = false; // HIA is only true if we haven't addressed yet + } + + assert(!HasImplicitAddress || (CI && CI->isNullValue()) && + "Can only have implicit address with direct accessing"); + + if (HasImplicitAddress) { + ++I; + } else if (CI && CI->isNullValue()) { + gep_type_iterator TmpI = I; ++TmpI; + + // Print out the -> operator if possible... + if (TmpI != E && isa<StructType>(*TmpI)) { + Out << (HasImplicitAddress ? "." 
: "->"); + Out << "field" << cast<ConstantInt>(TmpI.getOperand())->getZExtValue(); + I = ++TmpI; + } + } + + for (; I != E; ++I) + if (isa<StructType>(*I)) { + Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue(); + } else { + Out << '['; + writeOperand(I.getOperand()); + Out << ']'; + } +} + +void CWriter::visitLoadInst(LoadInst &I) { + Out << '*'; + if (I.isVolatile()) { + Out << "(("; + printType(Out, I.getType(), false, "volatile*"); + Out << ")"; + } + + writeOperand(I.getOperand(0)); + + if (I.isVolatile()) + Out << ')'; +} + +void CWriter::visitStoreInst(StoreInst &I) { + Out << '*'; + if (I.isVolatile()) { + Out << "(("; + printType(Out, I.getOperand(0)->getType(), false, " volatile*"); + Out << ")"; + } + writeOperand(I.getPointerOperand()); + if (I.isVolatile()) Out << ')'; + Out << " = "; + Value *Operand = I.getOperand(0); + Constant *BitMask = 0; + if (const IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType())) + if (!ITy->isPowerOf2ByteWidth()) + // We have a bit width that doesn't match an even power-of-2 byte + // size. 
Consequently we must & the value with the type's bit mask + BitMask = ConstantInt::get(ITy, ITy->getBitMask()); + if (BitMask) + Out << "(("; + writeOperand(Operand); + if (BitMask) { + Out << ") & "; + printConstant(BitMask); + Out << ")"; + } +} + +void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) { + Out << '&'; + printIndexingExpression(I.getPointerOperand(), gep_type_begin(I), + gep_type_end(I)); +} + +void CWriter::visitVAArgInst(VAArgInst &I) { + Out << "va_arg(*(va_list*)"; + writeOperand(I.getOperand(0)); + Out << ", "; + printType(Out, I.getType()); + Out << ");\n "; +} + +//===----------------------------------------------------------------------===// +// External Interface declaration +//===----------------------------------------------------------------------===// + +bool CTargetMachine::addPassesToEmitWholeFile(PassManager &PM, + std::ostream &o, + CodeGenFileType FileType, + bool Fast) { + if (FileType != TargetMachine::AssemblyFile) return true; + + PM.add(createLowerGCPass()); + PM.add(createLowerAllocationsPass(true)); + PM.add(createLowerInvokePass()); + PM.add(createCFGSimplificationPass()); // clean up after lower invoke. + PM.add(new CBackendNameAllUsedStructsAndMergeFunctions()); + PM.add(new CWriter(o)); + return false; +} diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h new file mode 100644 index 0000000..38c738e --- /dev/null +++ b/lib/Target/CBackend/CTargetMachine.h @@ -0,0 +1,41 @@ +//===-- CTargetMachine.h - TargetMachine for the C backend ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the TargetMachine that is used by the C backend. 
+// +//===----------------------------------------------------------------------===// + +#ifndef CTARGETMACHINE_H +#define CTARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" + +namespace llvm { + +struct CTargetMachine : public TargetMachine { + const TargetData DataLayout; // Calculates type size & alignment + + CTargetMachine(const Module &M, const std::string &FS) + : DataLayout(&M) {} + + virtual bool WantsWholeFile() const { return true; } + virtual bool addPassesToEmitWholeFile(PassManager &PM, std::ostream &Out, + CodeGenFileType FileType, bool Fast); + + // This class always works, but shouldn't be the default in most cases. + static unsigned getModuleMatchQuality(const Module &M) { return 1; } + + virtual const TargetData *getTargetData() const { return &DataLayout; } +}; + +} // End llvm namespace + + +#endif diff --git a/lib/Target/CBackend/Makefile b/lib/Target/CBackend/Makefile new file mode 100644 index 0000000..fea2494 --- /dev/null +++ b/lib/Target/CBackend/Makefile @@ -0,0 +1,14 @@ +##===- lib/Target/CBackend/Makefile ------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by the LLVM research group and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMCBackend +include $(LEVEL)/Makefile.common + +CompileCommonOpts += -Wno-format diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt new file mode 100644 index 0000000..cf2a974 --- /dev/null +++ b/lib/Target/CellSPU/README.txt @@ -0,0 +1,10 @@ +//===- README.txt - Notes for improving CellSPU-specific code gen ---------===// + +TODO: +* Check in the actual code. 
+ +===-------------------------------------------------------------------------=== + +Note: The CellSPU work is work-in-progress and "alpha" quality code. No code +has been officially checked into the llvm repo, but this will happen Real Soon, +Real Soon Now. diff --git a/lib/Target/IA64/IA64.h b/lib/Target/IA64/IA64.h new file mode 100644 index 0000000..e5b84e6 --- /dev/null +++ b/lib/Target/IA64/IA64.h @@ -0,0 +1,54 @@ +//===-- IA64.h - Top-level interface for IA64 representation ------*- C++ -*-===// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the IA64 +// target library, as used by the LLVM JIT. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_IA64_H +#define TARGET_IA64_H + +#include <iosfwd> + +namespace llvm { + +class IA64TargetMachine; +class FunctionPass; + +/// createIA64DAGToDAGInstructionSelector - This pass converts an LLVM +/// function into IA64 machine code in a sane, DAG->DAG transform. +/// +FunctionPass *createIA64DAGToDAGInstructionSelector(IA64TargetMachine &TM); + +/// createIA64BundlingPass - This pass adds stop bits and bundles +/// instructions. +/// +FunctionPass *createIA64BundlingPass(IA64TargetMachine &TM); + +/// createIA64CodePrinterPass - Returns a pass that prints the IA64 +/// assembly code for a MachineFunction to the given output stream, +/// using the given target machine description. This should work +/// regardless of whether the function is in SSA form. +/// +FunctionPass *createIA64CodePrinterPass(std::ostream &o, IA64TargetMachine &tm); + +} // End llvm namespace + +// Defines symbolic names for IA64 registers. 
This defines a mapping from +// register name to register number. +// +#include "IA64GenRegisterNames.inc" + +// Defines symbolic names for the IA64 instructions. +// +#include "IA64GenInstrNames.inc" + +#endif + + diff --git a/lib/Target/IA64/IA64.td b/lib/Target/IA64/IA64.td new file mode 100644 index 0000000..2e231d4 --- /dev/null +++ b/lib/Target/IA64/IA64.td @@ -0,0 +1,39 @@ +//===-- IA64.td - Target definition file for Intel IA64 -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel IA64 architecture, +// also known variously as ia64, IA-64, IPF, "the Itanium architecture" etc. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... 
+// +include "../Target.td" + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "IA64RegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "IA64InstrInfo.td" + +def IA64InstrInfo : InstrInfo { } + +def IA64 : Target { + // Our instruction set + let InstructionSet = IA64InstrInfo; + +} + + diff --git a/lib/Target/IA64/IA64AsmPrinter.cpp b/lib/Target/IA64/IA64AsmPrinter.cpp new file mode 100644 index 0000000..d576c4c --- /dev/null +++ b/lib/Target/IA64/IA64AsmPrinter.cpp @@ -0,0 +1,360 @@ +//===-- IA64AsmPrinter.cpp - Print out IA64 LLVM as assembly --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to assembly accepted by the GNU binutils 'gas' +// assembler. The Intel 'ias' and HP-UX 'as' assemblers *may* choke on this +// output, but if so that's a bug I'd like to hear about: please file a bug +// report in bugzilla. FYI, the not too bad 'ias' assembler is bundled with +// the Intel C/C++ compiler for Itanium Linux. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "IA64.h" +#include "IA64TargetMachine.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Mangler.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +namespace { + struct IA64AsmPrinter : public AsmPrinter { + std::set<std::string> ExternalFunctionNames, ExternalObjectNames; + + IA64AsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T) + : AsmPrinter(O, TM, T) { + } + + virtual const char *getPassName() const { + return "IA64 Assembly Printer"; + } + + /// printInstruction - This method is automatically generated by tablegen + /// from the instruction set description. This method returns true if the + /// machine instruction was sufficiently described to print it, otherwise it + /// returns false. + bool printInstruction(const MachineInstr *MI); + + // This method is used by the tablegen'erated instruction printer. + void printOperand(const MachineInstr *MI, unsigned OpNo){ + const MachineOperand &MO = MI->getOperand(OpNo); + if (MO.getType() == MachineOperand::MO_Register) { + assert(MRegisterInfo::isPhysicalRegister(MO.getReg())&&"Not physref??"); + //XXX Bug Workaround: See note in Printer::doInitialization about %. 
+ O << TM.getRegisterInfo()->get(MO.getReg()).Name; + } else { + printOp(MO); + } + } + + void printS8ImmOperand(const MachineInstr *MI, unsigned OpNo) { + int val=(unsigned int)MI->getOperand(OpNo).getImmedValue(); + if(val>=128) val=val-256; // if negative, flip sign + O << val; + } + void printS14ImmOperand(const MachineInstr *MI, unsigned OpNo) { + int val=(unsigned int)MI->getOperand(OpNo).getImmedValue(); + if(val>=8192) val=val-16384; // if negative, flip sign + O << val; + } + void printS22ImmOperand(const MachineInstr *MI, unsigned OpNo) { + int val=(unsigned int)MI->getOperand(OpNo).getImmedValue(); + if(val>=2097152) val=val-4194304; // if negative, flip sign + O << val; + } + void printU64ImmOperand(const MachineInstr *MI, unsigned OpNo) { + O << (uint64_t)MI->getOperand(OpNo).getImmedValue(); + } + void printS64ImmOperand(const MachineInstr *MI, unsigned OpNo) { +// XXX : nasty hack to avoid GPREL22 "relocation truncated to fit" linker +// errors - instead of add rX = @gprel(CPI<whatever>), r1;; we now +// emit movl rX = @gprel(CPI<whatever);; +// add rX = rX, r1; +// this gives us 64 bits instead of 22 (for the add long imm) to play +// with, which shuts up the linker. The problem is that the constant +// pool entries aren't immediates at this stage, so we check here. +// If it's an immediate, print it the old fashioned way. If it's +// not, we print it as a constant pool index. 
+ if(MI->getOperand(OpNo).isImmediate()) { + O << (int64_t)MI->getOperand(OpNo).getImmedValue(); + } else { // this is a constant pool reference: FIXME: assert this + printOp(MI->getOperand(OpNo)); + } + } + + void printGlobalOperand(const MachineInstr *MI, unsigned OpNo) { + printOp(MI->getOperand(OpNo), false); // this is NOT a br.call instruction + } + + void printCallOperand(const MachineInstr *MI, unsigned OpNo) { + printOp(MI->getOperand(OpNo), true); // this is a br.call instruction + } + + std::string getSectionForFunction(const Function &F) const; + + void printMachineInstruction(const MachineInstr *MI); + void printOp(const MachineOperand &MO, bool isBRCALLinsn= false); + bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + }; +} // end of anonymous namespace + + +// Include the auto-generated portion of the assembly writer. +#include "IA64GenAsmWriter.inc" + + +std::string IA64AsmPrinter::getSectionForFunction(const Function &F) const { + // This means "Allocated instruXions in mem, initialized". + return "\n\t.section .text, \"ax\", \"progbits\"\n"; +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +/// +bool IA64AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SetupMachineFunction(MF); + O << "\n\n"; + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + const Function *F = MF.getFunction(); + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + + // Print out labels for the function. + EmitAlignment(5); + O << "\t.global\t" << CurrentFnName << "\n"; + O << "\t.type\t" << CurrentFnName << ", @function\n"; + O << CurrentFnName << ":\n"; + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block if there are any predecessors. 
+ if (I->pred_begin() != I->pred_end()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printMachineInstruction(II); + } + } + + // We didn't modify anything. + return false; +} + +void IA64AsmPrinter::printOp(const MachineOperand &MO, + bool isBRCALLinsn /* = false */) { + const MRegisterInfo &RI = *TM.getRegisterInfo(); + switch (MO.getType()) { + case MachineOperand::MO_Register: + O << RI.get(MO.getReg()).Name; + return; + + case MachineOperand::MO_Immediate: + O << MO.getImmedValue(); + return; + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + case MachineOperand::MO_ConstantPoolIndex: { + O << "@gprel(" << TAI->getPrivateGlobalPrefix() + << "CPI" << getFunctionNumber() << "_" + << MO.getConstantPoolIndex() << ")"; + return; + } + + case MachineOperand::MO_GlobalAddress: { + + // functions need @ltoff(@fptr(fn_name)) form + GlobalValue *GV = MO.getGlobal(); + Function *F = dyn_cast<Function>(GV); + + bool Needfptr=false; // if we're computing an address @ltoff(X), do + // we need to decorate it so it becomes + // @ltoff(@fptr(X)) ? + if (F && !isBRCALLinsn /*&& F->isDeclaration()*/) + Needfptr=true; + + // if this is the target of a call instruction, we should define + // the function somewhere (GNU gas has no problem without this, but + // Intel ias rightly complains of an 'undefined symbol') + + if (F /*&& isBRCALLinsn*/ && F->isDeclaration()) + ExternalFunctionNames.insert(Mang->getValueName(MO.getGlobal())); + else + if (GV->isDeclaration()) // e.g. 
stuff like 'stdin' + ExternalObjectNames.insert(Mang->getValueName(MO.getGlobal())); + + if (!isBRCALLinsn) + O << "@ltoff("; + if (Needfptr) + O << "@fptr("; + O << Mang->getValueName(MO.getGlobal()); + + if (Needfptr && !isBRCALLinsn) + O << "#))"; // close both fptr( and ltoff( + else { + if (Needfptr) + O << "#)"; // close only fptr( + if (!isBRCALLinsn) + O << "#)"; // close only ltoff( + } + + int Offset = MO.getOffset(); + if (Offset > 0) + O << " + " << Offset; + else if (Offset < 0) + O << " - " << -Offset; + return; + } + case MachineOperand::MO_ExternalSymbol: + O << MO.getSymbolName(); + ExternalFunctionNames.insert(MO.getSymbolName()); + return; + default: + O << "<AsmPrinter: unknown operand type: " << MO.getType() << " >"; return; + } +} + +/// printMachineInstruction -- Print out a single IA64 LLVM instruction +/// MI to the current output stream. +/// +void IA64AsmPrinter::printMachineInstruction(const MachineInstr *MI) { + ++EmittedInsts; + + // Call the autogenerated instruction printer routines. + printInstruction(MI); +} + +bool IA64AsmPrinter::doInitialization(Module &M) { + AsmPrinter::doInitialization(M); + + O << "\n.ident \"LLVM-ia64\"\n\n" + << "\t.psr lsb\n" // should be "msb" on HP-UX, for starters + << "\t.radix C\n" + << "\t.psr abi64\n"; // we only support 64 bits for now + return false; +} + +bool IA64AsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + if (I->hasInitializer()) { // External global require no code + // Check to see if this is a special global used by LLVM, if so, emit it. 
+ if (EmitSpecialLLVMGlobal(I)) + continue; + + O << "\n\n"; + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + unsigned Size = TD->getTypeSize(C->getType()); + unsigned Align = TD->getPreferredTypeAlignmentShift(C->getType()); + + if (C->isNullValue() && + (I->hasLinkOnceLinkage() || I->hasInternalLinkage() || + I->hasWeakLinkage() /* FIXME: Verify correct */)) { + SwitchToDataSection(".data", I); + if (I->hasInternalLinkage()) { + O << "\t.lcomm " << name << "#," << TD->getTypeSize(C->getType()) + << "," << (1 << Align); + O << "\n"; + } else { + O << "\t.common " << name << "#," << TD->getTypeSize(C->getType()) + << "," << (1 << Align); + O << "\n"; + } + } else { + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: // FIXME: Verify correct for weak. + // Nonnull linkonce -> weak + O << "\t.weak " << name << "\n"; + O << "\t.section\t.llvm.linkonce.d." << name + << ", \"aw\", \"progbits\"\n"; + SwitchToDataSection("", I); + break; + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. + case GlobalValue::ExternalLinkage: + // If external or appending, declare as a global symbol + O << "\t.global " << name << "\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: + SwitchToDataSection(C->isNullValue() ? 
".bss" : ".data", I); + break; + case GlobalValue::GhostLinkage: + cerr << "GhostLinkage cannot appear in IA64AsmPrinter!\n"; + abort(); + case GlobalValue::DLLImportLinkage: + cerr << "DLLImport linkage is not supported by this target!\n"; + abort(); + case GlobalValue::DLLExportLinkage: + cerr << "DLLExport linkage is not supported by this target!\n"; + abort(); + default: + assert(0 && "Unknown linkage type!"); + } + + EmitAlignment(Align); + O << "\t.type " << name << ",@object\n"; + O << "\t.size " << name << "," << Size << "\n"; + O << name << ":\t\t\t\t// " << *C << "\n"; + EmitGlobalConstant(C); + } + } + + // we print out ".global X \n .type X, @function" for each external function + O << "\n\n// br.call targets referenced (and not defined) above: \n"; + for (std::set<std::string>::iterator i = ExternalFunctionNames.begin(), + e = ExternalFunctionNames.end(); i!=e; ++i) { + O << "\t.global " << *i << "\n\t.type " << *i << ", @function\n"; + } + O << "\n\n"; + + // we print out ".global X \n .type X, @object" for each external object + O << "\n\n// (external) symbols referenced (and not defined) above: \n"; + for (std::set<std::string>::iterator i = ExternalObjectNames.begin(), + e = ExternalObjectNames.end(); i!=e; ++i) { + O << "\t.global " << *i << "\n\t.type " << *i << ", @object\n"; + } + O << "\n\n"; + + AsmPrinter::doFinalization(M); + return false; // success +} + +/// createIA64CodePrinterPass - Returns a pass that prints the IA64 +/// assembly code for a MachineFunction to the given output stream, using +/// the given target machine description. 
+/// +FunctionPass *llvm::createIA64CodePrinterPass(std::ostream &o, + IA64TargetMachine &tm) { + return new IA64AsmPrinter(o, tm, tm.getTargetAsmInfo()); +} + + diff --git a/lib/Target/IA64/IA64Bundling.cpp b/lib/Target/IA64/IA64Bundling.cpp new file mode 100644 index 0000000..6c9fa29 --- /dev/null +++ b/lib/Target/IA64/IA64Bundling.cpp @@ -0,0 +1,118 @@ +//===-- IA64Bundling.cpp - IA-64 instruction bundling pass. ------------ --===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Add stops where required to prevent read-after-write and write-after-write +// dependencies, for both registers and memory addresses. There are exceptions: +// +// - Compare instructions (cmp*, tbit, tnat, fcmp, frcpa) are OK with +// WAW dependencies so long as they all target p0, or are of parallel +// type (.and*/.or*) +// +// FIXME: bundling, for now, is left to the assembler. +// FIXME: this might be an appropriate place to translate between different +// instructions that do the same thing, if this helps bundling. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ia64-codegen" +#include "IA64.h" +#include "IA64InstrInfo.h" +#include "IA64TargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Debug.h" +#include <set> +using namespace llvm; + +STATISTIC(StopBitsAdded, "Number of stop bits added"); + +namespace { + struct IA64BundlingPass : public MachineFunctionPass { + static char ID; + /// Target machine description which we query for reg. names, data + /// layout, etc. 
+ /// + IA64TargetMachine &TM; + + IA64BundlingPass(IA64TargetMachine &tm) + : MachineFunctionPass((intptr_t)&ID), TM(tm) { } + + virtual const char *getPassName() const { + return "IA64 (Itanium) Bundling Pass"; + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); + bool runOnMachineFunction(MachineFunction &F) { + bool Changed = false; + for (MachineFunction::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) + Changed |= runOnMachineBasicBlock(*FI); + return Changed; + } + + // XXX: ugly global, but pending writes can cross basic blocks. Note that + // taken branches end instruction groups. So we only need to worry about + // 'fallthrough' code + std::set<unsigned> PendingRegWrites; + }; + char IA64BundlingPass::ID = 0; +} // end of anonymous namespace + +/// createIA64BundlingPass - Returns a pass that adds STOP (;;) instructions +/// and arranges the result into bundles. +/// +FunctionPass *llvm::createIA64BundlingPass(IA64TargetMachine &tm) { + return new IA64BundlingPass(tm); +} + +/// runOnMachineBasicBlock - add stops and bundle this MBB. +/// +bool IA64BundlingPass::runOnMachineBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { + MachineInstr *CurrentInsn = I++; + std::set<unsigned> CurrentReads, CurrentWrites, OrigWrites; + + for(unsigned i=0; i < CurrentInsn->getNumOperands(); i++) { + MachineOperand &MO=CurrentInsn->getOperand(i); + if(MO.isRegister()) { + if(MO.isUse()) { // TODO: exclude p0 + CurrentReads.insert(MO.getReg()); + } + if(MO.isDef()) { // TODO: exclude p0 + CurrentWrites.insert(MO.getReg()); + OrigWrites.insert(MO.getReg()); // FIXME: use a nondestructive + // set_intersect instead? + } + } + } + + // CurrentReads/CurrentWrites contain info for the current instruction. + // Does it read or write any registers that are pending a write? + // (i.e. 
not separated by a stop) + set_intersect(CurrentReads, PendingRegWrites); + set_intersect(CurrentWrites, PendingRegWrites); + + if(! (CurrentReads.empty() && CurrentWrites.empty()) ) { + // there is a conflict, insert a stop and reset PendingRegWrites + CurrentInsn = BuildMI(MBB, CurrentInsn, + TM.getInstrInfo()->get(IA64::STOP), 0); + PendingRegWrites=OrigWrites; // carry over current writes to next insn + Changed=true; StopBitsAdded++; // update stats + } else { // otherwise, track additional pending writes + set_union(PendingRegWrites, OrigWrites); + } + } // onto the next insn in the MBB + + return Changed; +} + diff --git a/lib/Target/IA64/IA64ISelDAGToDAG.cpp b/lib/Target/IA64/IA64ISelDAGToDAG.cpp new file mode 100644 index 0000000..53b704e --- /dev/null +++ b/lib/Target/IA64/IA64ISelDAGToDAG.cpp @@ -0,0 +1,587 @@ +//===---- IA64ISelDAGToDAG.cpp - IA64 pattern matching inst selector ------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pattern matching instruction selector for IA64, +// converting a legalized dag to an IA64 dag. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ia64-codegen" +#include "IA64.h" +#include "IA64TargetMachine.h" +#include "IA64ISelLowering.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SSARegMap.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Constants.h" +#include "llvm/GlobalValue.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include <queue> +#include <set> +using namespace llvm; + +namespace { + //===--------------------------------------------------------------------===// + /// IA64DAGToDAGISel - IA64 specific code to select IA64 machine + /// instructions for SelectionDAG operations. + /// + class IA64DAGToDAGISel : public SelectionDAGISel { + IA64TargetLowering IA64Lowering; + unsigned GlobalBaseReg; + public: + IA64DAGToDAGISel(IA64TargetMachine &TM) + : SelectionDAGISel(IA64Lowering), IA64Lowering(*TM.getTargetLowering()) {} + + virtual bool runOnFunction(Function &Fn) { + // Make sure we re-emit a set of the global base reg if necessary + GlobalBaseReg = 0; + return SelectionDAGISel::runOnFunction(Fn); + } + + /// getI64Imm - Return a target constant with the specified value, of type + /// i64. + inline SDOperand getI64Imm(uint64_t Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i64); + } + + /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC + /// base register. Return the virtual register that holds this value. + // SDOperand getGlobalBaseReg(); TODO: hmm + + // Select - Convert the specified operand from a target-independent to a + // target-specific node if it hasn't already been changed. 
+ SDNode *Select(SDOperand N); + + SDNode *SelectIntImmediateExpr(SDOperand LHS, SDOperand RHS, + unsigned OCHi, unsigned OCLo, + bool IsArithmetic = false, + bool Negate = false); + SDNode *SelectBitfieldInsert(SDNode *N); + + /// SelectCC - Select a comparison of the specified values with the + /// specified condition code, returning the CR# of the expression. + SDOperand SelectCC(SDOperand LHS, SDOperand RHS, ISD::CondCode CC); + + /// SelectAddr - Given the specified address, return the two operands for a + /// load/store instruction, and return true if it should be an indexed [r+r] + /// operation. + bool SelectAddr(SDOperand Addr, SDOperand &Op1, SDOperand &Op2); + + /// InstructionSelectBasicBlock - This callback is invoked by + /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. + virtual void InstructionSelectBasicBlock(SelectionDAG &DAG); + + virtual const char *getPassName() const { + return "IA64 (Itanium) DAG->DAG Instruction Selector"; + } + +// Include the pieces autogenerated from the target description. +#include "IA64GenDAGISel.inc" + +private: + SDNode *SelectDIV(SDOperand Op); + }; +} + +/// InstructionSelectBasicBlock - This callback is invoked by +/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. +void IA64DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) { + DEBUG(BB->dump()); + + // Select target instructions for the DAG. + DAG.setRoot(SelectRoot(DAG.getRoot())); + DAG.RemoveDeadNodes(); + + // Emit machine code to BB. + ScheduleAndEmitDAG(DAG); +} + +SDNode *IA64DAGToDAGISel::SelectDIV(SDOperand Op) { + SDNode *N = Op.Val; + SDOperand Chain = N->getOperand(0); + SDOperand Tmp1 = N->getOperand(0); + SDOperand Tmp2 = N->getOperand(1); + AddToISelQueue(Chain); + + AddToISelQueue(Tmp1); + AddToISelQueue(Tmp2); + + bool isFP=false; + + if(MVT::isFloatingPoint(Tmp1.getValueType())) + isFP=true; + + bool isModulus=false; // is it a division or a modulus? 
+ bool isSigned=false; + + switch(N->getOpcode()) { + case ISD::FDIV: + case ISD::SDIV: isModulus=false; isSigned=true; break; + case ISD::UDIV: isModulus=false; isSigned=false; break; + case ISD::FREM: + case ISD::SREM: isModulus=true; isSigned=true; break; + case ISD::UREM: isModulus=true; isSigned=false; break; + } + + // TODO: check for integer divides by powers of 2 (or other simple patterns?) + + SDOperand TmpPR, TmpPR2; + SDOperand TmpF1, TmpF2, TmpF3, TmpF4, TmpF5, TmpF6, TmpF7, TmpF8; + SDOperand TmpF9, TmpF10,TmpF11,TmpF12,TmpF13,TmpF14,TmpF15; + SDNode *Result; + + // we'll need copies of F0 and F1 + SDOperand F0 = CurDAG->getRegister(IA64::F0, MVT::f64); + SDOperand F1 = CurDAG->getRegister(IA64::F1, MVT::f64); + + // OK, emit some code: + + if(!isFP) { + // first, load the inputs into FP regs. + TmpF1 = + SDOperand(CurDAG->getTargetNode(IA64::SETFSIG, MVT::f64, Tmp1), 0); + Chain = TmpF1.getValue(1); + TmpF2 = + SDOperand(CurDAG->getTargetNode(IA64::SETFSIG, MVT::f64, Tmp2), 0); + Chain = TmpF2.getValue(1); + + // next, convert the inputs to FP + if(isSigned) { + TmpF3 = + SDOperand(CurDAG->getTargetNode(IA64::FCVTXF, MVT::f64, TmpF1), 0); + Chain = TmpF3.getValue(1); + TmpF4 = + SDOperand(CurDAG->getTargetNode(IA64::FCVTXF, MVT::f64, TmpF2), 0); + Chain = TmpF4.getValue(1); + } else { // is unsigned + TmpF3 = + SDOperand(CurDAG->getTargetNode(IA64::FCVTXUFS1, MVT::f64, TmpF1), 0); + Chain = TmpF3.getValue(1); + TmpF4 = + SDOperand(CurDAG->getTargetNode(IA64::FCVTXUFS1, MVT::f64, TmpF2), 0); + Chain = TmpF4.getValue(1); + } + + } else { // this is an FP divide/remainder, so we 'leak' some temp + // regs and assign TmpF3=Tmp1, TmpF4=Tmp2 + TmpF3=Tmp1; + TmpF4=Tmp2; + } + + // we start by computing an approximate reciprocal (good to 9 bits?) 
+ // note, this instruction writes _both_ TmpF5 (answer) and TmpPR (predicate) + if(isFP) + TmpF5 = SDOperand(CurDAG->getTargetNode(IA64::FRCPAS0, MVT::f64, MVT::i1, + TmpF3, TmpF4), 0); + else + TmpF5 = SDOperand(CurDAG->getTargetNode(IA64::FRCPAS1, MVT::f64, MVT::i1, + TmpF3, TmpF4), 0); + + TmpPR = TmpF5.getValue(1); + Chain = TmpF5.getValue(2); + + SDOperand minusB; + if(isModulus) { // for remainders, it'll be handy to have + // copies of -input_b + minusB = SDOperand(CurDAG->getTargetNode(IA64::SUB, MVT::i64, + CurDAG->getRegister(IA64::r0, MVT::i64), Tmp2), 0); + Chain = minusB.getValue(1); + } + + SDOperand TmpE0, TmpY1, TmpE1, TmpY2; + + SDOperand OpsE0[] = { TmpF4, TmpF5, F1, TmpPR }; + TmpE0 = SDOperand(CurDAG->getTargetNode(IA64::CFNMAS1, MVT::f64, + OpsE0, 4), 0); + Chain = TmpE0.getValue(1); + SDOperand OpsY1[] = { TmpF5, TmpE0, TmpF5, TmpPR }; + TmpY1 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64, + OpsY1, 4), 0); + Chain = TmpY1.getValue(1); + SDOperand OpsE1[] = { TmpE0, TmpE0, F0, TmpPR }; + TmpE1 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64, + OpsE1, 4), 0); + Chain = TmpE1.getValue(1); + SDOperand OpsY2[] = { TmpY1, TmpE1, TmpY1, TmpPR }; + TmpY2 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64, + OpsY2, 4), 0); + Chain = TmpY2.getValue(1); + + if(isFP) { // if this is an FP divide, we finish up here and exit early + if(isModulus) + assert(0 && "Sorry, try another FORTRAN compiler."); + + SDOperand TmpE2, TmpY3, TmpQ0, TmpR0; + + SDOperand OpsE2[] = { TmpE1, TmpE1, F0, TmpPR }; + TmpE2 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64, + OpsE2, 4), 0); + Chain = TmpE2.getValue(1); + SDOperand OpsY3[] = { TmpY2, TmpE2, TmpY2, TmpPR }; + TmpY3 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64, + OpsY3, 4), 0); + Chain = TmpY3.getValue(1); + SDOperand OpsQ0[] = { Tmp1, TmpY3, F0, TmpPR }; + TmpQ0 = + SDOperand(CurDAG->getTargetNode(IA64::CFMADS1, MVT::f64, // double prec! 
+ OpsQ0, 4), 0); + Chain = TmpQ0.getValue(1); + SDOperand OpsR0[] = { Tmp2, TmpQ0, Tmp1, TmpPR }; + TmpR0 = + SDOperand(CurDAG->getTargetNode(IA64::CFNMADS1, MVT::f64, // double prec! + OpsR0, 4), 0); + Chain = TmpR0.getValue(1); + +// we want Result to have the same target register as the frcpa, so +// we two-address hack it. See the comment "for this to work..." on +// page 48 of Intel application note #245415 + SDOperand Ops[] = { TmpF5, TmpY3, TmpR0, TmpQ0, TmpPR }; + Result = CurDAG->getTargetNode(IA64::TCFMADS0, MVT::f64, // d.p. s0 rndg! + Ops, 5); + Chain = SDOperand(Result, 1); + return Result; // XXX: early exit! + } else { // this is *not* an FP divide, so there's a bit left to do: + + SDOperand TmpQ2, TmpR2, TmpQ3, TmpQ; + + SDOperand OpsQ2[] = { TmpF3, TmpY2, F0, TmpPR }; + TmpQ2 = SDOperand(CurDAG->getTargetNode(IA64::CFMAS1, MVT::f64, + OpsQ2, 4), 0); + Chain = TmpQ2.getValue(1); + SDOperand OpsR2[] = { TmpF4, TmpQ2, TmpF3, TmpPR }; + TmpR2 = SDOperand(CurDAG->getTargetNode(IA64::CFNMAS1, MVT::f64, + OpsR2, 4), 0); + Chain = TmpR2.getValue(1); + +// we want TmpQ3 to have the same target register as the frcpa? maybe we +// should two-address hack it. See the comment "for this to work..." on page +// 48 of Intel application note #245415 + SDOperand OpsQ3[] = { TmpF5, TmpR2, TmpY2, TmpQ2, TmpPR }; + TmpQ3 = SDOperand(CurDAG->getTargetNode(IA64::TCFMAS1, MVT::f64, + OpsQ3, 5), 0); + Chain = TmpQ3.getValue(1); + + // STORY: without these two-address instructions (TCFMAS1 and TCFMADS0) + // the FPSWA won't be able to help out in the case of large/tiny + // arguments. Other fun bugs may also appear, e.g. 0/x = x, not 0. 
+ + if(isSigned) + TmpQ = SDOperand(CurDAG->getTargetNode(IA64::FCVTFXTRUNCS1, + MVT::f64, TmpQ3), 0); + else + TmpQ = SDOperand(CurDAG->getTargetNode(IA64::FCVTFXUTRUNCS1, + MVT::f64, TmpQ3), 0); + + Chain = TmpQ.getValue(1); + + if(isModulus) { + SDOperand FPminusB = + SDOperand(CurDAG->getTargetNode(IA64::SETFSIG, MVT::f64, minusB), 0); + Chain = FPminusB.getValue(1); + SDOperand Remainder = + SDOperand(CurDAG->getTargetNode(IA64::XMAL, MVT::f64, + TmpQ, FPminusB, TmpF1), 0); + Chain = Remainder.getValue(1); + Result = CurDAG->getTargetNode(IA64::GETFSIG, MVT::i64, Remainder); + Chain = SDOperand(Result, 1); + } else { // just an integer divide + Result = CurDAG->getTargetNode(IA64::GETFSIG, MVT::i64, TmpQ); + Chain = SDOperand(Result, 1); + } + + return Result; + } // wasn't an FP divide +} + +// Select - Convert the specified operand from a target-independent to a +// target-specific node if it hasn't already been changed. +SDNode *IA64DAGToDAGISel::Select(SDOperand Op) { + SDNode *N = Op.Val; + if (N->getOpcode() >= ISD::BUILTIN_OP_END && + N->getOpcode() < IA64ISD::FIRST_NUMBER) + return NULL; // Already selected. + + switch (N->getOpcode()) { + default: break; + + case IA64ISD::BRCALL: { // XXX: this is also a hack! + SDOperand Chain = N->getOperand(0); + SDOperand InFlag; // Null incoming flag value. 
+ + AddToISelQueue(Chain); + if(N->getNumOperands()==3) { // we have an incoming chain, callee and flag + InFlag = N->getOperand(2); + AddToISelQueue(InFlag); + } + + unsigned CallOpcode; + SDOperand CallOperand; + + // if we can call directly, do so + if (GlobalAddressSDNode *GASD = + dyn_cast<GlobalAddressSDNode>(N->getOperand(1))) { + CallOpcode = IA64::BRCALL_IPREL_GA; + CallOperand = CurDAG->getTargetGlobalAddress(GASD->getGlobal(), MVT::i64); + } else if (isa<ExternalSymbolSDNode>(N->getOperand(1))) { + // FIXME: we currently NEED this case for correctness, to avoid + // "non-pic code with imm reloc.n against dynamic symbol" errors + CallOpcode = IA64::BRCALL_IPREL_ES; + CallOperand = N->getOperand(1); + } else { + // otherwise we need to load the function descriptor, + // load the branch target (function)'s entry point and GP, + // branch (call) then restore the GP + SDOperand FnDescriptor = N->getOperand(1); + AddToISelQueue(FnDescriptor); + + // load the branch target's entry point [mem] and + // GP value [mem+8] + SDOperand targetEntryPoint= + SDOperand(CurDAG->getTargetNode(IA64::LD8, MVT::i64, FnDescriptor), 0); + Chain = targetEntryPoint.getValue(1); + SDOperand targetGPAddr= + SDOperand(CurDAG->getTargetNode(IA64::ADDS, MVT::i64, + FnDescriptor, + CurDAG->getConstant(8, MVT::i64)), 0); + Chain = targetGPAddr.getValue(1); + SDOperand targetGP = + SDOperand(CurDAG->getTargetNode(IA64::LD8, MVT::i64, targetGPAddr), 0); + Chain = targetGP.getValue(1); + + Chain = CurDAG->getCopyToReg(Chain, IA64::r1, targetGP, InFlag); + InFlag = Chain.getValue(1); + Chain = CurDAG->getCopyToReg(Chain, IA64::B6, targetEntryPoint, InFlag); // FLAG these? 
+ InFlag = Chain.getValue(1); + + CallOperand = CurDAG->getRegister(IA64::B6, MVT::i64); + CallOpcode = IA64::BRCALL_INDIRECT; + } + + // Finally, once everything is setup, emit the call itself + if(InFlag.Val) + Chain = SDOperand(CurDAG->getTargetNode(CallOpcode, MVT::Other, MVT::Flag, + CallOperand, InFlag), 0); + else // there might be no arguments + Chain = SDOperand(CurDAG->getTargetNode(CallOpcode, MVT::Other, MVT::Flag, + CallOperand, Chain), 0); + InFlag = Chain.getValue(1); + + std::vector<SDOperand> CallResults; + + CallResults.push_back(Chain); + CallResults.push_back(InFlag); + + for (unsigned i = 0, e = CallResults.size(); i != e; ++i) + ReplaceUses(Op.getValue(i), CallResults[i]); + return NULL; + } + + case IA64ISD::GETFD: { + SDOperand Input = N->getOperand(0); + AddToISelQueue(Input); + return CurDAG->getTargetNode(IA64::GETFD, MVT::i64, Input); + } + + case ISD::FDIV: + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + return SelectDIV(Op); + + case ISD::TargetConstantFP: { + SDOperand Chain = CurDAG->getEntryNode(); // this is a constant, so.. 
+ + SDOperand V; + if (cast<ConstantFPSDNode>(N)->isExactlyValue(+0.0)) { + V = CurDAG->getCopyFromReg(Chain, IA64::F0, MVT::f64); + } else if (cast<ConstantFPSDNode>(N)->isExactlyValue(+1.0)) { + V = CurDAG->getCopyFromReg(Chain, IA64::F1, MVT::f64); + } else + assert(0 && "Unexpected FP constant!"); + + ReplaceUses(SDOperand(N, 0), V); + return 0; + } + + case ISD::FrameIndex: { // TODO: reduce creepyness + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + if (N->hasOneUse()) + return CurDAG->SelectNodeTo(N, IA64::MOV, MVT::i64, + CurDAG->getTargetFrameIndex(FI, MVT::i64)); + else + return CurDAG->getTargetNode(IA64::MOV, MVT::i64, + CurDAG->getTargetFrameIndex(FI, MVT::i64)); + } + + case ISD::ConstantPool: { // TODO: nuke the constant pool + // (ia64 doesn't need one) + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N); + Constant *C = CP->getConstVal(); + SDOperand CPI = CurDAG->getTargetConstantPool(C, MVT::i64, + CP->getAlignment()); + return CurDAG->getTargetNode(IA64::ADDL_GA, MVT::i64, // ? + CurDAG->getRegister(IA64::r1, MVT::i64), CPI); + } + + case ISD::GlobalAddress: { + GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal(); + SDOperand GA = CurDAG->getTargetGlobalAddress(GV, MVT::i64); + SDOperand Tmp = + SDOperand(CurDAG->getTargetNode(IA64::ADDL_GA, MVT::i64, + CurDAG->getRegister(IA64::r1, + MVT::i64), GA), 0); + return CurDAG->getTargetNode(IA64::LD8, MVT::i64, Tmp); + } + +/* XXX + case ISD::ExternalSymbol: { + SDOperand EA = CurDAG->getTargetExternalSymbol( + cast<ExternalSymbolSDNode>(N)->getSymbol(), + MVT::i64); + SDOperand Tmp = CurDAG->getTargetNode(IA64::ADDL_EA, MVT::i64, + CurDAG->getRegister(IA64::r1, + MVT::i64), + EA); + return CurDAG->getTargetNode(IA64::LD8, MVT::i64, Tmp); + } +*/ + + case ISD::LOAD: { // FIXME: load -1, not 1, for bools? 
+ LoadSDNode *LD = cast<LoadSDNode>(N); + SDOperand Chain = LD->getChain(); + SDOperand Address = LD->getBasePtr(); + AddToISelQueue(Chain); + AddToISelQueue(Address); + + MVT::ValueType TypeBeingLoaded = LD->getLoadedVT(); + unsigned Opc; + switch (TypeBeingLoaded) { + default: +#ifndef NDEBUG + N->dump(CurDAG); +#endif + assert(0 && "Cannot load this type!"); + case MVT::i1: { // this is a bool + Opc = IA64::LD1; // first we load a byte, then compare for != 0 + if(N->getValueType(0) == MVT::i1) { // XXX: early exit! + return CurDAG->SelectNodeTo(N, IA64::CMPNE, MVT::i1, MVT::Other, + SDOperand(CurDAG->getTargetNode(Opc, MVT::i64, Address), 0), + CurDAG->getRegister(IA64::r0, MVT::i64), + Chain); + } + /* otherwise, we want to load a bool into something bigger: LD1 + will do that for us, so we just fall through */ + } + case MVT::i8: Opc = IA64::LD1; break; + case MVT::i16: Opc = IA64::LD2; break; + case MVT::i32: Opc = IA64::LD4; break; + case MVT::i64: Opc = IA64::LD8; break; + + case MVT::f32: Opc = IA64::LDF4; break; + case MVT::f64: Opc = IA64::LDF8; break; + } + + // TODO: comment this + return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), MVT::Other, + Address, Chain); + } + + case ISD::STORE: { + StoreSDNode *ST = cast<StoreSDNode>(N); + SDOperand Address = ST->getBasePtr(); + SDOperand Chain = ST->getChain(); + AddToISelQueue(Address); + AddToISelQueue(Chain); + + unsigned Opc; + if (ISD::isNON_TRUNCStore(N)) { + switch (N->getOperand(1).getValueType()) { + default: assert(0 && "unknown type in store"); + case MVT::i1: { // this is a bool + Opc = IA64::ST1; // we store either 0 or 1 as a byte + // first load zero! 
+ SDOperand Initial = CurDAG->getCopyFromReg(Chain, IA64::r0, MVT::i64); + Chain = Initial.getValue(1); + // then load 1 into the same reg iff the predicate to store is 1 + SDOperand Tmp = ST->getValue(); + AddToISelQueue(Tmp); + Tmp = + SDOperand(CurDAG->getTargetNode(IA64::TPCADDS, MVT::i64, Initial, + CurDAG->getTargetConstant(1, MVT::i64), + Tmp), 0); + return CurDAG->SelectNodeTo(N, Opc, MVT::Other, Address, Tmp, Chain); + } + case MVT::i64: Opc = IA64::ST8; break; + case MVT::f64: Opc = IA64::STF8; break; + } + } else { // Truncating store + switch(ST->getStoredVT()) { + default: assert(0 && "unknown type in truncstore"); + case MVT::i8: Opc = IA64::ST1; break; + case MVT::i16: Opc = IA64::ST2; break; + case MVT::i32: Opc = IA64::ST4; break; + case MVT::f32: Opc = IA64::STF4; break; + } + } + + SDOperand N1 = N->getOperand(1); + SDOperand N2 = N->getOperand(2); + AddToISelQueue(N1); + AddToISelQueue(N2); + return CurDAG->SelectNodeTo(N, Opc, MVT::Other, N2, N1, Chain); + } + + case ISD::BRCOND: { + SDOperand Chain = N->getOperand(0); + SDOperand CC = N->getOperand(1); + AddToISelQueue(Chain); + AddToISelQueue(CC); + MachineBasicBlock *Dest = + cast<BasicBlockSDNode>(N->getOperand(2))->getBasicBlock(); + //FIXME - we do NOT need long branches all the time + return CurDAG->SelectNodeTo(N, IA64::BRLCOND_NOTCALL, MVT::Other, CC, + CurDAG->getBasicBlock(Dest), Chain); + } + + case ISD::CALLSEQ_START: + case ISD::CALLSEQ_END: { + int64_t Amt = cast<ConstantSDNode>(N->getOperand(1))->getValue(); + unsigned Opc = N->getOpcode() == ISD::CALLSEQ_START ? + IA64::ADJUSTCALLSTACKDOWN : IA64::ADJUSTCALLSTACKUP; + SDOperand N0 = N->getOperand(0); + AddToISelQueue(N0); + return CurDAG->SelectNodeTo(N, Opc, MVT::Other, getI64Imm(Amt), N0); + } + + case ISD::BR: + // FIXME: we don't need long branches all the time! 
+    SDOperand N0 = N->getOperand(0);
+    AddToISelQueue(N0);
+    return CurDAG->SelectNodeTo(N, IA64::BRL_NOTCALL, MVT::Other,
+                                N->getOperand(1), N0);
+  }
+
+  // No custom case matched above: hand the node to the TableGen-generated
+  // matcher.
+  return SelectCode(Op);
+}
+
+
+/// createIA64DAGToDAGInstructionSelector - This pass converts a legalized DAG
+/// into an IA64-specific DAG, ready for instruction scheduling.
+///
+FunctionPass
+*llvm::createIA64DAGToDAGInstructionSelector(IA64TargetMachine &TM) {
+  return new IA64DAGToDAGISel(TM);
+}
+
diff --git a/lib/Target/IA64/IA64ISelLowering.cpp b/lib/Target/IA64/IA64ISelLowering.cpp
new file mode 100644
index 0000000..0237a9a
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelLowering.cpp
@@ -0,0 +1,602 @@
+//===-- IA64ISelLowering.cpp - IA64 DAG Lowering Implementation -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IA64ISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64ISelLowering.h"
+#include "IA64MachineFunctionInfo.h"
+#include "IA64TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+using namespace llvm;
+
+// IA64TargetLowering - record which register classes are legal on IA64 and
+// which load/operation kinds must be promoted, expanded or custom-lowered.
+IA64TargetLowering::IA64TargetLowering(TargetMachine &TM)
+  : TargetLowering(TM) {
+
+      // register class for general registers
+      addRegisterClass(MVT::i64, IA64::GRRegisterClass);
+
+      // register class for FP registers
+      addRegisterClass(MVT::f64, IA64::FPRegisterClass);
+
+      // register class for predicate registers
+      addRegisterClass(MVT::i1, IA64::PRRegisterClass);
+
+      setLoadXAction(ISD::EXTLOAD  , MVT::i1  , Promote);
+
+      setLoadXAction(ISD::ZEXTLOAD , MVT::i1  , Expand);
+
+      setLoadXAction(ISD::SEXTLOAD , MVT::i1  , Expand);
+      setLoadXAction(ISD::SEXTLOAD , MVT::i8  , Expand);
+      setLoadXAction(ISD::SEXTLOAD , MVT::i16 , Expand);
+      setLoadXAction(ISD::SEXTLOAD , MVT::i32 , Expand);
+
+      setOperationAction(ISD::BRIND           , MVT::Other, Expand);
+      setOperationAction(ISD::BR_JT           , MVT::Other, Expand);
+      setOperationAction(ISD::BR_CC           , MVT::Other, Expand);
+      setOperationAction(ISD::FP_ROUND_INREG  , MVT::f32  , Expand);
+
+      // ia64 uses SELECT not SELECT_CC
+      setOperationAction(ISD::SELECT_CC       , MVT::Other, Expand);
+
+      // We need to handle ISD::RET for void functions ourselves,
+      // so we get a chance to restore ar.pfs before adding a
+      // br.ret insn
+      setOperationAction(ISD::RET, MVT::Other, Custom);
+
+      setSetCCResultType(MVT::i1);
+      setShiftAmountType(MVT::i64);
+
+      setOperationAction(ISD::FREM            , MVT::f32  , Expand);
+      setOperationAction(ISD::FREM            , MVT::f64  , Expand);
+
+      // NOTE(review): UREM on f32/f64 looks suspicious -- FREM above already
+      // covers FP remainder; confirm these were not meant to be integer types.
+      setOperationAction(ISD::UREM            , MVT::f32  , Expand);
+      setOperationAction(ISD::UREM            , MVT::f64  , Expand);
+
+
+      setOperationAction(ISD::MEMMOVE         , MVT::Other, Expand);
+      setOperationAction(ISD::MEMSET          , MVT::Other, Expand);
+      setOperationAction(ISD::MEMCPY          , MVT::Other, Expand);
+
+      setOperationAction(ISD::SINT_TO_FP      , MVT::i1   , Promote);
+      setOperationAction(ISD::UINT_TO_FP      , MVT::i1   , Promote);
+
+      // We don't support sin/cos/sqrt
+      setOperationAction(ISD::FSIN , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS , MVT::f64, Expand);
+      setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+      setOperationAction(ISD::FSIN , MVT::f32, Expand);
+      setOperationAction(ISD::FCOS , MVT::f32, Expand);
+      setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+      // FIXME: IA64 supports fcopysign natively!
+      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+      setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+      // We don't have line number support yet.
+      setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+      setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+      setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+      //IA64 has these, but they are not implemented
+      setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+      setOperationAction(ISD::CTLZ , MVT::i64  , Expand);
+      setOperationAction(ISD::ROTL , MVT::i64  , Expand);
+      setOperationAction(ISD::ROTR , MVT::i64  , Expand);
+      setOperationAction(ISD::BSWAP, MVT::i64  , Expand); // mux @rev
+
+      // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+      setOperationAction(ISD::VAARG           , MVT::Other, Custom);
+      setOperationAction(ISD::VASTART         , MVT::Other, Custom);
+
+      // Use the default implementation.
+      setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
+      setOperationAction(ISD::VAEND           , MVT::Other, Expand);
+      setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+      setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+      setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+      // Thread Local Storage
+      setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+      setStackPointerRegisterToSaveRestore(IA64::r12);
+
+      setJumpBufSize(704); // on ia64-linux, jmp_bufs are 704 bytes..
+      setJumpBufAlignment(16); // ...and must be 16-byte aligned
+
+      computeRegisterProperties();
+
+      setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+      // +0.0 and +1.0 live permanently in f0/f1 on IA64, so they are "legal"
+      // immediates (see the ISD::TargetConstantFP case in the selector).
+      addLegalFPImmediate(+0.0);
+      addLegalFPImmediate(+1.0);
+}
+
+// getTargetNodeName - printable names for the IA64-specific DAG nodes
+// declared in IA64ISelLowering.h (debug dumps only).
+const char *IA64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case IA64ISD::GETFD:    return "IA64ISD::GETFD";
+  case IA64ISD::BRCALL:   return "IA64ISD::BRCALL";
+  case IA64ISD::RET_FLAG: return "IA64ISD::RET_FLAG";
+  }
+}
+
+
+std::vector<SDOperand>
+IA64TargetLowering::LowerArguments(Function &F, SelectionDAG &DAG) {
+  std::vector<SDOperand> ArgValues;
+  //
+  // add beautiful description of IA64 stack frame format
+  // here (from intel 24535803.pdf most likely)
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  GP = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+  SP = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+  RP = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+
+  MachineBasicBlock& BB = MF.front();
+
+  // incoming integer args arrive in the stacked registers r32-r39,
+  // FP args in f8-f15 (see the Itanium software conventions)
+  unsigned args_int[] = {IA64::r32, IA64::r33, IA64::r34, IA64::r35,
+                         IA64::r36, IA64::r37, IA64::r38, IA64::r39};
+
+  unsigned args_FP[] = {IA64::F8, IA64::F9, IA64::F10, IA64::F11,
+                        IA64::F12,IA64::F13,IA64::F14, IA64::F15};
+
+  unsigned argVreg[8];
+  unsigned argPreg[8];
+  unsigned argOpc[8];
+
+  unsigned
used_FPArgs = 0; // how many FP args have been used so far?
+
+  unsigned ArgOffset = 0;
+  int count = 0;
+
+  // Walk the formal arguments: the first 8 arrive in registers (ints in
+  // r32-r39, FP in f8-f15), everything after that is loaded from the
+  // caller's frame at offset 16 + 8*(n-8).
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+    {
+      SDOperand newroot, argt;
+      if(count < 8) { // need to fix this logic? maybe.
+
+        switch (getValueType(I->getType())) {
+          default:
+            assert(0 && "ERROR in LowerArgs: can't lower this type of arg.\n");
+          case MVT::f32:
+            // fixme? (well, will need to for weird FP structy stuff,
+            // see intel ABI docs)
+          case MVT::f64:
+//XXX            BuildMI(&BB, IA64::IDEF, 0, args_FP[used_FPArgs]);
+            MF.addLiveIn(args_FP[used_FPArgs]); // mark this reg as liveIn
+            // floating point args go into f8..f15 as-needed, the increment
+            argVreg[count] =                     // is below..:
+              MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::f64));
+            // FP args go into f8..f15 as needed: (hence the ++)
+            argPreg[count] = args_FP[used_FPArgs++];
+            argOpc[count] = IA64::FMOV;
+            argt = newroot = DAG.getCopyFromReg(DAG.getRoot(), argVreg[count],
+                                                MVT::f64);
+            if (I->getType() == Type::FloatTy)
+              argt = DAG.getNode(ISD::FP_ROUND, MVT::f32, argt);
+            break;
+          case MVT::i1: // NOTE: as far as C abi stuff goes,
+                        // bools are just boring old ints
+          case MVT::i8:
+          case MVT::i16:
+          case MVT::i32:
+          case MVT::i64:
+//XXX            BuildMI(&BB, IA64::IDEF, 0, args_int[count]);
+            MF.addLiveIn(args_int[count]); // mark this register as liveIn
+            argVreg[count] =
+              MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+            argPreg[count] = args_int[count];
+            argOpc[count] = IA64::MOV;
+            argt = newroot =
+              DAG.getCopyFromReg(DAG.getRoot(), argVreg[count], MVT::i64);
+            if ( getValueType(I->getType()) != MVT::i64)
+              argt = DAG.getNode(ISD::TRUNCATE, getValueType(I->getType()),
+                  newroot);
+            break;
+        }
+      } else { // more than 8 args go into the frame
+        // Create the frame index object for this incoming parameter...
+        ArgOffset = 16 + 8 * (count - 8);
+        int FI = MFI->CreateFixedObject(8, ArgOffset);
+
+        // Create the SelectionDAG nodes corresponding to a load
+        //from this parameter
+        SDOperand FIN = DAG.getFrameIndex(FI, MVT::i64);
+        argt = newroot = DAG.getLoad(getValueType(I->getType()),
+                                     DAG.getEntryNode(), FIN, NULL, 0);
+      }
+      ++count;
+      DAG.setRoot(newroot.getValue(1));
+      ArgValues.push_back(argt);
+    }
+
+
+  // Create a vreg to hold the output of (what will become)
+  // the "alloc" instruction
+  VirtGPR = MF.getSSARegMap()->createVirtualRegister(getRegClassFor(MVT::i64));
+  BuildMI(&BB, TII->get(IA64::PSEUDO_ALLOC), VirtGPR);
+  // we create a PSEUDO_ALLOC (pseudo)instruction for now
+/*
+  BuildMI(&BB, IA64::IDEF, 0, IA64::r1);
+
+  // hmm:
+  BuildMI(&BB, IA64::IDEF, 0, IA64::r12);
+  BuildMI(&BB, IA64::IDEF, 0, IA64::rp);
+  // ..hmm.
+
+  BuildMI(&BB, IA64::MOV, 1, GP).addReg(IA64::r1);
+
+  // hmm:
+  BuildMI(&BB, IA64::MOV, 1, SP).addReg(IA64::r12);
+  BuildMI(&BB, IA64::MOV, 1, RP).addReg(IA64::rp);
+  // ..hmm.
+*/
+
+  unsigned tempOffset=0;
+
+  // if this is a varargs function, we simply lower llvm.va_start by
+  // pointing to the first entry
+  if(F.isVarArg()) {
+    tempOffset=0;
+    VarArgsFrameIndex = MFI->CreateFixedObject(8, tempOffset);
+  }
+
+  // here we actually do the moving of args, and store them to the stack
+  // too if this is a varargs function:
+  for (int i = 0; i < count && i < 8; ++i) {
+    BuildMI(&BB, TII->get(argOpc[i]), argVreg[i]).addReg(argPreg[i]);
+    if(F.isVarArg()) {
+      // if this is a varargs function, we copy the input registers to the stack
+      int FI = MFI->CreateFixedObject(8, tempOffset);
+      tempOffset+=8; //XXX: is it safe to use r22 like this?
+      BuildMI(&BB, TII->get(IA64::MOV), IA64::r22).addFrameIndex(FI);
+      // FIXME: we should use st8.spill here, one day
+      BuildMI(&BB, TII->get(IA64::ST8), IA64::r22).addReg(argPreg[i]);
+    }
+  }
+
+  // Finally, inform the code generator which regs we return values in.
+  // (see the ISD::RET: case in the instruction selector)
+  // Integer results come back in r8, FP results in f8.
+  switch (getValueType(F.getReturnType())) {
+  default: assert(0 && "i have no idea where to return this type!");
+  case MVT::isVoid: break;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+    MF.addLiveOut(IA64::r8);
+    break;
+  case MVT::f32:
+  case MVT::f64:
+    MF.addLiveOut(IA64::F8);
+    break;
+  }
+
+  return ArgValues;
+}
+
+std::pair<SDOperand, SDOperand>
+IA64TargetLowering::LowerCallTo(SDOperand Chain,
+                                const Type *RetTy, bool RetTyIsSigned,
+                                bool isVarArg, unsigned CallingConv,
+                                bool isTailCall, SDOperand Callee,
+                                ArgListTy &Args, SelectionDAG &DAG) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // 16 bytes of scratch area are always reserved; each argument past the
+  // first 8 consumes another 8 bytes of outgoing stack space.
+  unsigned NumBytes = 16;
+  unsigned outRegsUsed = 0;
+
+  if (Args.size() > 8) {
+    NumBytes += (Args.size() - 8) * 8;
+    outRegsUsed = 8;
+  } else {
+    outRegsUsed = Args.size();
+  }
+
+  // FIXME? this WILL fail if we ever try to pass around an arg that
+  // consumes more than a single output slot (a 'real' double, int128
+  // some sort of aggregate etc.), as we'll underestimate how many 'outX'
+  // registers we use. Hopefully, the assembler will notice.
+  // accumulate the widest out-register count seen so far for this function
+  MF.getInfo<IA64FunctionInfo>()->outRegsUsed=
+    std::max(outRegsUsed, MF.getInfo<IA64FunctionInfo>()->outRegsUsed);
+
+  // keep stack frame 16-byte aligned
+  //assert(NumBytes==((NumBytes+15) & ~15) &&
+  //       "stack frame not 16-byte aligned!");
+  NumBytes = (NumBytes+15) & ~15;
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+  SDOperand StackPtr;
+  std::vector<SDOperand> Stores;
+  std::vector<SDOperand> Converts;
+  std::vector<SDOperand> RegValuesToPass;
+  unsigned ArgOffset = 16;
+
+  // Classify each actual argument: the first 8 go in registers
+  // (RegValuesToPass), the rest are stored to the outgoing argument area.
+  for (unsigned i = 0, e = Args.size(); i != e; ++i)
+    {
+      SDOperand Val = Args[i].Node;
+      MVT::ValueType ObjectVT = Val.getValueType();
+      SDOperand ValToStore(0, 0), ValToConvert(0, 0);
+      unsigned ObjSize=8;
+      switch (ObjectVT) {
+      default: assert(0 && "unexpected argument type!");
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32: {
+        //promote to 64-bits, sign/zero extending based on type
+        //of the argument
+        ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+        if (Args[i].isSExt)
+          ExtendKind = ISD::SIGN_EXTEND;
+        else if (Args[i].isZExt)
+          ExtendKind = ISD::ZERO_EXTEND;
+        Val = DAG.getNode(ExtendKind, MVT::i64, Val);
+        // XXX: fall through
+      }
+      case MVT::i64:
+        //ObjSize = 8;
+        if(RegValuesToPass.size() >= 8) {
+          ValToStore = Val;
+        } else {
+          RegValuesToPass.push_back(Val);
+        }
+        break;
+      case MVT::f32:
+        //promote to 64-bits
+        Val = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Val);
+        // XXX: fall through
+      case MVT::f64:
+        if(RegValuesToPass.size() >= 8) {
+          ValToStore = Val;
+        } else {
+          RegValuesToPass.push_back(Val);
+          if(1 /* TODO: if(calling external or varadic function)*/ ) {
+            ValToConvert = Val; // additionally pass this FP value as an int
+          }
+        }
+        break;
+      }
+
+      if(ValToStore.Val) {
+        if(!StackPtr.Val) {
+          StackPtr = DAG.getRegister(IA64::r12, MVT::i64);
+        }
+        SDOperand PtrOff = DAG.getConstant(ArgOffset, getPointerTy());
+        PtrOff = DAG.getNode(ISD::ADD, MVT::i64, StackPtr, PtrOff);
+        Stores.push_back(DAG.getStore(Chain,
ValToStore, PtrOff, NULL, 0));
+        ArgOffset += ObjSize;
+      }
+
+      if(ValToConvert.Val) {
+        Converts.push_back(DAG.getNode(IA64ISD::GETFD, MVT::i64, ValToConvert));
+      }
+    }
+
+  // Emit all stores, make sure they occur before any copies into physregs.
+  if (!Stores.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Stores[0],Stores.size());
+
+  static const unsigned IntArgRegs[] = {
+    IA64::out0, IA64::out1, IA64::out2, IA64::out3,
+    IA64::out4, IA64::out5, IA64::out6, IA64::out7
+  };
+
+  static const unsigned FPArgRegs[] = {
+    IA64::F8,  IA64::F9,  IA64::F10, IA64::F11,
+    IA64::F12, IA64::F13, IA64::F14, IA64::F15
+  };
+
+  SDOperand InFlag;
+
+  // save the current GP, SP and RP : FIXME: do we need to do all 3 always?
+  SDOperand GPBeforeCall = DAG.getCopyFromReg(Chain, IA64::r1, MVT::i64, InFlag);
+  Chain = GPBeforeCall.getValue(1);
+  InFlag = Chain.getValue(2);
+  SDOperand SPBeforeCall = DAG.getCopyFromReg(Chain, IA64::r12, MVT::i64, InFlag);
+  Chain = SPBeforeCall.getValue(1);
+  InFlag = Chain.getValue(2);
+  SDOperand RPBeforeCall = DAG.getCopyFromReg(Chain, IA64::rp, MVT::i64, InFlag);
+  Chain = RPBeforeCall.getValue(1);
+  InFlag = Chain.getValue(2);
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing integer args into regs out[0-7]
+  // mapped 1:1 and the FP args into regs F8-F15 "lazily"
+  // TODO: for performance, we should only copy FP args into int regs when we
+  // know this is required (i.e.
for varardic or external (unknown) functions)
+
+  // first to the FP->(integer representation) conversions, these are
+  // flagged for now, but shouldn't have to be (TODO)
+  unsigned seenConverts = 0;
+  for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+    if(MVT::isFloatingPoint(RegValuesToPass[i].getValueType())) {
+      Chain = DAG.getCopyToReg(Chain, IntArgRegs[i], Converts[seenConverts++],
+                               InFlag);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  // next copy args into the usual places, these are flagged
+  unsigned usedFPArgs = 0;
+  for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain,
+      MVT::isInteger(RegValuesToPass[i].getValueType()) ?
+        IntArgRegs[i] : FPArgRegs[usedFPArgs++], RegValuesToPass[i], InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+/*
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i64);
+  }
+*/
+
+  std::vector<MVT::ValueType> NodeTys;
+  std::vector<SDOperand> CallOperands;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+  CallOperands.push_back(Chain);
+  CallOperands.push_back(Callee);
+
+  // emit the call itself
+  if (InFlag.Val)
+    CallOperands.push_back(InFlag);
+  else
+    assert(0 && "this should never happen!\n");
+
+  // to make way for a hack:
+  Chain = DAG.getNode(IA64ISD::BRCALL, NodeTys,
+                      &CallOperands[0], CallOperands.size());
+  InFlag = Chain.getValue(1);
+
+  // restore the GP, SP and RP after the call
+  Chain = DAG.getCopyToReg(Chain, IA64::r1, GPBeforeCall, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, IA64::r12, SPBeforeCall, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, IA64::rp, RPBeforeCall, InFlag);
+  InFlag = Chain.getValue(1);
+
+  std::vector<MVT::ValueType> RetVals;
+  RetVals.push_back(MVT::Other);
+  RetVals.push_back(MVT::Flag);
+
+  // Copy the return value out of r8 (integers) or f8 (FP), per the
+  // liveouts set up in LowerArguments.
+  MVT::ValueType RetTyVT = getValueType(RetTy);
+  SDOperand RetVal;
+  if (RetTyVT != MVT::isVoid) {
+    switch (RetTyVT) {
+    default: assert(0 && "Unknown value type to return!");
+    case MVT::i1: { // bools are just like other integers (returned in r8)
+      // we *could* fall through to the truncate below, but this saves a
+      // few redundant predicate ops
+      SDOperand boolInR8 = DAG.getCopyFromReg(Chain, IA64::r8, MVT::i64,InFlag);
+      InFlag = boolInR8.getValue(2);
+      Chain = boolInR8.getValue(1);
+      SDOperand zeroReg = DAG.getCopyFromReg(Chain, IA64::r0, MVT::i64, InFlag);
+      InFlag = zeroReg.getValue(2);
+      Chain = zeroReg.getValue(1);
+
+      RetVal = DAG.getSetCC(MVT::i1, boolInR8, zeroReg, ISD::SETNE);
+      break;
+    }
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::r8, MVT::i64, InFlag);
+      Chain = RetVal.getValue(1);
+
+      // keep track of whether it is sign or zero extended (todo: bools?)
+/* XXX
+      RetVal = DAG.getNode(RetTy->isSigned() ?
ISD::AssertSext :ISD::AssertZext,
+                           MVT::i64, RetVal, DAG.getValueType(RetTyVT));
+*/
+      RetVal = DAG.getNode(ISD::TRUNCATE, RetTyVT, RetVal);
+      break;
+    case MVT::i64:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::r8, MVT::i64, InFlag);
+      Chain = RetVal.getValue(1);
+      InFlag = RetVal.getValue(2); // XXX dead
+      break;
+    case MVT::f32:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::F8, MVT::f64, InFlag);
+      Chain = RetVal.getValue(1);
+      RetVal = DAG.getNode(ISD::TRUNCATE, MVT::f32, RetVal);
+      break;
+    case MVT::f64:
+      RetVal = DAG.getCopyFromReg(Chain, IA64::F8, MVT::f64, InFlag);
+      Chain = RetVal.getValue(1);
+      InFlag = RetVal.getValue(2); // XXX dead
+      break;
+    }
+  }
+
+  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
+                      DAG.getConstant(NumBytes, getPointerTy()));
+
+  return std::make_pair(RetVal, Chain);
+}
+
+// LowerOperation - custom lowering for the nodes this target marks Custom:
+// RET (so ar.pfs can be restored before br.ret), VAARG and VASTART;
+// RETURNADDR/FRAMEADDR fall through unimplemented, TLS asserts.
+SDOperand IA64TargetLowering::
+LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Should not custom lower this!");
+  case ISD::GlobalTLSAddress:
+    assert(0 && "TLS not implemented for IA64.");
+  case ISD::RET: {
+    SDOperand AR_PFSVal, Copy;
+
+    switch(Op.getNumOperands()) {
+     default:
+      assert(0 && "Do not know how to return this many arguments!");
+      abort();
+    case 1:
+      // "ret void": just restore ar.pfs (saved into VirtGPR by the
+      // prologue's alloc) and return
+      AR_PFSVal = DAG.getCopyFromReg(Op.getOperand(0), VirtGPR, MVT::i64);
+      AR_PFSVal = DAG.getCopyToReg(AR_PFSVal.getValue(1), IA64::AR_PFS,
+                                   AR_PFSVal);
+      return DAG.getNode(IA64ISD::RET_FLAG, MVT::Other, AR_PFSVal);
+    case 3: {
+      // Copy the result into the output register & restore ar.pfs
+      MVT::ValueType ArgVT = Op.getOperand(1).getValueType();
+      unsigned ArgReg = MVT::isInteger(ArgVT) ?
IA64::r8 : IA64::F8;
+
+      AR_PFSVal = DAG.getCopyFromReg(Op.getOperand(0), VirtGPR, MVT::i64);
+      Copy = DAG.getCopyToReg(AR_PFSVal.getValue(1), ArgReg, Op.getOperand(1),
+                              SDOperand());
+      AR_PFSVal = DAG.getCopyToReg(Copy.getValue(0), IA64::AR_PFS, AR_PFSVal,
+                                   Copy.getValue(1));
+      return DAG.getNode(IA64ISD::RET_FLAG, MVT::Other,
+                         AR_PFSVal, AR_PFSVal.getValue(1));
+    }
+    }
+    return SDOperand();
+  }
+  case ISD::VAARG: {
+    MVT::ValueType VT = getPointerTy();
+    SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+    SDOperand VAList = DAG.getLoad(VT, Op.getOperand(0), Op.getOperand(1),
+                                   SV->getValue(), SV->getOffset());
+    // Increment the pointer, VAList, to the next vaarg
+    SDOperand VAIncr = DAG.getNode(ISD::ADD, VT, VAList,
+                                   DAG.getConstant(MVT::getSizeInBits(VT)/8,
+                                                   VT));
+    // Store the incremented VAList to the legalized pointer
+    VAIncr = DAG.getStore(VAList.getValue(1), VAIncr,
+                          Op.getOperand(1), SV->getValue(), SV->getOffset());
+    // Load the actual argument out of the pointer VAList
+    return DAG.getLoad(Op.getValueType(), VAIncr, VAList, NULL, 0);
+  }
+  case ISD::VASTART: {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, MVT::i64);
+    SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+    return DAG.getStore(Op.getOperand(0), FR,
+                        Op.getOperand(1), SV->getValue(), SV->getOffset());
+  }
+  // Frame & Return address.
Currently unimplemented
+  case ISD::RETURNADDR:         break;
+  case ISD::FRAMEADDR:          break;
+  }
+  return SDOperand();
+}
diff --git a/lib/Target/IA64/IA64ISelLowering.h b/lib/Target/IA64/IA64ISelLowering.h
new file mode 100644
index 0000000..6bc5534
--- /dev/null
+++ b/lib/Target/IA64/IA64ISelLowering.h
@@ -0,0 +1,74 @@
+//===-- IA64ISelLowering.h - IA64 DAG Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that IA64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_IA64_IA64ISELLOWERING_H
+#define LLVM_TARGET_IA64_IA64ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "IA64.h"
+
+namespace llvm {
+  namespace IA64ISD {
+    // IA64-specific DAG node opcodes (printed by getTargetNodeName).
+    enum NodeType {
+      // Start the numbering where the builting ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+IA64::INSTRUCTION_LIST_END,
+
+      /// GETFD - the getf.d instruction takes a floating point operand and
+      /// returns its 64-bit memory representation as an i64
+      GETFD,
+
+      // TODO: explain this hack
+      BRCALL,
+
+      // RET_FLAG - Return with a flag operand
+      RET_FLAG
+    };
+  }
+
+  class IA64TargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    //int ReturnAddrIndex;              // FrameIndex for return slot.
+    unsigned GP, SP, RP; // FIXME - clean this mess up
+
+  public:
+    IA64TargetLowering(TargetMachine &TM);
+
+    unsigned VirtGPR; // this is public so it can be accessed in the selector
+                      // for ISD::RET. add an accessor instead?
FIXME
+
+    const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// LowerArguments - This hook must be implemented to indicate how we should
+    /// lower the arguments for the specified function, into the specified DAG.
+    virtual std::vector<SDOperand>
+      LowerArguments(Function &F, SelectionDAG &DAG);
+
+    /// LowerCallTo - This hook lowers an abstract call to a function into an
+    /// actual call.
+    virtual std::pair<SDOperand, SDOperand>
+      LowerCallTo(SDOperand Chain, const Type *RetTy, bool RetTyIsSigned,
+                  bool isVarArg, unsigned CC, bool isTailCall,
+                  SDOperand Callee, ArgListTy &Args, SelectionDAG &DAG);
+
+    /// LowerOperation - for custom lowering specific ops
+    /// (currently, only "ret void")
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+
+// XXX    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+// XXX                                                       MachineBasicBlock *MBB);
+  };
+}
+
+#endif   // LLVM_TARGET_IA64_IA64ISELLOWERING_H
diff --git a/lib/Target/IA64/IA64InstrBuilder.h b/lib/Target/IA64/IA64InstrBuilder.h
new file mode 100644
index 0000000..f9b5004
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrBuilder.h
@@ -0,0 +1,52 @@
+//===-- IA64PCInstrBuilder.h - Aids for building IA64 insts -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef IA64_INSTRBUILDER_H
+#define IA64_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  // 'mem' selects operand order: memory-style references want the
+  // immediate offset first, address-style ones want the frame index first.
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference has base register ConstantPoolIndex offset which is retained until
+/// either machine code emission or assembly output.  This allows an optional
+/// offset to be added as well.
+///
+inline const MachineInstrBuilder&
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         int Offset = 0) {
+  return MIB.addImm(Offset).addConstantPoolIndex(CPI);
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/IA64/IA64InstrFormats.td b/lib/Target/IA64/IA64InstrFormats.td
new file mode 100644
index 0000000..ba6c574
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrFormats.td
@@ -0,0 +1,79 @@
+//===- IA64InstrFormats.td - IA64 Instruction Formats --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Duraid Madina and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// - Warning: the stuff in here isn't really being used, so is mostly
+// junk.
It'll get fixed as the JIT gets built.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+class InstIA64<bits<4> op, dag OL, string asmstr> : Instruction {
+  // IA64 instruction baseline: 41-bit encoding with the 4-bit major
+  // opcode in the top bits
+  field bits<41> Inst;
+  let Namespace = "IA64";
+  let OperandList = OL;
+  let AsmString = asmstr;
+
+  let Inst{40-37} = op;
+}
+
+//"Each Itanium instruction is categorized into one of six types."
+//We should have:
+// A, I, M, F, B, L+X
+
+class AForm<bits<4> opcode, bits<6> qpReg, dag OL, string asmstr> :
+  InstIA64<opcode, OL, asmstr> {
+
+  // qualifying predicate register lives in the low 6 bits
+  let Inst{5-0} = qpReg;
+}
+
+class AForm_DAG<bits<4> opcode, bits<6> qpReg, dag OL, string asmstr,
+                list<dag> pattern> :
+  InstIA64<opcode, OL, asmstr> {
+
+  let Pattern = pattern;
+  let Inst{5-0} = qpReg;
+}
+
+let isBranch = 1, isTerminator = 1 in
+class BForm<bits<4> opcode, bits<6> x6, bits<3> btype, dag OL, string asmstr> :
+  InstIA64<opcode, OL, asmstr> {
+
+  let Inst{32-27} = x6;
+  let Inst{8-6} = btype;
+}
+
+class MForm<bits<4> opcode, bits<6> x6, dag OL, string asmstr> :
+  InstIA64<opcode, OL, asmstr> {
+  bits<7> Ra;
+  bits<7> Rb;
+  bits<16> disp;
+
+  let Inst{35-30} = x6;
+//  let Inst{20-16} = Rb;
+  let Inst{15-0} = disp;
+}
+
+class RawForm<bits<4> opcode, bits<26> rest, dag OL, string asmstr> :
+  InstIA64<opcode, OL, asmstr> {
+  let Inst{25-0} = rest;
+}
+
+// Pseudo instructions.
+class PseudoInstIA64<dag OL, string nm> : InstIA64<0, OL, nm> {
+}
+
+class PseudoInstIA64_DAG<dag OL, string nm, list<dag> pattern>
+  : InstIA64<0, OL, nm> {
+  let Pattern = pattern;
+}
+
diff --git a/lib/Target/IA64/IA64InstrInfo.cpp b/lib/Target/IA64/IA64InstrInfo.cpp
new file mode 100644
index 0000000..a66c9bc
--- /dev/null
+++ b/lib/Target/IA64/IA64InstrInfo.cpp
@@ -0,0 +1,58 @@
+//===- IA64InstrInfo.cpp - IA64 Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the IA64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IA64InstrInfo.h"
+#include "IA64.h"
+#include "IA64InstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "IA64GenInstrInfo.inc"
+using namespace llvm;
+
+IA64InstrInfo::IA64InstrInfo()
+  : TargetInstrInfo(IA64Insts, sizeof(IA64Insts)/sizeof(IA64Insts[0])),
+    RI(*this) {
+}
+
+
+// isMoveInstr - return true (and fill in sourceReg/destReg) when MI is a
+// plain register-register MOV or FMOV.
+bool IA64InstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned& sourceReg,
+                               unsigned& destReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  if (oc == IA64::MOV || oc == IA64::FMOV) {
+  // TODO: this doesn't detect predicate moves
+     assert(MI.getNumOperands() >= 2 &&
+             /* MI.getOperand(0).isRegister() &&
+             MI.getOperand(1).isRegister() && */
+             "invalid register-register move instruction");
+     if( MI.getOperand(0).isRegister() &&
+         MI.getOperand(1).isRegister() ) {
+      // if both operands of the MOV/FMOV are registers, then
+      // yes, this is a move instruction
+       sourceReg = MI.getOperand(1).getReg();
+       destReg = MI.getOperand(0).getReg();
+       return true;
+     }
+   }
+   return false; // we don't consider e.g.
%regN = MOV <FrameIndex #x> a + // move instruction +} + +unsigned +IA64InstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond)const { + // Can only insert uncond branches so far. + assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!"); + BuildMI(&MBB, get(IA64::BRL_NOTCALL)).addMBB(TBB); + return 1; +} diff --git a/lib/Target/IA64/IA64InstrInfo.h b/lib/Target/IA64/IA64InstrInfo.h new file mode 100644 index 0000000..3bb14e0 --- /dev/null +++ b/lib/Target/IA64/IA64InstrInfo.h @@ -0,0 +1,49 @@ +//===- IA64InstrInfo.h - IA64 Instruction Information ----------*- C++ -*- ===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the IA64 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef IA64INSTRUCTIONINFO_H +#define IA64INSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "IA64RegisterInfo.h" + +namespace llvm { + +class IA64InstrInfo : public TargetInstrInfo { + const IA64RegisterInfo RI; +public: + IA64InstrInfo(); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const MRegisterInfo &getRegisterInfo() const { return RI; } + + // + // Return true if the instruction is a register to register move and + // leave the source and dest operands in the passed parameters. 
+ // + virtual bool isMoveInstr(const MachineInstr& MI, + unsigned& sourceReg, + unsigned& destReg) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond) const; + +}; + +} // End llvm namespace + +#endif + diff --git a/lib/Target/IA64/IA64InstrInfo.td b/lib/Target/IA64/IA64InstrInfo.td new file mode 100644 index 0000000..57f5f66 --- /dev/null +++ b/lib/Target/IA64/IA64InstrInfo.td @@ -0,0 +1,744 @@ +//===- IA64InstrInfo.td - Describe the IA64 Instruction Set -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the IA64 instruction set, defining the instructions, and +// properties of the instructions which are needed for code generation, machine +// code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +include "IA64InstrFormats.td" + +//===----------------------------------------------------------------------===// +// IA-64 specific DAG Nodes. 
+// + +def IA64getfd : SDNode<"IA64ISD::GETFD", SDTFPToIntOp, []>; + +def SDT_IA64RetFlag : SDTypeProfile<0, 0, []>; +def retflag : SDNode<"IA64ISD::RET_FLAG", SDT_IA64RetFlag, + [SDNPHasChain, SDNPOptInFlag]>; + +//===--------- +// Instruction types + +class isA { bit A=1; } // I or M unit +class isM { bit M=1; } // M unit +class isI { bit I=1; } // I unit +class isB { bit B=1; } // B unit +class isF { bit F=1; } // F unit +class isLX { bit LX=1; } // I/B + +//===--------- + +def u2imm : Operand<i8>; +def u6imm : Operand<i8>; +def s8imm : Operand<i8> { + let PrintMethod = "printS8ImmOperand"; +} +def s14imm : Operand<i64> { + let PrintMethod = "printS14ImmOperand"; +} +def s22imm : Operand<i64> { + let PrintMethod = "printS22ImmOperand"; +} +def u64imm : Operand<i64> { + let PrintMethod = "printU64ImmOperand"; +} +def s64imm : Operand<i64> { + let PrintMethod = "printS64ImmOperand"; +} + +let PrintMethod = "printGlobalOperand" in + def globaladdress : Operand<i64>; + +// the asmprinter needs to know about calls +let PrintMethod = "printCallOperand" in + def calltarget : Operand<i64>; + +/* new daggy action!!! 
*/ + +def is32ones : PatLeaf<(i64 imm), [{ + // is32ones predicate - True if the immediate is 0x00000000FFFFFFFF + // Used to create ZXT4s appropriately + uint64_t v = (uint64_t)N->getValue(); + return (v == 0x00000000FFFFFFFFLL); +}]>; + +// isMIXable predicates - True if the immediate is +// 0xFF00FF00FF00FF00, 0x00FF00FF00FF00FF +// etc, through 0x00000000FFFFFFFF +// Used to test for the suitability of mix* +def isMIX1Lable: PatLeaf<(i64 imm), [{ + return((uint64_t)N->getValue()==0xFF00FF00FF00FF00LL); +}]>; +def isMIX1Rable: PatLeaf<(i64 imm), [{ + return((uint64_t)N->getValue()==0x00FF00FF00FF00FFLL); +}]>; +def isMIX2Lable: PatLeaf<(i64 imm), [{ + return((uint64_t)N->getValue()==0xFFFF0000FFFF0000LL); +}]>; +def isMIX2Rable: PatLeaf<(i64 imm), [{ + return((uint64_t)N->getValue()==0x0000FFFF0000FFFFLL); +}]>; +def isMIX4Lable: PatLeaf<(i64 imm), [{ + return((uint64_t)N->getValue()==0xFFFFFFFF00000000LL); +}]>; +def isMIX4Rable: PatLeaf<(i64 imm), [{ + return((uint64_t)N->getValue()==0x00000000FFFFFFFFLL); +}]>; + +def isSHLADDimm: PatLeaf<(i64 imm), [{ + // isSHLADDimm predicate - True if the immediate is exactly 1, 2, 3 or 4 + // - 0 is *not* okay. + // Used to create shladd instructions appropriately + int64_t v = (int64_t)N->getValue(); + return (v >= 1 && v <= 4); +}]>; + +def immSExt14 : PatLeaf<(i64 imm), [{ + // immSExt14 predicate - True if the immediate fits in a 14-bit sign extended + // field. Used by instructions like 'adds'. + int64_t v = (int64_t)N->getValue(); + return (v <= 8191 && v >= -8192); +}]>; + +// imm64 predicate - True if the immediate fits in a 64-bit +// field - i.e., true. 
used to keep movl happy +def imm64 : PatLeaf<(i64 imm)>; + +def ADD : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "add $dst = $src1, $src2", + [(set GR:$dst, (add GR:$src1, GR:$src2))]>, isA; + +def ADD1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "add $dst = $src1, $src2, 1", + [(set GR:$dst, (add (add GR:$src1, GR:$src2), 1))]>, isA; + +def ADDS : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, s14imm:$imm), + "adds $dst = $imm, $src1", + [(set GR:$dst, (add GR:$src1, immSExt14:$imm))]>, isA; + +def MOVL : AForm_DAG<0x03, 0x0b, (ops GR:$dst, s64imm:$imm), + "movl $dst = $imm", + [(set GR:$dst, imm64:$imm)]>, isLX; + +def ADDL_GA : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, globaladdress:$imm), + "addl $dst = $imm, $src1", + []>, isA; + +// hmm +def ADDL_EA : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, calltarget:$imm), + "addl $dst = $imm, $src1", + []>, isA; + +def SUB : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "sub $dst = $src1, $src2", + [(set GR:$dst, (sub GR:$src1, GR:$src2))]>, isA; + +def SUB1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "sub $dst = $src1, $src2, 1", + [(set GR:$dst, (add (sub GR: $src1, GR:$src2), -1))]>, isA; + +let isTwoAddress = 1 in { +def TPCADDIMM22 : AForm<0x03, 0x0b, + (ops GR:$dst, GR:$src1, s22imm:$imm, PR:$qp), + "($qp) add $dst = $imm, $dst">, isA; +def TPCADDS : AForm_DAG<0x03, 0x0b, + (ops GR:$dst, GR:$src1, s14imm:$imm, PR:$qp), + "($qp) adds $dst = $imm, $dst", + []>, isA; +def TPCMPIMM8NE : AForm<0x03, 0x0b, + (ops PR:$dst, PR:$src1, s22imm:$imm, GR:$src2, PR:$qp), + "($qp) cmp.ne $dst , p0 = $imm, $src2">, isA; +} + +// zero extend a bool (predicate reg) into an integer reg +def ZXTb : Pat<(zext PR:$src), + (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src)>; +def AXTb : Pat<(anyext PR:$src), + (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src)>; + +// normal sign/zero-extends +def SXT1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "sxt1 $dst = $src", + [(set GR:$dst, 
(sext_inreg GR:$src, i8))]>, isI; +def ZXT1 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "zxt1 $dst = $src", + [(set GR:$dst, (and GR:$src, 255))]>, isI; +def SXT2 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "sxt2 $dst = $src", + [(set GR:$dst, (sext_inreg GR:$src, i16))]>, isI; +def ZXT2 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "zxt2 $dst = $src", + [(set GR:$dst, (and GR:$src, 65535))]>, isI; +def SXT4 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "sxt4 $dst = $src", + [(set GR:$dst, (sext_inreg GR:$src, i32))]>, isI; +def ZXT4 : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src), "zxt4 $dst = $src", + [(set GR:$dst, (and GR:$src, is32ones))]>, isI; + +// fixme: shrs vs shru? +def MIX1L : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "mix1.l $dst = $src1, $src2", + [(set GR:$dst, (or (and GR:$src1, isMIX1Lable), + (and (srl GR:$src2, (i64 8)), isMIX1Lable)))]>, isI; + +def MIX2L : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "mix2.l $dst = $src1, $src2", + [(set GR:$dst, (or (and GR:$src1, isMIX2Lable), + (and (srl GR:$src2, (i64 16)), isMIX2Lable)))]>, isI; + +def MIX4L : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "mix4.l $dst = $src1, $src2", + [(set GR:$dst, (or (and GR:$src1, isMIX4Lable), + (and (srl GR:$src2, (i64 32)), isMIX4Lable)))]>, isI; + +def MIX1R : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "mix1.r $dst = $src1, $src2", + [(set GR:$dst, (or (and (shl GR:$src1, (i64 8)), isMIX1Rable), + (and GR:$src2, isMIX1Rable)))]>, isI; + +def MIX2R : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "mix2.r $dst = $src1, $src2", + [(set GR:$dst, (or (and (shl GR:$src1, (i64 16)), isMIX2Rable), + (and GR:$src2, isMIX2Rable)))]>, isI; + +def MIX4R : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "mix4.r $dst = $src1, $src2", + [(set GR:$dst, (or (and (shl GR:$src1, (i64 32)), isMIX4Rable), + (and GR:$src2, isMIX4Rable)))]>, isI; + +def GETFSIGD : AForm_DAG<0x03, 0x0b, (ops 
GR:$dst, FP:$src), + "getf.sig $dst = $src", + []>, isM; + +def SETFSIGD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, GR:$src), + "setf.sig $dst = $src", + []>, isM; + +def XMALD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3), + "xma.l $dst = $src1, $src2, $src3", + []>, isF; +def XMAHD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3), + "xma.h $dst = $src1, $src2, $src3", + []>, isF; +def XMAHUD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3), + "xma.hu $dst = $src1, $src2, $src3", + []>, isF; + +// pseudocode for integer multiplication +def : Pat<(mul GR:$src1, GR:$src2), + (GETFSIGD (XMALD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>; +def : Pat<(mulhs GR:$src1, GR:$src2), + (GETFSIGD (XMAHD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>; +def : Pat<(mulhu GR:$src1, GR:$src2), + (GETFSIGD (XMAHUD (SETFSIGD GR:$src1), (SETFSIGD GR:$src2), F0))>; + +// TODO: addp4 (addp4 dst = src, r0 is a 32-bit add) +// has imm form, too + +// def ADDS : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s14imm:$imm), +// "adds $dst = $imm, $src1">; + +def AND : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "and $dst = $src1, $src2", + [(set GR:$dst, (and GR:$src1, GR:$src2))]>, isA; +def ANDCM : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "andcm $dst = $src1, $src2", + [(set GR:$dst, (and GR:$src1, (not GR:$src2)))]>, isA; +// TODO: and/andcm/or/xor/add/sub/shift immediate forms +def OR : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "or $dst = $src1, $src2", + [(set GR:$dst, (or GR:$src1, GR:$src2))]>, isA; + +def pOR : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2, PR:$qp), + "($qp) or $dst = $src1, $src2">, isA; + +// the following are all a bit unfortunate: we throw away the complement +// of the compare! 
+def CMPEQ : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.eq $dst, p0 = $src1, $src2", + [(set PR:$dst, (seteq GR:$src1, GR:$src2))]>, isA; +def CMPGT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.gt $dst, p0 = $src1, $src2", + [(set PR:$dst, (setgt GR:$src1, GR:$src2))]>, isA; +def CMPGE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.ge $dst, p0 = $src1, $src2", + [(set PR:$dst, (setge GR:$src1, GR:$src2))]>, isA; +def CMPLT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.lt $dst, p0 = $src1, $src2", + [(set PR:$dst, (setlt GR:$src1, GR:$src2))]>, isA; +def CMPLE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.le $dst, p0 = $src1, $src2", + [(set PR:$dst, (setle GR:$src1, GR:$src2))]>, isA; +def CMPNE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.ne $dst, p0 = $src1, $src2", + [(set PR:$dst, (setne GR:$src1, GR:$src2))]>, isA; +def CMPLTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.ltu $dst, p0 = $src1, $src2", + [(set PR:$dst, (setult GR:$src1, GR:$src2))]>, isA; +def CMPGTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.gtu $dst, p0 = $src1, $src2", + [(set PR:$dst, (setugt GR:$src1, GR:$src2))]>, isA; +def CMPLEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.leu $dst, p0 = $src1, $src2", + [(set PR:$dst, (setule GR:$src1, GR:$src2))]>, isA; +def CMPGEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2), + "cmp.geu $dst, p0 = $src1, $src2", + [(set PR:$dst, (setuge GR:$src1, GR:$src2))]>, isA; + +// and we do the whole thing again for FP compares! 
+def FCMPEQ : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.eq $dst, p0 = $src1, $src2", + [(set PR:$dst, (seteq FP:$src1, FP:$src2))]>, isF; +def FCMPGT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.gt $dst, p0 = $src1, $src2", + [(set PR:$dst, (setgt FP:$src1, FP:$src2))]>, isF; +def FCMPGE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.ge $dst, p0 = $src1, $src2", + [(set PR:$dst, (setge FP:$src1, FP:$src2))]>, isF; +def FCMPLT : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.lt $dst, p0 = $src1, $src2", + [(set PR:$dst, (setlt FP:$src1, FP:$src2))]>, isF; +def FCMPLE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.le $dst, p0 = $src1, $src2", + [(set PR:$dst, (setle FP:$src1, FP:$src2))]>, isF; +def FCMPNE : AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.neq $dst, p0 = $src1, $src2", + [(set PR:$dst, (setne FP:$src1, FP:$src2))]>, isF; +def FCMPLTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.lt $dst, p0 = $src1, $src2", + [(set PR:$dst, (setult FP:$src1, FP:$src2))]>, isF; +def FCMPGTU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.gt $dst, p0 = $src1, $src2", + [(set PR:$dst, (setugt FP:$src1, FP:$src2))]>, isF; +def FCMPLEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.le $dst, p0 = $src1, $src2", + [(set PR:$dst, (setule FP:$src1, FP:$src2))]>, isF; +def FCMPGEU: AForm_DAG<0x03, 0x0b, (ops PR:$dst, FP:$src1, FP:$src2), + "fcmp.ge $dst, p0 = $src1, $src2", + [(set PR:$dst, (setuge FP:$src1, FP:$src2))]>, isF; + +def PCMPEQUNCR0R0 : AForm<0x03, 0x0b, (ops PR:$dst, PR:$qp), + "($qp) cmp.eq.unc $dst, p0 = r0, r0">, isA; + +def : Pat<(trunc GR:$src), // truncate i64 to i1 + (CMPNE GR:$src, r0)>; // $src!=0? 
If so, PR:$dst=true + +let isTwoAddress=1 in { + def TPCMPEQR0R0 : AForm<0x03, 0x0b, (ops PR:$dst, PR:$bogus, PR:$qp), + "($qp) cmp.eq $dst, p0 = r0, r0">, isA; + def TPCMPNER0R0 : AForm<0x03, 0x0b, (ops PR:$dst, PR:$bogus, PR:$qp), + "($qp) cmp.ne $dst, p0 = r0, r0">, isA; +} + +/* our pseudocode for OR on predicates is: +pC = pA OR pB +------------- +(pA) cmp.eq.unc pC,p0 = r0,r0 // pC = pA + ;; +(pB) cmp.eq pC,p0 = r0,r0 // if (pB) pC = 1 */ + +def bOR : Pat<(or PR:$src1, PR:$src2), + (TPCMPEQR0R0 (PCMPEQUNCR0R0 PR:$src1), PR:$src2)>; + +/* our pseudocode for AND on predicates is: + * +(pA) cmp.eq.unc pC,p0 = r0,r0 // pC = pA + cmp.eq pTemp,p0 = r0,r0 // pTemp = NOT pB + ;; +(pB) cmp.ne pTemp,p0 = r0,r0 + ;; +(pTemp)cmp.ne pC,p0 = r0,r0 // if (NOT pB) pC = 0 */ + +def bAND : Pat<(and PR:$src1, PR:$src2), + ( TPCMPNER0R0 (PCMPEQUNCR0R0 PR:$src1), + (TPCMPNER0R0 (CMPEQ r0, r0), PR:$src2) )>; + +/* one possible routine for XOR on predicates is: + + // Compute px = py ^ pz + // using sum of products: px = (py & !pz) | (pz & !py) + // Uses 5 instructions in 3 cycles. + // cycle 1 +(pz) cmp.eq.unc px = r0, r0 // px = pz +(py) cmp.eq.unc pt = r0, r0 // pt = py + ;; + // cycle 2 +(pt) cmp.ne.and px = r0, r0 // px = px & !pt (px = pz & !pt) +(pz) cmp.ne.and pt = r0, r0 // pt = pt & !pz + ;; + } { .mmi + // cycle 3 +(pt) cmp.eq.or px = r0, r0 // px = px | pt + +*** Another, which we use here, requires one scratch GR. it is: + + mov rt = 0 // initialize rt off critical path + ;; + + // cycle 1 +(pz) cmp.eq.unc px = r0, r0 // px = pz +(pz) mov rt = 1 // rt = pz + ;; + // cycle 2 +(py) cmp.ne px = 1, rt // if (py) px = !pz + +.. 
these routines kindly provided by Jim Hull +*/ + +def bXOR : Pat<(xor PR:$src1, PR:$src2), + (TPCMPIMM8NE (PCMPEQUNCR0R0 PR:$src2), 1, + (TPCADDS (ADDS r0, 0), 1, PR:$src2), + PR:$src1)>; + +def XOR : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "xor $dst = $src1, $src2", + [(set GR:$dst, (xor GR:$src1, GR:$src2))]>, isA; + +def SHLADD: AForm_DAG<0x03, 0x0b, (ops GR:$dst,GR:$src1,s64imm:$imm,GR:$src2), + "shladd $dst = $src1, $imm, $src2", + [(set GR:$dst, (add GR:$src2, (shl GR:$src1, isSHLADDimm:$imm)))]>, isA; + +def SHL : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "shl $dst = $src1, $src2", + [(set GR:$dst, (shl GR:$src1, GR:$src2))]>, isI; + +def SHRU : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "shr.u $dst = $src1, $src2", + [(set GR:$dst, (srl GR:$src1, GR:$src2))]>, isI; + +def SHRS : AForm_DAG<0x03, 0x0b, (ops GR:$dst, GR:$src1, GR:$src2), + "shr $dst = $src1, $src2", + [(set GR:$dst, (sra GR:$src1, GR:$src2))]>, isI; + +def MOV : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src), "mov $dst = $src">, isA; +def FMOV : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "mov $dst = $src">, isF; // XXX: there _is_ no fmov +def PMOV : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src, PR:$qp), + "($qp) mov $dst = $src">, isA; + +def SPILL_ALL_PREDICATES_TO_GR : AForm<0x03, 0x0b, (ops GR:$dst), + "mov $dst = pr">, isI; +def FILL_ALL_PREDICATES_FROM_GR : AForm<0x03, 0x0b, (ops GR:$src), + "mov pr = $src">, isI; + +let isTwoAddress = 1 in { + def CMOV : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src2, GR:$src, PR:$qp), + "($qp) mov $dst = $src">, isA; +} + +def PFMOV : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src, PR:$qp), + "($qp) mov $dst = $src">, isF; + +let isTwoAddress = 1 in { + def CFMOV : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src2, FP:$src, PR:$qp), + "($qp) mov $dst = $src">, isF; +} + +def SELECTINT : Pat<(select PR:$which, GR:$src1, GR:$src2), + (CMOV (MOV GR:$src2), GR:$src1, PR:$which)>; // note order! 
+def SELECTFP : Pat<(select PR:$which, FP:$src1, FP:$src2), + (CFMOV (FMOV FP:$src2), FP:$src1, PR:$which)>; // note order! +// TODO: can do this faster, w/o using any integer regs (see pattern isel) +def SELECTBOOL : Pat<(select PR:$which, PR:$src1, PR:$src2), // note order! + (CMPNE (CMOV + (MOV (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src2)), + (TPCADDIMM22 (ADDS r0, 0), 1, PR:$src1), PR:$which), r0)>; + +// load constants of various sizes // FIXME: prettyprint -ve constants +def : Pat<(i64 immSExt14:$imm), (ADDS r0, immSExt14:$imm)>; +def : Pat<(i1 -1), (CMPEQ r0, r0)>; // TODO: this should just be a ref to p0 +def : Pat<(i1 0), (CMPNE r0, r0)>; // TODO: any instruction actually *using* + // this predicate should be killed! + +// TODO: support postincrement (reg, imm9) loads+stores - this needs more +// tablegen support + +def IDEF : PseudoInstIA64<(ops variable_ops), "// IDEF">; + +def IDEF_GR_D : PseudoInstIA64_DAG<(ops GR:$reg), "// $reg = IDEF", + [(set GR:$reg, (undef))]>; +def IDEF_FP_D : PseudoInstIA64_DAG<(ops FP:$reg), "// $reg = IDEF", + [(set FP:$reg, (undef))]>; +def IDEF_PR_D : PseudoInstIA64_DAG<(ops PR:$reg), "// $reg = IDEF", + [(set PR:$reg, (undef))]>; + +def IUSE : PseudoInstIA64<(ops variable_ops), "// IUSE">; +def ADJUSTCALLSTACKUP : PseudoInstIA64<(ops variable_ops), + "// ADJUSTCALLSTACKUP">; +def ADJUSTCALLSTACKDOWN : PseudoInstIA64<(ops variable_ops), + "// ADJUSTCALLSTACKDOWN">; +def PSEUDO_ALLOC : PseudoInstIA64<(ops GR:$foo), "// PSEUDO_ALLOC">; + +def ALLOC : AForm<0x03, 0x0b, + (ops GR:$dst, i8imm:$inputs, i8imm:$locals, i8imm:$outputs, i8imm:$rotating), + "alloc $dst = ar.pfs,$inputs,$locals,$outputs,$rotating">, isM; + +let isTwoAddress = 1 in { + def TCMPNE : AForm<0x03, 0x0b, + (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4), + "cmp.ne $dst, p0 = $src3, $src4">, isA; + + def TPCMPEQOR : AForm<0x03, 0x0b, + (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4, PR:$qp), + "($qp) cmp.eq.or $dst, p0 = $src3, $src4">, isA; + + def TPCMPNE : AForm<0x03, 
0x0b, + (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4, PR:$qp), + "($qp) cmp.ne $dst, p0 = $src3, $src4">, isA; + + def TPCMPEQ : AForm<0x03, 0x0b, + (ops PR:$dst, PR:$src2, GR:$src3, GR:$src4, PR:$qp), + "($qp) cmp.eq $dst, p0 = $src3, $src4">, isA; +} + +def MOVSIMM14 : AForm<0x03, 0x0b, (ops GR:$dst, s14imm:$imm), + "mov $dst = $imm">, isA; +def MOVSIMM22 : AForm<0x03, 0x0b, (ops GR:$dst, s22imm:$imm), + "mov $dst = $imm">, isA; +def MOVLIMM64 : AForm<0x03, 0x0b, (ops GR:$dst, s64imm:$imm), + "movl $dst = $imm">, isLX; + +def SHLI : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, u6imm:$imm), + "shl $dst = $src1, $imm">, isI; +def SHRUI : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, u6imm:$imm), + "shr.u $dst = $src1, $imm">, isI; +def SHRSI : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, u6imm:$imm), + "shr $dst = $src1, $imm">, isI; + +def EXTRU : AForm<0x03, 0x0b, + (ops GR:$dst, GR:$src1, u6imm:$imm1, u6imm:$imm2), + "extr.u $dst = $src1, $imm1, $imm2">, isI; + +def DEPZ : AForm<0x03, 0x0b, + (ops GR:$dst, GR:$src1, u6imm:$imm1, u6imm:$imm2), + "dep.z $dst = $src1, $imm1, $imm2">, isI; + +def PCMPEQOR : AForm<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2, PR:$qp), + "($qp) cmp.eq.or $dst, p0 = $src1, $src2">, isA; +def PCMPEQUNC : AForm<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2, PR:$qp), + "($qp) cmp.eq.unc $dst, p0 = $src1, $src2">, isA; +def PCMPNE : AForm<0x03, 0x0b, (ops PR:$dst, GR:$src1, GR:$src2, PR:$qp), + "($qp) cmp.ne $dst, p0 = $src1, $src2">, isA; + +// two destinations! 
+def BCMPEQ : AForm<0x03, 0x0b, (ops PR:$dst1, PR:$dst2, GR:$src1, GR:$src2), + "cmp.eq $dst1, dst2 = $src1, $src2">, isA; + +def ADDIMM14 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s14imm:$imm), + "adds $dst = $imm, $src1">, isA; + +def ADDIMM22 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s22imm:$imm), + "add $dst = $imm, $src1">, isA; +def CADDIMM22 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$src1, s22imm:$imm, PR:$qp), + "($qp) add $dst = $imm, $src1">, isA; + +def SUBIMM8 : AForm<0x03, 0x0b, (ops GR:$dst, s8imm:$imm, GR:$src2), + "sub $dst = $imm, $src2">, isA; + +let isStore = 1, noResults = 1 in { + def ST1 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value), + "st1 [$dstPtr] = $value">, isM; + def ST2 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value), + "st2 [$dstPtr] = $value">, isM; + def ST4 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value), + "st4 [$dstPtr] = $value">, isM; + def ST8 : AForm<0x03, 0x0b, (ops GR:$dstPtr, GR:$value), + "st8 [$dstPtr] = $value">, isM; + def STF4 : AForm<0x03, 0x0b, (ops GR:$dstPtr, FP:$value), + "stfs [$dstPtr] = $value">, isM; + def STF8 : AForm<0x03, 0x0b, (ops GR:$dstPtr, FP:$value), + "stfd [$dstPtr] = $value">, isM; + def STF_SPILL : AForm<0x03, 0x0b, (ops GR:$dstPtr, FP:$value), + "stf.spill [$dstPtr] = $value">, isM; +} + +let isLoad = 1 in { + def LD1 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr), + "ld1 $dst = [$srcPtr]">, isM; + def LD2 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr), + "ld2 $dst = [$srcPtr]">, isM; + def LD4 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr), + "ld4 $dst = [$srcPtr]">, isM; + def LD8 : AForm<0x03, 0x0b, (ops GR:$dst, GR:$srcPtr), + "ld8 $dst = [$srcPtr]">, isM; + def LDF4 : AForm<0x03, 0x0b, (ops FP:$dst, GR:$srcPtr), + "ldfs $dst = [$srcPtr]">, isM; + def LDF8 : AForm<0x03, 0x0b, (ops FP:$dst, GR:$srcPtr), + "ldfd $dst = [$srcPtr]">, isM; + def LDF_FILL : AForm<0x03, 0x0b, (ops FP:$dst, GR:$srcPtr), + "ldf.fill $dst = [$srcPtr]">, isM; +} + +def POPCNT : AForm_DAG<0x03, 0x0b, (ops GR:$dst, 
GR:$src), + "popcnt $dst = $src", + [(set GR:$dst, (ctpop GR:$src))]>, isI; + +// some FP stuff: // TODO: single-precision stuff? +def FADD : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2), + "fadd $dst = $src1, $src2", + [(set FP:$dst, (fadd FP:$src1, FP:$src2))]>, isF; +def FADDS: AForm<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2), + "fadd.s $dst = $src1, $src2">, isF; +def FSUB : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2), + "fsub $dst = $src1, $src2", + [(set FP:$dst, (fsub FP:$src1, FP:$src2))]>, isF; +def FMPY : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2), + "fmpy $dst = $src1, $src2", + [(set FP:$dst, (fmul FP:$src1, FP:$src2))]>, isF; +def FMA : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3), + "fma $dst = $src1, $src2, $src3", + [(set FP:$dst, (fadd (fmul FP:$src1, FP:$src2), FP:$src3))]>, isF; +def FMS : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3), + "fms $dst = $src1, $src2, $src3", + [(set FP:$dst, (fsub (fmul FP:$src1, FP:$src2), FP:$src3))]>, isF; +def FNMA : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3), + "fnma $dst = $src1, $src2, $src3", + [(set FP:$dst, (fneg (fadd (fmul FP:$src1, FP:$src2), FP:$src3)))]>, isF; +def FABS : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fabs $dst = $src", + [(set FP:$dst, (fabs FP:$src))]>, isF; +def FNEG : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fneg $dst = $src", + [(set FP:$dst, (fneg FP:$src))]>, isF; +def FNEGABS : AForm_DAG<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fnegabs $dst = $src", + [(set FP:$dst, (fneg (fabs FP:$src)))]>, isF; + +let isTwoAddress=1 in { +def TCFMAS1 : AForm<0x03, 0x0b, + (ops FP:$dst, FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp), + "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF; +def TCFMADS0 : AForm<0x03, 0x0b, + (ops FP:$dst, FP:$bogussrc, FP:$src1, FP:$src2, FP:$src3, PR:$qp), + "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF; +} + +def CFMAS1 : AForm<0x03, 0x0b, + (ops 
FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp), + "($qp) fma.s1 $dst = $src1, $src2, $src3">, isF; +def CFNMAS1 : AForm<0x03, 0x0b, + (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp), + "($qp) fnma.s1 $dst = $src1, $src2, $src3">, isF; + +def CFMADS1 : AForm<0x03, 0x0b, + (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp), + "($qp) fma.d.s1 $dst = $src1, $src2, $src3">, isF; +def CFMADS0 : AForm<0x03, 0x0b, + (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp), + "($qp) fma.d.s0 $dst = $src1, $src2, $src3">, isF; +def CFNMADS1 : AForm<0x03, 0x0b, + (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3, PR:$qp), + "($qp) fnma.d.s1 $dst = $src1, $src2, $src3">, isF; + +def FRCPAS0 : AForm<0x03, 0x0b, (ops FP:$dstFR, PR:$dstPR, FP:$src1, FP:$src2), + "frcpa.s0 $dstFR, $dstPR = $src1, $src2">, isF; +def FRCPAS1 : AForm<0x03, 0x0b, (ops FP:$dstFR, PR:$dstPR, FP:$src1, FP:$src2), + "frcpa.s1 $dstFR, $dstPR = $src1, $src2">, isF; + +def XMAL : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src1, FP:$src2, FP:$src3), + "xma.l $dst = $src1, $src2, $src3">, isF; + +def FCVTXF : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.xf $dst = $src">, isF; +def FCVTXUF : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.xuf $dst = $src">, isF; +def FCVTXUFS1 : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.xuf.s1 $dst = $src">, isF; +def FCVTFX : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.fx $dst = $src">, isF; +def FCVTFXU : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.fxu $dst = $src">, isF; + +def FCVTFXTRUNC : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.fx.trunc $dst = $src">, isF; +def FCVTFXUTRUNC : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.fxu.trunc $dst = $src">, isF; + +def FCVTFXTRUNCS1 : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.fx.trunc.s1 $dst = $src">, isF; +def FCVTFXUTRUNCS1 : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fcvt.fxu.trunc.s1 $dst = $src">, isF; + +def FNORMD : AForm<0x03, 0x0b, (ops FP:$dst, FP:$src), + "fnorm.d $dst = $src">, isF; + 
// Moves between the general and FP register files (M-unit).
def GETFD : AForm<0x03, 0x0b, (ops GR:$dst, FP:$src),
       "getf.d $dst = $src">, isM;
def SETFD : AForm<0x03, 0x0b, (ops FP:$dst, GR:$src),
       "setf.d $dst = $src">, isM;

// Move the 64-bit significand between GR and FP register files.
def GETFSIG : AForm<0x03, 0x0b, (ops GR:$dst, FP:$src),
       "getf.sig $dst = $src">, isM;
def SETFSIG : AForm<0x03, 0x0b, (ops FP:$dst, GR:$src),
       "setf.sig $dst = $src">, isM;

// these four FP<->int conversion patterns need checking/cleaning
// (int->FP: move bits into an FP reg, integer-to-FP convert, normalize;
//  FP->int: truncating convert, then move the significand back out.)
def SINT_TO_FP : Pat<(sint_to_fp GR:$src),
                     (FNORMD (FCVTXF (SETFSIG GR:$src)))>;
def UINT_TO_FP : Pat<(uint_to_fp GR:$src),
                     (FNORMD (FCVTXUF (SETFSIG GR:$src)))>;
def FP_TO_SINT : Pat<(i64 (fp_to_sint FP:$src)),
                     (GETFSIG (FCVTFXTRUNC FP:$src))>;
def FP_TO_UINT : Pat<(i64 (fp_to_uint FP:$src)),
                     (GETFSIG (FCVTFXUTRUNC FP:$src))>;

// Non-call branches: unconditional long branch (predicated by p0, which is
// always true) and predicated long/short conditional branches.
let isTerminator = 1, isBranch = 1, noResults = 1 in {
  def BRL_NOTCALL : RawForm<0x03, 0xb0, (ops i64imm:$dst),
                    "(p0) brl.cond.sptk $dst">, isB;
  def BRLCOND_NOTCALL : RawForm<0x03, 0xb0, (ops PR:$qp, i64imm:$dst),
                        "($qp) brl.cond.sptk $dst">, isB;
  def BRCOND_NOTCALL : RawForm<0x03, 0xb0, (ops PR:$qp, GR:$dst),
                       "($qp) br.cond.sptk $dst">, isB;
}

let isCall = 1, noResults = 1, /* isTerminator = 1, isBranch = 1, */
  Uses = [out0,out1,out2,out3,out4,out5,out6,out7],
// all calls clobber non-callee-saved registers, and for now, they are these:
  Defs = [r2,r3,r8,r9,r10,r11,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,
          r25,r26,r27,r28,r29,r30,r31,
          p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,
          F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,
          F32,F33,F34,F35,F36,F37,F38,F39,F40,F41,F42,F43,F44,F45,F46,F47,F48,F49,
          F50,F51,F52,F53,F54,F55,F56,
          F57,F58,F59,F60,F61,F62,F63,F64,F65,F66,F67,F68,F69,F70,F71,F72,F73,F74,
          F75,F76,F77,F78,F79,F80,F81,
          F82,F83,F84,F85,F86,F87,F88,F89,F90,F91,F92,F93,F94,F95,F96,F97,F98,F99,
          F100,F101,F102,F103,F104,F105,
          F106,F107,F108,F109,F110,F111,F112,F113,F114,F115,F116,F117,F118,F119,
          F120,F121,F122,F123,F124,F125,F126,F127,
          out0,out1,out2,out3,out4,out5,out6,out7] in {
// old pattern call
  def BRCALL: RawForm<0x03, 0xb0, (ops calltarget:$dst),
              "br.call.sptk rp = $dst">, isB; // FIXME: teach llvm about branch regs?
// new daggy stuff!

// calls a globaladdress
  def BRCALL_IPREL_GA : RawForm<0x03, 0xb0, (ops calltarget:$dst),
                        "br.call.sptk rp = $dst">, isB; // FIXME: teach llvm about branch regs?
// calls an externalsymbol
  def BRCALL_IPREL_ES : RawForm<0x03, 0xb0, (ops calltarget:$dst),
                        "br.call.sptk rp = $dst">, isB; // FIXME: teach llvm about branch regs?
// calls through a function descriptor
  def BRCALL_INDIRECT : RawForm<0x03, 0xb0, (ops GR:$branchreg),
                        "br.call.sptk rp = $branchreg">, isB; // FIXME: teach llvm about branch regs?
  def BRLCOND_CALL : RawForm<0x03, 0xb0, (ops PR:$qp, i64imm:$dst),
                     "($qp) brl.cond.call.sptk $dst">, isB;
  def BRCOND_CALL : RawForm<0x03, 0xb0, (ops PR:$qp, GR:$dst),
                    "($qp) br.cond.call.sptk $dst">, isB;
}

// Return branch:
let isTerminator = 1, isReturn = 1, noResults = 1 in
  def RET : AForm_DAG<0x03, 0x0b, (ops),
            "br.ret.sptk.many rp",
            [(retflag)]>, isB; // return
def : Pat<(ret), (RET)>;

// the evil stop bit of despair
def STOP : PseudoInstIA64<(ops variable_ops), ";;">;

diff --git a/lib/Target/IA64/IA64MachineFunctionInfo.h b/lib/Target/IA64/IA64MachineFunctionInfo.h
new file mode 100644
index 0000000..fb93056
--- /dev/null
+++ b/lib/Target/IA64/IA64MachineFunctionInfo.h
@@ -0,0 +1,34 @@
//===-- IA64MachineFunctionInfo.h - IA64-specific information ---*- C++ -*-===//
//===-- for MachineFunction ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
//===----------------------------------------------------------------------===//
//
// This file declares IA64-specific per-machine-function information.
+// +//===----------------------------------------------------------------------===// + +#ifndef IA64MACHINEFUNCTIONINFO_H +#define IA64MACHINEFUNCTIONINFO_H + +#include "llvm/CodeGen/MachineFunction.h" +//#include "IA64JITInfo.h" + +namespace llvm { + +class IA64FunctionInfo : public MachineFunctionInfo { + +public: + unsigned outRegsUsed; // how many 'out' registers are used + // by this machinefunction? (used to compute the appropriate + // entry in the 'alloc' instruction at the top of the + // machinefunction) + IA64FunctionInfo(MachineFunction& MF) { outRegsUsed=0; }; + +}; + +} // End llvm namespace + +#endif + diff --git a/lib/Target/IA64/IA64RegisterInfo.cpp b/lib/Target/IA64/IA64RegisterInfo.cpp new file mode 100644 index 0000000..08327f2 --- /dev/null +++ b/lib/Target/IA64/IA64RegisterInfo.cpp @@ -0,0 +1,388 @@ +//===- IA64RegisterInfo.cpp - IA64 Register Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the IA64 implementation of the MRegisterInfo class. This +// file is responsible for the frame pointer elimination optimization on IA64. 
//
//===----------------------------------------------------------------------===//

#include "IA64.h"
#include "IA64RegisterInfo.h"
#include "IA64InstrBuilder.h"
#include "IA64MachineFunctionInfo.h"
#include "llvm/Constants.h"
#include "llvm/Type.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/Target/TargetFrameInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

IA64RegisterInfo::IA64RegisterInfo(const TargetInstrInfo &tii)
  : IA64GenRegisterInfo(IA64::ADJUSTCALLSTACKDOWN, IA64::ADJUSTCALLSTACKUP),
    TII(tii) {}

// storeRegToStackSlot - Spill SrcReg into stack slot FrameIdx before MI.
// FP regs use STF_SPILL, general regs use ST8; predicate (1-bit) regs are
// first expanded into 0/1 in the scratch register r2 and stored as a word.
void IA64RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           unsigned SrcReg, int FrameIdx,
                                           const TargetRegisterClass *RC) const{

  if (RC == IA64::FPRegisterClass) {
    BuildMI(MBB, MI, TII.get(IA64::STF_SPILL)).addFrameIndex(FrameIdx)
      .addReg(SrcReg, false, false, true);
  } else if (RC == IA64::GRRegisterClass) {
    BuildMI(MBB, MI, TII.get(IA64::ST8)).addFrameIndex(FrameIdx)
      .addReg(SrcReg, false, false, true);
  } else if (RC == IA64::PRRegisterClass) {
    /* we use IA64::r2 as a temporary register for doing this hackery. */
    // first we load 0:
    BuildMI(MBB, MI, TII.get(IA64::MOV), IA64::r2).addReg(IA64::r0);
    // then conditionally add 1 (CADDIMM22 is predicated on SrcReg):
    BuildMI(MBB, MI, TII.get(IA64::CADDIMM22), IA64::r2).addReg(IA64::r2)
      .addImm(1).addReg(SrcReg, false, false, true);
    // and then store it to the stack
    BuildMI(MBB, MI, TII.get(IA64::ST8)).addFrameIndex(FrameIdx).addReg(IA64::r2);
  } else assert(0 &&
      "sorry, I don't know how to store this sort of reg in the stack\n");
}

// loadRegFromStackSlot - Reload DestReg from stack slot FrameIdx before MI.
// Mirrors storeRegToStackSlot: predicate regs are reconstructed by loading
// the spilled word into r2 and comparing it against zero.
void IA64RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MI,
                                            unsigned DestReg, int FrameIdx,
                                            const TargetRegisterClass *RC)const{

  if (RC == IA64::FPRegisterClass) {
    BuildMI(MBB, MI, TII.get(IA64::LDF_FILL), DestReg).addFrameIndex(FrameIdx);
  } else if (RC == IA64::GRRegisterClass) {
    BuildMI(MBB, MI, TII.get(IA64::LD8), DestReg).addFrameIndex(FrameIdx);
  } else if (RC == IA64::PRRegisterClass) {
    // first we load a byte from the stack into r2, our 'predicate hackery'
    // scratch reg
    BuildMI(MBB, MI, TII.get(IA64::LD8), IA64::r2).addFrameIndex(FrameIdx);
    // then we compare it to zero. If it _is_ zero, compare-not-equal to
    // r0 gives us 0, which is what we want, so that's nice.
    BuildMI(MBB, MI, TII.get(IA64::CMPNE), DestReg).addReg(IA64::r2).addReg(IA64::r0);
  } else assert(0 &&
      "sorry, I don't know how to load this sort of reg from the stack\n");
}

// copyRegToReg - Emit a register-to-register copy before MI.
void IA64RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MI,
                                    unsigned DestReg, unsigned SrcReg,
                                    const TargetRegisterClass *RC) const {

  if(RC == IA64::PRRegisterClass ) // if a bool, we use pseudocode
    // (SrcReg) DestReg = cmp.eq.unc(r0, r0)
    BuildMI(MBB, MI, TII.get(IA64::PCMPEQUNC), DestReg)
      .addReg(IA64::r0).addReg(IA64::r0).addReg(SrcReg);
  else // otherwise, MOV works (for both gen. regs and FP regs)
    BuildMI(MBB, MI, TII.get(IA64::MOV), DestReg).addReg(SrcReg);
}

// reMaterialize - Clone Orig, retarget its destination to DestReg, and insert
// the clone before I.
void IA64RegisterInfo::reMaterialize(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     unsigned DestReg,
                                     const MachineInstr *Orig) const {
  MachineInstr *MI = Orig->clone();
  MI->getOperand(0).setReg(DestReg);
  MBB.insert(I, MI);
}

// Null-terminated list of callee-saved registers (only r5, the frame
// pointer, here).
const unsigned* IA64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
                                                                        const {
  static const unsigned CalleeSavedRegs[] = {
    IA64::r5, 0
  };
  return CalleeSavedRegs;
}

// Register classes matching getCalleeSavedRegs(), position for position.
const TargetRegisterClass* const*
IA64RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
    &IA64::GRRegClass, 0
  };
  return CalleeSavedRegClasses;
}

// Registers the allocator must never hand out: r0 (zero), r1 (gp),
// r2/r22 (scratch used by this file), r5 (FP), r12 (SP), r13 (thread),
// and rp (return pointer).
BitVector IA64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(IA64::r0);
  Reserved.set(IA64::r1);
  Reserved.set(IA64::r2);
  Reserved.set(IA64::r5);
  Reserved.set(IA64::r12);
  Reserved.set(IA64::r13);
  Reserved.set(IA64::r22);
  Reserved.set(IA64::rp);
  return Reserved;
}

//===----------------------------------------------------------------------===//
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//

// hasFP - Return true if the specified function should have a dedicated frame
// pointer register. This is true if the function has variable sized allocas or
// if frame pointer elimination is disabled.
//
bool IA64RegisterInfo::hasFP(const MachineFunction &MF) const {
  return NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects();
}

// eliminateCallFramePseudoInstr - Replace the ADJUSTCALLSTACK{DOWN,UP}
// pseudos.  With a frame pointer they become real SP adjustments; without
// one the call-frame area is folded into the fixed frame and the pseudo is
// simply deleted.
void IA64RegisterInfo::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const {
  if (hasFP(MF)) {
    // If we have a frame pointer, turn the adjcallstackup instruction into a
    // 'sub SP, <amt>' and the adjcallstackdown instruction into 'add SP,
    // <amt>'
    MachineInstr *Old = I;
    unsigned Amount = Old->getOperand(0).getImmedValue();
    if (Amount != 0) {
      // We need to keep the stack aligned properly.  To do this, we round the
      // amount of space needed for the outgoing arguments up to the next
      // alignment boundary.
      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
      Amount = (Amount+Align-1)/Align*Align;

      MachineInstr *New;
      if (Old->getOpcode() == IA64::ADJUSTCALLSTACKDOWN) {
        New=BuildMI(TII.get(IA64::ADDIMM22), IA64::r12).addReg(IA64::r12)
          .addImm(-Amount);
      } else {
        assert(Old->getOpcode() == IA64::ADJUSTCALLSTACKUP);
        New=BuildMI(TII.get(IA64::ADDIMM22), IA64::r12).addReg(IA64::r12)
          .addImm(Amount);
      }

      // Replace the pseudo instruction with a new instruction...
      MBB.insert(I, New);
    }
  }

  MBB.erase(I);
}

// eliminateFrameIndex - Rewrite an abstract frame-index operand in *II into
// a concrete base register + offset.  The effective address is always
// materialized into the scratch register r22, which then replaces the
// frame-index operand.
void IA64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                           int SPAdj, RegScavenger *RS)const{
  assert(SPAdj == 0 && "Unexpected");

  unsigned i = 0;
  MachineInstr &MI = *II;
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();

  bool FP = hasFP(MF);

  while (!MI.getOperand(i).isFrameIndex()) {
    ++i;
    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
  }

  int FrameIndex = MI.getOperand(i).getFrameIndex();

  // choose a base register: ( hasFP? framepointer : stack pointer )
  unsigned BaseRegister = FP ? IA64::r5 : IA64::r12;
  // Add the base register
  // NOTE(review): this ChangeToRegister is immediately overwritten with r22
  // in both branches below, so it appears to be a dead store -- confirm.
  MI.getOperand(i).ChangeToRegister(BaseRegister, false);

  // Now add the frame object offset to the offset from r1.
  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);

  // If we're not using a Frame Pointer that has been set to the value of the
  // SP before having the stack size subtracted from it, then add the stack size
  // to Offset to get the correct offset.
  Offset += MF.getFrameInfo()->getStackSize();

  // XXX: we use 'r22' as another hack+slash temporary register here :(
  if (Offset <= 8191 && Offset >= -8192) { // smallish offset
    // Fix up the old:
    MI.getOperand(i).ChangeToRegister(IA64::r22, false);
    //insert the new
    MachineInstr* nMI=BuildMI(TII.get(IA64::ADDIMM22), IA64::r22)
      .addReg(BaseRegister).addImm(Offset);
    MBB.insert(II, nMI);
  } else { // it's big; build the offset with a movl, then add the base
    //fix up the old:
    MI.getOperand(i).ChangeToRegister(IA64::r22, false);
    MachineInstr* nMI;
    nMI=BuildMI(TII.get(IA64::MOVLIMM64), IA64::r22).addImm(Offset);
    MBB.insert(II, nMI);
    nMI=BuildMI(TII.get(IA64::ADD), IA64::r22).addReg(BaseRegister)
      .addReg(IA64::r22);
    MBB.insert(II, nMI);
  }

}

// emitPrologue - Insert the 'alloc' (sized from the stacked GRs and out-regs
// actually used), carve out the stack frame, and (if needed) save/set the
// frame pointer r5.
void IA64RegisterInfo::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MachineInstr *MI;
  bool FP = hasFP(MF);

  // first, we handle the 'alloc' instruction, that should be right up the
  // top of any function
  static const unsigned RegsInOrder[96] = { // there are 96 GPRs the
                                            // RSE worries about
        IA64::r32, IA64::r33, IA64::r34, IA64::r35,
        IA64::r36, IA64::r37, IA64::r38, IA64::r39, IA64::r40, IA64::r41,
        IA64::r42, IA64::r43, IA64::r44, IA64::r45, IA64::r46, IA64::r47,
        IA64::r48, IA64::r49, IA64::r50, IA64::r51, IA64::r52, IA64::r53,
        IA64::r54, IA64::r55, IA64::r56, IA64::r57, IA64::r58, IA64::r59,
        IA64::r60, IA64::r61, IA64::r62, IA64::r63, IA64::r64, IA64::r65,
        IA64::r66, IA64::r67, IA64::r68, IA64::r69, IA64::r70, IA64::r71,
        IA64::r72, IA64::r73, IA64::r74, IA64::r75, IA64::r76, IA64::r77,
        IA64::r78, IA64::r79, IA64::r80, IA64::r81, IA64::r82, IA64::r83,
        IA64::r84, IA64::r85, IA64::r86, IA64::r87, IA64::r88, IA64::r89,
        IA64::r90, IA64::r91, IA64::r92, IA64::r93, IA64::r94, IA64::r95,
        IA64::r96, IA64::r97, IA64::r98, IA64::r99, IA64::r100, IA64::r101,
        IA64::r102, IA64::r103, IA64::r104, IA64::r105, IA64::r106, IA64::r107,
        IA64::r108, IA64::r109, IA64::r110, IA64::r111, IA64::r112, IA64::r113,
        IA64::r114, IA64::r115, IA64::r116, IA64::r117, IA64::r118, IA64::r119,
        IA64::r120, IA64::r121, IA64::r122, IA64::r123, IA64::r124, IA64::r125,
        IA64::r126, IA64::r127 };

  unsigned numStackedGPRsUsed=0;
  for(int i=0; i<96; i++) {
    if(MF.isPhysRegUsed(RegsInOrder[i]))
      numStackedGPRsUsed=i+1; // (i+1 and not ++ - consider fn(fp, fp, int)
  }

  unsigned numOutRegsUsed=MF.getInfo<IA64FunctionInfo>()->outRegsUsed;

  // XXX FIXME : this code should be a bit more reliable (in case there _isn't_
  // a pseudo_alloc in the MBB)
  unsigned dstRegOfPseudoAlloc;
  for(MBBI = MBB.begin(); /*MBBI->getOpcode() != IA64::PSEUDO_ALLOC*/; ++MBBI) {
    assert(MBBI != MBB.end());
    if(MBBI->getOpcode() == IA64::PSEUDO_ALLOC) {
      dstRegOfPseudoAlloc=MBBI->getOperand(0).getReg();
      break;
    }
  }

  MI=BuildMI(TII.get(IA64::ALLOC)).addReg(dstRegOfPseudoAlloc).addImm(0). \
     addImm(numStackedGPRsUsed).addImm(numOutRegsUsed).addImm(0);
  MBB.insert(MBBI, MI);

  // Get the number of bytes to allocate from the FrameInfo
  unsigned NumBytes = MFI->getStackSize();

  if(FP)
    NumBytes += 8; // reserve space for the old FP

  // Do we need to allocate space on the stack?
  if (NumBytes == 0)
    return;

  // Add 16 bytes at the bottom of the stack (scratch area)
  // and round the size to a multiple of the alignment.
  unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
  unsigned Size = 16 + (FP ? 8 : 0);
  NumBytes = (NumBytes+Size+Align-1)/Align*Align;

  // Update frame info to pretend that this is part of the stack...
  MFI->setStackSize(NumBytes);

  // adjust stack pointer: r12 -= numbytes
  if (NumBytes <= 8191) {
    MI=BuildMI(TII.get(IA64::ADDIMM22),IA64::r12).addReg(IA64::r12).
      addImm(-NumBytes);
    MBB.insert(MBBI, MI);
  } else { // we use r22 as a scratch register here
    MI=BuildMI(TII.get(IA64::MOVLIMM64), IA64::r22).addImm(-NumBytes);
    // FIXME: MOVLSI32 expects a _u_32imm
    MBB.insert(MBBI, MI);  // first load the decrement into r22
    MI=BuildMI(TII.get(IA64::ADD), IA64::r12).addReg(IA64::r12).addReg(IA64::r22);
    MBB.insert(MBBI, MI);  // then add (subtract) it to r12 (stack ptr)
  }

  // now if we need to, save the old FP and set the new
  if (FP) {
    MI = BuildMI(TII.get(IA64::ST8)).addReg(IA64::r12).addReg(IA64::r5);
    MBB.insert(MBBI, MI);
    // this must be the last instr in the prolog ?  (XXX: why??)
    MI = BuildMI(TII.get(IA64::MOV), IA64::r5).addReg(IA64::r12);
    MBB.insert(MBBI, MI);
  }

}

// emitEpilogue - Undo the prologue before the RET: restore the old FP (if
// any) and give back the stack frame.
void IA64RegisterInfo::emitEpilogue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
  const MachineFrameInfo *MFI = MF.getFrameInfo();
  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  MachineInstr *MI;
  assert(MBBI->getOpcode() == IA64::RET &&
         "Can only insert epilog into returning blocks");

  bool FP = hasFP(MF);

  // Get the number of bytes allocated from the FrameInfo...
  unsigned NumBytes = MFI->getStackSize();

  //now if we need to, restore the old FP
  if (FP)
  {
    //copy the FP into the SP (discards allocas)
    MI=BuildMI(TII.get(IA64::MOV), IA64::r12).addReg(IA64::r5);
    MBB.insert(MBBI, MI);
    //restore the FP
    MI=BuildMI(TII.get(IA64::LD8), IA64::r5).addReg(IA64::r5);
    MBB.insert(MBBI, MI);
  }

  if (NumBytes != 0)
  {
    if (NumBytes <= 8191) {
      MI=BuildMI(TII.get(IA64::ADDIMM22),IA64::r12).addReg(IA64::r12).
        addImm(NumBytes);
      MBB.insert(MBBI, MI);
    } else {
      MI=BuildMI(TII.get(IA64::MOVLIMM64), IA64::r22).addImm(NumBytes);
      MBB.insert(MBBI, MI);
      MI=BuildMI(TII.get(IA64::ADD), IA64::r12).addReg(IA64::r12).
        addReg(IA64::r22);
      MBB.insert(MBBI, MI);
    }
  }

}

// Debug information queries (unimplemented for this target).
unsigned IA64RegisterInfo::getRARegister() const {
  assert(0 && "What is the return address register");
  return 0;
}

// Frame pointer when one exists, otherwise the stack pointer.
unsigned IA64RegisterInfo::getFrameRegister(MachineFunction &MF) const {
  return hasFP(MF) ? IA64::r5 : IA64::r12;
}

unsigned IA64RegisterInfo::getEHExceptionRegister() const {
  assert(0 && "What is the exception register");
  return 0;
}

unsigned IA64RegisterInfo::getEHHandlerRegister() const {
  assert(0 && "What is the exception handler register");
  return 0;
}

#include "IA64GenRegisterInfo.inc"

diff --git a/lib/Target/IA64/IA64RegisterInfo.h b/lib/Target/IA64/IA64RegisterInfo.h
new file mode 100644
index 0000000..162ad5a
--- /dev/null
+++ b/lib/Target/IA64/IA64RegisterInfo.h
@@ -0,0 +1,81 @@
//===- IA64RegisterInfo.h - IA64 Register Information Impl ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Duraid Madina and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the IA64 implementation of the MRegisterInfo class.
+// +//===----------------------------------------------------------------------===// + +#ifndef IA64REGISTERINFO_H +#define IA64REGISTERINFO_H + +#include "llvm/Target/MRegisterInfo.h" +#include "IA64GenRegisterInfo.h.inc" + +namespace llvm { class llvm::Type; } + +namespace llvm { + +class TargetInstrInfo; + +struct IA64RegisterInfo : public IA64GenRegisterInfo { + const TargetInstrInfo &TII; + + IA64RegisterInfo(const TargetInstrInfo &tii); + + /// Code Generation virtual methods... + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, const MachineInstr *Orig) const; + + const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + const TargetRegisterClass* const* getCalleeSavedRegClasses( + const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, RegScavenger *RS = NULL) const; + + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + + // Exception handling queries. 
+ unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +} // End llvm namespace + +#endif + diff --git a/lib/Target/IA64/IA64RegisterInfo.td b/lib/Target/IA64/IA64RegisterInfo.td new file mode 100644 index 0000000..087c18f --- /dev/null +++ b/lib/Target/IA64/IA64RegisterInfo.td @@ -0,0 +1,508 @@ +//===- IA64RegisterInfo.td - Describe the IA64 Register File ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the IA64 register file, defining the registers +// themselves, aliases between the registers, and the register classes built +// out of the registers. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Register definitions... 
//

// Base class for all IA64 registers; keeps every register in the IA64
// namespace so names don't collide with other targets.
class IA64Register<string n> : Register<n> {
  let Namespace = "IA64";
}

// GR - One of 128 64-bit general registers.  (Note: elsewhere in this
// backend GRs are spilled/reloaded with st8/ld8 and matched against i64
// patterns, so they are 64-bit, not 32-bit as an earlier comment claimed.)
class GR<bits<7> num, string n> : IA64Register<n> {
  field bits<7> Num = num;
}

// FP - One of 128 82-bit floating-point registers
class FP<bits<7> num, string n> : IA64Register<n> {
  field bits<7> Num = num;
}

// PR - One of 64 1-bit predicate registers
class PR<bits<6> num, string n> : IA64Register<n> {
  field bits<6> Num = num;
}

/* general registers */
def r0 : GR< 0, "r0">, DwarfRegNum<0>;
def r1 : GR< 1, "r1">, DwarfRegNum<1>;
def r2 : GR< 2, "r2">, DwarfRegNum<2>;
def r3 : GR< 3, "r3">, DwarfRegNum<3>;
def r4 : GR< 4, "r4">, DwarfRegNum<4>;
def r5 : GR< 5, "r5">, DwarfRegNum<5>;
def r6 : GR< 6, "r6">, DwarfRegNum<6>;
def r7 : GR< 7, "r7">, DwarfRegNum<7>;
def r8 : GR< 8, "r8">, DwarfRegNum<8>;
def r9 : GR< 9, "r9">, DwarfRegNum<9>;
def r10 : GR< 10, "r10">, DwarfRegNum<10>;
def r11 : GR< 11, "r11">, DwarfRegNum<11>;
def r12 : GR< 12, "r12">, DwarfRegNum<12>;
def r13 : GR< 13, "r13">, DwarfRegNum<13>;
def r14 : GR< 14, "r14">, DwarfRegNum<14>;
def r15 : GR< 15, "r15">, DwarfRegNum<15>;
def r16 : GR< 16, "r16">, DwarfRegNum<16>;
def r17 : GR< 17, "r17">, DwarfRegNum<17>;
def r18 : GR< 18, "r18">, DwarfRegNum<18>;
def r19 : GR< 19, "r19">, DwarfRegNum<19>;
def r20 : GR< 20, "r20">, DwarfRegNum<20>;
def r21 : GR< 21, "r21">, DwarfRegNum<21>;
def r22 : GR< 22, "r22">, DwarfRegNum<22>;
def r23 : GR< 23, "r23">, DwarfRegNum<23>;
def r24 : GR< 24, "r24">, DwarfRegNum<24>;
def r25 : GR< 25, "r25">, DwarfRegNum<25>;
def r26 : GR< 26, "r26">, DwarfRegNum<26>;
def r27 : GR< 27, "r27">, DwarfRegNum<27>;
def r28 : GR< 28, "r28">, DwarfRegNum<28>;
def r29 : GR< 29, "r29">, DwarfRegNum<29>;
def r30 : GR< 30, "r30">, DwarfRegNum<30>;
def r31 : GR< 31, "r31">, DwarfRegNum<31>;
def r32 : GR< 32, "r32">, DwarfRegNum<32>;
def r33 : GR< 33, "r33">, DwarfRegNum<33>;
def r34 : GR< 34, "r34">,
DwarfRegNum<34>; +def r35 : GR< 35, "r35">, DwarfRegNum<35>; +def r36 : GR< 36, "r36">, DwarfRegNum<36>; +def r37 : GR< 37, "r37">, DwarfRegNum<37>; +def r38 : GR< 38, "r38">, DwarfRegNum<38>; +def r39 : GR< 39, "r39">, DwarfRegNum<39>; +def r40 : GR< 40, "r40">, DwarfRegNum<40>; +def r41 : GR< 41, "r41">, DwarfRegNum<41>; +def r42 : GR< 42, "r42">, DwarfRegNum<42>; +def r43 : GR< 43, "r43">, DwarfRegNum<43>; +def r44 : GR< 44, "r44">, DwarfRegNum<44>; +def r45 : GR< 45, "r45">, DwarfRegNum<45>; +def r46 : GR< 46, "r46">, DwarfRegNum<46>; +def r47 : GR< 47, "r47">, DwarfRegNum<47>; +def r48 : GR< 48, "r48">, DwarfRegNum<48>; +def r49 : GR< 49, "r49">, DwarfRegNum<49>; +def r50 : GR< 50, "r50">, DwarfRegNum<50>; +def r51 : GR< 51, "r51">, DwarfRegNum<51>; +def r52 : GR< 52, "r52">, DwarfRegNum<52>; +def r53 : GR< 53, "r53">, DwarfRegNum<53>; +def r54 : GR< 54, "r54">, DwarfRegNum<54>; +def r55 : GR< 55, "r55">, DwarfRegNum<55>; +def r56 : GR< 56, "r56">, DwarfRegNum<56>; +def r57 : GR< 57, "r57">, DwarfRegNum<57>; +def r58 : GR< 58, "r58">, DwarfRegNum<58>; +def r59 : GR< 59, "r59">, DwarfRegNum<59>; +def r60 : GR< 60, "r60">, DwarfRegNum<60>; +def r61 : GR< 61, "r61">, DwarfRegNum<61>; +def r62 : GR< 62, "r62">, DwarfRegNum<62>; +def r63 : GR< 63, "r63">, DwarfRegNum<63>; +def r64 : GR< 64, "r64">, DwarfRegNum<64>; +def r65 : GR< 65, "r65">, DwarfRegNum<65>; +def r66 : GR< 66, "r66">, DwarfRegNum<66>; +def r67 : GR< 67, "r67">, DwarfRegNum<67>; +def r68 : GR< 68, "r68">, DwarfRegNum<68>; +def r69 : GR< 69, "r69">, DwarfRegNum<69>; +def r70 : GR< 70, "r70">, DwarfRegNum<70>; +def r71 : GR< 71, "r71">, DwarfRegNum<71>; +def r72 : GR< 72, "r72">, DwarfRegNum<72>; +def r73 : GR< 73, "r73">, DwarfRegNum<73>; +def r74 : GR< 74, "r74">, DwarfRegNum<74>; +def r75 : GR< 75, "r75">, DwarfRegNum<75>; +def r76 : GR< 76, "r76">, DwarfRegNum<76>; +def r77 : GR< 77, "r77">, DwarfRegNum<77>; +def r78 : GR< 78, "r78">, DwarfRegNum<78>; +def r79 : GR< 79, "r79">, DwarfRegNum<79>; 
+def r80 : GR< 80, "r80">, DwarfRegNum<80>; +def r81 : GR< 81, "r81">, DwarfRegNum<81>; +def r82 : GR< 82, "r82">, DwarfRegNum<82>; +def r83 : GR< 83, "r83">, DwarfRegNum<83>; +def r84 : GR< 84, "r84">, DwarfRegNum<84>; +def r85 : GR< 85, "r85">, DwarfRegNum<85>; +def r86 : GR< 86, "r86">, DwarfRegNum<86>; +def r87 : GR< 87, "r87">, DwarfRegNum<87>; +def r88 : GR< 88, "r88">, DwarfRegNum<88>; +def r89 : GR< 89, "r89">, DwarfRegNum<89>; +def r90 : GR< 90, "r90">, DwarfRegNum<90>; +def r91 : GR< 91, "r91">, DwarfRegNum<91>; +def r92 : GR< 92, "r92">, DwarfRegNum<92>; +def r93 : GR< 93, "r93">, DwarfRegNum<93>; +def r94 : GR< 94, "r94">, DwarfRegNum<94>; +def r95 : GR< 95, "r95">, DwarfRegNum<95>; +def r96 : GR< 96, "r96">, DwarfRegNum<96>; +def r97 : GR< 97, "r97">, DwarfRegNum<97>; +def r98 : GR< 98, "r98">, DwarfRegNum<98>; +def r99 : GR< 99, "r99">, DwarfRegNum<99>; +def r100 : GR< 100, "r100">, DwarfRegNum<100>; +def r101 : GR< 101, "r101">, DwarfRegNum<101>; +def r102 : GR< 102, "r102">, DwarfRegNum<102>; +def r103 : GR< 103, "r103">, DwarfRegNum<103>; +def r104 : GR< 104, "r104">, DwarfRegNum<104>; +def r105 : GR< 105, "r105">, DwarfRegNum<105>; +def r106 : GR< 106, "r106">, DwarfRegNum<106>; +def r107 : GR< 107, "r107">, DwarfRegNum<107>; +def r108 : GR< 108, "r108">, DwarfRegNum<108>; +def r109 : GR< 109, "r109">, DwarfRegNum<109>; +def r110 : GR< 110, "r110">, DwarfRegNum<110>; +def r111 : GR< 111, "r111">, DwarfRegNum<111>; +def r112 : GR< 112, "r112">, DwarfRegNum<112>; +def r113 : GR< 113, "r113">, DwarfRegNum<113>; +def r114 : GR< 114, "r114">, DwarfRegNum<114>; +def r115 : GR< 115, "r115">, DwarfRegNum<115>; +def r116 : GR< 116, "r116">, DwarfRegNum<116>; +def r117 : GR< 117, "r117">, DwarfRegNum<117>; +def r118 : GR< 118, "r118">, DwarfRegNum<118>; +def r119 : GR< 119, "r119">, DwarfRegNum<119>; +def r120 : GR< 120, "r120">, DwarfRegNum<120>; +def r121 : GR< 121, "r121">, DwarfRegNum<121>; +def r122 : GR< 122, "r122">, DwarfRegNum<122>; +def r123 : GR< 
123, "r123">, DwarfRegNum<123>; +def r124 : GR< 124, "r124">, DwarfRegNum<124>; +def r125 : GR< 125, "r125">, DwarfRegNum<125>; +def r126 : GR< 126, "r126">, DwarfRegNum<126>; +def r127 : GR< 127, "r127">, DwarfRegNum<127>; + +/* floating-point registers */ +def F0 : FP< 0, "f0">, DwarfRegNum<128>; +def F1 : FP< 1, "f1">, DwarfRegNum<129>; +def F2 : FP< 2, "f2">, DwarfRegNum<130>; +def F3 : FP< 3, "f3">, DwarfRegNum<131>; +def F4 : FP< 4, "f4">, DwarfRegNum<132>; +def F5 : FP< 5, "f5">, DwarfRegNum<133>; +def F6 : FP< 6, "f6">, DwarfRegNum<134>; +def F7 : FP< 7, "f7">, DwarfRegNum<135>; +def F8 : FP< 8, "f8">, DwarfRegNum<136>; +def F9 : FP< 9, "f9">, DwarfRegNum<137>; +def F10 : FP< 10, "f10">, DwarfRegNum<138>; +def F11 : FP< 11, "f11">, DwarfRegNum<139>; +def F12 : FP< 12, "f12">, DwarfRegNum<140>; +def F13 : FP< 13, "f13">, DwarfRegNum<141>; +def F14 : FP< 14, "f14">, DwarfRegNum<142>; +def F15 : FP< 15, "f15">, DwarfRegNum<143>; +def F16 : FP< 16, "f16">, DwarfRegNum<144>; +def F17 : FP< 17, "f17">, DwarfRegNum<145>; +def F18 : FP< 18, "f18">, DwarfRegNum<146>; +def F19 : FP< 19, "f19">, DwarfRegNum<147>; +def F20 : FP< 20, "f20">, DwarfRegNum<148>; +def F21 : FP< 21, "f21">, DwarfRegNum<149>; +def F22 : FP< 22, "f22">, DwarfRegNum<150>; +def F23 : FP< 23, "f23">, DwarfRegNum<151>; +def F24 : FP< 24, "f24">, DwarfRegNum<152>; +def F25 : FP< 25, "f25">, DwarfRegNum<153>; +def F26 : FP< 26, "f26">, DwarfRegNum<154>; +def F27 : FP< 27, "f27">, DwarfRegNum<155>; +def F28 : FP< 28, "f28">, DwarfRegNum<156>; +def F29 : FP< 29, "f29">, DwarfRegNum<157>; +def F30 : FP< 30, "f30">, DwarfRegNum<158>; +def F31 : FP< 31, "f31">, DwarfRegNum<159>; +def F32 : FP< 32, "f32">, DwarfRegNum<160>; +def F33 : FP< 33, "f33">, DwarfRegNum<161>; +def F34 : FP< 34, "f34">, DwarfRegNum<162>; +def F35 : FP< 35, "f35">, DwarfRegNum<163>; +def F36 : FP< 36, "f36">, DwarfRegNum<164>; +def F37 : FP< 37, "f37">, DwarfRegNum<165>; +def F38 : FP< 38, "f38">, DwarfRegNum<166>; +def F39 : FP< 
39, "f39">, DwarfRegNum<167>; +def F40 : FP< 40, "f40">, DwarfRegNum<168>; +def F41 : FP< 41, "f41">, DwarfRegNum<169>; +def F42 : FP< 42, "f42">, DwarfRegNum<170>; +def F43 : FP< 43, "f43">, DwarfRegNum<171>; +def F44 : FP< 44, "f44">, DwarfRegNum<172>; +def F45 : FP< 45, "f45">, DwarfRegNum<173>; +def F46 : FP< 46, "f46">, DwarfRegNum<174>; +def F47 : FP< 47, "f47">, DwarfRegNum<175>; +def F48 : FP< 48, "f48">, DwarfRegNum<176>; +def F49 : FP< 49, "f49">, DwarfRegNum<177>; +def F50 : FP< 50, "f50">, DwarfRegNum<178>; +def F51 : FP< 51, "f51">, DwarfRegNum<179>; +def F52 : FP< 52, "f52">, DwarfRegNum<180>; +def F53 : FP< 53, "f53">, DwarfRegNum<181>; +def F54 : FP< 54, "f54">, DwarfRegNum<182>; +def F55 : FP< 55, "f55">, DwarfRegNum<183>; +def F56 : FP< 56, "f56">, DwarfRegNum<184>; +def F57 : FP< 57, "f57">, DwarfRegNum<185>; +def F58 : FP< 58, "f58">, DwarfRegNum<186>; +def F59 : FP< 59, "f59">, DwarfRegNum<187>; +def F60 : FP< 60, "f60">, DwarfRegNum<188>; +def F61 : FP< 61, "f61">, DwarfRegNum<189>; +def F62 : FP< 62, "f62">, DwarfRegNum<190>; +def F63 : FP< 63, "f63">, DwarfRegNum<191>; +def F64 : FP< 64, "f64">, DwarfRegNum<192>; +def F65 : FP< 65, "f65">, DwarfRegNum<193>; +def F66 : FP< 66, "f66">, DwarfRegNum<194>; +def F67 : FP< 67, "f67">, DwarfRegNum<195>; +def F68 : FP< 68, "f68">, DwarfRegNum<196>; +def F69 : FP< 69, "f69">, DwarfRegNum<197>; +def F70 : FP< 70, "f70">, DwarfRegNum<198>; +def F71 : FP< 71, "f71">, DwarfRegNum<199>; +def F72 : FP< 72, "f72">, DwarfRegNum<200>; +def F73 : FP< 73, "f73">, DwarfRegNum<201>; +def F74 : FP< 74, "f74">, DwarfRegNum<202>; +def F75 : FP< 75, "f75">, DwarfRegNum<203>; +def F76 : FP< 76, "f76">, DwarfRegNum<204>; +def F77 : FP< 77, "f77">, DwarfRegNum<205>; +def F78 : FP< 78, "f78">, DwarfRegNum<206>; +def F79 : FP< 79, "f79">, DwarfRegNum<207>; +def F80 : FP< 80, "f80">, DwarfRegNum<208>; +def F81 : FP< 81, "f81">, DwarfRegNum<209>; +def F82 : FP< 82, "f82">, DwarfRegNum<210>; +def F83 : FP< 83, "f83">, 
DwarfRegNum<211>; +def F84 : FP< 84, "f84">, DwarfRegNum<212>; +def F85 : FP< 85, "f85">, DwarfRegNum<213>; +def F86 : FP< 86, "f86">, DwarfRegNum<214>; +def F87 : FP< 87, "f87">, DwarfRegNum<215>; +def F88 : FP< 88, "f88">, DwarfRegNum<216>; +def F89 : FP< 89, "f89">, DwarfRegNum<217>; +def F90 : FP< 90, "f90">, DwarfRegNum<218>; +def F91 : FP< 91, "f91">, DwarfRegNum<219>; +def F92 : FP< 92, "f92">, DwarfRegNum<220>; +def F93 : FP< 93, "f93">, DwarfRegNum<221>; +def F94 : FP< 94, "f94">, DwarfRegNum<222>; +def F95 : FP< 95, "f95">, DwarfRegNum<223>; +def F96 : FP< 96, "f96">, DwarfRegNum<224>; +def F97 : FP< 97, "f97">, DwarfRegNum<225>; +def F98 : FP< 98, "f98">, DwarfRegNum<226>; +def F99 : FP< 99, "f99">, DwarfRegNum<227>; +def F100 : FP< 100, "f100">, DwarfRegNum<228>; +def F101 : FP< 101, "f101">, DwarfRegNum<229>; +def F102 : FP< 102, "f102">, DwarfRegNum<230>; +def F103 : FP< 103, "f103">, DwarfRegNum<231>; +def F104 : FP< 104, "f104">, DwarfRegNum<232>; +def F105 : FP< 105, "f105">, DwarfRegNum<233>; +def F106 : FP< 106, "f106">, DwarfRegNum<234>; +def F107 : FP< 107, "f107">, DwarfRegNum<235>; +def F108 : FP< 108, "f108">, DwarfRegNum<236>; +def F109 : FP< 109, "f109">, DwarfRegNum<237>; +def F110 : FP< 110, "f110">, DwarfRegNum<238>; +def F111 : FP< 111, "f111">, DwarfRegNum<239>; +def F112 : FP< 112, "f112">, DwarfRegNum<240>; +def F113 : FP< 113, "f113">, DwarfRegNum<241>; +def F114 : FP< 114, "f114">, DwarfRegNum<242>; +def F115 : FP< 115, "f115">, DwarfRegNum<243>; +def F116 : FP< 116, "f116">, DwarfRegNum<244>; +def F117 : FP< 117, "f117">, DwarfRegNum<245>; +def F118 : FP< 118, "f118">, DwarfRegNum<246>; +def F119 : FP< 119, "f119">, DwarfRegNum<247>; +def F120 : FP< 120, "f120">, DwarfRegNum<248>; +def F121 : FP< 121, "f121">, DwarfRegNum<249>; +def F122 : FP< 122, "f122">, DwarfRegNum<250>; +def F123 : FP< 123, "f123">, DwarfRegNum<251>; +def F124 : FP< 124, "f124">, DwarfRegNum<252>; +def F125 : FP< 125, "f125">, DwarfRegNum<253>; +def F126 : 
FP< 126, "f126">, DwarfRegNum<254>; +def F127 : FP< 127, "f127">, DwarfRegNum<255>; + +/* predicate registers */ +def p0 : PR< 0, "p0">, DwarfRegNum<256>; +def p1 : PR< 1, "p1">, DwarfRegNum<257>; +def p2 : PR< 2, "p2">, DwarfRegNum<258>; +def p3 : PR< 3, "p3">, DwarfRegNum<259>; +def p4 : PR< 4, "p4">, DwarfRegNum<260>; +def p5 : PR< 5, "p5">, DwarfRegNum<261>; +def p6 : PR< 6, "p6">, DwarfRegNum<262>; +def p7 : PR< 7, "p7">, DwarfRegNum<263>; +def p8 : PR< 8, "p8">, DwarfRegNum<264>; +def p9 : PR< 9, "p9">, DwarfRegNum<265>; +def p10 : PR< 10, "p10">, DwarfRegNum<266>; +def p11 : PR< 11, "p11">, DwarfRegNum<267>; +def p12 : PR< 12, "p12">, DwarfRegNum<268>; +def p13 : PR< 13, "p13">, DwarfRegNum<269>; +def p14 : PR< 14, "p14">, DwarfRegNum<270>; +def p15 : PR< 15, "p15">, DwarfRegNum<271>; +def p16 : PR< 16, "p16">, DwarfRegNum<272>; +def p17 : PR< 17, "p17">, DwarfRegNum<273>; +def p18 : PR< 18, "p18">, DwarfRegNum<274>; +def p19 : PR< 19, "p19">, DwarfRegNum<275>; +def p20 : PR< 20, "p20">, DwarfRegNum<276>; +def p21 : PR< 21, "p21">, DwarfRegNum<277>; +def p22 : PR< 22, "p22">, DwarfRegNum<278>; +def p23 : PR< 23, "p23">, DwarfRegNum<279>; +def p24 : PR< 24, "p24">, DwarfRegNum<280>; +def p25 : PR< 25, "p25">, DwarfRegNum<281>; +def p26 : PR< 26, "p26">, DwarfRegNum<282>; +def p27 : PR< 27, "p27">, DwarfRegNum<283>; +def p28 : PR< 28, "p28">, DwarfRegNum<284>; +def p29 : PR< 29, "p29">, DwarfRegNum<285>; +def p30 : PR< 30, "p30">, DwarfRegNum<286>; +def p31 : PR< 31, "p31">, DwarfRegNum<287>; +def p32 : PR< 32, "p32">, DwarfRegNum<288>; +def p33 : PR< 33, "p33">, DwarfRegNum<289>; +def p34 : PR< 34, "p34">, DwarfRegNum<290>; +def p35 : PR< 35, "p35">, DwarfRegNum<291>; +def p36 : PR< 36, "p36">, DwarfRegNum<292>; +def p37 : PR< 37, "p37">, DwarfRegNum<293>; +def p38 : PR< 38, "p38">, DwarfRegNum<294>; +def p39 : PR< 39, "p39">, DwarfRegNum<295>; +def p40 : PR< 40, "p40">, DwarfRegNum<296>; +def p41 : PR< 41, "p41">, DwarfRegNum<297>; +def p42 : PR< 42, "p42">, 
DwarfRegNum<298>; +def p43 : PR< 43, "p43">, DwarfRegNum<299>; +def p44 : PR< 44, "p44">, DwarfRegNum<300>; +def p45 : PR< 45, "p45">, DwarfRegNum<301>; +def p46 : PR< 46, "p46">, DwarfRegNum<302>; +def p47 : PR< 47, "p47">, DwarfRegNum<303>; +def p48 : PR< 48, "p48">, DwarfRegNum<304>; +def p49 : PR< 49, "p49">, DwarfRegNum<305>; +def p50 : PR< 50, "p50">, DwarfRegNum<306>; +def p51 : PR< 51, "p51">, DwarfRegNum<307>; +def p52 : PR< 52, "p52">, DwarfRegNum<308>; +def p53 : PR< 53, "p53">, DwarfRegNum<309>; +def p54 : PR< 54, "p54">, DwarfRegNum<310>; +def p55 : PR< 55, "p55">, DwarfRegNum<311>; +def p56 : PR< 56, "p56">, DwarfRegNum<312>; +def p57 : PR< 57, "p57">, DwarfRegNum<313>; +def p58 : PR< 58, "p58">, DwarfRegNum<314>; +def p59 : PR< 59, "p59">, DwarfRegNum<315>; +def p60 : PR< 60, "p60">, DwarfRegNum<316>; +def p61 : PR< 61, "p61">, DwarfRegNum<317>; +def p62 : PR< 62, "p62">, DwarfRegNum<318>; +def p63 : PR< 63, "p63">, DwarfRegNum<319>; + +// XXX : this is temporary, we'll eventually have the output registers +// in the general purpose register class too? +def out0 : GR<0, "out0">, DwarfRegNum<120>; +def out1 : GR<1, "out1">, DwarfRegNum<121>; +def out2 : GR<2, "out2">, DwarfRegNum<122>; +def out3 : GR<3, "out3">, DwarfRegNum<123>; +def out4 : GR<4, "out4">, DwarfRegNum<124>; +def out5 : GR<5, "out5">, DwarfRegNum<125>; +def out6 : GR<6, "out6">, DwarfRegNum<126>; +def out7 : GR<7, "out7">, DwarfRegNum<127>; + +// application (special) registers: + +// "previous function state" application register +def AR_PFS : GR<0, "ar.pfs">, DwarfRegNum<331>; + +// "return pointer" (this is really branch register b0) +def rp : GR<0, "rp">, DwarfRegNum<-1>; + +// branch reg 6 +def B6 : GR<0, "b6">, DwarfRegNum<326>; + +//===----------------------------------------------------------------------===// +// Register Class Definitions... now that we have all of the pieces, define the +// top-level register classes. 
The order specified in the register list is +// implicitly defined to be the register allocation order. +// + +// these are the scratch (+stacked) general registers +// FIXME/XXX we also reserve a frame pointer (r5) +// FIXME/XXX we also reserve r2 for spilling/filling predicates +// in IA64RegisterInfo.cpp +// FIXME/XXX we also reserve r22 for calculating addresses +// in IA64RegisterInfo.cpp + +def GR : RegisterClass<"IA64", [i64], 64, + [ + +//FIXME!: for both readability and performance, we don't want the out +// registers to be the first ones allocated + + out7, out6, out5, out4, out3, out2, out1, out0, + r3, r8, r9, r10, r11, r14, r15, + r16, r17, r18, r19, r20, r21, r23, + r24, r25, r26, r27, r28, r29, r30, r31, + r32, r33, r34, r35, r36, r37, r38, r39, + r40, r41, r42, r43, r44, r45, r46, r47, + r48, r49, r50, r51, r52, r53, r54, r55, + r56, r57, r58, r59, r60, r61, r62, r63, + r64, r65, r66, r67, r68, r69, r70, r71, + r72, r73, r74, r75, r76, r77, r78, r79, + r80, r81, r82, r83, r84, r85, r86, r87, + r88, r89, r90, r91, r92, r93, r94, r95, + r96, r97, r98, r99, r100, r101, r102, r103, + r104, r105, r106, r107, r108, r109, r110, r111, + r112, r113, r114, r115, r116, r117, r118, r119, + r120, r121, r122, r123, r124, r125, r126, r127, + r0, r1, r2, r5, r12, r13, r22, rp]> // the last 16 are special (look down) + { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + GRClass::iterator + GRClass::allocation_order_begin(const MachineFunction &MF) const { + // hide the 8 out? 
registers appropriately: + return begin()+(8-(MF.getInfo<IA64FunctionInfo>()->outRegsUsed)); + } + + GRClass::iterator + GRClass::allocation_order_end(const MachineFunction &MF) const { + int numReservedRegs=8; // the 8 special registers r0,r1,r2,r5,r12,r13 etc + + // we also can't allocate registers for use as locals if they're + // already required as 'out' registers + numReservedRegs+=MF.getInfo<IA64FunctionInfo>()->outRegsUsed; + + return end()-numReservedRegs; // hide registers appropriately + } + }]; +} + + +// these are the scratch (+stacked) FP registers + +def FP : RegisterClass<"IA64", [f64], 64, + [F6, F7, + F8, F9, F10, F11, F12, F13, F14, F15, + F32, F33, F34, F35, F36, F37, F38, F39, + F40, F41, F42, F43, F44, F45, F46, F47, + F48, F49, F50, F51, F52, F53, F54, F55, + F56, F57, F58, F59, F60, F61, F62, F63, + F64, F65, F66, F67, F68, F69, F70, F71, + F72, F73, F74, F75, F76, F77, F78, F79, + F80, F81, F82, F83, F84, F85, F86, F87, + F88, F89, F90, F91, F92, F93, F94, F95, + F96, F97, F98, F99, F100, F101, F102, F103, + F104, F105, F106, F107, F108, F109, F110, F111, + F112, F113, F114, F115, F116, F117, F118, F119, + F120, F121, F122, F123, F124, F125, F126, F127, + F0, F1]> // these last two are hidden + { +// the 128s here are to make stf.spill/ldf.fill happy, +// when storing full (82-bit) FP regs to stack slots +// we need to 16-byte align + let Size=128; + let Alignment=128; + + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + FPClass::iterator + FPClass::allocation_order_begin(const MachineFunction &MF) const { + return begin(); // we don't hide any FP regs from the start + } + + FPClass::iterator + FPClass::allocation_order_end(const MachineFunction &MF) const { + return end()-2; // we hide regs F0, F1 from the end + } + }]; +} + +// these are the predicate registers, p0 (1/TRUE) is not here +def PR : 
RegisterClass<"IA64", [i1], 64, + +// for now, let's be wimps and only have the scratch predicate regs + [p6, p7, p8, p9, p10, p11, p12, p13, p14, p15]> { + let Size = 64; + } + +/* + [p1, p2, p3, p4, p5, p6, p7, + p8, p9, p10, p11, p12, p13, p14, p15, + p16, p17, p18, p19, p20, p21, p22, p23, + p24, p25, p26, p27, p28, p29, p30, p31, + p32, p33, p34, p35, p36, p37, p38, p39, + p40, p41, p42, p43, p44, p45, p46, p47, + p48, p49, p50, p51, p52, p53, p54, p55, + p56, p57, p58, p59, p60, p61, p62, p63]>; + */ diff --git a/lib/Target/IA64/IA64TargetAsmInfo.cpp b/lib/Target/IA64/IA64TargetAsmInfo.cpp new file mode 100644 index 0000000..1a7e2b2 --- /dev/null +++ b/lib/Target/IA64/IA64TargetAsmInfo.cpp @@ -0,0 +1,34 @@ +//===-- IA64TargetAsmInfo.cpp - IA64 asm properties -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the IA64TargetAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "IA64TargetAsmInfo.h" + +using namespace llvm; + +IA64TargetAsmInfo::IA64TargetAsmInfo(const IA64TargetMachine &TM) { + CommentString = "//"; + Data8bitsDirective = "\tdata1\t"; // FIXME: check that we are + Data16bitsDirective = "\tdata2.ua\t"; // disabling auto-alignment + Data32bitsDirective = "\tdata4.ua\t"; // properly + Data64bitsDirective = "\tdata8.ua\t"; + ZeroDirective = "\t.skip\t"; + AsciiDirective = "\tstring\t"; + + GlobalVarAddrPrefix=""; + GlobalVarAddrSuffix=""; + FunctionAddrPrefix="@fptr("; + FunctionAddrSuffix=")"; + + // FIXME: would be nice to have rodata (no 'w') when appropriate? 
+ ConstantPoolSection = "\n\t.section .data, \"aw\", \"progbits\"\n"; +} diff --git a/lib/Target/IA64/IA64TargetAsmInfo.h b/lib/Target/IA64/IA64TargetAsmInfo.h new file mode 100644 index 0000000..681253c --- /dev/null +++ b/lib/Target/IA64/IA64TargetAsmInfo.h @@ -0,0 +1,31 @@ +//=====-- IA64TargetAsmInfo.h - IA64 asm properties -----------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the IA64TargetAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef IA64TARGETASMINFO_H +#define IA64TARGETASMINFO_H + +#include "llvm/Target/TargetAsmInfo.h" + +namespace llvm { + + // Forward declaration. + class IA64TargetMachine; + + struct IA64TargetAsmInfo : public TargetAsmInfo { + IA64TargetAsmInfo(const IA64TargetMachine &TM); + }; + + +} // namespace llvm + +#endif diff --git a/lib/Target/IA64/IA64TargetMachine.cpp b/lib/Target/IA64/IA64TargetMachine.cpp new file mode 100644 index 0000000..51beaa1 --- /dev/null +++ b/lib/Target/IA64/IA64TargetMachine.cpp @@ -0,0 +1,91 @@ +//===-- IA64TargetMachine.cpp - Define TargetMachine for IA64 -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the IA64 specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#include "IA64TargetAsmInfo.h" +#include "IA64TargetMachine.h" +#include "IA64.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/Target/TargetMachineRegistry.h" +using namespace llvm; + +/// IA64TargetMachineModule - Note that this is used on hosts that cannot link +/// in a library unless there are references into the library. In particular, +/// it seems that it is not possible to get things to work on Win32 without +/// this. Though it is unused, do not remove it. +extern "C" int IA64TargetMachineModule; +int IA64TargetMachineModule = 0; + +namespace { + RegisterTarget<IA64TargetMachine> X("ia64", " IA-64 (Itanium)"); +} + +const TargetAsmInfo *IA64TargetMachine::createTargetAsmInfo() const { + return new IA64TargetAsmInfo(*this); +} + +unsigned IA64TargetMachine::getModuleMatchQuality(const Module &M) { + // we match [iI][aA]*64 + bool seenIA64=false; + std::string TT = M.getTargetTriple(); + + if (TT.size() >= 4) { + if( (TT[0]=='i' || TT[0]=='I') && + (TT[1]=='a' || TT[1]=='A') ) { + for(unsigned int i=2; i<(TT.size()-1); i++) + if(TT[i]=='6' && TT[i+1]=='4') + seenIA64=true; + } + + if (seenIA64) + return 20; // strong match + } + // If the target triple is something non-ia64, we don't match. + if (!TT.empty()) return 0; + +#if defined(__ia64__) || defined(__IA64__) + return 5; +#else + return 0; +#endif +} + +/// IA64TargetMachine ctor - Create an LP64 architecture model +/// +IA64TargetMachine::IA64TargetMachine(const Module &M, const std::string &FS) + : DataLayout("e"), + FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), + TLInfo(*this) { // FIXME? 
check this stuff +} + + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +bool IA64TargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) { + PM.add(createIA64DAGToDAGInstructionSelector(*this)); + return false; +} + +bool IA64TargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) { + // Make sure everything is bundled happily + PM.add(createIA64BundlingPass(*this)); + return true; +} +bool IA64TargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out) { + PM.add(createIA64CodePrinterPass(Out, *this)); + return false; +} + diff --git a/lib/Target/IA64/IA64TargetMachine.h b/lib/Target/IA64/IA64TargetMachine.h new file mode 100644 index 0000000..538a330 --- /dev/null +++ b/lib/Target/IA64/IA64TargetMachine.h @@ -0,0 +1,60 @@ +//===-- IA64TargetMachine.h - Define TargetMachine for IA64 ---*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Duraid Madina and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the IA64 specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_IA64TARGETMACHINE_H +#define LLVM_TARGET_IA64TARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "IA64InstrInfo.h" +#include "IA64ISelLowering.h" + +namespace llvm { + +class IA64TargetMachine : public LLVMTargetMachine { + const TargetData DataLayout; // Calculates type size & alignment + IA64InstrInfo InstrInfo; + TargetFrameInfo FrameInfo; + //IA64JITInfo JITInfo; + IA64TargetLowering TLInfo; + +protected: + virtual const TargetAsmInfo *createTargetAsmInfo() const; + +public: + IA64TargetMachine(const Module &M, const std::string &FS); + + virtual const IA64InstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; } + virtual IA64TargetLowering *getTargetLowering() const { + return const_cast<IA64TargetLowering*>(&TLInfo); + } + virtual const MRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + virtual const TargetData *getTargetData() const { return &DataLayout; } + + static unsigned getModuleMatchQuality(const Module &M); + + // Pass Pipeline Configuration + virtual bool addInstSelector(FunctionPassManager &PM, bool Fast); + virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast); + virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out); +}; +} // End llvm namespace + +#endif + + diff --git a/lib/Target/IA64/Makefile b/lib/Target/IA64/Makefile new file mode 100644 index 0000000..f519cf9 --- /dev/null +++ b/lib/Target/IA64/Makefile @@ -0,0 +1,18 @@ +##===- lib/Target/IA64/Makefile -----------------------------*- Makefile -*-===## +# The LLVM Compiler Infrastructure +# +# This file was developed by Duraid Madina and is distributed under the +# University of Illinois Open Source License. See LICENSE.TXT for details. 
+# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMIA64 +TARGET = IA64 +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = IA64GenRegisterInfo.h.inc IA64GenRegisterNames.inc \ + IA64GenRegisterInfo.inc IA64GenInstrNames.inc \ + IA64GenInstrInfo.inc IA64GenAsmWriter.inc \ + IA64GenDAGISel.inc + +include $(LEVEL)/Makefile.common + diff --git a/lib/Target/IA64/README b/lib/Target/IA64/README new file mode 100644 index 0000000..852d512 --- /dev/null +++ b/lib/Target/IA64/README @@ -0,0 +1,106 @@ +*** README for the LLVM IA64 Backend "Version 0.01" - March 18, 2005 +*** Quote for this version: + + "Kaori and Hitomi are naughty!!" + + +Congratulations, you have found: + +**************************************************************** +* @@@ @@@ @@@ @@@ @@@@@@@@@@ * +* @@@ @@@ @@@ @@@ @@@@@@@@@@@ * +* @@! @@! @@! @@@ @@! @@! @@! * +* !@! !@! !@! @!@ !@! !@! !@! * +* @!! @!! @!@ !@! @!! !!@ @!@ * +* !!! !!! !@! !!! !@! ! !@! * +* !!: !!: :!: !!: !!: !!: * +* :!: :!: ::!!:! :!: :!: * +* :: :::: :: :::: :::: ::: :: * +* : :: : : : :: : : : : : * +* * +* * +* @@@@@@ @@@ @@@ @@@ @@@@@@ @@@@@@ @@@ * +* @@@@@@@@ @@@@ @@@ @@@ @@@@@@@@ @@@@@@@ @@@@ * +* @@! @@@ @@!@!@@@ @@! @@! @@@ !@@ @@!@! * +* !@! @!@ !@!!@!@! !@! !@! @!@ !@! !@!!@! * +* @!@ !@! @!@ !!@! !!@ @!@!@!@! !!@@!@! @!! @!! * +* !@! !!! !@! !!! !!! !!!@!!!! @!!@!!!! !!! !@! * +* !!: !!! !!: !!! !!: !!: !!! !:! !:! :!!:!:!!: * +* :!: !:! :!: !:! :!: :!: !:! :!: !:! !:::!!::: * +* ::::: :: :: :: :: :: ::: :::: ::: ::: * +* : : : :: : : : : : :: : : ::: * +* * +**************************************************************** +* Bow down, bow down, before the power of IA64! Or be crushed, * +* be crushed, by its jolly registers of doom!! 
* +**************************************************************** + +DEVELOPMENT PLAN: + + _ you are 2005 maybe 2005 2006 2006 and + / here | | | beyond + v v v v | + v +CLEAN UP ADD INSTRUCTION ADD PLAY WITH +INSTRUCTION --> SCHEDULING AND --> JIT --> DYNAMIC --> FUTURE WORK +SELECTION BUNDLING SUPPORT REOPTIMIZATION + +DISCLAIMER AND PROMISE: + +As of the time of this release, you are probably better off using Intel C/C++ +or GCC. The performance of the code emitted right now is, in a word, +terrible. Check back in a few months - the story will be different then, +I guarantee it. + +TODO: + + - stop passing FP args in both FP *and* integer regs when not required + - allocate low (nonstacked) registers more aggressively + - clean up and thoroughly test the isel patterns. + - fix stacked register allocation order: (for readability) we don't want + the out? registers being the first ones used + - fix up floating point + (nb http://gcc.gnu.org/wiki?pagename=ia64%20floating%20point ) + - bundling! + (we will avoid the mess that is: + http://gcc.gnu.org/ml/gcc/2003-12/msg00832.html ) + - instruction scheduling (hmmmm! ;) + - write truly inspirational documentation + - if-conversion (predicate database/knowledge? etc etc) + - counted loop support + - make integer + FP mul/div more clever (we have fixed pseudocode atm) + - track and use comparison complements + +INFO: + + - we are strictly LP64 here, no support for ILP32 on HP-UX. Linux users + don't need to worry about this. + - i have instruction scheduling/bundling pseudocode, that really works + (has been tested, albeit at the perl-script level). + so, before you go write your own, send me an email! + +KNOWN DEFECTS AT THE CURRENT TIME: + + - C++ vtables contain naked function pointers, not function descriptors, + which is bad. 
see http://llvm.cs.uiuc.edu/bugs/show_bug.cgi?id=406 + - varargs are broken + - alloca doesn't work (indeed, stack frame layout is bogus) + - no support for big-endian environments + - (not really the backend, but...) the CFE has some issues on IA64. + these will probably be fixed soon. + +ACKNOWLEDGEMENTS: + + - Chris Lattner (x100) + - Other LLVM developers ("hey, that looks familiar") + +CONTACT: + + - You can email me at duraid@octopus.com.au. If you find a small bug, + just email me. If you find a big bug, please file a bug report + in bugzilla! http://llvm.cs.uiuc.edu is your one stop shop for all + things LLVM. + + + + diff --git a/lib/Target/MRegisterInfo.cpp b/lib/Target/MRegisterInfo.cpp new file mode 100644 index 0000000..3af611d --- /dev/null +++ b/lib/Target/MRegisterInfo.cpp @@ -0,0 +1,81 @@ +//===- MRegisterInfo.cpp - Target Register Information Implementation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the MRegisterInfo interface. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/MRegisterInfo.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/ADT/BitVector.h" + +using namespace llvm; + +MRegisterInfo::MRegisterInfo(const TargetRegisterDesc *D, unsigned NR, + regclass_iterator RCB, regclass_iterator RCE, + int CFSO, int CFDO) + : Desc(D), NumRegs(NR), RegClassBegin(RCB), RegClassEnd(RCE) { + assert(NumRegs < FirstVirtualRegister && + "Target has too many physical registers!"); + + CallFrameSetupOpcode = CFSO; + CallFrameDestroyOpcode = CFDO; +} + +MRegisterInfo::~MRegisterInfo() {} + +/// getAllocatableSetForRC - Toggle the bits that represent allocatable +/// registers for the specific register class. +static void getAllocatableSetForRC(MachineFunction &MF, + const TargetRegisterClass *RC, BitVector &R){ + for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF), + E = RC->allocation_order_end(MF); I != E; ++I) + R.set(*I); +} + +BitVector MRegisterInfo::getAllocatableSet(MachineFunction &MF, + const TargetRegisterClass *RC) const { + BitVector Allocatable(NumRegs); + if (RC) { + getAllocatableSetForRC(MF, RC, Allocatable); + return Allocatable; + } + + for (MRegisterInfo::regclass_iterator I = regclass_begin(), + E = regclass_end(); I != E; ++I) + getAllocatableSetForRC(MF, *I, Allocatable); + return Allocatable; +} + +/// getLocation - This method should return the actual location of a frame +/// variable given the frame index. The location is returned in ML. +/// Subclasses should override this method for special handling of frame +/// variables and then call MRegisterInfo::getLocation for the default action. 
+void MRegisterInfo::getLocation(MachineFunction &MF, unsigned Index, + MachineLocation &ML) const { + const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + ML.set(getFrameRegister(MF), + MFI->getObjectOffset(Index) + + MFI->getStackSize() - + TFI.getOffsetOfLocalArea() + + MFI->getOffsetAdjustment()); +} + +/// getInitialFrameState - Returns a list of machine moves that are assumed +/// on entry to a function. +void +MRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const { + // Default is to do nothing. +} + diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp new file mode 100644 index 0000000..5859adf --- /dev/null +++ b/lib/Target/MSIL/MSILWriter.cpp @@ -0,0 +1,1657 @@ +//===-- MSILWriter.cpp - Library for converting LLVM code to MSIL ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Roman Samoilov and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This library converts LLVM code to MSIL code. 
+// +//===----------------------------------------------------------------------===// + +#include "MSILWriter.h" +#include "llvm/CallingConv.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Intrinsics.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/ParameterAttributes.h" +#include "llvm/TypeSymbolTable.h" +#include "llvm/Analysis/ConstantsScanner.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/StringExtras.h" + +namespace { + // TargetMachine for the MSIL + struct VISIBILITY_HIDDEN MSILTarget : public TargetMachine { + const TargetData DataLayout; // Calculates type size & alignment + + MSILTarget(const Module &M, const std::string &FS) + : DataLayout(&M) {} + + virtual bool WantsWholeFile() const { return true; } + virtual bool addPassesToEmitWholeFile(PassManager &PM, std::ostream &Out, + CodeGenFileType FileType, bool Fast); + + // This class always works, but shouldn't be the default in most cases. + static unsigned getModuleMatchQuality(const Module &M) { return 1; } + + virtual const TargetData *getTargetData() const { return &DataLayout; } + }; +} + + +RegisterTarget<MSILTarget> X("msil", " MSIL backend"); + +bool MSILModule::runOnModule(Module &M) { + ModulePtr = &M; + TD = &getAnalysis<TargetData>(); + bool Changed = false; + // Find named types. + TypeSymbolTable& Table = M.getTypeSymbolTable(); + std::set<const Type *> Types = getAnalysis<FindUsedTypes>().getTypes(); + for (TypeSymbolTable::iterator I = Table.begin(), E = Table.end(); I!=E; ) { + if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second)) + Table.remove(I++); + else { + std::set<const Type *>::iterator T = Types.find(I->second); + if (T==Types.end()) + Table.remove(I++); + else { + Types.erase(T); + ++I; + } + } + } + // Find unnamed types. 
+ unsigned RenameCounter = 0; + for (std::set<const Type *>::const_iterator I = Types.begin(), + E = Types.end(); I!=E; ++I) + if (const StructType *STy = dyn_cast<StructType>(*I)) { + while (ModulePtr->addTypeName("unnamed$"+utostr(RenameCounter), STy)) + ++RenameCounter; + Changed = true; + } + // Pointer for FunctionPass. + UsedTypes = &getAnalysis<FindUsedTypes>().getTypes(); + return Changed; +} + +char MSILModule::ID = 0; +char MSILWriter::ID = 0; + +bool MSILWriter::runOnFunction(Function &F) { + if (F.isDeclaration()) return false; + LInfo = &getAnalysis<LoopInfo>(); + printFunction(F); + return false; +} + + +bool MSILWriter::doInitialization(Module &M) { + ModulePtr = &M; + Mang = new Mangler(M); + Out << ".assembly extern mscorlib {}\n"; + Out << ".assembly MSIL {}\n\n"; + Out << "// External\n"; + printExternals(); + Out << "// Declarations\n"; + printDeclarations(M.getTypeSymbolTable()); + Out << "// Definitions\n"; + printGlobalVariables(); + Out << "// Startup code\n"; + printModuleStartup(); + return false; +} + + +bool MSILWriter::doFinalization(Module &M) { + delete Mang; + return false; +} + + +void MSILWriter::printModuleStartup() { + Out << + ".method static public int32 $MSIL_Startup() {\n" + "\t.entrypoint\n" + "\t.locals (native int i)\n" + "\t.locals (native int argc)\n" + "\t.locals (native int ptr)\n" + "\t.locals (void* argv)\n" + "\t.locals (string[] args)\n" + "\tcall\tstring[] [mscorlib]System.Environment::GetCommandLineArgs()\n" + "\tdup\n" + "\tstloc\targs\n" + "\tldlen\n" + "\tconv.i4\n" + "\tdup\n" + "\tstloc\targc\n"; + printPtrLoad(TD->getPointerSize()); + Out << + "\tmul\n" + "\tlocalloc\n" + "\tstloc\targv\n" + "\tldc.i4.0\n" + "\tstloc\ti\n" + "L_01:\n" + "\tldloc\ti\n" + "\tldloc\targc\n" + "\tceq\n" + "\tbrtrue\tL_02\n" + "\tldloc\targs\n" + "\tldloc\ti\n" + "\tldelem.ref\n" + "\tcall\tnative int [mscorlib]System.Runtime.InteropServices.Marshal::" + "StringToHGlobalAnsi(string)\n" + "\tstloc\tptr\n" + "\tldloc\targv\n" + 
"\tldloc\ti\n"; + printPtrLoad(TD->getPointerSize()); + Out << + "\tmul\n" + "\tadd\n" + "\tldloc\tptr\n" + "\tstind.i\n" + "\tldloc\ti\n" + "\tldc.i4.1\n" + "\tadd\n" + "\tstloc\ti\n" + "\tbr\tL_01\n" + "L_02:\n" + "\tcall void $MSIL_Init()\n"; + + // Call user 'main' function. + const Function* F = ModulePtr->getFunction("main"); + if (!F || F->isDeclaration()) { + Out << "\tldc.i4.0\n\tret\n}\n"; + return; + } + bool BadSig = true;; + std::string Args(""); + Function::const_arg_iterator Arg1,Arg2; + + switch (F->arg_size()) { + case 0: + BadSig = false; + break; + case 1: + Arg1 = F->arg_begin(); + if (Arg1->getType()->isInteger()) { + Out << "\tldloc\targc\n"; + Args = getTypeName(Arg1->getType()); + BadSig = false; + } + break; + case 2: + Arg1 = Arg2 = F->arg_begin(); ++Arg2; + if (Arg1->getType()->isInteger() && + Arg2->getType()->getTypeID() == Type::PointerTyID) { + Out << "\tldloc\targc\n\tldloc\targv\n"; + Args = getTypeName(Arg1->getType())+","+getTypeName(Arg2->getType()); + BadSig = false; + } + break; + default: + BadSig = true; + } + + bool RetVoid = (F->getReturnType()->getTypeID() == Type::VoidTyID); + if (BadSig || !F->getReturnType()->isInteger() && !RetVoid) { + Out << "\tldc.i4.0\n"; + } else { + Out << "\tcall\t" << getTypeName(F->getReturnType()) << + getConvModopt(F->getCallingConv()) << "main(" << Args << ")\n"; + if (RetVoid) + Out << "\tldc.i4.0\n"; + else + Out << "\tconv.i4\n"; + } + Out << "\tret\n}\n"; +} + +bool MSILWriter::isZeroValue(const Value* V) { + if (const Constant *C = dyn_cast<Constant>(V)) + return C->isNullValue(); + return false; +} + + +std::string MSILWriter::getValueName(const Value* V) { + // Name into the quotes allow control and space characters. + return "'"+Mang->getValueName(V)+"'"; +} + + +std::string MSILWriter::getLabelName(const std::string& Name) { + if (Name.find('.')!=std::string::npos) { + std::string Tmp(Name); + // Replace unaccepable characters in the label name. 
+ for (std::string::iterator I = Tmp.begin(), E = Tmp.end(); I!=E; ++I) + if (*I=='.') *I = '@'; + return Tmp; + } + return Name; +} + + +std::string MSILWriter::getLabelName(const Value* V) { + return getLabelName(Mang->getValueName(V)); +} + + +std::string MSILWriter::getConvModopt(unsigned CallingConvID) { + switch (CallingConvID) { + case CallingConv::C: + case CallingConv::Cold: + case CallingConv::Fast: + return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) "; + case CallingConv::X86_FastCall: + return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvFastcall) "; + case CallingConv::X86_StdCall: + return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvStdcall) "; + default: + cerr << "CallingConvID = " << CallingConvID << '\n'; + assert(0 && "Unsupported calling convention"); + } +} + + +std::string MSILWriter::getArrayTypeName(Type::TypeID TyID, const Type* Ty) { + std::string Tmp = ""; + const Type* ElemTy = Ty; + assert(Ty->getTypeID()==TyID && "Invalid type passed"); + // Walk trought array element types. + for (;;) { + // Multidimensional array. + if (ElemTy->getTypeID()==TyID) { + if (const ArrayType* ATy = dyn_cast<ArrayType>(ElemTy)) + Tmp += utostr(ATy->getNumElements()); + else if (const VectorType* VTy = dyn_cast<VectorType>(ElemTy)) + Tmp += utostr(VTy->getNumElements()); + ElemTy = cast<SequentialType>(ElemTy)->getElementType(); + } + // Base element type found. 
+ if (ElemTy->getTypeID()!=TyID) break; + Tmp += ","; + } + return getTypeName(ElemTy, false, true)+"["+Tmp+"]"; +} + + +std::string MSILWriter::getPrimitiveTypeName(const Type* Ty, bool isSigned) { + unsigned NumBits = 0; + switch (Ty->getTypeID()) { + case Type::VoidTyID: + return "void "; + case Type::IntegerTyID: + NumBits = getBitWidth(Ty); + if(NumBits==1) + return "bool "; + if (!isSigned) + return "unsigned int"+utostr(NumBits)+" "; + return "int"+utostr(NumBits)+" "; + case Type::FloatTyID: + return "float32 "; + case Type::DoubleTyID: + return "float64 "; + default: + cerr << "Type = " << *Ty << '\n'; + assert(0 && "Invalid primitive type"); + } +} + + +std::string MSILWriter::getTypeName(const Type* Ty, bool isSigned, + bool isNested) { + if (Ty->isPrimitiveType() || Ty->isInteger()) + return getPrimitiveTypeName(Ty,isSigned); + // FIXME: "OpaqueType" support + switch (Ty->getTypeID()) { + case Type::PointerTyID: + return "void* "; + case Type::StructTyID: + if (isNested) + return ModulePtr->getTypeName(Ty); + return "valuetype '"+ModulePtr->getTypeName(Ty)+"' "; + case Type::ArrayTyID: + if (isNested) + return getArrayTypeName(Ty->getTypeID(),Ty); + return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' "; + case Type::VectorTyID: + if (isNested) + return getArrayTypeName(Ty->getTypeID(),Ty); + return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' "; + default: + cerr << "Type = " << *Ty << '\n'; + assert(0 && "Invalid type in getTypeName()"); + } +} + + +MSILWriter::ValueType MSILWriter::getValueLocation(const Value* V) { + // Function argument + if (isa<Argument>(V)) + return ArgumentVT; + // Function + else if (const Function* F = dyn_cast<Function>(V)) + return F->hasInternalLinkage() ? InternalVT : GlobalVT; + // Variable + else if (const GlobalVariable* G = dyn_cast<GlobalVariable>(V)) + return G->hasInternalLinkage() ? InternalVT : GlobalVT; + // Constant + else if (isa<Constant>(V)) + return isa<ConstantExpr>(V) ? 
ConstExprVT : ConstVT; + // Local variable + return LocalVT; +} + + +std::string MSILWriter::getTypePostfix(const Type* Ty, bool Expand, + bool isSigned) { + unsigned NumBits = 0; + switch (Ty->getTypeID()) { + // Integer constant, expanding for stack operations. + case Type::IntegerTyID: + NumBits = getBitWidth(Ty); + // Expand integer value to "int32" or "int64". + if (Expand) return (NumBits<=32 ? "i4" : "i8"); + if (NumBits==1) return "i1"; + return (isSigned ? "i" : "u")+utostr(NumBits/8); + // Float constant. + case Type::FloatTyID: + return "r4"; + case Type::DoubleTyID: + return "r8"; + case Type::PointerTyID: + return "i"+utostr(TD->getTypeSize(Ty)); + default: + cerr << "TypeID = " << Ty->getTypeID() << '\n'; + assert(0 && "Invalid type in TypeToPostfix()"); + } +} + + +void MSILWriter::printConvToPtr() { + switch (ModulePtr->getPointerSize()) { + case Module::Pointer32: + printSimpleInstruction("conv.u4"); + break; + case Module::Pointer64: + printSimpleInstruction("conv.u8"); + break; + default: + assert(0 && "Module use not supporting pointer size"); + } +} + + +void MSILWriter::printPtrLoad(uint64_t N) { + switch (ModulePtr->getPointerSize()) { + case Module::Pointer32: + printSimpleInstruction("ldc.i4",utostr(N).c_str()); + // FIXME: Need overflow test? + if (!isUInt32(N)) { + cerr << "Value = " << utostr(N) << '\n'; + assert(0 && "32-bit pointer overflowed"); + } + break; + case Module::Pointer64: + printSimpleInstruction("ldc.i8",utostr(N).c_str()); + break; + default: + assert(0 && "Module use not supporting pointer size"); + } +} + + +void MSILWriter::printValuePtrLoad(const Value* V) { + printValueLoad(V); + printConvToPtr(); +} + + +void MSILWriter::printConstLoad(const Constant* C) { + if (const ConstantInt* CInt = dyn_cast<ConstantInt>(C)) { + // Integer constant + Out << "\tldc." 
<< getTypePostfix(C->getType(),true) << '\t'; + if (CInt->isMinValue(true)) + Out << CInt->getSExtValue(); + else + Out << CInt->getZExtValue(); + } else if (const ConstantFP* FP = dyn_cast<ConstantFP>(C)) { + // Float constant + uint64_t X; + unsigned Size; + if (FP->getType()->getTypeID()==Type::FloatTyID) { + X = FloatToBits(FP->getValue()); + Size = 4; + } else { + X = DoubleToBits(FP->getValue()); + Size = 8; + } + Out << "\tldc.r" << Size << "\t( " << utohexstr(X) << ')'; + } else if (isa<UndefValue>(C)) { + // Undefined constant value = NULL. + printPtrLoad(0); + } else { + cerr << "Constant = " << *C << '\n'; + assert(0 && "Invalid constant value"); + } + Out << '\n'; +} + + +void MSILWriter::printValueLoad(const Value* V) { + MSILWriter::ValueType Location = getValueLocation(V); + switch (Location) { + // Global variable or function address. + case GlobalVT: + case InternalVT: + if (const Function* F = dyn_cast<Function>(V)) { + std::string Name = getConvModopt(F->getCallingConv())+getValueName(F); + printSimpleInstruction("ldftn", + getCallSignature(F->getFunctionType(),NULL,Name).c_str()); + } else { + std::string Tmp; + const Type* ElemTy = cast<PointerType>(V->getType())->getElementType(); + if (Location==GlobalVT && cast<GlobalVariable>(V)->hasDLLImportLinkage()) { + Tmp = "void* "+getValueName(V); + printSimpleInstruction("ldsfld",Tmp.c_str()); + } else { + Tmp = getTypeName(ElemTy)+getValueName(V); + printSimpleInstruction("ldsflda",Tmp.c_str()); + } + } + break; + // Function argument. + case ArgumentVT: + printSimpleInstruction("ldarg",getValueName(V).c_str()); + break; + // Local function variable. + case LocalVT: + printSimpleInstruction("ldloc",getValueName(V).c_str()); + break; + // Constant value. + case ConstVT: + if (isa<ConstantPointerNull>(V)) + printPtrLoad(0); + else + printConstLoad(cast<Constant>(V)); + break; + // Constant expression. 
+ case ConstExprVT: + printConstantExpr(cast<ConstantExpr>(V)); + break; + default: + cerr << "Value = " << *V << '\n'; + assert(0 && "Invalid value location"); + } +} + + +void MSILWriter::printValueSave(const Value* V) { + switch (getValueLocation(V)) { + case ArgumentVT: + printSimpleInstruction("starg",getValueName(V).c_str()); + break; + case LocalVT: + printSimpleInstruction("stloc",getValueName(V).c_str()); + break; + default: + cerr << "Value = " << *V << '\n'; + assert(0 && "Invalid value location"); + } +} + + +void MSILWriter::printBinaryInstruction(const char* Name, const Value* Left, + const Value* Right) { + printValueLoad(Left); + printValueLoad(Right); + Out << '\t' << Name << '\n'; +} + + +void MSILWriter::printSimpleInstruction(const char* Inst, const char* Operand) { + if(Operand) + Out << '\t' << Inst << '\t' << Operand << '\n'; + else + Out << '\t' << Inst << '\n'; +} + + +void MSILWriter::printPHICopy(const BasicBlock* Src, const BasicBlock* Dst) { + for (BasicBlock::const_iterator I = Dst->begin(), E = Dst->end(); + isa<PHINode>(I); ++I) { + const PHINode* Phi = cast<PHINode>(I); + const Value* Val = Phi->getIncomingValueForBlock(Src); + if (isa<UndefValue>(Val)) continue; + printValueLoad(Val); + printValueSave(Phi); + } +} + + +void MSILWriter::printBranchToBlock(const BasicBlock* CurrBB, + const BasicBlock* TrueBB, + const BasicBlock* FalseBB) { + if (TrueBB==FalseBB) { + // "TrueBB" and "FalseBB" destination equals + printPHICopy(CurrBB,TrueBB); + printSimpleInstruction("pop"); + printSimpleInstruction("br",getLabelName(TrueBB).c_str()); + } else if (FalseBB==NULL) { + // If "FalseBB" not used the jump have condition + printPHICopy(CurrBB,TrueBB); + printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str()); + } else if (TrueBB==NULL) { + // If "TrueBB" not used the jump is unconditional + printPHICopy(CurrBB,FalseBB); + printSimpleInstruction("br",getLabelName(FalseBB).c_str()); + } else { + // Copy PHI instructions for each block + 
std::string TmpLabel; + // Print PHI instructions for "TrueBB" + if (isa<PHINode>(TrueBB->begin())) { + TmpLabel = getLabelName(TrueBB)+"$phi_"+utostr(getUniqID()); + printSimpleInstruction("brtrue",TmpLabel.c_str()); + } else { + printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str()); + } + // Print PHI instructions for "FalseBB" + if (isa<PHINode>(FalseBB->begin())) { + printPHICopy(CurrBB,FalseBB); + printSimpleInstruction("br",getLabelName(FalseBB).c_str()); + } else { + printSimpleInstruction("br",getLabelName(FalseBB).c_str()); + } + if (isa<PHINode>(TrueBB->begin())) { + // Handle "TrueBB" PHI Copy + Out << TmpLabel << ":\n"; + printPHICopy(CurrBB,TrueBB); + printSimpleInstruction("br",getLabelName(TrueBB).c_str()); + } + } +} + + +void MSILWriter::printBranchInstruction(const BranchInst* Inst) { + if (Inst->isUnconditional()) { + printBranchToBlock(Inst->getParent(),NULL,Inst->getSuccessor(0)); + } else { + printValueLoad(Inst->getCondition()); + printBranchToBlock(Inst->getParent(),Inst->getSuccessor(0), + Inst->getSuccessor(1)); + } +} + + +void MSILWriter::printSelectInstruction(const Value* Cond, const Value* VTrue, + const Value* VFalse) { + std::string TmpLabel = std::string("select$true_")+utostr(getUniqID()); + printValueLoad(VTrue); + printValueLoad(Cond); + printSimpleInstruction("brtrue",TmpLabel.c_str()); + printSimpleInstruction("pop"); + printValueLoad(VFalse); + Out << TmpLabel << ":\n"; +} + + +void MSILWriter::printIndirectLoad(const Value* V) { + const Type* Ty = V->getType(); + printValueLoad(V); + if (const PointerType* P = dyn_cast<PointerType>(Ty)) + Ty = P->getElementType(); + std::string Tmp = "ldind."+getTypePostfix(Ty, false); + printSimpleInstruction(Tmp.c_str()); +} + + +void MSILWriter::printIndirectSave(const Value* Ptr, const Value* Val) { + printValueLoad(Ptr); + printValueLoad(Val); + printIndirectSave(Val->getType()); +} + + +void MSILWriter::printIndirectSave(const Type* Ty) { + // Instruction need signed postfix 
for any type. + std::string postfix = getTypePostfix(Ty, false); + if (*postfix.begin()=='u') *postfix.begin() = 'i'; + postfix = "stind."+postfix; + printSimpleInstruction(postfix.c_str()); +} + + +void MSILWriter::printCastInstruction(unsigned int Op, const Value* V, + const Type* Ty) { + std::string Tmp(""); + printValueLoad(V); + switch (Op) { + // Signed + case Instruction::SExt: + case Instruction::SIToFP: + case Instruction::FPToSI: + Tmp = "conv."+getTypePostfix(Ty,false,true); + printSimpleInstruction(Tmp.c_str()); + break; + // Unsigned + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::FPToUI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + Tmp = "conv."+getTypePostfix(Ty,false); + printSimpleInstruction(Tmp.c_str()); + break; + // Do nothing + case Instruction::BitCast: + // FIXME: meaning that ld*/st* instruction do not change data format. + break; + default: + cerr << "Opcode = " << Op << '\n'; + assert(0 && "Invalid conversion instruction"); + } +} + + +void MSILWriter::printGepInstruction(const Value* V, gep_type_iterator I, + gep_type_iterator E) { + unsigned Size; + // Load address + printValuePtrLoad(V); + // Calculate element offset. + for (; I!=E; ++I){ + Size = 0; + const Value* IndexValue = I.getOperand(); + if (const StructType* StrucTy = dyn_cast<StructType>(*I)) { + uint64_t FieldIndex = cast<ConstantInt>(IndexValue)->getZExtValue(); + // Offset is the sum of all previous structure fields. + for (uint64_t F = 0; F<FieldIndex; ++F) + Size += TD->getTypeSize(StrucTy->getContainedType((unsigned)F)); + printPtrLoad(Size); + printSimpleInstruction("add"); + continue; + } else if (const SequentialType* SeqTy = dyn_cast<SequentialType>(*I)) { + Size = TD->getTypeSize(SeqTy->getElementType()); + } else { + Size = TD->getTypeSize(*I); + } + // Add offset of current element to stack top. 
+ if (!isZeroValue(IndexValue)) { + // Constant optimization. + if (const ConstantInt* C = dyn_cast<ConstantInt>(IndexValue)) { + if (C->getValue().isNegative()) { + printPtrLoad(C->getValue().abs().getZExtValue()*Size); + printSimpleInstruction("sub"); + continue; + } else + printPtrLoad(C->getZExtValue()*Size); + } else { + printPtrLoad(Size); + printValuePtrLoad(IndexValue); + printSimpleInstruction("mul"); + } + printSimpleInstruction("add"); + } + } +} + + +std::string MSILWriter::getCallSignature(const FunctionType* Ty, + const Instruction* Inst, + std::string Name) { + std::string Tmp(""); + if (Ty->isVarArg()) Tmp += "vararg "; + // Name and return type. + Tmp += getTypeName(Ty->getReturnType())+Name+"("; + // Function argument type list. + unsigned NumParams = Ty->getNumParams(); + for (unsigned I = 0; I!=NumParams; ++I) { + if (I!=0) Tmp += ","; + Tmp += getTypeName(Ty->getParamType(I)); + } + // CLR needs to know the exact amount of parameters received by vararg + // function, because caller cleans the stack. + if (Ty->isVarArg() && Inst) { + // Origin to function arguments in "CallInst" or "InvokeInst". + unsigned Org = isa<InvokeInst>(Inst) ? 3 : 1; + // Print variable argument types. + unsigned NumOperands = Inst->getNumOperands()-Org; + if (NumParams<NumOperands) { + if (NumParams!=0) Tmp += ", "; + Tmp += "... , "; + for (unsigned J = NumParams; J!=NumOperands; ++J) { + if (J!=NumParams) Tmp += ", "; + Tmp += getTypeName(Inst->getOperand(J+Org)->getType()); + } + } + } + return Tmp+")"; +} + + +void MSILWriter::printFunctionCall(const Value* FnVal, + const Instruction* Inst) { + // Get function calling convention. 
+ std::string Name = ""; + if (const CallInst* Call = dyn_cast<CallInst>(Inst)) + Name = getConvModopt(Call->getCallingConv()); + else if (const InvokeInst* Invoke = dyn_cast<InvokeInst>(Inst)) + Name = getConvModopt(Invoke->getCallingConv()); + else { + cerr << "Instruction = " << Inst->getName() << '\n'; + assert(0 && "Need \"Invoke\" or \"Call\" instruction only"); + } + if (const Function* F = dyn_cast<Function>(FnVal)) { + // Direct call. + Name += getValueName(F); + printSimpleInstruction("call", + getCallSignature(F->getFunctionType(),Inst,Name).c_str()); + } else { + // Indirect function call. + const PointerType* PTy = cast<PointerType>(FnVal->getType()); + const FunctionType* FTy = cast<FunctionType>(PTy->getElementType()); + // Load function address. + printValueLoad(FnVal); + printSimpleInstruction("calli",getCallSignature(FTy,Inst,Name).c_str()); + } +} + + +void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) { + std::string Name; + switch (Inst->getIntrinsicID()) { + case Intrinsic::vastart: + Name = getValueName(Inst->getOperand(1)); + Name.insert(Name.length()-1,"$valist"); + // Obtain the argument handle. + printSimpleInstruction("ldloca",Name.c_str()); + printSimpleInstruction("arglist"); + printSimpleInstruction("call", + "instance void [mscorlib]System.ArgIterator::.ctor" + "(valuetype [mscorlib]System.RuntimeArgumentHandle)"); + // Save as pointer type "void*" + printValueLoad(Inst->getOperand(1)); + printSimpleInstruction("ldloca",Name.c_str()); + printIndirectSave(PointerType::get(IntegerType::get(8))); + break; + case Intrinsic::vaend: + // Close argument list handle. + printIndirectLoad(Inst->getOperand(1)); + printSimpleInstruction("call","instance void [mscorlib]System.ArgIterator::End()"); + break; + case Intrinsic::vacopy: + // Copy "ArgIterator" valuetype. 
+ printIndirectLoad(Inst->getOperand(1)); + printIndirectLoad(Inst->getOperand(2)); + printSimpleInstruction("cpobj","[mscorlib]System.ArgIterator"); + break; + default: + cerr << "Intrinsic ID = " << Inst->getIntrinsicID() << '\n'; + assert(0 && "Invalid intrinsic function"); + } +} + + +void MSILWriter::printCallInstruction(const Instruction* Inst) { + if (isa<IntrinsicInst>(Inst)) { + // Handle intrinsic function. + printIntrinsicCall(cast<IntrinsicInst>(Inst)); + } else { + // Load arguments to stack and call function. + for (int I = 1, E = Inst->getNumOperands(); I!=E; ++I) + printValueLoad(Inst->getOperand(I)); + printFunctionCall(Inst->getOperand(0),Inst); + } +} + + +void MSILWriter::printICmpInstruction(unsigned Predicate, const Value* Left, + const Value* Right) { + switch (Predicate) { + case ICmpInst::ICMP_EQ: + printBinaryInstruction("ceq",Left,Right); + break; + case ICmpInst::ICMP_NE: + // Emulate = not neg (Op1 eq Op2) + printBinaryInstruction("ceq",Left,Right); + printSimpleInstruction("neg"); + printSimpleInstruction("not"); + break; + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SLE: + // Emulate = (Op1 eq Op2) or (Op1 lt Op2) + printBinaryInstruction("ceq",Left,Right); + if (Predicate==ICmpInst::ICMP_ULE) + printBinaryInstruction("clt.un",Left,Right); + else + printBinaryInstruction("clt",Left,Right); + printSimpleInstruction("or"); + break; + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_SGE: + // Emulate = (Op1 eq Op2) or (Op1 gt Op2) + printBinaryInstruction("ceq",Left,Right); + if (Predicate==ICmpInst::ICMP_UGE) + printBinaryInstruction("cgt.un",Left,Right); + else + printBinaryInstruction("cgt",Left,Right); + printSimpleInstruction("or"); + break; + case ICmpInst::ICMP_ULT: + printBinaryInstruction("clt.un",Left,Right); + break; + case ICmpInst::ICMP_SLT: + printBinaryInstruction("clt",Left,Right); + break; + case ICmpInst::ICMP_UGT: + printBinaryInstruction("cgt.un",Left,Right); + case ICmpInst::ICMP_SGT: + 
printBinaryInstruction("cgt",Left,Right); + break; + default: + cerr << "Predicate = " << Predicate << '\n'; + assert(0 && "Invalid icmp predicate"); + } +} + + +void MSILWriter::printFCmpInstruction(unsigned Predicate, const Value* Left, + const Value* Right) { + // FIXME: Correct comparison + std::string NanFunc = "bool [mscorlib]System.Double::IsNaN(float64)"; + switch (Predicate) { + case FCmpInst::FCMP_UGT: + // X > Y || llvm_fcmp_uno(X, Y) + printBinaryInstruction("cgt",Left,Right); + printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_OGT: + // X > Y + printBinaryInstruction("cgt",Left,Right); + break; + case FCmpInst::FCMP_UGE: + // X >= Y || llvm_fcmp_uno(X, Y) + printBinaryInstruction("ceq",Left,Right); + printBinaryInstruction("cgt",Left,Right); + printSimpleInstruction("or"); + printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_OGE: + // X >= Y + printBinaryInstruction("ceq",Left,Right); + printBinaryInstruction("cgt",Left,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_ULT: + // X < Y || llvm_fcmp_uno(X, Y) + printBinaryInstruction("clt",Left,Right); + printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_OLT: + // X < Y + printBinaryInstruction("clt",Left,Right); + break; + case FCmpInst::FCMP_ULE: + // X <= Y || llvm_fcmp_uno(X, Y) + printBinaryInstruction("ceq",Left,Right); + printBinaryInstruction("clt",Left,Right); + printSimpleInstruction("or"); + printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_OLE: + // X <= Y + printBinaryInstruction("ceq",Left,Right); + printBinaryInstruction("clt",Left,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_UEQ: + // X == Y || llvm_fcmp_uno(X, Y) + printBinaryInstruction("ceq",Left,Right); + 
printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_OEQ: + // X == Y + printBinaryInstruction("ceq",Left,Right); + break; + case FCmpInst::FCMP_UNE: + // X != Y + printBinaryInstruction("ceq",Left,Right); + printSimpleInstruction("neg"); + printSimpleInstruction("not"); + break; + case FCmpInst::FCMP_ONE: + // X != Y && llvm_fcmp_ord(X, Y) + printBinaryInstruction("ceq",Left,Right); + printSimpleInstruction("not"); + break; + case FCmpInst::FCMP_ORD: + // return X == X && Y == Y + printBinaryInstruction("ceq",Left,Left); + printBinaryInstruction("ceq",Right,Right); + printSimpleInstruction("or"); + break; + case FCmpInst::FCMP_UNO: + // X != X || Y != Y + printBinaryInstruction("ceq",Left,Left); + printSimpleInstruction("not"); + printBinaryInstruction("ceq",Right,Right); + printSimpleInstruction("not"); + printSimpleInstruction("or"); + break; + default: + assert(0 && "Illegal FCmp predicate"); + } +} + + +void MSILWriter::printInvokeInstruction(const InvokeInst* Inst) { + std::string Label = "leave$normal_"+utostr(getUniqID()); + Out << ".try {\n"; + // Load arguments + for (int I = 3, E = Inst->getNumOperands(); I!=E; ++I) + printValueLoad(Inst->getOperand(I)); + // Print call instruction + printFunctionCall(Inst->getOperand(0),Inst); + // Save function result and leave "try" block + printValueSave(Inst); + printSimpleInstruction("leave",Label.c_str()); + Out << "}\n"; + Out << "catch [mscorlib]System.Exception {\n"; + // Redirect to unwind block + printSimpleInstruction("pop"); + printBranchToBlock(Inst->getParent(),NULL,Inst->getUnwindDest()); + Out << "}\n" << Label << ":\n"; + // Redirect to continue block + printBranchToBlock(Inst->getParent(),NULL,Inst->getNormalDest()); +} + + +void MSILWriter::printSwitchInstruction(const SwitchInst* Inst) { + // FIXME: Emulate with IL "switch" instruction + // Emulate = if () else if () else if () else ... 
+ for (unsigned int I = 1, E = Inst->getNumCases(); I!=E; ++I) { + printValueLoad(Inst->getCondition()); + printValueLoad(Inst->getCaseValue(I)); + printSimpleInstruction("ceq"); + // Condition jump to successor block + printBranchToBlock(Inst->getParent(),Inst->getSuccessor(I),NULL); + } + // Jump to default block + printBranchToBlock(Inst->getParent(),NULL,Inst->getDefaultDest()); +} + + +void MSILWriter::printVAArgInstruction(const VAArgInst* Inst) { + printIndirectLoad(Inst->getOperand(0)); + printSimpleInstruction("call", + "instance typedref [mscorlib]System.ArgIterator::GetNextArg()"); + printSimpleInstruction("refanyval","void*"); + std::string Name = "ldind."+getTypePostfix(PointerType::get(IntegerType::get(8)),false); + printSimpleInstruction(Name.c_str()); +} + + +void MSILWriter::printAllocaInstruction(const AllocaInst* Inst) { + uint64_t Size = TD->getTypeSize(Inst->getAllocatedType()); + // Constant optimization. + if (const ConstantInt* CInt = dyn_cast<ConstantInt>(Inst->getOperand(0))) { + printPtrLoad(CInt->getZExtValue()*Size); + } else { + printPtrLoad(Size); + printValueLoad(Inst->getOperand(0)); + printSimpleInstruction("mul"); + } + printSimpleInstruction("localloc"); +} + + +void MSILWriter::printInstruction(const Instruction* Inst) { + const Value *Left = 0, *Right = 0; + if (Inst->getNumOperands()>=1) Left = Inst->getOperand(0); + if (Inst->getNumOperands()>=2) Right = Inst->getOperand(1); + // Print instruction + // FIXME: "ShuffleVector","ExtractElement","InsertElement" support. 
+ switch (Inst->getOpcode()) { + // Terminator + case Instruction::Ret: + if (Inst->getNumOperands()) { + printValueLoad(Left); + printSimpleInstruction("ret"); + } else + printSimpleInstruction("ret"); + break; + case Instruction::Br: + printBranchInstruction(cast<BranchInst>(Inst)); + break; + // Binary + case Instruction::Add: + printBinaryInstruction("add",Left,Right); + break; + case Instruction::Sub: + printBinaryInstruction("sub",Left,Right); + break; + case Instruction::Mul: + printBinaryInstruction("mul",Left,Right); + break; + case Instruction::UDiv: + printBinaryInstruction("div.un",Left,Right); + break; + case Instruction::SDiv: + case Instruction::FDiv: + printBinaryInstruction("div",Left,Right); + break; + case Instruction::URem: + printBinaryInstruction("rem.un",Left,Right); + break; + case Instruction::SRem: + case Instruction::FRem: + printBinaryInstruction("rem",Left,Right); + break; + // Binary Condition + case Instruction::ICmp: + printICmpInstruction(cast<ICmpInst>(Inst)->getPredicate(),Left,Right); + break; + case Instruction::FCmp: + printFCmpInstruction(cast<FCmpInst>(Inst)->getPredicate(),Left,Right); + break; + // Bitwise Binary + case Instruction::And: + printBinaryInstruction("and",Left,Right); + break; + case Instruction::Or: + printBinaryInstruction("or",Left,Right); + break; + case Instruction::Xor: + printBinaryInstruction("xor",Left,Right); + break; + case Instruction::Shl: + printValueLoad(Left); + printValueLoad(Right); + printSimpleInstruction("conv.i4"); + printSimpleInstruction("shl"); + break; + case Instruction::LShr: + printValueLoad(Left); + printValueLoad(Right); + printSimpleInstruction("conv.i4"); + printSimpleInstruction("shr.un"); + break; + case Instruction::AShr: + printValueLoad(Left); + printValueLoad(Right); + printSimpleInstruction("conv.i4"); + printSimpleInstruction("shr"); + break; + case Instruction::Select: + printSelectInstruction(Inst->getOperand(0),Inst->getOperand(1),Inst->getOperand(2)); + break; + case 
Instruction::Load: + printIndirectLoad(Inst->getOperand(0)); + break; + case Instruction::Store: + printIndirectSave(Inst->getOperand(1), Inst->getOperand(0)); + break; + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + printCastInstruction(Inst->getOpcode(),Left, + cast<CastInst>(Inst)->getDestTy()); + break; + case Instruction::GetElementPtr: + printGepInstruction(Inst->getOperand(0),gep_type_begin(Inst), + gep_type_end(Inst)); + break; + case Instruction::Call: + printCallInstruction(cast<CallInst>(Inst)); + break; + case Instruction::Invoke: + printInvokeInstruction(cast<InvokeInst>(Inst)); + break; + case Instruction::Unwind: + printSimpleInstruction("newobj", + "instance void [mscorlib]System.Exception::.ctor()"); + printSimpleInstruction("throw"); + break; + case Instruction::Switch: + printSwitchInstruction(cast<SwitchInst>(Inst)); + break; + case Instruction::Alloca: + printAllocaInstruction(cast<AllocaInst>(Inst)); + break; + case Instruction::Malloc: + assert(0 && "LowerAllocationsPass used"); + break; + case Instruction::Free: + assert(0 && "LowerAllocationsPass used"); + break; + case Instruction::Unreachable: + printSimpleInstruction("ldstr", "\"Unreachable instruction\""); + printSimpleInstruction("newobj", + "instance void [mscorlib]System.Exception::.ctor(string)"); + printSimpleInstruction("throw"); + break; + case Instruction::VAArg: + printVAArgInstruction(cast<VAArgInst>(Inst)); + break; + default: + cerr << "Instruction = " << Inst->getName() << '\n'; + assert(0 && "Unsupported instruction"); + } +} + + +void MSILWriter::printLoop(const Loop* L) { + Out << getLabelName(L->getHeader()->getName()) << ":\n"; + const std::vector<BasicBlock*>& blocks = 
L->getBlocks(); + for (unsigned I = 0, E = blocks.size(); I!=E; I++) { + BasicBlock* BB = blocks[I]; + Loop* BBLoop = LInfo->getLoopFor(BB); + if (BBLoop == L) + printBasicBlock(BB); + else if (BB==BBLoop->getHeader() && BBLoop->getParentLoop()==L) + printLoop(BBLoop); + } + printSimpleInstruction("br",getLabelName(L->getHeader()->getName()).c_str()); +} + + +void MSILWriter::printBasicBlock(const BasicBlock* BB) { + Out << getLabelName(BB) << ":\n"; + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) { + const Instruction* Inst = I; + // Comment llvm original instruction + Out << "\n//" << *Inst << "\n"; + // Do not handle PHI instruction in current block + if (Inst->getOpcode()==Instruction::PHI) continue; + // Print instruction + printInstruction(Inst); + // Save result + if (Inst->getType()!=Type::VoidTy) { + // Do not save value after invoke, it done in "try" block + if (Inst->getOpcode()==Instruction::Invoke) continue; + printValueSave(Inst); + } + } +} + + +void MSILWriter::printLocalVariables(const Function& F) { + std::string Name; + const Type* Ty = NULL; + std::set<const Value*> Printed; + const Value* VaList = NULL; + unsigned StackDepth = 8; + // Find local variables + for (const_inst_iterator I = inst_begin(&F), E = inst_end(&F); I!=E; ++I) { + if (I->getOpcode()==Instruction::Call || + I->getOpcode()==Instruction::Invoke) { + // Test stack depth. + if (StackDepth<I->getNumOperands()) + StackDepth = I->getNumOperands(); + } + const AllocaInst* AI = dyn_cast<AllocaInst>(&*I); + if (AI && !isa<GlobalVariable>(AI)) { + // Local variable allocation. + Ty = PointerType::get(AI->getAllocatedType()); + Name = getValueName(AI); + Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n"; + } else if (I->getType()!=Type::VoidTy) { + // Operation result. 
+ Ty = I->getType(); + Name = getValueName(&*I); + Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n"; + } + // Test on 'va_list' variable + bool isVaList = false; + if (const VAArgInst* VaInst = dyn_cast<VAArgInst>(&*I)) { + // "va_list" as "va_arg" instruction operand. + isVaList = true; + VaList = VaInst->getOperand(0); + } else if (const IntrinsicInst* Inst = dyn_cast<IntrinsicInst>(&*I)) { + // "va_list" as intrinsic function operand. + switch (Inst->getIntrinsicID()) { + case Intrinsic::vastart: + case Intrinsic::vaend: + case Intrinsic::vacopy: + isVaList = true; + VaList = Inst->getOperand(1); + break; + default: + isVaList = false; + } + } + // Print "va_list" variable. + if (isVaList && Printed.insert(VaList).second) { + Name = getValueName(VaList); + Name.insert(Name.length()-1,"$valist"); + Out << "\t.locals (valuetype [mscorlib]System.ArgIterator " + << Name << ")\n"; + } + } + printSimpleInstruction(".maxstack",utostr(StackDepth*2).c_str()); +} + + +void MSILWriter::printFunctionBody(const Function& F) { + // Print body + for (Function::const_iterator I = F.begin(), E = F.end(); I!=E; ++I) { + if (Loop *L = LInfo->getLoopFor(I)) { + if (L->getHeader()==I && L->getParentLoop()==0) + printLoop(L); + } else { + printBasicBlock(I); + } + } +} + + +void MSILWriter::printConstantExpr(const ConstantExpr* CE) { + const Value *left = 0, *right = 0; + if (CE->getNumOperands()>=1) left = CE->getOperand(0); + if (CE->getNumOperands()>=2) right = CE->getOperand(1); + // Print instruction + switch (CE->getOpcode()) { + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + printCastInstruction(CE->getOpcode(),left,CE->getType()); + break; + case Instruction::GetElementPtr: + 
printGepInstruction(CE->getOperand(0),gep_type_begin(CE),gep_type_end(CE)); + break; + case Instruction::ICmp: + printICmpInstruction(CE->getPredicate(),left,right); + break; + case Instruction::FCmp: + printFCmpInstruction(CE->getPredicate(),left,right); + break; + case Instruction::Select: + printSelectInstruction(CE->getOperand(0),CE->getOperand(1),CE->getOperand(2)); + break; + case Instruction::Add: + printBinaryInstruction("add",left,right); + break; + case Instruction::Sub: + printBinaryInstruction("sub",left,right); + break; + case Instruction::Mul: + printBinaryInstruction("mul",left,right); + break; + case Instruction::UDiv: + printBinaryInstruction("div.un",left,right); + break; + case Instruction::SDiv: + case Instruction::FDiv: + printBinaryInstruction("div",left,right); + break; + case Instruction::URem: + printBinaryInstruction("rem.un",left,right); + break; + case Instruction::SRem: + case Instruction::FRem: + printBinaryInstruction("rem",left,right); + break; + case Instruction::And: + printBinaryInstruction("and",left,right); + break; + case Instruction::Or: + printBinaryInstruction("or",left,right); + break; + case Instruction::Xor: + printBinaryInstruction("xor",left,right); + break; + case Instruction::Shl: + printBinaryInstruction("shl",left,right); + break; + case Instruction::LShr: + printBinaryInstruction("shr.un",left,right); + break; + case Instruction::AShr: + printBinaryInstruction("shr",left,right); + break; + default: + cerr << "Expression = " << *CE << "\n"; + assert(0 && "Invalid constant expression"); + } +} + + +void MSILWriter::printStaticInitializerList() { + // List of global variables with uninitialized fields. + for (std::map<const GlobalVariable*,std::vector<StaticInitializer> >::iterator + VarI = StaticInitList.begin(), VarE = StaticInitList.end(); VarI!=VarE; + ++VarI) { + const std::vector<StaticInitializer>& InitList = VarI->second; + if (InitList.empty()) continue; + // For each uninitialized field. 
+ for (std::vector<StaticInitializer>::const_iterator I = InitList.begin(), + E = InitList.end(); I!=E; ++I) { + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(I->constant)) { + Out << "\n// Init " << getValueName(VarI->first) << ", offset " << + utostr(I->offset) << ", type "<< *I->constant->getType() << "\n\n"; + // Load variable address + printValueLoad(VarI->first); + // Add offset + if (I->offset!=0) { + printPtrLoad(I->offset); + printSimpleInstruction("add"); + } + // Load value + printConstantExpr(CE); + // Save result at offset + std::string postfix = getTypePostfix(CE->getType(),true); + if (*postfix.begin()=='u') *postfix.begin() = 'i'; + postfix = "stind."+postfix; + printSimpleInstruction(postfix.c_str()); + } else { + cerr << "Constant = " << *I->constant << '\n'; + assert(0 && "Invalid static initializer"); + } + } + } +} + + +void MSILWriter::printFunction(const Function& F) { + const FunctionType* FTy = F.getFunctionType(); + const ParamAttrsList *Attrs = FTy->getParamAttrs(); + bool isSigned = Attrs && Attrs->paramHasAttr(0, ParamAttr::SExt); + Out << "\n.method static "; + Out << (F.hasInternalLinkage() ? 
"private " : "public "); + if (F.isVarArg()) Out << "vararg "; + Out << getTypeName(F.getReturnType(),isSigned) << + getConvModopt(F.getCallingConv()) << getValueName(&F) << '\n'; + // Arguments + Out << "\t("; + unsigned ArgIdx = 1; + for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I!=E; + ++I, ++ArgIdx) { + isSigned = Attrs && Attrs->paramHasAttr(ArgIdx, ParamAttr::SExt); + if (I!=F.arg_begin()) Out << ", "; + Out << getTypeName(I->getType(),isSigned) << getValueName(I); + } + Out << ") cil managed\n"; + // Body + Out << "{\n"; + printLocalVariables(F); + printFunctionBody(F); + Out << "}\n"; +} + + +void MSILWriter::printDeclarations(const TypeSymbolTable& ST) { + std::string Name; + std::set<const Type*> Printed; + for (std::set<const Type*>::const_iterator + UI = UsedTypes->begin(), UE = UsedTypes->end(); UI!=UE; ++UI) { + const Type* Ty = *UI; + if (isa<ArrayType>(Ty) || isa<VectorType>(Ty) || isa<StructType>(Ty)) + Name = getTypeName(Ty, false, true); + // Type with no need to declare. + else continue; + // Print not duplicated type + if (Printed.insert(Ty).second) { + Out << ".class value explicit ansi sealed '" << Name << "'"; + Out << " { .pack " << 1 << " .size " << TD->getTypeSize(Ty) << " }\n\n"; + } + } +} + + +unsigned int MSILWriter::getBitWidth(const Type* Ty) { + unsigned int N = Ty->getPrimitiveSizeInBits(); + assert(N!=0 && "Invalid type in getBitWidth()"); + switch (N) { + case 1: + case 8: + case 16: + case 32: + case 64: + return N; + default: + cerr << "Bits = " << N << '\n'; + assert(0 && "Unsupported integer width"); + } +} + + +void MSILWriter::printStaticConstant(const Constant* C, uint64_t& Offset) { + uint64_t TySize = 0; + const Type* Ty = C->getType(); + // Print zero initialized constant. 
+ if (isa<ConstantAggregateZero>(C) || C->isNullValue()) { + TySize = TD->getTypeSize(C->getType()); + Offset += TySize; + Out << "int8 (0) [" << TySize << "]"; + return; + } + // Print constant initializer + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + TySize = TD->getTypeSize(Ty); + const ConstantInt* Int = cast<ConstantInt>(C); + Out << getPrimitiveTypeName(Ty,true) << "(" << Int->getSExtValue() << ")"; + break; + } + case Type::FloatTyID: + case Type::DoubleTyID: { + TySize = TD->getTypeSize(Ty); + const ConstantFP* FP = cast<ConstantFP>(C); + if (Ty->getTypeID() == Type::FloatTyID) + Out << "int32 (" << FloatToBits(FP->getValue()) << ')'; + else + Out << "int64 (" << DoubleToBits(FP->getValue()) << ')'; + break; + } + case Type::ArrayTyID: + case Type::VectorTyID: + case Type::StructTyID: + for (unsigned I = 0, E = C->getNumOperands(); I<E; I++) { + if (I!=0) Out << ",\n"; + printStaticConstant(C->getOperand(I),Offset); + } + break; + case Type::PointerTyID: + TySize = TD->getTypeSize(C->getType()); + // Initialize with global variable address + if (const GlobalVariable *G = dyn_cast<GlobalVariable>(C)) { + std::string name = getValueName(G); + Out << "&(" << name.insert(name.length()-1,"$data") << ")"; + } else { + // Dynamic initialization + if (!isa<ConstantPointerNull>(C) && !C->isNullValue()) + InitListPtr->push_back(StaticInitializer(C,Offset)); + // Null pointer initialization + if (TySize==4) Out << "int32 (0)"; + else if (TySize==8) Out << "int64 (0)"; + else assert(0 && "Invalid pointer size"); + } + break; + default: + cerr << "TypeID = " << Ty->getTypeID() << '\n'; + assert(0 && "Invalid type in printStaticConstant()"); + } + // Increase offset. 
+ Offset += TySize; +} + + +void MSILWriter::printStaticInitializer(const Constant* C, + const std::string& Name) { + switch (C->getType()->getTypeID()) { + case Type::IntegerTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + Out << getPrimitiveTypeName(C->getType(), false); + break; + case Type::ArrayTyID: + case Type::VectorTyID: + case Type::StructTyID: + case Type::PointerTyID: + Out << getTypeName(C->getType()); + break; + default: + cerr << "Type = " << *C << "\n"; + assert(0 && "Invalid constant type"); + } + // Print initializer + std::string label = Name; + label.insert(label.length()-1,"$data"); + Out << Name << " at " << label << '\n'; + Out << ".data " << label << " = {\n"; + uint64_t offset = 0; + printStaticConstant(C,offset); + Out << "\n}\n\n"; +} + + +void MSILWriter::printVariableDefinition(const GlobalVariable* G) { + const Constant* C = G->getInitializer(); + if (C->isNullValue() || isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) + InitListPtr = 0; + else + InitListPtr = &StaticInitList[G]; + printStaticInitializer(C,getValueName(G)); +} + + +void MSILWriter::printGlobalVariables() { + if (ModulePtr->global_empty()) return; + Module::global_iterator I,E; + for (I = ModulePtr->global_begin(), E = ModulePtr->global_end(); I!=E; ++I) { + // Variable definition + Out << ".field static " << (I->isDeclaration() ? 
"public " :
+                                "private ");
+    if (I->isDeclaration()) {
+      Out << getTypeName(I->getType()) << getValueName(&*I) << "\n\n";
+    } else
+      printVariableDefinition(&*I);
+  }
+}
+
+
+const char* MSILWriter::getLibraryName(const Function* F) {
+  return getLibraryForSymbol(F->getName().c_str(), true, F->getCallingConv());
+}
+
+
+const char* MSILWriter::getLibraryName(const GlobalVariable* GV) {
+  return getLibraryForSymbol(Mang->getValueName(GV).c_str(), false, 0);
+}
+
+
+const char* MSILWriter::getLibraryForSymbol(const char* Name, bool isFunction,
+                                            unsigned CallingConv) {
+  // TODO: Read *.def file with function and libraries definitions.
+  return "MSVCRT.DLL";
+}
+
+
+void MSILWriter::printExternals() {
+  Module::const_iterator I,E;
+  // Functions.
+  for (I=ModulePtr->begin(),E=ModulePtr->end(); I!=E; ++I) {
+    // Skip intrinsics
+    if (I->getIntrinsicID()) continue;
+    if (I->isDeclaration()) {
+      const Function* F = I;
+      std::string Name = getConvModopt(F->getCallingConv())+getValueName(F);
+      std::string Sig =
+        getCallSignature(cast<FunctionType>(F->getFunctionType()), NULL, Name);
+      Out << ".method static hidebysig pinvokeimpl(\""
+          << getLibraryName(F) << "\")\n\t" << Sig << " preservesig {}\n\n";
+    }
+  }
+  // External variables and static initialization.
+  Out <<
+  ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+  " native int LoadLibrary(string) preservesig {}\n"
+  ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+  " native int GetProcAddress(native int, string) preservesig {}\n";
+  Out <<
+  ".method private static void* $MSIL_Import(string lib,string sym)\n"
+  " managed cil\n{\n"
+  "\tldarg\tlib\n"
+  "\tcall\tnative int LoadLibrary(string)\n"
+  "\tldarg\tsym\n"
+  "\tcall\tnative int GetProcAddress(native int,string)\n"
+  "\tdup\n"
+  "\tbrtrue\tL_01\n"
+  "\tldstr\t\"Can no import variable\"\n"
+  "\tnewobj\tinstance void [mscorlib]System.Exception::.ctor(string)\n"
+  "\tthrow\n"
+  "L_01:\n"
+  "\tret\n"
+  "}\n\n"
+  ".method static private void $MSIL_Init() managed cil\n{\n";
+  printStaticInitializerList();
+  // Foreach global variable.
+  for (Module::global_iterator I = ModulePtr->global_begin(),
+       E = ModulePtr->global_end(); I!=E; ++I) {
+    if (!I->isDeclaration() || !I->hasDLLImportLinkage()) continue;
+    // Use "LoadLibrary"/"GetProcAddress" to receive variable address.
+    std::string Label = "not_null$_"+utostr(getUniqID());
+    std::string Tmp = getTypeName(I->getType())+getValueName(&*I);
+    printSimpleInstruction("ldsflda",Tmp.c_str());
+    Out << "\tldstr\t\"" << getLibraryName(&*I) << "\"\n";
+    Out << "\tldstr\t\"" << Mang->getValueName(&*I) << "\"\n";
+    printSimpleInstruction("call","void* $MSIL_Import(string,string)");
+    printIndirectSave(I->getType());
+  }
+  printSimpleInstruction("ret");
+  Out << "}\n\n";
+}
+
+
+//===----------------------------------------------------------------------===//
+//                      External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM, std::ostream &o,
+                                          CodeGenFileType FileType, bool Fast)
+{
+  if (FileType != TargetMachine::AssemblyFile) return true;
+  MSILWriter* Writer = new MSILWriter(o);
+  PM.add(createLowerGCPass());
+  PM.add(createLowerAllocationsPass(true));
+  // FIXME: Handle switch through native IL instruction "switch"
+  PM.add(createLowerSwitchPass());
+  PM.add(createCFGSimplificationPass());
+  PM.add(new MSILModule(Writer->UsedTypes,Writer->TD));
+  PM.add(Writer);
+  return false;
+}
diff --git a/lib/Target/MSIL/MSILWriter.h b/lib/Target/MSIL/MSILWriter.h
new file mode 100644
index 0000000..c2cd0ab
--- /dev/null
+++ b/lib/Target/MSIL/MSILWriter.h
@@ -0,0 +1,255 @@
+//===-- MSILWriter.h - TargetMachine for the MSIL ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Roman Samoilov and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSILWriter that is used by the MSIL.
+// +//===----------------------------------------------------------------------===// +#ifndef MSILWRITER_H +#define MSILWRITER_H + +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/PassManager.h" +#include "llvm/Analysis/FindUsedTypes.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetMachineRegistry.h" +#include "llvm/Support/Mangler.h" +#include <algorithm> +#include <ios> +using namespace llvm; + +namespace { + + class MSILModule : public ModulePass { + Module *ModulePtr; + const std::set<const Type *>*& UsedTypes; + const TargetData*& TD; + + public: + static char ID; + MSILModule(const std::set<const Type *>*& _UsedTypes, + const TargetData*& _TD) + : ModulePass((intptr_t)&ID), UsedTypes(_UsedTypes), TD(_TD) {} + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<FindUsedTypes>(); + AU.addRequired<TargetData>(); + } + + virtual const char *getPassName() const { + return "MSIL backend definitions"; + } + + virtual bool runOnModule(Module &M); + + }; + + class MSILWriter : public FunctionPass { + struct StaticInitializer { + const Constant* constant; + uint64_t offset; + + StaticInitializer() + : constant(0), offset(0) {} + + StaticInitializer(const Constant* _constant, uint64_t _offset) + : constant(_constant), offset(_offset) {} + }; + + uint64_t UniqID; + + uint64_t getUniqID() { + return ++UniqID; + } + + public: + std::ostream &Out; + Module* ModulePtr; + const TargetData* TD; + Mangler* Mang; + LoopInfo *LInfo; + std::vector<StaticInitializer>* InitListPtr; + std::map<const GlobalVariable*,std::vector<StaticInitializer> > + StaticInitList; + const std::set<const Type *>* UsedTypes; + static char ID; + MSILWriter(std::ostream &o) : FunctionPass((intptr_t)&ID), Out(o) { + UniqID = 0; + } 
+ + enum ValueType { + UndefVT, + GlobalVT, + InternalVT, + ArgumentVT, + LocalVT, + ConstVT, + ConstExprVT + }; + + bool isVariable(ValueType V) { + return V==GlobalVT || V==InternalVT || V==ArgumentVT || V==LocalVT; + } + + bool isConstValue(ValueType V) { + return V==ConstVT || V==ConstExprVT; + } + + virtual const char *getPassName() const { return "MSIL backend"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.setPreservesAll(); + } + + bool runOnFunction(Function &F); + + virtual bool doInitialization(Module &M); + + virtual bool doFinalization(Module &M); + + void printModuleStartup(); + + bool isZeroValue(const Value* V); + + std::string getValueName(const Value* V); + + std::string getLabelName(const Value* V); + + std::string getLabelName(const std::string& Name); + + std::string getConvModopt(unsigned CallingConvID); + + std::string getArrayTypeName(Type::TypeID TyID, const Type* Ty); + + std::string getPrimitiveTypeName(const Type* Ty, bool isSigned); + + std::string getFunctionTypeName(const Type* Ty); + + std::string getPointerTypeName(const Type* Ty); + + std::string getTypeName(const Type* Ty, bool isSigned = false, + bool isNested = false); + + ValueType getValueLocation(const Value* V); + + std::string getTypePostfix(const Type* Ty, bool Expand, + bool isSigned = false); + + void printConvToPtr(); + + void printPtrLoad(uint64_t N); + + void printValuePtrLoad(const Value* V); + + void printConstLoad(const Constant* C); + + void printValueLoad(const Value* V); + + void printValueSave(const Value* V); + + void printBinaryInstruction(const char* Name, const Value* Left, + const Value* Right); + + void printSimpleInstruction(const char* Inst, const char* Operand = NULL); + + void printPHICopy(const BasicBlock* Src, const BasicBlock* Dst); + + void printBranchToBlock(const BasicBlock* CurrBB, + const BasicBlock* TrueBB, + const BasicBlock* FalseBB); + + void printBranchInstruction(const BranchInst* Inst); + + 
void printSelectInstruction(const Value* Cond, const Value* VTrue, + const Value* VFalse); + + void printIndirectLoad(const Value* V); + + void printIndirectSave(const Value* Ptr, const Value* Val); + + void printIndirectSave(const Type* Ty); + + void printCastInstruction(unsigned int Op, const Value* V, + const Type* Ty); + + void printGepInstruction(const Value* V, gep_type_iterator I, + gep_type_iterator E); + + std::string getCallSignature(const FunctionType* Ty, + const Instruction* Inst, + std::string Name); + + void printFunctionCall(const Value* FnVal, const Instruction* Inst); + + void printIntrinsicCall(const IntrinsicInst* Inst); + + void printCallInstruction(const Instruction* Inst); + + void printICmpInstruction(unsigned Predicate, const Value* Left, + const Value* Right); + + void printFCmpInstruction(unsigned Predicate, const Value* Left, + const Value* Right); + + void printInvokeInstruction(const InvokeInst* Inst); + + void printSwitchInstruction(const SwitchInst* Inst); + + void printVAArgInstruction(const VAArgInst* Inst); + + void printAllocaInstruction(const AllocaInst* Inst); + + void printInstruction(const Instruction* Inst); + + void printLoop(const Loop* L); + + void printBasicBlock(const BasicBlock* BB); + + void printLocalVariables(const Function& F); + + void printFunctionBody(const Function& F); + + void printConstantExpr(const ConstantExpr* CE); + + void printStaticInitializerList(); + + void printFunction(const Function& F); + + void printDeclarations(const TypeSymbolTable& ST); + + unsigned int getBitWidth(const Type* Ty); + + void printStaticConstant(const Constant* C, uint64_t& Offset); + + void printStaticInitializer(const Constant* C, const std::string& Name); + + void printVariableDefinition(const GlobalVariable* G); + + void printGlobalVariables(); + + const char* getLibraryName(const Function* F); + + const char* getLibraryName(const GlobalVariable* GV); + + const char* getLibraryForSymbol(const char* Name, bool isFunction, + 
unsigned CallingConv); + + void printExternals(); + }; +} + +#endif + diff --git a/lib/Target/MSIL/Makefile b/lib/Target/MSIL/Makefile new file mode 100644 index 0000000..17f7247 --- /dev/null +++ b/lib/Target/MSIL/Makefile @@ -0,0 +1,14 @@ +##===- lib/Target/MSIL/Makefile ----------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by Roman Samoilov and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMMSIL +include $(LEVEL)/Makefile.common + +CompileCommonOpts := $(CompileCommonOpts) -Wno-format diff --git a/lib/Target/MSIL/README.TXT b/lib/Target/MSIL/README.TXT new file mode 100644 index 0000000..2b9a569 --- /dev/null +++ b/lib/Target/MSIL/README.TXT @@ -0,0 +1,26 @@ +//===---------------------------------------------------------------------===// + +Vector instructions support. + +ShuffleVector +ExtractElement +InsertElement + +//===---------------------------------------------------------------------===// + +Add "OpaqueType" type. + +//===---------------------------------------------------------------------===// + +"switch" instruction emulation with CLI "switch" instruction. + +//===---------------------------------------------------------------------===// + +Write linker for external function, because function export need to know +dynamic library where function located. 
+ +.method static hidebysig pinvokeimpl("msvcrt.dll" cdecl) + void free(void*) preservesig {} + + + diff --git a/lib/Target/Makefile b/lib/Target/Makefile new file mode 100644 index 0000000..59f50fe --- /dev/null +++ b/lib/Target/Makefile @@ -0,0 +1,20 @@ +#===- lib/Target/Makefile ----------------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by the LLVM research group and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../.. +LIBRARYNAME = LLVMTarget +BUILD_ARCHIVE = 1 + +# We include this early so we can access the value of TARGETS_TO_BUILD as the +# value for PARALLEL_DIRS which must be set before Makefile.rules is included +include $(LEVEL)/Makefile.config + +PARALLEL_DIRS := $(TARGETS_TO_BUILD) + +include $(LLVM_SRC_ROOT)/Makefile.rules diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile new file mode 100644 index 0000000..6ebffc7 --- /dev/null +++ b/lib/Target/Mips/Makefile @@ -0,0 +1,21 @@ +##===- lib/Target/Mips/Makefile ----------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by Bruno Cardoso Lopes and is distributed under the +# University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMMips +TARGET = Mips + +# Make sure that tblgen is run, first thing. 
+BUILT_SOURCES = MipsGenRegisterInfo.h.inc MipsGenRegisterNames.inc \ + MipsGenRegisterInfo.inc MipsGenInstrNames.inc \ + MipsGenInstrInfo.inc MipsGenAsmWriter.inc \ + MipsGenDAGISel.inc MipsGenCallingConv.inc \ + MipsGenSubtarget.inc + +include $(LEVEL)/Makefile.common + diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h new file mode 100644 index 0000000..48b08ea --- /dev/null +++ b/lib/Target/Mips/Mips.h @@ -0,0 +1,38 @@ +//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in +// the LLVM Mips back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_MIPS_H +#define TARGET_MIPS_H + +#include <iosfwd> + +namespace llvm { + class MipsTargetMachine; + class FunctionPassManager; + class FunctionPass; + class MachineCodeEmitter; + + FunctionPass *createMipsCodePrinterPass(std::ostream &OS, + MipsTargetMachine &TM); + FunctionPass *createMipsISelDag(MipsTargetMachine &TM); +} // end namespace llvm; + +// Defines symbolic names for Mips registers. This defines a mapping from +// register name to register number. +#include "MipsGenRegisterNames.inc" + +// Defines symbolic names for the Mips instructions. 
+#include "MipsGenInstrNames.inc" + +#endif diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td new file mode 100644 index 0000000..662bc3b --- /dev/null +++ b/lib/Target/Mips/Mips.td @@ -0,0 +1,63 @@ +//===- Mips.td - Describe the Mips Target Machine ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "../Target.td" + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "MipsRegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Subtarget features +//===----------------------------------------------------------------------===// + +// TODO: dummy, needed to compile +def FeatureCIX : SubtargetFeature<"r3000", "isR3000", "true", + "Enable r3000 extentions">; + +//===----------------------------------------------------------------------===// +// Instruction Description +//===----------------------------------------------------------------------===// + +include "MipsInstrInfo.td" + +def MipsInstrInfo : InstrInfo { + // Define how we want to layout our target-specific information field. 
+ let TSFlagsFields = []; + let TSFlagsShifts = []; +} +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "MipsCallingConv.td" + +//===----------------------------------------------------------------------===// +// Mips processors supported. +//===----------------------------------------------------------------------===// + +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +def : Proc<"generic", []>; + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def Mips : Target { + // Pull in Instruction Info: + let InstructionSet = MipsInstrInfo; +} diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp new file mode 100644 index 0000000..1df1291 --- /dev/null +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -0,0 +1,433 @@ +//===-- MipsAsmPrinter.cpp - Mips LLVM assembly writer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format MIPS assembly language. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-asm-printer" + +#include "Mips.h" +#include "MipsInstrInfo.h" +#include "MipsTargetMachine.h" +#include "MipsMachineFunction.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Mangler.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/MathExtras.h" +#include <cctype> + +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +namespace { + struct VISIBILITY_HIDDEN MipsAsmPrinter : public AsmPrinter { + MipsAsmPrinter(std::ostream &O, MipsTargetMachine &TM, + const TargetAsmInfo *T): + AsmPrinter(O, TM, T) {} + + virtual const char *getPassName() const { + return "Mips Assembly Printer"; + } + + enum SetDirectiveFlags { + REORDER, // enables instruction reordering. + NOREORDER, // disables instruction reordering. + MACRO, // enables GAS macros. + NOMACRO // disables GAS macros. + }; + + void printOperand(const MachineInstr *MI, int opNum); + void printMemOperand(const MachineInstr *MI, int opNum, + const char *Modifier = 0); + + void printHex32(unsigned int Value); + void emitFunctionStart(MachineFunction &MF); + void emitFunctionEnd(); + void emitFrameDirective(MachineFunction &MF); + void emitMaskDirective(MachineFunction &MF); + void emitFMaskDirective(); + void emitSetDirective(SetDirectiveFlags Flag); + + bool printInstruction(const MachineInstr *MI); // autogenerated. 
+    bool runOnMachineFunction(MachineFunction &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+  };
+} // end of anonymous namespace
+
+#include "MipsGenAsmWriter.inc"
+
+/// createMipsCodePrinterPass - Returns a pass that prints the MIPS
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description. This should work
+/// regardless of whether the function is in SSA form.
+FunctionPass *llvm::createMipsCodePrinterPass(std::ostream &o,
+                                              MipsTargetMachine &tm)
+{
+  return new MipsAsmPrinter(o, tm, tm.getTargetAsmInfo());
+}
+
+/// This pattern will be emitted :
+/// .frame reg1, size, reg2
+/// It describes the stack frame.
+/// reg1 - stack pointer
+/// size - stack size allocated for the function
+/// reg2 - return address register
+void MipsAsmPrinter::
+emitFrameDirective(MachineFunction &MF)
+{
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+
+  unsigned stackReg = RI.getFrameRegister(MF);
+  unsigned returnReg = RI.getRARegister();
+  unsigned stackSize = MF.getFrameInfo()->getStackSize();
+
+
+  O << "\t.frame\t" << "$" << LowercaseString(RI.get(stackReg).Name)
+    << "," << stackSize << ","
+    << "$" << LowercaseString(RI.get(returnReg).Name)
+    << "\n";
+}
+
+/// This pattern will be emitted :
+/// .mask bitmask, offset
+/// Tells the assembler (and possibly linker) which registers are saved and where.
+/// bitmask - mask of all GPRs (little endian)
+/// offset - negative value. offset+stackSize should give where on the stack
+/// the first GPR is saved.
+/// TODO: consider callee-saved GPR regs here, not hardcode register numbers.
+void MipsAsmPrinter:: +emitMaskDirective(MachineFunction &MF) +{ + const MRegisterInfo &RI = *TM.getRegisterInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + + bool hasFP = RI.hasFP(MF); + bool saveRA = MF.getFrameInfo()->hasCalls(); + + int offset; + + if (!MipsFI->getTopSavedRegOffset()) + offset = 0; + else + offset = -(MF.getFrameInfo()->getStackSize() + -MipsFI->getTopSavedRegOffset()); + + #ifndef NDEBUG + DOUT << "<--ASM PRINTER--emitMaskDirective-->" << "\n"; + DOUT << "StackSize : " << MF.getFrameInfo()->getStackSize() << "\n"; + DOUT << "getTopSavedRegOffset() : " << MipsFI->getTopSavedRegOffset() << "\n"; + DOUT << "offset : " << offset << "\n\n"; + #endif + + unsigned int bitmask = 0; + + if (hasFP) + bitmask |= (1 << 30); + + if (saveRA) + bitmask |= (1 << 31); + + O << "\t.mask\t"; + printHex32(bitmask); + O << "," << offset << "\n"; +} + +/// This pattern will be emitted : +/// .fmask bitmask, offset +/// Tells the assembler (and possibly linker) which float registers are saved. +/// bitmask - mask of all Float Point registers (little endian) +/// offset - negative value. offset+stackSize should give where on the stack +/// the first Float Point register is saved. +/// TODO: implement this, dummy for now +void MipsAsmPrinter:: +emitFMaskDirective() +{ + O << "\t.fmask\t0x00000000,0" << "\n"; +} + +/// Print a 32 bit hex number filling with 0's on the left. +/// TODO: make this setfill and setw +void MipsAsmPrinter:: +printHex32(unsigned int Value) { + O << "0x" << std::hex << Value << std::dec; +} + +/// Emit Set directives. 
+void MipsAsmPrinter:: +emitSetDirective(SetDirectiveFlags Flag) { + + O << "\t.set\t"; + switch(Flag) { + case REORDER: O << "reorder" << "\n"; break; + case NOREORDER: O << "noreorder" << "\n"; break; + case MACRO: O << "macro" << "\n"; break; + case NOMACRO: O << "nomacro" << "\n"; break; + default: break; + } +} + +/// Emit the directives used by GAS on the start of functions +void MipsAsmPrinter:: +emitFunctionStart(MachineFunction &MF) +{ + // Print out the label for the function. + const Function *F = MF.getFunction(); + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + + // On Mips GAS, if .align #n is present, #n means the number of bits + // to be cleared. So, if we want 4 byte alignment, we must have .align 2 + EmitAlignment(1, F); + + O << "\t.globl\t" << CurrentFnName << "\n"; + O << "\t.ent\t" << CurrentFnName << "\n"; + O << "\t.type\t" << CurrentFnName << ", @function\n"; + O << CurrentFnName << ":\n"; + + emitFrameDirective(MF); + emitMaskDirective(MF); + emitFMaskDirective(); + + emitSetDirective(NOREORDER); + emitSetDirective(NOMACRO); +} + +/// Emit the directives used by GAS on the end of functions +void MipsAsmPrinter:: +emitFunctionEnd() { + emitSetDirective(MACRO); + emitSetDirective(REORDER); + O << "\t.end\t" << CurrentFnName << "\n"; +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +bool MipsAsmPrinter:: +runOnMachineFunction(MachineFunction &MF) +{ + SetupMachineFunction(MF); + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + O << "\n\n"; + + // What's my mangled name? + CurrentFnName = Mang->getValueName(MF.getFunction()); + + // Emit the function start directives + emitFunctionStart(MF); + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + + // Print a label for the basic block. 
+ if (I != MF.begin()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printInstruction(II); + ++EmittedInsts; + } + } + + // Emit function end directives + emitFunctionEnd(); + + // We didn't modify anything. + return false; +} + +void MipsAsmPrinter:: +printOperand(const MachineInstr *MI, int opNum) +{ + const MachineOperand &MO = MI->getOperand(opNum); + const MRegisterInfo &RI = *TM.getRegisterInfo(); + bool closeP=false; + + // %hi and %lo used on mips gas to break large constants + if (MI->getOpcode() == Mips::LUi && !MO.isRegister() + && !MO.isImmediate()) { + O << "%hi("; + closeP = true; + } else if ((MI->getOpcode() == Mips::ADDiu) && !MO.isRegister() + && !MO.isImmediate()) { + O << "%lo("; + closeP = true; + } + + switch (MO.getType()) + { + case MachineOperand::MO_Register: + if (MRegisterInfo::isPhysicalRegister(MO.getReg())) + O << "$" << LowercaseString (RI.get(MO.getReg()).Name); + else + O << "$" << MO.getReg(); + break; + + case MachineOperand::MO_Immediate: + if ((MI->getOpcode() == Mips::SLTiu) || (MI->getOpcode() == Mips::ORi) || + (MI->getOpcode() == Mips::LUi) || (MI->getOpcode() == Mips::ANDi)) + O << (unsigned short int)MO.getImmedValue(); + else + O << (short int)MO.getImmedValue(); + break; + + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + + case MachineOperand::MO_GlobalAddress: + O << Mang->getValueName(MO.getGlobal()); + break; + + case MachineOperand::MO_ExternalSymbol: + O << MO.getSymbolName(); + break; + + case MachineOperand::MO_ConstantPoolIndex: + O << TAI->getPrivateGlobalPrefix() << "CPI" + << getFunctionNumber() << "_" << MO.getConstantPoolIndex(); + break; + + default: + O << "<unknown operand type>"; abort (); break; + } + + if (closeP) O << ")"; +} + +void MipsAsmPrinter:: +printMemOperand(const MachineInstr 
*MI, int opNum, const char *Modifier) +{ + // lw/sw $reg, MemOperand + // will turn into : + // lw/sw $reg, imm($reg) + printOperand(MI, opNum); + O << "("; + printOperand(MI, opNum+1); + O << ")"; +} + +bool MipsAsmPrinter:: +doInitialization(Module &M) +{ + Mang = new Mangler(M); + return false; // success +} + +bool MipsAsmPrinter:: +doFinalization(Module &M) +{ + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) + + // External global require no code + if (I->hasInitializer()) { + + // Check to see if this is a special global + // used by LLVM, if so, emit it. + if (EmitSpecialLLVMGlobal(I)) + continue; + + O << "\n\n"; + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + unsigned Size = TD->getTypeSize(C->getType()); + unsigned Align = TD->getPrefTypeAlignment(C->getType()); + + if (C->isNullValue() && (I->hasLinkOnceLinkage() || + I->hasInternalLinkage() || I->hasWeakLinkage() + /* FIXME: Verify correct */)) { + + SwitchToDataSection(".data", I); + if (I->hasInternalLinkage()) + O << "\t.local " << name << "\n"; + + O << "\t.comm " << name << "," + << TD->getTypeSize(C->getType()) + << "," << Align << "\n"; + + } else { + + switch (I->getLinkage()) + { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + // FIXME: Verify correct for weak. + // Nonnull linkonce -> weak + O << "\t.weak " << name << "\n"; + SwitchToDataSection("", I); + O << "\t.section\t\".llvm.linkonce.d." << name + << "\",\"aw\",@progbits\n"; + break; + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables + // should go into a section of their name or + // something. For now, just emit them as external. 
+ case GlobalValue::ExternalLinkage: + // If external or appending, declare as a global symbol + O << "\t.globl " << name << "\n"; + case GlobalValue::InternalLinkage: + if (C->isNullValue()) + SwitchToDataSection(".bss", I); + else + SwitchToDataSection(".data", I); + break; + case GlobalValue::GhostLinkage: + cerr << "Should not have any" + << "unmaterialized functions!\n"; + abort(); + case GlobalValue::DLLImportLinkage: + cerr << "DLLImport linkage is" + << "not supported by this target!\n"; + abort(); + case GlobalValue::DLLExportLinkage: + cerr << "DLLExport linkage is" + << "not supported by this target!\n"; + abort(); + default: + assert(0 && "Unknown linkage type!"); + } + O << "\t.align " << Align << "\n"; + O << "\t.type " << name << ",@object\n"; + O << "\t.size " << name << "," << Size << "\n"; + O << name << ":\n"; + EmitGlobalConstant(C); + } + } + + AsmPrinter::doFinalization(M); + return false; // success +} diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td new file mode 100644 index 0000000..23ef850 --- /dev/null +++ b/lib/Target/Mips/MipsCallingConv.td @@ -0,0 +1,39 @@ +//===- MipsCallingConv.td - Calling Conventions for Mips --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for Mips architecture. +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. 
+class CCIfSubtarget<string F, CCAction A>: + CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>; + +//===----------------------------------------------------------------------===// +// Mips Return Value Calling Convention +//===----------------------------------------------------------------------===// +def RetCC_Mips : CallingConv<[ + // i32 are returned in registers V0, V1 + CCIfType<[i32], CCAssignToReg<[V0, V1]>> +]>; + + +//===----------------------------------------------------------------------===// +// Mips Argument Calling Conventions +//===----------------------------------------------------------------------===// +def CC_Mips : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The first 4 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3]>>, + + // Integer values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32], CCAssignToStack<4, 4>> +]>; + diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp new file mode 100644 index 0000000..d6e3830 --- /dev/null +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -0,0 +1,272 @@ +//===-- MipsISelDAGToDAG.cpp - A dag to dag inst selector for Mips --------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the MIPS target.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-isel" + +#include "Mips.h" +#include "MipsISelLowering.h" +#include "MipsRegisterInfo.h" +#include "MipsSubtarget.h" +#include "MipsTargetMachine.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/CFG.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include <queue> +#include <set> + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MipsDAGToDAGISel - MIPS specific code to select MIPS machine +// instructions for SelectionDAG operations. +//===----------------------------------------------------------------------===// +namespace { + +class VISIBILITY_HIDDEN MipsDAGToDAGISel : public SelectionDAGISel { + + /// TM - Keep a reference to MipsTargetMachine. + MipsTargetMachine &TM; + + /// MipsLowering - This object fully describes how to lower LLVM code to an + /// Mips-specific SelectionDAG. + MipsTargetLowering MipsLowering; + + /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can + /// make the right decision when generating code for different targets. 
+ //TODO: add initialization on constructor + //const MipsSubtarget *Subtarget; + +public: + MipsDAGToDAGISel(MipsTargetMachine &tm) : + SelectionDAGISel(MipsLowering), + TM(tm), MipsLowering(*TM.getTargetLowering()) {} + + virtual void InstructionSelectBasicBlock(SelectionDAG &SD); + + // Pass Name + virtual const char *getPassName() const { + return "MIPS DAG->DAG Pattern Instruction Selection"; + } + + +private: + // Include the pieces autogenerated from the target description. + #include "MipsGenDAGISel.inc" + + SDNode *Select(SDOperand N); + + // Complex Pattern. + bool SelectAddr(SDOperand Op, SDOperand N, + SDOperand &Base, SDOperand &Offset); + + + // getI32Imm - Return a target constant with the specified + // value, of type i32. + inline SDOperand getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + + #ifndef NDEBUG + unsigned Indent; + #endif +}; + +} + +/// InstructionSelectBasicBlock - This callback is invoked by +/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. +void MipsDAGToDAGISel:: +InstructionSelectBasicBlock(SelectionDAG &SD) +{ + DEBUG(BB->dump()); + // Codegen the basic block. + #ifndef NDEBUG + DOUT << "===== Instruction selection begins:\n"; + Indent = 0; + #endif + + // Select target instructions for the DAG. + SD.setRoot(SelectRoot(SD.getRoot())); + + #ifndef NDEBUG + DOUT << "===== Instruction selection ends:\n"; + #endif + + SD.RemoveDeadNodes(); + + // Emit machine code to BB. + ScheduleAndEmitDAG(SD); +} + +/// ComplexPattern used on MipsInstrInfo +/// Used on Mips Load/Store instructions +bool MipsDAGToDAGISel:: +SelectAddr(SDOperand Op, SDOperand Addr, SDOperand &Offset, SDOperand &Base) +{ + // if Address is FI, get the TargetFrameIndex. 
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + + // TargetExternalSymbol and TargetGlobalAddress are + // lowered and their addresses go into registers, so + // they should not be touched here. + if ((Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress)) + return false; + + // Operand is an result from an ADD. + if (Addr.getOpcode() == ISD::ADD) + { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) + { + if (Predicate_immSExt16(CN)) + { + // If the first operand is a FI, get the TargetFI Node + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode> + (Addr.getOperand(0))) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + } else { + Base = Addr.getOperand(0); + } + + Offset = CurDAG->getTargetConstant(CN->getValue(), MVT::i32); + return true; + } + } + } + + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +/// Select instructions not customized! Used for +/// expanded, promoted and normal instructions +SDNode* MipsDAGToDAGISel:: +Select(SDOperand N) +{ + SDNode *Node = N.Val; + unsigned Opcode = Node->getOpcode(); + + // Dump information about the Node being selected + #ifndef NDEBUG + DOUT << std::string(Indent, ' ') << "Selecting: "; + DEBUG(Node->dump(CurDAG)); + DOUT << "\n"; + Indent += 2; + #endif + + // If we have a custom node, we already have selected! 
+ if (Opcode >= ISD::BUILTIN_OP_END && Opcode < MipsISD::FIRST_NUMBER) { + #ifndef NDEBUG + DOUT << std::string(Indent-2, ' ') << "== "; + DEBUG(Node->dump(CurDAG)); + DOUT << "\n"; + Indent -= 2; + #endif + return NULL; + } + + /// + // Instruction Selection not handled by custom or by the + // auto-generated tablegen selection should be handled here + /// + switch(Opcode) { + + default: break; + + /// Special Mul operations + case ISD::MULHS: + case ISD::MULHU: { + SDOperand MulOp1 = Node->getOperand(0); + SDOperand MulOp2 = Node->getOperand(1); + AddToISelQueue(MulOp1); + AddToISelQueue(MulOp2); + + unsigned MulOp = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT); + SDNode *MulNode = CurDAG->getTargetNode(MulOp, MVT::Flag, MulOp1, MulOp2); + + SDOperand MFInFlag = SDOperand(MulNode, 0); + return CurDAG->getTargetNode(Mips::MFHI, MVT::i32, MFInFlag); + } + + /// Div operations + case ISD::SDIV: + case ISD::UDIV: { + SDOperand DivOp1 = Node->getOperand(0); + SDOperand DivOp2 = Node->getOperand(1); + AddToISelQueue(DivOp1); + AddToISelQueue(DivOp2); + + unsigned DivOp = (Opcode == ISD::SDIV ? Mips::DIV : Mips::DIVu); + SDNode *DivNode = CurDAG->getTargetNode(DivOp, MVT::Flag, DivOp1, DivOp2); + + SDOperand MFInFlag = SDOperand(DivNode, 0); + return CurDAG->getTargetNode(Mips::MFLO, MVT::i32, MFInFlag); + } + + /// Rem operations + case ISD::SREM: + case ISD::UREM: { + SDOperand RemOp1 = Node->getOperand(0); + SDOperand RemOp2 = Node->getOperand(1); + AddToISelQueue(RemOp1); + AddToISelQueue(RemOp2); + + unsigned RemOp = (Opcode == ISD::SREM ? 
Mips::DIV : Mips::DIVu); + SDNode *RemNode = CurDAG->getTargetNode(RemOp, MVT::Flag, RemOp1, RemOp2); + + SDOperand MFInFlag = SDOperand(RemNode, 0); + return CurDAG->getTargetNode(Mips::MFHI, MVT::i32, MFInFlag); + } + } + + // Select the default instruction + SDNode *ResNode = SelectCode(N); + + #ifndef NDEBUG + DOUT << std::string(Indent-2, ' ') << "=> "; + if (ResNode == NULL || ResNode == N.Val) + DEBUG(N.Val->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DOUT << "\n"; + Indent -= 2; + #endif + + return ResNode; +} + +/// createMipsISelDag - This pass converts a legalized DAG into a +/// MIPS-specific DAG, ready for instruction scheduling. +FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) { + return new MipsDAGToDAGISel(TM); +} diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp new file mode 100644 index 0000000..790cdaf --- /dev/null +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -0,0 +1,557 @@ +//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that Mips uses to lower LLVM code into a +// selection DAG. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-lower" + +#include "MipsISelLowering.h" +#include "MipsTargetMachine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SSARegMap.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/Debug.h" +#include <queue> +#include <set> + +using namespace llvm; + +const char *MipsTargetLowering:: +getTargetNodeName(unsigned Opcode) const +{ + switch (Opcode) + { + case MipsISD::JmpLink : return "MipsISD::JmpLink"; + case MipsISD::Hi : return "MipsISD::Hi"; + case MipsISD::Lo : return "MipsISD::Lo"; + case MipsISD::Ret : return "MipsISD::Ret"; + default : return NULL; + } +} + +MipsTargetLowering:: +MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM) +{ + // Mips does not have i1 type, so use i32 for + // setcc operations results (slt, sgt, ...). + setSetCCResultType(MVT::i32); + setSetCCResultContents(ZeroOrOneSetCCResult); + + // Set up the register classes + addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass); + + // Custom + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::RET, MVT::Other, Custom); + + // Load extended operations for i1 types must be promoted + setLoadXAction(ISD::EXTLOAD, MVT::i1, Promote); + setLoadXAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadXAction(ISD::SEXTLOAD, MVT::i1, Promote); + + // Store operations for i1 types must be promoted + setStoreXAction(MVT::i1, Promote); + + // Mips does not have these NodeTypes below.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + setOperationAction(ISD::SELECT, MVT::i32, Expand); + + // Mips not supported intrinsics. + setOperationAction(ISD::MEMMOVE, MVT::Other, Expand); + setOperationAction(ISD::MEMSET, MVT::Other, Expand); + setOperationAction(ISD::MEMCPY, MVT::Other, Expand); + + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::CTTZ , MVT::i32, Expand); + setOperationAction(ISD::CTLZ , MVT::i32, Expand); + setOperationAction(ISD::ROTL , MVT::i32, Expand); + setOperationAction(ISD::ROTR , MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); + + // We don't have line number support yet. 
+ setOperationAction(ISD::LOCATION, MVT::Other, Expand); + setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand); + setOperationAction(ISD::LABEL, MVT::Other, Expand); + + // Use the default for now + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + + setStackPointerRegisterToSaveRestore(Mips::SP); + computeRegisterProperties(); +} + + +SDOperand MipsTargetLowering:: +LowerOperation(SDOperand Op, SelectionDAG &DAG) +{ + switch (Op.getOpcode()) + { + case ISD::CALL: return LowerCALL(Op, DAG); + case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG); + case ISD::RET: return LowerRET(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + } + return SDOperand(); +} + +//===----------------------------------------------------------------------===// +// Lower helper functions +//===----------------------------------------------------------------------===// + +// AddLiveIn - This helper function adds the specified physical register to the +// MachineFunction as a live in value. It also creates a corresponding +// virtual register for it. +static unsigned +AddLiveIn(MachineFunction &MF, unsigned PReg, TargetRegisterClass *RC) +{ + assert(RC->contains(PReg) && "Not the correct regclass!"); + unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC); + MF.addLiveIn(PReg, VReg); + return VReg; +} + +// Set up a frame object for the return address. 
+//SDOperand MipsTargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { +// if (ReturnAddrIndex == 0) { +// MachineFunction &MF = DAG.getMachineFunction(); +// ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, 0); +// } +// +// return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); +//} + + +//===----------------------------------------------------------------------===// +// Misc Lower Operation implementation +//===----------------------------------------------------------------------===// +SDOperand MipsTargetLowering:: +LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) +{ + GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + + SDOperand GA = DAG.getTargetGlobalAddress(GV, MVT::i32); + SDOperand Hi = DAG.getNode(MipsISD::Hi, MVT::i32, GA); + SDOperand Lo = DAG.getNode(MipsISD::Lo, MVT::i32, GA); + + return DAG.getNode(ISD::ADD, MVT::i32, Lo, Hi); +} + +SDOperand MipsTargetLowering:: +LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) +{ + assert(0 && "TLS not implemented for MIPS."); +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +// +// The lower operations present on calling convention works on this order: +// LowerCALL (virt regs --> phys regs, virt regs --> stack) +// LowerFORMAL_ARGUMENTS (phys --> virt regs, stack --> virt regs) +// LowerRET (virt regs --> phys regs) +// LowerCALL (phys regs --> virt regs) +// +//===----------------------------------------------------------------------===// + +#include "MipsGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// CALL Calling Convention Implementation +//===----------------------------------------------------------------------===// + +/// Mips custom CALL implementation +SDOperand MipsTargetLowering:: +LowerCALL(SDOperand Op, SelectionDAG &DAG) +{ + unsigned CallingConv= cast<ConstantSDNode>(Op.getOperand(1))->getValue(); + + // By now, only 
CallingConv::C implemented + switch (CallingConv) + { + default: + assert(0 && "Unsupported calling convention"); + case CallingConv::Fast: + case CallingConv::C: + return LowerCCCCallTo(Op, DAG, CallingConv); + } +} + +/// LowerCCCCallTo - functions arguments are copied from virtual +/// regs to (physical regs)/(stack frame), CALLSEQ_START and +/// CALLSEQ_END are emitted. +/// TODO: isVarArg, isTailCall, sret, GOT, linkage types. +SDOperand MipsTargetLowering:: +LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC) +{ + MachineFunction &MF = DAG.getMachineFunction(); + unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF); + + SDOperand Chain = Op.getOperand(0); + SDOperand Callee = Op.getOperand(4); + bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; + + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); + + // To meet ABI, Mips must always allocate 16 bytes on + // the stack (even if less than 4 are used as arguments) + int VTsize = MVT::getSizeInBits(MVT::i32)/8; + MFI->CreateFixedObject(VTsize, -(VTsize*3)); + + CCInfo.AnalyzeCallOperands(Op.Val, CC_Mips); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, + getPointerTy())); + + SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass; + SmallVector<SDOperand, 8> MemOpChains; + + SDOperand StackPtr; + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + // Arguments start after the 5 first operands of ISD::CALL + SDOperand Arg = Op.getOperand(5+2*VA.getValNo()); + + // Promote the value if needed. 
+ switch (VA.getLocInfo()) { + default: assert(0 && "Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg); + break; + } + + // Arguments that can be passed on register, + // must be kept at RegsToPass vector + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + + assert(VA.isMemLoc()); + + if (StackPtr.Val == 0) + StackPtr = DAG.getRegister(StackReg, getPointerTy()); + + // Create the frame index object for this incoming parameter + // This guarantees that when allocating Local Area our room + // will not be overwritten. + int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8, + -(16 + VA.getLocMemOffset()) ); + + SDOperand PtrOff = DAG.getFrameIndex(FI,getPointerTy()); + + // emit ISD::STORE whichs stores the + // parameter value to a stack Location + MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0)); + } + } + + // Transform all store nodes into one single node because + // all store nodes are independent of each other. + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // Build a sequence of copy-to-reg nodes chained together with token + // chain and flag operands which copy the outgoing args into registers. + // The InFlag in necessary since all emited instructions must be + // stuck together. 
+ SDOperand InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // If the callee is a GlobalAddress node (quite common, every direct + // call is) turn it into a TargetGlobalAddress node so that legalize + // doesn't hack it. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy()); + } else + if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); + + // MipsJmpLink = #chain, #target_address, #opt_in_flags... + // = Chain, Callee, Reg#1, Reg#2, ... + // + // Returns a chain & a flag for retval copy to use. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SmallVector<SDOperand, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add argument registers to the end of the list so that they are + // known live into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + if (InFlag.Val) + Ops.push_back(InFlag); + + Chain = DAG.getNode(MipsISD::JmpLink, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + // Create the CALLSEQ_END node. + NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + Ops.clear(); + Ops.push_back(Chain); + Ops.push_back(DAG.getConstant(NumBytes, getPointerTy())); + Ops.push_back(InFlag); + Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo); +} + +/// LowerCallResult - Lower the result values of an ISD::CALL into the +/// appropriate copies out of appropriate physical registers. 
This assumes that +/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call +/// being lowered. Returns a SDNode with the same number of values as the +/// ISD::CALL. +SDNode *MipsTargetLowering:: +LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, + unsigned CallingConv, SelectionDAG &DAG) { + + bool isVarArg = cast<ConstantSDNode>(TheCall->getOperand(2))->getValue() != 0; + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs); + + CCInfo.AnalyzeCallResult(TheCall, RetCC_Mips); + SmallVector<SDOperand, 8> ResultVals; + + // Returns void + if (!RVLocs.size()) + return Chain.Val; + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(), + RVLocs[i].getValVT(), InFlag).getValue(1); + InFlag = Chain.getValue(2); + ResultVals.push_back(Chain.getValue(0)); + } + + // Merge everything together with a MERGE_VALUES node. + ResultVals.push_back(Chain); + return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(), + &ResultVals[0], ResultVals.size()).Val; +} + +//===----------------------------------------------------------------------===// +// FORMAL_ARGUMENTS Calling Convention Implementation +//===----------------------------------------------------------------------===// + +/// Mips custom FORMAL_ARGUMENTS implementation +SDOperand MipsTargetLowering:: +LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) +{ + unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue(); + switch(CC) + { + default: + assert(0 && "Unsupported calling convention"); + case CallingConv::C: + return LowerCCCArguments(Op, DAG); + } +} + +/// LowerCCCArguments - transform physical registers into +/// virtual registers and generate load operations for +/// arguments places on the stack. 
+/// TODO: isVarArg, sret +SDOperand MipsTargetLowering:: +LowerCCCArguments(SDOperand Op, SelectionDAG &DAG) +{ + SDOperand Root = Op.getOperand(0); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; + unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); + + unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF); + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); + + CCInfo.AnalyzeFormalArguments(Op.Val, CC_Mips); + SmallVector<SDOperand, 8> ArgValues; + SDOperand StackPtr; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + + CCValAssign &VA = ArgLocs[i]; + + // Arguments stored on registers + if (VA.isRegLoc()) { + MVT::ValueType RegVT = VA.getLocVT(); + TargetRegisterClass *RC; + + if (RegVT == MVT::i32) + RC = Mips::CPURegsRegisterClass; + else + assert(0 && "support only Mips::CPURegsRegisterClass"); + + + // Transform the arguments stored on + // physical registers into virtual ones + unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC); + SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT); + + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. 
+ if (VA.getLocInfo() == CCValAssign::SExt) + ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + + if (VA.getLocInfo() != CCValAssign::Full) + ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue); + + ArgValues.push_back(ArgValue); + + // To meet ABI, when VARARGS are passed on registers, the registers + // containt must be written to the their always reserved home location + // on the stack. + if (isVarArg) { + + if (StackPtr.Val == 0) + StackPtr = DAG.getRegister(StackReg, getPointerTy()); + + // Create the frame index object for this incoming parameter + // The first 16 bytes are reserved. + int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8, + i*4); + SDOperand PtrOff = DAG.getFrameIndex(FI, getPointerTy()); + + // emit ISD::STORE whichs stores the + // parameter value to a stack Location + ArgValues.push_back(DAG.getStore(Root, ArgValue, PtrOff, NULL, 0)); + } + + } else { + // sanity check + assert(VA.isMemLoc()); + + // Create the frame index object for this incoming parameter... + int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8, + (16 + VA.getLocMemOffset())); + + // Create load nodes to retrieve arguments from the stack + SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0)); + } + } + ArgValues.push_back(Root); + + // Return the new list of results. 
+ return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(), + &ArgValues[0], ArgValues.size()).getValue(Op.ResNo); +} + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention Implementation +//===----------------------------------------------------------------------===// + +SDOperand MipsTargetLowering:: +LowerRET(SDOperand Op, SelectionDAG &DAG) +{ + // CCValAssign - represents the assignment of + // the return value to a location + SmallVector<CCValAssign, 16> RVLocs; + unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); + bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); + + // CCState - Info about the registers and stack slot. + CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs); + + // Analyze return values of ISD::RET + CCInfo.AnalyzeReturn(Op.Val, RetCC_Mips); + + // If this is the first return lowered for this function, add + // the regs to the liveout set for the function. + if (DAG.getMachineFunction().liveout_empty()) { + for (unsigned i = 0; i != RVLocs.size(); ++i) + if (RVLocs[i].isRegLoc()) + DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg()); + } + + // The chain is always operand #0 + SDOperand Chain = Op.getOperand(0); + SDOperand Flag; + + // Copy the result values into the output registers. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + // ISD::RET => ret chain, (regnum1,val1), ...
+ // So i*2+1 index only the regnums + Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), + Op.getOperand(i*2+1), Flag); + + // guarantee that all emitted copies are + // stuck together, avoiding something bad + Flag = Chain.getValue(1); + } + + // Return on Mips is always a "jr $ra" + if (Flag.Val) + return DAG.getNode(MipsISD::Ret, MVT::Other, + Chain, DAG.getRegister(Mips::RA, MVT::i32), Flag); + else // Return Void + return DAG.getNode(MipsISD::Ret, MVT::Other, + Chain, DAG.getRegister(Mips::RA, MVT::i32)); +} diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h new file mode 100644 index 0000000..0199175 --- /dev/null +++ b/lib/Target/Mips/MipsISelLowering.h @@ -0,0 +1,84 @@ +//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that Mips uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef MipsISELLOWERING_H +#define MipsISELLOWERING_H + +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" +#include "Mips.h" +#include "MipsSubtarget.h" + +namespace llvm { + namespace MipsISD { + enum NodeType { + // Start the numbering from where ISD NodeType finishes. 
+ FIRST_NUMBER = ISD::BUILTIN_OP_END+Mips::INSTRUCTION_LIST_END, + + // Jump and link (call) + JmpLink, + + // Get the Higher 16 bits from a 32-bit immediate + // No relation with Mips Hi register + Hi, + + // Get the Lower 16 bits from a 32-bit immediate + // No relation with Mips Lo register + Lo, + + // Return + Ret + }; + } + + //===--------------------------------------------------------------------===// + // TargetLowering Implementation + //===--------------------------------------------------------------------===// + class MipsTargetLowering : public TargetLowering + { + // FrameIndex for return slot. + int ReturnAddrIndex; + + // const MipsSubtarget &MipsSubTarget; + public: + + MipsTargetLowering(MipsTargetMachine &TM); + + /// LowerOperation - Provide custom lowering hooks for some operations. + virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG); + + /// getTargetNodeName - This method returns the name of a target specific + // DAG node. + virtual const char *getTargetNodeName(unsigned Opcode) const; + + private: + // Lower Operand helpers + SDOperand LowerCCCArguments(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC); + SDNode *LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode*TheCall, + unsigned CallingConv, SelectionDAG &DAG); + SDOperand getReturnAddressFrameIndex(SelectionDAG &DAG); + + // Lower Operand specifics + SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG); + + }; +} + +#endif // MipsISELLOWERING_H diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td new file mode 100644 index 0000000..b88fa90 --- /dev/null +++ 
b/lib/Target/Mips/MipsInstrFormats.td @@ -0,0 +1,96 @@ +//===- MipsRegisterInfo.td - Mips Register defs -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe MIPS instructions format +// +// All the possible Mips fields are: +// +// opcode - operation code. +// rs - src reg. +// rt - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr). +// rd - dst reg, only used on 3 regs instr. +// shamt - only used on shift instructions, contains the shift amount. +// funct - combined with opcode field give us an operation code. +// +//===----------------------------------------------------------------------===// + +// Generic Mips Format +class MipsInst<dag ops, string asmstr, list<dag> pattern>: + Instruction +{ + field bits<32> Inst; + + let Namespace = "Mips"; + + bits<6> opcode; + + // Top 5 bits are the 'opcode' field + let Inst{31-26} = opcode; + + dag OperandList = ops; + let AsmString = asmstr; + let Pattern = pattern; +} + + +//===----------------------------------------------------------------------===// +// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|> +//===----------------------------------------------------------------------===// + +class FR<bits<6> op, bits<6> _funct, dag ops, string asmstr, list<dag> pattern>: + MipsInst<ops, asmstr, pattern> +{ + bits<5> rd; + bits<5> rs; + bits<5> rt; + bits<5> shamt; + bits<6> funct; + + let opcode = op; + let funct = _funct; + + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-6} = shamt; + let Inst{5-0} = funct; +} + +//===----------------------------------------------------------------------===// +// 
Format I instruction class in Mips : <|opcode|rs|rt|immediate|> +//===----------------------------------------------------------------------===// + +class FI<bits<6> op, dag ops, string asmstr, list<dag> pattern>: + MipsInst<ops, asmstr, pattern> +{ + bits<5> rt; + bits<5> rs; + bits<16> imm16; + + let opcode = op; + + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-0} = imm16; +} + +//===----------------------------------------------------------------------===// +// Format J instruction class in Mips : <|opcode|address|> +//===----------------------------------------------------------------------===// + +class FJ<bits<6> op, dag ops, string asmstr, list<dag> pattern>: + MipsInst<ops, asmstr, pattern> +{ + bits<26> addr; + + let opcode = op; + + let Inst{25-0} = addr; +} diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp new file mode 100644 index 0000000..8084030 --- /dev/null +++ b/lib/Target/Mips/MipsInstrInfo.cpp @@ -0,0 +1,114 @@ +//===- MipsInstrInfo.cpp - Mips Instruction Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Mips implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "Mips.h" +#include "MipsInstrInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "MipsGenInstrInfo.inc" + +using namespace llvm; + +// TODO: Add the subtarget support on this constructor +MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm) + : TargetInstrInfo(MipsInsts, sizeof(MipsInsts)/sizeof(MipsInsts[0])), + TM(tm), RI(*this) {} + +static bool isZeroImm(const MachineOperand &op) { + return op.isImmediate() && op.getImmedValue() == 0; +} + +/// Return true if the instruction is a register to register move and +/// leave the source and dest operands in the passed parameters. +bool MipsInstrInfo:: +isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg) const +{ + // addu $dst, $src, $zero || addu $dst, $zero, $src + // or $dst, $src, $zero || or $dst, $zero, $src + if ((MI.getOpcode() == Mips::ADDu) || (MI.getOpcode() == Mips::OR)) + { + if (MI.getOperand(1).getReg() == Mips::ZERO) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(2).getReg(); + return true; + } else if (MI.getOperand(2).getReg() == Mips::ZERO) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); + return true; + } + } + + // addiu $dst, $src, 0 + if (MI.getOpcode() == Mips::ADDiu) + { + if ((MI.getOperand(1).isRegister()) && (isZeroImm(MI.getOperand(2)))) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); + return true; + } + } + return false; +} + +/// isLoadFromStackSlot - If the specified machine instruction is a direct +/// load from a stack slot, return the virtual or physical register number of +/// the destination along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than loading from the stack slot. 
+unsigned MipsInstrInfo:: +isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const +{ + // TODO: add lhu, lbu ??? + if (MI->getOpcode() == Mips::LW) + { + if ((MI->getOperand(2).isFrameIndex()) && // is a stack slot + (MI->getOperand(1).isImmediate()) && // the imm is zero + (isZeroImm(MI->getOperand(1)))) + { + FrameIndex = MI->getOperand(2).getFrameIndex(); + return MI->getOperand(0).getReg(); + } + } + + return 0; +} + +/// isStoreToStackSlot - If the specified machine instruction is a direct +/// store to a stack slot, return the virtual or physical register number of +/// the source reg along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than storing to the stack slot. +unsigned MipsInstrInfo:: +isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const +{ + // TODO: add sb, sh ??? + if (MI->getOpcode() == Mips::SW) { + if ((MI->getOperand(0).isFrameIndex()) && // is a stack slot + (MI->getOperand(1).isImmediate()) && // the imm is zero + (isZeroImm(MI->getOperand(1)))) + { + FrameIndex = MI->getOperand(0).getFrameIndex(); + return MI->getOperand(2).getReg(); + } + } + return 0; +} + +unsigned MipsInstrInfo:: +InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, const std::vector<MachineOperand> &Cond) + const +{ + // TODO: add Mips::J here. + assert(0 && "Cant handle any kind of branches!"); + return 1; +} diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h new file mode 100644 index 0000000..356cf3d --- /dev/null +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -0,0 +1,63 @@ +//===- MipsInstrInfo.h - Mips Instruction Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the Mips implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPSINSTRUCTIONINFO_H +#define MIPSINSTRUCTIONINFO_H + +#include "Mips.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "MipsRegisterInfo.h" + +namespace llvm { + +class MipsInstrInfo : public TargetInstrInfo +{ + MipsTargetMachine &TM; + const MipsRegisterInfo RI; +public: + MipsInstrInfo(MipsTargetMachine &TM); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const MRegisterInfo &getRegisterInfo() const { return RI; } + + /// Return true if the instruction is a register to register move and + /// leave the source and dest operands in the passed parameters. + /// + virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg) const; + + /// isLoadFromStackSlot - If the specified machine instruction is a direct + /// load from a stack slot, return the virtual or physical register number of + /// the destination along with the FrameIndex of the loaded stack slot. If + /// not, return 0. This predicate must return 0 if the instruction has + /// any side effects other than loading from the stack slot. + virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const; + + /// isStoreToStackSlot - If the specified machine instruction is a direct + /// store to a stack slot, return the virtual or physical register number of + /// the source reg along with the FrameIndex of the loaded stack slot. If + /// not, return 0. This predicate must return 0 if the instruction has + /// any side effects other than storing to the stack slot. 
+ virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const; + + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond) const; +}; + +} + +#endif diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td new file mode 100644 index 0000000..1f5d152 --- /dev/null +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -0,0 +1,469 @@ +//===- MipsInstrInfo.td - Mips Register defs --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// + +include "MipsInstrFormats.td" + +//===----------------------------------------------------------------------===// +// Mips profiles and nodes +//===----------------------------------------------------------------------===// + +// Call +def SDT_MipsJmpLink : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; +def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, [SDNPHasChain, + SDNPOutFlag]>; + +// Hi and Lo nodes are created to let easy manipulation of 16-bit when +// handling 32-bit immediates. They are used on MipsISelLowering to +// lower stuff like GlobalAddress, ExternalSymbol, ... +// This two nodes have nothing to do with Mips Registers Hi and Lo. 
+def MipsHi : SDNode<"MipsISD::Hi", SDTIntUnaryOp>; +def MipsLo : SDNode<"MipsISD::Lo", SDTIntUnaryOp>; + +// Return +def SDT_MipsRet : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain, + SDNPOptInFlag]>; + +// These are target-independent nodes, but have target-specific formats. +def SDT_MipsCallSeq : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeq, + [SDNPHasChain, SDNPOutFlag]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeq, + [SDNPHasChain, SDNPOutFlag]>; + +// Instruction operand types +def brtarget : Operand<OtherVT>; +def calltarget : Operand<i32>; +def uimm16 : Operand<i32>; +def simm16 : Operand<i32>; +def shamt : Operand<i32>; + +// Address operand +def mem : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops simm16, CPURegs); +} + +//===----------------------------------------------------------------------===// +// Mips Patterns and Transformations +//===----------------------------------------------------------------------===// + +// Transformation Function - get the lower 16 bits. +def LO16 : SDNodeXForm<imm, [{ + return getI32Imm((unsigned)N->getValue() & 0xFFFF); +}]>; + +// Transformation Function - get the higher 16 bits. +def HI16 : SDNodeXForm<imm, [{ + return getI32Imm((unsigned)N->getValue() >> 16); +}]>; + +// Node immediate fits as 16-bit sign extended on target immediate. +// e.g. addi, andi +def immSExt16 : PatLeaf<(imm), [{ + if (N->getValueType(0) == MVT::i32) + return (int32_t)N->getValue() == (short)N->getValue(); + else + return (int64_t)N->getValue() == (short)N->getValue(); +}]>; + +// Node immediate fits as 16-bit zero extended on target immediate. +// The LO16 param means that only the lower 16 bits of the node +// immediate are caught. +// e.g. 
addiu, sltiu +def immZExt16 : PatLeaf<(imm), [{ + if (N->getValueType(0) == MVT::i32) + return (uint32_t)N->getValue() == (unsigned short)N->getValue(); + else + return (uint64_t)N->getValue() == (unsigned short)N->getValue(); +}], LO16>; + +// shamt field must fit in 5 bits. +def immZExt5 : PatLeaf<(imm), [{ + return N->getValue() == ((N->getValue()) & 0x1f) ; +}]>; + +// Mips Address Mode! SDNode frameindex could possibily be a match +// since load and store instructions from stack used it. +def addr : ComplexPattern<i32, 2, "SelectAddr", [frameindex], []>; + +//===----------------------------------------------------------------------===// +// Instructions specific format +//===----------------------------------------------------------------------===// + +// Arithmetic 3 register operands +let isCommutable = 1 in +class ArithR< bits<6> op, bits<6> func, string instr_asm, SDNode OpNode>: + FR< op, + func, + (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))] >; + +let isCommutable = 1 in +class ArithOverflowR< bits<6> op, bits<6> func, string instr_asm>: + FR< op, + func, + (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + []>; + +// Arithmetic 2 register operands +let isCommutable = 1 in +class ArithI<bits<6> op, string instr_asm, SDNode OpNode, + Operand Od, PatLeaf imm_type> : + FI< op, + (ops CPURegs:$dst, CPURegs:$b, Od:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (OpNode CPURegs:$b, imm_type:$c))] >; + +// Arithmetic Multiply ADD/SUB +let rd=0 in +class MArithR<bits<6> func, string instr_asm> : + FR< 0x1c, + func, + (ops CPURegs:$rs, CPURegs:$rt), + !strconcat(instr_asm, " $rs, $rt"), + []>; + +// Logical +class LogicR<bits<6> func, string instr_asm, SDNode OpNode>: + FR< 0x00, + func, + (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (OpNode 
CPURegs:$b, CPURegs:$c))] >; + +class LogicI<bits<6> op, string instr_asm, SDNode OpNode>: + FI< op, + (ops CPURegs:$dst, CPURegs:$b, uimm16:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (OpNode CPURegs:$b, immSExt16:$c))]>; + +class LogicNOR<bits<6> op, bits<6> func, string instr_asm>: + FR< op, + func, + (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))] >; + +// Shifts +let rt = 0 in +class LogicR_shift_imm<bits<6> func, string instr_asm, SDNode OpNode>: + FR< 0x00, + func, + (ops CPURegs:$dst, CPURegs:$b, shamt:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt5:$c))] >; + +class LogicR_shift_reg<bits<6> func, string instr_asm, SDNode OpNode>: + FR< 0x00, + func, + (ops CPURegs:$dst, CPURegs:$b, CPURegs:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))] >; + +// Load Upper Imediate +class LoadUpper<bits<6> op, string instr_asm>: + FI< op, + (ops CPURegs:$dst, uimm16:$imm), + !strconcat(instr_asm, " $dst, $imm"), + []>; + +// Memory Load/Store +let isLoad = 1 in +class LoadM<bits<6> op, string instr_asm, PatFrag OpNode>: + FI< op, + (ops CPURegs:$dst, mem:$addr), + !strconcat(instr_asm, " $dst, $addr"), + [(set CPURegs:$dst, (OpNode addr:$addr))]>; + +let isStore = 1 in +class StoreM<bits<6> op, string instr_asm, PatFrag OpNode>: + FI< op, + (ops CPURegs:$dst, mem:$addr), + !strconcat(instr_asm, " $dst, $addr"), + [(OpNode CPURegs:$dst, addr:$addr)]>; + +// Conditional Branch +let isBranch = 1, noResults=1, isTerminator=1 in +class CBranch<bits<6> op, string instr_asm, PatFrag cond_op>: + FI< op, + (ops CPURegs:$a, CPURegs:$b, brtarget:$offset), + !strconcat(instr_asm, " $a, $b, $offset"), + [(brcond (cond_op CPURegs:$a, CPURegs:$b), bb:$offset)]>; + +class SetCC_R<bits<6> op, bits<6> func, string instr_asm, + PatFrag cond_op>: + FR< op, + func, + (ops 
CPURegs:$dst, CPURegs:$b, CPURegs:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (cond_op CPURegs:$b, CPURegs:$c))]>; + +class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op, + Operand Od, PatLeaf imm_type>: + FI< op, + (ops CPURegs:$dst, CPURegs:$b, Od:$c), + !strconcat(instr_asm, " $dst, $b, $c"), + [(set CPURegs:$dst, (cond_op CPURegs:$b, imm_type:$c))]>; + +// Unconditional branch +let hasCtrlDep=1, noResults=1, isTerminator=1 in +class JumpFJ<bits<6> op, string instr_asm>: + FJ< op, + (ops brtarget:$target), + !strconcat(instr_asm, " $target"), + [(br bb:$target)]>; + +let hasCtrlDep=1, noResults=1, isTerminator=1, rd=0 in +class JumpFR<bits<6> op, bits<6> func, string instr_asm>: + FR< op, + func, + (ops CPURegs:$target), + !strconcat(instr_asm, " $target"), + []>; + +// Jump and Link (Call) +let isCall=1 in +class JumpLink<bits<6> op, string instr_asm>: + FJ< op, + (ops calltarget:$target), + !strconcat(instr_asm, " $target"), + [(MipsJmpLink imm:$target)]>; + +let isCall=1 in +class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm>: + FR< op, + func, + (ops CPURegs:$rd, CPURegs:$rs), + !strconcat(instr_asm, " $rs, $rd"), + []>; + +// Mul, Div +class MulDiv<bits<6> func, string instr_asm>: + FR< 0x00, + func, + (ops CPURegs:$a, CPURegs:$b), + !strconcat(instr_asm, " $a, $b"), + []>; + +// Move from Hi/Lo +class MoveFromTo<bits<6> func, string instr_asm>: + FR< 0x00, + func, + (ops CPURegs:$dst), + !strconcat(instr_asm, " $dst"), + []>; + +// Count Leading Ones/Zeros in Word +class CountLeading<bits<6> func, string instr_asm>: + FR< 0x1c, + func, + (ops CPURegs:$dst, CPURegs:$src), + !strconcat(instr_asm, " $dst, $src"), + []>; + + +//===----------------------------------------------------------------------===// +// Pseudo instructions +//===----------------------------------------------------------------------===// + +class Pseudo<dag ops, string asmstr, list<dag> pattern>: + MipsInst<ops, asmstr, pattern>; + +// As stack 
alignment is always done with addiu, we need a 16-bit immediate +def ADJCALLSTACKDOWN : Pseudo<(ops uimm16:$amt), + "!ADJCALLSTACKDOWN $amt", + [(callseq_start imm:$amt)]>, Imp<[SP],[SP]>; +def ADJCALLSTACKUP : Pseudo<(ops uimm16:$amt), + "!ADJCALLSTACKUP $amt", + [(callseq_end imm:$amt)]>, Imp<[SP],[SP]>; + +def IMPLICIT_DEF_CPURegs : Pseudo<(ops CPURegs:$dst), + "!IMPLICIT_DEF $dst", + [(set CPURegs:$dst, (undef))]>; + +//===----------------------------------------------------------------------===// +// Instruction definition +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Mips32 I +//===----------------------------------------------------------------------===// + +// Arithmetic +def ADDiu : ArithI<0x09, "addiu", add, uimm16, immSExt16>; +def ADDi : ArithI<0x08, "addi", add, simm16, immZExt16>; +def MUL : ArithR<0x1c, 0x02, "mul", mul>; +def ADDu : ArithR<0x00, 0x21, "addu", add>; +def SUBu : ArithR<0x00, 0x23, "subu", sub>; +def ADD : ArithOverflowR<0x00, 0x20, "add">; +def SUB : ArithOverflowR<0x00, 0x22, "sub">; +def MADD : MArithR<0x00, "madd">; +def MADDU : MArithR<0x01, "maddu">; +def MSUB : MArithR<0x04, "msub">; +def MSUBU : MArithR<0x05, "msubu">; + +// Logical +def AND : LogicR<0x24, "and", and>; +def OR : LogicR<0x25, "or", or>; +def XOR : LogicR<0x26, "xor", xor>; +def ANDi : LogicI<0x0c, "andi", and>; +def ORi : LogicI<0x0d, "ori", or>; +def XORi : LogicI<0x0e, "xori", xor>; +def NOR : LogicNOR<0x00, 0x27, "nor">; + +// Shifts +def SLL : LogicR_shift_imm<0x00, "sll", shl>; +def SRL : LogicR_shift_imm<0x02, "srl", srl>; +def SRA : LogicR_shift_imm<0x03, "sra", sra>; +def SLLV : LogicR_shift_reg<0x04, "sllv", shl>; +def SRLV : LogicR_shift_reg<0x06, "srlv", srl>; +def SRAV : LogicR_shift_reg<0x07, "srav", sra>; + +// Load Upper Immediate +def LUi : LoadUpper<0x0f, "lui">; + +// Load/Store +def LB : LoadM<0x20, "lb", sextloadi8>; 
+def LBu : LoadM<0x24, "lbu", zextloadi8>; +def LH : LoadM<0x21, "lh", sextloadi16>; +def LHu : LoadM<0x25, "lhu", zextloadi16>; +def LW : LoadM<0x23, "lw", load>; +def SB : StoreM<0x28, "sb", truncstorei8>; +def SH : StoreM<0x29, "sh", truncstorei16>; +def SW : StoreM<0x2b, "sw", store>; + +// Conditional Branch +def BEQ : CBranch<0x04, "beq", seteq>; +def BNE : CBranch<0x05, "bne", setne>; +def SLT : SetCC_R<0x00, 0x2a, "slt", setlt>; +def SLTu : SetCC_R<0x00, 0x2b, "sltu", setult>; +def SLTi : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16>; +def SLTiu : SetCC_I<0x0b, "sltiu", setult, uimm16, immZExt16>; + +// Unconditional jump +def J : JumpFJ<0x02, "j">; +def JR : JumpFR<0x00, 0x08, "jr">; + +// Jump and Link (Call) +def JAL : JumpLink<0x03, "jal">; +def JALR : JumpLinkReg<0x00, 0x09, "jalr">; + +// MulDiv and Move From Hi/Lo operations, have +// their correpondent SDNodes created on ISelDAG. +// Special Mul, Div operations +def MULT : MulDiv<0x18, "mult">; +def MULTu : MulDiv<0x19, "multu">; +def DIV : MulDiv<0x1a, "div">; +def DIVu : MulDiv<0x1b, "divu">; + +// Move From Hi/Lo +def MFHI : MoveFromTo<0x10, "mfhi">; +def MFLO : MoveFromTo<0x12, "mflo">; +def MTHI : MoveFromTo<0x11, "mthi">; +def MTLO : MoveFromTo<0x13, "mtlo">; + +// Count Leading +def CLO : CountLeading<0x21, "clo">; +def CLZ : CountLeading<0x20, "clz">; + +// No operation +let addr=0 in +def NOOP : FJ<0, (ops), "nop", []>; + +// Ret instruction - as mips does not have "ret" a +// jr $ra must be generated. 
+let isReturn=1, isTerminator=1, hasDelaySlot=1, noResults=1, + isBarrier=1, hasCtrlDep=1, rs=0, rt=0, shamt=0 in +{ + def RET : FR <0x00, 0x02, (ops CPURegs:$target), + "jr $target", [(MipsRet CPURegs:$target)]>; +} + +//===----------------------------------------------------------------------===// +// Arbitrary patterns that map to one or more instructions +//===----------------------------------------------------------------------===// + +// Small immediates +def : Pat<(i32 immSExt16:$in), + (ADDiu ZERO, imm:$in)>; +def : Pat<(i32 immZExt16:$in), + (ORi ZERO, imm:$in)>; + +// Arbitrary immediates +def : Pat<(i32 imm:$imm), + (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>; + +// Call +def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)), + (JAL tglobaladdr:$dst)>; +def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), + (JAL texternalsym:$dst)>; + +// GlobalAddress, Constant Pool, ExternalSymbol, and JumpTable +def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; +def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>; + +// When extracting the address from GlobalAddress we +// need something of the form "addiu $reg, %lo(addr)" +def : Pat<(add CPURegs:$a, (MipsLo tglobaladdr:$in)), + (ADDiu CPURegs:$a, tglobaladdr:$in)>; + +// Mips does not have not, so we increase the operation +def : Pat<(not CPURegs:$in), + (NOR CPURegs:$in, CPURegs:$in)>; + +// extended load and stores +def : Pat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>; +def : Pat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>; +def : Pat<(truncstorei1 CPURegs:$src, addr:$addr), + (SB CPURegs:$src, addr:$src)>; + +def : Pat<(brcond (setne CPURegs:$lhs, (add ZERO, 0)), bb:$dst), + (BNE CPURegs:$lhs, ZERO, bb:$dst)>; + + +// Conditional branch patterns. +// cond branches patterns, 2 register operands signed. 
+def : Pat<(brcond (setlt CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BNE (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setle CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BEQ (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setgt CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BNE (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setge CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BEQ (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>; + +// cond branches patterns, 2 register operands unsigned. +def : Pat<(brcond (setult CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BNE (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setule CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BEQ (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setugt CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BNE (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setuge CPURegs:$lhs, CPURegs:$rhs), bb:$dst), + (BEQ (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>; + +// cond branches patterns, reg/imm operands signed. +def : Pat<(brcond (setult CPURegs:$lhs, immSExt16:$rhs), bb:$dst), + (BNE (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setuge CPURegs:$lhs, immSExt16:$rhs), bb:$dst), + (BEQ (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>; + +// cond branches patterns, reg/imm operands unsigned. 
+def : Pat<(brcond (setult CPURegs:$lhs, immZExt16:$rhs), bb:$dst), + (BNE (SLTiu CPURegs:$lhs, immZExt16:$rhs), ZERO, bb:$dst)>; +def : Pat<(brcond (setuge CPURegs:$lhs, immZExt16:$rhs), bb:$dst), + (BEQ (SLTiu CPURegs:$lhs, immZExt16:$rhs), ZERO, bb:$dst)>; diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h new file mode 100644 index 0000000..b362dc3 --- /dev/null +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -0,0 +1,54 @@ +//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Mips specific subclass of MachineFunctionInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPS_MACHINE_FUNCTION_INFO_H +#define MIPS_MACHINE_FUNCTION_INFO_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// MipsFunctionInfo - This class is derived from MachineFunction private +/// Mips target-specific information for each MachineFunction. +class MipsFunctionInfo : public MachineFunctionInfo { + +private: + /// Holds for each function where on the stack + /// the Frame Pointer must be saved + int FPStackOffset; + + /// Holds for each function where on the stack + /// the Return Address must be saved + int RAStackOffset; + +public: + MipsFunctionInfo(MachineFunction& MF) + : FPStackOffset(0), RAStackOffset(0) + {} + + int getFPStackOffset() const { return FPStackOffset; } + void setFPStackOffset(int Off) { FPStackOffset = Off; } + + int getRAStackOffset() const { return RAStackOffset; } + void setRAStackOffset(int Off) { RAStackOffset = Off; } + + int getTopSavedRegOffset() const { + return (RAStackOffset > FPStackOffset) ? 
+ (RAStackOffset) : (FPStackOffset); + } +}; + +} // end of namespace llvm + + +#endif diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp new file mode 100644 index 0000000..c7a87ca --- /dev/null +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -0,0 +1,422 @@ +//===- MipsRegisterInfo.cpp - MIPS Register Information -== -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the MIPS implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-reg-info" + +#include "Mips.h" +#include "MipsRegisterInfo.h" +#include "MipsMachineFunction.h" +#include "llvm/Constants.h" +#include "llvm/Type.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +//#include "MipsSubtarget.h" + +using namespace llvm; + +// TODO: add subtarget support +MipsRegisterInfo::MipsRegisterInfo(const TargetInstrInfo &tii) + : MipsGenRegisterInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), + TII(tii) {} + +void MipsRegisterInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, int FI, + const TargetRegisterClass *RC) const +{ + if (RC == 
Mips::CPURegsRegisterClass) + BuildMI(MBB, I, TII.get(Mips::SW)).addReg(SrcReg, false, false, true) + .addImm(0).addFrameIndex(FI); + else + assert(0 && "Can't store this register to stack slot"); +} + +void MipsRegisterInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC) const +{ + if (RC == Mips::CPURegsRegisterClass) + BuildMI(MBB, I, TII.get(Mips::LW), DestReg).addImm(0).addFrameIndex(FI); + else + assert(0 && "Can't load this register from stack slot"); +} + +void MipsRegisterInfo:: +copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const +{ + if (RC == Mips::CPURegsRegisterClass) + BuildMI(MBB, I, TII.get(Mips::ADDu), DestReg).addReg(Mips::ZERO) + .addReg(SrcReg); + else + assert (0 && "Can't copy this register"); +} + +void MipsRegisterInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, + const MachineInstr *Orig) const +{ + MachineInstr *MI = Orig->clone(); + MI->getOperand(0).setReg(DestReg); + MBB.insert(I, MI); +} + +MachineInstr *MipsRegisterInfo:: +foldMemoryOperand(MachineInstr* MI, unsigned OpNum, int FI) const +{ + MachineInstr *NewMI = NULL; + + switch (MI->getOpcode()) + { + case Mips::ADDu: + if ((MI->getOperand(0).isRegister()) && + (MI->getOperand(1).isRegister()) && + (MI->getOperand(1).getReg() == Mips::ZERO) && + (MI->getOperand(2).isRegister())) + { + if (OpNum == 0) // COPY -> STORE + NewMI = BuildMI(TII.get(Mips::SW)).addFrameIndex(FI) + .addImm(0).addReg(MI->getOperand(2).getReg()); + else // COPY -> LOAD + NewMI = BuildMI(TII.get(Mips::LW), MI->getOperand(0) + .getReg()).addImm(0).addFrameIndex(FI); + } + break; + } + + if (NewMI) + NewMI->copyKillDeadInfo(MI); + return NewMI; +} + +/// Mips Callee Saved Registers +const unsigned* MipsRegisterInfo:: +getCalleeSavedRegs() const +{ + // Mips calle-save register range 
is $16-$23 (s0-s7).
  static const unsigned CalleeSavedRegs[] = {
    Mips::S0, Mips::S1, Mips::S2, Mips::S3,
    Mips::S4, Mips::S5, Mips::S6, Mips::S7, 0
  };
  return CalleeSavedRegs;
}

/// Mips Callee Saved Register Classes - one entry per register in
/// CalleeSavedRegs above (all are 32-bit CPU GPRs), null-terminated.
const TargetRegisterClass* const*
MipsRegisterInfo::getCalleeSavedRegClasses() const
{
  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, 0
  };
  return CalleeSavedRegClasses;
}

// getReservedRegs - Registers the allocator must never hand out:
// $zero, $at (assembler temporary), $k0/$k1 (kernel), $gp, $sp, $fp, $ra.
BitVector MipsRegisterInfo::
getReservedRegs(const MachineFunction &MF) const
{
  BitVector Reserved(getNumRegs());
  Reserved.set(Mips::ZERO);
  Reserved.set(Mips::AT);
  Reserved.set(Mips::K0);
  Reserved.set(Mips::K1);
  Reserved.set(Mips::GP);
  Reserved.set(Mips::SP);
  Reserved.set(Mips::FP);
  Reserved.set(Mips::RA);
  return Reserved;
}

//===----------------------------------------------------------------------===//
//
// Stack Frame Processing methods
// +----------------------------+
//
// To meet the ABI, we construct the frame on the reverse
// of natural order.
//
// The LLVM Frame will look like this:
//
// As the stack grows down, we start at 0, and the reference
// is decremented.
//
//  0          ----------
// -4          Args to pass
//  .          saved "Callee Saved" Registers
//  .          Local Area
//  .          saved FP
//  .          saved RA
// -StackSize  -----------
//
// On EliminateFrameIndex we just negate the address above
// and we get the stack frame required by the ABI, which is:
//
// sp + stacksize  -------------
//                 saved $RA    (only on non-leaf functions)
//                 saved $FP    (only with frame pointer)
//                 saved "Callee Saved" Registers
//                 Local Area
//                 saved $GP    (used in PIC - not supported yet)
//                 Args to pass area
// sp              -------------
//
// The sp is the stack pointer subtracted/added from the stack size
// at the Prologue/Epilogue
//
// References to the previous stack (to obtain arguments) are done
// with fixed location stack frames using positive stack offsets.
//
// Examples:
// - reference to the actual stack frame
//   for any local area var there is something like : FI >= 0, StackOffset: -4
//     sw REGX, 4(REGY)
//
// - reference to previous stack frame
//   suppose there's a store to the 5th argument : FI < 0, StackOffset: 16.
//   The emitted instruction will be something like:
//     sw REGX, 16+StackSize (REGY)
//
//===----------------------------------------------------------------------===//

// hasFP - Return true if the specified function should have a dedicated frame
// pointer register. This is true if the function has variable sized allocas or
// if frame pointer elimination is disabled.
bool MipsRegisterInfo::
hasFP(const MachineFunction &MF) const {
  return (NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects());
}

// This function eliminates the ADJCALLSTACKDOWN and
// ADJCALLSTACKUP pseudo instructions.
void MipsRegisterInfo::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const {
  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
  MBB.erase(I);
}

// A FrameIndex represents an object inside an abstract stack.
// We must replace FrameIndex with a stack/frame pointer
// direct reference.
+void MipsRegisterInfo:: +eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + RegScavenger *RS) const +{ + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + + unsigned i = 0; + while (!MI.getOperand(i).isFrameIndex()) { + ++i; + assert(i < MI.getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + + int FrameIndex = MI.getOperand(i).getFrameIndex(); + int stackSize = MF.getFrameInfo()->getStackSize(); + int spOffset = MF.getFrameInfo()->getObjectOffset(FrameIndex); + + #ifndef NDEBUG + DOUT << "\nFunction : " << MF.getFunction()->getName() << "\n"; + DOUT << "<--------->\n"; + MI.print(DOUT); + DOUT << "FrameIndex : " << FrameIndex << "\n"; + DOUT << "spOffset : " << spOffset << "\n"; + DOUT << "stackSize : " << stackSize << "\n"; + #endif + + int Offset = ( (spOffset >= 0) ? (stackSize + spOffset) : (-spOffset)); + + #ifndef NDEBUG + DOUT << "Offset : " << Offset << "\n"; + DOUT << "<--------->\n"; + #endif + + MI.getOperand(i-1).ChangeToImmediate(Offset); + MI.getOperand(i).ChangeToRegister(getFrameRegister(MF),false); +} + +void MipsRegisterInfo:: +emitPrologue(MachineFunction &MF) const +{ + MachineBasicBlock &MBB = MF.front(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + + // Get the number of bytes to allocate from the FrameInfo + int NumBytes = (int) MFI->getStackSize(); + + #ifndef NDEBUG + DOUT << "\n<--- EMIT PROLOGUE --->"; + DOUT << "Stack size :" << NumBytes << "\n"; + #endif + + // Do we need to allocate space on the stack? + if (NumBytes == 0) return; + + int FPOffset, RAOffset; + + // Always allocate space for saved RA and FP, + // even if FramePointer is not used. When not + // using FP, the last stack slot becomes empty + // and RA is saved before it. 
+ if ((hasFP(MF)) && (MFI->hasCalls())) { + FPOffset = NumBytes; + RAOffset = (NumBytes+4); + } else if ((!hasFP(MF)) && (MFI->hasCalls())) { + FPOffset = 0; + RAOffset = NumBytes; + } else if ((hasFP(MF)) && (!MFI->hasCalls())) { + FPOffset = NumBytes; + RAOffset = 0; + } + + MFI->setObjectOffset(MFI->CreateStackObject(4,4), -FPOffset); + MFI->setObjectOffset(MFI->CreateStackObject(4,4), -RAOffset); + MipsFI->setFPStackOffset(FPOffset); + MipsFI->setRAStackOffset(RAOffset); + + #ifndef NDEBUG + DOUT << "FPOffset :" << FPOffset << "\n"; + DOUT << "RAOffset :" << RAOffset << "\n"; + #endif + + // Align stack. + NumBytes += 8; + unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + NumBytes = ((NumBytes+Align-1)/Align*Align); + + #ifndef NDEBUG + DOUT << "New stack size :" << NumBytes << "\n\n"; + #endif + + // Update frame info + MFI->setStackSize(NumBytes); + + // Adjust stack : addi sp, sp, (-imm) + BuildMI(MBB, MBBI, TII.get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP).addImm(-NumBytes); + + // Save the return address only if the function isnt a leaf one. + // sw $ra, stack_loc($sp) + if (MFI->hasCalls()) { + BuildMI(MBB, MBBI, TII.get(Mips::SW)) + .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP); + } + + // if framepointer enabled, save it and set it + // to point to the stack pointer + if (hasFP(MF)) { + // sw $fp,stack_loc($sp) + BuildMI(MBB, MBBI, TII.get(Mips::SW)) + .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP); + + // move $fp, $sp + BuildMI(MBB, MBBI, TII.get(Mips::ADDu), Mips::FP) + .addReg(Mips::SP).addReg(Mips::ZERO); + } +} + +void MipsRegisterInfo:: +emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const +{ + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + + // Get the number of bytes from FrameInfo + int NumBytes = (int) MFI->getStackSize(); + + // Get the FI's where RA and FP are saved. 
+ int FPOffset = MipsFI->getFPStackOffset(); + int RAOffset = MipsFI->getRAStackOffset(); + + #ifndef NDEBUG + DOUT << "\n<--- EMIT EPILOGUE --->" << "\n"; + DOUT << "Stack size :" << NumBytes << "\n"; + DOUT << "FPOffset :" << FPOffset << "\n"; + DOUT << "RAOffset :" << RAOffset << "\n\n"; + #endif + + // if framepointer enabled, restore it and restore the + // stack pointer + if (hasFP(MF)) { + // move $sp, $fp + BuildMI(MBB, MBBI, TII.get(Mips::ADDu), Mips::SP) + .addReg(Mips::FP).addReg(Mips::ZERO); + + // lw $fp,stack_loc($sp) + BuildMI(MBB, MBBI, TII.get(Mips::LW)) + .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP); + } + + // Restore the return address only if the function isnt a leaf one. + // lw $ra, stack_loc($sp) + if (MFI->hasCalls()) { + BuildMI(MBB, MBBI, TII.get(Mips::LW)) + .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP); + } + + // adjust stack : insert addi sp, sp, (imm) + if (NumBytes) { + BuildMI(MBB, MBBI, TII.get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP).addImm(NumBytes); + } +} + +void MipsRegisterInfo:: +processFunctionBeforeFrameFinalized(MachineFunction &MF) const { +} + +unsigned MipsRegisterInfo:: +getRARegister() const { + return Mips::RA; +} + +unsigned MipsRegisterInfo:: +getFrameRegister(MachineFunction &MF) const { + return hasFP(MF) ? 
Mips::FP : Mips::SP; +} + +unsigned MipsRegisterInfo:: +getEHExceptionRegister() const { + assert(0 && "What is the exception register"); + return 0; +} + +unsigned MipsRegisterInfo:: +getEHHandlerRegister() const { + assert(0 && "What is the exception handler register"); + return 0; +} + +#include "MipsGenRegisterInfo.inc" + diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h new file mode 100644 index 0000000..d84194f --- /dev/null +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -0,0 +1,83 @@ +//===- MipsRegisterInfo.h - Mips Register Information Impl ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Mips implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPSREGISTERINFO_H +#define MIPSREGISTERINFO_H + +#include "llvm/Target/MRegisterInfo.h" +#include "MipsGenRegisterInfo.h.inc" + +namespace llvm { + +class TargetInstrInfo; +class Type; + +struct MipsRegisterInfo : public MipsGenRegisterInfo { + const TargetInstrInfo &TII; + + MipsRegisterInfo(const TargetInstrInfo &tii); + + /// Code Generation virtual methods... 
+ void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, const MachineInstr *Orig) const; + + MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum, + int FrameIndex) const; + + void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const; + + + const unsigned *getCalleeSavedRegs() const; + + const TargetRegisterClass* const* getCalleeSavedRegClasses() const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; + + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + + // Exception handling queries. 
+ unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td new file mode 100644 index 0000000..2b7d15f --- /dev/null +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -0,0 +1,80 @@ +//===- MipsRegisterInfo.td - Mips Register defs -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the MIPS register file +//===----------------------------------------------------------------------===// + +// We have banks of 32 registers each. +class MipsReg<string n> : Register<n> { + field bits<5> Num; + let Namespace = "Mips"; +} + +// Mips CPU Registers +class MipsGPRReg<bits<5> num, string n> : MipsReg<n> { + let Num = num; +} + +// CPU GPR Registers +def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<0>; +def AT : MipsGPRReg< 1, "AT">, DwarfRegNum<1>; +def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<2>; +def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<3>; +def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<5>; +def A1 : MipsGPRReg< 5, "5">, DwarfRegNum<5>; +def A2 : MipsGPRReg< 6, "6">, DwarfRegNum<6>; +def A3 : MipsGPRReg< 7, "7">, DwarfRegNum<7>; +def T0 : MipsGPRReg< 8, "8">, DwarfRegNum<8>; +def T1 : MipsGPRReg< 9, "9">, DwarfRegNum<9>; +def T2 : MipsGPRReg< 10, "10">, DwarfRegNum<10>; +def T3 : MipsGPRReg< 11, "11">, DwarfRegNum<11>; +def T4 : MipsGPRReg< 12, "12">, DwarfRegNum<12>; +def T5 : MipsGPRReg< 13, "13">, DwarfRegNum<13>; +def T6 : MipsGPRReg< 14, "14">, DwarfRegNum<14>; +def T7 : MipsGPRReg< 15, "15">, DwarfRegNum<15>; +def S0 : MipsGPRReg< 16, "16">, 
DwarfRegNum<16>; +def S1 : MipsGPRReg< 17, "17">, DwarfRegNum<17>; +def S2 : MipsGPRReg< 18, "18">, DwarfRegNum<18>; +def S3 : MipsGPRReg< 19, "19">, DwarfRegNum<19>; +def S4 : MipsGPRReg< 20, "20">, DwarfRegNum<20>; +def S5 : MipsGPRReg< 21, "21">, DwarfRegNum<21>; +def S6 : MipsGPRReg< 22, "22">, DwarfRegNum<22>; +def S7 : MipsGPRReg< 23, "23">, DwarfRegNum<23>; +def T8 : MipsGPRReg< 24, "24">, DwarfRegNum<24>; +def T9 : MipsGPRReg< 25, "25">, DwarfRegNum<25>; +def K0 : MipsGPRReg< 26, "26">, DwarfRegNum<26>; +def K1 : MipsGPRReg< 27, "27">, DwarfRegNum<27>; +def GP : MipsGPRReg< 28, "GP">, DwarfRegNum<28>; +def SP : MipsGPRReg< 29, "SP">, DwarfRegNum<29>; +def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<30>; +def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<31>; + +// CPU Registers Class +def CPURegs : RegisterClass<"Mips", [i32], 32, + // Return Values and Arguments + [V0, V1, A0, A1, A2, A3, + // Not preserved across procedure calls + T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, + // Callee save + S0, S1, S2, S3, S4, S5, S6, S7, + // Reserved + ZERO, AT, K0, K1, GP, SP, FP, RA]> +{ + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + CPURegsClass::iterator + CPURegsClass::allocation_order_end(const MachineFunction &MF) const { + // The last 8 registers on the list above are reserved + return end()-8; + } + }]; +} diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp new file mode 100644 index 0000000..a394f77 --- /dev/null +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -0,0 +1,26 @@ +//===- MipsSubtarget.cpp - Mips Subtarget Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the Mips specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "MipsSubtarget.h" +#include "Mips.h" +#include "MipsGenSubtarget.inc" +using namespace llvm; + +MipsSubtarget::MipsSubtarget(const TargetMachine &TM, const Module &M, + const std::string &FS) : isR3000(false) +{ + std::string CPU = "generic"; + + // Parse features string. + ParseSubtargetFeatures(FS, CPU); +} diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h new file mode 100644 index 0000000..7ec61ca --- /dev/null +++ b/lib/Target/Mips/MipsSubtarget.h @@ -0,0 +1,43 @@ +//=====-- MipsSubtarget.h - Define Subtarget for the Mips -----*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Mips specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPSSUBTARGET_H +#define MIPSSUBTARGET_H + +#include "llvm/Target/TargetSubtarget.h" +#include "llvm/Target/TargetMachine.h" + +#include <string> + +namespace llvm { +class Module; + +class MipsSubtarget : public TargetSubtarget { +protected: + bool isR3000; +public: + /// This constructor initializes the data members to match that + /// of the specified module. + /// + MipsSubtarget(const TargetMachine &TM, const Module &M, + const std::string &FS); + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. 
+ void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + + bool IsR3000() const { return isR3000; } +}; +} // End llvm namespace + +#endif diff --git a/lib/Target/Mips/MipsTargetAsmInfo.cpp b/lib/Target/Mips/MipsTargetAsmInfo.cpp new file mode 100644 index 0000000..08166f6 --- /dev/null +++ b/lib/Target/Mips/MipsTargetAsmInfo.cpp @@ -0,0 +1,22 @@ +//===-- MipsTargetAsmInfo.cpp - Mips asm properties -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the MipsTargetAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "MipsTargetAsmInfo.h" + +using namespace llvm; + +MipsTargetAsmInfo::MipsTargetAsmInfo(const MipsTargetMachine &TM) { + Data16bitsDirective = "\t.half\t"; + Data32bitsDirective = "\t.word\t"; + CommentString = "#"; +} diff --git a/lib/Target/Mips/MipsTargetAsmInfo.h b/lib/Target/Mips/MipsTargetAsmInfo.h new file mode 100644 index 0000000..908f036 --- /dev/null +++ b/lib/Target/Mips/MipsTargetAsmInfo.h @@ -0,0 +1,30 @@ +//=====-- MipsTargetAsmInfo.h - Mips asm properties -----------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the MipsTargetAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MIPSTARGETASMINFO_H +#define MIPSTARGETASMINFO_H + +#include "llvm/Target/TargetAsmInfo.h" + +namespace llvm { + + // Forward declaration. + class MipsTargetMachine; + + struct MipsTargetAsmInfo : public TargetAsmInfo { + MipsTargetAsmInfo(const MipsTargetMachine &TM); + }; + +} // namespace llvm + +#endif diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp new file mode 100644 index 0000000..7fdba30 --- /dev/null +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -0,0 +1,83 @@ +//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bruno Cardoso Lopes and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements the info about Mips target spec. +// +//===----------------------------------------------------------------------===// + +#include "Mips.h" +#include "MipsTargetAsmInfo.h" +#include "MipsTargetMachine.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/Target/TargetMachineRegistry.h" +using namespace llvm; + +namespace { + // Register the target. + RegisterTarget<MipsTargetMachine> X("mips", " Mips"); +} + +const TargetAsmInfo *MipsTargetMachine:: +createTargetAsmInfo() const +{ + return new MipsTargetAsmInfo(*this); +} + +// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment +// +// FrameInfo --> StackGrowsDown, 8 bytes aligned, +// LOA : 0 +MipsTargetMachine:: +MipsTargetMachine(const Module &M, const std::string &FS): + Subtarget(*this, M, FS), DataLayout("E-p:32:32:32"), + InstrInfo(*this), FrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0), + TLInfo(*this) {} + +// return 0 and must specify -march to gen MIPS code. 
// getModuleMatchQuality - Return how strongly this target matches the
// module's target triple: 20 for an explicit "mips-*" triple, otherwise 0
// (so the user must specify -march to generate MIPS code).
unsigned MipsTargetMachine::
getModuleMatchQuality(const Module &M)
{
  // We strongly match "mips-*".
  std::string TT = M.getTargetTriple();
  if (TT.size() >= 5 && std::string(TT.begin(), TT.begin()+5) == "mips-")
    return 20;

  return 0;
}

// Install an instruction selector pass using
// the ISelDag to gen Mips code.
bool MipsTargetMachine::
addInstSelector(FunctionPassManager &PM, bool Fast)
{
  PM.add(createMipsISelDag(*this));
  return false;
}

// Implemented by targets that want to run passes immediately before
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
// TODO: Delay slot must be implemented here.
bool MipsTargetMachine::
addPreEmitPass(FunctionPassManager &PM, bool Fast)
{
  // No pre-emit passes are installed yet.
  return false;
}

// Implements the AssemblyEmitter for the target. Must return
// true if AssemblyEmitter is supported
// NOTE(review): this returns false while claiming "true if supported" —
// verify against LLVMTargetMachine's addAssemblyEmitter contract; it
// appears false means success here, matching addInstSelector above.
bool MipsTargetMachine::
addAssemblyEmitter(FunctionPassManager &PM, bool Fast,
                   std::ostream &Out)
{
  // Output assembly language.
  PM.add(createMipsCodePrinterPass(Out, *this));
  return false;
}
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 0000000..9e1ccc3
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,65 @@
//===-- MipsTargetMachine.h - Define TargetMachine for Mips -00--*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by Bruno Cardoso Lopes and is distributed under the
// University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares the Mips specific subclass of TargetMachine.
+// +//===----------------------------------------------------------------------===// + +#ifndef MIPSTARGETMACHINE_H +#define MIPSTARGETMACHINE_H + +#include "MipsSubtarget.h" +#include "MipsInstrInfo.h" +#include "MipsISelLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameInfo.h" + +namespace llvm { + class MipsTargetMachine : public LLVMTargetMachine { + MipsSubtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + MipsInstrInfo InstrInfo; + TargetFrameInfo FrameInfo; + MipsTargetLowering TLInfo; + + protected: + virtual const TargetAsmInfo *createTargetAsmInfo() const; + + public: + MipsTargetMachine(const Module &M, const std::string &FS); + + virtual const MipsInstrInfo *getInstrInfo() const + { return &InstrInfo; } + virtual const TargetFrameInfo *getFrameInfo() const + { return &FrameInfo; } + virtual const TargetSubtarget *getSubtargetImpl() const + { return &Subtarget; } + virtual const TargetData *getTargetData() const + { return &DataLayout;} + + virtual const MRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + + virtual MipsTargetLowering *getTargetLowering() const { + return const_cast<MipsTargetLowering*>(&TLInfo); + } + + static unsigned getModuleMatchQuality(const Module &M); + + // Pass Pipeline Configuration + virtual bool addInstSelector(FunctionPassManager &PM, bool Fast); + virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast); + virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out); + }; +} // End llvm namespace + +#endif diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile new file mode 100644 index 0000000..77288ed --- /dev/null +++ b/lib/Target/PowerPC/Makefile @@ -0,0 +1,20 @@ +##===- lib/Target/PowerPC/Makefile -------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by the LLVM 
research group and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMPowerPC +TARGET = PPC + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = PPCGenInstrNames.inc PPCGenRegisterNames.inc \ + PPCGenAsmWriter.inc PPCGenCodeEmitter.inc \ + PPCGenRegisterInfo.h.inc PPCGenRegisterInfo.inc \ + PPCGenInstrInfo.inc PPCGenDAGISel.inc \ + PPCGenSubtarget.inc PPCGenCallingConv.inc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h new file mode 100644 index 0000000..9327f30 --- /dev/null +++ b/lib/Target/PowerPC/PPC.h @@ -0,0 +1,47 @@ +//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// PowerPC back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_POWERPC_H +#define LLVM_TARGET_POWERPC_H + +#include <iosfwd> + + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +namespace llvm { + class PPCTargetMachine; + class FunctionPassManager; + class FunctionPass; + class MachineCodeEmitter; + +FunctionPass *createPPCBranchSelectionPass(); +FunctionPass *createPPCISelDag(PPCTargetMachine &TM); +FunctionPass *createPPCAsmPrinterPass(std::ostream &OS, + PPCTargetMachine &TM); +FunctionPass *createPPCCodeEmitterPass(PPCTargetMachine &TM, + MachineCodeEmitter &MCE); +} // end namespace llvm; + +// Defines symbolic names for PowerPC registers. 
This defines a mapping from +// register name to register number. +// +#include "PPCGenRegisterNames.inc" + +// Defines symbolic names for the PowerPC instructions. +// +#include "PPCGenInstrNames.inc" + +#endif diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td new file mode 100644 index 0000000..76f8ac4 --- /dev/null +++ b/lib/Target/PowerPC/PPC.td @@ -0,0 +1,114 @@ +//===- PPC.td - Describe the PowerPC Target Machine --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the top level entry point for the PowerPC target. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing. +// +include "../Target.td" + +//===----------------------------------------------------------------------===// +// PowerPC Subtarget features. 
+// + +//===----------------------------------------------------------------------===// +// CPU Directives // +//===----------------------------------------------------------------------===// + +def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">; +def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">; +def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">; +def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">; +def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; +def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; +def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; + +def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", + "Enable 64-bit instructions">; +def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", + "Enable 64-bit registers usage for ppc32 [beta]">; +def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true", + "Enable Altivec instructions">; +def FeatureGPUL : SubtargetFeature<"gpul","IsGigaProcessor", "true", + "Enable GPUL instructions">; +def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true", + "Enable the fsqrt instruction">; +def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true", + "Enable the stfiwx instruction">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "PPCRegisterInfo.td" +include "PPCSchedule.td" +include "PPCInstrInfo.td" + 
+//===----------------------------------------------------------------------===// +// PowerPC processors supported. +// + +def : Processor<"generic", G3Itineraries, [Directive32]>; +def : Processor<"601", G3Itineraries, [Directive601]>; +def : Processor<"602", G3Itineraries, [Directive602]>; +def : Processor<"603", G3Itineraries, [Directive603]>; +def : Processor<"603e", G3Itineraries, [Directive603]>; +def : Processor<"603ev", G3Itineraries, [Directive603]>; +def : Processor<"604", G3Itineraries, [Directive604]>; +def : Processor<"604e", G3Itineraries, [Directive604]>; +def : Processor<"620", G3Itineraries, [Directive620]>; +def : Processor<"g3", G3Itineraries, [Directive7400]>; +def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>; +def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>; +def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>; +def : Processor<"g4+", G4PlusItineraries, [Directive750, FeatureAltivec]>; +def : Processor<"750", G4Itineraries, [Directive750, FeatureAltivec]>; +def : Processor<"970", G5Itineraries, + [Directive970, FeatureAltivec, + FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; +def : Processor<"g5", G5Itineraries, + [Directive970, FeatureAltivec, + FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; +def : Processor<"ppc", G3Itineraries, [Directive32]>; +def : Processor<"ppc64", G5Itineraries, + [Directive64, FeatureAltivec, + FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; + + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "PPCCallingConv.td" + +def PPCInstrInfo : InstrInfo { + // Define how we want to layout our TargetSpecific information field... 
This + // should be kept up-to-date with the fields in the PPCInstrInfo.h file. + let TSFlagsFields = ["PPC970_First", + "PPC970_Single", + "PPC970_Cracked", + "PPC970_Unit"]; + let TSFlagsShifts = [0, 1, 2, 3]; + + let isLittleEndianEncoding = 1; +} + + +def PPC : Target { + // Information about the instructions. + let InstructionSet = PPCInstrInfo; +} diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp new file mode 100644 index 0000000..2880196 --- /dev/null +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -0,0 +1,1101 @@ +//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly --------=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to PowerPC assembly language. This printer is +// the output mechanism used by `llc'. 
+// +// Documentation at http://developer.apple.com/documentation/DeveloperTools/ +// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asmprinter" +#include "PPC.h" +#include "PPCPredicates.h" +#include "PPCTargetMachine.h" +#include "PPCSubtarget.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DwarfWriter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/Mangler.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/MRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include <set> +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +namespace { + struct VISIBILITY_HIDDEN PPCAsmPrinter : public AsmPrinter { + std::set<std::string> FnStubs, GVStubs; + const PPCSubtarget &Subtarget; + + PPCAsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T) + : AsmPrinter(O, TM, T), Subtarget(TM.getSubtarget<PPCSubtarget>()) { + } + + virtual const char *getPassName() const { + return "PowerPC Assembly Printer"; + } + + PPCTargetMachine &getTM() { + return static_cast<PPCTargetMachine&>(TM); + } + + unsigned enumRegToMachineReg(unsigned enumReg) { + switch (enumReg) { + default: assert(0 && "Unhandled register!"); break; + case PPC::CR0: return 0; + case PPC::CR1: return 1; + case PPC::CR2: return 2; + case PPC::CR3: return 3; + case PPC::CR4: return 4; + case PPC::CR5: return 5; + case PPC::CR6: 
return 6; + case PPC::CR7: return 7; + } + abort(); + } + + /// printInstruction - This method is automatically generated by tablegen + /// from the instruction set description. This method returns true if the + /// machine instruction was sufficiently described to print it, otherwise it + /// returns false. + bool printInstruction(const MachineInstr *MI); + + void printMachineInstruction(const MachineInstr *MI); + void printOp(const MachineOperand &MO); + + /// stripRegisterPrefix - This method strips the character prefix from a + /// register name so that only the number is left. Used by for linux asm. + const char *stripRegisterPrefix(const char *RegName) { + switch (RegName[0]) { + case 'r': + case 'f': + case 'v': return RegName + 1; + case 'c': if (RegName[1] == 'r') return RegName + 2; + } + + return RegName; + } + + /// printRegister - Print register according to target requirements. + /// + void printRegister(const MachineOperand &MO, bool R0AsZero) { + unsigned RegNo = MO.getReg(); + assert(MRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??"); + + // If we should use 0 for R0. + if (R0AsZero && RegNo == PPC::R0) { + O << "0"; + return; + } + + const char *RegName = TM.getRegisterInfo()->get(RegNo).Name; + // Linux assembler (Others?) does not take register mnemonics. + // FIXME - What about special registers used in mfspr/mtspr? 
+ if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName); + O << RegName; + } + + void printOperand(const MachineInstr *MI, unsigned OpNo) { + const MachineOperand &MO = MI->getOperand(OpNo); + if (MO.isRegister()) { + printRegister(MO, false); + } else if (MO.isImmediate()) { + O << MO.getImmedValue(); + } else { + printOp(MO); + } + } + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + + + void printS5ImmOperand(const MachineInstr *MI, unsigned OpNo) { + char value = MI->getOperand(OpNo).getImmedValue(); + value = (value << (32-5)) >> (32-5); + O << (int)value; + } + void printU5ImmOperand(const MachineInstr *MI, unsigned OpNo) { + unsigned char value = MI->getOperand(OpNo).getImmedValue(); + assert(value <= 31 && "Invalid u5imm argument!"); + O << (unsigned int)value; + } + void printU6ImmOperand(const MachineInstr *MI, unsigned OpNo) { + unsigned char value = MI->getOperand(OpNo).getImmedValue(); + assert(value <= 63 && "Invalid u6imm argument!"); + O << (unsigned int)value; + } + void printS16ImmOperand(const MachineInstr *MI, unsigned OpNo) { + O << (short)MI->getOperand(OpNo).getImmedValue(); + } + void printU16ImmOperand(const MachineInstr *MI, unsigned OpNo) { + O << (unsigned short)MI->getOperand(OpNo).getImmedValue(); + } + void printS16X4ImmOperand(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) { + O << (short)(MI->getOperand(OpNo).getImmedValue()*4); + } else { + O << "lo16("; + printOp(MI->getOperand(OpNo)); + if (TM.getRelocationModel() == Reloc::PIC_) + O << "-\"L" << getFunctionNumber() << "$pb\")"; + else + O << ')'; + } + } + void printBranchOperand(const MachineInstr *MI, unsigned OpNo) { + // Branches can take an immediate operand. 
This is used by the branch + // selection pass to print $+8, an eight byte displacement from the PC. + if (MI->getOperand(OpNo).isImmediate()) { + O << "$+" << MI->getOperand(OpNo).getImmedValue()*4; + } else { + printOp(MI->getOperand(OpNo)); + } + } + void printCallOperand(const MachineInstr *MI, unsigned OpNo) { + const MachineOperand &MO = MI->getOperand(OpNo); + if (TM.getRelocationModel() != Reloc::Static) { + if (MO.getType() == MachineOperand::MO_GlobalAddress) { + GlobalValue *GV = MO.getGlobal(); + if (((GV->isDeclaration() || GV->hasWeakLinkage() || + GV->hasLinkOnceLinkage()))) { + // Dynamically-resolved functions need a stub for the function. + std::string Name = Mang->getValueName(GV); + FnStubs.insert(Name); + O << "L" << Name << "$stub"; + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + return; + } + } + if (MO.getType() == MachineOperand::MO_ExternalSymbol) { + std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName(); + FnStubs.insert(Name); + O << "L" << Name << "$stub"; + return; + } + } + + printOp(MI->getOperand(OpNo)); + } + void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo) { + O << (int)MI->getOperand(OpNo).getImmedValue()*4; + } + void printPICLabel(const MachineInstr *MI, unsigned OpNo) { + O << "\"L" << getFunctionNumber() << "$pb\"\n"; + O << "\"L" << getFunctionNumber() << "$pb\":"; + } + void printSymbolHi(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) { + printS16ImmOperand(MI, OpNo); + } else { + if (Subtarget.isDarwin()) O << "ha16("; + printOp(MI->getOperand(OpNo)); + if (TM.getRelocationModel() == Reloc::PIC_) + O << "-\"L" << getFunctionNumber() << "$pb\""; + if (Subtarget.isDarwin()) + O << ')'; + else + O << "@ha"; + } + } + void printSymbolLo(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) { + printS16ImmOperand(MI, OpNo); + } else { + if (Subtarget.isDarwin()) O << "lo16("; + printOp(MI->getOperand(OpNo)); + if 
(TM.getRelocationModel() == Reloc::PIC_) + O << "-\"L" << getFunctionNumber() << "$pb\""; + if (Subtarget.isDarwin()) + O << ')'; + else + O << "@l"; + } + } + void printcrbitm(const MachineInstr *MI, unsigned OpNo) { + unsigned CCReg = MI->getOperand(OpNo).getReg(); + unsigned RegNo = enumRegToMachineReg(CCReg); + O << (0x80 >> RegNo); + } + // The new addressing mode printers. + void printMemRegImm(const MachineInstr *MI, unsigned OpNo) { + printSymbolLo(MI, OpNo); + O << '('; + if (MI->getOperand(OpNo+1).isRegister() && + MI->getOperand(OpNo+1).getReg() == PPC::R0) + O << "0"; + else + printOperand(MI, OpNo+1); + O << ')'; + } + void printMemRegImmShifted(const MachineInstr *MI, unsigned OpNo) { + if (MI->getOperand(OpNo).isImmediate()) + printS16X4ImmOperand(MI, OpNo); + else + printSymbolLo(MI, OpNo); + O << '('; + if (MI->getOperand(OpNo+1).isRegister() && + MI->getOperand(OpNo+1).getReg() == PPC::R0) + O << "0"; + else + printOperand(MI, OpNo+1); + O << ')'; + } + + void printMemRegReg(const MachineInstr *MI, unsigned OpNo) { + // When used as the base register, r0 reads constant zero rather than + // the value contained in the register. For this reason, the darwin + // assembler requires that we print r0 as 0 (no r) when used as the base. 
+ const MachineOperand &MO = MI->getOperand(OpNo); + printRegister(MO, true); + O << ", "; + printOperand(MI, OpNo+1); + } + + void printPredicateOperand(const MachineInstr *MI, unsigned OpNo, + const char *Modifier); + + virtual bool runOnMachineFunction(MachineFunction &F) = 0; + virtual bool doFinalization(Module &M) = 0; + + virtual void EmitExternalGlobal(const GlobalVariable *GV); + }; + + /// LinuxAsmPrinter - PowerPC assembly printer, customized for Linux + struct VISIBILITY_HIDDEN LinuxAsmPrinter : public PPCAsmPrinter { + + DwarfWriter DW; + + LinuxAsmPrinter(std::ostream &O, PPCTargetMachine &TM, + const TargetAsmInfo *T) + : PPCAsmPrinter(O, TM, T), DW(O, this, T) { + } + + virtual const char *getPassName() const { + return "Linux PPC Assembly Printer"; + } + + bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineModuleInfo>(); + PPCAsmPrinter::getAnalysisUsage(AU); + } + + /// getSectionForFunction - Return the section that we should emit the + /// specified function body into. 
+ virtual std::string getSectionForFunction(const Function &F) const; + }; + + /// DarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac OS + /// X + struct VISIBILITY_HIDDEN DarwinAsmPrinter : public PPCAsmPrinter { + + DwarfWriter DW; + + DarwinAsmPrinter(std::ostream &O, PPCTargetMachine &TM, + const TargetAsmInfo *T) + : PPCAsmPrinter(O, TM, T), DW(O, this, T) { + } + + virtual const char *getPassName() const { + return "Darwin PPC Assembly Printer"; + } + + bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineModuleInfo>(); + PPCAsmPrinter::getAnalysisUsage(AU); + } + + /// getSectionForFunction - Return the section that we should emit the + /// specified function body into. + virtual std::string getSectionForFunction(const Function &F) const; + }; +} // end of anonymous namespace + +// Include the auto-generated portion of the assembly writer +#include "PPCGenAsmWriter.inc" + +void PPCAsmPrinter::printOp(const MachineOperand &MO) { + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + cerr << "printOp() does not handle immediate values\n"; + abort(); + return; + + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + case MachineOperand::MO_JumpTableIndex: + O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << '_' << MO.getJumpTableIndex(); + // FIXME: PIC relocation model + return; + case MachineOperand::MO_ConstantPoolIndex: + O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + << '_' << MO.getConstantPoolIndex(); + return; + case MachineOperand::MO_ExternalSymbol: + // Computing the address of an external symbol, not calling it. 
+ if (TM.getRelocationModel() != Reloc::Static) { + std::string Name(TAI->getGlobalPrefix()); Name += MO.getSymbolName(); + GVStubs.insert(Name); + O << "L" << Name << "$non_lazy_ptr"; + return; + } + O << TAI->getGlobalPrefix() << MO.getSymbolName(); + return; + case MachineOperand::MO_GlobalAddress: { + // Computing the address of a global symbol, not calling it. + GlobalValue *GV = MO.getGlobal(); + std::string Name = Mang->getValueName(GV); + + // External or weakly linked global variables need non-lazily-resolved stubs + if (TM.getRelocationModel() != Reloc::Static) { + if (((GV->isDeclaration() || GV->hasWeakLinkage() || + GV->hasLinkOnceLinkage()))) { + GVStubs.insert(Name); + O << "L" << Name << "$non_lazy_ptr"; + return; + } + } + O << Name; + + if (MO.getOffset() > 0) + O << "+" << MO.getOffset(); + else if (MO.getOffset() < 0) + O << MO.getOffset(); + + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + return; + } + + default: + O << "<unknown operand type: " << MO.getType() << ">"; + return; + } +} + +/// EmitExternalGlobal - In this case we need to use the indirect symbol. +/// +void PPCAsmPrinter::EmitExternalGlobal(const GlobalVariable *GV) { + std::string Name = getGlobalLinkName(GV); + if (TM.getRelocationModel() != Reloc::Static) { + GVStubs.insert(Name); + O << "L" << Name << "$non_lazy_ptr"; + return; + } + O << Name; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'c': // Don't print "$" before a global var name or constant. + // PPC never has a prefix. 
+ printOperand(MI, OpNo); + return false; + case 'L': // Write second word of DImode reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNo).isRegister() || + OpNo+1 == MI->getNumOperands() || + !MI->getOperand(OpNo+1).isRegister()) + return true; + ++OpNo; // Return the high-part. + break; + case 'I': + // Write 'i' if an integer constant, otherwise nothing. Used to print + // addi vs add, etc. + if (MI->getOperand(OpNo).isImm()) + O << "i"; + return false; + } + } + + printOperand(MI, OpNo); + return false; +} + +bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + if (MI->getOperand(OpNo).isRegister()) + printMemRegReg(MI, OpNo); + else + printMemRegImm(MI, OpNo); + return false; +} + +void PPCAsmPrinter::printPredicateOperand(const MachineInstr *MI, unsigned OpNo, + const char *Modifier) { + assert(Modifier && "Must specify 'cc' or 'reg' as predicate op modifier!"); + unsigned Code = MI->getOperand(OpNo).getImm(); + if (!strcmp(Modifier, "cc")) { + switch ((PPC::Predicate)Code) { + case PPC::PRED_ALWAYS: return; // Don't print anything for always. + case PPC::PRED_LT: O << "lt"; return; + case PPC::PRED_LE: O << "le"; return; + case PPC::PRED_EQ: O << "eq"; return; + case PPC::PRED_GE: O << "ge"; return; + case PPC::PRED_GT: O << "gt"; return; + case PPC::PRED_NE: O << "ne"; return; + case PPC::PRED_UN: O << "un"; return; + case PPC::PRED_NU: O << "nu"; return; + } + + } else { + assert(!strcmp(Modifier, "reg") && + "Need to specify 'cc' or 'reg' as predicate op modifier!"); + // Don't print the register for 'always'. + if (Code == PPC::PRED_ALWAYS) return; + printOperand(MI, OpNo+1); + } +} + + +/// printMachineInstruction -- Print out a single PowerPC MI in Darwin syntax to +/// the current output stream. 
+/// +void PPCAsmPrinter::printMachineInstruction(const MachineInstr *MI) { + ++EmittedInsts; + + // Check for slwi/srwi mnemonics. + if (MI->getOpcode() == PPC::RLWINM) { + bool FoundMnemonic = false; + unsigned char SH = MI->getOperand(2).getImmedValue(); + unsigned char MB = MI->getOperand(3).getImmedValue(); + unsigned char ME = MI->getOperand(4).getImmedValue(); + if (SH <= 31 && MB == 0 && ME == (31-SH)) { + O << "slwi "; FoundMnemonic = true; + } + if (SH <= 31 && MB == (32-SH) && ME == 31) { + O << "srwi "; FoundMnemonic = true; + SH = 32-SH; + } + if (FoundMnemonic) { + printOperand(MI, 0); + O << ", "; + printOperand(MI, 1); + O << ", " << (unsigned int)SH << "\n"; + return; + } + } else if (MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) { + if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + O << "mr "; + printOperand(MI, 0); + O << ", "; + printOperand(MI, 1); + O << "\n"; + return; + } + } else if (MI->getOpcode() == PPC::RLDICR) { + unsigned char SH = MI->getOperand(2).getImmedValue(); + unsigned char ME = MI->getOperand(3).getImmedValue(); + // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH + if (63-SH == ME) { + O << "sldi "; + printOperand(MI, 0); + O << ", "; + printOperand(MI, 1); + O << ", " << (unsigned int)SH << "\n"; + return; + } + } + + if (printInstruction(MI)) + return; // Printer was automatically generated + + assert(0 && "Unhandled instruction in asm writer!"); + abort(); + return; +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +/// +bool LinuxAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>()); + + SetupMachineFunction(MF); + O << "\n\n"; + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + // Print out labels for the function. 
+ const Function *F = MF.getFunction(); + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + + switch (F->getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::InternalLinkage: // Symbols default to internal. + break; + case Function::ExternalLinkage: + O << "\t.global\t" << CurrentFnName << '\n' + << "\t.type\t" << CurrentFnName << ", @function\n"; + break; + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + O << "\t.global\t" << CurrentFnName << '\n'; + O << "\t.weak\t" << CurrentFnName << '\n'; + break; + } + + if (F->hasHiddenVisibility()) + if (const char *Directive = TAI->getHiddenDirective()) + O << Directive << CurrentFnName << "\n"; + + EmitAlignment(2, F); + O << CurrentFnName << ":\n"; + + // Emit pre-function debug information. + DW.BeginFunction(&MF); + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block. + if (I != MF.begin()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printMachineInstruction(II); + } + } + + O << "\t.size\t" << CurrentFnName << ",.-" << CurrentFnName << "\n"; + + // Print out jump tables referenced by the function. + EmitJumpTableInfo(MF.getJumpTableInfo(), MF); + + // Emit post-function debug information. + DW.EndFunction(); + + // We didn't modify anything. + return false; +} + +bool LinuxAsmPrinter::doInitialization(Module &M) { + AsmPrinter::doInitialization(M); + + // GNU as handles section names wrapped in quotes + Mang->setUseQuotes(true); + + SwitchToTextSection(TAI->getTextSection()); + + // Emit initial debug information. 
+ DW.BeginModule(&M); + return false; +} + +bool LinuxAsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (!I->hasInitializer()) continue; // External global require no code + + // Check to see if this is a special global used by LLVM, if so, emit it. + if (EmitSpecialLLVMGlobal(I)) + continue; + + std::string name = Mang->getValueName(I); + + if (I->hasHiddenVisibility()) + if (const char *Directive = TAI->getHiddenDirective()) + O << Directive << name << "\n"; + + Constant *C = I->getInitializer(); + unsigned Size = TD->getTypeSize(C->getType()); + unsigned Align = TD->getPreferredAlignmentLog(I); + + if (C->isNullValue() && /* FIXME: Verify correct */ + (I->hasInternalLinkage() || I->hasWeakLinkage() || + I->hasLinkOnceLinkage() || + (I->hasExternalLinkage() && !I->hasSection()))) { + if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. + if (I->hasExternalLinkage()) { + O << "\t.global " << name << '\n'; + O << "\t.type " << name << ", @object\n"; + //O << "\t.zerofill __DATA, __common, " << name << ", " + // << Size << ", " << Align; + } else if (I->hasInternalLinkage()) { + SwitchToDataSection("\t.data", I); + O << TAI->getLCOMMDirective() << name << "," << Size; + } else { + SwitchToDataSection("\t.data", I); + O << ".comm " << name << "," << Size; + } + O << "\t\t" << TAI->getCommentString() << " '" << I->getName() << "'\n"; + } else { + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + O << "\t.global " << name << '\n' + << "\t.type " << name << ", @object\n" + << "\t.weak " << name << '\n'; + SwitchToDataSection("\t.data", I); + break; + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. 
+ case GlobalValue::ExternalLinkage: + // If external or appending, declare as a global symbol + O << "\t.global " << name << "\n" + << "\t.type " << name << ", @object\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: + if (I->isConstant()) { + const ConstantArray *CVA = dyn_cast<ConstantArray>(C); + if (TAI->getCStringSection() && CVA && CVA->isCString()) { + SwitchToDataSection(TAI->getCStringSection(), I); + break; + } + } + + // FIXME: special handling for ".ctors" & ".dtors" sections + if (I->hasSection() && + (I->getSection() == ".ctors" || + I->getSection() == ".dtors")) { + std::string SectionName = ".section " + I->getSection() + + ",\"aw\",@progbits"; + SwitchToDataSection(SectionName.c_str()); + } else { + if (I->isConstant() && TAI->getReadOnlySection()) + SwitchToDataSection(TAI->getReadOnlySection(), I); + else + SwitchToDataSection(TAI->getDataSection(), I); + } + break; + default: + cerr << "Unknown linkage type!"; + abort(); + } + + EmitAlignment(Align, I); + O << name << ":\t\t\t\t" << TAI->getCommentString() << " '" + << I->getName() << "'\n"; + + // If the initializer is a extern weak symbol, remember to emit the weak + // reference! + if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + + EmitGlobalConstant(C); + O << '\n'; + } + } + + // TODO + + // Emit initial debug information. 
+ DW.EndModule(); + + AsmPrinter::doFinalization(M); + return false; // success +} + +std::string LinuxAsmPrinter::getSectionForFunction(const Function &F) const { + switch (F.getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::ExternalLinkage: + case Function::InternalLinkage: return TAI->getTextSection(); + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + return ".text"; + } +} + +std::string DarwinAsmPrinter::getSectionForFunction(const Function &F) const { + switch (F.getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::ExternalLinkage: + case Function::InternalLinkage: return TAI->getTextSection(); + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + return ".section __TEXT,__textcoal_nt,coalesced,pure_instructions"; + } +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +/// +bool DarwinAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>()); + + SetupMachineFunction(MF); + O << "\n\n"; + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + // Print out labels for the function. + const Function *F = MF.getFunction(); + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + + switch (F->getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::InternalLinkage: // Symbols default to internal. 
+ break; + case Function::ExternalLinkage: + O << "\t.globl\t" << CurrentFnName << "\n"; + break; + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + O << "\t.globl\t" << CurrentFnName << "\n"; + O << "\t.weak_definition\t" << CurrentFnName << "\n"; + break; + } + + if (F->hasHiddenVisibility()) + if (const char *Directive = TAI->getHiddenDirective()) + O << Directive << CurrentFnName << "\n"; + + EmitAlignment(4, F); + O << CurrentFnName << ":\n"; + + // Emit pre-function debug information. + DW.BeginFunction(&MF); + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block. + if (I != MF.begin()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printMachineInstruction(II); + } + } + + // Print out jump tables referenced by the function. + EmitJumpTableInfo(MF.getJumpTableInfo(), MF); + + // Emit post-function debug information. + DW.EndFunction(); + + // We didn't modify anything. + return false; +} + + +bool DarwinAsmPrinter::doInitialization(Module &M) { + static const char *CPUDirectives[] = { + "ppc", + "ppc601", + "ppc602", + "ppc603", + "ppc7400", + "ppc750", + "ppc970", + "ppc64" + }; + + unsigned Directive = Subtarget.getDarwinDirective(); + if (Subtarget.isGigaProcessor() && Directive < PPC::DIR_970) + Directive = PPC::DIR_970; + if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400) + Directive = PPC::DIR_7400; + if (Subtarget.isPPC64() && Directive < PPC::DIR_970) + Directive = PPC::DIR_64; + assert(Directive <= PPC::DIR_64 && "Directive out of range."); + O << "\t.machine " << CPUDirectives[Directive] << "\n"; + + AsmPrinter::doInitialization(M); + + // Darwin wants symbols to be quoted if they have complex names. 
+ Mang->setUseQuotes(true); + + // Prime text sections so they are adjacent. This reduces the likelihood a + // large data or debug section causes a branch to exceed 16M limit. + SwitchToTextSection(".section __TEXT,__textcoal_nt,coalesced," + "pure_instructions"); + if (TM.getRelocationModel() == Reloc::PIC_) { + SwitchToTextSection(".section __TEXT,__picsymbolstub1,symbol_stubs," + "pure_instructions,32"); + } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) { + SwitchToTextSection(".section __TEXT,__symbol_stub1,symbol_stubs," + "pure_instructions,16"); + } + SwitchToTextSection(TAI->getTextSection()); + + // Emit initial debug information. + DW.BeginModule(&M); + return false; +} + +bool DarwinAsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (!I->hasInitializer()) continue; // External global require no code + + // Check to see if this is a special global used by LLVM, if so, emit it. + if (EmitSpecialLLVMGlobal(I)) { + if (TM.getRelocationModel() == Reloc::Static) { + if (I->getName() == "llvm.global_ctors") + O << ".reference .constructors_used\n"; + else if (I->getName() == "llvm.global_dtors") + O << ".reference .destructors_used\n"; + } + continue; + } + + std::string name = Mang->getValueName(I); + + if (I->hasHiddenVisibility()) + if (const char *Directive = TAI->getHiddenDirective()) + O << Directive << name << "\n"; + + Constant *C = I->getInitializer(); + const Type *Type = C->getType(); + unsigned Size = TD->getTypeSize(Type); + unsigned Align = TD->getPreferredAlignmentLog(I); + + if (C->isNullValue() && /* FIXME: Verify correct */ + (I->hasInternalLinkage() || I->hasWeakLinkage() || + I->hasLinkOnceLinkage() || + (I->hasExternalLinkage() && !I->hasSection()))) { + if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. 
+ if (I->hasExternalLinkage()) { + O << "\t.globl " << name << '\n'; + O << "\t.zerofill __DATA, __common, " << name << ", " + << Size << ", " << Align; + } else if (I->hasInternalLinkage()) { + SwitchToDataSection("\t.data", I); + O << TAI->getLCOMMDirective() << name << "," << Size << "," << Align; + } else { + SwitchToDataSection("\t.data", I); + O << ".comm " << name << "," << Size; + } + O << "\t\t" << TAI->getCommentString() << " '" << I->getName() << "'\n"; + } else { + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + O << "\t.globl " << name << '\n' + << "\t.weak_definition " << name << '\n'; + SwitchToDataSection(".section __DATA,__datacoal_nt,coalesced", I); + break; + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. + case GlobalValue::ExternalLinkage: + // If external or appending, declare as a global symbol + O << "\t.globl " << name << "\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: + if (I->isConstant()) { + const ConstantArray *CVA = dyn_cast<ConstantArray>(C); + if (TAI->getCStringSection() && CVA && CVA->isCString()) { + SwitchToDataSection(TAI->getCStringSection(), I); + break; + } + } + + if (!I->isConstant()) + SwitchToDataSection(TAI->getDataSection(), I); + else { + // Read-only data. 
+ bool HasReloc = C->ContainsRelocations(); + if (HasReloc && + TM.getRelocationModel() != Reloc::Static) + SwitchToDataSection("\t.const_data\n"); + else if (!HasReloc && Size == 4 && + TAI->getFourByteConstantSection()) + SwitchToDataSection(TAI->getFourByteConstantSection(), I); + else if (!HasReloc && Size == 8 && + TAI->getEightByteConstantSection()) + SwitchToDataSection(TAI->getEightByteConstantSection(), I); + else if (!HasReloc && Size == 16 && + TAI->getSixteenByteConstantSection()) + SwitchToDataSection(TAI->getSixteenByteConstantSection(), I); + else if (TAI->getReadOnlySection()) + SwitchToDataSection(TAI->getReadOnlySection(), I); + else + SwitchToDataSection(TAI->getDataSection(), I); + } + break; + default: + cerr << "Unknown linkage type!"; + abort(); + } + + EmitAlignment(Align, I); + O << name << ":\t\t\t\t" << TAI->getCommentString() << " '" + << I->getName() << "'\n"; + + // If the initializer is a extern weak symbol, remember to emit the weak + // reference! + if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + + EmitGlobalConstant(C); + O << '\n'; + } + } + + bool isPPC64 = TD->getPointerSizeInBits() == 64; + + // Output stubs for dynamically-linked functions + if (TM.getRelocationModel() == Reloc::PIC_) { + for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end(); + i != e; ++i) { + SwitchToTextSection(".section __TEXT,__picsymbolstub1,symbol_stubs," + "pure_instructions,32"); + EmitAlignment(4); + O << "L" << *i << "$stub:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\tmflr r0\n"; + O << "\tbcl 20,31,L0$" << *i << "\n"; + O << "L0$" << *i << ":\n"; + O << "\tmflr r11\n"; + O << "\taddis r11,r11,ha16(L" << *i << "$lazy_ptr-L0$" << *i << ")\n"; + O << "\tmtlr r0\n"; + if (isPPC64) + O << "\tldu r12,lo16(L" << *i << "$lazy_ptr-L0$" << *i << ")(r11)\n"; + else + O << "\tlwzu r12,lo16(L" << *i << "$lazy_ptr-L0$" << *i << ")(r11)\n"; + O << "\tmtctr 
r12\n"; + O << "\tbctr\n"; + SwitchToDataSection(".lazy_symbol_pointer"); + O << "L" << *i << "$lazy_ptr:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + if (isPPC64) + O << "\t.quad dyld_stub_binding_helper\n"; + else + O << "\t.long dyld_stub_binding_helper\n"; + } + } else { + for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end(); + i != e; ++i) { + SwitchToTextSection(".section __TEXT,__symbol_stub1,symbol_stubs," + "pure_instructions,16"); + EmitAlignment(4); + O << "L" << *i << "$stub:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\tlis r11,ha16(L" << *i << "$lazy_ptr)\n"; + if (isPPC64) + O << "\tldu r12,lo16(L" << *i << "$lazy_ptr)(r11)\n"; + else + O << "\tlwzu r12,lo16(L" << *i << "$lazy_ptr)(r11)\n"; + O << "\tmtctr r12\n"; + O << "\tbctr\n"; + SwitchToDataSection(".lazy_symbol_pointer"); + O << "L" << *i << "$lazy_ptr:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + if (isPPC64) + O << "\t.quad dyld_stub_binding_helper\n"; + else + O << "\t.long dyld_stub_binding_helper\n"; + } + } + + O << "\n"; + + // Output stubs for external and common global variables. + if (GVStubs.begin() != GVStubs.end()) { + SwitchToDataSection(".non_lazy_symbol_pointer"); + for (std::set<std::string>::iterator I = GVStubs.begin(), + E = GVStubs.end(); I != E; ++I) { + O << "L" << *I << "$non_lazy_ptr:\n"; + O << "\t.indirect_symbol " << *I << "\n"; + if (isPPC64) + O << "\t.quad\t0\n"; + else + O << "\t.long\t0\n"; + + } + } + + // Emit initial debug information. + DW.EndModule(); + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never generates + // code that does this, it is always safe to set. 
+ O << "\t.subsections_via_symbols\n"; + + AsmPrinter::doFinalization(M); + return false; // success +} + + + +/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code +/// for a MachineFunction to the given output stream, in a format that the +/// Darwin assembler can deal with. +/// +FunctionPass *llvm::createPPCAsmPrinterPass(std::ostream &o, + PPCTargetMachine &tm) { + const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>(); + + if (Subtarget->isDarwin()) { + return new DarwinAsmPrinter(o, tm, tm.getTargetAsmInfo()); + } else { + return new LinuxAsmPrinter(o, tm, tm.getTargetAsmInfo()); + } +} + diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp new file mode 100644 index 0000000..4286f01 --- /dev/null +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -0,0 +1,199 @@ +//===-- PPCBranchSelector.cpp - Emit long conditional branches-----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Nate Baegeman and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that scans a machine function to determine which +// conditional branches need more than 16 bits of displacement to reach their +// target basic block. It does this in two passes; a calculation of basic block +// positions pass, and a branch psuedo op to machine branch opcode pass. This +// pass should be run last, just before the assembly printer. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ppc-branch-select" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" +#include "PPCPredicates.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/MathExtras.h" +using namespace llvm; + +STATISTIC(NumExpanded, "Number of branches expanded to long format"); + +namespace { + struct VISIBILITY_HIDDEN PPCBSel : public MachineFunctionPass { + static char ID; + PPCBSel() : MachineFunctionPass((intptr_t)&ID) {} + + /// BlockSizes - The sizes of the basic blocks in the function. + std::vector<unsigned> BlockSizes; + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "PowerPC Branch Selector"; + } + }; + char PPCBSel::ID = 0; +} + +/// createPPCBranchSelectionPass - returns an instance of the Branch Selection +/// Pass +/// +FunctionPass *llvm::createPPCBranchSelectionPass() { + return new PPCBSel(); +} + +/// getNumBytesForInstruction - Return the number of bytes of code the specified +/// instruction may be. This returns the maximum number of bytes. +/// +static unsigned getNumBytesForInstruction(MachineInstr *MI) { + switch (MI->getOpcode()) { + case PPC::IMPLICIT_DEF_GPRC: // no asm emitted + case PPC::IMPLICIT_DEF_G8RC: // no asm emitted + case PPC::IMPLICIT_DEF_F4: // no asm emitted + case PPC::IMPLICIT_DEF_F8: // no asm emitted + case PPC::IMPLICIT_DEF_VRRC: // no asm emitted + return 0; + case PPC::INLINEASM: { // Inline Asm: Variable size. 
+ MachineFunction *MF = MI->getParent()->getParent(); + const char *AsmStr = MI->getOperand(0).getSymbolName(); + return MF->getTarget().getTargetAsmInfo()->getInlineAsmLength(AsmStr); + } + case PPC::LABEL: { + return 0; + } + default: + return 4; // PowerPC instructions are all 4 bytes + } +} + + +bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { + const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + // Give the blocks of the function a dense, in-order, numbering. + Fn.RenumberBlocks(); + BlockSizes.resize(Fn.getNumBlockIDs()); + + // Measure each MBB and compute a size for the entire function. + unsigned FuncSize = 0; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock *MBB = MFI; + + unsigned BlockSize = 0; + for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); + MBBI != EE; ++MBBI) + BlockSize += getNumBytesForInstruction(MBBI); + + BlockSizes[MBB->getNumber()] = BlockSize; + FuncSize += BlockSize; + } + + // If the entire function is smaller than the displacement of a branch field, + // we know we don't need to shrink any branches in this function. This is a + // common case. + if (FuncSize < (1 << 15)) { + BlockSizes.clear(); + return false; + } + + // For each conditional branch, if the offset to its destination is larger + // than the offset field allows, transform it into a long branch sequence + // like this: + // short branch: + // bCC MBB + // long branch: + // b!CC $PC+8 + // b MBB + // + bool MadeChange = true; + bool EverMadeChange = false; + while (MadeChange) { + // Iteratively expand branches until we reach a fixed point. 
+ MadeChange = false; + + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + unsigned MBBStartOffset = 0; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) { + MBBStartOffset += getNumBytesForInstruction(I); + continue; + } + + // Determine the offset from the current branch to the destination + // block. + MachineBasicBlock *Dest = I->getOperand(2).getMachineBasicBlock(); + + int BranchSize; + if (Dest->getNumber() <= MBB.getNumber()) { + // If this is a backwards branch, the delta is the offset from the + // start of this block to this branch, plus the sizes of all blocks + // from this block to the dest. + BranchSize = MBBStartOffset; + + for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i) + BranchSize += BlockSizes[i]; + } else { + // Otherwise, add the size of the blocks between this block and the + // dest to the number of bytes left in this block. + BranchSize = -MBBStartOffset; + + for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i) + BranchSize += BlockSizes[i]; + } + + // If this branch is in range, ignore it. + if (isInt16(BranchSize)) { + MBBStartOffset += 4; + continue; + } + + // Otherwise, we have to expand it to a long branch. + // The BCC operands are: + // 0. PPC branch predicate + // 1. CR register + // 2. Target MBB + PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); + unsigned CRReg = I->getOperand(1).getReg(); + + MachineInstr *OldBranch = I; + + // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. + BuildMI(MBB, I, TII->get(PPC::BCC)) + .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); + + // Uncond branch to the real destination. + I = BuildMI(MBB, I, TII->get(PPC::B)).addMBB(Dest); + + // Remove the old branch from the function. 
+ OldBranch->eraseFromParent(); + + // Remember that this instruction is 8-bytes, increase the size of the + // block by 4, remember to iterate. + BlockSizes[MBB.getNumber()] += 4; + MBBStartOffset += 8; + ++NumExpanded; + MadeChange = true; + } + } + EverMadeChange |= MadeChange; + } + + BlockSizes.clear(); + return true; +} + diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td new file mode 100644 index 0000000..9e31b5a --- /dev/null +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -0,0 +1,65 @@ +//===- PPCCallingConv.td - Calling Conventions for PowerPC ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the PowerPC 32- and 64-bit +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention +//===----------------------------------------------------------------------===// + +// Return-value convention for PowerPC +def RetCC_PPC : CallingConv<[ + CCIfType<[i32], CCAssignToReg<[R3, R4]>>, + CCIfType<[i64], CCAssignToReg<[X3, X4]>>, + + CCIfType<[f32, f64], CCAssignToReg<[F1]>>, + + // Vector types are always returned in V2. 
+ CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>> +]>; + + +//===----------------------------------------------------------------------===// +// PowerPC Argument Calling Conventions +//===----------------------------------------------------------------------===// +/* +def CC_PPC : CallingConv<[ + // The first 8 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, + CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>, + + // Common sub-targets passes FP values in F1 - F13 + CCIfType<[f32, f64], CCIfSubtarget<"isMachoABI()", + CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8,F9,F10,F11,F12,F13]>>>, + // ELF32 sub-target pass FP values in F1 - F8. + CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + + // The first 12 Vector arguments are passed in altivec registers. + CCIfType<[v16i8, v8i16, v4i32, v4f32], + CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10,V11,V12,V13]>> + +/* + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToStack<16, 16>>*/ +]>; + +*/ + diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp new file mode 100644 index 0000000..5dceffd --- /dev/null +++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp @@ -0,0 +1,237 @@ +//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC32 -------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to +// JIT-compile bitcode to native PowerPC. +// +//===----------------------------------------------------------------------===// + +#include "PPCTargetMachine.h" +#include "PPCRelocations.h" +#include "PPC.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/MachineCodeEmitter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + class VISIBILITY_HIDDEN PPCCodeEmitter : public MachineFunctionPass { + TargetMachine &TM; + MachineCodeEmitter &MCE; + + /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record + /// its address in the function into this pointer. + void *MovePCtoLROffset; + + /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr + /// + int getMachineOpValue(MachineInstr &MI, MachineOperand &MO); + + public: + static char ID; + PPCCodeEmitter(TargetMachine &T, MachineCodeEmitter &M) + : MachineFunctionPass((intptr_t)&ID), TM(T), MCE(M) {} + + const char *getPassName() const { return "PowerPC Machine Code Emitter"; } + + /// runOnMachineFunction - emits the given MachineFunction to memory + /// + bool runOnMachineFunction(MachineFunction &MF); + + /// emitBasicBlock - emits the given MachineBasicBlock to memory + /// + void emitBasicBlock(MachineBasicBlock &MBB); + + /// getValueBit - return the particular bit of Val + /// + unsigned getValueBit(int64_t Val, unsigned bit) { return (Val >> bit) & 1; } + + /// getBinaryCodeForInstr - This function, generated by the + /// CodeEmitterGenerator using TableGen, produces the binary encoding for + /// machine instructions. 
+ /// + unsigned getBinaryCodeForInstr(MachineInstr &MI); + }; + char PPCCodeEmitter::ID = 0; +} + +/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code +/// to the specified MCE object. +FunctionPass *llvm::createPPCCodeEmitterPass(PPCTargetMachine &TM, + MachineCodeEmitter &MCE) { + return new PPCCodeEmitter(TM, MCE); +} + +#ifdef __APPLE__ +extern "C" void sys_icache_invalidate(const void *Addr, size_t len); +#endif + +bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) { + // NOTE(review): the condition below is a tautology -- (X != Default || + // X != Static) is always true for distinct Default/Static, so this assert + // can never fire and the "static or default" requirement in the message is + // not actually enforced. The intended condition is unclear because PIC_ is + // explicitly handled in emitBasicBlock below -- TODO confirm before fixing. + assert((MF.getTarget().getRelocationModel() != Reloc::Default || + MF.getTarget().getRelocationModel() != Reloc::Static) && + "JIT relocation model must be set to static or default!"); + do { + MovePCtoLROffset = 0; + MCE.startFunction(MF); + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + emitBasicBlock(*BB); + } while (MCE.finishFunction(MF)); + + return false; +} + +void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) { + MCE.StartMachineBasicBlock(&MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){ + MachineInstr &MI = *I; + switch (MI.getOpcode()) { + default: + MCE.emitWordBE(getBinaryCodeForInstr(*I)); + break; + case PPC::IMPLICIT_DEF_GPRC: + case PPC::IMPLICIT_DEF_G8RC: + case PPC::IMPLICIT_DEF_F8: + case PPC::IMPLICIT_DEF_F4: + case PPC::IMPLICIT_DEF_VRRC: + break; // pseudo opcode, no side effects + case PPC::MovePCtoLR: + case PPC::MovePCtoLR8: + assert(TM.getRelocationModel() == Reloc::PIC_); + MovePCtoLROffset = (void*)MCE.getCurrentPCValue(); + MCE.emitWordBE(0x48000005); // bl 1 + break; + } + } +} + +int PPCCodeEmitter::getMachineOpValue(MachineInstr &MI, MachineOperand &MO) { + + intptr_t rv = 0; // Return value; defaults to 0 for unhandled cases + // or things that get fixed up later by the JIT.
+ if (MO.isRegister()) { + rv = PPCRegisterInfo::getRegisterNumbering(MO.getReg()); + + // Special encoding for MTCRF and MFOCRF, which uses a bit mask for the + // register, not the register number directly. + if ((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) && + (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7)) { + rv = 0x80 >> rv; + } + } else if (MO.isImmediate()) { + rv = MO.getImmedValue(); + } else if (MO.isGlobalAddress() || MO.isExternalSymbol() || + MO.isConstantPoolIndex() || MO.isJumpTableIndex()) { + unsigned Reloc = 0; + if (MI.getOpcode() == PPC::BL_Macho || MI.getOpcode() == PPC::BL8_Macho || + MI.getOpcode() == PPC::BL_ELF || MI.getOpcode() == PPC::BL8_ELF) + Reloc = PPC::reloc_pcrel_bx; + else { + if (TM.getRelocationModel() == Reloc::PIC_) { + assert(MovePCtoLROffset && "MovePCtoLR not seen yet?"); + } + switch (MI.getOpcode()) { + default: MI.dump(); assert(0 && "Unknown instruction for relocation!"); + case PPC::LIS: + case PPC::LIS8: + case PPC::ADDIS: + case PPC::ADDIS8: + Reloc = PPC::reloc_absolute_high; // Pointer to symbol + break; + case PPC::LI: + case PPC::LI8: + case PPC::LA: + // Loads. + case PPC::LBZ: + case PPC::LBZ8: + case PPC::LHA: + case PPC::LHA8: + case PPC::LHZ: + case PPC::LHZ8: + case PPC::LWZ: + case PPC::LWZ8: + case PPC::LFS: + case PPC::LFD: + + // Stores. 
+ case PPC::STB: + case PPC::STB8: + case PPC::STH: + case PPC::STH8: + case PPC::STW: + case PPC::STW8: + case PPC::STFS: + case PPC::STFD: + Reloc = PPC::reloc_absolute_low; + break; + + case PPC::LWA: + case PPC::LD: + case PPC::STD: + case PPC::STD_32: + Reloc = PPC::reloc_absolute_low_ix; + break; + } + } + + MachineRelocation R; + if (MO.isGlobalAddress()) { + R = MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, + MO.getGlobal(), 0); + } else if (MO.isExternalSymbol()) { + R = MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), + Reloc, MO.getSymbolName(), 0); + } else if (MO.isConstantPoolIndex()) { + R = MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), + Reloc, MO.getConstantPoolIndex(), 0); + } else { + assert(MO.isJumpTableIndex()); + R = MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), + Reloc, MO.getJumpTableIndex(), 0); + } + + // If in PIC mode, we need to encode the negated address of the + // 'movepctolr' into the unrelocated field. After relocation, we'll have + // &gv-&movepctolr-4 in the imm field. Once &movepctolr is added to the imm + // field, we get &gv. This doesn't happen for branch relocations, which are + // always implicitly pc relative. 
+ if (TM.getRelocationModel() == Reloc::PIC_ && Reloc != PPC::reloc_pcrel_bx){ + assert(MovePCtoLROffset && "MovePCtoLR not seen yet?"); + R.setConstantVal(-(intptr_t)MovePCtoLROffset - 4); + } + MCE.addRelocation(R); + + } else if (MO.isMachineBasicBlock()) { + unsigned Reloc = 0; + unsigned Opcode = MI.getOpcode(); + if (Opcode == PPC::B || Opcode == PPC::BL_Macho || + Opcode == PPC::BLA_Macho || Opcode == PPC::BL_ELF || + Opcode == PPC::BLA_ELF) + Reloc = PPC::reloc_pcrel_bx; + else // BCC instruction + Reloc = PPC::reloc_pcrel_bcx; + MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), + Reloc, + MO.getMachineBasicBlock())); + } else { + cerr << "ERROR: Unknown type of MachineOperand: " << MO << "\n"; + abort(); + } + + return rv; +} + +#include "PPCGenCodeEmitter.inc" + diff --git a/lib/Target/PowerPC/PPCFrameInfo.h b/lib/Target/PowerPC/PPCFrameInfo.h new file mode 100644 index 0000000..81365e9 --- /dev/null +++ b/lib/Target/PowerPC/PPCFrameInfo.h @@ -0,0 +1,93 @@ +//===-- PPCFrameInfo.h - Define TargetFrameInfo for PowerPC -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC_FRAMEINFO_H +#define POWERPC_FRAMEINFO_H + +#include "PPC.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class PPCFrameInfo: public TargetFrameInfo { + const TargetMachine &TM; + +public: + PPCFrameInfo(const TargetMachine &tm, bool LP64) + : TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), TM(tm) { + } + + /// getReturnSaveOffset - Return the previous frame offset to save the + /// return address. 
+ static unsigned getReturnSaveOffset(bool LP64, bool isMacho) { + if (isMacho) + return LP64 ? 16 : 8; + // For ELF 32 ABI: + return 4; + } + + /// getFramePointerSaveOffset - Return the previous frame offset to save the + /// frame pointer. + static unsigned getFramePointerSaveOffset(bool LP64, bool isMacho) { + // For MachO ABI: + // Use the TOC save slot in the PowerPC linkage area for saving the frame + // pointer (if needed.) LLVM does not generate code that uses the TOC (R2 + // is treated as a caller saved register.) + if (isMacho) + return LP64 ? 40 : 20; + + // For ELF 32 ABI: + // Save it right before the link register + return -4U; + } + + /// getLinkageSize - Return the size of the PowerPC ABI linkage area. + /// + static unsigned getLinkageSize(bool LP64, bool isMacho) { + if (isMacho) + return 6 * (LP64 ? 8 : 4); + + // For ELF 32 ABI: + return 8; + } + + /// getMinCallArgumentsSize - Return the size of the minimum PowerPC ABI + /// argument area. + static unsigned getMinCallArgumentsSize(bool LP64, bool isMacho) { + // For Macho ABI: + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if it is varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + if (isMacho) + return 8 * (LP64 ? 8 : 4); + + // For ELF 32 ABI: + // There is no default stack allocated for the 8 first GPR arguments. + return 0; + } + + /// getMinCallFrameSize - Return the minimum size a call frame can be using + /// the PowerPC ABI. + static unsigned getMinCallFrameSize(bool LP64, bool isMacho) { + // The call frame needs to be at least big enough for linkage and 8 args.
+ return getLinkageSize(LP64, isMacho) + + getMinCallArgumentsSize(LP64, isMacho); + } + +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp new file mode 100644 index 0000000..26e1f47 --- /dev/null +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -0,0 +1,303 @@ +//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements hazard recognizers for scheduling on PowerPC processors. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "pre-RA-sched" +#include "PPCHazardRecognizers.h" +#include "PPC.h" +#include "PPCInstrInfo.h" +#include "llvm/Support/Debug.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// PowerPC 970 Hazard Recognizer +// +// This models the dispatch group formation of the PPC970 processor. Dispatch +// groups are bundles of up to five instructions that can contain various mixes +// of instructions. The PPC970 can dispatch a peak of 4 non-branch and one +// branch instruction per-cycle. +// +// There are a number of restrictions to dispatch group formation: some +// instructions can only be issued in the first slot of a dispatch group, & some +// instructions fill an entire dispatch group. Additionally, only branches can +// issue in the 5th (last) slot. +// +// Finally, there are a number of "structural" hazards on the PPC970. These +// conditions cause large performance penalties due to misprediction, recovery, +// and replay logic that has to happen. 
These cases include setting a CTR and +// branching through it in the same dispatch group, and storing to an address, +// then loading from the same address within a dispatch group. To avoid these +// conditions, we insert no-op instructions when appropriate. +// +// FIXME: This is missing some significant cases: +// 1. Modeling of microcoded instructions. +// 2. Handling of serialized operations. +// 3. Handling of the esoteric cases in "Resource-based Instruction Grouping". +// + +PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii) + : TII(tii) { + EndDispatchGroup(); +} + +void PPCHazardRecognizer970::EndDispatchGroup() { + DOUT << "=== Start of dispatch group\n"; + NumIssued = 0; + + // Structural hazard info. + HasCTRSet = false; + NumStores = 0; +} + + +PPCII::PPC970_Unit +PPCHazardRecognizer970::GetInstrType(unsigned Opcode, + bool &isFirst, bool &isSingle, + bool &isCracked, + bool &isLoad, bool &isStore) { + if (Opcode < ISD::BUILTIN_OP_END) { + isFirst = isSingle = isCracked = isLoad = isStore = false; + return PPCII::PPC970_Pseudo; + } + Opcode -= ISD::BUILTIN_OP_END; + + const TargetInstrDescriptor &TID = TII.get(Opcode); + + isLoad = TID.Flags & M_LOAD_FLAG; + isStore = TID.Flags & M_STORE_FLAG; + + unsigned TSFlags = TID.TSFlags; + + isFirst = TSFlags & PPCII::PPC970_First; + isSingle = TSFlags & PPCII::PPC970_Single; + isCracked = TSFlags & PPCII::PPC970_Cracked; + return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask); +} + +/// isLoadOfStoredAddress - If we have a load from the previously stored pointer +/// as indicated by StorePtr1/StorePtr2/StoreSize, return true. +bool PPCHazardRecognizer970:: +isLoadOfStoredAddress(unsigned LoadSize, SDOperand Ptr1, SDOperand Ptr2) const { + for (unsigned i = 0, e = NumStores; i != e; ++i) { + // Handle exact and commuted addresses. 
+ if (Ptr1 == StorePtr1[i] && Ptr2 == StorePtr2[i]) + return true; + if (Ptr2 == StorePtr1[i] && Ptr1 == StorePtr2[i]) + return true; + + // Okay, we don't have an exact match, if this is an indexed offset, see if + // we have overlap (which happens during fp->int conversion for example). + if (StorePtr2[i] == Ptr2) { + if (ConstantSDNode *StoreOffset = dyn_cast<ConstantSDNode>(StorePtr1[i])) + if (ConstantSDNode *LoadOffset = dyn_cast<ConstantSDNode>(Ptr1)) { + // Okay the base pointers match, so we have [c1+r] vs [c2+r]. Check + // to see if the load and store actually overlap. + int StoreOffs = StoreOffset->getValue(); + int LoadOffs = LoadOffset->getValue(); + if (StoreOffs < LoadOffs) { + if (int(StoreOffs+StoreSize[i]) > LoadOffs) return true; + } else { + if (int(LoadOffs+LoadSize) > StoreOffs) return true; + } + } + } + } + return false; +} + +/// getHazardType - We return hazard for any non-branch instruction that would +/// terminate the dispatch group. We return NoopHazard for any +/// instructions that wouldn't terminate the dispatch group that would cause a +/// pipeline flush. +HazardRecognizer::HazardType PPCHazardRecognizer970:: +getHazardType(SDNode *Node) { + bool isFirst, isSingle, isCracked, isLoad, isStore; + PPCII::PPC970_Unit InstrType = + GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked, + isLoad, isStore); + if (InstrType == PPCII::PPC970_Pseudo) return NoHazard; + unsigned Opcode = Node->getOpcode()-ISD::BUILTIN_OP_END; + + // We can only issue a PPC970_First/PPC970_Single instruction (such as + // crand/mtspr/etc) if this is the first cycle of the dispatch group. + if (NumIssued != 0 && (isFirst || isSingle)) + return Hazard; + + // If this instruction is cracked into two ops by the decoder, we know that + // it is not a branch and that it cannot issue if 3 other instructions are + // already in the dispatch group.
+ if (isCracked && NumIssued > 2) + return Hazard; + + switch (InstrType) { + default: assert(0 && "Unknown instruction type!"); + case PPCII::PPC970_FXU: + case PPCII::PPC970_LSU: + case PPCII::PPC970_FPU: + case PPCII::PPC970_VALU: + case PPCII::PPC970_VPERM: + // We can only issue a branch as the last instruction in a group. + if (NumIssued == 4) return Hazard; + break; + case PPCII::PPC970_CRU: + // We can only issue a CR instruction in the first two slots. + if (NumIssued >= 2) return Hazard; + break; + case PPCII::PPC970_BRU: + break; + } + + // Do not allow MTCTR and BCTRL to be in the same dispatch group. + if (HasCTRSet && (Opcode == PPC::BCTRL_Macho || Opcode == PPC::BCTRL_ELF)) + return NoopHazard; + + // If this is a load following a store, make sure it's not to the same or + // overlapping address. + if (isLoad && NumStores) { + unsigned LoadSize; + switch (Opcode) { + default: assert(0 && "Unknown load!"); + case PPC::LBZ: case PPC::LBZU: + case PPC::LBZX: + case PPC::LBZ8: case PPC::LBZU8: + case PPC::LBZX8: + case PPC::LVEBX: + LoadSize = 1; + break; + case PPC::LHA: case PPC::LHAU: + case PPC::LHAX: + case PPC::LHZ: case PPC::LHZU: + case PPC::LHZX: + case PPC::LVEHX: + case PPC::LHBRX: + case PPC::LHA8: case PPC::LHAU8: + case PPC::LHAX8: + case PPC::LHZ8: case PPC::LHZU8: + case PPC::LHZX8: + LoadSize = 2; + break; + case PPC::LFS: case PPC::LFSU: + case PPC::LFSX: + case PPC::LWZ: case PPC::LWZU: + case PPC::LWZX: + case PPC::LWA: + case PPC::LWAX: + case PPC::LVEWX: + case PPC::LWBRX: + case PPC::LWZ8: + case PPC::LWZX8: + LoadSize = 4; + break; + case PPC::LFD: case PPC::LFDU: + case PPC::LFDX: + case PPC::LD: case PPC::LDU: + case PPC::LDX: + LoadSize = 8; + break; + case PPC::LVX: + LoadSize = 16; + break; + } + + if (isLoadOfStoredAddress(LoadSize, + Node->getOperand(0), Node->getOperand(1))) + return NoopHazard; + } + + return NoHazard; +} + +void PPCHazardRecognizer970::EmitInstruction(SDNode *Node) { + bool isFirst, isSingle, isCracked, 
isLoad, isStore; + PPCII::PPC970_Unit InstrType = + GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked, + isLoad, isStore); + if (InstrType == PPCII::PPC970_Pseudo) return; + unsigned Opcode = Node->getOpcode()-ISD::BUILTIN_OP_END; + + // Update structural hazard information. + if (Opcode == PPC::MTCTR) HasCTRSet = true; + + // Track the address stored to. + if (isStore) { + unsigned ThisStoreSize; + switch (Opcode) { + default: assert(0 && "Unknown store instruction!"); + case PPC::STB: case PPC::STB8: + case PPC::STBU: case PPC::STBU8: + case PPC::STBX: case PPC::STBX8: + case PPC::STVEBX: + ThisStoreSize = 1; + break; + case PPC::STH: case PPC::STH8: + case PPC::STHU: case PPC::STHU8: + case PPC::STHX: case PPC::STHX8: + case PPC::STVEHX: + case PPC::STHBRX: + ThisStoreSize = 2; + break; + case PPC::STFS: + case PPC::STFSU: + case PPC::STFSX: + case PPC::STWX: case PPC::STWX8: + case PPC::STWUX: + case PPC::STW: case PPC::STW8: + case PPC::STWU: case PPC::STWU8: + case PPC::STVEWX: + case PPC::STFIWX: + case PPC::STWBRX: + ThisStoreSize = 4; + break; + case PPC::STD_32: + case PPC::STDX_32: + case PPC::STD: + case PPC::STDU: + case PPC::STFD: + case PPC::STFDX: + case PPC::STDX: + case PPC::STDUX: + ThisStoreSize = 8; + break; + case PPC::STVX: + ThisStoreSize = 16; + break; + } + + StoreSize[NumStores] = ThisStoreSize; + StorePtr1[NumStores] = Node->getOperand(1); + StorePtr2[NumStores] = Node->getOperand(2); + ++NumStores; + } + + if (InstrType == PPCII::PPC970_BRU || isSingle) + NumIssued = 4; // Terminate a d-group. + ++NumIssued; + + // If this instruction is cracked into two ops by the decoder, remember that + // we issued two pieces. 
+ if (isCracked) + ++NumIssued; + + if (NumIssued == 5) + EndDispatchGroup(); +} + +void PPCHazardRecognizer970::AdvanceCycle() { + assert(NumIssued < 5 && "Illegal dispatch group!"); + ++NumIssued; + if (NumIssued == 5) + EndDispatchGroup(); +} + +void PPCHazardRecognizer970::EmitNoop() { + AdvanceCycle(); +} diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h new file mode 100644 index 0000000..cbff943 --- /dev/null +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -0,0 +1,73 @@ +//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling on PowerPC processors. +// +//===----------------------------------------------------------------------===// + +#ifndef PPCHAZRECS_H +#define PPCHAZRECS_H + +#include "llvm/CodeGen/ScheduleDAG.h" +#include "PPCInstrInfo.h" + +namespace llvm { + +/// PPCHazardRecognizer970 - This class defines a finite state automata that +/// models the dispatch logic on the PowerPC 970 (aka G5) processor. This +/// promotes good dispatch group formation and implements noop insertion to +/// avoid structural hazards that cause significant performance penalties (e.g. +/// setting the CTR register then branching through it within a dispatch group), +/// or storing then loading from the same address within a dispatch group. +class PPCHazardRecognizer970 : public HazardRecognizer { + const TargetInstrInfo &TII; + + unsigned NumIssued; // Number of insts issued, including advanced cycles. + + // Various things that can cause a structural hazard. + + // HasCTRSet - If the CTR register is set in this group, disallow BCTRL. 
+ bool HasCTRSet; + + // StoredPtr - Keep track of the address of any store. If we see a load from + // the same address (or one that aliases it), disallow the store. We can have + // up to four stores in one dispatch group, hence we track up to 4. + // + // This is null if we haven't seen a store yet. We keep track of both + // operands of the store here, since we support [r+r] and [r+i] addressing. + SDOperand StorePtr1[4], StorePtr2[4]; + unsigned StoreSize[4]; + unsigned NumStores; + +public: + PPCHazardRecognizer970(const TargetInstrInfo &TII); + virtual HazardType getHazardType(SDNode *Node); + virtual void EmitInstruction(SDNode *Node); + virtual void AdvanceCycle(); + virtual void EmitNoop(); + +private: + /// EndDispatchGroup - Called when we are finishing a new dispatch group. + /// + void EndDispatchGroup(); + + /// GetInstrType - Classify the specified powerpc opcode according to its + /// pipeline. + PPCII::PPC970_Unit GetInstrType(unsigned Opcode, + bool &isFirst, bool &isSingle,bool &isCracked, + bool &isLoad, bool &isStore); + + bool isLoadOfStoredAddress(unsigned LoadSize, + SDOperand Ptr1, SDOperand Ptr2) const; +}; + +} // end namespace llvm + +#endif + diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp new file mode 100644 index 0000000..730bac6 --- /dev/null +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -0,0 +1,1122 @@ +//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pattern matching instruction selector for PowerPC, +// converting from a legalized dag to a PPC dag. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ppc-codegen" +#include "PPC.h" +#include "PPCPredicates.h" +#include "PPCTargetMachine.h" +#include "PPCISelLowering.h" +#include "PPCHazardRecognizers.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SSARegMap.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Constants.h" +#include "llvm/GlobalValue.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/Compiler.h" +#include <queue> +#include <set> +using namespace llvm; + +namespace { + //===--------------------------------------------------------------------===// + /// PPCDAGToDAGISel - PPC specific code to select PPC machine + /// instructions for SelectionDAG operations. + /// + class VISIBILITY_HIDDEN PPCDAGToDAGISel : public SelectionDAGISel { + PPCTargetMachine &TM; + PPCTargetLowering PPCLowering; + unsigned GlobalBaseReg; + public: + PPCDAGToDAGISel(PPCTargetMachine &tm) + : SelectionDAGISel(PPCLowering), TM(tm), + PPCLowering(*TM.getTargetLowering()) {} + + virtual bool runOnFunction(Function &Fn) { + // Make sure we re-emit a set of the global base reg if necessary + GlobalBaseReg = 0; + SelectionDAGISel::runOnFunction(Fn); + + InsertVRSaveCode(Fn); + return true; + } + + /// getI32Imm - Return a target constant with the specified value, of type + /// i32. + inline SDOperand getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + /// getI64Imm - Return a target constant with the specified value, of type + /// i64. + inline SDOperand getI64Imm(uint64_t Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i64); + } + + /// getSmallIPtrImm - Return a target constant of pointer type. 
+ inline SDOperand getSmallIPtrImm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy()); + } + + /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s + /// with any number of 0s on either side. The 1s are allowed to wrap from + /// LSB to MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. + /// 0x0F0F0000 is not, since all 1s are not contiguous. + static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME); + + + /// isRotateAndMask - Returns true if Mask and Shift can be folded into a + /// rotate and mask opcode and mask operation. + static bool isRotateAndMask(SDNode *N, unsigned Mask, bool IsShiftMask, + unsigned &SH, unsigned &MB, unsigned &ME); + + /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC + /// base register. Return the virtual register that holds this value. + SDNode *getGlobalBaseReg(); + + // Select - Convert the specified operand from a target-independent to a + // target-specific node if it hasn't already been changed. + SDNode *Select(SDOperand Op); + + SDNode *SelectBitfieldInsert(SDNode *N); + + /// SelectCC - Select a comparison of the specified values with the + /// specified condition code, returning the CR# of the expression. + SDOperand SelectCC(SDOperand LHS, SDOperand RHS, ISD::CondCode CC); + + /// SelectAddrImm - Returns true if the address N can be represented by + /// a base register plus a signed 16-bit displacement [r+imm]. + bool SelectAddrImm(SDOperand Op, SDOperand N, SDOperand &Disp, + SDOperand &Base) { + return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG); + } + + /// SelectAddrImmOffs - Return true if the operand is valid for a preinc + /// immediate field. Because preinc imms have already been validated, just + /// accept it. 
+ bool SelectAddrImmOffs(SDOperand Op, SDOperand N, SDOperand &Out) const { + Out = N; + return true; + } + + /// SelectAddrIdx - Given the specified addressed, check to see if it can be + /// represented as an indexed [r+r] operation. Returns false if it can + /// be represented by [r+imm], which are preferred. + bool SelectAddrIdx(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Index) { + return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG); + } + + /// SelectAddrIdxOnly - Given the specified addressed, force it to be + /// represented as an indexed [r+r] operation. + bool SelectAddrIdxOnly(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Index) { + return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG); + } + + /// SelectAddrImmShift - Returns true if the address N can be represented by + /// a base register plus a signed 14-bit displacement [r+imm*4]. Suitable + /// for use by STD and friends. + bool SelectAddrImmShift(SDOperand Op, SDOperand N, SDOperand &Disp, + SDOperand &Base) { + return PPCLowering.SelectAddressRegImmShift(N, Disp, Base, *CurDAG); + } + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op, + char ConstraintCode, + std::vector<SDOperand> &OutOps, + SelectionDAG &DAG) { + SDOperand Op0, Op1; + switch (ConstraintCode) { + default: return true; + case 'm': // memory + if (!SelectAddrIdx(Op, Op, Op0, Op1)) + SelectAddrImm(Op, Op, Op0, Op1); + break; + case 'o': // offsetable + if (!SelectAddrImm(Op, Op, Op0, Op1)) { + Op0 = Op; + AddToISelQueue(Op0); // r+0. 
+ Op1 = getSmallIPtrImm(0); + } + break; + case 'v': // not offsetable + SelectAddrIdxOnly(Op, Op, Op0, Op1); + break; + } + + OutOps.push_back(Op0); + OutOps.push_back(Op1); + return false; + } + + SDOperand BuildSDIVSequence(SDNode *N); + SDOperand BuildUDIVSequence(SDNode *N); + + /// InstructionSelectBasicBlock - This callback is invoked by + /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. + virtual void InstructionSelectBasicBlock(SelectionDAG &DAG); + + void InsertVRSaveCode(Function &Fn); + + virtual const char *getPassName() const { + return "PowerPC DAG->DAG Pattern Instruction Selection"; + } + + /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for + /// this target when scheduling the DAG. + virtual HazardRecognizer *CreateTargetHazardRecognizer() { + // Should use subtarget info to pick the right hazard recognizer. For + // now, always return a PPC970 recognizer. + const TargetInstrInfo *II = PPCLowering.getTargetMachine().getInstrInfo(); + assert(II && "No InstrInfo?"); + return new PPCHazardRecognizer970(*II); + } + +// Include the pieces autogenerated from the target description. +#include "PPCGenDAGISel.inc" + +private: + SDNode *SelectSETCC(SDOperand Op); + }; +} + +/// InstructionSelectBasicBlock - This callback is invoked by +/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. +void PPCDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) { + DEBUG(BB->dump()); + + // Select target instructions for the DAG. + DAG.setRoot(SelectRoot(DAG.getRoot())); + DAG.RemoveDeadNodes(); + + // Emit machine code to BB. + ScheduleAndEmitDAG(DAG); +} + +/// InsertVRSaveCode - Once the entire function has been instruction selected, +/// all virtual registers are created and all machine instructions are built, +/// check to see if we need to save/restore VRSAVE. If so, do it. 
+void PPCDAGToDAGISel::InsertVRSaveCode(Function &F) { + // Check to see if this function uses vector registers, which means we have to + // save and restore the VRSAVE register and update it with the regs we use. + // + // In this case, there will be virtual registers of vector type type created + // by the scheduler. Detect them now. + MachineFunction &Fn = MachineFunction::get(&F); + SSARegMap *RegMap = Fn.getSSARegMap(); + bool HasVectorVReg = false; + for (unsigned i = MRegisterInfo::FirstVirtualRegister, + e = RegMap->getLastVirtReg()+1; i != e; ++i) + if (RegMap->getRegClass(i) == &PPC::VRRCRegClass) { + HasVectorVReg = true; + break; + } + if (!HasVectorVReg) return; // nothing to do. + + // If we have a vector register, we want to emit code into the entry and exit + // blocks to save and restore the VRSAVE register. We do this here (instead + // of marking all vector instructions as clobbering VRSAVE) for two reasons: + // + // 1. This (trivially) reduces the load on the register allocator, by not + // having to represent the live range of the VRSAVE register. + // 2. This (more significantly) allows us to create a temporary virtual + // register to hold the saved VRSAVE value, allowing this temporary to be + // register allocated, instead of forcing it to be spilled to the stack. + + // Create two vregs - one to hold the VRSAVE register that is live-in to the + // function and one for the value after having bits or'd into it. 
+ unsigned InVRSAVE = RegMap->createVirtualRegister(&PPC::GPRCRegClass); + unsigned UpdatedVRSAVE = RegMap->createVirtualRegister(&PPC::GPRCRegClass); + + const TargetInstrInfo &TII = *TM.getInstrInfo(); + MachineBasicBlock &EntryBB = *Fn.begin(); + // Emit the following code into the entry block: + // InVRSAVE = MFVRSAVE + // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE + // MTVRSAVE UpdatedVRSAVE + MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point + BuildMI(EntryBB, IP, TII.get(PPC::MFVRSAVE), InVRSAVE); + BuildMI(EntryBB, IP, TII.get(PPC::UPDATE_VRSAVE), UpdatedVRSAVE).addReg(InVRSAVE); + BuildMI(EntryBB, IP, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE); + + // Find all return blocks, outputting a restore in each epilog. + for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { + if (!BB->empty() && TII.isReturn(BB->back().getOpcode())) { + IP = BB->end(); --IP; + + // Skip over all terminator instructions, which are part of the return + // sequence. + MachineBasicBlock::iterator I2 = IP; + while (I2 != BB->begin() && TII.isTerminatorInstr((--I2)->getOpcode())) + IP = I2; + + // Emit: MTVRSAVE InVRSave + BuildMI(*BB, IP, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE); + } + } +} + + +/// getGlobalBaseReg - Output the instructions required to put the +/// base address to use for accessing globals into a register. 
+/// +SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { + if (!GlobalBaseReg) { + const TargetInstrInfo &TII = *TM.getInstrInfo(); + // Insert the set of GlobalBaseReg into the first MBB of the function + MachineBasicBlock &FirstMBB = BB->getParent()->front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + SSARegMap *RegMap = BB->getParent()->getSSARegMap(); + + if (PPCLowering.getPointerTy() == MVT::i32) { + GlobalBaseReg = RegMap->createVirtualRegister(PPC::GPRCRegisterClass); + BuildMI(FirstMBB, MBBI, TII.get(PPC::MovePCtoLR), PPC::LR); + BuildMI(FirstMBB, MBBI, TII.get(PPC::MFLR), GlobalBaseReg); + } else { + GlobalBaseReg = RegMap->createVirtualRegister(PPC::G8RCRegisterClass); + BuildMI(FirstMBB, MBBI, TII.get(PPC::MovePCtoLR8), PPC::LR8); + BuildMI(FirstMBB, MBBI, TII.get(PPC::MFLR8), GlobalBaseReg); + } + } + return CurDAG->getRegister(GlobalBaseReg, PPCLowering.getPointerTy()).Val; +} + +/// isIntS16Immediate - This method tests to see if the node is either a 32-bit +/// or 64-bit immediate, and if the value can be accurately represented as a +/// sign extension from a 16-bit value. If so, this returns true and the +/// immediate. +static bool isIntS16Immediate(SDNode *N, short &Imm) { + if (N->getOpcode() != ISD::Constant) + return false; + + Imm = (short)cast<ConstantSDNode>(N)->getValue(); + if (N->getValueType(0) == MVT::i32) + return Imm == (int32_t)cast<ConstantSDNode>(N)->getValue(); + else + return Imm == (int64_t)cast<ConstantSDNode>(N)->getValue(); +} + +static bool isIntS16Immediate(SDOperand Op, short &Imm) { + return isIntS16Immediate(Op.Val, Imm); +} + + +/// isInt32Immediate - This method tests to see if the node is a 32-bit constant +/// operand. If so Imm will receive the 32-bit value. 
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) { + if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { + Imm = cast<ConstantSDNode>(N)->getValue(); + return true; + } + return false; +} + +/// isInt64Immediate - This method tests to see if the node is a 64-bit constant +/// operand. If so Imm will receive the 64-bit value. +static bool isInt64Immediate(SDNode *N, uint64_t &Imm) { + if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) { + Imm = cast<ConstantSDNode>(N)->getValue(); + return true; + } + return false; +} + +// isInt32Immediate - This method tests to see if a constant operand. +// If so Imm will receive the 32 bit value. +static bool isInt32Immediate(SDOperand N, unsigned &Imm) { + return isInt32Immediate(N.Val, Imm); +} + + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) { + return N->getOpcode() == Opc && isInt32Immediate(N->getOperand(1).Val, Imm); +} + +bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { + if (isShiftedMask_32(Val)) { + // look for the first non-zero bit + MB = CountLeadingZeros_32(Val); + // look for the first zero bit after the run of ones + ME = CountLeadingZeros_32((Val - 1) ^ Val); + return true; + } else { + Val = ~Val; // invert mask + if (isShiftedMask_32(Val)) { + // effectively look for the first zero bit + ME = CountLeadingZeros_32(Val) - 1; + // effectively look for the first one bit after the run of zeros + MB = CountLeadingZeros_32((Val - 1) ^ Val) + 1; + return true; + } + } + // no run present + return false; +} + +bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask, + bool IsShiftMask, unsigned &SH, + unsigned &MB, unsigned &ME) { + // Don't even go down this path for i64, since different logic will be + // necessary for 
rldicl/rldicr/rldimi. + if (N->getValueType(0) != MVT::i32) + return false; + + unsigned Shift = 32; + unsigned Indeterminant = ~0; // bit mask marking indeterminant results + unsigned Opcode = N->getOpcode(); + if (N->getNumOperands() != 2 || + !isInt32Immediate(N->getOperand(1).Val, Shift) || (Shift > 31)) + return false; + + if (Opcode == ISD::SHL) { + // apply shift left to mask if it comes first + if (IsShiftMask) Mask = Mask << Shift; + // determine which bits are made indeterminant by shift + Indeterminant = ~(0xFFFFFFFFu << Shift); + } else if (Opcode == ISD::SRL) { + // apply shift right to mask if it comes first + if (IsShiftMask) Mask = Mask >> Shift; + // determine which bits are made indeterminant by shift + Indeterminant = ~(0xFFFFFFFFu >> Shift); + // adjust for the left rotate + Shift = 32 - Shift; + } else if (Opcode == ISD::ROTL) { + Indeterminant = 0; + } else { + return false; + } + + // if the mask doesn't intersect any Indeterminant bits + if (Mask && !(Mask & Indeterminant)) { + SH = Shift & 31; + // make sure the mask is still a mask (wrap arounds may not be) + return isRunOfOnes(Mask, MB, ME); + } + return false; +} + +/// SelectBitfieldInsert - turn an or of two masked values into +/// the rotate left word immediate then mask insert (rlwimi) instruction. +SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { + SDOperand Op0 = N->getOperand(0); + SDOperand Op1 = N->getOperand(1); + + uint64_t LKZ, LKO, RKZ, RKO; + CurDAG->ComputeMaskedBits(Op0, 0xFFFFFFFFULL, LKZ, LKO); + CurDAG->ComputeMaskedBits(Op1, 0xFFFFFFFFULL, RKZ, RKO); + + unsigned TargetMask = LKZ; + unsigned InsertMask = RKZ; + + if ((TargetMask | InsertMask) == 0xFFFFFFFF) { + unsigned Op0Opc = Op0.getOpcode(); + unsigned Op1Opc = Op1.getOpcode(); + unsigned Value, SH = 0; + TargetMask = ~TargetMask; + InsertMask = ~InsertMask; + + // If the LHS has a foldable shift and the RHS does not, then swap it to the + // RHS so that we can fold the shift into the insert. 
+ if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) { + if (Op0.getOperand(0).getOpcode() == ISD::SHL || + Op0.getOperand(0).getOpcode() == ISD::SRL) { + if (Op1.getOperand(0).getOpcode() != ISD::SHL && + Op1.getOperand(0).getOpcode() != ISD::SRL) { + std::swap(Op0, Op1); + std::swap(Op0Opc, Op1Opc); + std::swap(TargetMask, InsertMask); + } + } + } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) { + if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL && + Op1.getOperand(0).getOpcode() != ISD::SRL) { + std::swap(Op0, Op1); + std::swap(Op0Opc, Op1Opc); + std::swap(TargetMask, InsertMask); + } + } + + unsigned MB, ME; + if (InsertMask && isRunOfOnes(InsertMask, MB, ME)) { + SDOperand Tmp1, Tmp2, Tmp3; + bool DisjointMask = (TargetMask ^ InsertMask) == 0xFFFFFFFF; + + if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) && + isInt32Immediate(Op1.getOperand(1), Value)) { + Op1 = Op1.getOperand(0); + SH = (Op1Opc == ISD::SHL) ? Value : 32 - Value; + } + if (Op1Opc == ISD::AND) { + unsigned SHOpc = Op1.getOperand(0).getOpcode(); + if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && + isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) { + Op1 = Op1.getOperand(0).getOperand(0); + SH = (SHOpc == ISD::SHL) ? Value : 32 - Value; + } else { + Op1 = Op1.getOperand(0); + } + } + + Tmp3 = (Op0Opc == ISD::AND && DisjointMask) ? Op0.getOperand(0) : Op0; + AddToISelQueue(Tmp3); + AddToISelQueue(Op1); + SH &= 31; + SDOperand Ops[] = { Tmp3, Op1, getI32Imm(SH), getI32Imm(MB), + getI32Imm(ME) }; + return CurDAG->getTargetNode(PPC::RLWIMI, MVT::i32, Ops, 5); + } + } + return 0; +} + +/// SelectCC - Select a comparison of the specified values with the specified +/// condition code, returning the CR# of the expression. +SDOperand PPCDAGToDAGISel::SelectCC(SDOperand LHS, SDOperand RHS, + ISD::CondCode CC) { + // Always select the LHS. 
+ AddToISelQueue(LHS); + unsigned Opc; + + if (LHS.getValueType() == MVT::i32) { + unsigned Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt32Immediate(RHS, Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. + if (isUInt16(Imm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPLWI, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt16((int)Imm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPWI, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpw cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmplwi cr0,r0,0x5678 + // beq cr0,L6 + SDOperand Xor(CurDAG->getTargetNode(PPC::XORIS, MVT::i32, LHS, + getI32Imm(Imm >> 16)), 0); + return SDOperand(CurDAG->getTargetNode(PPC::CMPLWI, MVT::i32, Xor, + getI32Imm(Imm & 0xFFFF)), 0); + } + Opc = PPC::CMPLW; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt32Immediate(RHS, Imm) && isUInt16(Imm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPLWI, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + Opc = PPC::CMPLW; + } else { + short SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPWI, MVT::i32, LHS, + getI32Imm((int)SImm & 0xFFFF)), + 0); + Opc = PPC::CMPW; + } + } else if (LHS.getValueType() == MVT::i64) { + uint64_t Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt64Immediate(RHS.Val, Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. + if (isUInt16(Imm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPLDI, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + // If this is a 16-bit signed immediate, fold it. 
+ if (isInt16(Imm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPDI, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpd cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmpldi cr0,r0,0x5678 + // beq cr0,L6 + if (isUInt32(Imm)) { + SDOperand Xor(CurDAG->getTargetNode(PPC::XORIS8, MVT::i64, LHS, + getI64Imm(Imm >> 16)), 0); + return SDOperand(CurDAG->getTargetNode(PPC::CMPLDI, MVT::i64, Xor, + getI64Imm(Imm & 0xFFFF)), 0); + } + } + Opc = PPC::CMPLD; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt64Immediate(RHS.Val, Imm) && isUInt16(Imm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPLDI, MVT::i64, LHS, + getI64Imm(Imm & 0xFFFF)), 0); + Opc = PPC::CMPLD; + } else { + short SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDOperand(CurDAG->getTargetNode(PPC::CMPDI, MVT::i64, LHS, + getI64Imm(SImm & 0xFFFF)), + 0); + Opc = PPC::CMPD; + } + } else if (LHS.getValueType() == MVT::f32) { + Opc = PPC::FCMPUS; + } else { + assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); + Opc = PPC::FCMPUD; + } + AddToISelQueue(RHS); + return SDOperand(CurDAG->getTargetNode(Opc, MVT::i32, LHS, RHS), 0); +} + +static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { + switch (CC) { + default: assert(0 && "Unknown condition!"); abort(); + case ISD::SETOEQ: // FIXME: This is incorrect see PR642. + case ISD::SETUEQ: + case ISD::SETEQ: return PPC::PRED_EQ; + case ISD::SETONE: // FIXME: This is incorrect see PR642. + case ISD::SETUNE: + case ISD::SETNE: return PPC::PRED_NE; + case ISD::SETOLT: // FIXME: This is incorrect see PR642. + case ISD::SETULT: + case ISD::SETLT: return PPC::PRED_LT; + case ISD::SETOLE: // FIXME: This is incorrect see PR642. 
+ case ISD::SETULE: + case ISD::SETLE: return PPC::PRED_LE; + case ISD::SETOGT: // FIXME: This is incorrect see PR642. + case ISD::SETUGT: + case ISD::SETGT: return PPC::PRED_GT; + case ISD::SETOGE: // FIXME: This is incorrect see PR642. + case ISD::SETUGE: + case ISD::SETGE: return PPC::PRED_GE; + + case ISD::SETO: return PPC::PRED_NU; + case ISD::SETUO: return PPC::PRED_UN; + } +} + +/// getCRIdxForSetCC - Return the index of the condition register field +/// associated with the SetCC condition, and whether or not the field is +/// treated as inverted. That is, lt = 0; ge = 0 inverted. +static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool& Inv) { + switch (CC) { + default: assert(0 && "Unknown condition!"); abort(); + case ISD::SETOLT: // FIXME: This is incorrect see PR642. + case ISD::SETULT: + case ISD::SETLT: Inv = false; return 0; + case ISD::SETOGE: // FIXME: This is incorrect see PR642. + case ISD::SETUGE: + case ISD::SETGE: Inv = true; return 0; + case ISD::SETOGT: // FIXME: This is incorrect see PR642. + case ISD::SETUGT: + case ISD::SETGT: Inv = false; return 1; + case ISD::SETOLE: // FIXME: This is incorrect see PR642. + case ISD::SETULE: + case ISD::SETLE: Inv = true; return 1; + case ISD::SETOEQ: // FIXME: This is incorrect see PR642. + case ISD::SETUEQ: + case ISD::SETEQ: Inv = false; return 2; + case ISD::SETONE: // FIXME: This is incorrect see PR642. + case ISD::SETUNE: + case ISD::SETNE: Inv = true; return 2; + case ISD::SETO: Inv = true; return 3; + case ISD::SETUO: Inv = false; return 3; + } + return 0; +} + +SDNode *PPCDAGToDAGISel::SelectSETCC(SDOperand Op) { + SDNode *N = Op.Val; + unsigned Imm; + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + if (isInt32Immediate(N->getOperand(1), Imm)) { + // We can codegen setcc op, imm very efficiently compared to a brcond. + // Check for those cases here. 
+ // setcc op, 0 + if (Imm == 0) { + SDOperand Op = N->getOperand(0); + AddToISelQueue(Op); + switch (CC) { + default: break; + case ISD::SETEQ: { + Op = SDOperand(CurDAG->getTargetNode(PPC::CNTLZW, MVT::i32, Op), 0); + SDOperand Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + case ISD::SETNE: { + SDOperand AD = + SDOperand(CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag, + Op, getI32Imm(~0U)), 0); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, + AD.getValue(1)); + } + case ISD::SETLT: { + SDOperand Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + case ISD::SETGT: { + SDOperand T = + SDOperand(CurDAG->getTargetNode(PPC::NEG, MVT::i32, Op), 0); + T = SDOperand(CurDAG->getTargetNode(PPC::ANDC, MVT::i32, T, Op), 0); + SDOperand Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + } + } else if (Imm == ~0U) { // setcc op, -1 + SDOperand Op = N->getOperand(0); + AddToISelQueue(Op); + switch (CC) { + default: break; + case ISD::SETEQ: + Op = SDOperand(CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag, + Op, getI32Imm(1)), 0); + return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDOperand(CurDAG->getTargetNode(PPC::LI, MVT::i32, + getI32Imm(0)), 0), + Op.getValue(1)); + case ISD::SETNE: { + Op = SDOperand(CurDAG->getTargetNode(PPC::NOR, MVT::i32, Op, Op), 0); + SDNode *AD = CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag, + Op, getI32Imm(~0U)); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDOperand(AD, 0), + Op, SDOperand(AD, 1)); + } + case ISD::SETLT: { + SDOperand AD = SDOperand(CurDAG->getTargetNode(PPC::ADDI, MVT::i32, Op, + getI32Imm(1)), 0); + SDOperand AN = SDOperand(CurDAG->getTargetNode(PPC::AND, MVT::i32, AD, + Op), 0); + SDOperand Ops[] = { AN, getI32Imm(1), getI32Imm(31), 
getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + case ISD::SETGT: { + SDOperand Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; + Op = SDOperand(CurDAG->getTargetNode(PPC::RLWINM, MVT::i32, Ops, 4), 0); + return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, + getI32Imm(1)); + } + } + } + } + + bool Inv; + unsigned Idx = getCRIdxForSetCC(CC, Inv); + SDOperand CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC); + SDOperand IntCR; + + // Force the ccreg into CR7. + SDOperand CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); + + SDOperand InFlag(0, 0); // Null incoming flag value. + CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), CR7Reg, CCReg, + InFlag).getValue(1); + + if (TLI.getTargetMachine().getSubtarget<PPCSubtarget>().isGigaProcessor()) + IntCR = SDOperand(CurDAG->getTargetNode(PPC::MFOCRF, MVT::i32, CR7Reg, + CCReg), 0); + else + IntCR = SDOperand(CurDAG->getTargetNode(PPC::MFCR, MVT::i32, CCReg), 0); + + SDOperand Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31), + getI32Imm(31), getI32Imm(31) }; + if (!Inv) { + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } else { + SDOperand Tmp = + SDOperand(CurDAG->getTargetNode(PPC::RLWINM, MVT::i32, Ops, 4), 0); + return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1)); + } +} + + +// Select - Convert the specified operand from a target-independent to a +// target-specific node if it hasn't already been changed. +SDNode *PPCDAGToDAGISel::Select(SDOperand Op) { + SDNode *N = Op.Val; + if (N->getOpcode() >= ISD::BUILTIN_OP_END && + N->getOpcode() < PPCISD::FIRST_NUMBER) + return NULL; // Already selected. + + switch (N->getOpcode()) { + default: break; + + case ISD::Constant: { + if (N->getValueType(0) == MVT::i64) { + // Get 64 bit value. + int64_t Imm = cast<ConstantSDNode>(N)->getValue(); + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. 
+ unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt32(Imm)) { + Shift = CountTrailingZeros_64(Imm); + int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt32(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + SDNode *Result; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + // Simple value. + if (isInt16(Imm)) { + // Just the Lo bits. + Result = CurDAG->getTargetNode(PPC::LI8, MVT::i64, getI32Imm(Lo)); + } else if (Lo) { + // Handle the Hi bits. + unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getTargetNode(OpC, MVT::i64, getI32Imm(Hi)); + // And Lo bits. + Result = CurDAG->getTargetNode(PPC::ORI8, MVT::i64, + SDOperand(Result, 0), getI32Imm(Lo)); + } else { + // Just the Hi bits. + Result = CurDAG->getTargetNode(PPC::LIS8, MVT::i64, getI32Imm(Hi)); + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) { + Result = CurDAG->getTargetNode(PPC::RLDICR, MVT::i64, + SDOperand(Result, 0), + getI32Imm(Shift), getI32Imm(63 - Shift)); + } + + // Add in the last bits as required. + if ((Hi = (Remainder >> 16) & 0xFFFF)) { + Result = CurDAG->getTargetNode(PPC::ORIS8, MVT::i64, + SDOperand(Result, 0), getI32Imm(Hi)); + } + if ((Lo = Remainder & 0xFFFF)) { + Result = CurDAG->getTargetNode(PPC::ORI8, MVT::i64, + SDOperand(Result, 0), getI32Imm(Lo)); + } + + return Result; + } + break; + } + + case ISD::SETCC: + return SelectSETCC(Op); + case PPCISD::GlobalBaseReg: + return getGlobalBaseReg(); + + case ISD::FrameIndex: { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDOperand TFI = CurDAG->getTargetFrameIndex(FI, Op.getValueType()); + unsigned Opc = Op.getValueType() == MVT::i32 ? 
PPC::ADDI : PPC::ADDI8; + if (N->hasOneUse()) + return CurDAG->SelectNodeTo(N, Opc, Op.getValueType(), TFI, + getSmallIPtrImm(0)); + return CurDAG->getTargetNode(Opc, Op.getValueType(), TFI, + getSmallIPtrImm(0)); + } + + case PPCISD::MFCR: { + SDOperand InFlag = N->getOperand(1); + AddToISelQueue(InFlag); + // Use MFOCRF if supported. + if (TLI.getTargetMachine().getSubtarget<PPCSubtarget>().isGigaProcessor()) + return CurDAG->getTargetNode(PPC::MFOCRF, MVT::i32, + N->getOperand(0), InFlag); + else + return CurDAG->getTargetNode(PPC::MFCR, MVT::i32, InFlag); + } + + case ISD::SDIV: { + // FIXME: since this depends on the setting of the carry flag from the srawi + // we should really be making notes about that for the scheduler. + // FIXME: It sure would be nice if we could cheaply recognize the + // srl/add/sra pattern the dag combiner will generate for this as + // sra/addze rather than having to handle sdiv ourselves. oh well. + unsigned Imm; + if (isInt32Immediate(N->getOperand(1), Imm)) { + SDOperand N0 = N->getOperand(0); + AddToISelQueue(N0); + if ((signed)Imm > 0 && isPowerOf2_32(Imm)) { + SDNode *Op = + CurDAG->getTargetNode(PPC::SRAWI, MVT::i32, MVT::Flag, + N0, getI32Imm(Log2_32(Imm))); + return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDOperand(Op, 0), SDOperand(Op, 1)); + } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) { + SDNode *Op = + CurDAG->getTargetNode(PPC::SRAWI, MVT::i32, MVT::Flag, + N0, getI32Imm(Log2_32(-Imm))); + SDOperand PT = + SDOperand(CurDAG->getTargetNode(PPC::ADDZE, MVT::i32, + SDOperand(Op, 0), SDOperand(Op, 1)), + 0); + return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT); + } + } + + // Other cases are autogenerated. + break; + } + + case ISD::LOAD: { + // Handle preincrement loads. + LoadSDNode *LD = cast<LoadSDNode>(Op); + MVT::ValueType LoadedVT = LD->getLoadedVT(); + + // Normal loads are handled by code generated from the .td file. 
+ if (LD->getAddressingMode() != ISD::PRE_INC) + break; + + SDOperand Offset = LD->getOffset(); + if (isa<ConstantSDNode>(Offset) || + Offset.getOpcode() == ISD::TargetGlobalAddress) { + + unsigned Opcode; + bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; + if (LD->getValueType(0) != MVT::i64) { + // Handle PPC32 integer and normal FP loads. + assert(!isSExt || LoadedVT == MVT::i16 && "Invalid sext update load"); + switch (LoadedVT) { + default: assert(0 && "Invalid PPC load type!"); + case MVT::f64: Opcode = PPC::LFDU; break; + case MVT::f32: Opcode = PPC::LFSU; break; + case MVT::i32: Opcode = PPC::LWZU; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZU; break; + } + } else { + assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); + assert(!isSExt || LoadedVT == MVT::i16 && "Invalid sext update load"); + switch (LoadedVT) { + default: assert(0 && "Invalid PPC load type!"); + case MVT::i64: Opcode = PPC::LDU; break; + case MVT::i32: Opcode = PPC::LWZU8; break; + case MVT::i16: Opcode = isSExt ? 
PPC::LHAU8 : PPC::LHZU8; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZU8; break; + } + } + + SDOperand Chain = LD->getChain(); + SDOperand Base = LD->getBasePtr(); + AddToISelQueue(Chain); + AddToISelQueue(Base); + AddToISelQueue(Offset); + SDOperand Ops[] = { Offset, Base, Chain }; + // FIXME: PPC64 + return CurDAG->getTargetNode(Opcode, MVT::i32, MVT::i32, + MVT::Other, Ops, 3); + } else { + assert(0 && "R+R preindex loads not supported yet!"); + } + } + + case ISD::AND: { + unsigned Imm, Imm2, SH, MB, ME; + + // If this is an and of a value rotated between 0 and 31 bits and then and'd + // with a mask, emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm) && + isRotateAndMask(N->getOperand(0).Val, Imm, false, SH, MB, ME)) { + SDOperand Val = N->getOperand(0).getOperand(0); + AddToISelQueue(Val); + SDOperand Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + // If this is just a masked value where the input is not handled above, and + // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm) && + isRunOfOnes(Imm, MB, ME) && + N->getOperand(0).getOpcode() != ISD::ROTL) { + SDOperand Val = N->getOperand(0); + AddToISelQueue(Val); + SDOperand Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + // AND X, 0 -> 0, not "rlwinm 32". + if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { + AddToISelQueue(N->getOperand(1)); + ReplaceUses(SDOperand(N, 0), N->getOperand(1)); + return NULL; + } + // ISD::OR doesn't get all the bitfield insertion fun. 
+ // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert + if (isInt32Immediate(N->getOperand(1), Imm) && + N->getOperand(0).getOpcode() == ISD::OR && + isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) { + unsigned MB, ME; + Imm = ~(Imm^Imm2); + if (isRunOfOnes(Imm, MB, ME)) { + AddToISelQueue(N->getOperand(0).getOperand(0)); + AddToISelQueue(N->getOperand(0).getOperand(1)); + SDOperand Ops[] = { N->getOperand(0).getOperand(0), + N->getOperand(0).getOperand(1), + getI32Imm(0), getI32Imm(MB),getI32Imm(ME) }; + return CurDAG->getTargetNode(PPC::RLWIMI, MVT::i32, Ops, 5); + } + } + + // Other cases are autogenerated. + break; + } + case ISD::OR: + if (N->getValueType(0) == MVT::i32) + if (SDNode *I = SelectBitfieldInsert(N)) + return I; + + // Other cases are autogenerated. + break; + case ISD::SHL: { + unsigned Imm, SH, MB, ME; + if (isOpcWithIntImmediate(N->getOperand(0).Val, ISD::AND, Imm) && + isRotateAndMask(N, Imm, true, SH, MB, ME)) { + AddToISelQueue(N->getOperand(0).getOperand(0)); + SDOperand Ops[] = { N->getOperand(0).getOperand(0), + getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + + // Other cases are autogenerated. + break; + } + case ISD::SRL: { + unsigned Imm, SH, MB, ME; + if (isOpcWithIntImmediate(N->getOperand(0).Val, ISD::AND, Imm) && + isRotateAndMask(N, Imm, true, SH, MB, ME)) { + AddToISelQueue(N->getOperand(0).getOperand(0)); + SDOperand Ops[] = { N->getOperand(0).getOperand(0), + getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4); + } + + // Other cases are autogenerated. + break; + } + case ISD::SELECT_CC: { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + + // Handle the setcc cases here. 
select_cc lhs, 0, 1, 0, cc + if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1))) + if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3))) + if (N1C->isNullValue() && N3C->isNullValue() && + N2C->getValue() == 1ULL && CC == ISD::SETNE && + // FIXME: Implement this optzn for PPC64. + N->getValueType(0) == MVT::i32) { + AddToISelQueue(N->getOperand(0)); + SDNode *Tmp = + CurDAG->getTargetNode(PPC::ADDIC, MVT::i32, MVT::Flag, + N->getOperand(0), getI32Imm(~0U)); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, + SDOperand(Tmp, 0), N->getOperand(0), + SDOperand(Tmp, 1)); + } + + SDOperand CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC); + unsigned BROpc = getPredicateForSetCC(CC); + + unsigned SelectCCOp; + if (N->getValueType(0) == MVT::i32) + SelectCCOp = PPC::SELECT_CC_I4; + else if (N->getValueType(0) == MVT::i64) + SelectCCOp = PPC::SELECT_CC_I8; + else if (N->getValueType(0) == MVT::f32) + SelectCCOp = PPC::SELECT_CC_F4; + else if (N->getValueType(0) == MVT::f64) + SelectCCOp = PPC::SELECT_CC_F8; + else + SelectCCOp = PPC::SELECT_CC_VRRC; + + AddToISelQueue(N->getOperand(2)); + AddToISelQueue(N->getOperand(3)); + SDOperand Ops[] = { CCReg, N->getOperand(2), N->getOperand(3), + getI32Imm(BROpc) }; + return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4); + } + case PPCISD::COND_BRANCH: { + AddToISelQueue(N->getOperand(0)); // Op #0 is the Chain. + // Op #1 is the PPC::PRED_* number. + // Op #2 is the CR# + // Op #3 is the Dest MBB + AddToISelQueue(N->getOperand(4)); // Op #4 is the Flag. + // Prevent PPC::PRED_* from being selected into LI. 
+ SDOperand Pred = + getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getValue()); + SDOperand Ops[] = { Pred, N->getOperand(2), N->getOperand(3), + N->getOperand(0), N->getOperand(4) }; + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5); + } + case ISD::BR_CC: { + AddToISelQueue(N->getOperand(0)); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + SDOperand CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC); + SDOperand Ops[] = { getI32Imm(getPredicateForSetCC(CC)), CondCode, + N->getOperand(4), N->getOperand(0) }; + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4); + } + case ISD::BRIND: { + // FIXME: Should custom lower this. + SDOperand Chain = N->getOperand(0); + SDOperand Target = N->getOperand(1); + AddToISelQueue(Chain); + AddToISelQueue(Target); + unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8; + Chain = SDOperand(CurDAG->getTargetNode(Opc, MVT::Other, Target, + Chain), 0); + return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain); + } + } + + return SelectCode(Op); +} + + + +/// createPPCISelDag - This pass converts a legalized DAG into a +/// PowerPC-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { + return new PPCDAGToDAGISel(TM); +} + diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp new file mode 100644 index 0000000..6c2f383 --- /dev/null +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -0,0 +1,3451 @@ +//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPCISelLowering class. 
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPredicates.h"
#include "PPCTargetMachine.h"
#include "PPCPerfectShuffle.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Command-line switch gating the experimental pre-increment load/store
// selection (read elsewhere in this file when forming indexed addresses).
static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
cl::desc("enable preincrement load/store generation on PPC (experimental)"),
                                     cl::Hidden);

/// PPCTargetLowering - Tell the SelectionDAG legalizer how every IR-level
/// operation is handled on PowerPC: which MVTs live in which register
/// classes, which nodes are natively Legal, which must be Expanded into
/// simpler nodes, and which get Custom lowering in this file.  Order
/// matters: a later setOperationAction for the same (op, VT) pair overrides
/// an earlier one.
PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
  : TargetLowering(TM), PPCSubTarget(*TM.getSubtargetImpl()) {

  // Signed division by a power of two can use srawi/addze, so tell the
  // DAG combiner not to turn it into multiplication by a magic constant.
  setPow2DivIsCheap();

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set up the register classes.
  addRegisterClass(MVT::i32, PPC::GPRCRegisterClass);
  addRegisterClass(MVT::f32, PPC::F4RCRegisterClass);
  addRegisterClass(MVT::f64, PPC::F8RCRegisterClass);

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Expand);
  setLoadXAction(ISD::SEXTLOAD, MVT::i8, Expand);

  // PowerPC does not have truncstore for i1.
  setStoreXAction(MVT::i1, Promote);

  // PowerPC has pre-increment load and store forms (lwzu, stwu, ...).
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);

  // FP immediates cannot be materialized directly; load from constant pool.
  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);

  // PowerPC has no intrinsics for these particular operations
  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);

  // PowerPC has no SREM/UREM instructions
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // We don't support sin/cos/sqrt/fmod
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // Expand fsqrt unless the subtarget has the hardware square-root
  // instruction (present with GP optimizations); when it does, FSQRT stays
  // Legal by default.
  if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) {
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);
  }

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // PowerPC does not have BSWAP, CTPOP or CTTZ
  setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
  setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
  setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
  setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
  setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
  setOperationAction(ISD::CTTZ , MVT::i64  , Expand);

  // PowerPC does not have ROTR
  setOperationAction(ISD::ROTR, MVT::i32   , Expand);

  // PowerPC does not have Select
  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::i64, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::f64, Expand);

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // PowerPC wants to optimize integer setcc a bit
  setOperationAction(ISD::SETCC, MVT::i32, Custom);

  // PowerPC does not have BRCOND which requires SetCC
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

  // PowerPC does not have [U|S]INT_TO_FP
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);

  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);

  // We cannot sextinreg(i1). Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // Support label based line numbers.
  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
  if (!TM.getSubtarget<PPCSubtarget>().isDarwin()) {
    setOperationAction(ISD::LABEL, MVT::Other, Expand);
  } else {
    // Darwin: EH intrinsics are expanded for both register widths.
    setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
    setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
    setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
    setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  }

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // RET must be custom lowered, to meet ABI requirements
  setOperationAction(ISD::RET          , MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART      , MVT::Other, Custom);

  // VAARG is custom lowered with ELF 32 ABI
  if (TM.getSubtarget<PPCSubtarget>().isELF32_ABI())
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
  else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VACOPY       , MVT::Other, Expand);
  setOperationAction(ISD::VAEND        , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE    , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom);

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // NOTE(review): this Expand is immediately overridden by the Promote a
    // few lines below; the later call wins, so this line appears dead —
    // confirm before removing.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);

    // FIXME: disable this lowered code. This generates 64-bit register values,
    // and we don't model the fact that the top part is clobbered by calls. We
    // need to flag these together so that the value isn't live across a call.
    //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

    // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else {
    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
  }

  if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) {
    // 64 bit PowerPC implementations can support i64 types directly
    addRegisterClass(MVT::i64, PPC::G8RCRegisterClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
  } else {
    // 32 bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) {
    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD , (MVT::ValueType)VT, Legal);
      setOperationAction(ISD::SUB , (MVT::ValueType)VT, Legal);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, MVT::v16i8);

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND , (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::AND , (MVT::ValueType)VT, MVT::v4i32);
      setOperationAction(ISD::OR , (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::OR , (MVT::ValueType)VT, MVT::v4i32);
      setOperationAction(ISD::XOR , (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::XOR , (MVT::ValueType)VT, MVT::v4i32);
      setOperationAction(ISD::LOAD , (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::LOAD , (MVT::ValueType)VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v4i32);
      setOperationAction(ISD::STORE, (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::STORE, (MVT::ValueType)VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, (MVT::ValueType)VT, Expand);

      setOperationAction(ISD::SCALAR_TO_VECTOR, (MVT::ValueType)VT, Expand);
    }

    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    setOperationAction(ISD::AND , MVT::v4i32, Legal);
    setOperationAction(ISD::OR , MVT::v4i32, Legal);
    setOperationAction(ISD::XOR , MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
    setOperationAction(ISD::STORE , MVT::v4i32, Legal);

    addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass);
    addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass);
    addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass);
    addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
  }

  setSetCCResultType(MVT::i32);
  setShiftAmountType(MVT::i32);
  setSetCCResultContents(ZeroOrOneSetCCResult);

  // Pick stack pointer and EH registers by ABI register width.
  if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
    setStackPointerRegisterToSaveRestore(PPC::X1);
    setExceptionPointerRegister(PPC::X3);
    setExceptionSelectorRegister(PPC::X4);
  } else {
    setStackPointerRegisterToSaveRestore(PPC::R1);
    setExceptionPointerRegister(PPC::R3);
    setExceptionSelectorRegister(PPC::R4);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::BR_CC);
  setTargetDAGCombine(ISD::BSWAP);

  computeRegisterProperties();
}

/// getTargetNodeName - Return the human-readable name of a PPCISD node
/// opcode for DAG dumps, or null for opcodes this target doesn't define.
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return 0;
  case PPCISD::FSEL:          return "PPCISD::FSEL";
  case PPCISD::FCFID:         return "PPCISD::FCFID";
  case PPCISD::FCTIDZ:        return "PPCISD::FCTIDZ";
  case PPCISD::FCTIWZ:        return "PPCISD::FCTIWZ";
  case PPCISD::STFIWX:        return "PPCISD::STFIWX";
  case PPCISD::VMADDFP:       return "PPCISD::VMADDFP";
  case PPCISD::VNMSUBFP:      return "PPCISD::VNMSUBFP";
  case PPCISD::VPERM:         return "PPCISD::VPERM";
  case PPCISD::Hi:            return "PPCISD::Hi";
  case PPCISD::Lo:            return "PPCISD::Lo";
  case PPCISD::DYNALLOC:      return "PPCISD::DYNALLOC";
  case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
  case PPCISD::SRL:           return "PPCISD::SRL";
  case PPCISD::SRA:           return "PPCISD::SRA";
  case PPCISD::SHL:           return "PPCISD::SHL";
  case PPCISD::EXTSW_32:      return "PPCISD::EXTSW_32";
  case PPCISD::STD_32:        return "PPCISD::STD_32";
  case PPCISD::CALL_ELF:      return "PPCISD::CALL_ELF";
  case PPCISD::CALL_Macho:    return "PPCISD::CALL_Macho";
  case PPCISD::MTCTR:         return "PPCISD::MTCTR";
  case PPCISD::BCTRL_Macho:   return "PPCISD::BCTRL_Macho";
  case PPCISD::BCTRL_ELF:     return "PPCISD::BCTRL_ELF";
  case PPCISD::RET_FLAG:      return "PPCISD::RET_FLAG";
  case PPCISD::MFCR:          return "PPCISD::MFCR";
  case PPCISD::VCMP:          return "PPCISD::VCMP";
  case PPCISD::VCMPo:         return "PPCISD::VCMPo";
  case PPCISD::LBRX:          return "PPCISD::LBRX";
  case PPCISD::STBRX:         return "PPCISD::STBRX";
  case PPCISD::COND_BRANCH:   return "PPCISD::COND_BRANCH";
  }
}

//===----------------------------------------------------------------------===//
//  Node matching predicates, for use by the tblgen matching code.
//===----------------------------------------------------------------------===//

/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDOperand Op) { + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) + return CFP->isExactlyValue(-0.0) || CFP->isExactlyValue(0.0); + else if (ISD::isEXTLoad(Op.Val) || ISD::isNON_EXTLoad(Op.Val)) { + // Maybe this has already been legalized into the constant pool? + if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1))) + if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) + return CFP->isExactlyValue(-0.0) || CFP->isExactlyValue(0.0); + } + return false; +} + +/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return +/// true if Op is undef or if it matches the specified value. +static bool isConstantOrUndef(SDOperand Op, unsigned Val) { + return Op.getOpcode() == ISD::UNDEF || + cast<ConstantSDNode>(Op)->getValue() == Val; +} + +/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a +/// VPKUHUM instruction. +bool PPC::isVPKUHUMShuffleMask(SDNode *N, bool isUnary) { + if (!isUnary) { + for (unsigned i = 0; i != 16; ++i) + if (!isConstantOrUndef(N->getOperand(i), i*2+1)) + return false; + } else { + for (unsigned i = 0; i != 8; ++i) + if (!isConstantOrUndef(N->getOperand(i), i*2+1) || + !isConstantOrUndef(N->getOperand(i+8), i*2+1)) + return false; + } + return true; +} + +/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a +/// VPKUWUM instruction. 
+bool PPC::isVPKUWUMShuffleMask(SDNode *N, bool isUnary) { + if (!isUnary) { + for (unsigned i = 0; i != 16; i += 2) + if (!isConstantOrUndef(N->getOperand(i ), i*2+2) || + !isConstantOrUndef(N->getOperand(i+1), i*2+3)) + return false; + } else { + for (unsigned i = 0; i != 8; i += 2) + if (!isConstantOrUndef(N->getOperand(i ), i*2+2) || + !isConstantOrUndef(N->getOperand(i+1), i*2+3) || + !isConstantOrUndef(N->getOperand(i+8), i*2+2) || + !isConstantOrUndef(N->getOperand(i+9), i*2+3)) + return false; + } + return true; +} + +/// isVMerge - Common function, used to match vmrg* shuffles. +/// +static bool isVMerge(SDNode *N, unsigned UnitSize, + unsigned LHSStart, unsigned RHSStart) { + assert(N->getOpcode() == ISD::BUILD_VECTOR && + N->getNumOperands() == 16 && "PPC only supports shuffles by bytes!"); + assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) && + "Unsupported merge size!"); + + for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units + for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit + if (!isConstantOrUndef(N->getOperand(i*UnitSize*2+j), + LHSStart+j+i*UnitSize) || + !isConstantOrUndef(N->getOperand(i*UnitSize*2+UnitSize+j), + RHSStart+j+i*UnitSize)) + return false; + } + return true; +} + +/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for +/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). +bool PPC::isVMRGLShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary) { + if (!isUnary) + return isVMerge(N, UnitSize, 8, 24); + return isVMerge(N, UnitSize, 8, 8); +} + +/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for +/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). 
+bool PPC::isVMRGHShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary) { + if (!isUnary) + return isVMerge(N, UnitSize, 0, 16); + return isVMerge(N, UnitSize, 0, 0); +} + + +/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift +/// amount, otherwise return -1. +int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) { + assert(N->getOpcode() == ISD::BUILD_VECTOR && + N->getNumOperands() == 16 && "PPC only supports shuffles by bytes!"); + // Find the first non-undef value in the shuffle mask. + unsigned i; + for (i = 0; i != 16 && N->getOperand(i).getOpcode() == ISD::UNDEF; ++i) + /*search*/; + + if (i == 16) return -1; // all undef. + + // Otherwise, check to see if the rest of the elements are consequtively + // numbered from this value. + unsigned ShiftAmt = cast<ConstantSDNode>(N->getOperand(i))->getValue(); + if (ShiftAmt < i) return -1; + ShiftAmt -= i; + + if (!isUnary) { + // Check the rest of the elements to see if they are consequtive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(N->getOperand(i), ShiftAmt+i)) + return -1; + } else { + // Check the rest of the elements to see if they are consequtive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(N->getOperand(i), (ShiftAmt+i) & 15)) + return -1; + } + + return ShiftAmt; +} + +/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a splat of a single element that is suitable for input to +/// VSPLTB/VSPLTH/VSPLTW. +bool PPC::isSplatShuffleMask(SDNode *N, unsigned EltSize) { + assert(N->getOpcode() == ISD::BUILD_VECTOR && + N->getNumOperands() == 16 && + (EltSize == 1 || EltSize == 2 || EltSize == 4)); + + // This is a splat operation if each element of the permute is the same, and + // if the value doesn't reference the second vector. 
+ unsigned ElementBase = 0; + SDOperand Elt = N->getOperand(0); + if (ConstantSDNode *EltV = dyn_cast<ConstantSDNode>(Elt)) + ElementBase = EltV->getValue(); + else + return false; // FIXME: Handle UNDEF elements too! + + if (cast<ConstantSDNode>(Elt)->getValue() >= 16) + return false; + + // Check that they are consequtive. + for (unsigned i = 1; i != EltSize; ++i) { + if (!isa<ConstantSDNode>(N->getOperand(i)) || + cast<ConstantSDNode>(N->getOperand(i))->getValue() != i+ElementBase) + return false; + } + + assert(isa<ConstantSDNode>(Elt) && "Invalid VECTOR_SHUFFLE mask!"); + for (unsigned i = EltSize, e = 16; i != e; i += EltSize) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(N->getOperand(i)) && + "Invalid VECTOR_SHUFFLE mask!"); + for (unsigned j = 0; j != EltSize; ++j) + if (N->getOperand(i+j) != N->getOperand(j)) + return false; + } + + return true; +} + +/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the +/// specified isSplatShuffleMask VECTOR_SHUFFLE mask. +unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) { + assert(isSplatShuffleMask(N, EltSize)); + return cast<ConstantSDNode>(N->getOperand(0))->getValue() / EltSize; +} + +/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed +/// by using a vspltis[bhw] instruction of the specified element size, return +/// the constant being splatted. The ByteSize field indicates the number of +/// bytes of each element [124] -> [bhw]. +SDOperand PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { + SDOperand OpVal(0, 0); + + // If ByteSize of the splat is bigger than the element size of the + // build_vector, then we have a case where we are checking for a splat where + // multiple elements of the buildvector are folded together into a single + // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8). 
+ unsigned EltSize = 16/N->getNumOperands(); + if (EltSize < ByteSize) { + unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval. + SDOperand UniquedVals[4]; + assert(Multiple > 1 && Multiple <= 4 && "How can this happen?"); + + // See if all of the elements in the buildvector agree across. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + // If the element isn't a constant, bail fully out. + if (!isa<ConstantSDNode>(N->getOperand(i))) return SDOperand(); + + + if (UniquedVals[i&(Multiple-1)].Val == 0) + UniquedVals[i&(Multiple-1)] = N->getOperand(i); + else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i)) + return SDOperand(); // no match. + } + + // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains + // either constant or undef values that are identical for each chunk. See + // if these chunks can form into a larger vspltis*. + + // Check to see if all of the leading entries are either 0 or -1. If + // neither, then this won't fit into the immediate field. + bool LeadingZero = true; + bool LeadingOnes = true; + for (unsigned i = 0; i != Multiple-1; ++i) { + if (UniquedVals[i].Val == 0) continue; // Must have been undefs. + + LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue(); + LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue(); + } + // Finally, check the least significant entry. 
+ if (LeadingZero) { + if (UniquedVals[Multiple-1].Val == 0) + return DAG.getTargetConstant(0, MVT::i32); // 0,0,0,undef + int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getValue(); + if (Val < 16) + return DAG.getTargetConstant(Val, MVT::i32); // 0,0,0,4 -> vspltisw(4) + } + if (LeadingOnes) { + if (UniquedVals[Multiple-1].Val == 0) + return DAG.getTargetConstant(~0U, MVT::i32); // -1,-1,-1,undef + int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSignExtended(); + if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) + return DAG.getTargetConstant(Val, MVT::i32); + } + + return SDOperand(); + } + + // Check to see if this buildvec has a single non-undef value in its elements. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (OpVal.Val == 0) + OpVal = N->getOperand(i); + else if (OpVal != N->getOperand(i)) + return SDOperand(); + } + + if (OpVal.Val == 0) return SDOperand(); // All UNDEF: use implicit def. + + unsigned ValSizeInBytes = 0; + uint64_t Value = 0; + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) { + Value = CN->getValue(); + ValSizeInBytes = MVT::getSizeInBits(CN->getValueType(0))/8; + } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) { + assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); + Value = FloatToBits(CN->getValue()); + ValSizeInBytes = 4; + } + + // If the splat value is larger than the element value, then we can never do + // this splat. The only case that we could fit the replicated bits into our + // immediate field for would be zero, and we prefer to use vxor for it. + if (ValSizeInBytes < ByteSize) return SDOperand(); + + // If the element value is larger than the splat value, cut it in half and + // check to see if the two halves are equal. Continue doing this until we + // get to ByteSize. This allows us to handle 0x01010101 as 0x01. 
+ while (ValSizeInBytes > ByteSize) { + ValSizeInBytes >>= 1; + + // If the top half equals the bottom half, we're still ok. + if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) != + (Value & ((1 << (8*ValSizeInBytes))-1))) + return SDOperand(); + } + + // Properly sign extend the value. + int ShAmt = (4-ByteSize)*8; + int MaskVal = ((int)Value << ShAmt) >> ShAmt; + + // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. + if (MaskVal == 0) return SDOperand(); + + // Finally, if this value fits in a 5 bit sext field, return it + if (((MaskVal << (32-5)) >> (32-5)) == MaskVal) + return DAG.getTargetConstant(MaskVal, MVT::i32); + return SDOperand(); +} + +//===----------------------------------------------------------------------===// +// Addressing Mode Selection +//===----------------------------------------------------------------------===// + +/// isIntS16Immediate - This method tests to see if the node is either a 32-bit +/// or 64-bit immediate, and if the value can be accurately represented as a +/// sign extension from a 16-bit value. If so, this returns true and the +/// immediate. +static bool isIntS16Immediate(SDNode *N, short &Imm) { + if (N->getOpcode() != ISD::Constant) + return false; + + Imm = (short)cast<ConstantSDNode>(N)->getValue(); + if (N->getValueType(0) == MVT::i32) + return Imm == (int32_t)cast<ConstantSDNode>(N)->getValue(); + else + return Imm == (int64_t)cast<ConstantSDNode>(N)->getValue(); +} +static bool isIntS16Immediate(SDOperand Op, short &Imm) { + return isIntS16Immediate(Op.Val, Imm); +} + + +/// SelectAddressRegReg - Given the specified addressed, check to see if it +/// can be represented as an indexed [r+r] operation. Returns false if it +/// can be more efficiently represented with [r+imm]. 
bool PPCTargetLowering::SelectAddressRegReg(SDOperand N, SDOperand &Base,
                                            SDOperand &Index,
                                            SelectionDAG &DAG) {
  short imm = 0;
  if (N.getOpcode() == ISD::ADD) {
    // Prefer [r+imm] forms: a small signed constant or the low part of a
    // symbol is better folded as a displacement than materialized in a reg.
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i
    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
      return false;    // r+i

    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  } else if (N.getOpcode() == ISD::OR) {
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i can fold it if we can.

    // If this is an or of disjoint bitfields, we can codegen this as an add
    // (for better address arithmetic) if the LHS and RHS of the OR are provably
    // disjoint.
    uint64_t LHSKnownZero, LHSKnownOne;
    uint64_t RHSKnownZero, RHSKnownOne;
    DAG.ComputeMaskedBits(N.getOperand(0), ~0U, LHSKnownZero, LHSKnownOne);

    // Only query the RHS if the LHS has at least one known-zero bit;
    // ComputeMaskedBits is not free.
    if (LHSKnownZero) {
      DAG.ComputeMaskedBits(N.getOperand(1), ~0U, RHSKnownZero, RHSKnownOne);
      // If all of the bits are known zero on the LHS or RHS, the add won't
      // carry.
      // NOTE(review): the comparison against ~0U zero-extends to 64 bits, so
      // for 64-bit values only the low 32 bits are effectively checked —
      // confirm this is intentional for PPC64.
      if ((LHSKnownZero | RHSKnownZero) == ~0U) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }

  return false;
}

/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg.  On success, Disp holds the displacement and
/// Base the base register (or LIS-materialized high part for absolute
/// addresses).
bool PPCTargetLowering::SelectAddressRegImm(SDOperand N, SDOperand &Disp,
                                            SDOperand &Base, SelectionDAG &DAG){
  // If this can be more profitably realized as r+r, fail.
  if (SelectAddressRegReg(N, Disp, Base, DAG))
    return false;

  if (N.getOpcode() == ISD::ADD) {
    short imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm)) {
      Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      } else {
        Base = N.getOperand(0);
      }
      return true; // [r+i]
    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
      // Match LOAD (ADD (X, Lo(G))).
      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getValue()
             && "Cannot handle constant offsets yet!");
      Disp = N.getOperand(1).getOperand(0);  // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      return true;  // [&g+r]
    }
  } else if (N.getOpcode() == ISD::OR) {
    short imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm)) {
      // If this is an or of disjoint bitfields, we can codegen this as an add
      // (for better address arithmetic) if the LHS and RHS of the OR are
      // provably disjoint.
      uint64_t LHSKnownZero, LHSKnownOne;
      DAG.ComputeMaskedBits(N.getOperand(0), ~0U, LHSKnownZero, LHSKnownOne);
      if ((LHSKnownZero|~(unsigned)imm) == ~0U) {
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
        Base = N.getOperand(0);
        Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
        return true;
      }
    }
  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    // Loading from a constant address.

    // If this address fits entirely in a 16-bit sext immediate field, codegen
    // this as "d, 0"
    short Imm;
    if (isIntS16Immediate(CN, Imm)) {
      Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
      Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
      return true;
    }

    // Handle 32-bit sext immediates with LIS + addr mode.
    if (CN->getValueType(0) == MVT::i32 ||
        (int64_t)CN->getValue() == (int)CN->getValue()) {
      int Addr = (int)CN->getValue();

      // Otherwise, break this down into an LIS + disp.
      Disp = DAG.getTargetConstant((short)Addr, MVT::i32);

      // Base holds the high 16 bits, adjusted so that adding the
      // sign-extended low half reproduces the full address.
      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
      Base = SDOperand(DAG.getTargetNode(Opc, CN->getValueType(0), Base), 0);
      return true;
    }
  }

  // Fallback: treat the whole expression as the base with a 0 displacement.
  Disp = DAG.getTargetConstant(0, getPointerTy());
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
  else
    Base = N;
  return true;      // [r+0]
}

/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.  Always succeeds; uses R0 as
/// the base when no better decomposition exists.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDOperand N, SDOperand &Base,
                                                SDOperand &Index,
                                                SelectionDAG &DAG) {
  // Check to see if we can easily represent this as an [r+r] address.  This
  // will fail if it thinks that the address is more profitably represented as
  // reg+imm, e.g. where imm = 0.
  if (SelectAddressRegReg(N, Base, Index, DAG))
    return true;

  // If the operand is an addition, always emit this as [r+r], since this is
  // better (for code size, and execution, as the memop does the add for free)
  // than emitting an explicit add.
  if (N.getOpcode() == ISD::ADD) {
    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  }

  // Otherwise, do it the hard way, using R0 as the base register.
  Base = DAG.getRegister(PPC::R0, N.getValueType());
  Index = N;
  return true;
}

/// SelectAddressRegImmShift - Returns true if the address N can be
/// represented by a base register plus a signed 14-bit displacement
/// [r+imm*4].  Suitable for use by STD and friends.
bool PPCTargetLowering::SelectAddressRegImmShift(SDOperand N, SDOperand &Disp,
                                                 SDOperand &Base,
                                                 SelectionDAG &DAG) {
  // If this can be more profitably realized as r+r, fail.
  if (SelectAddressRegReg(N, Disp, Base, DAG))
    return false;

  if (N.getOpcode() == ISD::ADD) {
    short imm = 0;
    // The displacement must be a multiple of 4; the low two bits are
    // implicit in the DS-form instruction encoding, hence the >> 2 below.
    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
      Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      } else {
        Base = N.getOperand(0);
      }
      return true; // [r+i]
    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
      // Match LOAD (ADD (X, Lo(G))).
      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getValue()
             && "Cannot handle constant offsets yet!");
      Disp = N.getOperand(1).getOperand(0);  // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      return true;  // [&g+r]
    }
  } else if (N.getOpcode() == ISD::OR) {
    short imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
      // If this is an or of disjoint bitfields, we can codegen this as an add
      // (for better address arithmetic) if the LHS and RHS of the OR are
      // provably disjoint.
      uint64_t LHSKnownZero, LHSKnownOne;
      DAG.ComputeMaskedBits(N.getOperand(0), ~0U, LHSKnownZero, LHSKnownOne);
      if ((LHSKnownZero|~(unsigned)imm) == ~0U) {
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
        Base = N.getOperand(0);
        Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
        return true;
      }
    }
  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    // Loading from a constant address.  Verify low two bits are clear.
    if ((CN->getValue() & 3) == 0) {
      // If this address fits entirely in a 14-bit sext immediate field, codegen
      // this as "d, 0"
      short Imm;
      if (isIntS16Immediate(CN, Imm)) {
        Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
        Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
        return true;
      }

      // Fold the low-part of 32-bit absolute addresses into addr mode.
      if (CN->getValueType(0) == MVT::i32 ||
          (int64_t)CN->getValue() == (int)CN->getValue()) {
        int Addr = (int)CN->getValue();

        // Otherwise, break this down into an LIS + disp.
        Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);

        Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
        unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
        Base = SDOperand(DAG.getTargetNode(Opc, CN->getValueType(0), Base), 0);
        return true;
      }
    }
  }

  // Fallback: treat the whole expression as the base with a 0 displacement.
  Disp = DAG.getTargetConstant(0, getPointerTy());
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
  else
    Base = N;
  return true;      // [r+0]
}


/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDOperand &Base,
                                                  SDOperand &Offset,
                                                  ISD::MemIndexedMode &AM,
                                                  SelectionDAG &DAG) {
  // Disabled by default for now.
  if (!EnablePPCPreinc) return false;

  SDOperand Ptr;
  MVT::ValueType VT;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getLoadedVT();

  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    // NOTE(review): self-assignment — presumably to silence an "unused
    // variable" warning on some compilers; confirm and consider removing.
    ST = ST;
    Ptr = ST->getBasePtr();
    VT  = ST->getStoredVT();
  } else
    return false;

  // PowerPC doesn't have preinc load/store instructions for vectors.
  if (MVT::isVector(VT))
    return false;

  // TODO: Check reg+reg first.

  // LDU/STU use reg+imm*4, others use reg+imm.
  if (VT != MVT::i64) {
    // reg + imm
    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
      return false;
  } else {
    // reg + imm * 4.
    if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
      return false;
  }

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
    // sext i32 to i64 when addr mode is r+i.
    if (LD->getValueType(0) == MVT::i64 && LD->getLoadedVT() == MVT::i32 &&
        LD->getExtensionType() == ISD::SEXTLOAD &&
        isa<ConstantSDNode>(Offset))
      return false;
  }

  AM = ISD::PRE_INC;
  return true;
}

//===----------------------------------------------------------------------===//
//  LowerOperation implementation
//===----------------------------------------------------------------------===//

static SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  Constant *C = CP->getConstVal();
  SDOperand CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
  SDOperand Zero = DAG.getConstant(0, PtrVT);

  const TargetMachine &TM = DAG.getTarget();

  // Split the address into high and low 16-bit halves (lis/addi pattern).
  SDOperand Hi = DAG.getNode(PPCISD::Hi, PtrVT, CPI, Zero);
  SDOperand Lo = DAG.getNode(PPCISD::Lo, PtrVT, CPI, Zero);

  // If this is a non-darwin platform, we don't support non-static relo models
  // yet.
  if (TM.getRelocationModel() == Reloc::Static ||
      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
    // Generate non-pic code that has direct accesses to the constant pool.
    // The address of the global is just (hi(&g)+lo(&g)).
    return DAG.getNode(ISD::ADD, PtrVT, Hi, Lo);
  }

  if (TM.getRelocationModel() == Reloc::PIC_) {
    // With PIC, the first instruction is actually "GR+hi(&G)".
+ Hi = DAG.getNode(ISD::ADD, PtrVT, + DAG.getNode(PPCISD::GlobalBaseReg, PtrVT), Hi); + } + + Lo = DAG.getNode(ISD::ADD, PtrVT, Hi, Lo); + return Lo; +} + +static SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG) { + MVT::ValueType PtrVT = Op.getValueType(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + SDOperand JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); + SDOperand Zero = DAG.getConstant(0, PtrVT); + + const TargetMachine &TM = DAG.getTarget(); + + SDOperand Hi = DAG.getNode(PPCISD::Hi, PtrVT, JTI, Zero); + SDOperand Lo = DAG.getNode(PPCISD::Lo, PtrVT, JTI, Zero); + + // If this is a non-darwin platform, we don't support non-static relo models + // yet. + if (TM.getRelocationModel() == Reloc::Static || + !TM.getSubtarget<PPCSubtarget>().isDarwin()) { + // Generate non-pic code that has direct accesses to the constant pool. + // The address of the global is just (hi(&g)+lo(&g)). + return DAG.getNode(ISD::ADD, PtrVT, Hi, Lo); + } + + if (TM.getRelocationModel() == Reloc::PIC_) { + // With PIC, the first instruction is actually "GR+hi(&G)". + Hi = DAG.getNode(ISD::ADD, PtrVT, + DAG.getNode(PPCISD::GlobalBaseReg, PtrVT), Hi); + } + + Lo = DAG.getNode(ISD::ADD, PtrVT, Hi, Lo); + return Lo; +} + +static SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) { + assert(0 && "TLS not implemented for PPC."); +} + +static SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) { + MVT::ValueType PtrVT = Op.getValueType(); + GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op); + GlobalValue *GV = GSDN->getGlobal(); + SDOperand GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset()); + SDOperand Zero = DAG.getConstant(0, PtrVT); + + const TargetMachine &TM = DAG.getTarget(); + + SDOperand Hi = DAG.getNode(PPCISD::Hi, PtrVT, GA, Zero); + SDOperand Lo = DAG.getNode(PPCISD::Lo, PtrVT, GA, Zero); + + // If this is a non-darwin platform, we don't support non-static relo models + // yet. 
+ if (TM.getRelocationModel() == Reloc::Static || + !TM.getSubtarget<PPCSubtarget>().isDarwin()) { + // Generate non-pic code that has direct accesses to globals. + // The address of the global is just (hi(&g)+lo(&g)). + return DAG.getNode(ISD::ADD, PtrVT, Hi, Lo); + } + + if (TM.getRelocationModel() == Reloc::PIC_) { + // With PIC, the first instruction is actually "GR+hi(&G)". + Hi = DAG.getNode(ISD::ADD, PtrVT, + DAG.getNode(PPCISD::GlobalBaseReg, PtrVT), Hi); + } + + Lo = DAG.getNode(ISD::ADD, PtrVT, Hi, Lo); + + if (!TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV)) + return Lo; + + // If the global is weak or external, we have to go through the lazy + // resolution stub. + return DAG.getLoad(PtrVT, DAG.getEntryNode(), Lo, NULL, 0); +} + +static SDOperand LowerSETCC(SDOperand Op, SelectionDAG &DAG) { + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // If we're comparing for equality to zero, expose the fact that this is + // implented as a ctlz/srl pair on ppc, so that the dag combiner can + // fold the new nodes. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + if (C->isNullValue() && CC == ISD::SETEQ) { + MVT::ValueType VT = Op.getOperand(0).getValueType(); + SDOperand Zext = Op.getOperand(0); + if (VT < MVT::i32) { + VT = MVT::i32; + Zext = DAG.getNode(ISD::ZERO_EXTEND, VT, Op.getOperand(0)); + } + unsigned Log2b = Log2_32(MVT::getSizeInBits(VT)); + SDOperand Clz = DAG.getNode(ISD::CTLZ, VT, Zext); + SDOperand Scc = DAG.getNode(ISD::SRL, VT, Clz, + DAG.getConstant(Log2b, MVT::i32)); + return DAG.getNode(ISD::TRUNCATE, MVT::i32, Scc); + } + // Leave comparisons against 0 and -1 alone for now, since they're usually + // optimized. FIXME: revisit this when we can custom lower all setcc + // optimizations. 
+ if (C->isAllOnesValue() || C->isNullValue()) + return SDOperand(); + } + + // If we have an integer seteq/setne, turn it into a compare against zero + // by xor'ing the rhs with the lhs, which is faster than setting a + // condition register, reading it back out, and masking the correct bit. The + // normal approach here uses sub to do this instead of xor. Using xor exposes + // the result to other bit-twiddling opportunities. + MVT::ValueType LHSVT = Op.getOperand(0).getValueType(); + if (MVT::isInteger(LHSVT) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + MVT::ValueType VT = Op.getValueType(); + SDOperand Sub = DAG.getNode(ISD::XOR, LHSVT, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getSetCC(VT, Sub, DAG.getConstant(0, LHSVT), CC); + } + return SDOperand(); +} + +static SDOperand LowerVAARG(SDOperand Op, SelectionDAG &DAG, + int VarArgsFrameIndex, + int VarArgsStackOffset, + unsigned VarArgsNumGPR, + unsigned VarArgsNumFPR, + const PPCSubtarget &Subtarget) { + + assert(0 && "VAARG in ELF32 ABI not implemented yet!"); +} + +static SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG, + int VarArgsFrameIndex, + int VarArgsStackOffset, + unsigned VarArgsNumGPR, + unsigned VarArgsNumFPR, + const PPCSubtarget &Subtarget) { + + if (Subtarget.isMachoABI()) { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); + SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2)); + return DAG.getStore(Op.getOperand(0), FR, Op.getOperand(1), SV->getValue(), + SV->getOffset()); + } + + // For ELF 32 ABI we follow the layout of the va_list struct. + // We suppose the given va_list is already allocated. + // + // typedef struct { + // char gpr; /* index into the array of 8 GPRs + // * stored in the register save area + // * gpr=0 corresponds to r3, + // * gpr=1 to r4, etc. 
+ // */ + // char fpr; /* index into the array of 8 FPRs + // * stored in the register save area + // * fpr=0 corresponds to f1, + // * fpr=1 to f2, etc. + // */ + // char *overflow_arg_area; + // /* location on stack that holds + // * the next overflow argument + // */ + // char *reg_save_area; + // /* where r3:r10 and f1:f8 (if saved) + // * are stored + // */ + // } va_list[1]; + + + SDOperand ArgGPR = DAG.getConstant(VarArgsNumGPR, MVT::i8); + SDOperand ArgFPR = DAG.getConstant(VarArgsNumFPR, MVT::i8); + + + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + SDOperand StackOffset = DAG.getFrameIndex(VarArgsStackOffset, PtrVT); + SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); + + SDOperand ConstFrameOffset = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8, + PtrVT); + SDOperand ConstStackOffset = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8 - 1, + PtrVT); + SDOperand ConstFPROffset = DAG.getConstant(1, PtrVT); + + SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2)); + + // Store first byte : number of int regs + SDOperand firstStore = DAG.getStore(Op.getOperand(0), ArgGPR, + Op.getOperand(1), SV->getValue(), + SV->getOffset()); + SDOperand nextPtr = DAG.getNode(ISD::ADD, PtrVT, Op.getOperand(1), + ConstFPROffset); + + // Store second byte : number of float regs + SDOperand secondStore = DAG.getStore(firstStore, ArgFPR, nextPtr, + SV->getValue(), SV->getOffset()); + nextPtr = DAG.getNode(ISD::ADD, PtrVT, nextPtr, ConstStackOffset); + + // Store second word : arguments given on stack + SDOperand thirdStore = DAG.getStore(secondStore, StackOffset, nextPtr, + SV->getValue(), SV->getOffset()); + nextPtr = DAG.getNode(ISD::ADD, PtrVT, nextPtr, ConstFrameOffset); + + // Store third word : arguments given in registers + return DAG.getStore(thirdStore, FR, nextPtr, SV->getValue(), + SV->getOffset()); + +} + +#include "PPCGenCallingConv.inc" + +/// GetFPR - Get the set of FP registers that should be allocated for arguments, +/// 
depending on which subtarget is selected. +static const unsigned *GetFPR(const PPCSubtarget &Subtarget) { + if (Subtarget.isMachoABI()) { + static const unsigned FPR[] = { + PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13 + }; + return FPR; + } + + + static const unsigned FPR[] = { + PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8 + }; + return FPR; +} + +static SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG, + int &VarArgsFrameIndex, + int &VarArgsStackOffset, + unsigned &VarArgsNumGPR, + unsigned &VarArgsNumFPR, + const PPCSubtarget &Subtarget) { + // TODO: add description of PPC stack frame format, or at least some docs. + // + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + SSARegMap *RegMap = MF.getSSARegMap(); + SmallVector<SDOperand, 8> ArgValues; + SDOperand Root = Op.getOperand(0); + + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + bool isPPC64 = PtrVT == MVT::i64; + bool isMachoABI = Subtarget.isMachoABI(); + bool isELF32_ABI = Subtarget.isELF32_ABI(); + unsigned PtrByteSize = isPPC64 ? 8 : 4; + + unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI); + + static const unsigned GPR_32[] = { // 32-bit registers. + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10, + }; + static const unsigned GPR_64[] = { // 64-bit registers. + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10, + }; + + static const unsigned *FPR = GetFPR(Subtarget); + + static const unsigned VR[] = { + PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, + PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 + }; + + const unsigned Num_GPR_Regs = sizeof(GPR_32)/sizeof(GPR_32[0]); + const unsigned Num_FPR_Regs = isMachoABI ? 
13 : 8; + const unsigned Num_VR_Regs = sizeof( VR)/sizeof( VR[0]); + + unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + + const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32; + + // Add DAG nodes to load the arguments or copy them out of registers. On + // entry to a function on PPC, the arguments start after the linkage area, + // although the first ones are often in registers. + // + // In the ELF 32 ABI, GPRs and stack are double word align: an argument + // represented with two words (long long or double) must be copied to an + // even GPR_idx value or to an even ArgOffset value. + + for (unsigned ArgNo = 0, e = Op.Val->getNumValues()-1; ArgNo != e; ++ArgNo) { + SDOperand ArgVal; + bool needsLoad = false; + MVT::ValueType ObjectVT = Op.getValue(ArgNo).getValueType(); + unsigned ObjSize = MVT::getSizeInBits(ObjectVT)/8; + unsigned ArgSize = ObjSize; + unsigned Flags = cast<ConstantSDNode>(Op.getOperand(ArgNo+3))->getValue(); + unsigned AlignFlag = 1 << ISD::ParamFlags::OrigAlignmentOffs; + // See if next argument requires stack alignment in ELF + bool Expand = (ObjectVT == MVT::f64) || ((ArgNo + 1 < e) && + (cast<ConstantSDNode>(Op.getOperand(ArgNo+4))->getValue() & AlignFlag) && + (!(Flags & AlignFlag))); + + unsigned CurArgOffset = ArgOffset; + switch (ObjectVT) { + default: assert(0 && "Unhandled argument type!"); + case MVT::i32: + // Double word align in ELF + if (Expand && isELF32_ABI) GPR_idx += (GPR_idx % 2); + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = RegMap->createVirtualRegister(&PPC::GPRCRegClass); + MF.addLiveIn(GPR[GPR_idx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i32); + ++GPR_idx; + } else { + needsLoad = true; + ArgSize = PtrByteSize; + } + // Stack align in ELF + if (needsLoad && Expand && isELF32_ABI) + ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize; + // All int arguments reserve stack space in Macho ABI. 
+ if (isMachoABI || needsLoad) ArgOffset += PtrByteSize; + break; + + case MVT::i64: // PPC64 + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = RegMap->createVirtualRegister(&PPC::G8RCRegClass); + MF.addLiveIn(GPR[GPR_idx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, MVT::i64); + ++GPR_idx; + } else { + needsLoad = true; + } + // All int arguments reserve stack space in Macho ABI. + if (isMachoABI || needsLoad) ArgOffset += 8; + break; + + case MVT::f32: + case MVT::f64: + // Every 4 bytes of argument space consumes one of the GPRs available for + // argument passing. + if (GPR_idx != Num_GPR_Regs && isMachoABI) { + ++GPR_idx; + if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) + ++GPR_idx; + } + if (FPR_idx != Num_FPR_Regs) { + unsigned VReg; + if (ObjectVT == MVT::f32) + VReg = RegMap->createVirtualRegister(&PPC::F4RCRegClass); + else + VReg = RegMap->createVirtualRegister(&PPC::F8RCRegClass); + MF.addLiveIn(FPR[FPR_idx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT); + ++FPR_idx; + } else { + needsLoad = true; + } + + // Stack align in ELF + if (needsLoad && Expand && isELF32_ABI) + ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize; + // All FP arguments reserve stack space in Macho ABI. + if (isMachoABI || needsLoad) ArgOffset += isPPC64 ? 8 : ObjSize; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + // Note that vector arguments in registers don't reserve stack space. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = RegMap->createVirtualRegister(&PPC::VRRCRegClass); + MF.addLiveIn(VR[VR_idx], VReg); + ArgVal = DAG.getCopyFromReg(Root, VReg, ObjectVT); + ++VR_idx; + } else { + // This should be simple, but requires getting 16-byte aligned stack + // values. 
+ assert(0 && "Loading VR argument not implemented yet!"); + needsLoad = true; + } + break; + } + + // We need to load the argument to a virtual register if we determined above + // that we ran out of physical registers of the appropriate type + if (needsLoad) { + // If the argument is actually used, emit a load from the right stack + // slot. + if (!Op.Val->hasNUsesOfValue(0, ArgNo)) { + int FI = MFI->CreateFixedObject(ObjSize, + CurArgOffset + (ArgSize - ObjSize)); + SDOperand FIN = DAG.getFrameIndex(FI, PtrVT); + ArgVal = DAG.getLoad(ObjectVT, Root, FIN, NULL, 0); + } else { + // Don't emit a dead load. + ArgVal = DAG.getNode(ISD::UNDEF, ObjectVT); + } + } + + ArgValues.push_back(ArgVal); + } + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. + bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; + if (isVarArg) { + + int depth; + if (isELF32_ABI) { + VarArgsNumGPR = GPR_idx; + VarArgsNumFPR = FPR_idx; + + // Make room for Num_GPR_Regs, Num_FPR_Regs and for a possible frame + // pointer. + depth = -(Num_GPR_Regs * MVT::getSizeInBits(PtrVT)/8 + + Num_FPR_Regs * MVT::getSizeInBits(MVT::f64)/8 + + MVT::getSizeInBits(PtrVT)/8); + + VarArgsStackOffset = MFI->CreateFixedObject(MVT::getSizeInBits(PtrVT)/8, + ArgOffset); + + } + else + depth = ArgOffset; + + VarArgsFrameIndex = MFI->CreateFixedObject(MVT::getSizeInBits(PtrVT)/8, + depth); + SDOperand FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT); + + SmallVector<SDOperand, 8> MemOps; + + // In ELF 32 ABI, the fixed integer arguments of a variadic function are + // stored to the VarArgsFrameIndex on the stack. 
+ if (isELF32_ABI) { + for (GPR_idx = 0; GPR_idx != VarArgsNumGPR; ++GPR_idx) { + SDOperand Val = DAG.getRegister(GPR[GPR_idx], PtrVT); + SDOperand Store = DAG.getStore(Root, Val, FIN, NULL, 0); + MemOps.push_back(Store); + // Increment the address by four for the next argument to store + SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8, PtrVT); + FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff); + } + } + + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by deferencing the + // result of va_next. + for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { + unsigned VReg; + if (isPPC64) + VReg = RegMap->createVirtualRegister(&PPC::G8RCRegClass); + else + VReg = RegMap->createVirtualRegister(&PPC::GPRCRegClass); + + MF.addLiveIn(GPR[GPR_idx], VReg); + SDOperand Val = DAG.getCopyFromReg(Root, VReg, PtrVT); + SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0); + MemOps.push_back(Store); + // Increment the address by four for the next argument to store + SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(PtrVT)/8, PtrVT); + FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff); + } + + // In ELF 32 ABI, the double arguments are stored to the VarArgsFrameIndex + // on the stack. 
  // --- Tail of LowerFORMAL_ARGUMENTS (the function header is above this
  // chunk).  This section spills the unclaimed FP argument registers to the
  // varargs save area so va_arg can walk them in memory.
  if (isELF32_ABI) {
    // Store the FPRs that actually held named arguments first...
    for (FPR_idx = 0; FPR_idx != VarArgsNumFPR; ++FPR_idx) {
      SDOperand Val = DAG.getRegister(FPR[FPR_idx], MVT::f64);
      SDOperand Store = DAG.getStore(Root, Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store
      SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(MVT::f64)/8,
                                         PtrVT);
      FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff);
    }

    // ...then copy the remaining live-in FPRs through fresh virtual
    // registers and store them as well.
    for (; FPR_idx != Num_FPR_Regs; ++FPR_idx) {
      unsigned VReg;
      VReg = RegMap->createVirtualRegister(&PPC::F8RCRegClass);

      MF.addLiveIn(FPR[FPR_idx], VReg);
      SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::f64);
      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store
      SDOperand PtrOff = DAG.getConstant(MVT::getSizeInBits(MVT::f64)/8,
                                         PtrVT);
      FIN = DAG.getNode(ISD::ADD, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  // Merge all the register-save stores into a single token factor so later
  // chains depend on all of them.
  if (!MemOps.empty())
    Root = DAG.getNode(ISD::TokenFactor, MVT::Other,&MemOps[0],MemOps.size());
  }

  ArgValues.push_back(Root);

  // Return the new list of results.
  std::vector<MVT::ValueType> RetVT(Op.Val->value_begin(),
                                    Op.Val->value_end());
  return DAG.getNode(ISD::MERGE_VALUES, RetVT, &ArgValues[0], ArgValues.size());
}

/// isCallCompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
/// The BLA LI field holds a 26-bit signed word-aligned target, so the value
/// must have its low 2 bits clear and its top 6 bits equal to the sign of
/// bit 25; the returned node is the address pre-shifted right by 2.
static SDNode *isBLACompatibleAddress(SDOperand Op, SelectionDAG &DAG) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  if (!C) return 0;

  int Addr = C->getValue();
  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
      (Addr << 6 >> 6) != Addr)
    return 0;  // Top 6 bits have to be sext of immediate.

  return DAG.getConstant((int)C->getValue() >> 2, MVT::i32).Val;
}

/// LowerCALL - Lower an outgoing call for the Macho (Darwin) and 32-bit ELF
/// PowerPC ABIs: computes the stack frame size, assigns arguments to
/// GPR/FPR/VR registers or stack slots, emits the call node, and copies the
/// return value(s) out of the ABI result registers.
static SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG,
                           const PPCSubtarget &Subtarget) {
  SDOperand Chain = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
  SDOperand Callee = Op.getOperand(4);
  unsigned NumOps = (Op.getNumOperands() - 5) / 2;

  bool isMachoABI = Subtarget.isMachoABI();
  bool isELF32_ABI = Subtarget.isELF32_ABI();

  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  bool isPPC64 = PtrVT == MVT::i64;
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  // args_to_use will accumulate outgoing args for the PPCISD::CALL case in
  // SelectExpr to use to put the arguments in the appropriate registers.
  std::vector<SDOperand> args_to_use;

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area.  We start with 24/48 bytes, which is
  // prereserved space for [SP][CR][LR][3 x unused].
  unsigned NumBytes = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);

  // Add up all the space actually used.
  for (unsigned i = 0; i != NumOps; ++i) {
    unsigned ArgSize =MVT::getSizeInBits(Op.getOperand(5+2*i).getValueType())/8;
    ArgSize = std::max(ArgSize, PtrByteSize);
    NumBytes += ArgSize;
  }

  // The prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if its varargs.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed.  As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  NumBytes = std::max(NumBytes,
                      PPCFrameInfo::getMinCallFrameSize(isPPC64, isMachoABI));

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  Chain = DAG.getCALLSEQ_START(Chain,
                               DAG.getConstant(NumBytes, PtrVT));

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDOperand StackPtr;
  if (isPPC64)
    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
  else
    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.  Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, isMachoABI);
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

  static const unsigned GPR_32[] = {           // 32-bit registers.
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  static const unsigned GPR_64[] = {           // 64-bit registers.
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const unsigned *FPR = GetFPR(Subtarget);

  static const unsigned VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };
  const unsigned NumGPRs = sizeof(GPR_32)/sizeof(GPR_32[0]);
  // Macho reserves 13 FPRs for argument passing; ELF32 only 8.
  const unsigned NumFPRs = isMachoABI ? 13 : 8;
  const unsigned NumVRs  = sizeof( VR)/sizeof( VR[0]);

  const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;

  std::vector<std::pair<unsigned, SDOperand> > RegsToPass;
  SmallVector<SDOperand, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    bool inMem = false;
    SDOperand Arg = Op.getOperand(5+2*i);
    unsigned Flags = cast<ConstantSDNode>(Op.getOperand(5+2*i+1))->getValue();
    unsigned AlignFlag = 1 << ISD::ParamFlags::OrigAlignmentOffs;
    // See if next argument requires stack alignment in ELF
    unsigned next = 5+2*(i+1)+1;
    bool Expand = (Arg.getValueType() == MVT::f64) || ((i + 1 < NumOps) &&
      (cast<ConstantSDNode>(Op.getOperand(next))->getValue() & AlignFlag) &&
      (!(Flags & AlignFlag)));

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDOperand PtrOff;

    // Stack align in ELF 32
    if (isELF32_ABI && Expand)
      PtrOff = DAG.getConstant(ArgOffset + ((ArgOffset/4) % 2) * PtrByteSize,
                               StackPtr.getValueType());
    else
      PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());

    PtrOff = DAG.getNode(ISD::ADD, PtrVT, StackPtr, PtrOff);

    // On PPC64, promote integers to 64-bit values.
    if (isPPC64 && Arg.getValueType() == MVT::i32) {
      unsigned ExtOp = (Flags & 1) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

      Arg = DAG.getNode(ExtOp, MVT::i64, Arg);
    }

    switch (Arg.getValueType()) {
    default: assert(0 && "Unexpected ValueType for argument!");
    case MVT::i32:
    case MVT::i64:
      // Double word align in ELF
      if (isELF32_ABI && Expand) GPR_idx += (GPR_idx % 2);
      if (GPR_idx != NumGPRs) {
        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
        inMem = true;
      }
      if (inMem || isMachoABI) {
        // Stack align in ELF
        if (isELF32_ABI && Expand)
          ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;

        ArgOffset += PtrByteSize;
      }
      break;
    case MVT::f32:
    case MVT::f64:
      if (isVarArg) {
        // Float varargs need to be promoted to double.
        if (Arg.getValueType() == MVT::f32)
          Arg = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Arg);
      }

      if (FPR_idx != NumFPRs) {
        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

        if (isVarArg) {
          // Store the FP value, then reload the same bytes into the GPRs
          // that shadow it, so a varargs callee finds it either way.
          SDOperand Store = DAG.getStore(Chain, Arg, PtrOff, NULL, 0);
          MemOpChains.push_back(Store);

          // Float varargs are always shadowed in available integer registers
          if (GPR_idx != NumGPRs) {
            SDOperand Load = DAG.getLoad(PtrVT, Store, PtrOff, NULL, 0);
            MemOpChains.push_back(Load.getValue(1));
            if (isMachoABI) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++],
                                                                Load));
          }
          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
            // Second half of an f64 on a 32-bit target needs a second GPR.
            SDOperand ConstFour = DAG.getConstant(4, PtrOff.getValueType());
            PtrOff = DAG.getNode(ISD::ADD, PtrVT, PtrOff, ConstFour);
            SDOperand Load = DAG.getLoad(PtrVT, Store, PtrOff, NULL, 0);
            MemOpChains.push_back(Load.getValue(1));
            if (isMachoABI) RegsToPass.push_back(std::make_pair(GPR[GPR_idx++],
                                                                Load));
          }
        } else {
          // If we have any FPRs remaining, we may also have GPRs remaining.
          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
          // GPRs.
          if (isMachoABI) {
            if (GPR_idx != NumGPRs)
              ++GPR_idx;
            if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
                !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
              ++GPR_idx;
          }
        }
      } else {
        MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
        inMem = true;
      }
      if (inMem || isMachoABI) {
        // Stack align in ELF
        if (isELF32_ABI && Expand)
          ArgOffset += ((ArgOffset/4) % 2) * PtrByteSize;
        if (isPPC64)
          ArgOffset += 8;
        else
          ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
      }
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      assert(!isVarArg && "Don't support passing vectors to varargs yet!");
      assert(VR_idx != NumVRs &&
             "Don't support passing more than 12 vector args yet!");
      RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
      break;
    }
  }
  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDOperand InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  // With the ELF 32 ABI, set CR6 to true if this is a vararg call.
  if (isVarArg && isELF32_ABI) {
    SDOperand SetCR(DAG.getTargetNode(PPC::SETCR, MVT::i32), 0);
    Chain = DAG.getCopyToReg(Chain, PPC::CR6, SetCR, InFlag);
    InFlag = Chain.getValue(1);
  }

  std::vector<MVT::ValueType> NodeTys;
  NodeTys.push_back(MVT::Other);   // Returns a chain
  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.

  SmallVector<SDOperand, 8> Ops;
  unsigned CallOpc = isMachoABI? PPCISD::CALL_Macho : PPCISD::CALL_ELF;

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), Callee.getValueType());
  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType());
  else if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
    // If this is an absolute destination address, use the munged value.
    Callee = SDOperand(Dest, 0);
  else {
    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
    // to do the call, we can't use PPCISD::CALL.
    SDOperand MTCTROps[] = {Chain, Callee, InFlag};
    Chain = DAG.getNode(PPCISD::MTCTR, NodeTys, MTCTROps, 2+(InFlag.Val!=0));
    InFlag = Chain.getValue(1);

    // Copy the callee address into R12 on darwin.
    if (isMachoABI) {
      Chain = DAG.getCopyToReg(Chain, PPC::R12, Callee, InFlag);
      InFlag = Chain.getValue(1);
    }

    NodeTys.clear();
    NodeTys.push_back(MVT::Other);
    NodeTys.push_back(MVT::Flag);
    Ops.push_back(Chain);
    CallOpc = isMachoABI ? PPCISD::BCTRL_Macho : PPCISD::BCTRL_ELF;
    // Callee is consumed by MTCTR above; clear it so the direct-call path
    // below does not add it as an operand.
    Callee.Val = 0;
  }

  // If this is a direct call, pass the chain and the callee.
  if (Callee.Val) {
    Ops.push_back(Chain);
    Ops.push_back(Callee);
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.Val)
    Ops.push_back(InFlag);
  Chain = DAG.getNode(CallOpc, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  SDOperand ResultVals[3];
  unsigned NumResults = 0;
  NodeTys.clear();

  // If the call has results, copy the values out of the ret val registers.
  switch (Op.Val->getValueType(0)) {
  default: assert(0 && "Unexpected ret value!");
  case MVT::Other: break;
  case MVT::i32:
    if (Op.Val->getValueType(1) == MVT::i32) {
      // i64 result split across R3/R4 on 32-bit targets.
      Chain = DAG.getCopyFromReg(Chain, PPC::R3, MVT::i32, InFlag).getValue(1);
      ResultVals[0] = Chain.getValue(0);
      Chain = DAG.getCopyFromReg(Chain, PPC::R4, MVT::i32,
                                 Chain.getValue(2)).getValue(1);
      ResultVals[1] = Chain.getValue(0);
      NumResults = 2;
      NodeTys.push_back(MVT::i32);
    } else {
      Chain = DAG.getCopyFromReg(Chain, PPC::R3, MVT::i32, InFlag).getValue(1);
      ResultVals[0] = Chain.getValue(0);
      NumResults = 1;
    }
    NodeTys.push_back(MVT::i32);
    break;
  case MVT::i64:
    Chain = DAG.getCopyFromReg(Chain, PPC::X3, MVT::i64, InFlag).getValue(1);
    ResultVals[0] = Chain.getValue(0);
    NumResults = 1;
    NodeTys.push_back(MVT::i64);
    break;
  case MVT::f32:
  case MVT::f64:
    Chain = DAG.getCopyFromReg(Chain, PPC::F1, Op.Val->getValueType(0),
                               InFlag).getValue(1);
    ResultVals[0] = Chain.getValue(0);
    NumResults = 1;
    NodeTys.push_back(Op.Val->getValueType(0));
    break;
  case MVT::v4f32:
  case MVT::v4i32:
  case MVT::v8i16:
  case MVT::v16i8:
    Chain = DAG.getCopyFromReg(Chain, PPC::V2, Op.Val->getValueType(0),
                               InFlag).getValue(1);
    ResultVals[0] = Chain.getValue(0);
    NumResults = 1;
    NodeTys.push_back(Op.Val->getValueType(0));
    break;
  }

  Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain,
                      DAG.getConstant(NumBytes, PtrVT));
  NodeTys.push_back(MVT::Other);

  // If the function returns void, just return the chain.
  if (NumResults == 0)
    return Chain;

  // Otherwise, merge everything together with a MERGE_VALUES node.
  ResultVals[NumResults++] = Chain;
  SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys,
                              ResultVals, NumResults);
  return Res.getValue(Op.ResNo);
}

/// LowerRET - Lower a return: analyze the return values with the PPC calling
/// convention, mark the return registers live-out, and emit copies into them
/// glued to a PPCISD::RET_FLAG node.
static SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG, TargetMachine &TM) {
  SmallVector<CCValAssign, 16> RVLocs;
  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
  CCState CCInfo(CC, isVarArg, TM, RVLocs);
  CCInfo.AnalyzeReturn(Op.Val, RetCC_PPC);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
  }

  SDOperand Chain = Op.getOperand(0);
  SDOperand Flag;

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1), Flag);
    Flag = Chain.getValue(1);
  }

  if (Flag.Val)
    return DAG.getNode(PPCISD::RET_FLAG, MVT::Other, Chain, Flag);
  else
    return DAG.getNode(PPCISD::RET_FLAG, MVT::Other, Chain);
}

static SDOperand LowerSTACKRESTORE(SDOperand Op, SelectionDAG &DAG,
                                   const PPCSubtarget &Subtarget) {
  // When we pop the dynamic allocation we need to restore the SP link.

  // Get the corect type for pointers.
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Construct the stack pointer operand.
  bool IsPPC64 = Subtarget.isPPC64();
  unsigned SP = IsPPC64 ? PPC::X1 : PPC::R1;
  SDOperand StackPtr = DAG.getRegister(SP, PtrVT);

  // Get the operands for the STACKRESTORE.
  SDOperand Chain = Op.getOperand(0);
  SDOperand SaveSP = Op.getOperand(1);

  // Load the old link SP.
  // --- Tail of LowerSTACKRESTORE (function header is in the previous chunk).
  // Load the back-chain word that the dynamic area currently points at...
  SDOperand LoadLinkSP = DAG.getLoad(PtrVT, Chain, StackPtr, NULL, 0);

  // Restore the stack pointer.
  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), SP, SaveSP);

  // Store the old link SP.
  return DAG.getStore(Chain, LoadLinkSP, StackPtr, NULL, 0);
}

/// LowerDYNAMIC_STACKALLOC - Lower a dynamic stack allocation to a
/// PPCISD::DYNALLOC node, reserving a frame-pointer save slot that the
/// DYNALLOC expansion uses to keep the SP back-chain valid.
static SDOperand LowerDYNAMIC_STACKALLOC(SDOperand Op, SelectionDAG &DAG,
                                         const PPCSubtarget &Subtarget) {
  MachineFunction &MF = DAG.getMachineFunction();
  bool IsPPC64 = Subtarget.isPPC64();
  bool isMachoABI = Subtarget.isMachoABI();

  // Get current frame pointer save index.  The users of this index will be
  // primarily DYNALLOC instructions.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int FPSI = FI->getFramePointerSaveIndex();

  // If the frame pointer save index hasn't been defined yet.
  if (!FPSI) {
    // Find out what the fix offset of the frame pointer save area.
    int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, isMachoABI);

    // Allocate the frame index for frame pointer save area.
    FPSI = MF.getFrameInfo()->CreateFixedObject(IsPPC64? 8 : 4, FPOffset);
    // Save the result.
    FI->setFramePointerSaveIndex(FPSI);
  }

  // Get the inputs.
  SDOperand Chain = Op.getOperand(0);
  SDOperand Size  = Op.getOperand(1);

  // Get the corect type for pointers.
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  // Negate the size.
  SDOperand NegSize = DAG.getNode(ISD::SUB, PtrVT,
                                  DAG.getConstant(0, PtrVT), Size);
  // Construct a node for the frame pointer save index.
  SDOperand FPSIdx = DAG.getFrameIndex(FPSI, PtrVT);
  // Build a DYNALLOC node.
  SDOperand Ops[3] = { Chain, NegSize, FPSIdx };
  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
  return DAG.getNode(PPCISD::DYNALLOC, VTs, Ops, 3);
}


/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.  fsel selects on (operand >= 0.0), so other predicates are
/// mapped onto it by swapping the true/false values, negating the operand,
/// or forming LHS-RHS / RHS-LHS subtractions.
static SDOperand LowerSELECT_CC(SDOperand Op, SelectionDAG &DAG) {
  // Not FP? Not a fsel.
  if (!MVT::isFloatingPoint(Op.getOperand(0).getValueType()) ||
      !MVT::isFloatingPoint(Op.getOperand(2).getValueType()))
    return SDOperand();

  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // Cannot handle SETEQ/SETNE.
  if (CC == ISD::SETEQ || CC == ISD::SETNE) return SDOperand();

  MVT::ValueType ResVT = Op.getValueType();
  MVT::ValueType CmpVT = Op.getOperand(0).getValueType();
  SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);
  SDOperand TV  = Op.getOperand(2), FV  = Op.getOperand(3);

  // If the RHS of the comparison is a 0.0, we don't need to do the
  // subtraction at all.
  if (isFloatingPointZero(RHS))
    switch (CC) {
    default: break;       // SETUO etc aren't handled by fsel.
    case ISD::SETULT:
    case ISD::SETOLT:
    case ISD::SETLT:
      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
      // FALLTHROUGH into the setge case below.
    case ISD::SETUGE:
    case ISD::SETOGE:
    case ISD::SETGE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, MVT::f64, LHS);
      return DAG.getNode(PPCISD::FSEL, ResVT, LHS, TV, FV);
    case ISD::SETUGT:
    case ISD::SETOGT:
    case ISD::SETGT:
      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
      // FALLTHROUGH: x > 0 becomes -x >= 0 with swapped arms.
    case ISD::SETULE:
    case ISD::SETOLE:
    case ISD::SETLE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, MVT::f64, LHS);
      return DAG.getNode(PPCISD::FSEL, ResVT,
                         DAG.getNode(ISD::FNEG, MVT::f64, LHS), TV, FV);
    }

  // General case: materialize LHS-RHS (or RHS-LHS) and fsel on its sign.
  SDOperand Cmp;
  switch (CC) {
  default: break;       // SETUO etc aren't handled by fsel.
  case ISD::SETULT:
  case ISD::SETOLT:
  case ISD::SETLT:
    Cmp = DAG.getNode(ISD::FSUB, CmpVT, LHS, RHS);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, FV, TV);
  case ISD::SETUGE:
  case ISD::SETOGE:
  case ISD::SETGE:
    Cmp = DAG.getNode(ISD::FSUB, CmpVT, LHS, RHS);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, TV, FV);
  case ISD::SETUGT:
  case ISD::SETOGT:
  case ISD::SETGT:
    Cmp = DAG.getNode(ISD::FSUB, CmpVT, RHS, LHS);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, FV, TV);
  case ISD::SETULE:
  case ISD::SETOLE:
  case ISD::SETLE:
    Cmp = DAG.getNode(ISD::FSUB, CmpVT, RHS, LHS);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, ResVT, Cmp, TV, FV);
  }
  return SDOperand();
}

/// LowerFP_TO_SINT - Lower FP_TO_SINT by converting with fctiwz/fctidz and
/// moving the integer result out of the FP register through memory.
static SDOperand LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
  assert(MVT::isFloatingPoint(Op.getOperand(0).getValueType()));
  SDOperand Src = Op.getOperand(0);
  if (Src.getValueType() == MVT::f32)
    Src = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Src);

  SDOperand Tmp;
  switch (Op.getValueType()) {
  default: assert(0 && "Unhandled FP_TO_SINT type in custom expander!");
  case MVT::i32:
    Tmp = DAG.getNode(PPCISD::FCTIWZ, MVT::f64, Src);
    break;
  case MVT::i64:
    Tmp = DAG.getNode(PPCISD::FCTIDZ, MVT::f64, Src);
    break;
  }

  // Convert the FP value to an int value through memory.
  // --- Tail of LowerFP_TO_SINT (function header is in the previous chunk).
  // Reinterpret the f64 conversion result as i64, then truncate for i32.
  SDOperand Bits = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Tmp);
  if (Op.getValueType() == MVT::i32)
    Bits = DAG.getNode(ISD::TRUNCATE, MVT::i32, Bits);
  return Bits;
}

/// LowerSINT_TO_FP - Lower SINT_TO_FP.  i64 sources go straight through
/// fcfid; i32 sources are sign-extended to 64 bits, stored/reloaded through
/// a stack slot, and then fcfid'd.
static SDOperand LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
  if (Op.getOperand(0).getValueType() == MVT::i64) {
    SDOperand Bits = DAG.getNode(ISD::BIT_CONVERT, MVT::f64, Op.getOperand(0));
    SDOperand FP = DAG.getNode(PPCISD::FCFID, MVT::f64, Bits);
    if (Op.getValueType() == MVT::f32)
      FP = DAG.getNode(ISD::FP_ROUND, MVT::f32, FP);
    return FP;
  }

  assert(Op.getOperand(0).getValueType() == MVT::i32 &&
         "Unhandled SINT_TO_FP type in custom expander!");
  // Since we only generate this in 64-bit mode, we can take advantage of
  // 64-bit registers.  In particular, sign extend the input value into the
  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
  // then lfd it and fcfid it.
  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = FrameInfo->CreateStackObject(8, 8);
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  SDOperand FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  SDOperand Ext64 = DAG.getNode(PPCISD::EXTSW_32, MVT::i32,
                                Op.getOperand(0));

  // STD the extended value into the stack slot.
  SDOperand Store = DAG.getNode(PPCISD::STD_32, MVT::Other,
                                DAG.getEntryNode(), Ext64, FIdx,
                                DAG.getSrcValue(NULL));
  // Load the value as a double.
  SDOperand Ld = DAG.getLoad(MVT::f64, Store, FIdx, NULL, 0);

  // FCFID it and return it.
  SDOperand FP = DAG.getNode(PPCISD::FCFID, MVT::f64, Ld);
  if (Op.getValueType() == MVT::f32)
    FP = DAG.getNode(ISD::FP_ROUND, MVT::f32, FP);
  return FP;
}

/// LowerSHL_PARTS - Lower a 64-bit shift-left expressed as two i32 parts
/// (Lo, Hi, Amt) into the classic 6-op PPC sequence.
static SDOperand LowerSHL_PARTS(SDOperand Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
         Op.getOperand(1).getValueType() == MVT::i32 && "Unexpected SHL!");

  // Expand into a bunch of logical ops.  Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
  SDOperand Lo = Op.getOperand(0);
  SDOperand Hi = Op.getOperand(1);
  SDOperand Amt = Op.getOperand(2);

  // Tmp2|Tmp3 handle Amt < 32 (bits carried from Lo into Hi); Tmp6 handles
  // Amt >= 32, relying on PPC shifts producing 0 for out-of-range amounts.
  SDOperand Tmp1 = DAG.getNode(ISD::SUB, MVT::i32,
                               DAG.getConstant(32, MVT::i32), Amt);
  SDOperand Tmp2 = DAG.getNode(PPCISD::SHL, MVT::i32, Hi, Amt);
  SDOperand Tmp3 = DAG.getNode(PPCISD::SRL, MVT::i32, Lo, Tmp1);
  SDOperand Tmp4 = DAG.getNode(ISD::OR , MVT::i32, Tmp2, Tmp3);
  SDOperand Tmp5 = DAG.getNode(ISD::ADD, MVT::i32, Amt,
                               DAG.getConstant(-32U, MVT::i32));
  SDOperand Tmp6 = DAG.getNode(PPCISD::SHL, MVT::i32, Lo, Tmp5);
  SDOperand OutHi = DAG.getNode(ISD::OR, MVT::i32, Tmp4, Tmp6);
  SDOperand OutLo = DAG.getNode(PPCISD::SHL, MVT::i32, Lo, Amt);
  SDOperand OutOps[] = { OutLo, OutHi };
  return DAG.getNode(ISD::MERGE_VALUES, DAG.getVTList(MVT::i32, MVT::i32),
                     OutOps, 2);
}

/// LowerSRL_PARTS - Lower a 64-bit logical shift-right expressed as two i32
/// parts, mirroring LowerSHL_PARTS with the shift directions reversed.
static SDOperand LowerSRL_PARTS(SDOperand Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
         Op.getOperand(1).getValueType() == MVT::i32 && "Unexpected SRL!");

  // Otherwise, expand into a bunch of logical ops.  Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
  SDOperand Lo = Op.getOperand(0);
  SDOperand Hi = Op.getOperand(1);
  SDOperand Amt = Op.getOperand(2);

  SDOperand Tmp1 = DAG.getNode(ISD::SUB, MVT::i32,
                               DAG.getConstant(32, MVT::i32), Amt);
  SDOperand Tmp2 = DAG.getNode(PPCISD::SRL, MVT::i32, Lo, Amt);
  SDOperand Tmp3 = DAG.getNode(PPCISD::SHL, MVT::i32, Hi, Tmp1);
  SDOperand Tmp4 = DAG.getNode(ISD::OR , MVT::i32, Tmp2, Tmp3);
  SDOperand Tmp5 = DAG.getNode(ISD::ADD, MVT::i32, Amt,
                               DAG.getConstant(-32U, MVT::i32));
  SDOperand Tmp6 = DAG.getNode(PPCISD::SRL, MVT::i32, Hi, Tmp5);
  SDOperand OutLo = DAG.getNode(ISD::OR, MVT::i32, Tmp4, Tmp6);
  SDOperand OutHi = DAG.getNode(PPCISD::SRL, MVT::i32, Hi, Amt);
  SDOperand OutOps[] = { OutLo, OutHi };
  return DAG.getNode(ISD::MERGE_VALUES, DAG.getVTList(MVT::i32, MVT::i32),
                     OutOps, 2);
}

/// LowerSRA_PARTS - Lower a 64-bit arithmetic shift-right expressed as two
/// i32 parts; needs a select_cc because sra's oversized-amount behavior
/// (sign fill) differs from srl's (zero fill).
static SDOperand LowerSRA_PARTS(SDOperand Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
         Op.getOperand(1).getValueType() == MVT::i32 && "Unexpected SRA!");

  // Otherwise, expand into a bunch of logical ops, followed by a select_cc.
  // --- Tail of LowerSRA_PARTS (function header is in the previous chunk).
  SDOperand Lo = Op.getOperand(0);
  SDOperand Hi = Op.getOperand(1);
  SDOperand Amt = Op.getOperand(2);

  SDOperand Tmp1 = DAG.getNode(ISD::SUB, MVT::i32,
                               DAG.getConstant(32, MVT::i32), Amt);
  SDOperand Tmp2 = DAG.getNode(PPCISD::SRL, MVT::i32, Lo, Amt);
  SDOperand Tmp3 = DAG.getNode(PPCISD::SHL, MVT::i32, Hi, Tmp1);
  SDOperand Tmp4 = DAG.getNode(ISD::OR , MVT::i32, Tmp2, Tmp3);
  SDOperand Tmp5 = DAG.getNode(ISD::ADD, MVT::i32, Amt,
                               DAG.getConstant(-32U, MVT::i32));
  SDOperand Tmp6 = DAG.getNode(PPCISD::SRA, MVT::i32, Hi, Tmp5);
  SDOperand OutHi = DAG.getNode(PPCISD::SRA, MVT::i32, Hi, Amt);
  // Amt-32 <= 0 means Amt < 32: take the merged Tmp4; otherwise take the
  // sign-filled Tmp6.
  SDOperand OutLo = DAG.getSelectCC(Tmp5, DAG.getConstant(0, MVT::i32),
                                    Tmp4, Tmp6, ISD::SETLE);
  SDOperand OutOps[] = { OutLo, OutHi };
  return DAG.getNode(ISD::MERGE_VALUES, DAG.getVTList(MVT::i32, MVT::i32),
                     OutOps, 2);
}

//===----------------------------------------------------------------------===//
// Vector related lowering.
//

// If this is a vector of constants or undefs, get the bits.  A bit in
// UndefBits is set if the corresponding element of the vector is an
// ISD::UNDEF value.  For undefs, the corresponding VectorBits values are
// zero.  Return true if this is not an array of constants, false if it is.
//
static bool GetConstantBuildVectorBits(SDNode *BV, uint64_t VectorBits[2],
                                       uint64_t UndefBits[2]) {
  // Start with zero'd results.
  VectorBits[0] = VectorBits[1] = UndefBits[0] = UndefBits[1] = 0;

  unsigned EltBitSize = MVT::getSizeInBits(BV->getOperand(0).getValueType());
  for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
    SDOperand OpVal = BV->getOperand(i);

    unsigned PartNo = i >= e/2;     // In the upper 128 bits?
    // Big-endian slot numbering within each 64-bit half.
    unsigned SlotNo = e/2 - (i & (e/2-1))-1;  // Which subpiece of the uint64_t.

    uint64_t EltBits = 0;
    if (OpVal.getOpcode() == ISD::UNDEF) {
      // Mask of EltBitSize ones (element sizes here are <= 32 bits).
      uint64_t EltUndefBits = ~0U >> (32-EltBitSize);
      UndefBits[PartNo] |= EltUndefBits << (SlotNo*EltBitSize);
      continue;
    } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
      EltBits = CN->getValue() & (~0U >> (32-EltBitSize));
    } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
      assert(CN->getValueType(0) == MVT::f32 &&
             "Only one legal FP vector type!");
      EltBits = FloatToBits(CN->getValue());
    } else {
      // Nonconstant element.
      return true;
    }

    VectorBits[PartNo] |= EltBits << (SlotNo*EltBitSize);
  }

  //printf("%llx %llx %llx %llx\n",
  //       VectorBits[0], VectorBits[1], UndefBits[0], UndefBits[1]);
  return false;
}

// If this is a splat (repetition) of a value across the whole vector, return
// the smallest size that splats it.  For example, "0x01010101010101..." is a
// splat of 0x01, 0x0101, and 0x01010101.  We return SplatBits = 0x01 and
// SplatSize = 1 byte.  Undef bits never prevent a match: they are OR'd/AND'd
// away at each halving step.
static bool isConstantSplat(const uint64_t Bits128[2],
                            const uint64_t Undef128[2],
                            unsigned &SplatBits, unsigned &SplatUndef,
                            unsigned &SplatSize) {

  // Don't let undefs prevent splats from matching.  See if the top 64-bits are
  // the same as the lower 64-bits, ignoring undefs.
  if ((Bits128[0] & ~Undef128[1]) != (Bits128[1] & ~Undef128[0]))
    return false;  // Can't be a splat if two pieces don't match.

  uint64_t Bits64  = Bits128[0] | Bits128[1];
  uint64_t Undef64 = Undef128[0] & Undef128[1];

  // Check that the top 32-bits are the same as the lower 32-bits, ignoring
  // undefs.
  if ((Bits64 & (~Undef64 >> 32)) != ((Bits64 >> 32) & ~Undef64))
    return false;  // Can't be a splat if two pieces don't match.

  uint32_t Bits32  = uint32_t(Bits64) | uint32_t(Bits64 >> 32);
  uint32_t Undef32 = uint32_t(Undef64) & uint32_t(Undef64 >> 32);

  // If the top 16-bits are different than the lower 16-bits, ignoring
  // undefs, we have an i32 splat.
  if ((Bits32 & (~Undef32 >> 16)) != ((Bits32 >> 16) & ~Undef32)) {
    SplatBits = Bits32;
    SplatUndef = Undef32;
    SplatSize = 4;
    return true;
  }

  uint16_t Bits16  = uint16_t(Bits32) | uint16_t(Bits32 >> 16);
  uint16_t Undef16 = uint16_t(Undef32) & uint16_t(Undef32 >> 16);

  // If the top 8-bits are different than the lower 8-bits, ignoring
  // undefs, we have an i16 splat.
  if ((Bits16 & (uint16_t(~Undef16) >> 8)) != ((Bits16 >> 8) & ~Undef16)) {
    SplatBits = Bits16;
    SplatUndef = Undef16;
    SplatSize = 2;
    return true;
  }

  // Otherwise, we have an 8-bit splat.
  SplatBits  = uint8_t(Bits16)  | uint8_t(Bits16 >> 8);
  SplatUndef = uint8_t(Undef16) & uint8_t(Undef16 >> 8);
  SplatSize = 1;
  return true;
}

/// BuildSplatI - Build a canonical splati of Val with an element size of
/// SplatSize.  Cast the result to VT.  Val must fit the 5-bit signed
/// immediate field of vspltis[bhw].
static SDOperand BuildSplatI(int Val, unsigned SplatSize, MVT::ValueType VT,
                             SelectionDAG &DAG) {
  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");

  static const MVT::ValueType VTys[] = { // canonical VT to use for each size.
    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
  };

  MVT::ValueType ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];

  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
  if (Val == -1)
    SplatSize = 1;

  MVT::ValueType CanonicalVT = VTys[SplatSize-1];

  // Build a canonical splat for this value.
  SDOperand Elt = DAG.getConstant(Val, MVT::getVectorElementType(CanonicalVT));
  SmallVector<SDOperand, 8> Ops;
  Ops.assign(MVT::getVectorNumElements(CanonicalVT), Elt);
  SDOperand Res = DAG.getNode(ISD::BUILD_VECTOR, CanonicalVT,
                              &Ops[0], Ops.size());
  return DAG.getNode(ISD::BIT_CONVERT, ReqVT, Res);
}

/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
/// specified intrinsic ID.
+static SDOperand BuildIntrinsicOp(unsigned IID, SDOperand LHS, SDOperand RHS, + SelectionDAG &DAG, + MVT::ValueType DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = LHS.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DestVT, + DAG.getConstant(IID, MVT::i32), LHS, RHS); +} + +/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the +/// specified intrinsic ID. +static SDOperand BuildIntrinsicOp(unsigned IID, SDOperand Op0, SDOperand Op1, + SDOperand Op2, SelectionDAG &DAG, + MVT::ValueType DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = Op0.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DestVT, + DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2); +} + + +/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified +/// amount. The result has the specified value type. +static SDOperand BuildVSLDOI(SDOperand LHS, SDOperand RHS, unsigned Amt, + MVT::ValueType VT, SelectionDAG &DAG) { + // Force LHS/RHS to be the right type. + LHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, LHS); + RHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, RHS); + + SDOperand Ops[16]; + for (unsigned i = 0; i != 16; ++i) + Ops[i] = DAG.getConstant(i+Amt, MVT::i32); + SDOperand T = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v16i8, LHS, RHS, + DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, Ops,16)); + return DAG.getNode(ISD::BIT_CONVERT, VT, T); +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. If we CAN select this case, and if it +// selects to a single instruction, return Op. Otherwise, if we can codegen +// this case more efficiently than a constant pool load, lower it to the +// sequence of ops that should be used. +static SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) { + // If this is a vector of constants or undefs, get the bits. A bit in + // UndefBits is set if the corresponding element of the vector is an + // ISD::UNDEF value. 
For undefs, the corresponding VectorBits values are + // zero. + uint64_t VectorBits[2]; + uint64_t UndefBits[2]; + if (GetConstantBuildVectorBits(Op.Val, VectorBits, UndefBits)) + return SDOperand(); // Not a constant vector. + + // If this is a splat (repetition) of a value across the whole vector, return + // the smallest size that splats it. For example, "0x01010101010101..." is a + // splat of 0x01, 0x0101, and 0x01010101. We return SplatBits = 0x01 and + // SplatSize = 1 byte. + unsigned SplatBits, SplatUndef, SplatSize; + if (isConstantSplat(VectorBits, UndefBits, SplatBits, SplatUndef, SplatSize)){ + bool HasAnyUndefs = (UndefBits[0] | UndefBits[1]) != 0; + + // First, handle single instruction cases. + + // All zeros? + if (SplatBits == 0) { + // Canonicalize all zero vectors to be v4i32. + if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { + SDOperand Z = DAG.getConstant(0, MVT::i32); + Z = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Z, Z, Z, Z); + Op = DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Z); + } + return Op; + } + + // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. + int32_t SextVal= int32_t(SplatBits << (32-8*SplatSize)) >> (32-8*SplatSize); + if (SextVal >= -16 && SextVal <= 15) + return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG); + + + // Two instruction sequences. + + // If this value is in the range [-32,30] and is even, use: + // tmp = VSPLTI[bhw], result = add tmp, tmp + if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) { + Op = BuildSplatI(SextVal >> 1, SplatSize, Op.getValueType(), DAG); + return DAG.getNode(ISD::ADD, Op.getValueType(), Op, Op); + } + + // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is + // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important + // for fneg/fabs. 
+ if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { + // Make -1 and vspltisw -1: + SDOperand OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG); + + // Make the VSLW intrinsic, computing 0x8000_0000. + SDOperand Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, + OnesV, DAG); + + // xor by OnesV to invert it. + Res = DAG.getNode(ISD::XOR, MVT::v4i32, Res, OnesV); + return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res); + } + + // Check to see if this is a wide variety of vsplti*, binop self cases. + unsigned SplatBitSize = SplatSize*8; + static const signed char SplatCsts[] = { + -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, + -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 + }; + + for (unsigned idx = 0; idx < sizeof(SplatCsts)/sizeof(SplatCsts[0]); ++idx){ + // Indirect through the SplatCsts array so that we favor 'vsplti -1' for + // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' + int i = SplatCsts[idx]; + + // Figure out what shift amount will be used by altivec if shifted by i in + // this splat size. + unsigned TypeShiftAmt = i & (SplatBitSize-1); + + // vsplti + shl self. + if (SextVal == (i << (int)TypeShiftAmt)) { + SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG); + static const unsigned IIDs[] = { // Intrinsic to use for each size. + Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, + Intrinsic::ppc_altivec_vslw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG); + return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res); + } + + // vsplti + srl self. + if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { + SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG); + static const unsigned IIDs[] = { // Intrinsic to use for each size. 
+ Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, + Intrinsic::ppc_altivec_vsrw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG); + return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res); + } + + // vsplti + sra self. + if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { + SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG); + static const unsigned IIDs[] = { // Intrinsic to use for each size. + Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, + Intrinsic::ppc_altivec_vsraw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG); + return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res); + } + + // vsplti + rol self. + if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | + ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { + SDOperand Res = BuildSplatI(i, SplatSize, MVT::Other, DAG); + static const unsigned IIDs[] = { // Intrinsic to use for each size. + Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, + Intrinsic::ppc_altivec_vrlw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG); + return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Res); + } + + // t = vsplti c, result = vsldoi t, t, 1 + if (SextVal == ((i << 8) | (i >> (TypeShiftAmt-8)))) { + SDOperand T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG); + return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG); + } + // t = vsplti c, result = vsldoi t, t, 2 + if (SextVal == ((i << 16) | (i >> (TypeShiftAmt-16)))) { + SDOperand T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG); + return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG); + } + // t = vsplti c, result = vsldoi t, t, 3 + if (SextVal == ((i << 24) | (i >> (TypeShiftAmt-24)))) { + SDOperand T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG); + return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG); + } + } + + // Three instruction sequences. + + // Odd, in range [17,31]: (vsplti C)-(vsplti -16). 
// (continues LowerBUILD_VECTOR: three-instruction splat sequences)
  // In range [0,31]: materialize as (vsplti SextVal-16) - (vsplti -16).
  if (SextVal >= 0 && SextVal <= 31) {
    SDOperand LHS = BuildSplatI(SextVal-16, SplatSize, MVT::Other, DAG);
    SDOperand RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG);
    LHS = DAG.getNode(ISD::SUB, Op.getValueType(), LHS, RHS);
    return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), LHS);
  }
  // Odd, in range [-31,-17]:  (vsplti C)+(vsplti -16).
  if (SextVal >= -31 && SextVal <= 0) {
    SDOperand LHS = BuildSplatI(SextVal+16, SplatSize, MVT::Other, DAG);
    SDOperand RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG);
    LHS = DAG.getNode(ISD::ADD, Op.getValueType(), LHS, RHS);
    return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), LHS);
  }
  }

  // Not a splat we know how to materialize cheaply.
  return SDOperand();
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDOperand GeneratePerfectShuffle(unsigned PFEntry, SDOperand LHS,
                                        SDOperand RHS, SelectionDAG &DAG) {
  // Decode the packed table entry: bits [29:26] are the opcode, and two
  // 13-bit fields describe the (recursively encoded) operands.
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);

  enum {
    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VMRGHW,
    OP_VMRGLW,
    OP_VSPLTISW0,
    OP_VSPLTISW1,
    OP_VSPLTISW2,
    OP_VSPLTISW3,
    OP_VSLDOI4,
    OP_VSLDOI8,
    OP_VSLDOI12
  };

  if (OpNum == OP_COPY) {
    // Operand IDs are base-9 encoded element selections: <0,1,2,3> selects
    // LHS, <4,5,6,7> selects RHS.
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  // Recursively materialize both operands first.
  SDOperand OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG);

  unsigned ShufIdxs[16];
  switch (OpNum) {
  default: assert(0 && "Unknown i32 permute!");
  case OP_VMRGHW:
    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
    break;
  case OP_VMRGLW:
    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
    break;
  case OP_VSPLTISW0:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+0;
    break;
  case OP_VSPLTISW1:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+4;
    break;
  case OP_VSPLTISW2:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+8;
    break;
  case OP_VSPLTISW3:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+12;
    break;
  case OP_VSLDOI4:
    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG);
  case OP_VSLDOI8:
    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG);
  case OP_VSLDOI12:
    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG);
  }
  SDOperand Ops[16];
  for (unsigned i = 0; i != 16; ++i)
    Ops[i] = DAG.getConstant(ShufIdxs[i], MVT::i32);

  return DAG.getNode(ISD::VECTOR_SHUFFLE, OpLHS.getValueType(), OpLHS, OpRHS,
                     DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, Ops, 16));
}

/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
/// return the code it can be lowered into.  Worst case, it can always be
/// lowered into a vperm.
static SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
  SDOperand V1 = Op.getOperand(0);
  SDOperand V2 = Op.getOperand(1);
  SDOperand PermMask = Op.getOperand(2);

  // Cases that are handled by instructions that take permute immediates
  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
  // selected by the instruction selector.
  if (V2.getOpcode() == ISD::UNDEF) {
    if (PPC::isSplatShuffleMask(PermMask.Val, 1) ||
        PPC::isSplatShuffleMask(PermMask.Val, 2) ||
        PPC::isSplatShuffleMask(PermMask.Val, 4) ||
        PPC::isVPKUWUMShuffleMask(PermMask.Val, true) ||
        PPC::isVPKUHUMShuffleMask(PermMask.Val, true) ||
        PPC::isVSLDOIShuffleMask(PermMask.Val, true) != -1 ||
        PPC::isVMRGLShuffleMask(PermMask.Val, 1, true) ||
        PPC::isVMRGLShuffleMask(PermMask.Val, 2, true) ||
        PPC::isVMRGLShuffleMask(PermMask.Val, 4, true) ||
        PPC::isVMRGHShuffleMask(PermMask.Val, 1, true) ||
        PPC::isVMRGHShuffleMask(PermMask.Val, 2, true) ||
        PPC::isVMRGHShuffleMask(PermMask.Val, 4, true)) {
      return Op;
    }
  }

  // Altivec has a variety of "shuffle immediates" that take two vector inputs
  // and produce a fixed permutation.  If any of these match, do not lower to
  // VPERM.
  if (PPC::isVPKUWUMShuffleMask(PermMask.Val, false) ||
      PPC::isVPKUHUMShuffleMask(PermMask.Val, false) ||
      PPC::isVSLDOIShuffleMask(PermMask.Val, false) != -1 ||
      PPC::isVMRGLShuffleMask(PermMask.Val, 1, false) ||
      PPC::isVMRGLShuffleMask(PermMask.Val, 2, false) ||
      PPC::isVMRGLShuffleMask(PermMask.Val, 4, false) ||
      PPC::isVMRGHShuffleMask(PermMask.Val, 1, false) ||
      PPC::isVMRGHShuffleMask(PermMask.Val, 2, false) ||
      PPC::isVMRGHShuffleMask(PermMask.Val, 4, false))
    return Op;

  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
  // perfect shuffle table to emit an optimal matching sequence.
  unsigned PFIndexes[4];
  bool isFourElementShuffle = true;
  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
    unsigned EltNo = 8;   // Start out undef.
    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
      if (PermMask.getOperand(i*4+j).getOpcode() == ISD::UNDEF)
        continue;   // Undef, ignore it.

      unsigned ByteSource =
        cast<ConstantSDNode>(PermMask.getOperand(i*4+j))->getValue();
      // All four bytes of an element must come from the same source word,
      // in byte order, for this to be a word shuffle.
      if ((ByteSource & 3) != j) {
        isFourElementShuffle = false;
        break;
      }

      if (EltNo == 8) {
        EltNo = ByteSource/4;
      } else if (EltNo != ByteSource/4) {
        isFourElementShuffle = false;
        break;
      }
    }
    PFIndexes[i] = EltNo;
  }

  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
  // perfect shuffle vector to determine if it is cost effective to do this as
  // discrete instructions, or whether we should use a vperm.
  if (isFourElementShuffle) {
    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];

    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    // Determining when to avoid vperm is tricky.  Many things affect the cost
    // of vperm, particularly how many times the perm mask needs to be computed.
    // For example, if the perm mask can be hoisted out of a loop or is already
    // used (perhaps because there are multiple permutes with the same shuffle
    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
    // the loop requires an extra register.
    //
    // As a compromise, we only emit discrete instructions if the shuffle can be
    // generated in 3 or fewer operations.  When we have loop information
    // available, if this block is within a loop, we should avoid using vperm
    // for 3-operation perms and use a constant pool load instead.
    if (Cost < 3)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG);
  }

  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
  // vector that will get spilled to the constant pool.
  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;

  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
  // that it is in input element units, not in bytes.  Convert now.
// (continues LowerVECTOR_SHUFFLE: emit the general vperm form)
  MVT::ValueType EltVT = MVT::getVectorElementType(V1.getValueType());
  unsigned BytesPerElement = MVT::getSizeInBits(EltVT)/8;

  SmallVector<SDOperand, 16> ResultMask;
  for (unsigned i = 0, e = PermMask.getNumOperands(); i != e; ++i) {
    unsigned SrcElt;
    if (PermMask.getOperand(i).getOpcode() == ISD::UNDEF)
      SrcElt = 0;
    else
      SrcElt = cast<ConstantSDNode>(PermMask.getOperand(i))->getValue();

    // Expand each element index into BytesPerElement consecutive byte indices.
    for (unsigned j = 0; j != BytesPerElement; ++j)
      ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
                                           MVT::i8));
  }

  SDOperand VPermMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8,
                                    &ResultMask[0], ResultMask.size());
  return DAG.getNode(PPCISD::VPERM, V1.getValueType(), V1, V2, VPermMask);
}

/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
/// altivec comparison.  If it is, return true and fill in Opc/isDot with
/// information about the intrinsic.
static bool getAltivecCompareInfo(SDOperand Intrin, int &CompareOpc,
                                  bool &isDot) {
  unsigned IntrinsicID = cast<ConstantSDNode>(Intrin.getOperand(0))->getValue();
  CompareOpc = -1;
  isDot = false;
  // The "_p" (predicate) forms are the record ("dot") variants that also set
  // CR6; the CompareOpc values are the vcmp* instruction opcode fields.
  switch (IntrinsicID) {
  default: return false;
    // Comparison predicates.
  case Intrinsic::ppc_altivec_vcmpbfp_p:  CompareOpc = 966; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc =   6; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc =  70; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
  case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;

    // Normal Comparisons.
  case Intrinsic::ppc_altivec_vcmpbfp:    CompareOpc = 966; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpeqfp:   CompareOpc = 198; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpequb:   CompareOpc =   6; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpequh:   CompareOpc =  70; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpequw:   CompareOpc = 134; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgefp:   CompareOpc = 454; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgtfp:   CompareOpc = 710; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgtsb:   CompareOpc = 774; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgtsh:   CompareOpc = 838; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgtsw:   CompareOpc = 902; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgtub:   CompareOpc = 518; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgtuh:   CompareOpc = 582; isDot = 0; break;
  case Intrinsic::ppc_altivec_vcmpgtuw:   CompareOpc = 646; isDot = 0; break;
  }
  return true;
}

/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
/// lower, do it, otherwise return null.
static SDOperand LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG) {
  // If this is a lowered altivec predicate compare, CompareOpc is set to the
  // opcode number of the comparison.
  int CompareOpc;
  bool isDot;
  if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
    return SDOperand();    // Don't custom lower most intrinsics.

  // If this is a non-dot comparison, make the VCMP node and we are done.
  if (!isDot) {
    SDOperand Tmp = DAG.getNode(PPCISD::VCMP, Op.getOperand(2).getValueType(),
                                Op.getOperand(1), Op.getOperand(2),
                                DAG.getConstant(CompareOpc, MVT::i32));
    return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Tmp);
  }

  // Create the PPCISD altivec 'dot' comparison node.
// (continues LowerINTRINSIC_WO_CHAIN: build the record-form compare and
// extract the requested CR6 bit)
  SDOperand Ops[] = {
    Op.getOperand(2),  // LHS
    Op.getOperand(3),  // RHS
    DAG.getConstant(CompareOpc, MVT::i32)
  };
  std::vector<MVT::ValueType> VTs;
  VTs.push_back(Op.getOperand(2).getValueType());
  VTs.push_back(MVT::Flag);
  SDOperand CompNode = DAG.getNode(PPCISD::VCMPo, VTs, Ops, 3);

  // Now that we have the comparison, emit a copy from the CR to a GPR.
  // This is flagged to the above dot comparison.
  SDOperand Flags = DAG.getNode(PPCISD::MFCR, MVT::i32,
                                DAG.getRegister(PPC::CR6, MVT::i32),
                                CompNode.getValue(1));

  // Unpack the result based on how the target uses it.
  unsigned BitNo;   // Bit # of CR6.
  bool InvertBit;   // Invert result?
  switch (cast<ConstantSDNode>(Op.getOperand(1))->getValue()) {
  default:  // Can't happen, don't crash on invalid number though.
  case 0:   // Return the value of the EQ bit of CR6.
    BitNo = 0; InvertBit = false;
    break;
  case 1:   // Return the inverted value of the EQ bit of CR6.
    BitNo = 0; InvertBit = true;
    break;
  case 2:   // Return the value of the LT bit of CR6.
    BitNo = 2; InvertBit = false;
    break;
  case 3:   // Return the inverted value of the LT bit of CR6.
    BitNo = 2; InvertBit = true;
    break;
  }

  // Shift the bit into the low position.
  // NOTE(review): the 8-(3-BitNo) amount assumes CR6's four bits sit in
  // bits 7..4 of the MFCR result -- confirm against the MFCR lowering.
  Flags = DAG.getNode(ISD::SRL, MVT::i32, Flags,
                      DAG.getConstant(8-(3-BitNo), MVT::i32));
  // Isolate the bit.
  Flags = DAG.getNode(ISD::AND, MVT::i32, Flags,
                      DAG.getConstant(1, MVT::i32));

  // If we are supposed to, toggle the bit.
  if (InvertBit)
    Flags = DAG.getNode(ISD::XOR, MVT::i32, Flags,
                        DAG.getConstant(1, MVT::i32));
  return Flags;
}

static SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  // Create a stack slot that is 16-byte aligned.
  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = FrameInfo->CreateStackObject(16, 16);
  MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  SDOperand FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  // Store the input value into Value#0 of the stack slot.
  SDOperand Store = DAG.getStore(DAG.getEntryNode(),
                                 Op.getOperand(0), FIdx, NULL, 0);
  // Load it out.
  return DAG.getLoad(Op.getValueType(), Store, FIdx, NULL, 0);
}

static SDOperand LowerMUL(SDOperand Op, SelectionDAG &DAG) {
  if (Op.getValueType() == MVT::v4i32) {
    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    SDOperand Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG);
    SDOperand Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG); // +16 as shift amt.

    SDOperand RHSSwap =   // = vrlw RHS, 16
      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG);

    // Shrinkify inputs to v8i16.
    LHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, LHS);
    RHS = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHS);
    RHSSwap = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, RHSSwap);

    // Low parts multiplied together, generating 32-bit results (we ignore the
    // top parts).
    SDOperand LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
                                        LHS, RHS, DAG, MVT::v4i32);

    SDOperand HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
                                        LHS, RHSSwap, Zero, DAG, MVT::v4i32);
    // Shift the high parts up 16 bits.
    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd, Neg16, DAG);
    return DAG.getNode(ISD::ADD, MVT::v4i32, LoProd, HiProd);
  } else if (Op.getValueType() == MVT::v8i16) {
    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    // vmladduhm with a zero addend is a plain 16-bit multiply.
    SDOperand Zero = BuildSplatI(0, 1, MVT::v8i16, DAG);

    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
                            LHS, RHS, Zero, DAG);
  } else if (Op.getValueType() == MVT::v16i8) {
    SDOperand LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    // Multiply the even 8-bit parts, producing 16-bit sums.
    SDOperand EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
                                           LHS, RHS, DAG, MVT::v8i16);
    EvenParts = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, EvenParts);

    // Multiply the odd 8-bit parts, producing 16-bit sums.
    SDOperand OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
                                          LHS, RHS, DAG, MVT::v8i16);
    OddParts = DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, OddParts);

    // Merge the results together, picking the low byte of each 16-bit product.
    SDOperand Ops[16];
    for (unsigned i = 0; i != 8; ++i) {
      Ops[i*2  ] = DAG.getConstant(2*i+1, MVT::i8);
      Ops[i*2+1] = DAG.getConstant(2*i+1+16, MVT::i8);
    }
    return DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v16i8, EvenParts, OddParts,
                       DAG.getNode(ISD::BUILD_VECTOR, MVT::v16i8, Ops, 16));
  } else {
    assert(0 && "Unknown mul to lower!");
    abort();
  }
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDOperand PPCTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
  switch (Op.getOpcode()) {
  default: assert(0 && "Wasn't expecting to be able to lower this!");
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VASTART:
    return LowerVASTART(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
                        VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);

  case ISD::VAARG:
    return LowerVAARG(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
                      VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);

  case ISD::FORMAL_ARGUMENTS:
    return LowerFORMAL_ARGUMENTS(Op, DAG, VarArgsFrameIndex,
                                 VarArgsStackOffset, VarArgsNumGPR,
                                 VarArgsNumFPR, PPCSubTarget);

  case ISD::CALL:               return LowerCALL(Op, DAG, PPCSubTarget);
  case ISD::RET:                return LowerRET(Op, DAG, getTargetMachine());
  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);

  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);

  // Lower 64-bit shifts.
  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);

  // Vector-related lowering.
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, DAG);

  // Frame & Return address.  Currently unimplemented
  case ISD::RETURNADDR:         break;
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  }
  // Falling out of the switch (e.g. RETURNADDR) means "not custom lowered".
  return SDOperand();
}

//===----------------------------------------------------------------------===//
//  Other Lowering Code
//===----------------------------------------------------------------------===//

MachineBasicBlock *
PPCTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
                                           MachineBasicBlock *BB) {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  assert((MI->getOpcode() == PPC::SELECT_CC_I4 ||
          MI->getOpcode() == PPC::SELECT_CC_I8 ||
          MI->getOpcode() == PPC::SELECT_CC_F4 ||
          MI->getOpcode() == PPC::SELECT_CC_F8 ||
          MI->getOpcode() == PPC::SELECT_CC_VRRC) &&
         "Unexpected instr type to insert");

  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
  // control-flow pattern.  The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  ilist<MachineBasicBlock>::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
  unsigned SelectPred = MI->getOperand(4).getImm();
  BuildMI(BB, TII->get(PPC::BCC))
    .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
  MachineFunction *F = BB->getParent();
  F->getBasicBlockList().insert(It, copy0MBB);
  F->getBasicBlockList().insert(It, sinkMBB);
  // Update machine-CFG edges by first adding all successors of the current
  // block to the new block which will contain the Phi node for the select.
// (continues InsertAtEndOfBasicBlock: rewire the CFG around the new diamond)
  for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
      e = BB->succ_end(); i != e; ++i)
    sinkMBB->addSuccessor(*i);
  // Next, remove all successors of the current block, and add the true
  // and fallthrough blocks as its successors.
  while(!BB->succ_empty())
    BB->removeSuccessor(BB->succ_begin());
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  BB = copy0MBB;

  // Update machine-CFG edges
  BB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BB = sinkMBB;
  BuildMI(BB, TII->get(PPC::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  delete MI;   // The pseudo instruction is gone now.
  return BB;
}

//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//

SDOperand PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  TargetMachine &TM = getTargetMachine();
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case PPCISD::SHL:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->getValue() == 0)   // 0 << V -> 0.
        return N->getOperand(0);
    }
    break;
  case PPCISD::SRL:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->getValue() == 0)   // 0 >>u V -> 0.
        return N->getOperand(0);
    }
    break;
  case PPCISD::SRA:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->getValue() == 0 ||   //  0 >>s V -> 0.
          C->isAllOnesValue())    // -1 >>s V -> -1.
        return N->getOperand(0);
    }
    break;

  case ISD::SINT_TO_FP:
    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
        // We allow the src/dst to be either f32/f64, but the intermediate
        // type must be i64.
        if (N->getOperand(0).getValueType() == MVT::i64) {
          SDOperand Val = N->getOperand(0).getOperand(0);
          if (Val.getValueType() == MVT::f32) {
            Val = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Val);
            DCI.AddToWorklist(Val.Val);
          }

          Val = DAG.getNode(PPCISD::FCTIDZ, MVT::f64, Val);
          DCI.AddToWorklist(Val.Val);
          Val = DAG.getNode(PPCISD::FCFID, MVT::f64, Val);
          DCI.AddToWorklist(Val.Val);
          if (N->getValueType(0) == MVT::f32) {
            Val = DAG.getNode(ISD::FP_ROUND, MVT::f32, Val);
            DCI.AddToWorklist(Val.Val);
          }
          return Val;
        } else if (N->getOperand(0).getValueType() == MVT::i32) {
          // If the intermediate type is i32, we can avoid the load/store here
          // too.
        }
      }
    }
    break;
  case ISD::STORE:
    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
    if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
        N->getOperand(1).getValueType() == MVT::i32) {
      SDOperand Val = N->getOperand(1).getOperand(0);
      if (Val.getValueType() == MVT::f32) {
        Val = DAG.getNode(ISD::FP_EXTEND, MVT::f64, Val);
        DCI.AddToWorklist(Val.Val);
      }
      Val = DAG.getNode(PPCISD::FCTIWZ, MVT::f64, Val);
      DCI.AddToWorklist(Val.Val);

      Val = DAG.getNode(PPCISD::STFIWX, MVT::Other, N->getOperand(0), Val,
                        N->getOperand(2), N->getOperand(3));
      DCI.AddToWorklist(Val.Val);
      return Val;
    }

    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
    if (N->getOperand(1).getOpcode() == ISD::BSWAP &&
        N->getOperand(1).Val->hasOneUse() &&
        (N->getOperand(1).getValueType() == MVT::i32 ||
         N->getOperand(1).getValueType() == MVT::i16)) {
      SDOperand BSwapOp = N->getOperand(1).getOperand(0);
      // Do an any-extend to 32-bits if this is a half-word input.
      if (BSwapOp.getValueType() == MVT::i16)
        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, BSwapOp);

      return DAG.getNode(PPCISD::STBRX, MVT::Other, N->getOperand(0), BSwapOp,
                         N->getOperand(2), N->getOperand(3),
                         DAG.getValueType(N->getOperand(1).getValueType()));
    }
    break;
  case ISD::BSWAP:
    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
    if (ISD::isNON_EXTLoad(N->getOperand(0).Val) &&
        N->getOperand(0).hasOneUse() &&
        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16)) {
      SDOperand Load = N->getOperand(0);
      LoadSDNode *LD = cast<LoadSDNode>(Load);
      // Create the byte-swapping load.
      std::vector<MVT::ValueType> VTs;
      VTs.push_back(MVT::i32);
      VTs.push_back(MVT::Other);
      SDOperand SV = DAG.getSrcValue(LD->getSrcValue(), LD->getSrcValueOffset());
      SDOperand Ops[] = {
        LD->getChain(),    // Chain
        LD->getBasePtr(),  // Ptr
        SV,                // SrcValue
        DAG.getValueType(N->getValueType(0)) // VT
      };
      SDOperand BSLoad = DAG.getNode(PPCISD::LBRX, VTs, Ops, 4);

      // If this is an i16 load, insert the truncate.
      SDOperand ResVal = BSLoad;
      if (N->getValueType(0) == MVT::i16)
        ResVal = DAG.getNode(ISD::TRUNCATE, MVT::i16, BSLoad);

      // First, combine the bswap away.  This makes the value produced by the
      // load dead.
      DCI.CombineTo(N, ResVal);

      // Next, combine the load away, we give it a bogus result value but a real
      // chain result.  The result value is dead because the bswap is dead.
      DCI.CombineTo(Load.Val, ResVal, BSLoad.getValue(1));

      // Return N so it doesn't get rechecked!
      return SDOperand(N, 0);
    }

    break;
  case PPCISD::VCMP: {
    // If a VCMPo node already exists with exactly the same operands as this
    // node, use its result instead of this node (VCMPo computes both a CR6 and
    // a normal output).
    //
    if (!N->getOperand(0).hasOneUse() &&
        !N->getOperand(1).hasOneUse() &&
        !N->getOperand(2).hasOneUse()) {

      // Scan all of the users of the LHS, looking for VCMPo's that match.
      SDNode *VCMPoNode = 0;

      SDNode *LHSN = N->getOperand(0).Val;
      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
           UI != E; ++UI)
        if ((*UI)->getOpcode() == PPCISD::VCMPo &&
            (*UI)->getOperand(1) == N->getOperand(1) &&
            (*UI)->getOperand(2) == N->getOperand(2) &&
            (*UI)->getOperand(0) == N->getOperand(0)) {
          VCMPoNode = *UI;
          break;
        }

      // If there is no VCMPo node, or if the flag value has a single use, don't
      // transform this.
      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
        break;

      // Look at the (necessarily single) use of the flag value.  If it has a
      // chain, this transformation is more complex.  Note that multiple things
      // could use the value result, which we should ignore.
      SDNode *FlagUser = 0;
      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
           FlagUser == 0; ++UI) {
        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
        SDNode *User = *UI;
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          if (User->getOperand(i) == SDOperand(VCMPoNode, 1)) {
            FlagUser = User;
            break;
          }
        }
      }

      // If the user is a MFCR instruction, we know this is safe.  Otherwise we
      // give up for right now.
      if (FlagUser->getOpcode() == PPCISD::MFCR)
        return SDOperand(VCMPoNode, 0);
    }
    break;
  }
  case ISD::BR_CC: {
    // If this is a branch on an altivec predicate comparison, lower this so
    // that we don't have to do a MFCR: instead, branch directly on CR6.  This
    // lowering is done pre-legalize, because the legalizer lowers the predicate
    // compare down to code that is difficult to reassemble.
    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    SDOperand LHS = N->getOperand(2), RHS = N->getOperand(3);
    int CompareOpc;
    bool isDot;

    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
        getAltivecCompareInfo(LHS, CompareOpc, isDot)) {
      assert(isDot && "Can't compare against a vector result!");

      // If this is a comparison against something other than 0/1, then we know
      // that the condition is never/always true.
      unsigned Val = cast<ConstantSDNode>(RHS)->getValue();
      if (Val != 0 && Val != 1) {
        if (CC == ISD::SETEQ)      // Cond never true, remove branch.
          return N->getOperand(0);
        // Always !=, turn it into an unconditional branch.
        return DAG.getNode(ISD::BR, MVT::Other,
                           N->getOperand(0), N->getOperand(4));
      }

      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);

      // Create the PPCISD altivec 'dot' comparison node.
      std::vector<MVT::ValueType> VTs;
      SDOperand Ops[] = {
        LHS.getOperand(2),  // LHS of compare
        LHS.getOperand(3),  // RHS of compare
        DAG.getConstant(CompareOpc, MVT::i32)
      };
      VTs.push_back(LHS.getOperand(2).getValueType());
      VTs.push_back(MVT::Flag);
      SDOperand CompNode = DAG.getNode(PPCISD::VCMPo, VTs, Ops, 3);

      // Unpack the result based on how the target uses it.
      PPC::Predicate CompOpc;
      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getValue()) {
      default:  // Can't happen, don't crash on invalid number though.
      case 0:   // Branch on the value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
        break;
      case 1:   // Branch on the inverted value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
        break;
      case 2:   // Branch on the value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
        break;
      case 3:   // Branch on the inverted value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
        break;
      }

      return DAG.getNode(PPCISD::COND_BRANCH, MVT::Other, N->getOperand(0),
                         DAG.getConstant(CompOpc, MVT::i32),
                         DAG.getRegister(PPC::CR6, MVT::i32),
                         N->getOperand(4), CompNode.getValue(1));
    }
    break;
  }
  }

  return SDOperand();
}

//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//

void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
                                                       uint64_t Mask,
                                                       uint64_t &KnownZero,
                                                       uint64_t &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  KnownZero = 0;
  KnownOne = 0;
  switch (Op.getOpcode()) {
  default: break;
  case PPCISD::LBRX: {
    // lhbrx is known to have the top bits cleared out.
    if (cast<VTSDNode>(Op.getOperand(3))->getVT() == MVT::i16)
      KnownZero = 0xFFFF0000;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    // The predicate-compare intrinsics produce 0 or 1.
    switch (cast<ConstantSDNode>(Op.getOperand(0))->getValue()) {
    default: break;
    case Intrinsic::ppc_altivec_vcmpbfp_p:
    case Intrinsic::ppc_altivec_vcmpeqfp_p:
    case Intrinsic::ppc_altivec_vcmpequb_p:
    case Intrinsic::ppc_altivec_vcmpequh_p:
    case Intrinsic::ppc_altivec_vcmpequw_p:
    case Intrinsic::ppc_altivec_vcmpgefp_p:
    case Intrinsic::ppc_altivec_vcmpgtfp_p:
    case Intrinsic::ppc_altivec_vcmpgtsb_p:
    case Intrinsic::ppc_altivec_vcmpgtsh_p:
    case Intrinsic::ppc_altivec_vcmpgtsw_p:
    case Intrinsic::ppc_altivec_vcmpgtub_p:
    case Intrinsic::ppc_altivec_vcmpgtuh_p:
    case Intrinsic::ppc_altivec_vcmpgtuw_p:
      KnownZero = ~1U;  // All bits but the low one are known to be zero.
      break;
    }
  }
  }
}


/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
+PPCTargetLowering::ConstraintType +PPCTargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'b': + case 'r': + case 'f': + case 'v': + case 'y': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +std::pair<unsigned, const TargetRegisterClass*> +PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + MVT::ValueType VT) const { + if (Constraint.size() == 1) { + // GCC RS6000 Constraint Letters + switch (Constraint[0]) { + case 'b': // R1-R31 + case 'r': // R0-R31 + if (VT == MVT::i64 && PPCSubTarget.isPPC64()) + return std::make_pair(0U, PPC::G8RCRegisterClass); + return std::make_pair(0U, PPC::GPRCRegisterClass); + case 'f': + if (VT == MVT::f32) + return std::make_pair(0U, PPC::F4RCRegisterClass); + else if (VT == MVT::f64) + return std::make_pair(0U, PPC::F8RCRegisterClass); + break; + case 'v': + return std::make_pair(0U, PPC::VRRCRegisterClass); + case 'y': // crrc + return std::make_pair(0U, PPC::CRRCRegisterClass); + } + } + + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} + + +// isOperandValidForConstraint +SDOperand PPCTargetLowering:: +isOperandValidForConstraint(SDOperand Op, char Letter, SelectionDAG &DAG) { + switch (Letter) { + default: break; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'P': { + ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op); + if (!CST) return SDOperand(0, 0); // Must be an immediate to match. + unsigned Value = CST->getValue(); + switch (Letter) { + default: assert(0 && "Unknown constraint letter!"); + case 'I': // "I" is a signed 16-bit constant. + if ((short)Value == (int)Value) + return DAG.getTargetConstant(Value, Op.getValueType()); + break; + case 'J': // "J" is a constant with only the high-order 16 bits nonzero. + case 'L': // "L" is a signed 16-bit constant shifted left 16 bits. 
+ if ((short)Value == 0) + return DAG.getTargetConstant(Value, Op.getValueType()); + break; + case 'K': // "K" is a constant with only the low-order 16 bits nonzero. + if ((Value >> 16) == 0) + return DAG.getTargetConstant(Value, Op.getValueType()); + break; + case 'M': // "M" is a constant that is greater than 31. + if (Value > 31) + return DAG.getTargetConstant(Value, Op.getValueType()); + break; + case 'N': // "N" is a positive constant that is an exact power of two. + if ((int)Value > 0 && isPowerOf2_32(Value)) + return DAG.getTargetConstant(Value, Op.getValueType()); + break; + case 'O': // "O" is the constant zero. + if (Value == 0) + return DAG.getTargetConstant(Value, Op.getValueType()); + break; + case 'P': // "P" is a constant whose negation is a signed 16-bit constant. + if ((short)-Value == (int)-Value) + return DAG.getTargetConstant(Value, Op.getValueType()); + break; + } + break; + } + } + + // Handle standard constraint letters. + return TargetLowering::isOperandValidForConstraint(Op, Letter, DAG); +} + +// isLegalAddressingMode - Return true if the addressing mode represented +// by AM is legal for this target, for a load/store of the specified type. +bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM, + const Type *Ty) const { + // FIXME: PPC does not allow r+i addressing modes for vectors! + + // PPC allows a sign-extended 16-bit immediate field. + if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) + return false; + + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // PPC only support r+r, + switch (AM.Scale) { + case 0: // "r+i" or just "i", depending on HasBaseReg. + break; + case 1: + if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. + return false; + // Otherwise we have r+r or r+i. + break; + case 2: + if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. + return false; + // Allow 2*r as r+r. + break; + default: + // No other scales are supported. 
+ return false; + } + + return true; +} + +/// isLegalAddressImmediate - Return true if the integer value can be used +/// as the offset of the target addressing mode for load / store of the +/// given type. +bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,const Type *Ty) const{ + // PPC allows a sign-extended 16-bit immediate field. + return (V > -(1 << 16) && V < (1 << 16)-1); +} + +bool PPCTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const { + return false; +} + +SDOperand PPCTargetLowering::LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG) +{ + // Depths > 0 not supported yet! + if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0) + return SDOperand(); + + MVT::ValueType PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + bool isPPC64 = PtrVT == MVT::i64; + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool is31 = (NoFramePointerElim || MFI->hasVarSizedObjects()) + && MFI->getStackSize(); + + if (isPPC64) + return DAG.getCopyFromReg(DAG.getEntryNode(), is31 ? PPC::X31 : PPC::X1, + MVT::i32); + else + return DAG.getCopyFromReg(DAG.getEntryNode(), is31 ? PPC::R31 : PPC::R1, + MVT::i32); +} diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h new file mode 100644 index 0000000..0581865 --- /dev/null +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -0,0 +1,263 @@ +//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that PPC uses to lower LLVM code into a +// selection DAG. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H +#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "PPC.h" +#include "PPCSubtarget.h" + +namespace llvm { + namespace PPCISD { + enum NodeType { + // Start the numbering where the builtin ops and target ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END+PPC::INSTRUCTION_LIST_END, + + /// FSEL - Traditional three-operand fsel node. + /// + FSEL, + + /// FCFID - The FCFID instruction, taking an f64 operand and producing + /// and f64 value containing the FP representation of the integer that + /// was temporarily in the f64 operand. + FCFID, + + /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 + /// operand, producing an f64 value containing the integer representation + /// of that FP value. + FCTIDZ, FCTIWZ, + + /// STFIWX - The STFIWX instruction. The first operand is an input token + /// chain, then an f64 value to store, then an address to store it to, + /// then a SRCVALUE for the address. + STFIWX, + + // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking + // three v4f32 operands and producing a v4f32 result. + VMADDFP, VNMSUBFP, + + /// VPERM - The PPC VPERM Instruction. + /// + VPERM, + + /// Hi/Lo - These represent the high and low 16-bit parts of a global + /// address respectively. These nodes have two operands, the first of + /// which must be a TargetGlobalAddress, and the second of which must be a + /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', + /// though these are usually folded into other nodes. + Hi, Lo, + + /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an allocation on the stack. 
+ DYNALLOC, + + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// These nodes represent the 32-bit PPC shifts that operate on 6-bit + /// shift amounts. These nodes are generated by the multi-precision shift + /// code. + SRL, SRA, SHL, + + /// EXTSW_32 - This is the EXTSW instruction for use with "32-bit" + /// registers. + EXTSW_32, + + /// STD_32 - This is the STD instruction for use with "32-bit" registers. + STD_32, + + /// CALL - A direct function call. + CALL_Macho, CALL_ELF, + + /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a + /// MTCTR instruction. + MTCTR, + + /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a + /// BCTRL instruction. + BCTRL_Macho, BCTRL_ELF, + + /// Return with a flag operand, matched by 'blr' + RET_FLAG, + + /// R32 = MFCR(CRREG, INFLAG) - Represents the MFCR/MFOCRF instructions. + /// This copies the bits corresponding to the specified CRREG into the + /// resultant GPR. Bits corresponding to other CR regs are undefined. + MFCR, + + /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* + /// instructions. For lack of better number, we use the opcode number + /// encoding for the OPC field to identify the compare. For example, 838 + /// is VCMPGTSH. + VCMP, + + /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the + /// altivec VCMP*o instructions. For lack of better number, we use the + /// opcode number encoding for the OPC field to identify the compare. For + /// example, 838 is VCMPGTSH. + VCMPo, + + /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This + /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the + /// condition register to branch on, OPC is the branch opcode to use (e.g. + /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is + /// an optional input flag argument. 
+ COND_BRANCH, + + /// CHAIN = STBRX CHAIN, GPRC, Ptr, SRCVALUE, Type - This is a + /// byte-swapping store instruction. It byte-swaps the low "Type" bits of + /// the GPRC input, then stores it through Ptr. Type can be either i16 or + /// i32. + STBRX, + + /// GPRC, CHAIN = LBRX CHAIN, Ptr, SRCVALUE, Type - This is a + /// byte-swapping load instruction. It loads "Type" bits, byte swaps it, + /// then puts it in the bottom bits of the GPRC. TYPE can be either i16 + /// or i32. + LBRX + }; + } + + /// Define some predicates that are used for node matching. + namespace PPC { + /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a + /// VPKUHUM instruction. + bool isVPKUHUMShuffleMask(SDNode *N, bool isUnary); + + /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a + /// VPKUWUM instruction. + bool isVPKUWUMShuffleMask(SDNode *N, bool isUnary); + + /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for + /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes). + bool isVMRGLShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary); + + /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for + /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes). + bool isVMRGHShuffleMask(SDNode *N, unsigned UnitSize, bool isUnary); + + /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift + /// amount, otherwise return -1. + int isVSLDOIShuffleMask(SDNode *N, bool isUnary); + + /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a splat of a single element that is suitable for input to + /// VSPLTB/VSPLTH/VSPLTW. + bool isSplatShuffleMask(SDNode *N, unsigned EltSize); + + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the + /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. 
+ unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize); + + /// get_VSPLTI_elt - If this is a build_vector of constants which can be + /// formed by using a vspltis[bhw] instruction of the specified element + /// size, return the constant being splatted. The ByteSize field indicates + /// the number of bytes of each element [124] -> [bhw]. + SDOperand get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); + } + + class PPCTargetLowering : public TargetLowering { + int VarArgsFrameIndex; // FrameIndex for start of varargs area. + int VarArgsStackOffset; // StackOffset for start of stack + // arguments. + unsigned VarArgsNumGPR; // Index of the first unused integer + // register for parameter passing. + unsigned VarArgsNumFPR; // Index of the first unused double + // register for parameter passing. + int ReturnAddrIndex; // FrameIndex for return slot. + const PPCSubtarget &PPCSubTarget; + public: + PPCTargetLowering(PPCTargetMachine &TM); + + /// getTargetNodeName() - This method returns the name of a target specific + /// DAG node. + virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// getPreIndexedAddressParts - returns true by value, base pointer and + /// offset pointer and addressing mode by reference if the node's address + /// can be legally represented as pre-indexed load / store address. + virtual bool getPreIndexedAddressParts(SDNode *N, SDOperand &Base, + SDOperand &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG); + + /// SelectAddressRegReg - Given the specified addressed, check to see if it + /// can be represented as an indexed [r+r] operation. Returns false if it + /// can be more efficiently represented with [r+imm]. + bool SelectAddressRegReg(SDOperand N, SDOperand &Base, SDOperand &Index, + SelectionDAG &DAG); + + /// SelectAddressRegImm - Returns true if the address N can be represented + /// by a base register plus a signed 16-bit displacement [r+imm], and if it + /// is not better represented as reg+reg. 
+ bool SelectAddressRegImm(SDOperand N, SDOperand &Disp, SDOperand &Base, + SelectionDAG &DAG); + + /// SelectAddressRegRegOnly - Given the specified addressed, force it to be + /// represented as an indexed [r+r] operation. + bool SelectAddressRegRegOnly(SDOperand N, SDOperand &Base, SDOperand &Index, + SelectionDAG &DAG); + + /// SelectAddressRegImmShift - Returns true if the address N can be + /// represented by a base register plus a signed 14-bit displacement + /// [r+imm*4]. Suitable for use by STD and friends. + bool SelectAddressRegImmShift(SDOperand N, SDOperand &Disp, SDOperand &Base, + SelectionDAG &DAG); + + + /// LowerOperation - Provide custom lowering hooks for some operations. + /// + virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG); + + virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + virtual void computeMaskedBitsForTargetNode(const SDOperand Op, + uint64_t Mask, + uint64_t &KnownZero, + uint64_t &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *MBB); + + ConstraintType getConstraintType(const std::string &Constraint) const; + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT::ValueType VT) const; + SDOperand isOperandValidForConstraint(SDOperand Op, char ConstraintLetter, + SelectionDAG &DAG); + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const; + + /// isLegalAddressImmediate - Return true if the integer value can be used + /// as the offset of the target addressing mode for load / store of the + /// given type. 
+ virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const; + + /// isLegalAddressImmediate - Return true if the GlobalValue can be used as + /// the offset of the target addressing mode. + virtual bool isLegalAddressImmediate(GlobalValue *GV) const; + + SDOperand LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG); + }; +} + +#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td new file mode 100644 index 0000000..a7e25cf --- /dev/null +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -0,0 +1,590 @@ +//===- PPCInstr64Bit.td - The PowerPC 64-bit Support -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PowerPC 64-bit instructions. These patterns are used +// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// 64-bit operands. +// +def s16imm64 : Operand<i64> { + let PrintMethod = "printS16ImmOperand"; +} +def u16imm64 : Operand<i64> { + let PrintMethod = "printU16ImmOperand"; +} +def symbolHi64 : Operand<i64> { + let PrintMethod = "printSymbolHi"; +} +def symbolLo64 : Operand<i64> { + let PrintMethod = "printSymbolLo"; +} + +//===----------------------------------------------------------------------===// +// 64-bit transformation functions. +// + +def SHL64 : SDNodeXForm<imm, [{ + // Transformation function: 63 - imm + return getI32Imm(63 - N->getValue()); +}]>; + +def SRL64 : SDNodeXForm<imm, [{ + // Transformation function: 64 - imm + return N->getValue() ? 
getI32Imm(64 - N->getValue()) : getI32Imm(0); +}]>; + +def HI32_48 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + return getI32Imm((unsigned short)(N->getValue() >> 32)); +}]>; + +def HI48_64 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + return getI32Imm((unsigned short)(N->getValue() >> 48)); +}]>; + + +//===----------------------------------------------------------------------===// +// Pseudo instructions. +// + +def IMPLICIT_DEF_G8RC : Pseudo<(ops G8RC:$rD), "; IMPLICIT_DEF_G8RC $rD", + [(set G8RC:$rD, (undef))]>; + + +//===----------------------------------------------------------------------===// +// Calls. +// + +let Defs = [LR8] in + def MovePCtoLR8 : Pseudo<(ops piclabel:$label), "bl $label", []>, + PPC970_Unit_BRU; + +// Macho ABI Calls. +let isCall = 1, noResults = 1, PPC970_Unit = 7, + // All calls clobber the PPC64 non-callee saved registers. + Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12, + F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13, + V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, + LR8,CTR8, + CR0,CR1,CR5,CR6,CR7] in { + // Convenient aliases for call instructions + def BL8_Macho : IForm<18, 0, 1, + (ops calltarget:$func, variable_ops), + "bl $func", BrB, []>; // See Pat patterns below. + + def BLA8_Macho : IForm<18, 1, 1, + (ops aaddr:$func, variable_ops), + "bla $func", BrB, [(PPCcall_Macho (i64 imm:$func))]>; +} + +// ELF 64 ABI Calls = Macho ABI Calls +// Used to define BL8_ELF and BLA8_ELF +let isCall = 1, noResults = 1, PPC970_Unit = 7, + // All calls clobber the PPC64 non-callee saved registers. 
+ Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12, + F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13, + V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, + LR8,CTR8, + CR0,CR1,CR5,CR6,CR7] in { + // Convenient aliases for call instructions + def BL8_ELF : IForm<18, 0, 1, + (ops calltarget:$func, variable_ops), + "bl $func", BrB, []>; // See Pat patterns below. + + def BLA8_ELF : IForm<18, 1, 1, + (ops aaddr:$func, variable_ops), + "bla $func", BrB, [(PPCcall_ELF (i64 imm:$func))]>; +} + + +// Calls +def : Pat<(PPCcall_Macho (i64 tglobaladdr:$dst)), + (BL8_Macho tglobaladdr:$dst)>; +def : Pat<(PPCcall_Macho (i64 texternalsym:$dst)), + (BL8_Macho texternalsym:$dst)>; + +def : Pat<(PPCcall_ELF (i64 tglobaladdr:$dst)), + (BL8_ELF tglobaladdr:$dst)>; +def : Pat<(PPCcall_ELF (i64 texternalsym:$dst)), + (BL8_ELF texternalsym:$dst)>; + +//===----------------------------------------------------------------------===// +// 64-bit SPR manipulation instrs. + +def MFCTR8 : XFXForm_1_ext<31, 339, 9, (ops G8RC:$rT), "mfctr $rT", SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +let Pattern = [(PPCmtctr G8RC:$rS)] in { +def MTCTR8 : XFXForm_7_ext<31, 467, 9, (ops G8RC:$rS), "mtctr $rS", SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +def DYNALLOC8 : Pseudo<(ops G8RC:$result, G8RC:$negsize, memri:$fpsi), + "${:comment} DYNALLOC8 $result, $negsize, $fpsi", + [(set G8RC:$result, + (PPCdynalloc G8RC:$negsize, iaddr:$fpsi))]>, + Imp<[X1],[X1]>; + +def MTLR8 : XFXForm_7_ext<31, 467, 8, (ops G8RC:$rS), "mtlr $rS", SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +def MFLR8 : XFXForm_1_ext<31, 339, 8, (ops G8RC:$rT), "mflr $rT", SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; + + +//===----------------------------------------------------------------------===// +// Fixed point instructions. +// + +let PPC970_Unit = 1 in { // FXU Operations. + +// Copies, extends, truncates. 
+def OR4To8 : XForm_6<31, 444, (ops G8RC:$rA, GPRC:$rS, GPRC:$rB), + "or $rA, $rS, $rB", IntGeneral, + []>; +def OR8To4 : XForm_6<31, 444, (ops GPRC:$rA, G8RC:$rS, G8RC:$rB), + "or $rA, $rS, $rB", IntGeneral, + []>; + +def LI8 : DForm_2_r0<14, (ops G8RC:$rD, symbolLo64:$imm), + "li $rD, $imm", IntGeneral, + [(set G8RC:$rD, immSExt16:$imm)]>; +def LIS8 : DForm_2_r0<15, (ops G8RC:$rD, symbolHi64:$imm), + "lis $rD, $imm", IntGeneral, + [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>; + +// Logical ops. +def NAND8: XForm_6<31, 476, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "nand $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (not (and G8RC:$rS, G8RC:$rB)))]>; +def AND8 : XForm_6<31, 28, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "and $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (and G8RC:$rS, G8RC:$rB))]>; +def ANDC8: XForm_6<31, 60, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "andc $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (and G8RC:$rS, (not G8RC:$rB)))]>; +def OR8 : XForm_6<31, 444, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "or $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (or G8RC:$rS, G8RC:$rB))]>; +def NOR8 : XForm_6<31, 124, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "nor $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (not (or G8RC:$rS, G8RC:$rB)))]>; +def ORC8 : XForm_6<31, 412, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "orc $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (or G8RC:$rS, (not G8RC:$rB)))]>; +def EQV8 : XForm_6<31, 284, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "eqv $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (not (xor G8RC:$rS, G8RC:$rB)))]>; +def XOR8 : XForm_6<31, 316, (ops G8RC:$rA, G8RC:$rS, G8RC:$rB), + "xor $rA, $rS, $rB", IntGeneral, + [(set G8RC:$rA, (xor G8RC:$rS, G8RC:$rB))]>; + +// Logical ops with immediate. +def ANDIo8 : DForm_4<28, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2), + "andi. $dst, $src1, $src2", IntGeneral, + [(set G8RC:$dst, (and G8RC:$src1, immZExt16:$src2))]>, + isDOT; +def ANDISo8 : DForm_4<29, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2), + "andis. 
$dst, $src1, $src2", IntGeneral, + [(set G8RC:$dst, (and G8RC:$src1,imm16ShiftedZExt:$src2))]>, + isDOT; +def ORI8 : DForm_4<24, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2), + "ori $dst, $src1, $src2", IntGeneral, + [(set G8RC:$dst, (or G8RC:$src1, immZExt16:$src2))]>; +def ORIS8 : DForm_4<25, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2), + "oris $dst, $src1, $src2", IntGeneral, + [(set G8RC:$dst, (or G8RC:$src1, imm16ShiftedZExt:$src2))]>; +def XORI8 : DForm_4<26, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2), + "xori $dst, $src1, $src2", IntGeneral, + [(set G8RC:$dst, (xor G8RC:$src1, immZExt16:$src2))]>; +def XORIS8 : DForm_4<27, (ops G8RC:$dst, G8RC:$src1, u16imm:$src2), + "xoris $dst, $src1, $src2", IntGeneral, + [(set G8RC:$dst, (xor G8RC:$src1, imm16ShiftedZExt:$src2))]>; + +def ADD8 : XOForm_1<31, 266, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "add $rT, $rA, $rB", IntGeneral, + [(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>; + +def ADDC8 : XOForm_1<31, 10, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "addc $rT, $rA, $rB", IntGeneral, + [(set G8RC:$rT, (addc G8RC:$rA, G8RC:$rB))]>, + PPC970_DGroup_Cracked; +def ADDE8 : XOForm_1<31, 138, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "adde $rT, $rA, $rB", IntGeneral, + [(set G8RC:$rT, (adde G8RC:$rA, G8RC:$rB))]>; + +def ADDI8 : DForm_2<14, (ops G8RC:$rD, G8RC:$rA, s16imm64:$imm), + "addi $rD, $rA, $imm", IntGeneral, + [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>; +def ADDIS8 : DForm_2<15, (ops G8RC:$rD, G8RC:$rA, symbolHi64:$imm), + "addis $rD, $rA, $imm", IntGeneral, + [(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>; + +def SUBFIC8: DForm_2< 8, (ops G8RC:$rD, G8RC:$rA, s16imm64:$imm), + "subfic $rD, $rA, $imm", IntGeneral, + [(set G8RC:$rD, (subc immSExt16:$imm, G8RC:$rA))]>; +def SUBF8 : XOForm_1<31, 40, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "subf $rT, $rA, $rB", IntGeneral, + [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>; + +def SUBFC8 : XOForm_1<31, 8, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "subfc $rT, $rA, 
$rB", IntGeneral, + [(set G8RC:$rT, (subc G8RC:$rB, G8RC:$rA))]>, + PPC970_DGroup_Cracked; + +def SUBFE8 : XOForm_1<31, 136, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "subfe $rT, $rA, $rB", IntGeneral, + [(set G8RC:$rT, (sube G8RC:$rB, G8RC:$rA))]>; +def ADDME8 : XOForm_3<31, 234, 0, (ops G8RC:$rT, G8RC:$rA), + "addme $rT, $rA", IntGeneral, + [(set G8RC:$rT, (adde G8RC:$rA, immAllOnes))]>; +def ADDZE8 : XOForm_3<31, 202, 0, (ops G8RC:$rT, G8RC:$rA), + "addze $rT, $rA", IntGeneral, + [(set G8RC:$rT, (adde G8RC:$rA, 0))]>; +def NEG8 : XOForm_3<31, 104, 0, (ops G8RC:$rT, G8RC:$rA), + "neg $rT, $rA", IntGeneral, + [(set G8RC:$rT, (ineg G8RC:$rA))]>; +def SUBFME8 : XOForm_3<31, 232, 0, (ops G8RC:$rT, G8RC:$rA), + "subfme $rT, $rA", IntGeneral, + [(set G8RC:$rT, (sube immAllOnes, G8RC:$rA))]>; +def SUBFZE8 : XOForm_3<31, 200, 0, (ops G8RC:$rT, G8RC:$rA), + "subfze $rT, $rA", IntGeneral, + [(set G8RC:$rT, (sube 0, G8RC:$rA))]>; + + + +def MULHD : XOForm_1<31, 73, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "mulhd $rT, $rA, $rB", IntMulHW, + [(set G8RC:$rT, (mulhs G8RC:$rA, G8RC:$rB))]>; +def MULHDU : XOForm_1<31, 9, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "mulhdu $rT, $rA, $rB", IntMulHWU, + [(set G8RC:$rT, (mulhu G8RC:$rA, G8RC:$rB))]>; + +def CMPD : XForm_16_ext<31, 0, (ops CRRC:$crD, G8RC:$rA, G8RC:$rB), + "cmpd $crD, $rA, $rB", IntCompare>, isPPC64; +def CMPLD : XForm_16_ext<31, 32, (ops CRRC:$crD, G8RC:$rA, G8RC:$rB), + "cmpld $crD, $rA, $rB", IntCompare>, isPPC64; +def CMPDI : DForm_5_ext<11, (ops CRRC:$crD, G8RC:$rA, s16imm:$imm), + "cmpdi $crD, $rA, $imm", IntCompare>, isPPC64; +def CMPLDI : DForm_6_ext<10, (ops CRRC:$dst, G8RC:$src1, u16imm:$src2), + "cmpldi $dst, $src1, $src2", IntCompare>, isPPC64; + +def SLD : XForm_6<31, 27, (ops G8RC:$rA, G8RC:$rS, GPRC:$rB), + "sld $rA, $rS, $rB", IntRotateD, + [(set G8RC:$rA, (shl G8RC:$rS, GPRC:$rB))]>, isPPC64; +def SRD : XForm_6<31, 539, (ops G8RC:$rA, G8RC:$rS, GPRC:$rB), + "srd $rA, $rS, $rB", IntRotateD, + [(set 
G8RC:$rA, (srl G8RC:$rS, GPRC:$rB))]>, isPPC64; +def SRAD : XForm_6<31, 794, (ops G8RC:$rA, G8RC:$rS, GPRC:$rB), + "srad $rA, $rS, $rB", IntRotateD, + [(set G8RC:$rA, (sra G8RC:$rS, GPRC:$rB))]>, isPPC64; + +def EXTSB8 : XForm_11<31, 954, (ops G8RC:$rA, G8RC:$rS), + "extsb $rA, $rS", IntGeneral, + [(set G8RC:$rA, (sext_inreg G8RC:$rS, i8))]>; +def EXTSH8 : XForm_11<31, 922, (ops G8RC:$rA, G8RC:$rS), + "extsh $rA, $rS", IntGeneral, + [(set G8RC:$rA, (sext_inreg G8RC:$rS, i16))]>; + +def EXTSW : XForm_11<31, 986, (ops G8RC:$rA, G8RC:$rS), + "extsw $rA, $rS", IntGeneral, + [(set G8RC:$rA, (sext_inreg G8RC:$rS, i32))]>, isPPC64; +/// EXTSW_32 - Just like EXTSW, but works on '32-bit' registers. +def EXTSW_32 : XForm_11<31, 986, (ops GPRC:$rA, GPRC:$rS), + "extsw $rA, $rS", IntGeneral, + [(set GPRC:$rA, (PPCextsw_32 GPRC:$rS))]>, isPPC64; +def EXTSW_32_64 : XForm_11<31, 986, (ops G8RC:$rA, GPRC:$rS), + "extsw $rA, $rS", IntGeneral, + [(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64; + +def SRADI : XSForm_1<31, 413, (ops G8RC:$rA, G8RC:$rS, u6imm:$SH), + "sradi $rA, $rS, $SH", IntRotateD, + [(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64; +def CNTLZD : XForm_11<31, 58, (ops G8RC:$rA, G8RC:$rS), + "cntlzd $rA, $rS", IntGeneral, + [(set G8RC:$rA, (ctlz G8RC:$rS))]>; + +def DIVD : XOForm_1<31, 489, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "divd $rT, $rA, $rB", IntDivD, + [(set G8RC:$rT, (sdiv G8RC:$rA, G8RC:$rB))]>, isPPC64, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +def DIVDU : XOForm_1<31, 457, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "divdu $rT, $rA, $rB", IntDivD, + [(set G8RC:$rT, (udiv G8RC:$rA, G8RC:$rB))]>, isPPC64, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +def MULLD : XOForm_1<31, 233, 0, (ops G8RC:$rT, G8RC:$rA, G8RC:$rB), + "mulld $rT, $rA, $rB", IntMulHD, + [(set G8RC:$rT, (mul G8RC:$rA, G8RC:$rB))]>, isPPC64; + + +let isCommutable = 1 in { +def RLDIMI : MDForm_1<30, 3, + (ops G8RC:$rA, G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB), + "rldimi $rA, $rS, 
$SH, $MB", IntRotateD, + []>, isPPC64, RegConstraint<"$rSi = $rA">, + NoEncode<"$rSi">; +} + +// Rotate instructions. +def RLDICL : MDForm_1<30, 0, + (ops G8RC:$rA, G8RC:$rS, u6imm:$SH, u6imm:$MB), + "rldicl $rA, $rS, $SH, $MB", IntRotateD, + []>, isPPC64; +def RLDICR : MDForm_1<30, 1, + (ops G8RC:$rA, G8RC:$rS, u6imm:$SH, u6imm:$ME), + "rldicr $rA, $rS, $SH, $ME", IntRotateD, + []>, isPPC64; +} // End FXU Operations. + + +//===----------------------------------------------------------------------===// +// Load/Store instructions. +// + + +// Sign extending loads. +let isLoad = 1, PPC970_Unit = 2 in { +def LHA8: DForm_1<42, (ops G8RC:$rD, memri:$src), + "lha $rD, $src", LdStLHA, + [(set G8RC:$rD, (sextloadi16 iaddr:$src))]>, + PPC970_DGroup_Cracked; +def LWA : DSForm_1<58, 2, (ops G8RC:$rD, memrix:$src), + "lwa $rD, $src", LdStLWA, + [(set G8RC:$rD, (sextloadi32 ixaddr:$src))]>, isPPC64, + PPC970_DGroup_Cracked; +def LHAX8: XForm_1<31, 343, (ops G8RC:$rD, memrr:$src), + "lhax $rD, $src", LdStLHA, + [(set G8RC:$rD, (sextloadi16 xaddr:$src))]>, + PPC970_DGroup_Cracked; +def LWAX : XForm_1<31, 341, (ops G8RC:$rD, memrr:$src), + "lwax $rD, $src", LdStLHA, + [(set G8RC:$rD, (sextloadi32 xaddr:$src))]>, isPPC64, + PPC970_DGroup_Cracked; + +// Update forms. +def LHAU8 : DForm_1<43, (ops G8RC:$rD, ptr_rc:$ea_result, symbolLo:$disp, + ptr_rc:$rA), + "lhau $rD, $disp($rA)", LdStGeneral, + []>, RegConstraint<"$rA = $ea_result">, + NoEncode<"$ea_result">; +// NO LWAU! + +} + +// Zero extending loads. 
+let isLoad = 1, PPC970_Unit = 2 in {
+// D-form (base register + signed 16-bit displacement) zero-extending
+// byte/half/word loads into 64-bit registers.
+def LBZ8 : DForm_1<34, (ops G8RC:$rD, memri:$src),
+                  "lbz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (ops G8RC:$rD, memri:$src),
+                  "lhz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (ops G8RC:$rD, memri:$src),
+                  "lwz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+// X-form (base register + index register) variants of the same
+// zero-extending loads.
+def LBZX8 : XForm_1<31, 87, (ops G8RC:$rD, memrr:$src),
+                   "lbzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (ops G8RC:$rD, memrr:$src),
+                   "lhzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31, 23, (ops G8RC:$rD, memrr:$src),
+                   "lwzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi32 xaddr:$src))]>;
+
+
+// Update forms: the effective address is also written back through the
+// tied $ea_result operand.  No ISel pattern here -- presumably matched by
+// the pre-indexed load selection logic (TODO confirm).
+def LBZU8 : DForm_1<35, (ops G8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lbzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (ops G8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lhzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (ops G8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lwzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+
+
+// Full 8-byte loads.
+let isLoad = 1, PPC970_Unit = 2 in {
+// Full 8-byte loads: DS-form LD (displacement must be 4-byte aligned),
+// X-form LDX (reg+reg), and the update form LDU which also writes the
+// effective address back through the tied $ea_result operand.
+def LD   : DSForm_1<58, 0, (ops G8RC:$rD, memrix:$src),
+                    "ld $rD, $src", LdStLD,
+                    [(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
+def LDX  : XForm_1<31, 21, (ops G8RC:$rD, memrr:$src),
+                   "ldx $rD, $src", LdStLD,
+                   [(set G8RC:$rD, (load xaddr:$src))]>, isPPC64;
+
+def LDU  : DSForm_1<58, 1, (ops G8RC:$rD, ptr_rc:$ea_result, memrix:$addr),
+                    "ldu $rD, $addr", LdStLD,
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+                    NoEncode<"$ea_result">;
+
+}
+
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+// Truncating stores: store the low 8/16/32 bits of a 64-bit register.
+def STB8 : DForm_1<38, (ops G8RC:$rS, memri:$src),
+                   "stb $rS, $src", LdStGeneral,
+                   [(truncstorei8 G8RC:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (ops G8RC:$rS, memri:$src),
+                   "sth $rS, $src", LdStGeneral,
+                   [(truncstorei16 G8RC:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (ops G8RC:$rS, memri:$src),
+                   "stw $rS, $src", LdStGeneral,
+                   [(truncstorei32 G8RC:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (ops G8RC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStGeneral,
+                   [(truncstorei8 G8RC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (ops G8RC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStGeneral,
+                   [(truncstorei16 G8RC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (ops G8RC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStGeneral,
+                   [(truncstorei32 G8RC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+// Normal 8-byte stores.
+def STD  : DSForm_1<62, 0, (ops G8RC:$rS, memrix:$dst),
+                    "std $rS, $dst", LdStSTD,
+                    [(store G8RC:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX : XForm_8<31, 149, (ops G8RC:$rS, memrr:$dst),
+                    "stdx $rS, $dst", LdStSTD,
+                    [(store G8RC:$rS, xaddr:$dst)]>, isPPC64,
+                    PPC970_DGroup_Cracked;
+}
+
+let isStore = 1, PPC970_Unit = 2 in {
+// Pre-increment stores: store and write the updated base address back
+// through the tied $ea_res operand.
+// Primary opcode fixed to 39 ("stbu"); opcode 38 is plain "stb" and would
+// have encoded a store with no base-register update.
+def STBU8 : DForm_1<39, (ops ptr_rc:$ea_res, G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg,
+                                         iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (ops ptr_rc:$ea_res, G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg,
+                                          iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (ops ptr_rc:$ea_res, G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg,
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+
+
+def STDU : DSForm_1<62, 1, (ops ptr_rc:$ea_res, G8RC:$rS,
+                                s16immX4:$ptroff, ptr_rc:$ptrreg),
+                    "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+                    [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg,
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    isPPC64;
+
+}
+
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+
+// Indexed pre-increment store; no ISel pattern -- presumably selected
+// manually (TODO confirm).
+def STDUX : XForm_8<31, 181, (ops G8RC:$rS, memrr:$dst),
+                    "stdux $rS, $dst", LdStSTD,
+                    []>, isPPC64;
+
+
+// STD_32/STDX_32 - Just like STD/STDX, but uses a '32-bit' input register.
+def STD_32  : DSForm_1<62, 0, (ops GPRC:$rT, memrix:$dst),
+                       "std $rT, $dst", LdStSTD,
+                       [(PPCstd_32 GPRC:$rT, ixaddr:$dst)]>, isPPC64;
+def STDX_32 : XForm_8<31, 149, (ops GPRC:$rT, memrr:$dst),
+                      "stdx $rT, $dst", LdStSTD,
+                      [(PPCstd_32 GPRC:$rT, xaddr:$dst)]>, isPPC64,
+                      PPC970_DGroup_Cracked;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+let PPC970_Unit = 3 in {  // FPU Operations.
+def FCFID  : XForm_26<63, 846, (ops F8RC:$frD, F8RC:$frB),
+                      "fcfid $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (PPCfcfid F8RC:$frB))]>, isPPC64;
+def FCTIDZ : XForm_26<63, 815, (ops F8RC:$frD, F8RC:$frB),
+                      "fctidz $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (PPCfctidz F8RC:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
+def : Pat<(i64 (zext GPRC:$in)),
+          (RLDICL (OR4To8 GPRC:$in, GPRC:$in), 0, 32)>;
+def : Pat<(i64 (anyext GPRC:$in)),
+          (OR4To8 GPRC:$in, GPRC:$in)>;
+def : Pat<(i32 (trunc G8RC:$in)),
+          (OR8To4 G8RC:$in, G8RC:$in)>;
+
+// Extending loads with i64 targets.
+def : Pat<(zextloadi1 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(zextloadi1 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi1 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(extloadi1 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi8 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(extloadi8 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi16 iaddr:$src), + (LHZ8 iaddr:$src)>; +def : Pat<(extloadi16 xaddr:$src), + (LHZX8 xaddr:$src)>; +def : Pat<(extloadi32 iaddr:$src), + (LWZ8 iaddr:$src)>; +def : Pat<(extloadi32 xaddr:$src), + (LWZX8 xaddr:$src)>; + +// SHL/SRL +def : Pat<(shl G8RC:$in, (i32 imm:$imm)), + (RLDICR G8RC:$in, imm:$imm, (SHL64 imm:$imm))>; +def : Pat<(srl G8RC:$in, (i32 imm:$imm)), + (RLDICL G8RC:$in, (SRL64 imm:$imm), imm:$imm)>; + +// Hi and Lo for Darwin Global Addresses. +def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>; +def : Pat<(PPClo tglobaladdr:$in, 0), (LI8 tglobaladdr:$in)>; +def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>; +def : Pat<(PPClo tconstpool:$in , 0), (LI8 tconstpool:$in)>; +def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>; +def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>; +def : Pat<(add G8RC:$in, (PPChi tglobaladdr:$g, 0)), + (ADDIS8 G8RC:$in, tglobaladdr:$g)>; +def : Pat<(add G8RC:$in, (PPChi tconstpool:$g, 0)), + (ADDIS8 G8RC:$in, tconstpool:$g)>; +def : Pat<(add G8RC:$in, (PPChi tjumptable:$g, 0)), + (ADDIS8 G8RC:$in, tjumptable:$g)>; diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td new file mode 100644 index 0000000..8a2f255 --- /dev/null +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -0,0 +1,622 @@ +//===- PPCInstrAltivec.td - The PowerPC Altivec Extension --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes the Altivec extension to the PowerPC instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Altivec transformation functions and pattern fragments. +// + +/// VPKUHUM_shuffle_mask/VPKUWUM_shuffle_mask - Return true if this is a valid +/// shuffle mask for the VPKUHUM or VPKUWUM instructions. +def VPKUHUM_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVPKUHUMShuffleMask(N, false); +}]>; +def VPKUWUM_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVPKUWUMShuffleMask(N, false); +}]>; + +def VPKUHUM_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVPKUHUMShuffleMask(N, true); +}]>; +def VPKUWUM_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVPKUWUMShuffleMask(N, true); +}]>; + + +def VMRGLB_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGLShuffleMask(N, 1, false); +}]>; +def VMRGLH_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGLShuffleMask(N, 2, false); +}]>; +def VMRGLW_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGLShuffleMask(N, 4, false); +}]>; +def VMRGHB_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGHShuffleMask(N, 1, false); +}]>; +def VMRGHH_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGHShuffleMask(N, 2, false); +}]>; +def VMRGHW_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGHShuffleMask(N, 4, false); +}]>; + +def VMRGLB_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGLShuffleMask(N, 1, true); +}]>; +def VMRGLH_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGLShuffleMask(N, 2, true); +}]>; +def VMRGLW_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGLShuffleMask(N, 4, true); +}]>; +def VMRGHB_unary_shuffle_mask : 
PatLeaf<(build_vector), [{ + return PPC::isVMRGHShuffleMask(N, 1, true); +}]>; +def VMRGHH_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGHShuffleMask(N, 2, true); +}]>; +def VMRGHW_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVMRGHShuffleMask(N, 4, true); +}]>; + + +def VSLDOI_get_imm : SDNodeXForm<build_vector, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, false)); +}]>; +def VSLDOI_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVSLDOIShuffleMask(N, false) != -1; +}], VSLDOI_get_imm>; + +/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into +/// vector_shuffle(X,undef,mask) by the dag combiner. +def VSLDOI_unary_get_imm : SDNodeXForm<build_vector, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, true)); +}]>; +def VSLDOI_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isVSLDOIShuffleMask(N, true) != -1; +}], VSLDOI_unary_get_imm>; + + +// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. +def VSPLTB_get_imm : SDNodeXForm<build_vector, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 1)); +}]>; +def VSPLTB_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isSplatShuffleMask(N, 1); +}], VSPLTB_get_imm>; +def VSPLTH_get_imm : SDNodeXForm<build_vector, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 2)); +}]>; +def VSPLTH_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isSplatShuffleMask(N, 2); +}], VSPLTH_get_imm>; +def VSPLTW_get_imm : SDNodeXForm<build_vector, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 4)); +}]>; +def VSPLTW_shuffle_mask : PatLeaf<(build_vector), [{ + return PPC::isSplatShuffleMask(N, 4); +}], VSPLTW_get_imm>; + + +// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm. 
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 1, *CurDAG); +}]>; +def vecspltisb : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 1, *CurDAG).Val != 0; +}], VSPLTISB_get_imm>; + +// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm. +def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 2, *CurDAG); +}]>; +def vecspltish : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 2, *CurDAG).Val != 0; +}], VSPLTISH_get_imm>; + +// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm. +def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 4, *CurDAG); +}]>; +def vecspltisw : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 4, *CurDAG).Val != 0; +}], VSPLTISW_get_imm>; + +//===----------------------------------------------------------------------===// +// Helpers for defining instructions that directly correspond to intrinsics. + +// VA1a_Int - A VAForm_1a intrinsic definition. +class VA1a_Int<bits<6> xo, string opc, Intrinsic IntID> + : VAForm_1a<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB, VRRC:$vC), + !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP, + [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB, VRRC:$vC))]>; + +// VX1_Int - A VXForm_1 intrinsic definition. +class VX1_Int<bits<11> xo, string opc, Intrinsic IntID> + : VXForm_1<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + !strconcat(opc, " $vD, $vA, $vB"), VecFP, + [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB))]>; + +// VX2_Int - A VXForm_2 intrinsic definition. +class VX2_Int<bits<11> xo, string opc, Intrinsic IntID> + : VXForm_2<xo, (ops VRRC:$vD, VRRC:$vB), + !strconcat(opc, " $vD, $vB"), VecFP, + [(set VRRC:$vD, (IntID VRRC:$vB))]>; + +//===----------------------------------------------------------------------===// +// Instruction Definitions. 
+ +def IMPLICIT_DEF_VRRC : Pseudo<(ops VRRC:$rD), "; IMPLICIT_DEF_VRRC $rD", + [(set VRRC:$rD, (v4i32 (undef)))]>; + +let noResults = 1 in { +def DSS : DSS_Form<822, (ops u5imm:$A, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2), + "dss $STRM, $A", LdStGeneral /*FIXME*/, []>; +def DST : DSS_Form<342, (ops u5imm:$T, u5imm:$STRM, GPRC:$rA, GPRC:$rB), + "dst $rA, $rB, $STRM, $T", LdStGeneral /*FIXME*/, []>; +def DSTST : DSS_Form<374, (ops u5imm:$T, u5imm:$STRM, GPRC:$rA, GPRC:$rB), + "dstst $rA, $rB, $STRM, $T", LdStGeneral /*FIXME*/, []>; +} + +def MFVSCR : VXForm_4<1540, (ops VRRC:$vD), + "mfvscr $vD", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_mfvscr))]>; +def MTVSCR : VXForm_5<1604, (ops VRRC:$vB), + "mtvscr $vB", LdStGeneral, + [(int_ppc_altivec_mtvscr VRRC:$vB)]>; + +let isLoad = 1, PPC970_Unit = 2 in { // Loads. +def LVEBX: XForm_1<31, 7, (ops VRRC:$vD, memrr:$src), + "lvebx $vD, $src", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>; +def LVEHX: XForm_1<31, 39, (ops VRRC:$vD, memrr:$src), + "lvehx $vD, $src", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>; +def LVEWX: XForm_1<31, 71, (ops VRRC:$vD, memrr:$src), + "lvewx $vD, $src", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>; +def LVX : XForm_1<31, 103, (ops VRRC:$vD, memrr:$src), + "lvx $vD, $src", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>; +def LVXL : XForm_1<31, 359, (ops VRRC:$vD, memrr:$src), + "lvxl $vD, $src", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>; +} + +def LVSL : XForm_1<31, 6, (ops VRRC:$vD, memrr:$src), + "lvsl $vD, $src", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>, + PPC970_Unit_LSU; +def LVSR : XForm_1<31, 38, (ops VRRC:$vD, memrr:$src), + "lvsr $vD, $src", LdStGeneral, + [(set VRRC:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>, + PPC970_Unit_LSU; + +let isStore = 1, noResults = 1, PPC970_Unit = 2 in { // Stores.
+def STVEBX: XForm_8<31, 135, (ops VRRC:$rS, memrr:$dst), + "stvebx $rS, $dst", LdStGeneral, + [(int_ppc_altivec_stvebx VRRC:$rS, xoaddr:$dst)]>; +def STVEHX: XForm_8<31, 167, (ops VRRC:$rS, memrr:$dst), + "stvehx $rS, $dst", LdStGeneral, + [(int_ppc_altivec_stvehx VRRC:$rS, xoaddr:$dst)]>; +def STVEWX: XForm_8<31, 199, (ops VRRC:$rS, memrr:$dst), + "stvewx $rS, $dst", LdStGeneral, + [(int_ppc_altivec_stvewx VRRC:$rS, xoaddr:$dst)]>; +def STVX : XForm_8<31, 231, (ops VRRC:$rS, memrr:$dst), + "stvx $rS, $dst", LdStGeneral, + [(int_ppc_altivec_stvx VRRC:$rS, xoaddr:$dst)]>; +def STVXL : XForm_8<31, 487, (ops VRRC:$rS, memrr:$dst), + "stvxl $rS, $dst", LdStGeneral, + [(int_ppc_altivec_stvxl VRRC:$rS, xoaddr:$dst)]>; +} + +let PPC970_Unit = 5 in { // VALU Operations. +// VA-Form instructions. 3-input AltiVec ops. +def VMADDFP : VAForm_1<46, (ops VRRC:$vD, VRRC:$vA, VRRC:$vC, VRRC:$vB), + "vmaddfp $vD, $vA, $vC, $vB", VecFP, + [(set VRRC:$vD, (fadd (fmul VRRC:$vA, VRRC:$vC), + VRRC:$vB))]>, + Requires<[FPContractions]>; +def VNMSUBFP: VAForm_1<47, (ops VRRC:$vD, VRRC:$vA, VRRC:$vC, VRRC:$vB), + "vnmsubfp $vD, $vA, $vC, $vB", VecFP, + [(set VRRC:$vD, (fneg (fsub (fmul VRRC:$vA, VRRC:$vC), + VRRC:$vB)))]>, + Requires<[FPContractions]>; + +def VMHADDSHS : VA1a_Int<32, "vmhaddshs", int_ppc_altivec_vmhaddshs>; +def VMHRADDSHS : VA1a_Int<33, "vmhraddshs", int_ppc_altivec_vmhraddshs>; +def VMLADDUHM : VA1a_Int<34, "vmladduhm", int_ppc_altivec_vmladduhm>; +def VPERM : VA1a_Int<43, "vperm", int_ppc_altivec_vperm>; +def VSEL : VA1a_Int<42, "vsel", int_ppc_altivec_vsel>; + +// Shuffles. +def VSLDOI : VAForm_2<44, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB, u5imm:$SH), + "vsldoi $vD, $vA, $vB, $SH", VecFP, + [(set VRRC:$vD, + (vector_shuffle (v16i8 VRRC:$vA), VRRC:$vB, + VSLDOI_shuffle_mask:$SH))]>; + +// VX-Form instructions. AltiVec arithmetic ops. 
+def VADDFP : VXForm_1<10, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vaddfp $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (fadd VRRC:$vA, VRRC:$vB))]>; + +def VADDUBM : VXForm_1<0, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vaddubm $vD, $vA, $vB", VecGeneral, + [(set VRRC:$vD, (add (v16i8 VRRC:$vA), VRRC:$vB))]>; +def VADDUHM : VXForm_1<64, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vadduhm $vD, $vA, $vB", VecGeneral, + [(set VRRC:$vD, (add (v8i16 VRRC:$vA), VRRC:$vB))]>; +def VADDUWM : VXForm_1<128, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vadduwm $vD, $vA, $vB", VecGeneral, + [(set VRRC:$vD, (add (v4i32 VRRC:$vA), VRRC:$vB))]>; + +def VADDCUW : VX1_Int<384, "vaddcuw", int_ppc_altivec_vaddcuw>; +def VADDSBS : VX1_Int<768, "vaddsbs", int_ppc_altivec_vaddsbs>; +def VADDSHS : VX1_Int<832, "vaddshs", int_ppc_altivec_vaddshs>; +def VADDSWS : VX1_Int<896, "vaddsws", int_ppc_altivec_vaddsws>; +def VADDUBS : VX1_Int<512, "vaddubs", int_ppc_altivec_vaddubs>; +def VADDUHS : VX1_Int<576, "vadduhs", int_ppc_altivec_vadduhs>; +def VADDUWS : VX1_Int<640, "vadduws", int_ppc_altivec_vadduws>; + + +def VAND : VXForm_1<1028, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vand $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (and (v4i32 VRRC:$vA), VRRC:$vB))]>; +def VANDC : VXForm_1<1092, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vandc $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (and (v4i32 VRRC:$vA), (vnot VRRC:$vB)))]>; + +def VCFSX : VXForm_1<842, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB), + "vcfsx $vD, $vB, $UIMM", VecFP, + [(set VRRC:$vD, + (int_ppc_altivec_vcfsx VRRC:$vB, imm:$UIMM))]>; +def VCFUX : VXForm_1<778, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB), + "vcfux $vD, $vB, $UIMM", VecFP, + [(set VRRC:$vD, + (int_ppc_altivec_vcfux VRRC:$vB, imm:$UIMM))]>; +def VCTSXS : VXForm_1<970, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB), + "vctsxs $vD, $vB, $UIMM", VecFP, + [(set VRRC:$vD, + (int_ppc_altivec_vctsxs VRRC:$vB, imm:$UIMM))]>; +def VCTUXS : VXForm_1<906, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB), + "vctuxs $vD, $vB, $UIMM", VecFP, 
+ [(set VRRC:$vD, + (int_ppc_altivec_vctuxs VRRC:$vB, imm:$UIMM))]>; +def VEXPTEFP : VX2_Int<394, "vexptefp", int_ppc_altivec_vexptefp>; +def VLOGEFP : VX2_Int<458, "vlogefp", int_ppc_altivec_vlogefp>; + +def VAVGSB : VX1_Int<1282, "vavgsb", int_ppc_altivec_vavgsb>; +def VAVGSH : VX1_Int<1346, "vavgsh", int_ppc_altivec_vavgsh>; +def VAVGSW : VX1_Int<1410, "vavgsw", int_ppc_altivec_vavgsw>; +def VAVGUB : VX1_Int<1026, "vavgub", int_ppc_altivec_vavgub>; +def VAVGUH : VX1_Int<1090, "vavguh", int_ppc_altivec_vavguh>; +def VAVGUW : VX1_Int<1154, "vavguw", int_ppc_altivec_vavguw>; + +def VMAXFP : VX1_Int<1034, "vmaxfp", int_ppc_altivec_vmaxfp>; +def VMAXSB : VX1_Int< 258, "vmaxsb", int_ppc_altivec_vmaxsb>; +def VMAXSH : VX1_Int< 322, "vmaxsh", int_ppc_altivec_vmaxsh>; +def VMAXSW : VX1_Int< 386, "vmaxsw", int_ppc_altivec_vmaxsw>; +def VMAXUB : VX1_Int< 2, "vmaxub", int_ppc_altivec_vmaxub>; +def VMAXUH : VX1_Int< 66, "vmaxuh", int_ppc_altivec_vmaxuh>; +def VMAXUW : VX1_Int< 130, "vmaxuw", int_ppc_altivec_vmaxuw>; +def VMINFP : VX1_Int<1098, "vminfp", int_ppc_altivec_vminfp>; +def VMINSB : VX1_Int< 770, "vminsb", int_ppc_altivec_vminsb>; +def VMINSH : VX1_Int< 834, "vminsh", int_ppc_altivec_vminsh>; +def VMINSW : VX1_Int< 898, "vminsw", int_ppc_altivec_vminsw>; +def VMINUB : VX1_Int< 514, "vminub", int_ppc_altivec_vminub>; +def VMINUH : VX1_Int< 578, "vminuh", int_ppc_altivec_vminuh>; +def VMINUW : VX1_Int< 642, "vminuw", int_ppc_altivec_vminuw>; + +def VMRGHB : VXForm_1< 12, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vmrghb $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, VMRGHB_shuffle_mask))]>; +def VMRGHH : VXForm_1< 76, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vmrghh $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, VMRGHH_shuffle_mask))]>; +def VMRGHW : VXForm_1<140, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vmrghw $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, 
VMRGHW_shuffle_mask))]>; +def VMRGLB : VXForm_1<268, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vmrglb $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, VMRGLB_shuffle_mask))]>; +def VMRGLH : VXForm_1<332, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vmrglh $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, VMRGLH_shuffle_mask))]>; +def VMRGLW : VXForm_1<396, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vmrglw $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, VMRGLW_shuffle_mask))]>; + +def VMSUMMBM : VA1a_Int<37, "vmsummbm", int_ppc_altivec_vmsummbm>; +def VMSUMSHM : VA1a_Int<40, "vmsumshm", int_ppc_altivec_vmsumshm>; +def VMSUMSHS : VA1a_Int<41, "vmsumshs", int_ppc_altivec_vmsumshs>; +def VMSUMUBM : VA1a_Int<36, "vmsumubm", int_ppc_altivec_vmsumubm>; +def VMSUMUHM : VA1a_Int<38, "vmsumuhm", int_ppc_altivec_vmsumuhm>; +def VMSUMUHS : VA1a_Int<39, "vmsumuhs", int_ppc_altivec_vmsumuhs>; + +def VMULESB : VX1_Int<776, "vmulesb", int_ppc_altivec_vmulesb>; +def VMULESH : VX1_Int<840, "vmulesh", int_ppc_altivec_vmulesh>; +def VMULEUB : VX1_Int<520, "vmuleub", int_ppc_altivec_vmuleub>; +def VMULEUH : VX1_Int<584, "vmuleuh", int_ppc_altivec_vmuleuh>; +def VMULOSB : VX1_Int<264, "vmulosb", int_ppc_altivec_vmulosb>; +def VMULOSH : VX1_Int<328, "vmulosh", int_ppc_altivec_vmulosh>; +def VMULOUB : VX1_Int< 8, "vmuloub", int_ppc_altivec_vmuloub>; +def VMULOUH : VX1_Int< 72, "vmulouh", int_ppc_altivec_vmulouh>; + +def VREFP : VX2_Int<266, "vrefp", int_ppc_altivec_vrefp>; +def VRFIM : VX2_Int<714, "vrfim", int_ppc_altivec_vrfim>; +def VRFIN : VX2_Int<522, "vrfin", int_ppc_altivec_vrfin>; +def VRFIP : VX2_Int<650, "vrfip", int_ppc_altivec_vrfip>; +def VRFIZ : VX2_Int<586, "vrfiz", int_ppc_altivec_vrfiz>; +def VRSQRTEFP : VX2_Int<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>; + +def VSUBCUW : VX1_Int<74, "vsubcuw", int_ppc_altivec_vsubcuw>; + +def VSUBFP : VXForm_1<74, (ops VRRC:$vD, 
VRRC:$vA, VRRC:$vB), + "vsubfp $vD, $vA, $vB", VecGeneral, + [(set VRRC:$vD, (fsub VRRC:$vA, VRRC:$vB))]>; +def VSUBUBM : VXForm_1<1024, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vsububm $vD, $vA, $vB", VecGeneral, + [(set VRRC:$vD, (sub (v16i8 VRRC:$vA), VRRC:$vB))]>; +def VSUBUHM : VXForm_1<1088, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vsubuhm $vD, $vA, $vB", VecGeneral, + [(set VRRC:$vD, (sub (v8i16 VRRC:$vA), VRRC:$vB))]>; +def VSUBUWM : VXForm_1<1152, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vsubuwm $vD, $vA, $vB", VecGeneral, + [(set VRRC:$vD, (sub (v4i32 VRRC:$vA), VRRC:$vB))]>; + +def VSUBSBS : VX1_Int<1792, "vsubsbs" , int_ppc_altivec_vsubsbs>; +def VSUBSHS : VX1_Int<1856, "vsubshs" , int_ppc_altivec_vsubshs>; +def VSUBSWS : VX1_Int<1920, "vsubsws" , int_ppc_altivec_vsubsws>; +def VSUBUBS : VX1_Int<1536, "vsububs" , int_ppc_altivec_vsububs>; +def VSUBUHS : VX1_Int<1600, "vsubuhs" , int_ppc_altivec_vsubuhs>; +def VSUBUWS : VX1_Int<1664, "vsubuws" , int_ppc_altivec_vsubuws>; +def VSUMSWS : VX1_Int<1928, "vsumsws" , int_ppc_altivec_vsumsws>; +def VSUM2SWS: VX1_Int<1672, "vsum2sws", int_ppc_altivec_vsum2sws>; +def VSUM4SBS: VX1_Int<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs>; +def VSUM4SHS: VX1_Int<1608, "vsum4shs", int_ppc_altivec_vsum4shs>; +def VSUM4UBS: VX1_Int<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs>; + +def VNOR : VXForm_1<1284, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vnor $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vnot (or (v4i32 VRRC:$vA), VRRC:$vB)))]>; +def VOR : VXForm_1<1156, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vor $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (or (v4i32 VRRC:$vA), VRRC:$vB))]>; +def VXOR : VXForm_1<1220, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vxor $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (xor (v4i32 VRRC:$vA), VRRC:$vB))]>; + +def VRLB : VX1_Int< 4, "vrlb", int_ppc_altivec_vrlb>; +def VRLH : VX1_Int< 68, "vrlh", int_ppc_altivec_vrlh>; +def VRLW : VX1_Int< 132, "vrlw", int_ppc_altivec_vrlw>; + +def VSL : VX1_Int< 452, "vsl" ,
int_ppc_altivec_vsl >; +def VSLO : VX1_Int<1036, "vslo", int_ppc_altivec_vslo>; +def VSLB : VX1_Int< 260, "vslb", int_ppc_altivec_vslb>; +def VSLH : VX1_Int< 324, "vslh", int_ppc_altivec_vslh>; +def VSLW : VX1_Int< 388, "vslw", int_ppc_altivec_vslw>; + +def VSPLTB : VXForm_1<524, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB), + "vspltb $vD, $vB, $UIMM", VecPerm, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vB), (undef), + VSPLTB_shuffle_mask:$UIMM))]>; +def VSPLTH : VXForm_1<588, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB), + "vsplth $vD, $vB, $UIMM", VecPerm, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vB), (undef), + VSPLTH_shuffle_mask:$UIMM))]>; +def VSPLTW : VXForm_1<652, (ops VRRC:$vD, u5imm:$UIMM, VRRC:$vB), + "vspltw $vD, $vB, $UIMM", VecPerm, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vB), (undef), + VSPLTW_shuffle_mask:$UIMM))]>; + +def VSR : VX1_Int< 708, "vsr" , int_ppc_altivec_vsr>; +def VSRO : VX1_Int<1100, "vsro" , int_ppc_altivec_vsro>; +def VSRAB : VX1_Int< 772, "vsrab", int_ppc_altivec_vsrab>; +def VSRAH : VX1_Int< 836, "vsrah", int_ppc_altivec_vsrah>; +def VSRAW : VX1_Int< 900, "vsraw", int_ppc_altivec_vsraw>; +def VSRB : VX1_Int< 516, "vsrb" , int_ppc_altivec_vsrb>; +def VSRH : VX1_Int< 580, "vsrh" , int_ppc_altivec_vsrh>; +def VSRW : VX1_Int< 644, "vsrw" , int_ppc_altivec_vsrw>; + + +def VSPLTISB : VXForm_3<780, (ops VRRC:$vD, s5imm:$SIMM), + "vspltisb $vD, $SIMM", VecPerm, + [(set VRRC:$vD, (v16i8 vecspltisb:$SIMM))]>; +def VSPLTISH : VXForm_3<844, (ops VRRC:$vD, s5imm:$SIMM), + "vspltish $vD, $SIMM", VecPerm, + [(set VRRC:$vD, (v8i16 vecspltish:$SIMM))]>; +def VSPLTISW : VXForm_3<908, (ops VRRC:$vD, s5imm:$SIMM), + "vspltisw $vD, $SIMM", VecPerm, + [(set VRRC:$vD, (v4i32 vecspltisw:$SIMM))]>; + +// Vector Pack. 
+def VPKPX : VX1_Int<782, "vpkpx", int_ppc_altivec_vpkpx>; +def VPKSHSS : VX1_Int<398, "vpkshss", int_ppc_altivec_vpkshss>; +def VPKSHUS : VX1_Int<270, "vpkshus", int_ppc_altivec_vpkshus>; +def VPKSWSS : VX1_Int<462, "vpkswss", int_ppc_altivec_vpkswss>; +def VPKSWUS : VX1_Int<334, "vpkswus", int_ppc_altivec_vpkswus>; +def VPKUHUM : VXForm_1<14, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vpkuhum $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, VPKUHUM_shuffle_mask))]>; +def VPKUHUS : VX1_Int<142, "vpkuhus", int_ppc_altivec_vpkuhus>; +def VPKUWUM : VXForm_1<78, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), + "vpkuwum $vD, $vA, $vB", VecFP, + [(set VRRC:$vD, (vector_shuffle (v16i8 VRRC:$vA), + VRRC:$vB, VPKUWUM_shuffle_mask))]>; +def VPKUWUS : VX1_Int<206, "vpkuwus", int_ppc_altivec_vpkuwus>; + +// Vector Unpack. +def VUPKHPX : VX2_Int<846, "vupkhpx", int_ppc_altivec_vupkhpx>; +def VUPKHSB : VX2_Int<526, "vupkhsb", int_ppc_altivec_vupkhsb>; +def VUPKHSH : VX2_Int<590, "vupkhsh", int_ppc_altivec_vupkhsh>; +def VUPKLPX : VX2_Int<974, "vupklpx", int_ppc_altivec_vupklpx>; +def VUPKLSB : VX2_Int<654, "vupklsb", int_ppc_altivec_vupklsb>; +def VUPKLSH : VX2_Int<718, "vupklsh", int_ppc_altivec_vupklsh>; + + +// Altivec Comparisons. + +class VCMP<bits<10> xo, string asmstr, ValueType Ty> + : VXRForm_1<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), asmstr, VecFPCompare, + [(set VRRC:$vD, (Ty (PPCvcmp VRRC:$vA, VRRC:$vB, xo)))]>; +class VCMPo<bits<10> xo, string asmstr, ValueType Ty> + : VXRForm_1<xo, (ops VRRC:$vD, VRRC:$vA, VRRC:$vB), asmstr, VecFPCompare, + [(set VRRC:$vD, (Ty (PPCvcmp_o VRRC:$vA, VRRC:$vB, xo)))]> { + let Defs = [CR6]; + let RC = 1; +} + +// f32 element comparisons. +def VCMPBFP : VCMP <966, "vcmpbfp $vD, $vA, $vB" , v4f32>; +def VCMPBFPo : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>; +def VCMPEQFP : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>; +def VCMPEQFPo : VCMPo<198, "vcmpeqfp.
$vD, $vA, $vB", v4f32>; +def VCMPGEFP : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>; +def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>; +def VCMPGTFP : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>; +def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>; + +// i8 element comparisons. +def VCMPEQUB : VCMP < 6, "vcmpequb $vD, $vA, $vB" , v16i8>; +def VCMPEQUBo : VCMPo< 6, "vcmpequb. $vD, $vA, $vB", v16i8>; +def VCMPGTSB : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>; +def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>; +def VCMPGTUB : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>; +def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>; + +// i16 element comparisons. +def VCMPEQUH : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>; +def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>; +def VCMPGTSH : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>; +def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>; +def VCMPGTUH : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>; +def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>; + +// i32 element comparisons. +def VCMPEQUW : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>; +def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>; +def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>; +def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>; +def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>; +def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>; + +def V_SET0 : VXForm_setzero<1220, (ops VRRC:$vD), + "vxor $vD, $vD, $vD", VecFP, + [(set VRRC:$vD, (v4i32 immAllZerosV))]>; +} + +//===----------------------------------------------------------------------===// +// Additional Altivec Patterns +// + +// DS* intrinsics. 
+def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>; +def : Pat<(int_ppc_altivec_dssall), (DSS 1, 0, 0, 0)>; +def : Pat<(int_ppc_altivec_dst GPRC:$rA, GPRC:$rB, imm:$STRM), + (DST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>; +def : Pat<(int_ppc_altivec_dstt GPRC:$rA, GPRC:$rB, imm:$STRM), + (DST 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>; +def : Pat<(int_ppc_altivec_dstst GPRC:$rA, GPRC:$rB, imm:$STRM), + (DSTST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>; +def : Pat<(int_ppc_altivec_dststt GPRC:$rA, GPRC:$rB, imm:$STRM), + (DSTST 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>; + +// Undef. +def : Pat<(v16i8 (undef)), (IMPLICIT_DEF_VRRC)>; +def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VRRC)>; +def : Pat<(v4f32 (undef)), (IMPLICIT_DEF_VRRC)>; + +// Loads. +def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>; + +// Stores. +def : Pat<(store (v4i32 VRRC:$rS), xoaddr:$dst), + (STVX (v4i32 VRRC:$rS), xoaddr:$dst)>; + +// Bit conversions. +def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>; + +def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>; + +def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>; + +def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>; + +// Shuffles. 
+ +// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x) +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VSLDOI_unary_shuffle_mask:$in), + (VSLDOI VRRC:$vA, VRRC:$vA, VSLDOI_unary_shuffle_mask:$in)>; +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef,VPKUWUM_unary_shuffle_mask:$in), + (VPKUWUM VRRC:$vA, VRRC:$vA)>; +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef,VPKUHUM_unary_shuffle_mask:$in), + (VPKUHUM VRRC:$vA, VRRC:$vA)>; + +// Match vmrg*(x,x) +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGLB_unary_shuffle_mask:$in), + (VMRGLB VRRC:$vA, VRRC:$vA)>; +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGLH_unary_shuffle_mask:$in), + (VMRGLH VRRC:$vA, VRRC:$vA)>; +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGLW_unary_shuffle_mask:$in), + (VMRGLW VRRC:$vA, VRRC:$vA)>; +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGHB_unary_shuffle_mask:$in), + (VMRGHB VRRC:$vA, VRRC:$vA)>; +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGHH_unary_shuffle_mask:$in), + (VMRGHH VRRC:$vA, VRRC:$vA)>; +def:Pat<(vector_shuffle (v16i8 VRRC:$vA), undef, VMRGHW_unary_shuffle_mask:$in), + (VMRGHW VRRC:$vA, VRRC:$vA)>; + +// Logical Operations +def : Pat<(v4i32 (vnot VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; +def : Pat<(v4i32 (vnot_conv VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>; + +def : Pat<(v4i32 (vnot_conv (or VRRC:$A, VRRC:$B))), + (VNOR VRRC:$A, VRRC:$B)>; +def : Pat<(v4i32 (and VRRC:$A, (vnot_conv VRRC:$B))), + (VANDC VRRC:$A, VRRC:$B)>; + +def : Pat<(fmul VRRC:$vA, VRRC:$vB), + (VMADDFP VRRC:$vA, VRRC:$vB, (v4i32 (V_SET0)))>; + +// Fused multiply add and multiply sub for packed float. 
These are represented +// separately from the real instructions above, for operations that must have +// the additional precision, such as Newton-Raphson (used by divide, sqrt) +def : Pat<(PPCvmaddfp VRRC:$A, VRRC:$B, VRRC:$C), + (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>; +def : Pat<(PPCvnmsubfp VRRC:$A, VRRC:$B, VRRC:$C), + (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>; + +def : Pat<(int_ppc_altivec_vmaddfp VRRC:$A, VRRC:$B, VRRC:$C), + (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>; +def : Pat<(int_ppc_altivec_vnmsubfp VRRC:$A, VRRC:$B, VRRC:$C), + (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>; + +def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC), + (VPERM VRRC:$vA, VRRC:$vB, VRRC:$vC)>; diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h new file mode 100644 index 0000000..3861918 --- /dev/null +++ b/lib/Target/PowerPC/PPCInstrBuilder.h @@ -0,0 +1,55 @@ +//===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to simplify generating frame and constant pool +// references. +// +// For reference, the order of operands for memory references is: +// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate +// Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC_INSTRBUILDER_H +#define POWERPC_INSTRBUILDER_H + +#include "llvm/CodeGen/MachineInstrBuilder.h" + +namespace llvm { + +/// addFrameReference - This function is used to add a reference to the base of +/// an abstract object on the stack frame of the current function.
This +/// reference has base register as the FrameIndex offset until it is resolved. +/// This allows a constant offset to be specified as well... +/// +inline const MachineInstrBuilder& +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0, + bool mem = true) { + if (mem) + return MIB.addImm(Offset).addFrameIndex(FI); + else + return MIB.addFrameIndex(FI).addImm(Offset); +} + +/// addConstantPoolReference - This function is used to add a reference to the +/// base of a constant value spilled to the per-function constant pool. The +/// reference has base register ConstantPoolIndex offset which is retained until +/// either machine code emission or assembly output. This allows an optional +/// offset to be added as well. +/// +inline const MachineInstrBuilder& +addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, + int Offset = 0) { + return MIB.addImm(Offset).addConstantPoolIndex(CPI); +} + +} // End llvm namespace + +#endif diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td new file mode 100644 index 0000000..6a4a59b --- /dev/null +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -0,0 +1,800 @@ +//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// PowerPC instruction formats + +class I<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin> + : Instruction { + field bits<32> Inst; + + bit PPC64 = 0; // Default value, override with isPPC64 + + let Name = ""; + let Namespace = "PPC"; + let Inst{0-5} = opcode; + let OperandList = OL; + let AsmString = asmstr; + let Itinerary = itin; + + /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to + /// these must be reflected there! See comments there for what these are. + bits<1> PPC970_First = 0; + bits<1> PPC970_Single = 0; + bits<1> PPC970_Cracked = 0; + bits<3> PPC970_Unit = 0; +} + +class PPC970_DGroup_First { bits<1> PPC970_First = 1; } +class PPC970_DGroup_Single { bits<1> PPC970_Single = 1; } +class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; } +class PPC970_MicroCode; + +class PPC970_Unit_Pseudo { bits<3> PPC970_Unit = 0; } +class PPC970_Unit_FXU { bits<3> PPC970_Unit = 1; } +class PPC970_Unit_LSU { bits<3> PPC970_Unit = 2; } +class PPC970_Unit_FPU { bits<3> PPC970_Unit = 3; } +class PPC970_Unit_CRU { bits<3> PPC970_Unit = 4; } +class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; } +class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; } +class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; } + + +// 1.7.1 I-Form +class IForm<bits<6> opcode, bit aa, bit lk, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + let Pattern = pattern; + bits<24> LI; + + let Inst{6-29} = LI; + let Inst{30} = aa; + let Inst{31} = lk; +} + +// 1.7.2 B-Form +class BForm<bits<6> opcode, bit aa, bit lk, dag OL, string asmstr> + : I<opcode, OL, asmstr, BrB> { + bits<7> BIBO; // 2 bits of BI and 5 bits of BO. 
+ bits<3> CR; + bits<14> BD; + + bits<5> BI; + let BI{0-1} = BIBO{5-6}; + let BI{2-4} = CR{0-2}; + + let Inst{6-10} = BIBO{4-0}; + let Inst{11-15} = BI; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + + +// 1.7.4 D-Form +class DForm_base<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> A; + bits<5> B; + bits<16> C; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + +class DForm_1<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> A; + bits<16> C; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + +class DForm_2<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : DForm_base<opcode, OL, asmstr, itin, pattern>; + +class DForm_2_r0<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> A; + bits<16> B; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = 0; + let Inst{16-31} = B; +} + +class DForm_4<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> B; + bits<5> A; + bits<16> C; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + +class DForm_4_zero<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : DForm_1<opcode, OL, asmstr, itin, pattern> { + let A = 0; + let B = 0; + let C = 0; +} + +class DForm_5<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<3> BF; + bits<1> L; + bits<5> RA; + bits<16> I; + + let Inst{6-8} = BF; + let Inst{9} = 0; + let Inst{10} = L; + let Inst{11-15} = RA; + let Inst{16-31} = I; +} + +class DForm_5_ext<bits<6> opcode, dag OL, string 
asmstr, InstrItinClass itin> + : DForm_5<opcode, OL, asmstr, itin> { + let L = PPC64; +} + +class DForm_6<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin> + : DForm_5<opcode, OL, asmstr, itin>; + +class DForm_6_ext<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin> + : DForm_6<opcode, OL, asmstr, itin> { + let L = PPC64; +} + + +// 1.7.5 DS-Form +class DSForm_1<bits<6> opcode, bits<2> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> RST; + bits<14> DS; + bits<5> RA; + + let Pattern = pattern; + + let Inst{6-10} = RST; + let Inst{11-15} = RA; + let Inst{16-29} = DS; + let Inst{30-31} = xo; +} + +// 1.7.6 X-Form +class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> RST; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +// This is the same as XForm_base_r3xo, but the first two operands are swapped +// when code is emitted. 
+class XForm_base_r3xo_swapped + <bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<5> A; + bits<5> RST; + bits<5> B; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + + +class XForm_1<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern>; + +class XForm_6<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OL, asmstr, itin> { + let Pattern = pattern; +} + +class XForm_8<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern>; + +class XForm_10<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OL, asmstr, itin> { + let Pattern = pattern; +} + +class XForm_11<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OL, asmstr, itin> { + let B = 0; + let Pattern = pattern; +} + +class XForm_16<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<3> BF; + bits<1> L; + bits<5> RA; + bits<5> RB; + + let Inst{6-8} = BF; + let Inst{9} = 0; + let Inst{10} = L; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : XForm_16<opcode, xo, OL, asmstr, itin> { + let L = PPC64; +} + +class XForm_17<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<3> BF; + bits<5> FRA; + bits<5> FRB; + + let Inst{6-8} = BF; + let 
Inst{9-10} = 0; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_25<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern> { +} + +class XForm_26<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern> { + let A = 0; +} + +class XForm_28<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OL, asmstr, itin, pattern> { +} + +// DCB_Form - Form X instruction, used for dcb* instructions. +class DCB_Form<bits<10> xo, bits<5> immfield, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<31, OL, asmstr, itin> { + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = immfield; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + + +// DSS_Form - Form X instruction, used for altivec dss* instructions. 
+class DSS_Form<bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<31, OL, asmstr, itin> { + bits<1> T; + bits<2> STRM; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6} = T; + let Inst{7-8} = 0; + let Inst{9-10} = STRM; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +// 1.7.7 XL-Form +class XLForm_1<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> CRD; + bits<5> CRA; + bits<5> CRB; + + let Pattern = pattern; + + let Inst{6-10} = CRD; + let Inst{11-15} = CRA; + let Inst{16-20} = CRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> CRD; + + let Pattern = pattern; + + let Inst{6-10} = CRD; + let Inst{11-15} = CRD; + let Inst{16-20} = CRD; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> BO; + bits<5> BI; + bits<2> BH; + + let Pattern = pattern; + + let Inst{6-10} = BO; + let Inst{11-15} = BI; + let Inst{16-18} = 0; + let Inst{19-20} = BH; + let Inst{21-30} = xo; + let Inst{31} = lk; +} + +class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk, + dag OL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OL, asmstr, itin, pattern> { + bits<7> BIBO; // 2 bits of BI and 5 bits of BO. 
+ bits<3> CR; + + let BO = BIBO{2-6}; + let BI{0-1} = BIBO{0-1}; + let BI{2-4} = CR; + let BH = 0; +} + + +class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo, bits<5> bi, bit lk, + dag OL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OL, asmstr, itin, pattern> { + let BO = bo; + let BI = bi; + let BH = 0; +} + +class XLForm_3<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<3> BF; + bits<3> BFA; + + let Inst{6-8} = BF; + let Inst{9-10} = 0; + let Inst{11-13} = BFA; + let Inst{14-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +// 1.7.8 XFX-Form +class XFXForm_1<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<5> RT; + bits<10> SPR; + + let Inst{6-10} = RT; + let Inst{11} = SPR{4}; + let Inst{12} = SPR{3}; + let Inst{13} = SPR{2}; + let Inst{14} = SPR{1}; + let Inst{15} = SPR{0}; + let Inst{16} = SPR{9}; + let Inst{17} = SPR{8}; + let Inst{18} = SPR{7}; + let Inst{19} = SPR{6}; + let Inst{20} = SPR{5}; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, + dag OL, string asmstr, InstrItinClass itin> + : XFXForm_1<opcode, xo, OL, asmstr, itin> { + let SPR = spr; +} + +class XFXForm_3<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<5> RT; + + let Inst{6-10} = RT; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_5<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, asmstr, itin> { + bits<8> FXM; + bits<5> ST; + + let Inst{6-10} = ST; + let Inst{11} = 0; + let Inst{12-19} = FXM; + let Inst{20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : I<opcode, OL, 
asmstr, itin> { + bits<5> ST; + bits<8> FXM; + + let Inst{6-10} = ST; + let Inst{11} = 1; + let Inst{12-19} = FXM; + let Inst{20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_7<bits<6> opcode, bits<10> xo, dag OL, string asmstr, + InstrItinClass itin> + : XFXForm_1<opcode, xo, OL, asmstr, itin>; + +class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, + dag OL, string asmstr, InstrItinClass itin> + : XFXForm_7<opcode, xo, OL, asmstr, itin> { + let SPR = spr; +} + +// 1.7.10 XS-Form - SRADI. +class XSForm_1<bits<6> opcode, bits<9> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> A; + bits<5> RS; + bits<6> SH; + + bit RC = 0; // set by isDOT + let Pattern = pattern; + + let Inst{6-10} = RS; + let Inst{11-15} = A; + let Inst{16-20} = SH{4,3,2,1,0}; + let Inst{21-29} = xo; + let Inst{30} = SH{5}; + let Inst{31} = RC; +} + +// 1.7.11 XO-Form +class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> RT; + bits<5> RA; + bits<5> RB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RT; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21} = oe; + let Inst{22-30} = xo; + let Inst{31} = RC; +} + +class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, + dag OL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XOForm_1<opcode, xo, oe, OL, asmstr, itin, pattern> { + let RB = 0; +} + +// 1.7.12 A-Form +class AForm_1<bits<6> opcode, bits<5> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> FRT; + bits<5> FRA; + bits<5> FRC; + bits<5> FRB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = FRT; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-25} = FRC; + let Inst{26-30} = xo; + let Inst{31} = RC; +} + +class AForm_2<bits<6> opcode, 
bits<5> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : AForm_1<opcode, xo, OL, asmstr, itin, pattern> { + let FRC = 0; +} + +class AForm_3<bits<6> opcode, bits<5> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : AForm_1<opcode, xo, OL, asmstr, itin, pattern> { + let FRB = 0; +} + +// 1.7.13 M-Form +class MForm_1<bits<6> opcode, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<5> RB; + bits<5> MB; + bits<5> ME; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-25} = MB; + let Inst{26-30} = ME; + let Inst{31} = RC; +} + +class MForm_2<bits<6> opcode, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : MForm_1<opcode, OL, asmstr, itin, pattern> { +} + +// 1.7.14 MD-Form +class MDForm_1<bits<6> opcode, bits<3> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<6> SH; + bits<6> MBE; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = SH{4,3,2,1,0}; + let Inst{21-26} = MBE{4,3,2,1,0,5}; + let Inst{27-29} = xo; + let Inst{30} = SH{5}; + let Inst{31} = RC; +} + + + +// E-1 VA-Form + +// VAForm_1 - DACB ordering. +class VAForm_1<bits<6> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VC; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-25} = VC; + let Inst{26-31} = xo; +} + +// VAForm_1a - DABC ordering. 
+class VAForm_1a<bits<6> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bits<5> VC; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-25} = VC; + let Inst{26-31} = xo; +} + +class VAForm_2<bits<6> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bits<4> SH; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21} = 0; + let Inst{22-25} = SH; + let Inst{26-31} = xo; +} + +// E-2 VX-Form +class VXForm_1<bits<11> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +class VXForm_setzero<bits<11> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : VXForm_1<xo, OL, asmstr, itin, pattern> { + let VA = VD; + let VB = VD; +} + + +class VXForm_2<bits<11> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = 0; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +class VXForm_3<bits<11> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + bits<5> IMM; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = IMM; + let Inst{16-20} = 0; + let Inst{21-31} = xo; +} + +/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr. 
+class VXForm_4<bits<11> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = 0; + let Inst{16-20} = 0; + let Inst{21-31} = xo; +} + +/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr. +class VXForm_5<bits<11> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = 0; + let Inst{11-15} = 0; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +// E-4 VXR-Form +class VXRForm_1<bits<10> xo, dag OL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bit RC = 0; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21} = RC; + let Inst{22-31} = xo; +} + +//===----------------------------------------------------------------------===// +class Pseudo<dag OL, string asmstr, list<dag> pattern> + : I<0, OL, asmstr, NoItinerary> { + let PPC64 = 0; + let Pattern = pattern; + let Inst{31-0} = 0; +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp new file mode 100644 index 0000000..d7ee5ed --- /dev/null +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -0,0 +1,303 @@ +//===- PPCInstrInfo.cpp - PowerPC32 Instruction Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPCPredicates.h" +#include "PPCGenInstrInfo.inc" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +using namespace llvm; + +PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm) + : TargetInstrInfo(PPCInsts, sizeof(PPCInsts)/sizeof(PPCInsts[0])), TM(tm), + RI(*TM.getSubtargetImpl(), *this) {} + +/// getPointerRegClass - Return the register class to use to hold pointers. +/// This is used for addressing modes. +const TargetRegisterClass *PPCInstrInfo::getPointerRegClass() const { + if (TM.getSubtargetImpl()->isPPC64()) + return &PPC::G8RCRegClass; + else + return &PPC::GPRCRegClass; +} + + +bool PPCInstrInfo::isMoveInstr(const MachineInstr& MI, + unsigned& sourceReg, + unsigned& destReg) const { + MachineOpCode oc = MI.getOpcode(); + if (oc == PPC::OR || oc == PPC::OR8 || oc == PPC::VOR || + oc == PPC::OR4To8 || oc == PPC::OR8To4) { // or r1, r2, r2 + assert(MI.getNumOperands() >= 3 && + MI.getOperand(0).isRegister() && + MI.getOperand(1).isRegister() && + MI.getOperand(2).isRegister() && + "invalid PPC OR instruction!"); + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + sourceReg = MI.getOperand(1).getReg(); + destReg = MI.getOperand(0).getReg(); + return true; + } + } else if (oc == PPC::ADDI) { // addi r1, r2, 0 + assert(MI.getNumOperands() >= 3 && + MI.getOperand(0).isRegister() && + MI.getOperand(2).isImmediate() && + "invalid PPC ADDI instruction!"); + if (MI.getOperand(1).isRegister() && MI.getOperand(2).getImmedValue()==0) { + sourceReg = MI.getOperand(1).getReg(); + destReg = MI.getOperand(0).getReg(); + return true; + } + } else if (oc == PPC::ORI) { // ori r1, r2, 0 + assert(MI.getNumOperands() >= 3 && + MI.getOperand(0).isRegister() && + MI.getOperand(1).isRegister() && + MI.getOperand(2).isImmediate() && + "invalid PPC ORI instruction!"); + if (MI.getOperand(2).getImmedValue()==0) { + 
sourceReg = MI.getOperand(1).getReg(); + destReg = MI.getOperand(0).getReg(); + return true; + } + } else if (oc == PPC::FMRS || oc == PPC::FMRD || + oc == PPC::FMRSD) { // fmr r1, r2 + assert(MI.getNumOperands() >= 2 && + MI.getOperand(0).isRegister() && + MI.getOperand(1).isRegister() && + "invalid PPC FMR instruction"); + sourceReg = MI.getOperand(1).getReg(); + destReg = MI.getOperand(0).getReg(); + return true; + } else if (oc == PPC::MCRF) { // mcrf cr1, cr2 + assert(MI.getNumOperands() >= 2 && + MI.getOperand(0).isRegister() && + MI.getOperand(1).isRegister() && + "invalid PPC MCRF instruction"); + sourceReg = MI.getOperand(1).getReg(); + destReg = MI.getOperand(0).getReg(); + return true; + } + return false; +} + +unsigned PPCInstrInfo::isLoadFromStackSlot(MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case PPC::LD: + case PPC::LWZ: + case PPC::LFS: + case PPC::LFD: + if (MI->getOperand(1).isImmediate() && !MI->getOperand(1).getImmedValue() && + MI->getOperand(2).isFrameIndex()) { + FrameIndex = MI->getOperand(2).getFrameIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +unsigned PPCInstrInfo::isStoreToStackSlot(MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case PPC::STD: + case PPC::STW: + case PPC::STFS: + case PPC::STFD: + if (MI->getOperand(1).isImmediate() && !MI->getOperand(1).getImmedValue() && + MI->getOperand(2).isFrameIndex()) { + FrameIndex = MI->getOperand(2).getFrameIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +// commuteInstruction - We can commute rlwimi instructions, but only if the +// rotate amt is zero. We also have to munge the immediates a bit. +MachineInstr *PPCInstrInfo::commuteInstruction(MachineInstr *MI) const { + // Normal instructions can be commuted the obvious way. 
+ if (MI->getOpcode() != PPC::RLWIMI) + return TargetInstrInfo::commuteInstruction(MI); + + // Cannot commute if it has a non-zero rotate count. + if (MI->getOperand(3).getImmedValue() != 0) + return 0; + + // If we have a zero rotate count, we have: + // M = mask(MB,ME) + // Op0 = (Op1 & ~M) | (Op2 & M) + // Change this to: + // M = mask((ME+1)&31, (MB-1)&31) + // Op0 = (Op2 & ~M) | (Op1 & M) + + // Swap op1/op2 + unsigned Reg1 = MI->getOperand(1).getReg(); + unsigned Reg2 = MI->getOperand(2).getReg(); + bool Reg1IsKill = MI->getOperand(1).isKill(); + bool Reg2IsKill = MI->getOperand(2).isKill(); + MI->getOperand(2).setReg(Reg1); + MI->getOperand(1).setReg(Reg2); + if (Reg1IsKill) + MI->getOperand(2).setIsKill(); + else + MI->getOperand(2).unsetIsKill(); + if (Reg2IsKill) + MI->getOperand(1).setIsKill(); + else + MI->getOperand(1).unsetIsKill(); + + // Swap the mask around. + unsigned MB = MI->getOperand(4).getImmedValue(); + unsigned ME = MI->getOperand(5).getImmedValue(); + MI->getOperand(4).setImmedValue((ME+1) & 31); + MI->getOperand(5).setImmedValue((MB-1) & 31); + return MI; +} + +void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + BuildMI(MBB, MI, get(PPC::NOP)); +} + + +// Branch analysis. +bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + std::vector<MachineOperand> &Cond) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. 
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (LastInst->getOpcode() == PPC::B) { + TBB = LastInst->getOperand(0).getMachineBasicBlock(); + return false; + } else if (LastInst->getOpcode() == PPC::BCC) { + // Block ends with fall-through condbranch. + TBB = LastInst->getOperand(2).getMachineBasicBlock(); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); + return false; + } + // Otherwise, don't know what this is. + return true; + } + + // Get the instruction before it if it's a terminator. + MachineInstr *SecondLastInst = I; + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && + isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with PPC::B and PPC:BCC, handle it. + if (SecondLastInst->getOpcode() == PPC::BCC && + LastInst->getOpcode() == PPC::B) { + TBB = SecondLastInst->getOperand(2).getMachineBasicBlock(); + Cond.push_back(SecondLastInst->getOperand(0)); + Cond.push_back(SecondLastInst->getOperand(1)); + FBB = LastInst->getOperand(0).getMachineBasicBlock(); + return false; + } + + // If the block ends with two PPC:Bs, handle it. The second one is not + // executed, so remove it. + if (SecondLastInst->getOpcode() == PPC::B && + LastInst->getOpcode() == PPC::B) { + TBB = SecondLastInst->getOperand(0).getMachineBasicBlock(); + I = LastInst; + I->eraseFromParent(); + return false; + } + + // Otherwise, can't handle this. + return true; +} + +unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (I->getOpcode() != PPC::BCC) + return 1; + + // Remove the branch. 
+ I->eraseFromParent(); + return 2; +} + +unsigned +PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "PPC branch conditions have two components!"); + + // One-way branch. + if (FBB == 0) { + if (Cond.empty()) // Unconditional branch + BuildMI(&MBB, get(PPC::B)).addMBB(TBB); + else // Conditional branch + BuildMI(&MBB, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + return 1; + } + + // Two-way Conditional Branch. + BuildMI(&MBB, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB); + BuildMI(&MBB, get(PPC::B)).addMBB(FBB); + return 2; +} + +bool PPCInstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const { + if (MBB.empty()) return false; + + switch (MBB.back().getOpcode()) { + case PPC::BLR: // Return. + case PPC::B: // Uncond branch. + case PPC::BCTR: // Indirect branch. + return true; + default: return false; + } +} + +bool PPCInstrInfo:: +ReverseBranchCondition(std::vector<MachineOperand> &Cond) const { + assert(Cond.size() == 2 && "Invalid PPC branch opcode!"); + // Leave the CR# the same, but invert the condition. + Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm())); + return false; +} diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h new file mode 100644 index 0000000..498a8e5 --- /dev/null +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -0,0 +1,112 @@ +//===- PPCInstrInfo.h - PowerPC Instruction Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC32_INSTRUCTIONINFO_H +#define POWERPC32_INSTRUCTIONINFO_H + +#include "PPC.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "PPCRegisterInfo.h" + +namespace llvm { + +/// PPCII - This namespace holds all of the PowerPC target-specific +/// per-instruction flags. These must match the corresponding definitions in +/// PPC.td and PPCInstrFormats.td. +namespace PPCII { +enum { + // PPC970 Instruction Flags. These flags describe the characteristics of the + // PowerPC 970 (aka G5) dispatch groups and how they are formed out of + // raw machine instructions. + + /// PPC970_First - This instruction starts a new dispatch group, so it will + /// always be the first one in the group. + PPC970_First = 0x1, + + /// PPC970_Single - This instruction starts a new dispatch group and + /// terminates it, so it will be the sole instruction in the group. + PPC970_Single = 0x2, + + /// PPC970_Cracked - This instruction is cracked into two pieces, requiring + /// two dispatch pipes to be available to issue. + PPC970_Cracked = 0x4, + + /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that + /// an instruction is issued to. + PPC970_Shift = 3, + PPC970_Mask = 0x07 << PPC970_Shift +}; +enum PPC970_Unit { + /// These are the various PPC970 execution unit pipelines. Each instruction + /// is one of these. 
+ PPC970_Pseudo = 0 << PPC970_Shift, // Pseudo instruction + PPC970_FXU = 1 << PPC970_Shift, // Fixed Point (aka Integer/ALU) Unit + PPC970_LSU = 2 << PPC970_Shift, // Load Store Unit + PPC970_FPU = 3 << PPC970_Shift, // Floating Point Unit + PPC970_CRU = 4 << PPC970_Shift, // Control Register Unit + PPC970_VALU = 5 << PPC970_Shift, // Vector ALU + PPC970_VPERM = 6 << PPC970_Shift, // Vector Permute Unit + PPC970_BRU = 7 << PPC970_Shift // Branch Unit +}; +} + + +class PPCInstrInfo : public TargetInstrInfo { + PPCTargetMachine &TM; + const PPCRegisterInfo RI; +public: + PPCInstrInfo(PPCTargetMachine &TM); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const MRegisterInfo &getRegisterInfo() const { return RI; } + + /// getPointerRegClass - Return the register class to use to hold pointers. + /// This is used for addressing modes. + virtual const TargetRegisterClass *getPointerRegClass() const; + + // Return true if the instruction is a register to register move and + // leave the source and dest operands in the passed parameters. + // + virtual bool isMoveInstr(const MachineInstr& MI, + unsigned& sourceReg, + unsigned& destReg) const; + + unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const; + + // commuteInstruction - We can commute rlwimi instructions, but only if the + // rotate amt is zero. We also have to munge the immediates a bit. + virtual MachineInstr *commuteInstruction(MachineInstr *MI) const; + + virtual void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + + + // Branch analysis. 
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + std::vector<MachineOperand> &Cond) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond) const; + virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const; + virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const; +}; + +} + +#endif diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td new file mode 100644 index 0000000..fe18978 --- /dev/null +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -0,0 +1,1164 @@ +//===- PPCInstrInfo.td - The PowerPC Instruction Set -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the subset of the 32-bit PowerPC instruction set, as used +// by the PowerPC instruction selector. +// +//===----------------------------------------------------------------------===// + +include "PPCInstrFormats.td" + +//===----------------------------------------------------------------------===// +// PowerPC specific type constraints. 
+// +def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx + SDTCisVT<0, f64>, SDTCisPtrTy<1> +]>; +def SDT_PPCShiftOp : SDTypeProfile<1, 2, [ // PPCshl, PPCsra, PPCsrl + SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32> +]>; +def SDT_PPCCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>; + +def SDT_PPCvperm : SDTypeProfile<1, 3, [ + SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2> +]>; + +def SDT_PPCvcmp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32> +]>; + +def SDT_PPCcondbr : SDTypeProfile<0, 3, [ + SDTCisVT<0, i32>, SDTCisVT<2, OtherVT> +]>; + +def SDT_PPClbrx : SDTypeProfile<1, 3, [ + SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT> +]>; +def SDT_PPCstbrx : SDTypeProfile<0, 4, [ + SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT> +]>; + +//===----------------------------------------------------------------------===// +// PowerPC specific DAG Nodes. +// + +def PPCfcfid : SDNode<"PPCISD::FCFID" , SDTFPUnaryOp, []>; +def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>; +def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; +def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, [SDNPHasChain]>; + +def PPCfsel : SDNode<"PPCISD::FSEL", + // Type constraint for fsel. + SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; + +def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; +def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; +def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>; +def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>; + +def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; + +// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift +// amounts. These nodes are generated by the multi-precision shift code. 
+def PPCsrl : SDNode<"PPCISD::SRL" , SDT_PPCShiftOp>; +def PPCsra : SDNode<"PPCISD::SRA" , SDT_PPCShiftOp>; +def PPCshl : SDNode<"PPCISD::SHL" , SDT_PPCShiftOp>; + +def PPCextsw_32 : SDNode<"PPCISD::EXTSW_32" , SDTIntUnaryOp>; +def PPCstd_32 : SDNode<"PPCISD::STD_32" , SDTStore, [SDNPHasChain]>; + +// These are target-independent nodes, but have target-specific formats. +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeq, + [SDNPHasChain, SDNPOutFlag]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeq, + [SDNPHasChain, SDNPOutFlag]>; + +def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>; +def PPCcall_Macho : SDNode<"PPCISD::CALL_Macho", SDT_PPCCall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; +def PPCcall_ELF : SDNode<"PPCISD::CALL_ELF", SDT_PPCCall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; +def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; +def PPCbctrl_Macho : SDNode<"PPCISD::BCTRL_Macho", SDTRet, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + +def PPCbctrl_ELF : SDNode<"PPCISD::BCTRL_ELF", SDTRet, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + +def retflag : SDNode<"PPCISD::RET_FLAG", SDTRet, + [SDNPHasChain, SDNPOptInFlag]>; + +def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; +def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutFlag]>; + +def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, + [SDNPHasChain, SDNPOptInFlag]>; + +def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, [SDNPHasChain]>; +def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain]>; + +// Instructions to support dynamic alloca. +def SDTDynOp : SDTypeProfile<1, 2, []>; +def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// PowerPC specific transformation functions and pattern fragments. 
+// + +def SHL32 : SDNodeXForm<imm, [{ + // Transformation function: 31 - imm + return getI32Imm(31 - N->getValue()); +}]>; + +def SRL32 : SDNodeXForm<imm, [{ + // Transformation function: 32 - imm + return N->getValue() ? getI32Imm(32 - N->getValue()) : getI32Imm(0); +}]>; + +def LO16 : SDNodeXForm<imm, [{ + // Transformation function: get the low 16 bits. + return getI32Imm((unsigned short)N->getValue()); +}]>; + +def HI16 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + return getI32Imm((unsigned)N->getValue() >> 16); +}]>; + +def HA16 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + signed int Val = N->getValue(); + return getI32Imm((Val - (signed short)Val) >> 16); +}]>; +def MB : SDNodeXForm<imm, [{ + // Transformation function: get the start bit of a mask + unsigned mb, me; + (void)isRunOfOnes((unsigned)N->getValue(), mb, me); + return getI32Imm(mb); +}]>; + +def ME : SDNodeXForm<imm, [{ + // Transformation function: get the end bit of a mask + unsigned mb, me; + (void)isRunOfOnes((unsigned)N->getValue(), mb, me); + return getI32Imm(me); +}]>; +def maskimm32 : PatLeaf<(imm), [{ + // maskImm predicate - True if immediate is a run of ones. + unsigned mb, me; + if (N->getValueType(0) == MVT::i32) + return isRunOfOnes((unsigned)N->getValue(), mb, me); + else + return false; +}]>; + +def immSExt16 : PatLeaf<(imm), [{ + // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended + // field. Used by instructions like 'addi'. + if (N->getValueType(0) == MVT::i32) + return (int32_t)N->getValue() == (short)N->getValue(); + else + return (int64_t)N->getValue() == (short)N->getValue(); +}]>; +def immZExt16 : PatLeaf<(imm), [{ + // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended + // field. Used by instructions like 'ori'. 
+ return (uint64_t)N->getValue() == (unsigned short)N->getValue(); +}], LO16>; + +// imm16Shifted* - These match immediates where the low 16-bits are zero. There +// are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are +// identical in 32-bit mode, but in 64-bit mode, they return true if the +// immediate fits into a sign/zero extended 32-bit immediate (with the low bits +// clear). +def imm16ShiftedZExt : PatLeaf<(imm), [{ + // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the + // immediate are set. Used by instructions like 'xoris'. + return (N->getValue() & ~uint64_t(0xFFFF0000)) == 0; +}], HI16>; + +def imm16ShiftedSExt : PatLeaf<(imm), [{ + // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the + // immediate are set. Used by instructions like 'addis'. Identical to + // imm16ShiftedZExt in 32-bit mode. + if (N->getValue() & 0xFFFF) return false; + if (N->getValueType(0) == MVT::i32) + return true; + // For 64-bit, make sure it is sext right. + return N->getValue() == (uint64_t)(int)N->getValue(); +}], HI16>; + + +//===----------------------------------------------------------------------===// +// PowerPC Flag Definitions. + +class isPPC64 { bit PPC64 = 1; } +class isDOT { + list<Register> Defs = [CR0]; + bit RC = 1; +} + +class RegConstraint<string C> { + string Constraints = C; +} +class NoEncode<string E> { + string DisableEncoding = E; +} + + +//===----------------------------------------------------------------------===// +// PowerPC Operand Definitions. + +def s5imm : Operand<i32> { + let PrintMethod = "printS5ImmOperand"; +} +def u5imm : Operand<i32> { + let PrintMethod = "printU5ImmOperand"; +} +def u6imm : Operand<i32> { + let PrintMethod = "printU6ImmOperand"; +} +def s16imm : Operand<i32> { + let PrintMethod = "printS16ImmOperand"; +} +def u16imm : Operand<i32> { + let PrintMethod = "printU16ImmOperand"; +} +def s16immX4 : Operand<i32> { // Multiply imm by 4 before printing. 
+ let PrintMethod = "printS16X4ImmOperand"; +} +def target : Operand<OtherVT> { + let PrintMethod = "printBranchOperand"; +} +def calltarget : Operand<iPTR> { + let PrintMethod = "printCallOperand"; +} +def aaddr : Operand<iPTR> { + let PrintMethod = "printAbsAddrOperand"; +} +def piclabel: Operand<iPTR> { + let PrintMethod = "printPICLabel"; +} +def symbolHi: Operand<i32> { + let PrintMethod = "printSymbolHi"; +} +def symbolLo: Operand<i32> { + let PrintMethod = "printSymbolLo"; +} +def crbitm: Operand<i8> { + let PrintMethod = "printcrbitm"; +} +// Address operands +def memri : Operand<iPTR> { + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg); +} +def memrr : Operand<iPTR> { + let PrintMethod = "printMemRegReg"; + let MIOperandInfo = (ops ptr_rc, ptr_rc); +} +def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits. + let PrintMethod = "printMemRegImmShifted"; + let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg); +} + +// PowerPC Predicate operand. 20 = (0<<5)|20 = always, CR0 is a dummy reg +// that doesn't matter. +def pred : PredicateOperand<OtherVT, (ops imm, CRRC), + (ops (i32 20), CR0)> { + let PrintMethod = "printPredicateOperand"; +} + +// Define PowerPC specific addressing mode. +def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; +def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; +def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>; +def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std" + +/// This is just the offset part of iaddr, used for preinc. +def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>; + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Predicate Definitions. +def FPContractions : Predicate<"!NoExcessFPPrecision">; + + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Definitions. 
+ +// Pseudo-instructions: + +let hasCtrlDep = 1 in { +def ADJCALLSTACKDOWN : Pseudo<(ops u16imm:$amt), + "${:comment} ADJCALLSTACKDOWN", + [(callseq_start imm:$amt)]>, Imp<[R1],[R1]>; +def ADJCALLSTACKUP : Pseudo<(ops u16imm:$amt), + "${:comment} ADJCALLSTACKUP", + [(callseq_end imm:$amt)]>, Imp<[R1],[R1]>; + +def UPDATE_VRSAVE : Pseudo<(ops GPRC:$rD, GPRC:$rS), + "UPDATE_VRSAVE $rD, $rS", []>; +} + +def DYNALLOC : Pseudo<(ops GPRC:$result, GPRC:$negsize, memri:$fpsi), + "${:comment} DYNALLOC $result, $negsize, $fpsi", + [(set GPRC:$result, + (PPCdynalloc GPRC:$negsize, iaddr:$fpsi))]>, + Imp<[R1],[R1]>; + +def IMPLICIT_DEF_GPRC: Pseudo<(ops GPRC:$rD),"${:comment}IMPLICIT_DEF_GPRC $rD", + [(set GPRC:$rD, (undef))]>; +def IMPLICIT_DEF_F8 : Pseudo<(ops F8RC:$rD), "${:comment} IMPLICIT_DEF_F8 $rD", + [(set F8RC:$rD, (undef))]>; +def IMPLICIT_DEF_F4 : Pseudo<(ops F4RC:$rD), "${:comment} IMPLICIT_DEF_F4 $rD", + [(set F4RC:$rD, (undef))]>; + +// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded by the +// scheduler into a branch sequence. +let usesCustomDAGSchedInserter = 1, // Expanded by the scheduler. 
+ PPC970_Single = 1 in { + def SELECT_CC_I4 : Pseudo<(ops GPRC:$dst, CRRC:$cond, GPRC:$T, GPRC:$F, + i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!", + []>; + def SELECT_CC_I8 : Pseudo<(ops G8RC:$dst, CRRC:$cond, G8RC:$T, G8RC:$F, + i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!", + []>; + def SELECT_CC_F4 : Pseudo<(ops F4RC:$dst, CRRC:$cond, F4RC:$T, F4RC:$F, + i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!", + []>; + def SELECT_CC_F8 : Pseudo<(ops F8RC:$dst, CRRC:$cond, F8RC:$T, F8RC:$F, + i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!", + []>; + def SELECT_CC_VRRC: Pseudo<(ops VRRC:$dst, CRRC:$cond, VRRC:$T, VRRC:$F, + i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!", + []>; +} + +let isTerminator = 1, isBarrier = 1, noResults = 1, PPC970_Unit = 7 in { + let isReturn = 1 in + def BLR : XLForm_2_br<19, 16, 0, (ops pred:$p), + "b${p:cc}lr ${p:reg}", BrB, + [(retflag)]>; + def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (ops), "bctr", BrB, []>; +} + + + +let Defs = [LR] in + def MovePCtoLR : Pseudo<(ops piclabel:$label), "bl $label", []>, + PPC970_Unit_BRU; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, + noResults = 1, PPC970_Unit = 7 in { + let isBarrier = 1 in { + def B : IForm<18, 0, 0, (ops target:$dst), + "b $dst", BrB, + [(br bb:$dst)]>; + } + + // BCC represents an arbitrary conditional branch on a predicate. + // FIXME: should be able to write a pattern for PPCcondbranch, but can't use + // a two-value operand where a dag node expects two operands. :( + def BCC : BForm<16, 0, 0, (ops pred:$cond, target:$dst), + "b${cond:cc} ${cond:reg}, $dst" + /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>; +} + +// Macho ABI Calls. +let isCall = 1, noResults = 1, PPC970_Unit = 7, + // All calls clobber the non-callee saved registers... 
+ Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, + F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13, + V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, + LR,CTR, + CR0,CR1,CR5,CR6,CR7] in { + // Convenient aliases for call instructions + def BL_Macho : IForm<18, 0, 1, + (ops calltarget:$func, variable_ops), + "bl $func", BrB, []>; // See Pat patterns below. + def BLA_Macho : IForm<18, 1, 1, + (ops aaddr:$func, variable_ops), + "bla $func", BrB, [(PPCcall_Macho (i32 imm:$func))]>; + def BCTRL_Macho : XLForm_2_ext<19, 528, 20, 0, 1, + (ops variable_ops), + "bctrl", BrB, + [(PPCbctrl_Macho)]>; +} + +// ELF ABI Calls. +let isCall = 1, noResults = 1, PPC970_Unit = 7, + // All calls clobber the non-callee saved registers... + Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, + F0,F1,F2,F3,F4,F5,F6,F7,F8, + V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, + LR,CTR, + CR0,CR1,CR5,CR6,CR7] in { + // Convenient aliases for call instructions + def BL_ELF : IForm<18, 0, 1, + (ops calltarget:$func, variable_ops), + "bl $func", BrB, []>; // See Pat patterns below. + def BLA_ELF : IForm<18, 1, 1, + (ops aaddr:$func, variable_ops), + "bla $func", BrB, + [(PPCcall_ELF (i32 imm:$func))]>; + def BCTRL_ELF : XLForm_2_ext<19, 528, 20, 0, 1, + (ops variable_ops), + "bctrl", BrB, + [(PPCbctrl_ELF)]>; +} + +// DCB* instructions. 
+def DCBA : DCB_Form<758, 0, (ops memrr:$dst), + "dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBF : DCB_Form<86, 0, (ops memrr:$dst), + "dcbf $dst", LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBI : DCB_Form<470, 0, (ops memrr:$dst), + "dcbi $dst", LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBST : DCB_Form<54, 0, (ops memrr:$dst), + "dcbst $dst", LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBT : DCB_Form<278, 0, (ops memrr:$dst), + "dcbt $dst", LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBTST : DCB_Form<246, 0, (ops memrr:$dst), + "dcbtst $dst", LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBZ : DCB_Form<1014, 0, (ops memrr:$dst), + "dcbz $dst", LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>, + PPC970_DGroup_Single; +def DCBZL : DCB_Form<1014, 1, (ops memrr:$dst), + "dcbzl $dst", LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>, + PPC970_DGroup_Single; + +//===----------------------------------------------------------------------===// +// PPC32 Load Instructions. +// + +// Unindexed (r+i) Loads. 
+let isLoad = 1, PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (ops GPRC:$rD, memri:$src),
+                  "lbz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (ops GPRC:$rD, memri:$src),
+                  "lha $rD, $src", LdStLHA,
+                  [(set GPRC:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (ops GPRC:$rD, memri:$src),
+                  "lhz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (ops GPRC:$rD, memri:$src),
+                  "lwz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (ops F4RC:$rD, memri:$src),
+                  "lfs $rD, $src", LdStLFDU,
+                  [(set F4RC:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (ops F8RC:$rD, memri:$src),
+                  "lfd $rD, $src", LdStLFD,
+                  [(set F8RC:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).
+def LBZU : DForm_1<35, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lbzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lhau $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lhzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (ops GPRC:$rD, ptr_rc:$ea_result, memri:$addr),
+                   "lwzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSU : DForm_1<49, (ops F4RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                  "lfsu $rD, $addr", LdStLFDU,  // was "lfs": update form must print lfsu
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                  NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (ops F8RC:$rD, ptr_rc:$ea_result, memri:$addr),
+                  "lfdu $rD, $addr", LdStLFD,  // was "lfd": update form must print lfdu
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                  NoEncode<"$ea_result">;
+}
+
+// Indexed (r+r) Loads.
+// +let isLoad = 1, PPC970_Unit = 2 in { +def LBZX : XForm_1<31, 87, (ops GPRC:$rD, memrr:$src), + "lbzx $rD, $src", LdStGeneral, + [(set GPRC:$rD, (zextloadi8 xaddr:$src))]>; +def LHAX : XForm_1<31, 343, (ops GPRC:$rD, memrr:$src), + "lhax $rD, $src", LdStLHA, + [(set GPRC:$rD, (sextloadi16 xaddr:$src))]>, + PPC970_DGroup_Cracked; +def LHZX : XForm_1<31, 279, (ops GPRC:$rD, memrr:$src), + "lhzx $rD, $src", LdStGeneral, + [(set GPRC:$rD, (zextloadi16 xaddr:$src))]>; +def LWZX : XForm_1<31, 23, (ops GPRC:$rD, memrr:$src), + "lwzx $rD, $src", LdStGeneral, + [(set GPRC:$rD, (load xaddr:$src))]>; + + +def LHBRX : XForm_1<31, 790, (ops GPRC:$rD, memrr:$src), + "lhbrx $rD, $src", LdStGeneral, + [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i16))]>; +def LWBRX : XForm_1<31, 534, (ops GPRC:$rD, memrr:$src), + "lwbrx $rD, $src", LdStGeneral, + [(set GPRC:$rD, (PPClbrx xoaddr:$src, srcvalue:$sv, i32))]>; + +def LFSX : XForm_25<31, 535, (ops F4RC:$frD, memrr:$src), + "lfsx $frD, $src", LdStLFDU, + [(set F4RC:$frD, (load xaddr:$src))]>; +def LFDX : XForm_25<31, 599, (ops F8RC:$frD, memrr:$src), + "lfdx $frD, $src", LdStLFDU, + [(set F8RC:$frD, (load xaddr:$src))]>; +} + +//===----------------------------------------------------------------------===// +// PPC32 Store Instructions. +// + +// Unindexed (r+i) Stores. 
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+def STB  : DForm_1<38, (ops GPRC:$rS, memri:$src),
+                   "stb $rS, $src", LdStGeneral,
+                   [(truncstorei8 GPRC:$rS, iaddr:$src)]>;
+def STH  : DForm_1<44, (ops GPRC:$rS, memri:$src),
+                   "sth $rS, $src", LdStGeneral,
+                   [(truncstorei16 GPRC:$rS, iaddr:$src)]>;
+def STW  : DForm_1<36, (ops GPRC:$rS, memri:$src),
+                   "stw $rS, $src", LdStGeneral,
+                   [(store GPRC:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (ops F4RC:$rS, memri:$dst),
+                   "stfs $rS, $dst", LdStUX,
+                   [(store F4RC:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (ops F8RC:$rS, memri:$dst),
+                   "stfd $rS, $dst", LdStUX,
+                   [(store F8RC:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
+let isStore = 1, PPC970_Unit = 2 in {
+def STBU  : DForm_1<39, (ops ptr_rc:$ea_res, GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg,
+                                         iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU  : DForm_1<45, (ops ptr_rc:$ea_res, GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg,
+                                          iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU  : DForm_1<37, (ops ptr_rc:$ea_res, GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg,
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STFSU : DForm_1<53, (ops ptr_rc:$ea_res, F4RC:$rS,  // opcode was 37 (stwu); stfsu is 53
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stfsu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store F4RC:$rS, ptr_rc:$ptrreg,
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STFDU : DForm_1<55, (ops ptr_rc:$ea_res, F8RC:$rS,  // opcode was 37 (stwu); stfdu is 55
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stfdu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg,
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+
+// Indexed (r+r) Stores.
+//
+let isStore = 1, noResults = 1, PPC970_Unit = 2 in {
+def STBX  : XForm_8<31, 215, (ops GPRC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStGeneral,
+                   [(truncstorei8 GPRC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX  : XForm_8<31, 407, (ops GPRC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStGeneral,
+                   [(truncstorei16 GPRC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX  : XForm_8<31, 151, (ops GPRC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStGeneral,
+                   [(store GPRC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWUX : XForm_8<31, 183, (ops GPRC:$rS, GPRC:$rA, GPRC:$rB),
+                   "stwux $rS, $rA, $rB", LdStGeneral,
+                   []>;
+def STHBRX: XForm_8<31, 918, (ops GPRC:$rS, memrr:$dst),
+                   "sthbrx $rS, $dst", LdStGeneral,
+                   [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i16)]>,
+                   PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (ops GPRC:$rS, memrr:$dst),
+                   "stwbrx $rS, $dst", LdStGeneral,
+                   [(PPCstbrx GPRC:$rS, xoaddr:$dst, srcvalue:$dummy, i32)]>,
+                   PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (ops F8RC:$frS, memrr:$dst),
+                    "stfiwx $frS, $dst", LdStUX,
+                    [(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
+def STFSX : XForm_28<31, 663, (ops F4RC:$frS, memrr:$dst),
+                    "stfsx $frS, $dst", LdStUX,
+                    [(store F4RC:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (ops F8RC:$frS, memrr:$dst),
+                    "stfdx $frS, $dst", LdStUX,
+                    [(store F8RC:$frS, xaddr:$dst)]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ADDI : DForm_2<14, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm), + "addi $rD, $rA, $imm", IntGeneral, + [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>; +def ADDIC : DForm_2<12, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm), + "addic $rD, $rA, $imm", IntGeneral, + [(set GPRC:$rD, (addc GPRC:$rA, immSExt16:$imm))]>, + PPC970_DGroup_Cracked; +def ADDICo : DForm_2<13, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm), + "addic. $rD, $rA, $imm", IntGeneral, + []>; +def ADDIS : DForm_2<15, (ops GPRC:$rD, GPRC:$rA, symbolHi:$imm), + "addis $rD, $rA, $imm", IntGeneral, + [(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>; +def LA : DForm_2<14, (ops GPRC:$rD, GPRC:$rA, symbolLo:$sym), + "la $rD, $sym($rA)", IntGeneral, + [(set GPRC:$rD, (add GPRC:$rA, + (PPClo tglobaladdr:$sym, 0)))]>; +def MULLI : DForm_2< 7, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm), + "mulli $rD, $rA, $imm", IntMulLI, + [(set GPRC:$rD, (mul GPRC:$rA, immSExt16:$imm))]>; +def SUBFIC : DForm_2< 8, (ops GPRC:$rD, GPRC:$rA, s16imm:$imm), + "subfic $rD, $rA, $imm", IntGeneral, + [(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>; +def LI : DForm_2_r0<14, (ops GPRC:$rD, symbolLo:$imm), + "li $rD, $imm", IntGeneral, + [(set GPRC:$rD, immSExt16:$imm)]>; +def LIS : DForm_2_r0<15, (ops GPRC:$rD, symbolHi:$imm), + "lis $rD, $imm", IntGeneral, + [(set GPRC:$rD, imm16ShiftedSExt:$imm)]>; +} + +let PPC970_Unit = 1 in { // FXU Operations. +def ANDIo : DForm_4<28, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2), + "andi. $dst, $src1, $src2", IntGeneral, + [(set GPRC:$dst, (and GPRC:$src1, immZExt16:$src2))]>, + isDOT; +def ANDISo : DForm_4<29, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2), + "andis. 
$dst, $src1, $src2", IntGeneral, + [(set GPRC:$dst, (and GPRC:$src1,imm16ShiftedZExt:$src2))]>, + isDOT; +def ORI : DForm_4<24, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2), + "ori $dst, $src1, $src2", IntGeneral, + [(set GPRC:$dst, (or GPRC:$src1, immZExt16:$src2))]>; +def ORIS : DForm_4<25, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2), + "oris $dst, $src1, $src2", IntGeneral, + [(set GPRC:$dst, (or GPRC:$src1, imm16ShiftedZExt:$src2))]>; +def XORI : DForm_4<26, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2), + "xori $dst, $src1, $src2", IntGeneral, + [(set GPRC:$dst, (xor GPRC:$src1, immZExt16:$src2))]>; +def XORIS : DForm_4<27, (ops GPRC:$dst, GPRC:$src1, u16imm:$src2), + "xoris $dst, $src1, $src2", IntGeneral, + [(set GPRC:$dst, (xor GPRC:$src1,imm16ShiftedZExt:$src2))]>; +def NOP : DForm_4_zero<24, (ops), "nop", IntGeneral, + []>; +def CMPWI : DForm_5_ext<11, (ops CRRC:$crD, GPRC:$rA, s16imm:$imm), + "cmpwi $crD, $rA, $imm", IntCompare>; +def CMPLWI : DForm_6_ext<10, (ops CRRC:$dst, GPRC:$src1, u16imm:$src2), + "cmplwi $dst, $src1, $src2", IntCompare>; +} + + +let PPC970_Unit = 1 in { // FXU Operations. 
+def NAND : XForm_6<31, 476, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "nand $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (not (and GPRC:$rS, GPRC:$rB)))]>; +def AND : XForm_6<31, 28, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "and $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (and GPRC:$rS, GPRC:$rB))]>; +def ANDC : XForm_6<31, 60, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "andc $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (and GPRC:$rS, (not GPRC:$rB)))]>; +def OR : XForm_6<31, 444, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "or $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (or GPRC:$rS, GPRC:$rB))]>; +def NOR : XForm_6<31, 124, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "nor $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (not (or GPRC:$rS, GPRC:$rB)))]>; +def ORC : XForm_6<31, 412, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "orc $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (or GPRC:$rS, (not GPRC:$rB)))]>; +def EQV : XForm_6<31, 284, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "eqv $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (not (xor GPRC:$rS, GPRC:$rB)))]>; +def XOR : XForm_6<31, 316, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "xor $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (xor GPRC:$rS, GPRC:$rB))]>; +def SLW : XForm_6<31, 24, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "slw $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (PPCshl GPRC:$rS, GPRC:$rB))]>; +def SRW : XForm_6<31, 536, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "srw $rA, $rS, $rB", IntGeneral, + [(set GPRC:$rA, (PPCsrl GPRC:$rS, GPRC:$rB))]>; +def SRAW : XForm_6<31, 792, (ops GPRC:$rA, GPRC:$rS, GPRC:$rB), + "sraw $rA, $rS, $rB", IntShift, + [(set GPRC:$rA, (PPCsra GPRC:$rS, GPRC:$rB))]>; +} + +let PPC970_Unit = 1 in { // FXU Operations. 
+def SRAWI : XForm_10<31, 824, (ops GPRC:$rA, GPRC:$rS, u5imm:$SH), + "srawi $rA, $rS, $SH", IntShift, + [(set GPRC:$rA, (sra GPRC:$rS, (i32 imm:$SH)))]>; +def CNTLZW : XForm_11<31, 26, (ops GPRC:$rA, GPRC:$rS), + "cntlzw $rA, $rS", IntGeneral, + [(set GPRC:$rA, (ctlz GPRC:$rS))]>; +def EXTSB : XForm_11<31, 954, (ops GPRC:$rA, GPRC:$rS), + "extsb $rA, $rS", IntGeneral, + [(set GPRC:$rA, (sext_inreg GPRC:$rS, i8))]>; +def EXTSH : XForm_11<31, 922, (ops GPRC:$rA, GPRC:$rS), + "extsh $rA, $rS", IntGeneral, + [(set GPRC:$rA, (sext_inreg GPRC:$rS, i16))]>; + +def CMPW : XForm_16_ext<31, 0, (ops CRRC:$crD, GPRC:$rA, GPRC:$rB), + "cmpw $crD, $rA, $rB", IntCompare>; +def CMPLW : XForm_16_ext<31, 32, (ops CRRC:$crD, GPRC:$rA, GPRC:$rB), + "cmplw $crD, $rA, $rB", IntCompare>; +} +let PPC970_Unit = 3 in { // FPU Operations. +//def FCMPO : XForm_17<63, 32, (ops CRRC:$crD, FPRC:$fA, FPRC:$fB), +// "fcmpo $crD, $fA, $fB", FPCompare>; +def FCMPUS : XForm_17<63, 0, (ops CRRC:$crD, F4RC:$fA, F4RC:$fB), + "fcmpu $crD, $fA, $fB", FPCompare>; +def FCMPUD : XForm_17<63, 0, (ops CRRC:$crD, F8RC:$fA, F8RC:$fB), + "fcmpu $crD, $fA, $fB", FPCompare>; + +def FCTIWZ : XForm_26<63, 15, (ops F8RC:$frD, F8RC:$frB), + "fctiwz $frD, $frB", FPGeneral, + [(set F8RC:$frD, (PPCfctiwz F8RC:$frB))]>; +def FRSP : XForm_26<63, 12, (ops F4RC:$frD, F8RC:$frB), + "frsp $frD, $frB", FPGeneral, + [(set F4RC:$frD, (fround F8RC:$frB))]>; +def FSQRT : XForm_26<63, 22, (ops F8RC:$frD, F8RC:$frB), + "fsqrt $frD, $frB", FPSqrt, + [(set F8RC:$frD, (fsqrt F8RC:$frB))]>; +def FSQRTS : XForm_26<59, 22, (ops F4RC:$frD, F4RC:$frB), + "fsqrts $frD, $frB", FPSqrt, + [(set F4RC:$frD, (fsqrt F4RC:$frB))]>; +} + +/// FMR is split into 3 versions, one for 4/8 byte FP, and one for extending. 
+/// +/// Note that these are defined as pseudo-ops on the PPC970 because they are +/// often coalesced away and we don't want the dispatch group builder to think +/// that they will fill slots (which could cause the load of a LSU reject to +/// sneak into a d-group with a store). +def FMRS : XForm_26<63, 72, (ops F4RC:$frD, F4RC:$frB), + "fmr $frD, $frB", FPGeneral, + []>, // (set F4RC:$frD, F4RC:$frB) + PPC970_Unit_Pseudo; +def FMRD : XForm_26<63, 72, (ops F8RC:$frD, F8RC:$frB), + "fmr $frD, $frB", FPGeneral, + []>, // (set F8RC:$frD, F8RC:$frB) + PPC970_Unit_Pseudo; +def FMRSD : XForm_26<63, 72, (ops F8RC:$frD, F4RC:$frB), + "fmr $frD, $frB", FPGeneral, + [(set F8RC:$frD, (fextend F4RC:$frB))]>, + PPC970_Unit_Pseudo; + +let PPC970_Unit = 3 in { // FPU Operations. +// These are artificially split into two different forms, for 4/8 byte FP. +def FABSS : XForm_26<63, 264, (ops F4RC:$frD, F4RC:$frB), + "fabs $frD, $frB", FPGeneral, + [(set F4RC:$frD, (fabs F4RC:$frB))]>; +def FABSD : XForm_26<63, 264, (ops F8RC:$frD, F8RC:$frB), + "fabs $frD, $frB", FPGeneral, + [(set F8RC:$frD, (fabs F8RC:$frB))]>; +def FNABSS : XForm_26<63, 136, (ops F4RC:$frD, F4RC:$frB), + "fnabs $frD, $frB", FPGeneral, + [(set F4RC:$frD, (fneg (fabs F4RC:$frB)))]>; +def FNABSD : XForm_26<63, 136, (ops F8RC:$frD, F8RC:$frB), + "fnabs $frD, $frB", FPGeneral, + [(set F8RC:$frD, (fneg (fabs F8RC:$frB)))]>; +def FNEGS : XForm_26<63, 40, (ops F4RC:$frD, F4RC:$frB), + "fneg $frD, $frB", FPGeneral, + [(set F4RC:$frD, (fneg F4RC:$frB))]>; +def FNEGD : XForm_26<63, 40, (ops F8RC:$frD, F8RC:$frB), + "fneg $frD, $frB", FPGeneral, + [(set F8RC:$frD, (fneg F8RC:$frB))]>; +} + + +// XL-Form instructions. condition register logical ops. 
+// +def MCRF : XLForm_3<19, 0, (ops CRRC:$BF, CRRC:$BFA), + "mcrf $BF, $BFA", BrMCR>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def CREQV : XLForm_1<19, 289, (ops CRRC:$CRD, CRRC:$CRA, CRRC:$CRB), + "creqv $CRD, $CRA, $CRB", BrCR, + []>; + +def SETCR : XLForm_1_ext<19, 289, (ops CRRC:$dst), + "creqv $dst, $dst, $dst", BrCR, + []>; + +// XFX-Form instructions. Instructions that deal with SPRs. +// +def MFCTR : XFXForm_1_ext<31, 339, 9, (ops GPRC:$rT), "mfctr $rT", SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +let Pattern = [(PPCmtctr GPRC:$rS)] in { +def MTCTR : XFXForm_7_ext<31, 467, 9, (ops GPRC:$rS), "mtctr $rS", SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +def MTLR : XFXForm_7_ext<31, 467, 8, (ops GPRC:$rS), "mtlr $rS", SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +def MFLR : XFXForm_1_ext<31, 339, 8, (ops GPRC:$rT), "mflr $rT", SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; + +// Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed like +// a GPR on the PPC970. As such, copies in and out have the same performance +// characteristics as an OR instruction. +def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (ops GPRC:$rS), + "mtspr 256, $rS", IntGeneral>, + PPC970_DGroup_Single, PPC970_Unit_FXU; +def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (ops GPRC:$rT), + "mfspr $rT, 256", IntGeneral>, + PPC970_DGroup_First, PPC970_Unit_FXU; + +def MTCRF : XFXForm_5<31, 144, (ops crbitm:$FXM, GPRC:$rS), + "mtcrf $FXM, $rS", BrMCRX>, + PPC970_MicroCode, PPC970_Unit_CRU; +def MFCR : XFXForm_3<31, 19, (ops GPRC:$rT), "mfcr $rT", SprMFCR>, + PPC970_MicroCode, PPC970_Unit_CRU; +def MFOCRF: XFXForm_5a<31, 19, (ops GPRC:$rT, crbitm:$FXM), + "mfcr $rT, $FXM", SprMFCR>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +let PPC970_Unit = 1 in { // FXU Operations. + +// XO-Form instructions. 
Arithmetic instructions that can set overflow bit +// +def ADD4 : XOForm_1<31, 266, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "add $rT, $rA, $rB", IntGeneral, + [(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>; +def ADDC : XOForm_1<31, 10, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "addc $rT, $rA, $rB", IntGeneral, + [(set GPRC:$rT, (addc GPRC:$rA, GPRC:$rB))]>, + PPC970_DGroup_Cracked; +def ADDE : XOForm_1<31, 138, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "adde $rT, $rA, $rB", IntGeneral, + [(set GPRC:$rT, (adde GPRC:$rA, GPRC:$rB))]>; +def DIVW : XOForm_1<31, 491, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "divw $rT, $rA, $rB", IntDivW, + [(set GPRC:$rT, (sdiv GPRC:$rA, GPRC:$rB))]>, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +def DIVWU : XOForm_1<31, 459, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "divwu $rT, $rA, $rB", IntDivW, + [(set GPRC:$rT, (udiv GPRC:$rA, GPRC:$rB))]>, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +def MULHW : XOForm_1<31, 75, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "mulhw $rT, $rA, $rB", IntMulHW, + [(set GPRC:$rT, (mulhs GPRC:$rA, GPRC:$rB))]>; +def MULHWU : XOForm_1<31, 11, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "mulhwu $rT, $rA, $rB", IntMulHWU, + [(set GPRC:$rT, (mulhu GPRC:$rA, GPRC:$rB))]>; +def MULLW : XOForm_1<31, 235, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "mullw $rT, $rA, $rB", IntMulHW, + [(set GPRC:$rT, (mul GPRC:$rA, GPRC:$rB))]>; +def SUBF : XOForm_1<31, 40, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "subf $rT, $rA, $rB", IntGeneral, + [(set GPRC:$rT, (sub GPRC:$rB, GPRC:$rA))]>; +def SUBFC : XOForm_1<31, 8, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "subfc $rT, $rA, $rB", IntGeneral, + [(set GPRC:$rT, (subc GPRC:$rB, GPRC:$rA))]>, + PPC970_DGroup_Cracked; +def SUBFE : XOForm_1<31, 136, 0, (ops GPRC:$rT, GPRC:$rA, GPRC:$rB), + "subfe $rT, $rA, $rB", IntGeneral, + [(set GPRC:$rT, (sube GPRC:$rB, GPRC:$rA))]>; +def ADDME : XOForm_3<31, 234, 0, (ops GPRC:$rT, GPRC:$rA), + "addme $rT, $rA", IntGeneral, + [(set GPRC:$rT, (adde GPRC:$rA, 
immAllOnes))]>; +def ADDZE : XOForm_3<31, 202, 0, (ops GPRC:$rT, GPRC:$rA), + "addze $rT, $rA", IntGeneral, + [(set GPRC:$rT, (adde GPRC:$rA, 0))]>; +def NEG : XOForm_3<31, 104, 0, (ops GPRC:$rT, GPRC:$rA), + "neg $rT, $rA", IntGeneral, + [(set GPRC:$rT, (ineg GPRC:$rA))]>; +def SUBFME : XOForm_3<31, 232, 0, (ops GPRC:$rT, GPRC:$rA), + "subfme $rT, $rA", IntGeneral, + [(set GPRC:$rT, (sube immAllOnes, GPRC:$rA))]>; +def SUBFZE : XOForm_3<31, 200, 0, (ops GPRC:$rT, GPRC:$rA), + "subfze $rT, $rA", IntGeneral, + [(set GPRC:$rT, (sube 0, GPRC:$rA))]>; +} + +// A-Form instructions. Most of the instructions executed in the FPU are of +// this type. +// +let PPC970_Unit = 3 in { // FPU Operations. +def FMADD : AForm_1<63, 29, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), + "fmadd $FRT, $FRA, $FRC, $FRB", FPFused, + [(set F8RC:$FRT, (fadd (fmul F8RC:$FRA, F8RC:$FRC), + F8RC:$FRB))]>, + Requires<[FPContractions]>; +def FMADDS : AForm_1<59, 29, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), + "fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral, + [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC), + F4RC:$FRB))]>, + Requires<[FPContractions]>; +def FMSUB : AForm_1<63, 28, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), + "fmsub $FRT, $FRA, $FRC, $FRB", FPFused, + [(set F8RC:$FRT, (fsub (fmul F8RC:$FRA, F8RC:$FRC), + F8RC:$FRB))]>, + Requires<[FPContractions]>; +def FMSUBS : AForm_1<59, 28, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), + "fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral, + [(set F4RC:$FRT, (fsub (fmul F4RC:$FRA, F4RC:$FRC), + F4RC:$FRB))]>, + Requires<[FPContractions]>; +def FNMADD : AForm_1<63, 31, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), + "fnmadd $FRT, $FRA, $FRC, $FRB", FPFused, + [(set F8RC:$FRT, (fneg (fadd (fmul F8RC:$FRA, F8RC:$FRC), + F8RC:$FRB)))]>, + Requires<[FPContractions]>; +def FNMADDS : AForm_1<59, 31, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), + "fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral, + [(set F4RC:$FRT, (fneg 
(fadd (fmul F4RC:$FRA, F4RC:$FRC), + F4RC:$FRB)))]>, + Requires<[FPContractions]>; +def FNMSUB : AForm_1<63, 30, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), + "fnmsub $FRT, $FRA, $FRC, $FRB", FPFused, + [(set F8RC:$FRT, (fneg (fsub (fmul F8RC:$FRA, F8RC:$FRC), + F8RC:$FRB)))]>, + Requires<[FPContractions]>; +def FNMSUBS : AForm_1<59, 30, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRC, F4RC:$FRB), + "fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral, + [(set F4RC:$FRT, (fneg (fsub (fmul F4RC:$FRA, F4RC:$FRC), + F4RC:$FRB)))]>, + Requires<[FPContractions]>; +// FSEL is artificially split into 4 and 8-byte forms for the result. To avoid +// having 4 of these, force the comparison to always be an 8-byte double (code +// should use an FMRSD if the input comparison value really wants to be a float) +// and 4/8 byte forms for the result and operand type.. +def FSELD : AForm_1<63, 23, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRC, F8RC:$FRB), + "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral, + [(set F8RC:$FRT, (PPCfsel F8RC:$FRA,F8RC:$FRC,F8RC:$FRB))]>; +def FSELS : AForm_1<63, 23, + (ops F4RC:$FRT, F8RC:$FRA, F4RC:$FRC, F4RC:$FRB), + "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral, + [(set F4RC:$FRT, (PPCfsel F8RC:$FRA,F4RC:$FRC,F4RC:$FRB))]>; +def FADD : AForm_2<63, 21, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB), + "fadd $FRT, $FRA, $FRB", FPGeneral, + [(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>; +def FADDS : AForm_2<59, 21, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB), + "fadds $FRT, $FRA, $FRB", FPGeneral, + [(set F4RC:$FRT, (fadd F4RC:$FRA, F4RC:$FRB))]>; +def FDIV : AForm_2<63, 18, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB), + "fdiv $FRT, $FRA, $FRB", FPDivD, + [(set F8RC:$FRT, (fdiv F8RC:$FRA, F8RC:$FRB))]>; +def FDIVS : AForm_2<59, 18, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB), + "fdivs $FRT, $FRA, $FRB", FPDivS, + [(set F4RC:$FRT, (fdiv F4RC:$FRA, F4RC:$FRB))]>; +def FMUL : AForm_3<63, 25, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB), + "fmul $FRT, $FRA, $FRB", FPFused, + [(set F8RC:$FRT, (fmul 
F8RC:$FRA, F8RC:$FRB))]>; +def FMULS : AForm_3<59, 25, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB), + "fmuls $FRT, $FRA, $FRB", FPGeneral, + [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>; +def FSUB : AForm_2<63, 20, + (ops F8RC:$FRT, F8RC:$FRA, F8RC:$FRB), + "fsub $FRT, $FRA, $FRB", FPGeneral, + [(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>; +def FSUBS : AForm_2<59, 20, + (ops F4RC:$FRT, F4RC:$FRA, F4RC:$FRB), + "fsubs $FRT, $FRA, $FRB", FPGeneral, + [(set F4RC:$FRT, (fsub F4RC:$FRA, F4RC:$FRB))]>; +} + +let PPC970_Unit = 1 in { // FXU Operations. +// M-Form instructions. rotate and mask instructions. +// +let isCommutable = 1 in { +// RLWIMI can be commuted if the rotate amount is zero. +def RLWIMI : MForm_2<20, + (ops GPRC:$rA, GPRC:$rSi, GPRC:$rS, u5imm:$SH, u5imm:$MB, + u5imm:$ME), "rlwimi $rA, $rS, $SH, $MB, $ME", IntRotate, + []>, PPC970_DGroup_Cracked, RegConstraint<"$rSi = $rA">, + NoEncode<"$rSi">; +} +def RLWINM : MForm_2<21, + (ops GPRC:$rA, GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral, + []>; +def RLWINMo : MForm_2<21, + (ops GPRC:$rA, GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm. $rA, $rS, $SH, $MB, $ME", IntGeneral, + []>, isDOT, PPC970_DGroup_Cracked; +def RLWNM : MForm_2<23, + (ops GPRC:$rA, GPRC:$rS, GPRC:$rB, u5imm:$MB, u5imm:$ME), + "rlwnm $rA, $rS, $rB, $MB, $ME", IntGeneral, + []>; +} + + +//===----------------------------------------------------------------------===// +// DWARF Pseudo Instructions +// + +def DWARF_LOC : Pseudo<(ops i32imm:$line, i32imm:$col, i32imm:$file), + "${:comment} .loc $file, $line, $col", + [(dwarf_loc (i32 imm:$line), (i32 imm:$col), + (i32 imm:$file))]>; + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Patterns +// + +// Arbitrary immediate support. Implement in terms of LIS/ORI. 
+def : Pat<(i32 imm:$imm), + (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>; + +// Implement the 'not' operation with the NOR instruction. +def NOT : Pat<(not GPRC:$in), + (NOR GPRC:$in, GPRC:$in)>; + +// ADD an arbitrary immediate. +def : Pat<(add GPRC:$in, imm:$imm), + (ADDIS (ADDI GPRC:$in, (LO16 imm:$imm)), (HA16 imm:$imm))>; +// OR an arbitrary immediate. +def : Pat<(or GPRC:$in, imm:$imm), + (ORIS (ORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>; +// XOR an arbitrary immediate. +def : Pat<(xor GPRC:$in, imm:$imm), + (XORIS (XORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>; +// SUBFIC +def : Pat<(sub immSExt16:$imm, GPRC:$in), + (SUBFIC GPRC:$in, imm:$imm)>; + +// SHL/SRL +def : Pat<(shl GPRC:$in, (i32 imm:$imm)), + (RLWINM GPRC:$in, imm:$imm, 0, (SHL32 imm:$imm))>; +def : Pat<(srl GPRC:$in, (i32 imm:$imm)), + (RLWINM GPRC:$in, (SRL32 imm:$imm), imm:$imm, 31)>; + +// ROTL +def : Pat<(rotl GPRC:$in, GPRC:$sh), + (RLWNM GPRC:$in, GPRC:$sh, 0, 31)>; +def : Pat<(rotl GPRC:$in, (i32 imm:$imm)), + (RLWINM GPRC:$in, imm:$imm, 0, 31)>; + +// RLWNM +def : Pat<(and (rotl GPRC:$in, GPRC:$sh), maskimm32:$imm), + (RLWNM GPRC:$in, GPRC:$sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>; + +// Calls +def : Pat<(PPCcall_Macho (i32 tglobaladdr:$dst)), + (BL_Macho tglobaladdr:$dst)>; +def : Pat<(PPCcall_Macho (i32 texternalsym:$dst)), + (BL_Macho texternalsym:$dst)>; +def : Pat<(PPCcall_ELF (i32 tglobaladdr:$dst)), + (BL_ELF tglobaladdr:$dst)>; +def : Pat<(PPCcall_ELF (i32 texternalsym:$dst)), + (BL_ELF texternalsym:$dst)>; + +// Hi and Lo for Darwin Global Addresses. 
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>; +def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>; +def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>; +def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>; +def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>; +def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>; +def : Pat<(add GPRC:$in, (PPChi tglobaladdr:$g, 0)), + (ADDIS GPRC:$in, tglobaladdr:$g)>; +def : Pat<(add GPRC:$in, (PPChi tconstpool:$g, 0)), + (ADDIS GPRC:$in, tconstpool:$g)>; +def : Pat<(add GPRC:$in, (PPChi tjumptable:$g, 0)), + (ADDIS GPRC:$in, tjumptable:$g)>; + +// Fused negative multiply subtract, alternate pattern +def : Pat<(fsub F8RC:$B, (fmul F8RC:$A, F8RC:$C)), + (FNMSUB F8RC:$A, F8RC:$C, F8RC:$B)>, + Requires<[FPContractions]>; +def : Pat<(fsub F4RC:$B, (fmul F4RC:$A, F4RC:$C)), + (FNMSUBS F4RC:$A, F4RC:$C, F4RC:$B)>, + Requires<[FPContractions]>; + +// Standard shifts. These are represented separately from the real shifts above +// so that we can distinguish between shifts that allow 5-bit and 6-bit shift +// amounts. 
+def : Pat<(sra GPRC:$rS, GPRC:$rB), + (SRAW GPRC:$rS, GPRC:$rB)>; +def : Pat<(srl GPRC:$rS, GPRC:$rB), + (SRW GPRC:$rS, GPRC:$rB)>; +def : Pat<(shl GPRC:$rS, GPRC:$rB), + (SLW GPRC:$rS, GPRC:$rB)>; + +def : Pat<(zextloadi1 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(zextloadi1 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi1 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(extloadi1 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi8 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(extloadi8 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi16 iaddr:$src), + (LHZ iaddr:$src)>; +def : Pat<(extloadi16 xaddr:$src), + (LHZX xaddr:$src)>; +def : Pat<(extloadf32 iaddr:$src), + (FMRSD (LFS iaddr:$src))>; +def : Pat<(extloadf32 xaddr:$src), + (FMRSD (LFSX xaddr:$src))>; + +include "PPCInstrAltivec.td" +include "PPCInstr64Bit.td" diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp new file mode 100644 index 0000000..acaed0b --- /dev/null +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -0,0 +1,429 @@ +//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the 32-bit PowerPC target. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "PPCJITInfo.h" +#include "PPCRelocations.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineCodeEmitter.h" +#include "llvm/Config/alloca.h" +#include "llvm/Support/Debug.h" +#include <set> +using namespace llvm; + +static TargetJITInfo::JITCompilerFn JITCompilerFunction; + +#define BUILD_ADDIS(RD,RS,IMM16) \ + ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535)) +#define BUILD_ORI(RD,RS,UIMM16) \ + ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535)) +#define BUILD_ORIS(RD,RS,UIMM16) \ + ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535)) +#define BUILD_RLDICR(RD,RS,SH,ME) \ + ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \ + (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1)) +#define BUILD_MTSPR(RS,SPR) \ + ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1)) +#define BUILD_BCCTRx(BO,BI,LINK) \ + ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1)) +#define BUILD_B(TARGET, LINK) \ + ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1)) + +// Pseudo-ops +#define BUILD_LIS(RD,IMM16) BUILD_ADDIS(RD,0,IMM16) +#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6) +#define BUILD_MTCTR(RS) BUILD_MTSPR(RS,9) +#define BUILD_BCTR(LINK) BUILD_BCCTRx(20,0,LINK) + +static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){ + intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2; + unsigned *AtI = (unsigned*)(intptr_t)At; + + if (Offset >= -(1 << 23) && Offset < (1 << 23)) { // In range? 
+ AtI[0] = BUILD_B(Offset, isCall); // b/bl target + } else if (!is64Bit) { + AtI[0] = BUILD_LIS(12, To >> 16); // lis r12, hi16(address) + AtI[1] = BUILD_ORI(12, 12, To); // ori r12, r12, lo16(address) + AtI[2] = BUILD_MTCTR(12); // mtctr r12 + AtI[3] = BUILD_BCTR(isCall); // bctr/bctrl + } else { + AtI[0] = BUILD_LIS(12, To >> 48); // lis r12, hi16(address) + AtI[1] = BUILD_ORI(12, 12, To >> 32); // ori r12, r12, lo16(address) + AtI[2] = BUILD_SLDI(12, 12, 32); // sldi r12, r12, 32 + AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address) + AtI[4] = BUILD_ORI(12, 12, To); // ori r12, r12, lo16(address) + AtI[5] = BUILD_MTCTR(12); // mtctr r12 + AtI[6] = BUILD_BCTR(isCall); // bctr/bctrl + } +} + +extern "C" void PPC32CompilationCallback(); +extern "C" void PPC64CompilationCallback(); + +#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \ + !defined(__ppc64__) +// CompilationCallback stub - We can't use a C function with inline assembly in +// it, because we the prolog/epilog inserted by GCC won't work for us. Instead, +// write our own wrapper, which does things our way, so we have complete control +// over register saving and restoring. +asm( + ".text\n" + ".align 2\n" + ".globl _PPC32CompilationCallback\n" +"_PPC32CompilationCallback:\n" + // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the + // FIXME: need to save v[0-19] for altivec? + // FIXME: could shrink frame + // Set up a proper stack frame + // FIXME Layout + // PowerPC64 ABI linkage - 24 bytes + // parameters - 32 bytes + // 13 double registers - 104 bytes + // 8 int registers - 32 bytes + "mflr r0\n" + "stw r0, 8(r1)\n" + "stwu r1, -208(r1)\n" + // Save all int arg registers + "stw r10, 204(r1)\n" "stw r9, 200(r1)\n" + "stw r8, 196(r1)\n" "stw r7, 192(r1)\n" + "stw r6, 188(r1)\n" "stw r5, 184(r1)\n" + "stw r4, 180(r1)\n" "stw r3, 176(r1)\n" + // Save all call-clobbered FP regs. 
+ "stfd f13, 168(r1)\n" "stfd f12, 160(r1)\n" + "stfd f11, 152(r1)\n" "stfd f10, 144(r1)\n" + "stfd f9, 136(r1)\n" "stfd f8, 128(r1)\n" + "stfd f7, 120(r1)\n" "stfd f6, 112(r1)\n" + "stfd f5, 104(r1)\n" "stfd f4, 96(r1)\n" + "stfd f3, 88(r1)\n" "stfd f2, 80(r1)\n" + "stfd f1, 72(r1)\n" + // Arguments to Compilation Callback: + // r3 - our lr (address of the call instruction in stub plus 4) + // r4 - stub's lr (address of instruction that called the stub plus 4) + // r5 - is64Bit - always 0. + "mr r3, r0\n" + "lwz r2, 208(r1)\n" // stub's frame + "lwz r4, 8(r2)\n" // stub's lr + "li r5, 0\n" // 0 == 32 bit + "bl _PPCCompilationCallbackC\n" + "mtctr r3\n" + // Restore all int arg registers + "lwz r10, 204(r1)\n" "lwz r9, 200(r1)\n" + "lwz r8, 196(r1)\n" "lwz r7, 192(r1)\n" + "lwz r6, 188(r1)\n" "lwz r5, 184(r1)\n" + "lwz r4, 180(r1)\n" "lwz r3, 176(r1)\n" + // Restore all FP arg registers + "lfd f13, 168(r1)\n" "lfd f12, 160(r1)\n" + "lfd f11, 152(r1)\n" "lfd f10, 144(r1)\n" + "lfd f9, 136(r1)\n" "lfd f8, 128(r1)\n" + "lfd f7, 120(r1)\n" "lfd f6, 112(r1)\n" + "lfd f5, 104(r1)\n" "lfd f4, 96(r1)\n" + "lfd f3, 88(r1)\n" "lfd f2, 80(r1)\n" + "lfd f1, 72(r1)\n" + // Pop 3 frames off the stack and branch to target + "lwz r1, 208(r1)\n" + "lwz r2, 8(r1)\n" + "mtlr r2\n" + "bctr\n" + ); + +#elif defined(__PPC__) && !defined(__ppc64__) +// Linux/PPC support + +// CompilationCallback stub - We can't use a C function with inline assembly in +// it, because we the prolog/epilog inserted by GCC won't work for us. Instead, +// write our own wrapper, which does things our way, so we have complete control +// over register saving and restoring. +asm( + ".text\n" + ".align 2\n" + ".globl PPC32CompilationCallback\n" +"PPC32CompilationCallback:\n" + // Make space for 8 ints r[3-10] and 8 doubles f[1-8] and the + // FIXME: need to save v[0-19] for altivec? 
+ // FIXME: could shrink frame + // Set up a proper stack frame + // FIXME Layout + // 8 double registers - 64 bytes + // 8 int registers - 32 bytes + "mflr 0\n" + "stw 0, 4(1)\n" + "stwu 1, -104(1)\n" + // Save all int arg registers + "stw 10, 100(1)\n" "stw 9, 96(1)\n" + "stw 8, 92(1)\n" "stw 7, 88(1)\n" + "stw 6, 84(1)\n" "stw 5, 80(1)\n" + "stw 4, 76(1)\n" "stw 3, 72(1)\n" + // Save all call-clobbered FP regs. + "stfd 8, 64(1)\n" + "stfd 7, 56(1)\n" "stfd 6, 48(1)\n" + "stfd 5, 40(1)\n" "stfd 4, 32(1)\n" + "stfd 3, 24(1)\n" "stfd 2, 16(1)\n" + "stfd 1, 8(1)\n" + // Arguments to Compilation Callback: + // r3 - our lr (address of the call instruction in stub plus 4) + // r4 - stub's lr (address of instruction that called the stub plus 4) + // r5 - is64Bit - always 0. + "mr 3, 0\n" + "lwz 5, 104(1)\n" // stub's frame + "lwz 4, 4(5)\n" // stub's lr + "li 5, 0\n" // 0 == 32 bit + "bl PPCCompilationCallbackC\n" + "mtctr 3\n" + // Restore all int arg registers + "lwz 10, 100(1)\n" "lwz 9, 96(1)\n" + "lwz 8, 92(1)\n" "lwz 7, 88(1)\n" + "lwz 6, 84(1)\n" "lwz 5, 80(1)\n" + "lwz 4, 76(1)\n" "lwz 3, 72(1)\n" + // Restore all FP arg registers + "lfd 8, 64(1)\n" + "lfd 7, 56(1)\n" "lfd 6, 48(1)\n" + "lfd 5, 40(1)\n" "lfd 4, 32(1)\n" + "lfd 3, 24(1)\n" "lfd 2, 16(1)\n" + "lfd 1, 8(1)\n" + // Pop 3 frames off the stack and branch to target + "lwz 1, 104(1)\n" + "lwz 0, 4(1)\n" + "mtlr 0\n" + "bctr\n" + ); +#else +void PPC32CompilationCallback() { + assert(0 && "This is not a power pc, you can't execute this!"); + abort(); +} +#endif + +#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \ + defined(__ppc64__) +asm( + ".text\n" + ".align 2\n" + ".globl _PPC64CompilationCallback\n" +"_PPC64CompilationCallback:\n" + // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the + // FIXME: need to save v[0-19] for altivec? 
+ // Set up a proper stack frame + // Layout + // PowerPC64 ABI linkage - 48 bytes + // parameters - 64 bytes + // 13 double registers - 104 bytes + // 8 int registers - 64 bytes + "mflr r0\n" + "std r0, 16(r1)\n" + "stdu r1, -280(r1)\n" + // Save all int arg registers + "std r10, 272(r1)\n" "std r9, 264(r1)\n" + "std r8, 256(r1)\n" "std r7, 248(r1)\n" + "std r6, 240(r1)\n" "std r5, 232(r1)\n" + "std r4, 224(r1)\n" "std r3, 216(r1)\n" + // Save all call-clobbered FP regs. + "stfd f13, 208(r1)\n" "stfd f12, 200(r1)\n" + "stfd f11, 192(r1)\n" "stfd f10, 184(r1)\n" + "stfd f9, 176(r1)\n" "stfd f8, 168(r1)\n" + "stfd f7, 160(r1)\n" "stfd f6, 152(r1)\n" + "stfd f5, 144(r1)\n" "stfd f4, 136(r1)\n" + "stfd f3, 128(r1)\n" "stfd f2, 120(r1)\n" + "stfd f1, 112(r1)\n" + // Arguments to Compilation Callback: + // r3 - our lr (address of the call instruction in stub plus 4) + // r4 - stub's lr (address of instruction that called the stub plus 4) + // r5 - is64Bit - always 1. + "mr r3, r0\n" + "ld r2, 280(r1)\n" // stub's frame + "ld r4, 16(r2)\n" // stub's lr + "li r5, 1\n" // 1 == 64 bit + "bl _PPCCompilationCallbackC\n" + "mtctr r3\n" + // Restore all int arg registers + "ld r10, 272(r1)\n" "ld r9, 264(r1)\n" + "ld r8, 256(r1)\n" "ld r7, 248(r1)\n" + "ld r6, 240(r1)\n" "ld r5, 232(r1)\n" + "ld r4, 224(r1)\n" "ld r3, 216(r1)\n" + // Restore all FP arg registers + "lfd f13, 208(r1)\n" "lfd f12, 200(r1)\n" + "lfd f11, 192(r1)\n" "lfd f10, 184(r1)\n" + "lfd f9, 176(r1)\n" "lfd f8, 168(r1)\n" + "lfd f7, 160(r1)\n" "lfd f6, 152(r1)\n" + "lfd f5, 144(r1)\n" "lfd f4, 136(r1)\n" + "lfd f3, 128(r1)\n" "lfd f2, 120(r1)\n" + "lfd f1, 112(r1)\n" + // Pop 3 frames off the stack and branch to target + "ld r1, 280(r1)\n" + "ld r2, 16(r1)\n" + "mtlr r2\n" + "bctr\n" + ); +#else +void PPC64CompilationCallback() { + assert(0 && "This is not a power pc, you can't execute this!"); + abort(); +} +#endif + +extern "C" void *PPCCompilationCallbackC(unsigned *StubCallAddrPlus4, + unsigned 
*OrigCallAddrPlus4, + bool is64Bit) { + // Adjust the pointer to the address of the call instruction in the stub + // emitted by emitFunctionStub, rather than the instruction after it. + unsigned *StubCallAddr = StubCallAddrPlus4 - 1; + unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1; + + void *Target = JITCompilerFunction(StubCallAddr); + + // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite + // it to branch directly to the destination. If so, rewrite it so it does not + // need to go through the stub anymore. + unsigned OrigCallInst = *OrigCallAddr; + if ((OrigCallInst >> 26) == 18) { // Direct call. + intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2; + + if (Offset >= -(1 << 23) && Offset < (1 << 23)) { // In range? + // Clear the original target out. + OrigCallInst &= (63 << 26) | 3; + // Fill in the new target. + OrigCallInst |= (Offset & ((1 << 24)-1)) << 2; + // Replace the call. + *OrigCallAddr = OrigCallInst; + } + } + + // Assert that we are coming from a stub that was created with our + // emitFunctionStub. + if ((*StubCallAddr >> 26) == 18) + StubCallAddr -= 3; + else { + assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!"); + StubCallAddr -= is64Bit ? 9 : 6; + } + + // Rewrite the stub with an unconditional branch to the target, for any users + // who took the address of the stub. + EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit); + + // Put the address of the target function to call and the address to return to + // after calling the target function in a place that is easy to get on the + // stack after we restore all regs. + return Target; +} + + + +TargetJITInfo::LazyResolverFn +PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) { + JITCompilerFunction = Fn; + return is64Bit ? 
PPC64CompilationCallback : PPC32CompilationCallback; +} + +void *PPCJITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) { + // If this is just a call to an external function, emit a branch instead of a + // call. The code is the same except for one bit of the last instruction. + if (Fn != (void*)(intptr_t)PPC32CompilationCallback && + Fn != (void*)(intptr_t)PPC64CompilationCallback) { + MCE.startFunctionStub(7*4); + intptr_t Addr = (intptr_t)MCE.getCurrentPCValue(); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + EmitBranchToAt(Addr, (intptr_t)Fn, false, is64Bit); + return MCE.finishFunctionStub(0); + } + + MCE.startFunctionStub(10*4); + if (is64Bit) { + MCE.emitWordBE(0xf821ffb1); // stdu r1,-80(r1) + MCE.emitWordBE(0x7d6802a6); // mflr r11 + MCE.emitWordBE(0xf9610060); // std r11, 96(r1) + } else if (TM.getSubtargetImpl()->isMachoABI()){ + MCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1) + MCE.emitWordBE(0x7d6802a6); // mflr r11 + MCE.emitWordBE(0x91610028); // stw r11, 40(r1) + } else { + MCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1) + MCE.emitWordBE(0x7d6802a6); // mflr r11 + MCE.emitWordBE(0x91610024); // stw r11, 36(r1) + } + intptr_t Addr = (intptr_t)MCE.getCurrentPCValue(); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + MCE.emitWordBE(0); + EmitBranchToAt(Addr, (intptr_t)Fn, true, is64Bit); + return MCE.finishFunctionStub(0); +} + + +void PPCJITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4; + intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); + switch ((PPC::RelocationType)MR->getRelocationType()) { + default: assert(0 && "Unknown relocation type!"); + case PPC::reloc_pcrel_bx: + // 
PC-relative relocation for b and bl instructions. + ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2; + assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) && + "Relocation out of range!"); + *RelocPos |= (ResultPtr & ((1 << 24)-1)) << 2; + break; + case PPC::reloc_pcrel_bcx: + // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other + // bcx instructions. + ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2; + assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) && + "Relocation out of range!"); + *RelocPos |= (ResultPtr & ((1 << 14)-1)) << 2; + break; + case PPC::reloc_absolute_high: // high bits of ref -> low 16 of instr + case PPC::reloc_absolute_low: { // low bits of ref -> low 16 of instr + ResultPtr += MR->getConstantVal(); + + // If this is a high-part access, get the high-part. + if (MR->getRelocationType() == PPC::reloc_absolute_high) { + // If the low part will have a carry (really a borrow) from the low + // 16-bits into the high 16, add a bit to borrow from. + if (((int)ResultPtr << 16) < 0) + ResultPtr += 1 << 16; + ResultPtr >>= 16; + } + + // Do the addition then mask, so the addition does not overflow the 16-bit + // immediate section of the instruction. + unsigned LowBits = (*RelocPos + ResultPtr) & 65535; + unsigned HighBits = *RelocPos & ~65535; + *RelocPos = LowBits | HighBits; // Slam into low 16-bits + break; + } + case PPC::reloc_absolute_low_ix: { // low bits of ref -> low 14 of instr + ResultPtr += MR->getConstantVal(); + // Do the addition then mask, so the addition does not overflow the 16-bit + // immediate section of the instruction. + unsigned LowBits = (*RelocPos + ResultPtr) & 0xFFFC; + unsigned HighBits = *RelocPos & 0xFFFF0003; + *RelocPos = LowBits | HighBits; // Slam into low 14-bits. 
+ break; + } + } + } +} + +void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit); +} diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h new file mode 100644 index 0000000..66ee0ee --- /dev/null +++ b/lib/Target/PowerPC/PPCJITInfo.h @@ -0,0 +1,46 @@ +//===- PPCJITInfo.h - PowerPC impl. of the JIT interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetJITInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC_JITINFO_H +#define POWERPC_JITINFO_H + +#include "llvm/Target/TargetJITInfo.h" + +namespace llvm { + class PPCTargetMachine; + + class PPCJITInfo : public TargetJITInfo { + protected: + PPCTargetMachine &TM; + bool is64Bit; + public: + PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit) : TM(tm) { + useGOT = 0; + is64Bit = tmIs64Bit; + } + + virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE); + virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + virtual void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase); + + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. 
+ /// + virtual void replaceMachineCodeForFunction(void *Old, void *New); + }; +} + +#endif diff --git a/lib/Target/PowerPC/PPCMachOWriterInfo.cpp b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp new file mode 100644 index 0000000..5e2dc9e --- /dev/null +++ b/lib/Target/PowerPC/PPCMachOWriterInfo.cpp @@ -0,0 +1,150 @@ +//===-- PPCMachOWriterInfo.cpp - Mach-O Writer Info for the PowerPC -------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bill Wendling and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements Mach-O writer information for the PowerPC backend. +// +//===----------------------------------------------------------------------===// + +#include "PPCMachOWriterInfo.h" +#include "PPCRelocations.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachORelocation.h" +#include "llvm/Support/OutputBuffer.h" +using namespace llvm; + +PPCMachOWriterInfo::PPCMachOWriterInfo(const PPCTargetMachine &TM) + : TargetMachOWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64 ? + HDR_CPU_TYPE_POWERPC64 : + HDR_CPU_TYPE_POWERPC, + HDR_CPU_SUBTYPE_POWERPC_ALL) {} +PPCMachOWriterInfo::~PPCMachOWriterInfo() {} + +/// GetTargetRelocation - For the MachineRelocation MR, convert it to one or +/// more PowerPC MachORelocation(s), add the new relocations to the +/// MachOSection, and rewrite the instruction at the section offset if required +/// by that relocation type. +unsigned PPCMachOWriterInfo::GetTargetRelocation(MachineRelocation &MR, + unsigned FromIdx, + unsigned ToAddr, + unsigned ToIdx, + OutputBuffer &RelocOut, + OutputBuffer &SecOut, + bool Scattered, + bool isExtern) const { + unsigned NumRelocs = 0; + uint64_t Addr = 0; + + // Get the address of whatever it is we're relocating, if possible. 
+ if (!isExtern) + Addr = (uintptr_t)MR.getResultPointer() + ToAddr; + + switch ((PPC::RelocationType)MR.getRelocationType()) { + default: assert(0 && "Unknown PPC relocation type!"); + case PPC::reloc_absolute_low_ix: + assert(0 && "Unhandled PPC relocation type!"); + break; + case PPC::reloc_vanilla: + { + // FIXME: need to handle 64 bit vanilla relocs + MachORelocation VANILLA(MR.getMachineCodeOffset(), ToIdx, + false, 2, isExtern, + PPC_RELOC_VANILLA, + Scattered, (intptr_t)MR.getResultPointer()); + ++NumRelocs; + + if (Scattered) { + RelocOut.outword(VANILLA.getPackedFields()); + RelocOut.outword(VANILLA.getAddress()); + } else { + RelocOut.outword(VANILLA.getAddress()); + RelocOut.outword(VANILLA.getPackedFields()); + } + + intptr_t SymbolOffset; + + if (Scattered) + SymbolOffset = Addr + MR.getConstantVal(); + else + SymbolOffset = Addr; + + printf("vanilla fixup: sec_%x[%x] = %x\n", FromIdx, + unsigned(MR.getMachineCodeOffset()), + unsigned(SymbolOffset)); + SecOut.fixword(SymbolOffset, MR.getMachineCodeOffset()); + } + break; + case PPC::reloc_pcrel_bx: + { + // FIXME: Presumably someday we will need to branch to other, non-extern + // functions too. Need to figure out some way to distinguish between + // target is BB and target is function. 
+ if (isExtern) { + MachORelocation BR24(MR.getMachineCodeOffset(), ToIdx, true, 2, + isExtern, PPC_RELOC_BR24, Scattered, + (intptr_t)MR.getMachineCodeOffset()); + RelocOut.outword(BR24.getAddress()); + RelocOut.outword(BR24.getPackedFields()); + ++NumRelocs; + } + + Addr -= MR.getMachineCodeOffset(); + Addr >>= 2; + Addr &= 0xFFFFFF; + Addr <<= 2; + Addr |= (SecOut[MR.getMachineCodeOffset()] << 24); + Addr |= (SecOut[MR.getMachineCodeOffset()+3] & 0x3); + SecOut.fixword(Addr, MR.getMachineCodeOffset()); + break; + } + case PPC::reloc_pcrel_bcx: + { + Addr -= MR.getMachineCodeOffset(); + Addr &= 0xFFFC; + + SecOut.fixhalf(Addr, MR.getMachineCodeOffset() + 2); + break; + } + case PPC::reloc_absolute_high: + { + MachORelocation HA16(MR.getMachineCodeOffset(), ToIdx, false, 2, + isExtern, PPC_RELOC_HA16); + MachORelocation PAIR(Addr & 0xFFFF, 0xFFFFFF, false, 2, isExtern, + PPC_RELOC_PAIR); + NumRelocs = 2; + + RelocOut.outword(HA16.getRawAddress()); + RelocOut.outword(HA16.getPackedFields()); + RelocOut.outword(PAIR.getRawAddress()); + RelocOut.outword(PAIR.getPackedFields()); + + Addr += 0x8000; + + SecOut.fixhalf(Addr >> 16, MR.getMachineCodeOffset() + 2); + break; + } + case PPC::reloc_absolute_low: + { + MachORelocation LO16(MR.getMachineCodeOffset(), ToIdx, false, 2, + isExtern, PPC_RELOC_LO16); + MachORelocation PAIR(Addr >> 16, 0xFFFFFF, false, 2, isExtern, + PPC_RELOC_PAIR); + NumRelocs = 2; + + RelocOut.outword(LO16.getRawAddress()); + RelocOut.outword(LO16.getPackedFields()); + RelocOut.outword(PAIR.getRawAddress()); + RelocOut.outword(PAIR.getPackedFields()); + + SecOut.fixhalf(Addr, MR.getMachineCodeOffset() + 2); + break; + } + } + + return NumRelocs; +} diff --git a/lib/Target/PowerPC/PPCMachOWriterInfo.h b/lib/Target/PowerPC/PPCMachOWriterInfo.h new file mode 100644 index 0000000..69ed9f7 --- /dev/null +++ b/lib/Target/PowerPC/PPCMachOWriterInfo.h @@ -0,0 +1,55 @@ +//===-- PPCMachOWriterInfo.h - Mach-O Writer Info for PowerPC ---*- C++ -*-===// +// 
+// The LLVM Compiler Infrastructure +// +// This file was developed by Bill Wendling and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements Mach-O writer information for the PowerPC backend. +// +//===----------------------------------------------------------------------===// + +#ifndef PPC_MACHO_WRITER_INFO_H +#define PPC_MACHO_WRITER_INFO_H + +#include "llvm/Target/TargetMachOWriterInfo.h" + +namespace llvm { + + // Forward declarations + class MachineRelocation; + class OutputBuffer; + class PPCTargetMachine; + + class PPCMachOWriterInfo : public TargetMachOWriterInfo { + public: + PPCMachOWriterInfo(const PPCTargetMachine &TM); + virtual ~PPCMachOWriterInfo(); + + virtual unsigned GetTargetRelocation(MachineRelocation &MR, + unsigned FromIdx, + unsigned ToAddr, + unsigned ToIdx, + OutputBuffer &RelocOut, + OutputBuffer &SecOut, + bool Scattered, bool Extern) const; + + // Constants for the relocation r_type field. 
+ // See <mach-o/ppc/reloc.h> + enum { + PPC_RELOC_VANILLA, // generic relocation + PPC_RELOC_PAIR, // the second relocation entry of a pair + PPC_RELOC_BR14, // 14 bit branch displacement to word address + PPC_RELOC_BR24, // 24 bit branch displacement to word address + PPC_RELOC_HI16, // a PAIR follows with the low 16 bits + PPC_RELOC_LO16, // a PAIR follows with the high 16 bits + PPC_RELOC_HA16, // a PAIR follows, which is sign extended to 32b + PPC_RELOC_LO14 // LO16 with low 2 bits implicitly zero + }; + }; + +} // end llvm namespace + +#endif // PPC_MACHO_WRITER_INFO_H diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h new file mode 100644 index 0000000..e227456 --- /dev/null +++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -0,0 +1,50 @@ +//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of MachineFunctionInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef PPC_MACHINE_FUNCTION_INFO_H +#define PPC_MACHINE_FUNCTION_INFO_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// PPCFunctionInfo - This class is derived from MachineFunction private +/// PowerPC target-specific information for each MachineFunction. +class PPCFunctionInfo : public MachineFunctionInfo { +private: + /// FramePointerSaveIndex - Frame index of where the old frame pointer is + /// stored. Also used as an anchor for instructions that need to be altered + /// when using frame pointers (dyna_add, dyna_sub.) + int FramePointerSaveIndex; + + /// UsesLR - Indicates whether LR is used in the current function. 
+ /// + bool UsesLR; + +public: + PPCFunctionInfo(MachineFunction& MF) + : FramePointerSaveIndex(0) + {} + + int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } + void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } + + void setUsesLR(bool U) { UsesLR = U; } + bool usesLR() { return UsesLR; } + +}; + +} // end of namespace llvm + + +#endif diff --git a/lib/Target/PowerPC/PPCPerfectShuffle.h b/lib/Target/PowerPC/PPCPerfectShuffle.h new file mode 100644 index 0000000..d0f833e --- /dev/null +++ b/lib/Target/PowerPC/PPCPerfectShuffle.h @@ -0,0 +1,6586 @@ +//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle without using vperm. +// +//===----------------------------------------------------------------------===// + +// 31 entries have cost 0 +// 292 entries have cost 1 +// 1384 entries have cost 2 +// 3061 entries have cost 3 +// 1733 entries have cost 4 +// 60 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. 
+static const unsigned PerfectShuffleTable[6561+1] = { + 202162278U, // <0,0,0,0>: Cost 1 vspltisw0 LHS + 1140850790U, // <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS + 2617247181U, // <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0> + 2635163787U, // <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0> + 1543507254U, // <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS + 2281701705U, // <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5> + 2617250133U, // <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0> + 2659054575U, // <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0> + 202162278U, // <0,0,0,u>: Cost 1 vspltisw0 LHS + 1141686282U, // <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1> + 67944550U, // <0,0,1,1>: Cost 1 vmrghw LHS, LHS + 1685241958U, // <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS + 2215870716U, // <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0> + 1141727570U, // <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5> + 2215428562U, // <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7> + 2215428589U, // <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7> + 2659062768U, // <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1> + 67945117U, // <0,0,1,u>: Cost 1 vmrghw LHS, LHS + 2684356045U, // <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0> + 2216009830U, // <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS + 2216009901U, // <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2> + 2698290853U, // <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0> + 3289751890U, // <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5> + 3758098275U, // <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1> + 2684356538U, // <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7> + 3758098410U, // <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1> + 2216010397U, // <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS + 2702272651U, // <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0> + 2216656998U, // <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS + 3844669704U, // <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3> + 2216657148U, // <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0> + 2684357122U, // <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6> + 
3732820066U, // <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0> + 3778005624U, // <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7> + 3374713464U, // <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7> + 2216657565U, // <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS + 2217361408U, // <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0> + 1143619686U, // <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS + 3291103405U, // <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2> + 3827269988U, // <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5> + 1143619922U, // <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1610616118U, // <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS + 3758099833U, // <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2> + 3854107016U, // <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5> + 1143620253U, // <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS + 2284396544U, // <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0> + 2218025062U, // <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS + 3758100203U, // <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3> + 3395966100U, // <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3> + 3804549052U, // <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5> + 2302314964U, // <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5> + 2785821138U, // <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7> + 3395966428U, // <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7> + 2787148260U, // <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7> + 2684358997U, // <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0> + 2218631270U, // <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS + 2684359162U, // <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3> + 3758101042U, // <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5> + 3732843830U, // <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS + 3758101227U, // <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1> + 2684359480U, // <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6> + 2724836173U, // <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0> + 2725499806U, // <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0> + 2726163439U, // <0,0,7,0>: Cost 3 
vsldoi8 <7,0,0,0>, <7,0,0,0> + 2219311206U, // <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS + 3868557900U, // <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3> + 3377400112U, // <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3> + 2684360038U, // <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6> + 3732852834U, // <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0> + 3871507060U, // <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7> + 2303658616U, // <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7> + 2726163439U, // <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0> + 202162278U, // <0,0,u,0>: Cost 1 vspltisw0 LHS + 72589414U, // <0,0,u,1>: Cost 1 vmrghw LHS, LHS + 1685242525U, // <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS + 2220073212U, // <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0> + 1146331474U, // <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5> + 1610619034U, // <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS + 2785821138U, // <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7> + 2659120119U, // <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u> + 72589981U, // <0,0,u,u>: Cost 1 vmrghw LHS, LHS + 2698297344U, // <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0> + 1624555622U, // <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS + 2758984428U, // <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1> + 2635237524U, // <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0> + 2693652818U, // <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5> + 2281701714U, // <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5> + 2698297846U, // <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7> + 2659128312U, // <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0> + 1624556189U, // <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS + 1543585802U, // <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1> + 1141728052U, // <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1> + 1141728150U, // <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 2295644334U, // <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3> + 1543589174U, // <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS + 2290999634U, // <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5> + 
2617332135U, // <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1> + 2617332720U, // <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1> + 1142171004U, // <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0> + 1561509990U, // <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS + 2623308516U, // <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2> + 2698298984U, // <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1561513270U, // <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS + 2647199304U, // <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2> + 2698299322U, // <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7> + 1585402874U, // <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2698299540U, // <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0> + 3290399540U, // <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1> + 2698299720U, // <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0> + 2698299804U, // <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3> + 2698299906U, // <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6> + 3832726521U, // <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0> + 2724842160U, // <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0> + 2706926275U, // <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1> + 2698300190U, // <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2> + 2635268198U, // <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS + 2217362228U, // <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1> + 2217362326U, // <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0> + 2635270296U, // <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4> + 2635271478U, // <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS + 1624558902U, // <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 2659160910U, // <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1> + 2659161084U, // <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4> + 1624559145U, // <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 3832726639U, // <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1> + 2714889871U, // <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1> + 
2302314646U, // <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2> + 3834717321U, // <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0> + 3832726679U, // <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5> + 2717544403U, // <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1> + 2718208036U, // <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1> + 3792613493U, // <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1> + 2719535302U, // <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1> + 2659172454U, // <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS + 3832726735U, // <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7> + 2724844026U, // <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3> + 3775361608U, // <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0> + 2659175734U, // <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS + 3832726771U, // <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7> + 2724844344U, // <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6> + 1651102542U, // <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1> + 1651766175U, // <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1> + 2724844536U, // <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0> + 3377397770U, // <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1> + 2698302636U, // <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0> + 2728162531U, // <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1> + 2724844902U, // <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6> + 3377398098U, // <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5> + 2724845076U, // <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0> + 2724845164U, // <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7> + 2724845186U, // <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2> + 1561559142U, // <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS + 1146331956U, // <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1> + 1146332054U, // <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1561562422U, // <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS + 1624561818U, // <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 2220074191U, // <0,1,u,6>: Cost 3 vmrghw LHS, 
<1,6,1,7> + 1585452032U, // <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2214593997U, // <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0> + 2214675999U, // <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1> + 2214594152U, // <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2> + 1207959654U, // <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS + 3709054262U, // <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS + 3375350836U, // <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5> + 2214594490U, // <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7> + 3288336362U, // <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1> + 1207959659U, // <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS + 2215871994U, // <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0> + 2215470623U, // <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1> + 1141728872U, // <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1141728934U, // <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1> + 2215872323U, // <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5> + 2215872405U, // <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6> + 1141729210U, // <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2215430122U, // <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1141729368U, // <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3> + 3289736698U, // <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0> + 3289744927U, // <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1> + 2216011368U, // <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2> + 2216019622U, // <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1> + 3289769795U, // <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5> + 3289778069U, // <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6> + 2216044474U, // <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7> + 3732960259U, // <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2> + 2216061016U, // <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3> + 2758985382U, // <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1> + 2758985392U, // <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2> + 3290400360U, // <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2> + 2758985408U, // <0,2,3,3>: 
Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0> + 2758985422U, // <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5> + 2785822424U, // <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6> + 3290400698U, // <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7> + 2765915876U, // <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0> + 2758985453U, // <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0> + 3291104762U, // <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0> + 2217362979U, // <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5> + 2217363048U, // <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2> + 2217363110U, // <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1> + 3291105087U, // <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1> + 3291105173U, // <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6> + 2217363386U, // <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7> + 3788639688U, // <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0> + 2217363515U, // <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1> + 3376054371U, // <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0> + 3788639888U, // <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2> + 3376055912U, // <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2> + 2302312550U, // <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS + 3376054375U, // <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4> + 3374728244U, // <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5> + 3805229154U, // <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0> + 3376055512U, // <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7> + 2302312555U, // <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS + 3709100134U, // <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS + 3709100950U, // <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0> + 3709102010U, // <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7> + 2758985658U, // <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7> + 3709103414U, // <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS + 3732992098U, // <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0> + 3292374970U, // <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7> + 3798594383U, // <0,2,6,7>: Cost 4 vsldoi8 
<6,7,0,2>, <6,7,0,2> + 2758985703U, // <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7> + 3788641274U, // <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2> + 3377398508U, // <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1> + 3377398590U, // <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2> + 2303656038U, // <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS + 3709111606U, // <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS + 3377398836U, // <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5> + 3803903447U, // <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2> + 3293054954U, // <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1> + 2303656043U, // <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS + 2220074490U, // <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0> + 2220074527U, // <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1> + 1146332776U, // <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1146332838U, // <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1> + 2220074819U, // <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5> + 2220074901U, // <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6> + 1146333114U, // <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2220074986U, // <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1146333243U, // <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1> + 2629410816U, // <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0> + 2753530006U, // <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2> + 2629412301U, // <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0> + 2214594972U, // <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3> + 2758985908U, // <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5> + 3733016674U, // <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0> + 3777364488U, // <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7> + 2281703354U, // <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7> + 2758985941U, // <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2> + 1141729430U, // <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 2215471334U, // <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1> + 2215471425U, // <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2> + 1141729692U, // <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 
1141729794U, // <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2215430738U, // <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5> + 2215430776U, // <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7> + 2295646138U, // <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7> + 1141730078U, // <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2> + 2758986032U, // <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3> + 3709141910U, // <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0> + 3289753921U, // <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2> + 2770929992U, // <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0> + 3289754114U, // <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6> + 3362095460U, // <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5> + 3832727910U, // <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3> + 3365414842U, // <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7> + 2771298677U, // <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0> + 2216659094U, // <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2> + 3290409190U, // <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1> + 2703624496U, // <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3> + 2216683932U, // <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3> + 2216692226U, // <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6> + 3733041250U, // <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0> + 3832727988U, // <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0> + 3374712762U, // <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7> + 2216725278U, // <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2> + 2217363606U, // <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2> + 3291105510U, // <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1> + 3291105601U, // <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2> + 2217363868U, // <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3> + 2217363970U, // <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6> + 2758986242U, // <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6> + 3727077685U, // <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4> + 3364767674U, // <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7> + 2217364254U, // 
<0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2> + 3832728102U, // <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6> + 3405916003U, // <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1> + 3376055840U, // <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2> + 3376055679U, // <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3> + 3376055194U, // <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4> + 3859565138U, // <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5> + 2727514210U, // <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0> + 3376056250U, // <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7> + 2727514210U, // <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0> + 2758986360U, // <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7> + 3709174678U, // <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0> + 3795284411U, // <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3> + 3709175980U, // <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6> + 3833096860U, // <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7> + 3376728235U, // <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5> + 3859565229U, // <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6> + 2773879472U, // <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0> + 2758986360U, // <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7> + 2303656854U, // <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0> + 3807229018U, // <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u> + 2727515284U, // <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3> + 3377399410U, // <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3> + 3377398682U, // <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4> + 3801257409U, // <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7> + 3377399980U, // <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6> + 3375409082U, // <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7> + 2731497082U, // <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3> + 1146333334U, // <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 2220075238U, // <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1> + 2220075329U, // <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2> + 1146333596U, // <0,3,u,3>: 
Cost 2 vmrghw LHS, <3,3,3,3> + 1146333698U, // <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2758986566U, // <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6> + 2803739472U, // <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7> + 2295703482U, // <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7> + 1146333982U, // <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2> + 2214595473U, // <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0> + 2693677158U, // <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS + 3839437689U, // <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3> + 3709200559U, // <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0> + 2693677394U, // <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5> + 1140854070U, // <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS + 3767419409U, // <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7> + 3854109604U, // <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1> + 1140854313U, // <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS + 1141689234U, // <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2215431114U, // <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3> + 2215431221U, // <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2> + 2635466928U, // <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1> + 1141689552U, // <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 67947830U, // <0,4,1,5>: Cost 1 vmrghw LHS, RHS + 2215431545U, // <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2> + 2659357716U, // <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1> + 67948073U, // <0,4,1,u>: Cost 1 vmrghw LHS, RHS + 3767420369U, // <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4> + 3767420451U, // <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5> + 3767420520U, // <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2> + 2698323625U, // <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4> + 3709218102U, // <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS + 2216013110U, // <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS + 3767420858U, // <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7> + 3774719981U, // <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4> + 2216013353U, // <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS + 
3767421078U, // <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2> + 3776710880U, // <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4> + 3833097325U, // <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4> + 3767421340U, // <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3> + 3767421442U, // <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6> + 2216660278U, // <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS + 3833097361U, // <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4> + 3780692678U, // <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4> + 2216660521U, // <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS + 2617573416U, // <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4> + 2217364450U, // <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0> + 3691316771U, // <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5> + 3709233331U, // <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4> + 2785823952U, // <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4> + 1143622966U, // <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS + 3691319723U, // <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5> + 3854109932U, // <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5> + 1143623209U, // <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS + 2635497574U, // <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS + 2635498390U, // <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0> + 3709240936U, // <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2> + 2635499700U, // <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5> + 2635500854U, // <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS + 2785824044U, // <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6> + 1685245238U, // <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2659390488U, // <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5> + 1685245256U, // <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 3839438161U, // <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7> + 3798610347U, // <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5> + 3798610426U, // <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3> + 3795956237U, // <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4> + 3733138742U, // <0,4,6,4>: 
Cost 4 vsldoi4 <7,0,4,6>, RHS + 2218634550U, // <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS + 3798610744U, // <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6> + 2724868945U, // <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4> + 2725532578U, // <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4> + 3383371465U, // <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0> + 3800601668U, // <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4> + 3775386826U, // <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3> + 3801928934U, // <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4> + 3721202998U, // <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS + 2780368328U, // <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0> + 3383372686U, // <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6> + 3854110170U, // <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0> + 2780368328U, // <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0> + 1146334098U, // <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2220076002U, // <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0> + 2220076085U, // <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2> + 2635524279U, // <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u> + 1146334416U, // <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 72592694U, // <0,4,u,5>: Cost 1 vmrghw LHS, RHS + 1685245481U, // <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2659415067U, // <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u> + 72592937U, // <0,4,u,u>: Cost 1 vmrghw LHS, RHS + 2281704337U, // <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0> + 2704965734U, // <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 3778707666U, // <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3> + 3778707708U, // <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0> + 2687050057U, // <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5> + 2214596612U, // <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5> + 2785824372U, // <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1> + 3854110332U, // <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0> + 2704966301U, // <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 1567768678U, // <0,5,1,0>: Cost 2 
vsldoi4 <4,0,5,1>, LHS + 2312236570U, // <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1> + 2215431915U, // <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3> + 2641512598U, // <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2> + 1567771538U, // <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1> + 1141690372U, // <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1141690466U, // <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 2641515514U, // <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2> + 1141690615U, // <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5> + 3772736973U, // <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0> + 3778709024U, // <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2> + 3778709096U, // <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2> + 3778709158U, // <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1> + 3772737275U, // <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5> + 3859566351U, // <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3> + 3778709434U, // <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7> + 3805251562U, // <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1> + 3775391807U, // <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5> + 2704967830U, // <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2> + 3776719073U, // <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5> + 3777382706U, // <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5> + 3778709887U, // <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1> + 2704968148U, // <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5> + 3857428317U, // <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0> + 3364096514U, // <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6> + 3780700871U, // <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5> + 2707622680U, // <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5> + 2728856466U, // <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1> + 3697361674U, // <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4> + 3697362601U, // <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4> + 3364766635U, // <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3> + 2217365428U, // <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, 
<5,4,5,6> + 2704969014U, // <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS + 2785824700U, // <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5> + 3364766963U, // <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7> + 2704969257U, // <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS + 3846148050U, // <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0> + 2326203282U, // <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1> + 3291746027U, // <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3> + 3376054482U, // <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3> + 3790655366U, // <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5> + 2785824772U, // <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5> + 2724876386U, // <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0> + 3858903057U, // <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0> + 2736820484U, // <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0> + 2659467366U, // <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS + 3859566643U, // <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7> + 3798618618U, // <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3> + 3852857410U, // <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4> + 2659470646U, // <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS + 2659471458U, // <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0> + 3832729696U, // <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7> + 1712083042U, // <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0> + 1712156779U, // <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0> + 2731512826U, // <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2> + 3859566717U, // <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0> + 3798619284U, // <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3> + 3778712803U, // <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1> + 2728858936U, // <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5> + 3859566753U, // <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0> + 3377398135U, // <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6> + 3798619686U, // <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0> + 2731513468U, // <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, 
<7,u,0,5> + 1567826022U, // <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS + 2704971566U, // <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 2220076779U, // <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3> + 2641569942U, // <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2> + 1567828889U, // <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u> + 1146335236U, // <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1146335330U, // <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 1713410308U, // <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0> + 1713484045U, // <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0> + 2214596949U, // <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0> + 2214678951U, // <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1> + 2214597114U, // <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3> + 3852857653U, // <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4> + 3832729919U, // <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5> + 3721293427U, // <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0> + 2214597432U, // <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6> + 1207962934U, // <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS + 1207962935U, // <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS + 2215432481U, // <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2> + 2215432615U, // <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1> + 1141690874U, // <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2215432754U, // <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5> + 2215432817U, // <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5> + 2215432939U, // <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1> + 1141691192U, // <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1221905718U, // <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS + 1221905719U, // <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS + 3852857787U, // <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3> + 3289764265U, // <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3> + 3289690618U, // <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3> + 3862589907U, // <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0> + 3733253430U, // <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS + 3733254242U, // 
<0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0> + 3777390522U, // <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7> + 2785825274U, // <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3> + 2785825283U, // <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3> + 3777390742U, // <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2> + 3863106066U, // <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0> + 3777390899U, // <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6> + 3290436146U, // <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5> + 3779381762U, // <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6> + 3779381798U, // <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6> + 3733262920U, // <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0> + 2300972342U, // <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS + 2300972343U, // <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS + 3802606482U, // <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1> + 2217365931U, // <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5> + 2217366010U, // <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3> + 3291107890U, // <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5> + 3291099805U, // <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4> + 3777391926U, // <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS + 2217366328U, // <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6> + 2291027254U, // <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS + 2291027255U, // <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS + 3852858033U, // <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6> + 3395964532U, // <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1> + 3864507069U, // <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0> + 3376056678U, // <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3> + 3721334070U, // <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS + 3395964860U, // <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5> + 3864802017U, // <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0> + 2302315830U, // <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS + 2302315831U, // <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS + 3852858108U, // <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, 
<6,6,0,0> + 3398624745U, // <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1> + 2218668538U, // <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3> + 3292418610U, // <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5> + 3733286198U, // <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS + 3797299889U, // <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6> + 2785825592U, // <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6> + 2785825602U, // <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7> + 2785825611U, // <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7> + 2785825614U, // <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1> + 2758988632U, // <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2> + 3377400084U, // <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2> + 2792166248U, // <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0> + 2785825654U, // <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5> + 2785825664U, // <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6> + 3859567493U, // <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2> + 2303659318U, // <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS + 2303659319U, // <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS + 2785825695U, // <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1> + 2220077479U, // <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1> + 1146335738U, // <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2792829881U, // <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0> + 2785825735U, // <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5> + 2785825664U, // <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6> + 1146336056U, // <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1221963062U, // <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS + 1221963063U, // <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS + 2653593600U, // <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0> + 2706309222U, // <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 3709421498U, // <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7> + 2281705978U, // <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3> + 2785825816U, // <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5> + 2785825826U, // 
<0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6> + 2653598037U, // <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0> + 2214598252U, // <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7> + 2706309789U, // <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 1141691386U, // <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 2215433290U, // <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1> + 2706310038U, // <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0> + 2322190842U, // <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3> + 1141691750U, // <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2215433654U, // <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5> + 2653606230U, // <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1> + 1141692012U, // <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1141692034U, // <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2> + 2785825940U, // <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3> + 3768108576U, // <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2> + 3780052584U, // <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2> + 2794820780U, // <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0> + 3859641528U, // <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3> + 3733327970U, // <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0> + 3778062266U, // <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7> + 3733328944U, // <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2> + 2795189465U, // <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0> + 2324861026U, // <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0> + 3780053233U, // <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3> + 3780053296U, // <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3> + 3778062725U, // <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7> + 3780053506U, // <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6> + 3803941469U, // <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7> + 2706311800U, // <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7> + 3398603586U, // <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7> + 2707639066U, // <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7> + 2217366522U, // <0,7,4,0>: Cost 3 vmrghw 
<0,4,1,5>, <7,0,1,2> + 3727369110U, // <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0> + 3291108500U, // <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3> + 3727370872U, // <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7> + 2217366886U, // <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6> + 2706312502U, // <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 3786026321U, // <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7> + 2217367148U, // <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7> + 2706312745U, // <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 2322223202U, // <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0> + 3399946987U, // <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1> + 3291780244U, // <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3> + 3727378582U, // <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2> + 3727379766U, // <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS + 3859568054U, // <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5> + 2785826241U, // <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7> + 3395965762U, // <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7> + 2787153363U, // <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7> + 2785826268U, // <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7> + 3780055420U, // <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3> + 3859568110U, // <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7> + 3874534903U, // <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7> + 3859641856U, // <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7> + 3733360738U, // <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0> + 3859568145U, // <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6> + 2797770260U, // <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0> + 2797843997U, // <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0> + 2785826342U, // <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0> + 3727393686U, // <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0> + 3868563003U, // <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3> + 3377397988U, // <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3> + 2219349350U, // <0,7,7,4>: Cost 3 vmrghw 
<0,7,1,4>, <7,4,5,6> + 3859568217U, // <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6> + 2730202588U, // <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7> + 2785826412U, // <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7> + 2731529854U, // <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7> + 1146336250U, // <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 2706315054U, // <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 2653660845U, // <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u> + 2322248186U, // <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3> + 1146336614U, // <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2706315418U, // <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 2653663581U, // <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u> + 1146336876U, // <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1146336898U, // <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2> + 202162278U, // <0,u,0,0>: Cost 1 vspltisw0 LHS + 1624612966U, // <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS + 2629780986U, // <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0> + 1207959708U, // <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS + 1544097078U, // <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS + 1140856986U, // <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS + 2698355253U, // <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7> + 1207962952U, // <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS + 202162278U, // <0,u,0,u>: Cost 1 vspltisw0 LHS + 1142134483U, // <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2> + 67950382U, // <0,u,1,1>: Cost 1 vmrghw LHS, LHS + 1142175624U, // <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3> + 1142175676U, // <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1> + 1142134847U, // <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6> + 67950746U, // <0,u,1,5>: Cost 1 vmrghw LHS, RHS + 1142175952U, // <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7> + 1221905736U, // <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS + 67950949U, // <0,u,1,u>: Cost 1 vmrghw LHS, LHS + 1562026086U, // <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS + 2216015662U, // <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS + 2698356328U, // <0,u,2,2>: Cost 3 
vsldoi8 <2,3,0,u>, <2,2,2,2> + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1562029366U, // <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS + 2216016026U, // <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS + 2698356666U, // <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7> + 1585919033U, // <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2758989756U, // <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1> + 2216662830U, // <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS + 2703665461U, // <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u> + 2758989782U, // <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0> + 2758989796U, // <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5> + 2216663194U, // <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS + 2706319993U, // <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u> + 2300972360U, // <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS + 2216663397U, // <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS + 2217367251U, // <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2> + 1143625518U, // <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS + 2217367432U, // <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3> + 2217367484U, // <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1> + 1143619922U, // <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1143625882U, // <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS + 2217367760U, // <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7> + 2291027272U, // <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS + 1143626085U, // <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS + 2635792486U, // <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS + 2635793302U, // <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0> + 2302314646U, // <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2> + 2635794648U, // <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5> + 2635795766U, // <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS + 2717601754U, // <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u> + 1685248154U, // <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2302315848U, // <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS + 1685248172U, // <0,u,5,u>: Cost 
2 vsldoi12 <1,2,3,0>, RHS + 2759358645U, // <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7> + 2218637102U, // <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS + 2724901370U, // <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3> + 2758990032U, // <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7> + 2659691830U, // <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS + 2659471458U, // <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0> + 2724901688U, // <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6> + 1651159893U, // <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u> + 1651823526U, // <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u> + 2785827072U, // <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1> + 2803964168U, // <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0> + 2727556249U, // <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u> + 2303656092U, // <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS + 2785827112U, // <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5> + 2785827122U, // <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6> + 2730210781U, // <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u> + 2303659336U, // <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS + 2303656097U, // <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS + 202162278U, // <0,u,u,0>: Cost 1 vspltisw0 LHS + 72595246U, // <0,u,u,1>: Cost 1 vmrghw LHS, LHS + 1146337160U, // <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3> + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1146337343U, // <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6> + 72595610U, // <0,u,u,5>: Cost 1 vmrghw LHS, RHS + 1146337488U, // <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7> + 1221963080U, // <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2756853760U, // <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0> + 1677803530U, // <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1> + 3759497387U, // <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0> + 2686419196U, // <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0> + 2751766565U, // <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1> + 2687746462U, // <1,0,0,5>: Cost 3 
vsldoi8 <0,5,1,0>, <0,5,1,0> + 3776086518U, // <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7> + 2689073728U, // <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0> + 1678319689U, // <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1> + 2287091712U, // <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0> + 1147568230U, // <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS + 1683112038U, // <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 3294970108U, // <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0> + 2623892790U, // <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS + 2647781007U, // <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1> + 2791948430U, // <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7> + 3721524218U, // <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2> + 1683112092U, // <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 2222112768U, // <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0> + 1148371046U, // <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS + 3356862524U, // <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2> + 2702345894U, // <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1> + 2222113106U, // <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5> + 2299709908U, // <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5> + 3760162746U, // <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7> + 3369470584U, // <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7> + 1148371613U, // <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS + 2686421142U, // <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2> + 2283128486U, // <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1> + 3296305326U, // <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3> + 3760163199U, // <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1> + 3760163330U, // <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6> + 3779406377U, // <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0> + 3865690416U, // <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7> + 3366824568U, // <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7> + 2707655452U, // <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0> + 2734861202U, // <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1> 
+ 2756854098U, // <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5> + 3830595931U, // <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5> + 3296968960U, // <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4> + 3830595949U, // <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5> + 2686422326U, // <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 3297378806U, // <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7> + 3810594248U, // <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0> + 2686422569U, // <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 2284470272U, // <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0> + 2284471974U, // <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1> + 3809267435U, // <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3> + 3297968384U, // <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4> + 2284471977U, // <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4> + 3721555603U, // <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5> + 3792679010U, // <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0> + 3792679037U, // <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0> + 2284471981U, // <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u> + 3356893184U, // <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0> + 2224676966U, // <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS + 3298295985U, // <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6> + 3298345212U, // <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0> + 2224972114U, // <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5> + 3808604907U, // <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1> + 3799978808U, // <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6> + 2726237006U, // <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1> + 2224677522U, // <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1> + 2726237176U, // <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0> + 2285815462U, // <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1> + 3805951193U, // <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0> + 3807941859U, // <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1> + 3799979366U, // <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6> + 
3803297165U, // <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0> + 3799979540U, // <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0> + 3799979628U, // <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7> + 2731546240U, // <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0> + 2284494848U, // <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0> + 1683112594U, // <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1> + 1683112605U, // <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 2734200772U, // <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0> + 2757075629U, // <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1> + 2686425242U, // <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 2791948430U, // <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7> + 2736855304U, // <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0> + 1683112659U, // <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 1610694666U, // <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1> + 1616003174U, // <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS + 2283767958U, // <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2> + 3357507596U, // <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3> + 2689745234U, // <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5> + 3357507922U, // <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5> + 3294397647U, // <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7> + 3373433334U, // <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7> + 1616003730U, // <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1> + 1550221414U, // <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,1,1,1>: Cost 1 vspltisw1 LHS + 2287093910U, // <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2> + 2287092615U, // <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3> + 1550224694U, // <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 2287092050U, // <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5> + 2689746127U, // <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7> + 2659800138U, // <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1> + 269271142U, // <1,1,1,u>: Cost 1 vspltisw1 LHS + 2222113516U, // <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, 
<1,0,2,1> + 2756854663U, // <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3> + 1148371862U, // <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689746598U, // <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1> + 2618002742U, // <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS + 2299707730U, // <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5> + 2689746874U, // <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7> + 3361506511U, // <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7> + 1148371862U, // <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689747094U, // <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2> + 2691074278U, // <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1> + 3356870806U, // <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2> + 2283126958U, // <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3> + 2689747458U, // <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6> + 3356868946U, // <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5> + 3811265144U, // <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7> + 3362841807U, // <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7> + 2689747742U, // <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2> + 2623987814U, // <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS + 2758181931U, // <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5> + 2223408022U, // <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0> + 3697731734U, // <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2> + 2283798784U, // <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4> + 1616006454U, // <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 3297379535U, // <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7> + 3373466102U, // <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7> + 1616006697U, // <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 2760762479U, // <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1> + 2284470282U, // <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1> + 2284472470U, // <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2> + 3358212270U, // <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3> + 2284470285U, // <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4> + 
1210728786U, // <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 2737524834U, // <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0> + 3360867535U, // <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7> + 1210728786U, // <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 3697746022U, // <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS + 2756854991U, // <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7> + 2737525242U, // <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3> + 3839149281U, // <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7> + 3697749302U, // <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS + 3356893522U, // <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5> + 2283151537U, // <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6> + 2791949566U, // <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0> + 2792613127U, // <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0> + 2737525754U, // <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2> + 2291786386U, // <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1> + 3365528292U, // <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2> + 3365528455U, // <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3> + 2737526118U, // <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6> + 3365527890U, // <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5> + 3365528377U, // <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6> + 2291786959U, // <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7> + 2737526402U, // <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2> + 1550221414U, // <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,1,u,1>: Cost 1 vspltisw1 LHS + 1148371862U, // <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689750972U, // <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1> + 1550224694U, // <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 1616009370U, // <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 2689751248U, // <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7> + 2736863497U, // <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1> + 269271142U, // <1,1,u,u>: Cost 1 vspltisw1 LHS + 2702360576U, // <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, 
<0,0,0,0> + 1628618854U, // <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS + 2685771949U, // <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2> + 2283765862U, // <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS + 2702360914U, // <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5> + 3788046813U, // <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0> + 2688426481U, // <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2> + 2726249024U, // <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0> + 1628619421U, // <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS + 2690417380U, // <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2> + 2702361396U, // <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1> + 2287093352U, // <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2> + 1213349990U, // <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS + 3764159522U, // <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5> + 3295053672U, // <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6> + 2221311930U, // <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7> + 3799991593U, // <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7> + 1213349995U, // <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS + 2624045158U, // <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS + 2702362144U, // <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2> + 2283120232U, // <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2> + 1225965670U, // <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS + 2624048438U, // <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS + 3356860763U, // <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5> + 2222114746U, // <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7> + 2299708632U, // <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7> + 1225965675U, // <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS + 470597734U, // <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS + 1544340276U, // <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544341096U, // <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544341916U, // <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3> + 470601014U, // <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS + 1592119300U, // <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1592119802U, // 
<1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592120314U, // <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 470603566U, // <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS + 2708335471U, // <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2> + 3838043908U, // <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5> + 3357541992U, // <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2> + 2283798630U, // <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS + 2726251728U, // <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4> + 1628622134U, // <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 3297077178U, // <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7> + 2726251976U, // <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0> + 1628622377U, // <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 2714308168U, // <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2> + 3297633827U, // <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5> + 2284471912U, // <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2> + 1210728550U, // <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS + 3776106420U, // <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6> + 2726252548U, // <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5> + 2726252642U, // <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0> + 3799994538U, // <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0> + 1210728555U, // <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS + 2720280865U, // <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2> + 2702365096U, // <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2> + 2726253050U, // <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3> + 2283151462U, // <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS + 3697823030U, // <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS + 3298715497U, // <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7> + 2726253368U, // <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6> + 2724926296U, // <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2> + 2283151467U, // <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS + 1652511738U, // <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2> + 3371500916U, // <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1> + 3365529192U, // 
<1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2> + 2291785830U, // <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS + 2726253926U, // <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6> + 3788051845U, // <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1> + 3794023894U, // <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1> + 2726254119U, // <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1> + 1657820802U, // <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2> + 470638699U, // <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS + 1544381236U, // <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544382056U, // <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544382614U, // <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 470641974U, // <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS + 1628625050U, // <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 1592160762U, // <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592161274U, // <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 470644526U, // <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS + 2769389708U, // <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1> + 2685780070U, // <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2685780142U, // <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3> + 2686443775U, // <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3> + 2769684656U, // <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1> + 3357507940U, // <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5> + 3759522294U, // <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7> + 3357509562U, // <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7> + 2685780637U, // <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2287092630U, // <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0> + 2221312230U, // <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1> + 2691752839U, // <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3> + 2287093362U, // <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3> + 2287092634U, // <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4> + 3360835107U, // <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5> + 3759523041U, // <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7> + 2287093690U, // <1,3,1,7>: Cost 3 
vmrglw <0,u,1,1>, <2,6,3,7> + 2287092638U, // <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u> + 2222114966U, // <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2> + 2222115057U, // <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3> + 2630092320U, // <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2> + 2685781670U, // <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1> + 2222115330U, // <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6> + 3373449572U, // <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5> + 2222115448U, // <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7> + 2299709370U, // <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7> + 2222115614U, // <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2> + 2771380607U, // <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1> + 3356874468U, // <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1> + 3759524168U, // <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0> + 2283792796U, // <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3> + 3356869530U, // <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4> + 3721760428U, // <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3> + 3296496248U, // <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7> + 3356870586U, // <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7> + 2771970503U, // <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1> + 2772044240U, // <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1> + 3362186135U, // <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1> + 3297151280U, // <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3> + 3357542002U, // <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3> + 3357540626U, // <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4> + 2685783350U, // <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 3357546622U, // <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6> + 3357542330U, // <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7> + 2685783593U, // <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 2284471190U, // <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0> + 3358213015U, // <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1> + 2630116899U, // <1,3,5,2>: Cost 3 vsldoi4 
<2,1,3,5>, <2,1,3,5> + 2284471922U, // <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3> + 2284471194U, // <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4> + 2284471843U, // <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5> + 3358218366U, // <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6> + 2284472250U, // <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7> + 2284471198U, // <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u> + 2224752790U, // <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2> + 3832736385U, // <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7> + 3703866916U, // <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6> + 3356894834U, // <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3> + 3356894106U, // <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4> + 3356894755U, // <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5> + 3356899130U, // <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6> + 2283153338U, // <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7> + 2283153338U, // <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7> + 2774035139U, // <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1> + 3703874767U, // <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7> + 3703875109U, // <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7> + 3365529202U, // <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3> + 3365528474U, // <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4> + 3789387159U, // <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1> + 3865692927U, // <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7> + 3363538874U, // <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7> + 2774625035U, // <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1> + 2284495766U, // <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0> + 2685785902U, // <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2630141478U, // <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u> + 2283169880U, // <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3> + 2284495770U, // <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4> + 2685786266U, // <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 2222115448U, // <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, 
<3,6,0,7> + 2284496826U, // <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7> + 2685786469U, // <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2684461069U, // <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4> + 2686451814U, // <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS + 3759530159U, // <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4> + 2686451968U, // <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4> + 2684461394U, // <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5> + 1701989266U, // <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1> + 3776119286U, // <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7> + 2689106500U, // <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4> + 1702210477U, // <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1> + 2221312914U, // <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1> + 2691097399U, // <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4> + 3760194454U, // <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0> + 3766166489U, // <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4> + 2334870736U, // <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4> + 1147571510U, // <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS + 3760194794U, // <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7> + 3867315188U, // <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0> + 1147571753U, // <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS + 2222115730U, // <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1> + 2222115812U, // <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2> + 3760195176U, // <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2> + 2702378662U, // <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1> + 2323598544U, // <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4> + 1148374326U, // <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS + 3760195514U, // <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7> + 3373451932U, // <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7> + 1148374569U, // <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS + 2702379160U, // <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4> + 3760195840U, // <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0> + 3776121160U, // 
<1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0> + 3760195996U, // <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3> + 2686454274U, // <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6> + 3356870350U, // <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5> + 3800009392U, // <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0> + 3366824604U, // <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7> + 2707688224U, // <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4> + 2775731368U, // <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0> + 3830820018U, // <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1> + 3691980454U, // <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1> + 3357541282U, // <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3> + 2781039824U, // <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4> + 2686455094U, // <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS + 3357541528U, // <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6> + 3810627020U, // <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4> + 2686455337U, // <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS + 2624217190U, // <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS + 2284470309U, // <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1> + 2618246822U, // <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1> + 3358212297U, // <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3> + 2284470312U, // <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4> + 2284470637U, // <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5> + 1683115318U, // <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 3721851898U, // <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2> + 1683115336U, // <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 3794039075U, // <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4> + 3830820186U, // <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7> + 3800011258U, // <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3> + 3807973938U, // <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5> + 3298716880U, // <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4> + 2224680246U, // <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS + 3800011576U, // <1,4,6,6>: Cost 4 
vsldoi8 <7,0,1,4>, <6,6,6,6> + 2726269774U, // <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1> + 2224680489U, // <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS + 2726269948U, // <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4> + 3383444141U, // <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1> + 3805983961U, // <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0> + 3807974667U, // <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5> + 2736887142U, // <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6> + 3365528403U, // <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5> + 3800012308U, // <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0> + 3800012396U, // <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7> + 2731579012U, // <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4> + 2624241766U, // <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS + 2686457646U, // <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS + 2618271398U, // <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1> + 2734233544U, // <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4> + 2689775679U, // <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6> + 1152355638U, // <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS + 1683115561U, // <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 2736888076U, // <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4> + 1683115579U, // <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 2687123456U, // <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0> + 1613381734U, // <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 3759538352U, // <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5> + 3760865532U, // <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0> + 1613381970U, // <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5> + 2687787427U, // <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5> + 2781777524U, // <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1> + 3733828717U, // <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0> + 1613382301U, // <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 2781040271U, // <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1> + 2687124276U, // <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1> + 
2687124374U, // <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0> + 3760866297U, // <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0> + 2693096491U, // <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5> + 2687124591U, // <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1> + 2687124723U, // <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7> + 3360834803U, // <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7> + 2687124860U, // <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0> + 2323598792U, // <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0> + 2687125027U, // <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5> + 2687125096U, // <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2> + 2687125158U, // <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1> + 2642185188U, // <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2> + 2323598554U, // <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5> + 2687125434U, // <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7> + 3373450483U, // <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7> + 2687125563U, // <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1> + 2687125654U, // <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2> + 2312990234U, // <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1> + 3760867649U, // <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2> + 2687125916U, // <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3> + 2687126018U, // <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6> + 3386731738U, // <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5> + 3356871170U, // <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6> + 3808643779U, // <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1> + 2687126302U, // <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2> + 2642198630U, // <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS + 2687126498U, // <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0> + 3715941923U, // <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5> + 3709970701U, // <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4> + 2687126736U, // <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4> + 1613385014U, // <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 
2283801090U, // <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6> + 3733861489U, // <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4> + 1613385257U, // <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2624290918U, // <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS + 2624291676U, // <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5> + 3698034211U, // <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5> + 2284471211U, // <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3> + 2624294198U, // <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS + 2284471132U, // <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5> + 2284472834U, // <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6> + 2284471539U, // <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7> + 2284471216U, // <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u> + 2785316900U, // <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1> + 2781040691U, // <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7> + 2734903802U, // <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3> + 3848736834U, // <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4> + 3298717620U, // <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6> + 3298717700U, // <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5> + 2734904120U, // <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6> + 2781040738U, // <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0> + 2781040747U, // <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0> + 2734904314U, // <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2> + 2315677210U, // <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1> + 3808646292U, // <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3> + 3808646371U, // <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1> + 2734904678U, // <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6> + 3389418714U, // <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5> + 3365528656U, // <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6> + 2734904940U, // <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7> + 2734904962U, // <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2> + 2687129299U, // <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2> + 
1613387566U, // <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 2687129480U, // <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3> + 2687129532U, // <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1> + 1661163546U, // <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5> + 1613387930U, // <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2687129808U, // <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7> + 2781040900U, // <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0> + 1613388133U, // <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 3759546368U, // <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0> + 2685804646U, // <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 2685804721U, // <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6> + 3861270834U, // <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1> + 3759546706U, // <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5> + 2687795620U, // <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6> + 2688459253U, // <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6> + 2283769142U, // <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS + 2685805213U, // <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 3698073702U, // <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS + 3759547188U, // <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1> + 2221314554U, // <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3> + 3759547401U, // <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7> + 3698076982U, // <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS + 3767510141U, // <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6> + 2334872376U, // <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6> + 1213353270U, // <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS + 1213353271U, // <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS + 3704053862U, // <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS + 3759547961U, // <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0> + 2222117370U, // <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3> + 3759548070U, // <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1> + 3704057142U, // <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS + 3373451057U, // <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, 
<2,4,6,5> + 2685806522U, // <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7> + 1225968950U, // <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS + 1225968951U, // <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS + 3759548566U, // <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2> + 3842912793U, // <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7> + 3759548774U, // <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3> + 3759548828U, // <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3> + 3759548930U, // <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6> + 3809315421U, // <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7> + 3386733368U, // <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6> + 2283130166U, // <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS + 2283130167U, // <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS + 3704070246U, // <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS + 3862229608U, // <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5> + 3704071741U, // <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4> + 3721988610U, // <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6> + 3704073526U, // <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS + 2685807926U, // <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 3865621141U, // <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5> + 2283801910U, // <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS + 2685808169U, // <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 3710050406U, // <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS + 3710051571U, // <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7> + 3405989597U, // <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2> + 3358214502U, // <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3> + 3710053686U, // <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS + 3721998025U, // <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5> + 2332250936U, // <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6> + 1210731830U, // <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS + 1210731831U, // <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS + 2791289597U, // <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1> + 3698115430U, // <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, 
<1,1,6,6> + 3698116538U, // <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7> + 3356894132U, // <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3> + 3698117942U, // <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS + 3722006218U, // <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6> + 2781041464U, // <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6> + 2283154742U, // <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS + 2283154743U, // <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS + 1718211406U, // <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1> + 2792026967U, // <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1> + 2765411170U, // <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3> + 3854783336U, // <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0> + 2781041526U, // <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5> + 3365528664U, // <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5> + 2791953290U, // <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7> + 2291789110U, // <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS + 1718801302U, // <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1> + 1718875039U, // <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1> + 2685810478U, // <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 2792764337U, // <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1> + 3759552444U, // <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1> + 2781041607U, // <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5> + 2685810842U, // <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 2689792208U, // <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7> + 1210756406U, // <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS + 1210756407U, // <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS + 2793280496U, // <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1> + 2694439014U, // <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 3393343912U, // <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2> + 3397325306U, // <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3> + 2793575444U, // <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1> + 3722030797U, // <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0> + 2688467446U, // 
<1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7> + 2689131079U, // <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7> + 2694439570U, // <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1> + 2654265354U, // <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1> + 2794017866U, // <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1> + 3768181639U, // <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3> + 2334872058U, // <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3> + 2654268726U, // <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS + 3792069797U, // <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1> + 2694440143U, // <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7> + 2334872386U, // <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7> + 2695767409U, // <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7> + 2654273638U, // <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS + 2222117973U, // <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3> + 2299711912U, // <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2> + 2654275734U, // <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2> + 2654276918U, // <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS + 3385397675U, // <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5> + 2654278056U, // <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2> + 2323599627U, // <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7> + 2654279470U, // <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS + 2795271395U, // <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1> + 3768183059U, // <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1> + 3728025254U, // <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1> + 3768183196U, // <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3> + 3768183298U, // <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6> + 3792071255U, // <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1> + 3780127361U, // <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7> + 3847779617U, // <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0> + 2795861291U, // <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1> + 2795935028U, // <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1> + 3728032975U, // 
<1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7> + 3839153480U, // <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3> + 3397358074U, // <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3> + 3854783835U, // <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4> + 2694442294U, // <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 3786100058U, // <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7> + 3722065254U, // <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6> + 2694442537U, // <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 2654298214U, // <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS + 3854783893U, // <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u> + 3710126010U, // <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7> + 2332250618U, // <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3> + 2654301494U, // <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS + 2284474795U, // <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5> + 2718330931U, // <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7> + 2332250946U, // <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7> + 2719658197U, // <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7> + 2332921954U, // <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0> + 3768185254U, // <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0> + 3710134202U, // <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7> + 3710134561U, // <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6> + 3710135606U, // <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS + 3864884745U, // <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7> + 3854784017U, // <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6> + 2791953940U, // <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0> + 2792617501U, // <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0> + 2797925927U, // <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1> + 3365528426U, // <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1> + 3728058022U, // <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1> + 3365528509U, // <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3> + 3854784079U, // <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5> + 3722088148U, // <1,7,7,5>: 
Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7> + 3728060845U, // <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7> + 2781042284U, // <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7> + 2798515823U, // <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1> + 2654322705U, // <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u> + 2694444846U, // <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 2299711912U, // <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2> + 2323649018U, // <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3> + 2654326070U, // <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS + 2694445210U, // <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 2654327214U, // <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u> + 2323649346U, // <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7> + 2694445413U, // <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 1610752017U, // <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u> + 1613406310U, // <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS + 2685821107U, // <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u> + 2283765916U, // <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS + 1613406549U, // <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u> + 1725880054U, // <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1> + 2688475639U, // <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u> + 2283769160U, // <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS + 1613406877U, // <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS + 1550221414U, // <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,u,1,1>: Cost 1 vspltisw1 LHS + 1683117870U, // <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 1213350044U, // <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS + 1550224694U, // <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 1147574426U, // <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS + 2687149326U, // <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7> + 1213353288U, // <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS + 269271142U, // <1,u,1,u>: Cost 1 vspltisw1 LHS + 2222118611U, // <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2> + 1148376878U, // <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS + 
1148371862U, // <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 1225965724U, // <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS + 2222118975U, // <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6> + 1148377242U, // <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS + 2687150010U, // <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7> + 1225968968U, // <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS + 1148377445U, // <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS + 471040156U, // <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS + 1544782644U, // <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544783464U, // <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544784022U, // <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 471043382U, // <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS + 1592561668U, // <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1592562170U, // <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592562682U, // <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 471045934U, // <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS + 2708384629U, // <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u> + 2687151101U, // <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0> + 2223408022U, // <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0> + 2283798684U, // <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS + 2642422785U, // <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4> + 1613409590U, // <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 2283801090U, // <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6> + 2283801928U, // <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS + 1613409833U, // <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 2284471235U, // <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0> + 2284472046U, // <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1> + 2284472533U, // <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2> + 1210728604U, // <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS + 2284471239U, // <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4> + 1210728786U, // <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 1683118234U, // <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 1210731848U, // <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS + 
1210728609U, // <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS + 2720330023U, // <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u> + 2757376190U, // <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7> + 2726302202U, // <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3> + 2283151516U, // <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS + 2224972114U, // <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5> + 2224683162U, // <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS + 2726302520U, // <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6> + 2283154760U, // <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS + 2283151521U, // <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS + 1652560896U, // <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u> + 2333590225U, // <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1> + 2765412628U, // <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3> + 2291785884U, // <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS + 2781042984U, // <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5> + 3365527953U, // <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5> + 2791954748U, // <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7> + 2291789128U, // <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS + 1657869960U, // <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u> + 471081121U, // <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS + 269271142U, // <1,u,u,1>: Cost 1 vspltisw1 LHS + 1544824424U, // <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544824982U, // <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 471084342U, // <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS + 1613412506U, // <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 1683118477U, // <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 1210756424U, // <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS + 471086894U, // <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS + 2226757632U, // <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0> + 2226757734U, // <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS + 3826622483U, // <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1> + 3843211292U, // <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1> + 3300499794U, // <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, 
<0,4,1,5> + 3356256724U, // <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5> + 3825664056U, // <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2> + 3762889289U, // <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0> + 2226758301U, // <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS + 2227429386U, // <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1> + 2227429478U, // <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS + 1691156582U, // <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2666358997U, // <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2> + 2227462482U, // <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5> + 3722186464U, // <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1> + 3867099278U, // <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7> + 3366881912U, // <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7> + 1691156636U, // <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2228027392U, // <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0> + 1154285670U, // <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS + 2228027565U, // <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2> + 3301769468U, // <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0> + 2228027730U, // <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5> + 3301769635U, // <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5> + 3780806586U, // <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7> + 3368880760U, // <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7> + 1154286237U, // <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS + 1213440000U, // <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1213441702U, // <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 2228535470U, // <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3> + 2636515632U, // <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3> + 2287182962U, // <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4> + 2660405346U, // <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0> + 2228535798U, // <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7> + 2660406420U, // <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3> + 1213441709U, // <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 3368894464U, // <2,0,4,0>: Cost 4 vmrglw 
<2,2,2,4>, <0,0,0,0> + 2764898642U, // <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5> + 3826622811U, // <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5> + 3843211620U, // <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5> + 3838640493U, // <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5> + 2732944694U, // <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS + 3797396857U, // <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2> + 3867099528U, // <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5> + 2764898705U, // <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5> + 3364257792U, // <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0> + 2230124646U, // <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS + 3304235184U, // <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5> + 3364260144U, // <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3> + 3303817554U, // <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5> + 3364260146U, // <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5> + 3867099602U, // <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7> + 3364260472U, // <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7> + 2230125213U, // <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS + 2230796288U, // <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0> + 1157054566U, // <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS + 2230796465U, // <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6> + 3304538364U, // <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0> + 2230796626U, // <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5> + 3797398205U, // <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0> + 3304538614U, // <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7> + 3798725471U, // <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0> + 1157055133U, // <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS + 3371573248U, // <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0> + 2231189606U, // <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS + 3801380003U, // <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0> + 3802043636U, // <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0> + 3806688614U, // <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6> + 
3356317308U, // <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5> + 3804034535U, // <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0> + 3806688876U, // <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7> + 2231190173U, // <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS + 1208836096U, // <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1208837798U, // <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 1691157149U, // <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2636556597U, // <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u> + 2282579625U, // <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4> + 2660446306U, // <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0> + 2228535798U, // <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7> + 2660447385U, // <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u> + 1208837805U, // <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 3692388523U, // <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0> + 2757526244U, // <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2> + 2330290974U, // <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2> + 3843212020U, // <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0> + 3692391734U, // <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS + 3300533362U, // <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4> + 3794084337U, // <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2> + 3374170614U, // <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7> + 2758042403U, // <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2> + 2690482924U, // <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1> + 2764899124U, // <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1> + 2695791510U, // <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0> + 3362235271U, // <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3> + 3692399926U, // <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS + 3832226649U, // <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2> + 3301205235U, // <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7> + 3768870179U, // <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1> + 2695791988U, // <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1> + 2618663085U, // <2,1,2,0>: Cost 3 
vsldoi4 <0,2,1,2>, <0,2,1,2> + 2228028212U, // <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1> + 2618664552U, // <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2> + 2759000984U, // <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2> + 2618666294U, // <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS + 2295136594U, // <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5> + 3769534376U, // <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7> + 2793358266U, // <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0> + 2618668846U, // <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS + 2282536969U, // <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1208795146U, // <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1213442198U, // <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2287181998U, // <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2618674486U, // <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS + 1208795474U, // <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2287182001U, // <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287183055U, // <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1208795153U, // <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 3692421295U, // <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4> + 3838641195U, // <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5> + 2330323742U, // <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2> + 3692423318U, // <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2> + 3692424502U, // <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS + 2695793974U, // <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS + 3799395705U, // <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2> + 3368895695U, // <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7> + 2695794217U, // <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS + 3692429488U, // <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5> + 3364257802U, // <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1> + 3692431253U, // <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6> + 3692431874U, // <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6> + 3692432694U, // <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS + 3364258130U, // <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, 
<0,4,1,5> + 3303875827U, // <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7> + 3867100333U, // <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0> + 3692435246U, // <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS + 2618695857U, // <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6> + 2230797108U, // <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1> + 2618697658U, // <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7> + 3692439702U, // <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2> + 2618699062U, // <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS + 3364929874U, // <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5> + 3692442424U, // <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6> + 3798733664U, // <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1> + 2618701614U, // <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS + 3799397370U, // <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2> + 3371573258U, // <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1> + 2330351234U, // <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2> + 3799397658U, // <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2> + 3799397734U, // <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6> + 3371573586U, // <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5> + 3799397870U, // <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7> + 3799397956U, // <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3> + 2330351234U, // <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2> + 2282577929U, // <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1208836106U, // <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1208838294U, // <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2282578094U, // <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2282577933U, // <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4> + 1208836434U, // <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2282578097U, // <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287224015U, // <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1208836113U, // <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 2226759117U, // <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0> + 1624047718U, // <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 
2697789613U, // <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2> + 2226767526U, // <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1> + 2697789778U, // <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5> + 3300657000U, // <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6> + 2226988986U, // <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7> + 3734271139U, // <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0> + 1624048285U, // <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 3831268868U, // <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1> + 2293138804U, // <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1> + 2697790358U, // <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0> + 2293137510U, // <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS + 3771532331U, // <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5> + 3767551106U, // <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2> + 3301173178U, // <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7> + 3372853169U, // <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7> + 2293137515U, // <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS + 1556938854U, // <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 2295137733U, // <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1> + 336380006U, // <2,2,2,2>: Cost 1 vspltisw2 LHS + 1221394534U, // <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS + 1556942134U, // <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 2295138061U, // <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5> + 2228029370U, // <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7> + 2660545701U, // <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2> + 336380006U, // <2,2,2,u>: Cost 1 vspltisw2 LHS + 2697791638U, // <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2> + 2765489840U, // <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2> + 1213441640U, // <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2> + 135053414U, // <2,2,3,3>: Cost 1 vmrglw LHS, LHS + 2697792002U, // <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6> + 2330313780U, // <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5> + 2287183549U, // <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6> + 2660553894U, // <2,2,3,7>: Cost 
3 vsldoi4 <7,2,2,3>, <7,2,2,3> + 135053419U, // <2,2,3,u>: Cost 1 vmrglw LHS, LHS + 2630697062U, // <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS + 3771534282U, // <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3> + 2764900109U, // <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5> + 2295152742U, // <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS + 2295154282U, // <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4> + 1624050998U, // <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 2229675962U, // <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7> + 3368896433U, // <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7> + 1624051241U, // <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 3771534920U, // <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2> + 3364258540U, // <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1> + 2296489576U, // <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2> + 2290516070U, // <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS + 3771535284U, // <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6> + 2290517044U, // <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5> + 2697793634U, // <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0> + 3370231729U, // <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7> + 2290516075U, // <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS + 2230797801U, // <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1> + 3304539679U, // <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1> + 2764900273U, // <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7> + 2764900282U, // <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7> + 2230798129U, // <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5> + 3304540008U, // <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6> + 1157056442U, // <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2725000033U, // <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2> + 1157056442U, // <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2793359338U, // <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1> + 3371574725U, // <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1> + 2297833064U, // <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2> + 2297831526U, // 
<2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS + 2697794918U, // <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6> + 3371575053U, // <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5> + 3304933297U, // <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7> + 2297833393U, // <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7> + 2297831531U, // <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS + 1556938854U, // <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 1624053550U, // <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 336380006U, // <2,2,u,2>: Cost 1 vspltisw2 LHS + 135094374U, // <2,2,u,3>: Cost 1 vmrglw LHS, LHS + 1556942134U, // <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 1624053914U, // <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 1157056442U, // <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2660594859U, // <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u> + 135094379U, // <2,2,u,u>: Cost 1 vmrglw LHS, LHS + 1611448320U, // <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 537706598U, // <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS + 2689835181U, // <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2689835260U, // <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611448658U, // <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2732966354U, // <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7> + 2732966390U, // <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7> + 2660603052U, // <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0> + 537707165U, // <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS + 2689835748U, // <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2> + 1611449140U, // <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611449238U, // <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 3763577805U, // <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1> + 2689836112U, // <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6> + 2689836143U, // <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2689836239U, // <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 3366881210U, // <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7> + 1616094588U, // <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 2689836493U, // <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0> + 
2685191711U, // <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1> + 1611449960U, // <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2> + 1611450022U, // <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 2689836822U, // <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5> + 2689836904U, // <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6> + 1611450298U, // <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 2295138234U, // <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7> + 1611450456U, // <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3> + 1213440918U, // <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 2282538527U, // <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1> + 1557022322U, // <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3> + 1208796786U, // <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3> + 1213440922U, // <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 2282538531U, // <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5> + 2287188094U, // <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6> + 1213441978U, // <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 1208796791U, // <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u> + 1551056998U, // <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS + 1551057818U, // <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4> + 2624800360U, // <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2> + 2624800918U, // <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2> + 1551060278U, // <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS + 537709878U, // <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS + 2732969337U, // <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2> + 2660635824U, // <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4> + 537710121U, // <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS + 2689838664U, // <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2> + 2732969615U, // <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1> + 2732969707U, // <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3> + 3763580721U, // <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1> + 2689839028U, // <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6> + 1659228164U, // <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1659228258U, // <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0> + 3364259770U, // <2,3,5,7>: Cost 4 
vmrglw <1,4,2,5>, <2,6,3,7> + 1659228420U, // <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0> + 2230798486U, // <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2> + 2732970407U, // <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1> + 1659228666U, // <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3> + 2230798748U, // <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3> + 2230798850U, // <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6> + 2732970731U, // <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1> + 1659228984U, // <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659229006U, // <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1659229087U, // <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1> + 1659229178U, // <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2> + 2726999125U, // <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3> + 2727662758U, // <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3> + 2732971235U, // <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1> + 1659229542U, // <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6> + 2732971446U, // <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5> + 2732971484U, // <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7> + 1659229804U, // <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7> + 1659229826U, // <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2> + 1208837014U, // <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 537712430U, // <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS + 1616099205U, // <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0> + 1208837746U, // <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3> + 1208837018U, // <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 537712794U, // <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS + 1616099536U, // <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7> + 1208838074U, // <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 537712997U, // <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS + 3771547648U, // <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0> + 2697805926U, // <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS + 3770884269U, // <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2> + 3806716164U, // <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u> + 3771547986U, // <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, 
<0,4,1,5> + 2226761014U, // <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS + 3853462427U, // <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1> + 3867102116U, // <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1> + 2226761257U, // <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS + 3849186231U, // <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2> + 3301207010U, // <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0> + 3766240150U, // <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0> + 3766240226U, // <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4> + 3301207248U, // <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4> + 2227432758U, // <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS + 3758941400U, // <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7> + 3768894758U, // <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4> + 2227433001U, // <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS + 2228030354U, // <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1> + 3770885657U, // <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4> + 2697807466U, // <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4> + 3368880468U, // <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3> + 2228030672U, // <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4> + 1154288950U, // <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS + 3771549617U, // <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7> + 3368880796U, // <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7> + 1154289193U, // <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS + 2636808294U, // <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS + 2287181861U, // <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1> + 2228866102U, // <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3> + 2636810580U, // <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3> + 1256574160U, // <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1213441742U, // <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 2228866430U, // <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7> + 2660701368U, // <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3> + 1213441745U, // <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 3704586342U, // <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, 
LHS + 3782831051U, // <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4> + 3704587900U, // <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4> + 3368896123U, // <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3> + 2793360592U, // <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4> + 2697809206U, // <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS + 3303198078U, // <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7> + 3867102444U, // <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5> + 2697809449U, // <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS + 2630852710U, // <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS + 2624881572U, // <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5> + 2630854269U, // <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5> + 2666686677U, // <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2> + 2630855990U, // <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS + 2230127926U, // <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS + 1691159862U, // <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 3867102520U, // <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0> + 1691159880U, // <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2230799250U, // <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1> + 3304541130U, // <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3> + 2230799417U, // <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6> + 3304541323U, // <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7> + 2230799568U, // <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4> + 1157057846U, // <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS + 3304541566U, // <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7> + 3798758243U, // <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4> + 1157058089U, // <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS + 3806721018U, // <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2> + 3853831590U, // <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2> + 3801412775U, // <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4> + 3802076408U, // <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4> + 3401436368U, // <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4> + 2793360840U, // <2,4,7,5>: Cost 3 
vsldoi12 <7,0,1,2>, <4,7,5,0> + 3804067307U, // <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4> + 3867102682U, // <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0> + 2793360867U, // <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0> + 2630877286U, // <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS + 2282580144U, // <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1> + 2630878848U, // <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u> + 2636851545U, // <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u> + 1256615120U, // <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1208837838U, // <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 1691160105U, // <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2660742333U, // <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u> + 1208837841U, // <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 3766910976U, // <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0> + 2693169254U, // <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS + 3760939181U, // <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2> + 3843214936U, // <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0> + 3760939355U, // <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5> + 3867102827U, // <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1> + 3867102836U, // <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1> + 3867102844U, // <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0> + 2693169821U, // <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS + 3766911724U, // <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1> + 3766911796U, // <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1> + 2693170070U, // <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0> + 3384798262U, // <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3> + 2693170228U, // <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5> + 3301208068U, // <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5> + 3366879607U, // <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6> + 3867102925U, // <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0> + 2695824760U, // <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5> + 2642845798U, // <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS + 
2295139218U, // <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1> + 2699142760U, // <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2> + 3766912678U, // <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1> + 2699142925U, // <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5> + 2228031492U, // <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5> + 2295138818U, // <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6> + 3368879347U, // <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7> + 2295138820U, // <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u> + 2287184866U, // <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1256573842U, // <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2642855630U, // <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5> + 2287182763U, // <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287184870U, // <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1256574170U, // <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1213442562U, // <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287183091U, // <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1213442564U, // <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 3716604006U, // <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS + 3716604822U, // <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0> + 3766914099U, // <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0> + 3368895403U, // <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3> + 3716607031U, // <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4> + 2693172534U, // <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS + 3363588610U, // <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6> + 3368895731U, // <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7> + 2693172777U, // <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS + 3704668262U, // <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS + 3704669078U, // <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0> + 3704669830U, // <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5> + 3364259460U, // <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3> + 3704671542U, // <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS + 2793361412U, // <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5> + 3364258167U, 
// <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6> + 3867103249U, // <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0> + 2793361412U, // <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5> + 2642878566U, // <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS + 3386166810U, // <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1> + 2723033594U, // <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3> + 3848523842U, // <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4> + 2723033713U, // <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5> + 2230800388U, // <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5> + 2230800482U, // <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0> + 2785841252U, // <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2> + 2785914989U, // <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2> + 3796775930U, // <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2> + 3800757335U, // <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5> + 3853463689U, // <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3> + 3796776218U, // <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2> + 3796776294U, // <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6> + 3803411867U, // <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5> + 3371575081U, // <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6> + 3796776516U, // <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3> + 3371575083U, // <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u> + 2287225826U, // <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1256614802U, // <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2642896590U, // <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5> + 2287223723U, // <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287225830U, // <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1256615130U, // <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1208838658U, // <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287224051U, // <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1208838660U, // <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 3772227584U, // <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0> + 2698485862U, // <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 
3759620282U, // <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6> + 3710675299U, // <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0> + 3767583058U, // <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5> + 3378153265U, // <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5> + 3865186637U, // <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1> + 2330291510U, // <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS + 2698486429U, // <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 3734569062U, // <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS + 3764929346U, // <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6> + 3772228502U, // <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0> + 3734571158U, // <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2> + 3734572342U, // <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS + 3767583878U, // <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6> + 3768247511U, // <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6> + 2293140790U, // <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS + 2293140791U, // <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS + 3704717414U, // <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS + 3395424589U, // <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1> + 2228031993U, // <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2> + 2698487485U, // <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6> + 3704720694U, // <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS + 3773556575U, // <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6> + 2698487738U, // <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7> + 1221397814U, // <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS + 1221397815U, // <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS + 2636955750U, // <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS + 2330314217U, // <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1> + 2636957626U, // <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7> + 2287184230U, // <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 2636959030U, // <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS + 2648903448U, // <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3> + 1256575800U, // <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135056694U, // 
<2,6,3,7>: Cost 1 vmrglw LHS, RHS + 135056695U, // <2,6,3,u>: Cost 1 vmrglw LHS, RHS + 3710705766U, // <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS + 3698762677U, // <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4> + 3710707389U, // <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6> + 3710708071U, // <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4> + 3710709046U, // <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS + 2698489142U, // <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS + 3796782457U, // <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2> + 2295156022U, // <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS + 2295156023U, // <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS + 3303870753U, // <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2> + 3788820134U, // <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6> + 3779530520U, // <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3> + 3303871026U, // <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5> + 3303871117U, // <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6> + 3791474666U, // <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6> + 3792138299U, // <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6> + 2290519350U, // <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS + 2290519351U, // <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS + 2631008358U, // <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS + 3372893673U, // <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1> + 2791445264U, // <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2> + 2230800968U, // <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0> + 2631011638U, // <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS + 3372894001U, // <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5> + 2793362232U, // <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6> + 2295835958U, // <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS + 2295835959U, // <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS + 2793362254U, // <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1> + 2792035160U, // <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2> + 2792108897U, // <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2> + 2769474408U, // <2,6,7,3>: Cost 3 
vsldoi12 <3,0,1,2>, <6,7,3,0> + 2793362294U, // <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5> + 3371575089U, // <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5> + 2792403845U, // <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2> + 2297834806U, // <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS + 2297834807U, // <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS + 2636996710U, // <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS + 2698491694U, // <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 2636998631U, // <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7> + 2282580326U, // <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 2636999990U, // <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS + 2698492058U, // <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS + 1256616760U, // <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135097654U, // <2,6,u,7>: Cost 1 vmrglw LHS, RHS + 135097655U, // <2,6,u,u>: Cost 1 vmrglw LHS, RHS + 2666864742U, // <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS + 1719620602U, // <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2> + 3768254637U, // <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2> + 3393417722U, // <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3> + 2666868022U, // <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS + 3867104290U, // <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6> + 3728667127U, // <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0> + 2666869817U, // <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2> + 1720136761U, // <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2> + 3728670822U, // <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS + 3774227252U, // <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1> + 3774227350U, // <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0> + 2323001850U, // <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3> + 3728674102U, // <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS + 3774227567U, // <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1> + 2694513880U, // <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7> + 3396744002U, // <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7> + 2323001850U, // <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, 
<6,2,7,3> + 2654937190U, // <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS + 3728679732U, // <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1> + 2700486248U, // <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2> + 2321682938U, // <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3> + 2654940470U, // <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS + 3859584196U, // <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6> + 2700486577U, // <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7> + 2228033132U, // <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7> + 2701813843U, // <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7> + 1581203558U, // <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS + 2654946100U, // <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1> + 2637031354U, // <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7> + 1256575482U, // <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3> + 1581206838U, // <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS + 2654949380U, // <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5> + 1581208058U, // <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3> + 1256575810U, // <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1581209390U, // <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS + 3728695398U, // <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS + 3869758782U, // <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2> + 3728696936U, // <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2> + 3393450490U, // <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3> + 3728698678U, // <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS + 2700487990U, // <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 3728699899U, // <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4> + 3867104626U, // <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0> + 2700488233U, // <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 3855160709U, // <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1> + 3728704406U, // <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0> + 3370233956U, // <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2> + 2320380410U, // <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3> + 3728706870U, // <2,7,5,4>: Cost 4 vsldoi4 
<6,2,7,5>, RHS + 3867104694U, // <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5> + 3792146492U, // <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7> + 3394122562U, // <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7> + 2320380410U, // <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3> + 2230801402U, // <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2> + 3768258984U, // <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2> + 2730349050U, // <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3> + 3372894575U, // <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3> + 2230801766U, // <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6> + 3304543670U, // <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5> + 3728716285U, // <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6> + 2230802028U, // <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7> + 2730349050U, // <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3> + 2793362983U, // <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1> + 3728721112U, // <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7> + 3371574933U, // <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2> + 2327695866U, // <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3> + 3728723254U, // <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS + 3371574855U, // <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5> + 2730350062U, // <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7> + 2793363052U, // <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7> + 2798671471U, // <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1> + 1581244518U, // <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS + 1724929666U, // <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2> + 2637072314U, // <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7> + 1256616442U, // <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3> + 1581247798U, // <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS + 2700490906U, // <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 1581249023U, // <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u> + 1256616770U, // <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1581250350U, // <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS + 1611489280U, // 
<2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 537747563U, // <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685231277U, // <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2685231356U, // <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611489618U, // <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2226763930U, // <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS + 2733007350U, // <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7> + 2660971737U, // <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0> + 537748125U, // <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS + 2689876708U, // <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2> + 1611490100U, // <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611490198U, // <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 2293137564U, // <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS + 2689877072U, // <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6> + 2689877103U, // <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2689877199U, // <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 2293140808U, // <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS + 1616135548U, // <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 1556938854U, // <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 1154291502U, // <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS + 336380006U, // <2,u,2,2>: Cost 1 vspltisw2 LHS + 1611490982U, // <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 1556942134U, // <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 1154291866U, // <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS + 1611491258U, // <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 1221397832U, // <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS + 336380006U, // <2,u,2,u>: Cost 1 vspltisw2 LHS + 1611491478U, // <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2> + 1213440073U, // <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1> + 1213442261U, // <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2> + 135053468U, // <2,u,3,3>: Cost 1 vmrglw LHS, LHS + 1611491842U, // <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6> + 1213440401U, // <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5> + 1213442589U, // <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135056712U, // <2,u,3,7>: Cost 1 vmrglw LHS, RHS + 
135053473U, // <2,u,3,u>: Cost 1 vmrglw LHS, LHS + 1551425638U, // <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS + 1551426503U, // <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4> + 2625169000U, // <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2> + 2625169558U, // <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2> + 1551428918U, // <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS + 537750838U, // <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS + 2733010297U, // <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2> + 2295156040U, // <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS + 537751081U, // <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS + 2689879624U, // <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2> + 2230130478U, // <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS + 2631149217U, // <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5> + 2290516124U, // <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS + 2689879988U, // <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6> + 1659269124U, // <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1691162778U, // <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2290519368U, // <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS + 1691162796U, // <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2230802131U, // <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2> + 1157060398U, // <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS + 1659269626U, // <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3> + 2764904656U, // <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7> + 2230802495U, // <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6> + 1157060762U, // <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS + 1659269944U, // <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659269966U, // <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1157060965U, // <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS + 1659270138U, // <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2> + 2727040090U, // <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u> + 2727703723U, // <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u> + 2297831580U, // <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS + 1659270502U, // <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6> + 2733012406U, // <2,u,7,5>: 
Cost 3 vsldoi8 LHS, <7,5,5,5> + 2730358255U, // <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u> + 1659270764U, // <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7> + 1659270786U, // <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2> + 1213481923U, // <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0> + 537753390U, // <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS + 336380006U, // <2,u,u,2>: Cost 1 vspltisw2 LHS + 135094428U, // <2,u,u,3>: Cost 1 vmrglw LHS, LHS + 1213481927U, // <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4> + 537753754U, // <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS + 1208838685U, // <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135097672U, // <2,u,u,7>: Cost 1 vmrglw LHS, RHS + 135094433U, // <2,u,u,u>: Cost 1 vmrglw LHS, LHS + 1678557184U, // <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0> + 1678557194U, // <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1> + 2631181989U, // <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0> + 2289223984U, // <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3> + 2756943909U, // <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1> + 3362965729U, // <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5> + 3362966054U, // <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6> + 2289224312U, // <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7> + 1683202121U, // <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1> + 1557446758U, // <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS + 2752741467U, // <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1> + 604815462U, // <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS + 2631190676U, // <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0> + 1557450038U, // <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS + 2667024388U, // <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5> + 2800074894U, // <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7> + 2661053667U, // <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1> + 604815516U, // <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696521165U, // <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0> + 2752741549U, // <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2> + 2691876456U, // <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2> + 2691876518U, 
// <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1> + 3830685895U, // <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1> + 3765618536U, // <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6> + 2691876794U, // <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7> + 2701166596U, // <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0> + 2756944108U, // <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2> + 2691877014U, // <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2> + 1161003110U, // <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS + 2691877168U, // <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3> + 2691877246U, // <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0> + 2691877378U, // <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6> + 3765619238U, // <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6> + 2691877496U, // <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7> + 3368962680U, // <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7> + 1161003677U, // <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS + 2289254400U, // <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0> + 1678557522U, // <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5> + 2631214761U, // <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4> + 2235580672U, // <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4> + 2756944237U, // <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5> + 1618136374U, // <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS + 3309322742U, // <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7> + 3362998904U, // <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7> + 1683202449U, // <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5> + 3765620296U, // <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2> + 2752299427U, // <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5> + 3789508346U, // <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0> + 3403486842U, // <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3> + 3765620660U, // <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6> + 2733682692U, // <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5> + 2800075218U, // <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7> + 3873817044U, // <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0> 
+ 2800075234U, // <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5> + 2752299501U, // <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7> + 2236547174U, // <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS + 2733683194U, // <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3> + 3844473352U, // <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7> + 3310289234U, // <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5> + 3873817114U, // <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7> + 2733683512U, // <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6> + 2725057384U, // <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0> + 2236547741U, // <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS + 2297905152U, // <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0> + 2297906854U, // <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1> + 2727711916U, // <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0> + 3371649328U, // <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3> + 2733684070U, // <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6> + 3734843490U, // <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0> + 3798799895U, // <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3> + 2733684332U, // <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7> + 2297906861U, // <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u> + 1557504102U, // <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS + 1678557842U, // <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1> + 604816029U, // <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS + 2691880892U, // <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1> + 1557507382U, // <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS + 1618139290U, // <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS + 2691881168U, // <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7> + 2661111018U, // <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u> + 604816083U, // <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS + 2619310332U, // <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0> + 2756944612U, // <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2> + 2289221724U, // <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2> + 2619312278U, // <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2> + 
2619313462U, // <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS + 2289221970U, // <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5> + 2232599768U, // <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7> + 3362964687U, // <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7> + 2619316014U, // <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS + 2756944683U, // <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1> + 1678558004U, // <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2691883927U, // <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1> + 3826631496U, // <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3> + 2756944723U, // <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5> + 2756944732U, // <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5> + 3830686561U, // <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1> + 3734869228U, // <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1> + 1678558004U, // <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2696529358U, // <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1> + 2756944775U, // <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3> + 2294548630U, // <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2> + 1678558102U, // <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0> + 2631273782U, // <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS + 2756944811U, // <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3> + 3830686644U, // <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3> + 2800075706U, // <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0> + 1679000515U, // <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0> + 2619334911U, // <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3> + 2295218186U, // <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1> + 2293229718U, // <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2> + 2619337116U, // <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3> + 2619338038U, // <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS + 2295218514U, // <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5> + 3830686729U, // <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7> + 3368961231U, // <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7> + 2619340590U, // <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS + 2619343104U, // 
<3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4> + 2289254410U, // <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1> + 2289256598U, // <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2> + 2619345410U, // <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6> + 2619346230U, // <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS + 2756944976U, // <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6> + 3362996401U, // <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6> + 3362997455U, // <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7> + 2619348782U, // <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS + 2756945007U, // <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1> + 3830686840U, // <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1> + 3358361750U, // <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2> + 3830686857U, // <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0> + 2756945047U, // <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5> + 2294571346U, // <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5> + 3806105698U, // <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0> + 3873817774U, // <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1> + 2756945079U, // <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1> + 3830686912U, // <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1> + 2756945103U, // <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7> + 2236547990U, // <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0> + 3826631905U, // <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7> + 3830686952U, // <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5> + 2756945139U, // <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7> + 3830686972U, // <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7> + 2800076030U, // <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0> + 2756945166U, // <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7> + 3699081318U, // <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS + 2297905162U, // <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1> + 2297907350U, // <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2> + 3365675182U, // <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3> + 3699084598U, // <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS + 2297905490U, // <3,1,7,5>: Cost 3 
vmrglw <2,6,3,7>, <0,4,1,5> + 2297905329U, // <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6> + 3368330447U, // <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7> + 2297905169U, // <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u> + 2619375876U, // <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u> + 1678558004U, // <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2289289366U, // <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2> + 1679000956U, // <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0> + 2619378998U, // <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS + 2756945297U, // <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3> + 2297905329U, // <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6> + 2800076192U, // <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0> + 1683203497U, // <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0> + 3362964203U, // <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0> + 2289222380U, // <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1> + 2289222462U, // <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2> + 1215479910U, // <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS + 3362964207U, // <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4> + 2289222708U, // <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5> + 2232600506U, // <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7> + 3396142296U, // <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7> + 1215479915U, // <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS + 3699105894U, // <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS + 3765633844U, // <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1> + 2691892120U, // <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2> + 2752300575U, // <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1> + 3699109174U, // <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS + 3830687280U, // <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0> + 3830687289U, // <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0> + 3874260548U, // <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2> + 2752742988U, // <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1> + 2631344230U, // <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS + 2697201184U, // <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, 
<2,1,3,2> + 1678558824U, // <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2> + 1678558834U, // <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3> + 2631347510U, // <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS + 3368953613U, // <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5> + 2234304442U, // <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7> + 3368953777U, // <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7> + 1679001247U, // <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3> + 1678558886U, // <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1> + 2752300719U, // <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1> + 2752300729U, // <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2> + 1221476454U, // <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS + 1678558926U, // <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5> + 2800076503U, // <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5> + 2234746810U, // <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7> + 2800076516U, // <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0> + 1678558958U, // <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1> + 3699130470U, // <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS + 3362996972U, // <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1> + 2289256040U, // <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2> + 1215512678U, // <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS + 3362998676U, // <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4> + 2691894582U, // <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS + 2235582394U, // <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7> + 3734967544U, // <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4> + 1215512683U, // <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS + 3705110630U, // <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS + 3368313985U, // <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1> + 3368314472U, // <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2> + 2756945768U, // <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6> + 3705113910U, // <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS + 3310061416U, // <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6> + 3310135226U, // <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7> + 3370305457U, // 
<3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7> + 2752743317U, // <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6> + 2631376998U, // <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS + 3705119540U, // <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1> + 2631378621U, // <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6> + 1678559162U, // <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7> + 2631380278U, // <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS + 3370976956U, // <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5> + 2237065146U, // <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7> + 3798815594U, // <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2> + 1679001575U, // <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7> + 2800076778U, // <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1> + 3371647724U, // <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1> + 2297906792U, // <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2> + 1224163430U, // <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS + 3705130294U, // <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS + 3371648052U, // <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5> + 2297906877U, // <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6> + 3371648702U, // <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7> + 1224163435U, // <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS + 1679001659U, // <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1> + 2752743492U, // <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1> + 1678558824U, // <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2> + 1678559320U, // <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3> + 1679001699U, // <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5> + 2691897498U, // <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS + 2237908922U, // <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7> + 2800519289U, // <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0> + 1679001731U, // <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1> + 1215480726U, // <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0> + 1678559382U, // <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2> + 2631403200U, // <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0> + 2289223282U, // <3,3,0,3>: Cost 3 vmrglw 
<1,2,3,0>, <2,2,3,3> + 2752301232U, // <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1> + 3362965027U, // <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5> + 3362965352U, // <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6> + 2289223610U, // <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7> + 1678559445U, // <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2> + 3830687964U, // <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0> + 2752301286U, // <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1> + 2752301297U, // <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3> + 2305157532U, // <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3> + 3830688000U, // <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0> + 3830688009U, // <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0> + 3830688019U, // <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1> + 3362973626U, // <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7> + 2752743719U, // <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3> + 2631417958U, // <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS + 3826043193U, // <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3> + 1624131186U, // <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3> + 2752301384U, // <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0> + 2631421238U, // <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS + 3826485602U, // <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u> + 2752301414U, // <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3> + 2771249519U, // <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3> + 1628112984U, // <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3> + 1563656294U, // <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS + 2301855911U, // <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1> + 2697873730U, // <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3> + 403488870U, // <3,3,3,3>: Cost 1 vspltisw3 LHS + 1563659574U, // <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS + 2301856239U, // <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5> + 2697874067U, // <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7> + 2295220154U, // <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7> + 403488870U, // <3,3,3,u>: Cost 1 vspltisw3 LHS + 2289255318U, // 
<3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0> + 2631435162U, // <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4> + 2631435972U, // <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4> + 2289256050U, // <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3> + 1215513498U, // <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4> + 1679002114U, // <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6> + 3362998120U, // <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6> + 2289256378U, // <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7> + 1679002141U, // <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6> + 3831130657U, // <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1> + 3376277671U, // <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1> + 3771617012U, // <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3> + 2302536092U, // <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3> + 3831130697U, // <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5> + 2294572579U, // <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5> + 2800519773U, // <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7> + 3368314810U, // <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7> + 2800519791U, // <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7> + 2800077432U, // <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7> + 3310291185U, // <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3> + 2789165706U, // <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7> + 2764982931U, // <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7> + 2800077468U, // <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7> + 3873819301U, // <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7> + 2297235304U, // <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6> + 2725081963U, // <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3> + 2725745596U, // <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3> + 2631458918U, // <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS + 3705201460U, // <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1> + 2631460551U, // <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7> + 2297906802U, // <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3> + 2631462198U, // <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS + 
3371648547U, // <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5> + 3371648548U, // <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6> + 1224165306U, // <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 1224165306U, // <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 1215480726U, // <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0> + 1679002398U, // <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2> + 1659967368U, // <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3> + 403488870U, // <3,3,u,3>: Cost 1 vspltisw3 LHS + 1563659574U, // <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS + 1679002438U, // <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6> + 2756946764U, // <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3> + 1224165306U, // <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 403488870U, // <3,3,u,u>: Cost 1 vspltisw3 LHS + 2691907584U, // <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0> + 1618165862U, // <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 2631476937U, // <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0> + 2232601732U, // <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0> + 2691907922U, // <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5> + 1158860086U, // <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS + 3306343806U, // <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7> + 3366947484U, // <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7> + 1618166429U, // <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 2631483494U, // <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS + 2691908404U, // <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1> + 1618166682U, // <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4> + 3765650393U, // <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4> + 2631486774U, // <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS + 2756946914U, // <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0> + 3765650639U, // <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7> + 3735090439U, // <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1> + 1622148480U, // <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4> + 3765650893U, // <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0> + 3831131154U, // 
<3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3> + 2691909224U, // <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2> + 2691909286U, // <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1> + 2699208469U, // <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4> + 2233863478U, // <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS + 2691909562U, // <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7> + 2701199368U, // <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4> + 2691909691U, // <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1> + 2691909782U, // <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2> + 3765651686U, // <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1> + 2691909972U, // <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3> + 2691910044U, // <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1> + 1161006390U, // <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS + 2691910300U, // <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7> + 3368962716U, // <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7> + 1161006633U, // <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS + 2631508070U, // <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS + 2631508890U, // <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4> + 2631509709U, // <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4> + 2289256788U, // <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3> + 1726336208U, // <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4> + 1618169142U, // <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 3362998858U, // <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6> + 2289257116U, // <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7> + 1618169385U, // <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 1557774438U, // <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS + 2631516980U, // <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1> + 1557776078U, // <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5> + 2631518358U, // <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2> + 1557777718U, // <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS + 2296563406U, // <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5> + 
604818742U, // <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS + 2661381387U, // <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5> + 604818760U, // <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS + 3705266278U, // <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS + 3831131482U, // <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7> + 2733715962U, // <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3> + 3844771180U, // <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7> + 2800078197U, // <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7> + 2236550454U, // <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS + 2733716280U, // <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6> + 2725090156U, // <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4> + 2236550697U, // <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS + 2733716474U, // <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2> + 3371647013U, // <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1> + 2727744688U, // <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4> + 3371649364U, // <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3> + 2733716838U, // <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6> + 2297906894U, // <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5> + 3371647180U, // <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6> + 2733717100U, // <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7> + 2297906897U, // <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u> + 1557799014U, // <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS + 1618171694U, // <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 1557800657U, // <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u> + 2691913660U, // <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1> + 1557802294U, // <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS + 1618172058U, // <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 604818985U, // <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS + 2661405966U, // <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u> + 604819003U, // <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS + 2643492966U, // <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS + 2756947528U, // <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2> + 2331029019U, // <3,5,0,2>: Cost 3 
vmrglw <u,2,3,0>, <4,u,5,2> + 2643495062U, // <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2> + 2756947554U, // <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1> + 2800078443U, // <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1> + 2289224194U, // <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6> + 3362964723U, // <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7> + 2756947590U, // <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1> + 2800078479U, // <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1> + 2333027218U, // <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1> + 2691916699U, // <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5> + 3832901294U, // <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5> + 2800078519U, // <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5> + 3830689467U, // <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0> + 3830689481U, // <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5> + 3873820365U, // <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0> + 2800078551U, // <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1> + 3770967487U, // <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4> + 2697225763U, // <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5> + 3830689523U, // <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2> + 2699216590U, // <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5> + 2699216662U, // <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5> + 2783047439U, // <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3> + 2783121176U, // <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3> + 3856936737U, // <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3> + 2701871194U, // <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5> + 2643517542U, // <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS + 2331052946U, // <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1> + 3699345010U, // <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3> + 2705189276U, // <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3> + 2705189359U, // <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5> + 2331053274U, // <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5> + 2295220738U, // <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6> + 3368961267U, 
// <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7> + 2295220740U, // <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u> + 2643525734U, // <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS + 2331061138U, // <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1> + 2235584280U, // <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3> + 2643528194U, // <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6> + 2735713498U, // <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5> + 2756947892U, // <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6> + 2289256962U, // <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6> + 3362997491U, // <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7> + 2756947919U, // <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6> + 2800078803U, // <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1> + 2800078812U, // <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1> + 2631591639U, // <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5> + 3832901616U, // <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3> + 2800078843U, // <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5> + 1726337028U, // <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2800078862U, // <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6> + 3368314099U, // <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7> + 1726337028U, // <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2800078884U, // <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1> + 2800078899U, // <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7> + 2631599832U, // <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6> + 2800078914U, // <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4> + 2800078924U, // <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5> + 2800078935U, // <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7> + 2297235970U, // <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6> + 1726337122U, // <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0> + 1726337131U, // <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0> + 3699376230U, // <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS + 2333739922U, // <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1> + 3699378106U, // <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7> + 3371647915U, // <3,5,7,3>: 
Cost 4 vmrglw <2,6,3,7>, <1,2,5,3> + 3699379510U, // <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS + 2333740250U, // <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5> + 2297907714U, // <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6> + 3370984691U, // <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7> + 2297907716U, // <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u> + 2800079046U, // <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1> + 2756948176U, // <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2> + 2331029019U, // <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2> + 2800079076U, // <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4> + 2800079085U, // <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4> + 1726337028U, // <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2289289730U, // <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6> + 1726337284U, // <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0> + 1726337293U, // <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0> + 3773628416U, // <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0> + 2699886694U, // <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS + 2789167401U, // <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1> + 3362965862U, // <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3> + 3773628754U, // <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5> + 3723284326U, // <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0> + 2800079181U, // <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1> + 1215483190U, // <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS + 1215483191U, // <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS + 3873821032U, // <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1> + 3773629236U, // <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1> + 2691924892U, // <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6> + 3830690184U, // <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6> + 3873821072U, // <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5> + 3873821082U, // <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6> + 3403453240U, // <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6> + 2289233206U, // <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS + 2289233207U, // <3,6,1,u>: Cost 3 vmrglw 
<1,2,3,1>, RHS + 2661498982U, // <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS + 3770975780U, // <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6> + 2631640797U, // <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2> + 3771639485U, // <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6> + 2661502262U, // <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS + 2699888488U, // <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6> + 2661503482U, // <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3> + 1715425786U, // <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3> + 1715499523U, // <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3> + 3773630614U, // <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2> + 3372942825U, // <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1> + 2234749434U, // <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3> + 3368962406U, // <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3> + 2699889154U, // <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6> + 3773631068U, // <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6> + 2331054904U, // <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6> + 1221479734U, // <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS + 1221479735U, // <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS + 2235584801U, // <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2> + 3717342106U, // <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4> + 2789167729U, // <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5> + 2235585074U, // <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5> + 2235585165U, // <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6> + 2699889974U, // <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS + 2800079509U, // <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5> + 1215515958U, // <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS + 1215515959U, // <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS + 3873821356U, // <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1> + 3372959209U, // <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1> + 3862909629U, // <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0> + 3773632358U, // <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0> + 3873821396U, // <3,6,5,4>: 
Cost 4 vsldoi12 LHS, <6,5,4,5> + 3873821405U, // <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5> + 3862909672U, // <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7> + 2294574390U, // <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS + 2294574391U, // <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS + 2800079613U, // <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1> + 3873821446U, // <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1> + 2789167888U, // <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2> + 3844920090U, // <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3> + 2800079653U, // <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5> + 3723333484U, // <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6> + 1726337848U, // <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1726337858U, // <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7> + 1726337867U, // <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7> + 1726337870U, // <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1> + 2297906665U, // <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1> + 2792117090U, // <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3> + 2297907558U, // <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3> + 1726337910U, // <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5> + 2297906993U, // <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5> + 2297906832U, // <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6> + 1224166710U, // <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS + 1224166711U, // <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS + 1726337951U, // <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1> + 2699892526U, // <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS + 2789168049U, // <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1> + 2792854460U, // <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3> + 1726337991U, // <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5> + 2699892890U, // <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS + 1726337848U, // <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1215548726U, // <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS + 1215548727U, // <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS + 2700558336U, // <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0> 
+ 1626816614U, // <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 2700558513U, // <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6> + 2331030010U, // <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3> + 2700558674U, // <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5> + 2800079906U, // <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6> + 2655588936U, // <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0> + 2800079919U, // <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1> + 1626817181U, // <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 3774300899U, // <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1> + 2700559156U, // <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1> + 2700559254U, // <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0> + 3774301148U, // <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7> + 3774301227U, // <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5> + 3774301295U, // <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1> + 3768329441U, // <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7> + 3403453250U, // <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7> + 2700559740U, // <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0> + 2700559849U, // <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1> + 3770983973U, // <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7> + 2700559976U, // <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2> + 2698569415U, // <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7> + 2700560177U, // <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5> + 3773638505U, // <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7> + 1626818490U, // <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7> + 2795140307U, // <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3> + 1628145756U, // <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7> + 2700560534U, // <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2> + 3774302438U, // <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1> + 2700560742U, // <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3> + 2700560796U, // <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3> + 2700560898U, // <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6> + 
3774302821U, // <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6> + 2700561079U, // <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7> + 2700561091U, // <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1> + 2700561182U, // <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2> + 2655617126U, // <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS + 3774303178U, // <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3> + 2655619002U, // <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7> + 2331062778U, // <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3> + 2655620406U, // <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS + 1626819894U, // <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 2655621708U, // <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4> + 2800080247U, // <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5> + 1626820137U, // <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 3774303816U, // <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2> + 3873822093U, // <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0> + 3774303998U, // <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4> + 3862910368U, // <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1> + 3774304180U, // <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6> + 2800080310U, // <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5> + 2800080321U, // <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7> + 3873822147U, // <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0> + 2800080339U, // <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7> + 2800080348U, // <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7> + 3873822181U, // <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7> + 2789168622U, // <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7> + 2700563016U, // <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0> + 2800080384U, // <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7> + 3862910472U, // <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6> + 2700563256U, // <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6> + 2800080404U, // <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0> + 2793149988U, // <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7> + 2637725798U, // <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, 
LHS + 3371649227U, // <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1> + 2637727674U, // <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7> + 2297907567U, // <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3> + 2637729078U, // <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS + 3371649312U, // <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5> + 2655646287U, // <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7> + 1726338668U, // <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 1726338668U, // <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 2700564179U, // <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2> + 1626822446U, // <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 2700564357U, // <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0> + 2700564412U, // <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1> + 2700564543U, // <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6> + 1626822810U, // <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 1662654672U, // <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7> + 1726338668U, // <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 1626823013U, // <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 1678557184U, // <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0> + 1679005395U, // <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2> + 2289221787U, // <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2> + 1215479964U, // <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS + 2752747245U, // <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1> + 1158863002U, // <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS + 2289224221U, // <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6> + 1215483208U, // <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS + 1679005458U, // <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2> + 1558036582U, // <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS + 1678558004U, // <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 604821294U, // <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS + 2752747317U, // <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1> + 1558039862U, // <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS + 2756949830U, // <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0> + 2800080726U, // <3,u,1,6>: Cost 3 
vsldoi12 LHS, <u,1,6,7> + 2289233224U, // <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS + 604821348U, // <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696586709U, // <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u> + 2757392246U, // <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3> + 1624172151U, // <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u> + 1679005576U, // <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3> + 2631789878U, // <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS + 2699904874U, // <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u> + 1626826683U, // <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u> + 1726338988U, // <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3> + 1683208117U, // <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3> + 1679005628U, // <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1> + 1161008942U, // <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS + 2752747471U, // <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2> + 403488870U, // <3,u,3,3>: Cost 1 vspltisw3 LHS + 1679005668U, // <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5> + 1161009306U, // <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS + 2691943104U, // <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7> + 1221479752U, // <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS + 403488870U, // <3,u,3,u>: Cost 1 vspltisw3 LHS + 2289255363U, // <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0> + 1161844526U, // <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS + 2289256661U, // <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2> + 1215512732U, // <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS + 1215513498U, // <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4> + 1679005759U, // <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6> + 2289256989U, // <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6> + 1215515976U, // <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS + 1679005786U, // <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6> + 1558069350U, // <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS + 2631811892U, // <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1> + 1558071026U, // <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5> + 2752747646U, // <3,u,5,3>: Cost 3 vsldoi12 LHS, 
<u,5,3,6> + 1558072630U, // <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS + 1726337028U, // <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 604821658U, // <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS + 2294574408U, // <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS + 604821676U, // <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS + 2631819366U, // <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS + 2757392574U, // <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7> + 2631821043U, // <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6> + 1679005904U, // <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7> + 2631822646U, // <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS + 2236553370U, // <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS + 1726337848U, // <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1726339309U, // <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0> + 1683208445U, // <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7> + 1726339328U, // <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1> + 2297905225U, // <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1> + 2631829236U, // <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7> + 1224163484U, // <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS + 1726339368U, // <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5> + 2297905553U, // <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5> + 2297905392U, // <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6> + 1224166728U, // <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS + 1224163489U, // <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS + 1683208529U, // <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1> + 1679006043U, // <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2> + 604821861U, // <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS + 403488870U, // <3,u,u,3>: Cost 1 vspltisw3 LHS + 1683208569U, // <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5> + 1679006083U, // <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6> + 604821901U, // <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS + 1215548744U, // <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS + 604821915U, // <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS + 2759016448U, // <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0> + 1165115494U, // <4,0,0,1>: Cost 2 vmrghw 
<4,0,5,1>, LHS + 3717531337U, // <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0> + 3369675785U, // <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3> + 2751791144U, // <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4> + 2238857630U, // <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0> + 3312591341U, // <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7> + 3369676113U, // <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7> + 1165116061U, // <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS + 2637824102U, // <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS + 2637824922U, // <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4> + 1685274726U, // <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2637826512U, // <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1> + 2637827382U, // <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS + 2661716070U, // <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4> + 3729486427U, // <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1> + 2661717300U, // <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1> + 1685274780U, // <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 3711574118U, // <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS + 2240200806U, // <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS + 3771663992U, // <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0> + 2698585801U, // <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0> + 3373672105U, // <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4> + 3810813795U, // <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1> + 3772327866U, // <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7> + 3386280568U, // <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7> + 2701903966U, // <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0> + 3699638374U, // <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS + 2753560832U, // <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4> + 3772328276U, // <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3> + 3827302674U, // <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4> + 3699641654U, // <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS + 3779627588U, // <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0> + 3772328604U, // 
<4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7> + 3780954854U, // <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0> + 2753560832U, // <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4> + 2725129106U, // <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1> + 1167720550U, // <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS + 3839172953U, // <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3> + 3772329051U, // <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4> + 2241462610U, // <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5> + 2698587446U, // <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS + 3772329297U, // <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7> + 3735483703U, // <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4> + 1167721117U, // <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS + 1168556032U, // <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 94814310U, // <4,0,5,1>: Cost 1 vmrghw RHS, LHS + 2242298029U, // <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2> + 2637859284U, // <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5> + 1168556370U, // <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2242306530U, // <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5> + 2242298358U, // <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7> + 2661750072U, // <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5> + 94814877U, // <4,0,5,u>: Cost 1 vmrghw RHS, LHS + 3316580362U, // <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1> + 2242846822U, // <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS + 3798872570U, // <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3> + 3796218413U, // <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0> + 3834528273U, // <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7> + 3798872811U, // <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1> + 3316621876U, // <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6> + 2725131121U, // <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0> + 2242847389U, // <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS + 3377692672U, // <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0> + 2243493990U, // <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS + 3775648970U, // <4,0,7,2>: Cost 5 vsldoi8 
<2,u,4,0>, <7,2,6,3> + 3802191110U, // <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0> + 3317236050U, // <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5> + 3803518376U, // <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0> + 3317236214U, // <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7> + 3798873708U, // <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7> + 2243494557U, // <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS + 1170546688U, // <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 96804966U, // <4,0,u,1>: Cost 1 vmrghw RHS, LHS + 1685275293U, // <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2637883863U, // <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u> + 1170547026U, // <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2698590362U, // <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS + 2244289014U, // <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7> + 2661774651U, // <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u> + 96805533U, // <4,0,u,u>: Cost 1 vmrghw RHS, LHS + 2667749478U, // <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS + 2689966182U, // <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS + 2238571418U, // <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4> + 3711633880U, // <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0> + 2689966418U, // <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5> + 3361046866U, // <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5> + 3741495802U, // <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3> + 3741496314U, // <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2> + 2689966765U, // <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1> + 3764372222U, // <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1> + 2758206263U, // <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4> + 2698593178U, // <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4> + 3361057810U, // <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3> + 3827303250U, // <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4> + 2287313234U, // <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5> + 3763709171U, // <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7> + 3361058138U, // <4,1,1,7>: Cost 4 vmrglw 
<0,u,4,1>, <4,6,1,7> + 2239759744U, // <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4> + 2637906022U, // <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS + 2637906842U, // <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4> + 3763709544U, // <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2> + 1685275546U, // <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4> + 2637909302U, // <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS + 3361063250U, // <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5> + 3763709882U, // <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7> + 3735541054U, // <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2> + 1685644231U, // <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4> + 2702575792U, // <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1> + 3832759257U, // <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4> + 3833349090U, // <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4> + 3763710364U, // <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3> + 2707884546U, // <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6> + 3361071442U, // <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5> + 3772336796U, // <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7> + 3775654595U, // <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1> + 2707884856U, // <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1> + 2667782246U, // <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS + 2241463092U, // <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1> + 2241553306U, // <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4> + 3827303484U, // <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4> + 2667785424U, // <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4> + 2689969462U, // <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS + 3763711322U, // <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7> + 3867116636U, // <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0> + 2689969705U, // <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS + 1546273106U, // <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5> + 1168556852U, // <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1168556950U, // <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 
2620016790U, // <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2> + 1546276150U, // <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS + 2620018692U, // <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5> + 2242299087U, // <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7> + 2667795450U, // <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2> + 1546278702U, // <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS + 3781628193U, // <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2> + 3832759503U, // <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7> + 3316261786U, // <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4> + 3781628466U, // <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5> + 3827303658U, // <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7> + 3361096018U, // <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5> + 3788264248U, // <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6> + 3788264270U, // <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1> + 3832759566U, // <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7> + 2726466580U, // <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1> + 3377692682U, // <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1> + 3377694870U, // <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2> + 3802199303U, // <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1> + 2731775334U, // <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6> + 3377693010U, // <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5> + 3365749804U, // <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6> + 3788265068U, // <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7> + 2731775644U, // <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1> + 1546297685U, // <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u> + 1170547508U, // <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1170547606U, // <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 1689257344U, // <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4> + 1546300726U, // <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS + 2284716370U, // <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5> + 2244289743U, // <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7> + 2667820026U, // <4,1,u,7>: Cost 3 
vsldoi4 <u,4,1,u>, <7,0,1,2> + 1546303278U, // <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS + 3729621094U, // <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS + 3763716198U, // <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS + 2238858856U, // <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2> + 2295930982U, // <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS + 3763716434U, // <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5> + 2238859107U, // <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1> + 2238859194U, // <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7> + 3312601066U, // <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1> + 2295930987U, // <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS + 3699769446U, // <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS + 3313255971U, // <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5> + 3361056360U, // <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2> + 2287312998U, // <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS + 3788932148U, // <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5> + 3313256290U, // <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0> + 3838289469U, // <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4> + 3369682865U, // <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7> + 2287313003U, // <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS + 3838658133U, // <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1> + 3711722394U, // <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4> + 2759018088U, // <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2> + 2759018098U, // <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3> + 3838658168U, // <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0> + 3369027341U, // <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5> + 2240227258U, // <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7> + 3735614791U, // <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2> + 2759018143U, // <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3> + 2759018150U, // <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1> + 3831948975U, // <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1> + 3832759993U, // <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2> + 
2759018180U, // <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4> + 2759018185U, // <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0> + 3839542998U, // <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4> + 3314640826U, // <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7> + 2765948648U, // <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4> + 2759018222U, // <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1> + 3838658295U, // <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1> + 3315205667U, // <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5> + 2241463912U, // <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2> + 1234829414U, // <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS + 2241464085U, // <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4> + 2241546087U, // <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5> + 2241464250U, // <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7> + 3741602873U, // <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2> + 1234829419U, // <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS + 2626060390U, // <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS + 2626061364U, // <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5> + 1168557672U, // <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1222230118U, // <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS + 2626063670U, // <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS + 2242299752U, // <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6> + 1168558010U, // <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2242299882U, // <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1> + 1222230123U, // <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS + 3711754342U, // <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS + 3711755162U, // <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4> + 3838658481U, // <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7> + 2759018426U, // <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7> + 3838658499U, // <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7> + 3735646310U, // <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4> + 3316590522U, // <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7> + 3798889331U, // <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, 
<6,7,4,2> + 2759018471U, // <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7> + 3874564074U, // <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1> + 3800880230U, // <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2> + 3371722344U, // <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2> + 2303950950U, // <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS + 3371722346U, // <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4> + 3371722509U, // <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5> + 3317237690U, // <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7> + 3317237738U, // <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1> + 2303950955U, // <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS + 2759018555U, // <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1> + 2626085943U, // <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u> + 1170548328U, // <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1222254694U, // <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS + 2759018595U, // <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5> + 2244290408U, // <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6> + 1170548666U, // <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2769266813U, // <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4> + 1222254699U, // <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS + 2238859414U, // <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2> + 2759018646U, // <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2> + 3312314708U, // <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3> + 2238859676U, // <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3> + 2295931802U, // <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4> + 3735670886U, // <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4> + 3312315036U, // <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7> + 3369674682U, // <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7> + 2759018709U, // <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2> + 3361055638U, // <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0> + 3831949542U, // <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1> + 2703917978U, // <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4> + 3361056370U, // 
<4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3> + 2295939994U, // <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4> + 3361056291U, // <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5> + 3378972520U, // <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6> + 3361056698U, // <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7> + 2703917978U, // <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4> + 3832760624U, // <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3> + 3711796122U, // <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4> + 3832760641U, // <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2> + 2770962764U, // <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4> + 2759018836U, // <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3> + 3827304802U, // <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u> + 3832760678U, // <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3> + 3859597679U, // <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3> + 2771331449U, // <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4> + 2240841878U, // <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2> + 3776997635U, // <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3> + 2703919444U, // <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3> + 2759018908U, // <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3> + 2759018918U, // <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4> + 3386951446U, // <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5> + 3777661596U, // <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7> + 3375007674U, // <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7> + 2707901242U, // <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3> + 2759018960U, // <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1> + 2759018970U, // <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2> + 2632099605U, // <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4> + 2241464732U, // <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3> + 2759019000U, // <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5> + 2753563138U, // <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6> + 3777662316U, // <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, 
<4,6,3,7> + 2308573114U, // <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7> + 2759019032U, // <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1> + 1168558230U, // <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2242300134U, // <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1> + 2632107798U, // <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5> + 1168558492U, // <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1168558594U, // <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 2295973654U, // <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5> + 2242300536U, // <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7> + 2295973818U, // <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7> + 1168558878U, // <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2> + 3832760952U, // <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7> + 3711828890U, // <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4> + 3316484436U, // <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3> + 3711830512U, // <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6> + 2759019164U, // <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7> + 3361097251U, // <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5> + 3316624045U, // <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6> + 2773912244U, // <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4> + 2759019164U, // <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7> + 3377693590U, // <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0> + 3365751680U, // <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1> + 2727810232U, // <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3> + 3377694322U, // <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3> + 2303951770U, // <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4> + 3741700198U, // <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4> + 3377695216U, // <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6> + 3375703994U, // <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7> + 2731792030U, // <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3> + 1170548886U, // <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2759019294U, // <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2> + 2632132377U, // <4,3,u,2>: 
Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u> + 1170549148U, // <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1170549250U, // <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 2759019334U, // <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6> + 2244291192U, // <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7> + 2295998394U, // <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7> + 1170549534U, // <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2> + 1165118354U, // <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1637482598U, // <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS + 3711854285U, // <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4> + 3827305344U, // <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1> + 2711224658U, // <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5> + 1165118774U, // <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS + 3312602489U, // <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2> + 3369675420U, // <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7> + 1165119017U, // <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS + 3369682633U, // <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0> + 2287313581U, // <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1> + 2759019466U, // <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3> + 3369683284U, // <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3> + 2311204048U, // <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4> + 2239319350U, // <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS + 3784967411U, // <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7> + 3369683612U, // <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7> + 2763000832U, // <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3> + 3711869030U, // <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS + 3711869850U, // <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4> + 2240203830U, // <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3> + 2698618573U, // <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4> + 2711226133U, // <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4> + 2240204086U, // <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS + 2711226298U, // <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7> + 3832761416U, // 
<4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3> + 2701936738U, // <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4> + 2711226518U, // <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2> + 3777005828U, // <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4> + 3832761453U, // <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4> + 2301266260U, // <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3> + 2705254903U, // <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4> + 2240843062U, // <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS + 3832761489U, // <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4> + 3375008412U, // <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7> + 2301266260U, // <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3> + 1570373734U, // <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS + 2308574089U, // <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1> + 2644117096U, // <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2> + 2638146039U, // <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4> + 229035318U, // <4,4,4,4>: Cost 1 vspltisw0 RHS + 1167723830U, // <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS + 2644120058U, // <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3> + 2662036827U, // <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4> + 229035318U, // <4,4,4,u>: Cost 1 vspltisw0 RHS + 1168558994U, // <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1> + 2638152602U, // <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4> + 2242300981U, // <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2> + 2638154232U, // <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5> + 1168559322U, // <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5> + 94817590U, // <4,4,5,5>: Cost 1 vmrghw RHS, RHS + 1685278006U, // <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 2242309576U, // <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0> + 94817833U, // <4,4,5,u>: Cost 1 vmrghw RHS, RHS + 3316591506U, // <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1> + 3758428587U, // <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5> + 2711228922U, // <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3> + 3796251185U, // <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, 
<6,3,4,4> + 2711229085U, // <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4> + 2242850102U, // <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS + 2242850169U, // <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2> + 2725163893U, // <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4> + 2242850345U, // <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS + 2711229434U, // <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2> + 3377694410U, // <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1> + 3868593584U, // <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3> + 3377695060U, // <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3> + 2729145691U, // <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4> + 2243497270U, // <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS + 3871542744U, // <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7> + 2303953564U, // <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7> + 2243497513U, // <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS + 1170549650U, // <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1> + 1637488430U, // <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS + 2244291637U, // <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2> + 2638178811U, // <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u> + 229035318U, // <4,4,u,4>: Cost 1 vspltisw0 RHS + 96808246U, // <4,4,u,5>: Cost 1 vmrghw RHS, RHS + 1685278249U, // <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 2244292040U, // <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0> + 96808489U, // <4,4,u,u>: Cost 1 vmrghw RHS, RHS + 2698625024U, // <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0> + 1624883302U, // <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 2638186190U, // <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5> + 2638187004U, // <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0> + 2687345005U, // <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5> + 2238861316U, // <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5> + 2662077302U, // <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5> + 2662077792U, // <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0> + 1624883869U, // <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 3361057762U, // <4,5,1,0>: 
Cost 4 vmrglw <0,u,4,1>, <4,1,5,0> + 2691326803U, // <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5> + 2698625942U, // <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0> + 3361055659U, // <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3> + 3761087567U, // <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5> + 2693981335U, // <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5> + 2305231362U, // <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6> + 3361055987U, // <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7> + 2695972234U, // <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5> + 2638200934U, // <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS + 3761088035U, // <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5> + 2697963133U, // <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5> + 1624884942U, // <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5> + 2698626838U, // <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5> + 3772368744U, // <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6> + 2698627002U, // <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7> + 3775023122U, // <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5> + 1628203107U, // <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5> + 2698627222U, // <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2> + 3765070057U, // <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4> + 2698627404U, // <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4> + 2698627484U, // <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3> + 2698627580U, // <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0> + 3779668553U, // <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5> + 2725169844U, // <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4> + 2707253995U, // <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5> + 2698627870U, // <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2> + 2638217318U, // <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS + 2308574098U, // <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1> + 2698628150U, // <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3> + 2638219776U, // <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4> + 2698628314U, // <4,5,4,4>: 
Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5> + 1624886582U, // <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 2698628478U, // <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7> + 2662110564U, // <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4> + 1624886825U, // <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 1570455654U, // <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS + 2312564250U, // <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1> + 2644199118U, // <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5> + 2295974966U, // <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3> + 1570458842U, // <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5> + 1168568324U, // <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5> + 1168568418U, // <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 2295975294U, // <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7> + 1168716036U, // <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0> + 1564491878U, // <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS + 2626290768U, // <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6> + 2632263465U, // <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6> + 1564494338U, // <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6> + 1564495158U, // <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS + 2638237464U, // <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3> + 2656154253U, // <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2725172218U, // <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2> + 3859599489U, // <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4> + 2698630320U, // <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4> + 2728490251U, // <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5> + 2725172576U, // <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0> + 3317239812U, // <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5> + 2725172760U, // <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4> + 2725172844U, // <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7> + 2725172866U, // <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2> + 1564508262U, // <4,5,u,0>: Cost 2 vsldoi4 
<3,4,5,u>, LHS + 1624889134U, // <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 2698631045U, // <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0> + 1564510724U, // <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u> + 1564511542U, // <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS + 1624889498U, // <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 1170550882U, // <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 3312595285U, // <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0> + 3763748966U, // <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS + 2238861818U, // <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3> + 3767730432U, // <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4> + 3763749202U, // <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5> + 2238862059U, // <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1> + 2238862136U, // <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6> + 2295934262U, // <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS + 2295934263U, // <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS + 3378973999U, // <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0> + 3378974648U, // <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1> + 3779675034U, // <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4> + 3378974002U, // <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3> + 3378974003U, // <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4> + 3767731352U, // <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6> + 3378974734U, // <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6> + 2287316278U, // <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS + 2287316279U, // <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS + 3735904358U, // <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS + 3763750435U, // <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5> + 3313938937U, // <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2> + 3772376782U, // <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5> + 3852890591U, // <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3> + 3735908454U, // <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4> + 3801573306U, // 
<4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7> + 2785858042U, // <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3> + 2785858051U, // <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3> + 3863065101U, // <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4> + 3314586024U, // <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2> + 3863212575U, // <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4> + 3863286312U, // <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4> + 3767732738U, // <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6> + 3779676746U, // <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6> + 3398898488U, // <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6> + 2301267254U, // <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS + 2301267255U, // <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS + 3852890715U, // <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1> + 3315208615U, // <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1> + 2241466874U, // <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3> + 3852890745U, // <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4> + 2241467037U, // <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4> + 2241549039U, // <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5> + 2241467192U, // <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6> + 1234832694U, // <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS + 1234832695U, // <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS + 2242302241U, // <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2> + 2242310567U, // <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1168568826U, // <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2242302514U, // <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2242302605U, // <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6> + 2242310891U, // <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1> + 1168569144U, // <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1222233398U, // <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS + 1222233399U, // <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS + 3316576545U, // <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2> + 3316584871U, // <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1> + 2242851322U, // <4,6,6,2>: 
Cost 3 vmrghw <4,6,5,2>, <6,2,7,3> + 3316601394U, // <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5> + 3852890916U, // <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4> + 3316617963U, // <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1> + 2242884408U, // <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6> + 2785858370U, // <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7> + 2785858379U, // <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7> + 2785858382U, // <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1> + 3859600215U, // <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1> + 3317240314U, // <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3> + 2792199020U, // <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4> + 2785858422U, // <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5> + 3856651132U, // <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2> + 3317240632U, // <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6> + 2303954230U, // <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS + 2303954231U, // <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS + 2244292897U, // <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2> + 2244293031U, // <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1170551290U, // <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2244293170U, // <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2244293261U, // <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6> + 2244293355U, // <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1> + 1170551608U, // <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1222257974U, // <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS + 1222257975U, // <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS + 2238862330U, // <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2> + 2706604134U, // <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 3312604308U, // <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3> + 3768402176U, // <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4> + 2238862648U, // <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5> + 3859600418U, // <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6> + 3729994393U, // <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0> + 2238862956U, // <4,7,0,7>: 
Cost 3 vmrghw <4,0,5,1>, <7,7,7,7> + 2706604701U, // <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 3385610338U, // <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0> + 3780346676U, // <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1> + 2706604954U, // <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4> + 3385610746U, // <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3> + 3385610342U, // <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4> + 3385610667U, // <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5> + 3768403178U, // <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7> + 3385611074U, // <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7> + 2706604954U, // <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4> + 3859600532U, // <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3> + 3712091034U, // <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4> + 3774375528U, // <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2> + 2794853552U, // <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4> + 2785858744U, // <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3> + 3735982182U, // <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4> + 3774375875U, // <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7> + 3735983476U, // <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2> + 2795222237U, // <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4> + 3780348054U, // <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2> + 3730015130U, // <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4> + 3780348244U, // <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3> + 3778357673U, // <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7> + 2325155942U, // <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4> + 3779684939U, // <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7> + 2706606748U, // <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7> + 3398898498U, // <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7> + 2707934014U, // <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7> + 2785858868U, // <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1> + 3780348874U, // <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3> + 3780349000U, // 
<4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3> + 2308575738U, // <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3> + 2656283856U, // <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4> + 2706607414U, // <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 2656285341U, // <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4> + 2241468012U, // <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7> + 2706607657U, // <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 1168569338U, // <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2242311242U, // <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1> + 2242303178U, // <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3> + 2242311395U, // <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1> + 1168569702U, // <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 2242311606U, // <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5> + 2242311662U, // <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7> + 1168569964U, // <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1168569986U, // <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2> + 3316593658U, // <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2> + 3316593738U, // <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1> + 3316634800U, // <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4> + 3386978810U, // <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3> + 2785859072U, // <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7> + 3736014950U, // <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4> + 3316594158U, // <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7> + 2797803032U, // <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4> + 2797876769U, // <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4> + 2243499002U, // <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2> + 3718103962U, // <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4> + 3317257418U, // <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3> + 3377695816U, // <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3> + 2243532134U, // <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6> + 3317282230U, // <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5> + 2730497536U, // <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7> + 2243556972U, // 
<4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7> + 2243565186U, // <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2> + 1170551802U, // <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2706609966U, // <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 2244293797U, // <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2> + 2244293859U, // <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1> + 1170552166U, // <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 2706610330U, // <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 2244294126U, // <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7> + 1170552428U, // <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1170552450U, // <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2> + 1165118354U, // <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1624907878U, // <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS + 2638407377U, // <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u> + 2295931036U, // <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS + 2687369584U, // <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u> + 1165121690U, // <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS + 2662298489U, // <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u> + 2295934280U, // <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS + 1624908445U, // <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS + 2638413926U, // <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS + 2691351382U, // <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u> + 1685280558U, // <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2287313052U, // <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS + 2299257799U, // <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4> + 2694005914U, // <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u> + 2305231362U, // <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6> + 2287316296U, // <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS + 1685280612U, // <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2638422118U, // <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS + 2240206638U, // <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS + 2697987712U, // <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u> + 1624909521U, // <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u> + 2759391121U, 
// <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3> + 2240207002U, // <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS + 2698651578U, // <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7> + 2785859500U, // <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3> + 1628227686U, // <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u> + 2759022524U, // <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1> + 2801342408U, // <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4> + 2703960409U, // <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u> + 2759022554U, // <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4> + 2759022564U, // <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5> + 2240845978U, // <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS + 2706614941U, // <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u> + 2301267272U, // <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS + 2759022596U, // <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1> + 1570668646U, // <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS + 1167726382U, // <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS + 2698652753U, // <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3> + 1234829468U, // <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS + 229035318U, // <4,u,4,4>: Cost 1 vspltisw0 RHS + 1624911158U, // <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS + 2698653081U, // <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7> + 1234832712U, // <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS + 229035318U, // <4,u,4,u>: Cost 1 vspltisw0 RHS + 1168561875U, // <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2> + 94820142U, // <4,u,5,1>: Cost 1 vmrghw RHS, LHS + 1168562053U, // <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0> + 1222230172U, // <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS + 1168562239U, // <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6> + 94820506U, // <4,u,5,5>: Cost 1 vmrghw RHS, RHS + 1685280922U, // <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 1222233416U, // <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS + 94820709U, // <4,u,5,u>: Cost 1 vmrghw RHS, LHS + 1564713062U, // <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS + 2626511979U, // <4,u,6,1>: Cost 3 
vsldoi4 <1,4,u,6>, <1,4,u,6> + 2632484676U, // <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6> + 1564715549U, // <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6> + 1564716342U, // <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS + 2242853018U, // <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS + 2656375464U, // <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6> + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2785859840U, // <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1> + 2243499822U, // <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS + 2727851197U, // <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u> + 2303951004U, // <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS + 2785859880U, // <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5> + 2243500186U, // <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS + 2730505729U, // <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u> + 2303954248U, // <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS + 2303951009U, // <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS + 1564729446U, // <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS + 96810798U, // <4,u,u,1>: Cost 1 vmrghw RHS, LHS + 1685281125U, // <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 1222254748U, // <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS + 229035318U, // <4,u,u,4>: Cost 1 vspltisw0 RHS + 96811162U, // <4,u,u,5>: Cost 1 vmrghw RHS, RHS + 1685281165U, // <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2754232320U, // <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0> + 2754232330U, // <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1> + 3718194894U, // <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5> + 3376385762U, // <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3> + 2754232357U, // <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1> + 3845816370U, // <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5> + 3782353389U, // <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7> + 3376386090U, // <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7> + 2757402697U, // 
<5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1> + 2626543718U, // <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS + 2626544751U, // <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1> + 1680490598U, // <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 3766428665U, // <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0> + 2626546998U, // <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS + 2650435539U, // <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1> + 3783017715U, // <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7> + 3385019000U, // <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7> + 1680490652U, // <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 3376398336U, // <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0> + 2245877862U, // <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS + 3773064808U, // <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2> + 2705295054U, // <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5> + 3827974343U, // <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1> + 3845816530U, // <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3> + 3779037114U, // <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7> + 3810887658U, // <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1> + 2245878429U, // <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS + 2710603926U, // <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2> + 3827974396U, // <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0> + 3779037516U, // <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4> + 3779037596U, // <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3> + 2705295868U, // <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0> + 3379726804U, // <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5> + 3802925748U, // <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4> + 3363138168U, // <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7> + 2707950400U, // <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0> + 2626568294U, // <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS + 1680490834U, // <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5> + 3828048219U, // <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5> + 2710604932U, // <5,0,4,3>: Cost 3 
vsldoi8 <4,3,5,0>, <4,3,5,0> + 2754232685U, // <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5> + 2705296694U, // <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS + 3779038590U, // <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7> + 2713259464U, // <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0> + 1680490834U, // <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5> + 2311307264U, // <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0> + 1174437990U, // <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS + 3779038946U, // <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3> + 3845816752U, // <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0> + 2248180050U, // <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5> + 2248180194U, // <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5> + 3779039274U, // <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7> + 3385051768U, // <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7> + 1174438557U, // <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS + 2302689280U, // <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0> + 1175208038U, // <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS + 3787002362U, // <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3> + 3376432160U, // <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3> + 2248950098U, // <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5> + 2248950180U, // <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6> + 3376433702U, // <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6> + 2729186166U, // <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5> + 1175208605U, // <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS + 2713261050U, // <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2> + 3365823599U, // <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1> + 3808900317U, // <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4> + 3784348899U, // <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1> + 2729186656U, // <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0> + 3787003268U, // <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0> + 3802928664U, // <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4> + 3787003431U, // <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, 
<7,7,0,1> + 2731841188U, // <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0> + 2626601062U, // <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS + 1683145366U, // <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5> + 1680491165U, // <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2705295054U, // <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5> + 2754233005U, // <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1> + 2705299610U, // <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS + 3779041488U, // <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7> + 2737150252U, // <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0> + 1680491219U, // <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2713927680U, // <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0> + 1640185958U, // <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2310607866U, // <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2> + 3787669756U, // <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0> + 2713928018U, // <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5> + 2306621778U, // <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5> + 3787670006U, // <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7> + 3736188301U, // <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0> + 1640186525U, // <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2650505318U, // <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS + 2754233140U, // <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1> + 2311276694U, // <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2> + 2311278315U, // <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3> + 2758435667U, // <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5> + 2754233180U, // <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5> + 3385016497U, // <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6> + 2311278643U, // <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7> + 2758730615U, // <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5> + 3700367462U, // <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS + 3830629255U, // <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3> + 2713929320U, // <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2> + 2754233238U, // 
<5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0> + 2759099300U, // <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5> + 2754233259U, // <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3> + 2713929658U, // <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7> + 3872359354U, // <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0> + 2754233283U, // <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0> + 2713929878U, // <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2> + 3363135498U, // <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1> + 3363137686U, // <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2> + 2713930140U, // <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3> + 2713930242U, // <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6> + 2289394002U, // <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5> + 3787672184U, // <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7> + 3787672259U, // <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1> + 2713930526U, // <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2> + 1634880402U, // <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1> + 2760205355U, // <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5> + 2760279092U, // <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5> + 3787672708U, // <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0> + 2713930960U, // <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4> + 1640189238U, // <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS + 3786345848U, // <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1> + 3787009481U, // <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1> + 1640189466U, // <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1> + 2754233455U, // <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1> + 2713931407U, // <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1> + 2713931499U, // <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3> + 3827975305U, // <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0> + 2754233495U, // <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5> + 2288746834U, // <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5> + 2713931827U, // <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7> + 
3787673725U, // <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0> + 2754233527U, // <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1> + 2668462182U, // <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS + 2290746002U, // <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1> + 2302691478U, // <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2> + 3364488071U, // <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3> + 2302689536U, // <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4> + 2754233587U, // <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7> + 2713932600U, // <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6> + 2713932622U, // <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1> + 2302689297U, // <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u> + 2713932794U, // <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2> + 3365822474U, // <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1> + 3365824662U, // <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2> + 3787674851U, // <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1> + 2713933158U, // <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6> + 2292080978U, // <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5> + 3365823613U, // <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6> + 2713933420U, // <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7> + 2713933442U, // <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2> + 1658771190U, // <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1> + 1640191790U, // <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2762933624U, // <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5> + 2754233724U, // <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0> + 2763081098U, // <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5> + 1640192154U, // <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS + 2713934032U, // <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7> + 2713934080U, // <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1> + 1640192357U, // <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 3779051520U, // <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0> + 2705309798U, // <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS + 3838813637U, // 
<5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1> + 2302640230U, // <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS + 3765117266U, // <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5> + 3381027892U, // <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5> + 3842794985U, // <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1> + 3408232554U, // <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7> + 2302640235U, // <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS + 3700432998U, // <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS + 3765117785U, // <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2> + 2311276136U, // <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2> + 1237532774U, // <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS + 3700436278U, // <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS + 3381036084U, // <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5> + 3385018045U, // <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6> + 3385017560U, // <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7> + 1237532779U, // <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS + 3700441190U, // <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS + 3700442242U, // <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2> + 2754233960U, // <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2> + 2754233970U, // <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3> + 2765071997U, // <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5> + 3834021508U, // <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3> + 3842795152U, // <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6> + 3376402492U, // <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7> + 2754234015U, // <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3> + 2754234022U, // <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1> + 3827975855U, // <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1> + 2644625102U, // <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5> + 2289393766U, // <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS + 1691993806U, // <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5> + 2785052375U, // <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5> + 3854812897U, // <5,2,3,6>: Cost 4 vsldoi12 
<4,u,5,5>, <2,3,6,6> + 3802942187U, // <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5> + 1692288754U, // <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5> + 3839846139U, // <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5> + 2709294052U, // <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2> + 2766251789U, // <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5> + 2765735702U, // <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5> + 3840141087U, // <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5> + 2705313078U, // <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS + 2712612217U, // <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2> + 3787017674U, // <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2> + 2765735747U, // <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5> + 3834021704U, // <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1> + 3834021714U, // <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2> + 2311308904U, // <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2> + 1237565542U, // <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS + 3834021744U, // <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5> + 3369124916U, // <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5> + 2248181690U, // <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7> + 3786354825U, // <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3> + 1237565547U, // <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS + 3700473958U, // <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS + 3700475014U, // <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6> + 2296718952U, // <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2> + 1228947558U, // <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS + 3700477238U, // <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS + 3834021836U, // <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7> + 2248951738U, // <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7> + 3370461105U, // <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7> + 1228947563U, // <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS + 3786355706U, // <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2> + 3783038037U, // <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3> + 
3365824104U, // <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2> + 2292080742U, // <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS + 3842131986U, // <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5> + 3371795508U, // <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5> + 3786356206U, // <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7> + 3786356332U, // <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7> + 2292080747U, // <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS + 2754234427U, // <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1> + 2705315630U, // <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS + 2296735336U, // <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2> + 1228963942U, // <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS + 1695311971U, // <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5> + 2705315994U, // <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS + 2769201269U, // <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5> + 3370477489U, // <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7> + 1695606919U, // <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5> + 3827976331U, // <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0> + 2754234518U, // <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2> + 3706472290U, // <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0> + 3700500630U, // <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2> + 2754234544U, // <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1> + 3376383766U, // <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5> + 3769770513U, // <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7> + 3376383930U, // <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7> + 2754234581U, // <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2> + 2311275414U, // <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0> + 2305967971U, // <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1> + 2692047787U, // <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3> + 2311276146U, // <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3> + 2311275418U, // <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4> + 3765789807U, // <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1> + 3765789939U, // 
<5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7> + 2311276474U, // <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7> + 2696029585U, // <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3> + 2311288709U, // <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0> + 3765790243U, // <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5> + 3827976513U, // <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2> + 2765736268U, // <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4> + 2246248962U, // <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6> + 3765790563U, // <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1> + 3827976550U, // <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3> + 3842795887U, // <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3> + 2769054073U, // <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4> + 3827976575U, // <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1> + 3765790963U, // <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5> + 3839478162U, // <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2> + 2754234780U, // <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3> + 2771708327U, // <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5> + 3363137059U, // <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5> + 3375081320U, // <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6> + 3363137466U, // <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7> + 2772003275U, // <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5> + 2772077012U, // <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5> + 3765791714U, // <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0> + 2709965878U, // <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3> + 2772298223U, // <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5> + 2772371960U, // <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5> + 2754234882U, // <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6> + 3839478282U, // <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5> + 3376416698U, // <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7> + 2754234909U, // <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6> + 2311308182U, // <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, 
<1,2,3,0> + 3765792421U, // <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5> + 2715938575U, // <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3> + 2311308914U, // <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3> + 2311308186U, // <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4> + 2248182354U, // <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5> + 3765792837U, // <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7> + 2311309242U, // <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7> + 2311308190U, // <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u> + 2632777830U, // <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS + 3706520372U, // <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1> + 2632779624U, // <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6> + 2632780290U, // <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6> + 2632781110U, // <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS + 2248952413U, // <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7> + 2302691176U, // <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6> + 2302691258U, // <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7> + 2632783662U, // <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS + 3365823382U, // <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0> + 3706529011U, // <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7> + 3706529641U, // <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7> + 3365824114U, // <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3> + 2774362859U, // <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5> + 3365824035U, // <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5> + 3383740183U, // <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6> + 3363833786U, // <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7> + 2774657807U, // <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5> + 2632794214U, // <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS + 2754235166U, // <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2> + 2632796010U, // <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u> + 2632796676U, // <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u> + 2632797494U, // <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS + 2754235206U, 
// <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6> + 2302691176U, // <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6> + 2302707642U, // <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7> + 2754235229U, // <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2> + 3765133325U, // <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4> + 2705326182U, // <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS + 3718489806U, // <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5> + 3718490624U, // <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4> + 2709307730U, // <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5> + 2302641870U, // <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5> + 3376383695U, // <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6> + 3384351018U, // <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7> + 2705326749U, // <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS + 2305971057U, // <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0> + 3765134171U, // <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4> + 3766461338U, // <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4> + 3766461437U, // <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4> + 2311277776U, // <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4> + 2754235362U, // <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0> + 3783050483U, // <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7> + 3385019036U, // <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7> + 2311276241U, // <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u> + 3718504550U, // <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS + 3783050787U, // <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5> + 3773097576U, // <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2> + 2705327822U, // <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5> + 3773097767U, // <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4> + 2765737014U, // <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3> + 3779069882U, // <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7> + 3376401052U, // <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7> + 2245881370U, // <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1> + 3779070102U, // 
<5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2> + 3363135525U, // <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1> + 3779070284U, // <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4> + 3779070364U, // <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3> + 2705328640U, // <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4> + 2307311310U, // <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5> + 3866021012U, // <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7> + 3363138204U, // <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7> + 2707983172U, // <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4> + 2708646805U, // <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4> + 2709310438U, // <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4> + 3779071030U, // <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3> + 2710637704U, // <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4> + 2754235600U, // <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4> + 1704676570U, // <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5> + 3779071358U, // <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7> + 2713292236U, // <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4> + 1704897781U, // <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5> + 2626871398U, // <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS + 2626872471U, // <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5> + 2765737230U, // <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3> + 3700615318U, // <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2> + 2626874678U, // <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS + 1174441270U, // <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS + 1680493878U, // <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 3385051804U, // <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7> + 1680493896U, // <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2248952722U, // <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1> + 2302692152U, // <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1> + 3382406107U, // <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2> + 3700623874U, // <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6> + 2248953040U, // <5,4,6,4>: Cost 
3 vmrghw <5,6,7,0>, <4,4,4,4> + 1175211318U, // <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS + 3376432280U, // <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6> + 2729218934U, // <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5> + 1175211561U, // <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS + 3787035642U, // <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2> + 3365822501U, // <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1> + 3808933085U, // <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4> + 3784381707U, // <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5> + 2713294182U, // <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6> + 2309998286U, // <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5> + 3383740111U, // <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6> + 3787036239U, // <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5> + 2731873960U, // <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4> + 2626895974U, // <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS + 2626897050U, // <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u> + 2644813518U, // <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5> + 2705327822U, // <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5> + 2626899254U, // <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS + 1707331102U, // <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5> + 1680494121U, // <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2737183024U, // <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4> + 1680494139U, // <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2302642684U, // <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0> + 1640218726U, // <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 3376384510U, // <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2> + 3376385078U, // <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3> + 2754236002U, // <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1> + 2717942242U, // <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5> + 2244907106U, // <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0> + 3376385406U, // <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7> + 1640219293U, // <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 
2305969365U, // <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0> + 1237536282U, // <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 2713961366U, // <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0> + 3766469630U, // <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5> + 2782326455U, // <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5> + 2311277786U, // <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5> + 2311277058U, // <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6> + 3385017587U, // <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7> + 1237536282U, // <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 3376400892U, // <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0> + 3827977963U, // <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3> + 2302659070U, // <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2> + 2765737726U, // <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4> + 3839479558U, // <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3> + 2781073167U, // <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3> + 2713962426U, // <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7> + 3376401790U, // <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7> + 2769055531U, // <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4> + 2713962646U, // <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2> + 3765143786U, // <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5> + 3839479621U, // <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3> + 2289394603U, // <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3> + 2713963010U, // <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6> + 2313285150U, // <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5> + 3363138050U, // <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6> + 3363136755U, // <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7> + 2713963294U, // <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2> + 2713963410U, // <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1> + 3827978127U, // <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5> + 3839479704U, // <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5> + 3376417846U, // <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, 
<4,2,5,3> + 1637567706U, // <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5> + 1640222006U, // <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS + 2310640998U, // <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6> + 3376418174U, // <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7> + 1640222238U, // <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5> + 1577091174U, // <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 2311310226U, // <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1> + 2713964303U, // <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3> + 2311311119U, // <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3> + 1577094454U, // <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,5,5,5>: Cost 1 vspltisw1 RHS + 2311309826U, // <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6> + 2311311447U, // <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7> + 296144182U, // <5,5,5,u>: Cost 1 vspltisw1 RHS + 2248953460U, // <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1> + 2326580114U, // <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1> + 2713965050U, // <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3> + 3700697602U, // <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6> + 2785644620U, // <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5> + 2781073495U, // <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7> + 1228950018U, // <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713965390U, // <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1> + 1228950018U, // <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713965562U, // <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2> + 3383741330U, // <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1> + 3718620878U, // <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5> + 3365823403U, // <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3> + 2713965926U, // <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6> + 2717947318U, // <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5> + 3365825026U, // <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6> + 2292081907U, // <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7> + 2713966210U, // <5,5,7,u>: Cost 3 
vsldoi8 <4,u,5,5>, <7,u,1,2> + 1577091174U, // <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 1640224558U, // <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 2713966469U, // <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0> + 2713966524U, // <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1> + 1577094454U, // <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,5,u,5>: Cost 1 vspltisw1 RHS + 1228950018U, // <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713966848U, // <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1> + 296144182U, // <5,5,u,u>: Cost 1 vspltisw1 RHS + 2705342464U, // <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0> + 1631600742U, // <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 3773112493U, // <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2> + 2705342720U, // <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4> + 2705342802U, // <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5> + 3779084708U, // <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6> + 3779084790U, // <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7> + 2302643510U, // <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS + 1631601309U, // <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 3767141092U, // <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2> + 2705343284U, // <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1> + 2705343382U, // <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0> + 3779085282U, // <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4> + 2693399632U, // <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6> + 3767805089U, // <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6> + 2311279416U, // <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6> + 1237536054U, // <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS + 1237536055U, // <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS + 3773113789U, // <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2> + 3779085855U, // <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1> + 2699372136U, // <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2> + 2705344166U, // <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1> + 2699372329U, // <5,6,2,4>: Cost 3 
vsldoi8 <2,4,5,6>, <2,4,5,6> + 2705344360U, // <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6> + 2705344442U, // <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7> + 2302659894U, // <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS + 2702026861U, // <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6> + 2705344662U, // <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2> + 3767142661U, // <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5> + 3773114689U, // <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2> + 2705344924U, // <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3> + 1631603202U, // <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6> + 3842945597U, // <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7> + 3779086962U, // <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1> + 2289397046U, // <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS + 1634257734U, // <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6> + 2644926566U, // <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS + 3779087306U, // <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3> + 2790142577U, // <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5> + 2644929026U, // <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6> + 2711317723U, // <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6> + 1631604022U, // <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 2712644989U, // <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6> + 2302676278U, // <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS + 1631604265U, // <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 3842945708U, // <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1> + 3767144133U, // <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1> + 2705346328U, // <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3> + 3779088207U, // <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4> + 2717290420U, // <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6> + 2705346574U, // <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6> + 2705346596U, // <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1> + 1237568822U, // <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS + 1237568823U, // <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS + 
2650914918U, // <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS + 3364490949U, // <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1> + 2248954362U, // <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3> + 2302693144U, // <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3> + 2650918198U, // <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS + 2650918926U, // <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6> + 2302693390U, // <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6> + 1228950838U, // <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS + 1228950839U, // <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS + 497467494U, // <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS + 1571210036U, // <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1571210856U, // <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571211414U, // <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497470774U, // <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS + 1571213316U, // <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1571213818U, // <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1571214956U, // <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7> + 497473326U, // <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS + 497475686U, // <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631606574U, // <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 1571219048U, // <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571219606U, // <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497478967U, // <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS + 1631606938U, // <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 1571222010U, // <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1228967222U, // <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS + 497481518U, // <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS + 3768475648U, // <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0> + 2694733926U, // <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 3718711395U, // <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5> + 3384349178U, // <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3> + 2694734162U, // <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5> + 3384347884U, // <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5> + 3730658026U, // <5,7,0,6>: Cost 4 vsldoi4 
<6,5,7,0>, <6,5,7,0> + 3718714362U, // <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2> + 2694734493U, // <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2311278690U, // <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0> + 2305970923U, // <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1> + 3768476566U, // <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0> + 2311279098U, // <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3> + 2311278694U, // <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4> + 3768476783U, // <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1> + 2694735091U, // <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7> + 2311279426U, // <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7> + 2696062357U, // <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7> + 3383701602U, // <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0> + 3768477219U, // <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5> + 3768477288U, // <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2> + 2309960186U, // <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3> + 3383701606U, // <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4> + 3768477545U, // <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7> + 3766486970U, // <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7> + 3383702338U, // <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7> + 2309960186U, // <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3> + 3768477846U, // <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2> + 3768477975U, // <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5> + 3786393932U, // <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4> + 3768478108U, // <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3> + 2795599115U, // <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5> + 3385037470U, // <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5> + 3780422309U, // <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7> + 3848107301U, // <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4> + 2795894063U, // <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5> + 2795967800U, // <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5> + 3768478690U, // <5,7,4,1>: Cost 4 
vsldoi8 <1,6,5,7>, <4,1,5,0> + 3718744163U, // <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5> + 3784404107U, // <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7> + 2796262748U, // <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5> + 2694737206U, // <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2712653182U, // <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7> + 2713316815U, // <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7> + 2694737449U, // <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2311311458U, // <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0> + 3768479433U, // <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5> + 3768479521U, // <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3> + 2311311866U, // <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3> + 2311311462U, // <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4> + 2248185270U, // <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5> + 2718625879U, // <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7> + 2311312194U, // <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7> + 2311311466U, // <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u> + 2248954874U, // <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2> + 3322696778U, // <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1> + 2248955028U, // <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3> + 2656963074U, // <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6> + 2248955238U, // <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6> + 2248955329U, // <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7> + 2656965360U, // <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6> + 2248955500U, // <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7> + 2248955522U, // <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2> + 3718766694U, // <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS + 3724739827U, // <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7> + 3718768739U, // <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5> + 3365826337U, // <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3> + 2798253647U, // <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5> + 3365826258U, // <5,7,7,5>: Cost 4 vmrglw 
<1,6,5,7>, <5,1,7,5> + 3730715377U, // <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7> + 2310665836U, // <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7> + 2798548595U, // <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5> + 2311336034U, // <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0> + 2694739758U, // <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2248955028U, // <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3> + 2311336442U, // <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3> + 2311336038U, // <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4> + 2694740122U, // <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2656981746U, // <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u> + 2311336770U, // <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7> + 2694740325U, // <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2705358848U, // <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0> + 1631617126U, // <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 2310607866U, // <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2> + 2302640284U, // <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS + 2754238189U, // <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1> + 2305296114U, // <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5> + 2244907106U, // <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0> + 2302643528U, // <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS + 1631617693U, // <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 2627133542U, // <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS + 1237536282U, // <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 1680496430U, // <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 1237532828U, // <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS + 2693416018U, // <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u> + 2756892486U, // <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0> + 2694743284U, // <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u> + 1237536072U, // <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS + 1680496484U, // <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2311288709U, // <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0> + 2245883694U, // <5,u,2,1>: Cost 3 vmrghw 
<5,2,1,3>, LHS + 2699388520U, // <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2> + 2754238344U, // <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3> + 2699388715U, // <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u> + 2757408666U, // <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3> + 2705360826U, // <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7> + 2302659912U, // <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS + 2754238389U, // <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3> + 2754238396U, // <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1> + 3827980229U, // <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1> + 2644625102U, // <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5> + 2289393820U, // <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS + 1631619588U, // <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u> + 2785056749U, // <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5> + 3363138077U, // <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6> + 2289397064U, // <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS + 1634274120U, // <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u> + 1634937753U, // <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u> + 1728272410U, // <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5> + 2710006843U, // <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u> + 2765740076U, // <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5> + 1637592285U, // <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u> + 1631620406U, // <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS + 2712661375U, // <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u> + 2302676296U, // <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS + 1631620649U, // <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS + 1577091174U, // <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 1174443822U, // <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS + 2766035058U, // <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3> + 1237565596U, // <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS + 1577094454U, // <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,u,5,5>: Cost 1 vspltisw1 RHS + 1680496794U, // <5,u,5,6>: Cost 2 
vsldoi12 <0,4,1,5>, RHS + 1237568840U, // <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS + 296144182U, // <5,u,5,u>: Cost 1 vspltisw1 RHS + 2633146470U, // <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS + 1175213870U, // <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS + 2633148309U, // <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6> + 1228947612U, // <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS + 2633149750U, // <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS + 1175214234U, // <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS + 1228950018U, // <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 1228950856U, // <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS + 1228947617U, // <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS + 497614950U, // <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS + 1571357492U, // <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1571358312U, // <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571358870U, // <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497618248U, // <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS + 1571360772U, // <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1571361274U, // <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1571361786U, // <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2> + 497620782U, // <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS + 497623142U, // <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631622958U, // <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 1680496997U, // <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 1228963996U, // <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS + 497626441U, // <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS + 296144182U, // <5,u,u,5>: Cost 1 vspltisw1 RHS + 1680497037U, // <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 1228967240U, // <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS + 497628974U, // <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS + 2772451328U, // <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0> + 2772451338U, // <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1> + 3771146417U, // <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6> + 3383095739U, // <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3> + 3846193189U, // <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, 
<0,0,4,1> + 3724832803U, // <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0> + 3383095985U, // <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6> + 3383096067U, // <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7> + 2772451401U, // <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1> + 2651095142U, // <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS + 2251612262U, // <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS + 1698709606U, // <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2651097602U, // <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6> + 2651098422U, // <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS + 2651099172U, // <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1> + 2657071869U, // <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1> + 3724841978U, // <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2> + 1698709660U, // <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2252292096U, // <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0> + 1178550374U, // <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS + 3826655418U, // <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6> + 3777783485U, // <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6> + 2252292434U, // <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5> + 3785746280U, // <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6> + 2252292593U, // <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2> + 3736794583U, // <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2> + 1178550941U, // <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS + 3375153152U, // <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0> + 2772451584U, // <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4> + 3777784163U, // <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0> + 3846193426U, // <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4> + 2712005122U, // <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6> + 3724857382U, // <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3> + 3802335864U, // <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7> + 3801672410U, // <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6> + 2772451647U, // <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4> + 3383123968U, // 
<6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0> + 2772451666U, // <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5> + 3773803577U, // <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6> + 3724864002U, // <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6> + 3846193517U, // <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5> + 2712005935U, // <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0> + 3327009265U, // <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2> + 3383126648U, // <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7> + 2772451729U, // <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5> + 3373178880U, // <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0> + 2254266470U, // <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS + 3785748248U, // <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3> + 3790393190U, // <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0> + 3328000338U, // <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5> + 3785748494U, // <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6> + 3785748516U, // <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1> + 3379153528U, // <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7> + 2254267037U, // <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS + 2254897152U, // <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0> + 1181155430U, // <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS + 3785748923U, // <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3> + 3785749042U, // <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5> + 2254897490U, // <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5> + 3785749169U, // <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6> + 2724614962U, // <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0> + 3787739982U, // <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1> + 1181155997U, // <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS + 1235664896U, // <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235666598U, // <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 3712943720U, // <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2> + 2639202936U, // <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7> + 2639203638U, // <6,0,7,4>: Cost 3 vsldoi4 
<3,6,0,7>, RHS + 2309409236U, // <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5> + 3712946517U, // <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0> + 2309409400U, // <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1235666605U, // <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u> + 1235673088U, // <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235674790U, // <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 1698710173U, // <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2639211129U, // <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u> + 2639211830U, // <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS + 2712008858U, // <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS + 2657129220U, // <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u> + 2309417592U, // <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1698710227U, // <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 3775799296U, // <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0> + 2702057574U, // <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS + 3373143763U, // <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2> + 3695045122U, // <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6> + 3775799634U, // <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5> + 3383091538U, // <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5> + 3368493233U, // <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6> + 3362522319U, // <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7> + 2702058141U, // <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS + 3834250027U, // <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1> + 2772452148U, // <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1> + 3832038210U, // <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6> + 3373150660U, // <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3> + 3834250067U, // <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5> + 3373146450U, // <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5> + 3826656102U, // <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6> + 3362530511U, // <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7> + 2772452148U, // <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1> + 2669092966U, // <6,1,2,0>: Cost 3 vsldoi4 
<u,6,1,2>, LHS + 2252292916U, // <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1> + 2252293014U, // <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0> + 2772452246U, // <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0> + 2669096246U, // <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS + 3846194091U, // <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3> + 2702059450U, // <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7> + 3870081978U, // <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0> + 2702059633U, // <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1> + 3775801494U, // <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2> + 3777128723U, // <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1> + 3775801702U, // <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3> + 3775801756U, // <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3> + 3775801858U, // <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6> + 3375153490U, // <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5> + 3826656265U, // <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7> + 3775802051U, // <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1> + 3775802142U, // <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2> + 3846194206U, // <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1> + 3846194219U, // <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5> + 3846194228U, // <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5> + 3846194236U, // <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4> + 3846194246U, // <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5> + 2760508496U, // <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6> + 3368526001U, // <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6> + 3870082144U, // <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4> + 2760729707U, // <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6> + 2714668660U, // <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1> + 3834619005U, // <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6> + 3834692742U, // <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6> + 3846194317U, // <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4> + 3834840216U, // <6,1,5,4>: 
Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6> + 3834913953U, // <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6> + 2719977570U, // <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0> + 3367208143U, // <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7> + 2719977724U, // <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1> + 2669125734U, // <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS + 2254897972U, // <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1> + 2254898070U, // <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0> + 3775803929U, // <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7> + 2669129014U, // <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS + 2322006354U, // <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5> + 2725950264U, // <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6> + 3793720142U, // <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1> + 2254898556U, // <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0> + 2627330150U, // <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS + 1235664906U, // <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235667094U, // <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2309406894U, // <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3> + 2627333430U, // <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS + 1235665234U, // <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309406897U, // <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309407222U, // <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235664913U, // <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 2627338342U, // <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS + 1235673098U, // <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235675286U, // <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2772452732U, // <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0> + 2627341622U, // <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS + 1235673426U, // <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309415089U, // <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309415414U, // <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235673105U, // <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 3324683725U, // <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0> + 
2725290086U, // <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS + 3771162801U, // <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6> + 2309349478U, // <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS + 3730951478U, // <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS + 3840738784U, // <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1> + 3842655721U, // <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1> + 3736925671U, // <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0> + 2309349483U, // <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS + 3367840468U, // <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0> + 3325355551U, // <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1> + 3373147752U, // <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2> + 2299404390U, // <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS + 3701099830U, // <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS + 3767846054U, // <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2> + 3826656825U, // <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0> + 3373147838U, // <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7> + 2299404395U, // <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS + 2657222758U, // <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS + 3771164219U, // <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2> + 2766481000U, // <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2> + 2772452978U, // <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3> + 2657226038U, // <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS + 3790407528U, // <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6> + 2252294074U, // <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7> + 2252294148U, // <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0> + 2772453023U, // <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3> + 2772453030U, // <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1> + 3834250930U, // <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4> + 2765596349U, // <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6> + 2301411430U, // <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS + 2772453070U, // <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5> + 2765817560U, // <6,2,3,5>: Cost 3 vsldoi12 
<2,3,5,6>, <2,3,5,6> + 2252933050U, // <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7> + 2796340968U, // <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4> + 2766038771U, // <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6> + 3725008998U, // <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS + 3368530217U, // <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1> + 3840222989U, // <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5> + 2309382246U, // <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS + 3725012278U, // <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS + 2766481193U, // <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6> + 3842656049U, // <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5> + 3327010820U, // <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0> + 2766702404U, // <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6> + 3713073254U, // <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS + 3789082310U, // <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2> + 3840665439U, // <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6> + 2766997352U, // <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6> + 3713076534U, // <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS + 3791736842U, // <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2> + 3373180605U, // <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6> + 3793064108U, // <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2> + 2767366037U, // <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6> + 3701137510U, // <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS + 3701138647U, // <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6> + 2254898792U, // <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2> + 1248264294U, // <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS + 3701140790U, // <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS + 3725029435U, // <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6> + 2254899130U, // <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7> + 2725294981U, // <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2> + 1248264299U, // <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS + 2633375846U, // <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS + 2309407468U, // 
<6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235666536U, // <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 161923174U, // <6,2,7,3>: Cost 1 vmrglw RHS, LHS + 2633379126U, // <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS + 2309407796U, // <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5> + 2309408445U, // <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309407960U, // <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 161923179U, // <6,2,7,u>: Cost 1 vmrglw RHS, LHS + 2633384038U, // <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS + 2309415660U, // <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235674728U, // <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 161931366U, // <6,2,u,3>: Cost 1 vmrglw RHS, LHS + 2633387318U, // <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS + 2769135725U, // <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6> + 2309416637U, // <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309416152U, // <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 161931371U, // <6,2,u,u>: Cost 1 vmrglw RHS, LHS + 3777806336U, // <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0> + 2704064614U, // <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS + 3765862577U, // <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6> + 3843393708U, // <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6> + 2250516994U, // <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6> + 3725054014U, // <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0> + 3383093096U, // <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6> + 3368495034U, // <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7> + 2704065181U, // <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS + 2251622550U, // <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2> + 3777807156U, // <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1> + 3765863348U, // <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3> + 3373147762U, // <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3> + 3834251525U, // <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5> + 3373147683U, // <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5> + 3391727545U, // <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6> + 2299406266U, // 
<6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7> + 2251622550U, // <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2> + 2252294294U, // <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2> + 3326036198U, // <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1> + 3771836045U, // <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3> + 2252294556U, // <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3> + 2252294658U, // <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6> + 3840739677U, // <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3> + 2704066490U, // <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7> + 3368511418U, // <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7> + 2252294942U, // <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2> + 3707158630U, // <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS + 3765864692U, // <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6> + 2704066918U, // <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3> + 2772453788U, // <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3> + 2772453799U, // <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5> + 3789752888U, // <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6> + 3840739770U, // <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6> + 2301413306U, // <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7> + 2775108043U, // <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5> + 2651340902U, // <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS + 3846195674U, // <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2> + 3845974503U, // <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6> + 2651343362U, // <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6> + 2651344182U, // <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS + 1698712066U, // <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6> + 3383125864U, // <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6> + 3368527802U, // <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7> + 1698933277U, // <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6> + 3373179798U, // <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0> + 3707176179U, // <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7> + 2716012312U, // 
<6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3> + 3373180530U, // <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3> + 2254309890U, // <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6> + 3785773070U, // <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6> + 3840739932U, // <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6> + 2299439034U, // <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7> + 2719994110U, // <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3> + 2254899350U, // <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2> + 3328641254U, // <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1> + 2633443257U, // <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6> + 2254899612U, // <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3> + 2254899714U, // <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6> + 3785773772U, // <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6> + 2725966648U, // <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6> + 2322007994U, // <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7> + 2254899998U, // <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2> + 1559707750U, // <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS + 2633450292U, // <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1> + 1559709626U, // <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7> + 1235666546U, // <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1559711030U, // <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS + 2309408291U, // <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5> + 2633454152U, // <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0> + 1235666874U, // <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1559713582U, // <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS + 1559715942U, // <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS + 2633458484U, // <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1> + 1559717819U, // <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u> + 1235674738U, // <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1559719222U, // <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS + 1701366598U, // <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6> + 2633462353U, // <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0> + 
1235675066U, // <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1559721774U, // <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS + 3785777152U, // <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0> + 2712035430U, // <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS + 3771179185U, // <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6> + 3846196096U, // <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1> + 3785777490U, // <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5> + 2250517814U, // <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS + 3324259703U, // <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0> + 3383092458U, // <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7> + 2712035997U, // <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS + 3325356946U, // <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1> + 3785777972U, // <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1> + 3846196170U, // <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3> + 3325365380U, // <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0> + 3852168155U, // <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2> + 2251615542U, // <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS + 3325357432U, // <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1> + 3870084088U, // <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4> + 2251615785U, // <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS + 2252295058U, // <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1> + 3771180605U, // <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4> + 3785778792U, // <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2> + 3777816253U, // <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6> + 2252295376U, // <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4> + 1178553654U, // <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS + 2252295545U, // <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2> + 3326037448U, // <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0> + 1178553897U, // <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS + 3785779350U, // <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2> + 3383118648U, // <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1> + 3777816935U, // <6,4,3,2>: Cost 4 vsldoi8 
<3,2,6,4>, <3,2,6,4> + 3785779612U, // <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3> + 2712037890U, // <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6> + 2252754230U, // <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS + 3784452764U, // <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7> + 3801705178U, // <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6> + 2252754473U, // <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS + 3787770770U, // <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1> + 3383126840U, // <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1> + 3327380534U, // <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3> + 3784453265U, // <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4> + 2253630672U, // <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4> + 2778426587U, // <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6> + 3383128789U, // <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6> + 3381799580U, // <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7> + 2778647798U, // <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6> + 2651422822U, // <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS + 3701277928U, // <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5> + 3701278650U, // <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7> + 2651425282U, // <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6> + 2651426102U, // <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS + 2651426892U, // <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5> + 1698712886U, // <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 3725169658U, // <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2> + 1698712904U, // <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2254900114U, // <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1> + 3389115192U, // <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1> + 3785781727U, // <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3> + 3785781810U, // <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5> + 2254900432U, // <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4> + 1181158710U, // <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS + 2254900605U, // <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6> + 
3787772750U, // <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1> + 1181158953U, // <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS + 2639495270U, // <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS + 2639496090U, // <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4> + 3707267011U, // <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7> + 2639497884U, // <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7> + 1237658832U, // <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235666638U, // <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 3713241753U, // <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0> + 2309409436U, // <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1235666641U, // <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u> + 2639503462U, // <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS + 2639504282U, // <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4> + 3701303226U, // <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7> + 2639506077U, // <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u> + 1235676368U, // <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235674830U, // <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 1698713129U, // <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2309417628U, // <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1698713147U, // <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 3775832064U, // <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0> + 2702090342U, // <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS + 3775832241U, // <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6> + 3719227906U, // <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6> + 3775832402U, // <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5> + 3385085146U, // <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5> + 2309351938U, // <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6> + 3376459134U, // <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7> + 2702090909U, // <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS + 3719233546U, // <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1> + 3775832884U, // <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1> + 3775832982U, // <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0> + 3846196909U, 
// <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4> + 3719236984U, // <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1> + 3856150209U, // <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6> + 3834252997U, // <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1> + 3870084817U, // <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4> + 3769861532U, // <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5> + 2645500006U, // <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS + 3719242548U, // <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1> + 3775833704U, // <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2> + 3775833766U, // <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1> + 2645503353U, // <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2> + 2252296196U, // <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5> + 2702092218U, // <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7> + 3719246842U, // <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2> + 2702092405U, // <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5> + 3775834262U, // <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2> + 3777161495U, // <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5> + 3775834470U, // <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3> + 3775834524U, // <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3> + 3775834626U, // <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6> + 3385109722U, // <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5> + 2309376514U, // <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6> + 3775834819U, // <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1> + 2309376514U, // <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6> + 3719258214U, // <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS + 3385117586U, // <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1> + 3327242008U, // <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3> + 3719260674U, // <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6> + 3719261563U, // <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4> + 2702093622U, // <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS + 2309384706U, // <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6> + 3870085060U, // 
<6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4> + 2702093865U, // <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS + 3719266406U, // <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS + 3789106889U, // <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5> + 3785789208U, // <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3> + 3373183950U, // <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3> + 2717355964U, // <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5> + 2791772164U, // <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5> + 2772455438U, // <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6> + 3373183549U, // <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7> + 2720010496U, // <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5> + 2772455460U, // <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1> + 2322008978U, // <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1> + 3840225335U, // <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2> + 2772455490U, // <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4> + 2772455500U, // <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5> + 2254901252U, // <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5> + 2772455520U, // <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7> + 2785874024U, // <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6> + 2772455532U, // <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1> + 2627625062U, // <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS + 1235667858U, // <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309409278U, // <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309407659U, // <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2627628342U, // <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS + 1235668186U, // <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235667458U, // <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309407987U, // <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235667460U, // <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 2627633254U, // <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS + 1235676050U, // <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309417470U, // <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309415851U, // 
<6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2627636534U, // <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS + 1235676378U, // <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235675650U, // <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309416179U, // <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235675652U, // <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 2309352751U, // <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0> + 1650917478U, // <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 2250584570U, // <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3> + 3846197554U, // <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1> + 2724659538U, // <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5> + 3725275225U, // <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0> + 2791772493U, // <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1> + 2309352758U, // <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS + 1650918045U, // <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 3325358368U, // <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1> + 2299406449U, // <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1> + 2724660118U, // <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0> + 3373148518U, // <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3> + 3834253712U, // <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5> + 3373147953U, // <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5> + 2323297080U, // <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6> + 2299407670U, // <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS + 2299407671U, // <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS + 2252296489U, // <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1> + 3326038394U, // <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1> + 1178554874U, // <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2724660902U, // <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1> + 2252296817U, // <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5> + 3840741864U, // <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3> + 2252296976U, // <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2> + 2785874426U, // <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3> + 1178554874U, // 
<6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2724661398U, // <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2> + 3375154665U, // <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1> + 3375154909U, // <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2> + 2301413734U, // <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3> + 2772455986U, // <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5> + 3375154993U, // <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5> + 2323313464U, // <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6> + 2301414710U, // <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS + 2301414711U, // <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS + 2724662162U, // <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1> + 3326939559U, // <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1> + 2253271546U, // <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3> + 3383127346U, // <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3> + 2309385523U, // <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4> + 1650920758U, // <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 2724662653U, // <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6> + 2309385526U, // <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS + 1650921001U, // <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 3725312102U, // <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS + 3373180393U, // <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1> + 3791769368U, // <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3> + 3373181286U, // <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3> + 3725315382U, // <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS + 2299439221U, // <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5> + 2724663394U, // <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0> + 2299440438U, // <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS + 2299440439U, // <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS + 1583808614U, // <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 2322010445U, // <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1> + 2254574074U, // <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3> + 2322010609U, // <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3> + 1583811894U, // 
<6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 2322010773U, // <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5> + 363253046U, // <6,6,6,6>: Cost 1 vspltisw2 RHS + 1248267574U, // <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS + 363253046U, // <6,6,6,u>: Cost 1 vspltisw2 RHS + 2309410095U, // <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0> + 2309408233U, // <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1> + 2311402373U, // <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2> + 2309409126U, // <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3> + 2309410099U, // <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4> + 2309408561U, // <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5> + 1237660472U, // <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6> + 161926454U, // <6,6,7,7>: Cost 1 vmrglw RHS, RHS + 161926455U, // <6,6,7,u>: Cost 1 vmrglw RHS, RHS + 1583808614U, // <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 1650923310U, // <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 1178554874U, // <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2309417318U, // <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3> + 1583811894U, // <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 1650923674U, // <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 363253046U, // <6,6,u,6>: Cost 1 vspltisw2 RHS + 161934646U, // <6,6,u,7>: Cost 1 vmrglw RHS, RHS + 161934647U, // <6,6,u,u>: Cost 1 vmrglw RHS, RHS + 1638318080U, // <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564576358U, // <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712060077U, // <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2712060156U, // <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0> + 1638318418U, // <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1577865314U, // <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0> + 2712060406U, // <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2651608058U, // <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2> + 564576925U, // <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS + 2712060643U, // <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1> + 1638318900U, // <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1638318998U, // <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0> + 3766559753U, 
// <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7> + 2712060971U, // <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5> + 2712061039U, // <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2712061135U, // <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7> + 3373148612U, // <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7> + 1638319484U, // <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0> + 2712061373U, // <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 2712061471U, // <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1> + 1638319720U, // <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638319782U, // <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712061709U, // <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 2712061800U, // <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6> + 1638320058U, // <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7> + 2252297836U, // <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7> + 1638320187U, // <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1> + 1638320278U, // <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712062182U, // <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2712062256U, // <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3> + 1638320540U, // <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638320642U, // <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712062546U, // <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 2712062584U, // <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7> + 2712062659U, // <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1> + 1638320926U, // <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638321042U, // <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712062922U, // <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712063029U, // <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2712063108U, // <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0> + 1638321360U, // <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564579638U, // <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712063357U, // <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6> + 2712063439U, // <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7> + 564579881U, // <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS + 2712063560U, // <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2> + 
2714054287U, // <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712063742U, // <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 3373181295U, // <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3> + 2712063924U, // <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6> + 1638322180U, // <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1638322274U, // <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0> + 3373181380U, // <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7> + 1640313092U, // <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0> + 2712064289U, // <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2> + 2712064423U, // <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1> + 1638322682U, // <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 2712064562U, // <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5> + 2712064653U, // <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6> + 2712064747U, // <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1> + 1638323000U, // <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6> + 1638323022U, // <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 1638323168U, // <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3> + 1237659746U, // <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0> + 2309411158U, // <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1> + 2639718330U, // <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7> + 1235669498U, // <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3> + 1237659750U, // <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4> + 2309411243U, // <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5> + 1583895362U, // <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7> + 1235669826U, // <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7> + 1235669503U, // <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u> + 1638323923U, // <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2> + 564582190U, // <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS + 1638324101U, // <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0> + 1638324156U, // <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1> + 1638324287U, // <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6> + 564582554U, // <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS + 1638324432U, // <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7> + 1235678018U, // <6,7,u,7>: Cost 2 vmrglw RHS, 
<6,6,7,7> + 564582757U, // <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS + 1638326272U, // <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564584550U, // <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712068269U, // <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2309349532U, // <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS + 1638326610U, // <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1577939051U, // <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0> + 2712068598U, // <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2309352776U, // <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS + 564585117U, // <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS + 2712068835U, // <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1> + 1638327092U, // <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1698715438U, // <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2299404444U, // <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS + 2712069163U, // <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5> + 2712069231U, // <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2712069327U, // <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7> + 2299407688U, // <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS + 1698715492U, // <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2712069565U, // <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 1178556206U, // <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS + 1638327912U, // <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638327974U, // <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712069901U, // <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 1178556570U, // <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS + 1638328250U, // <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7> + 2252298496U, // <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1> + 1638328379U, // <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1> + 1638328470U, // <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712070374U, // <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2704107883U, // <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u> + 1638328732U, // <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638328834U, // <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712070738U, // <6,u,3,5>: Cost 3 vsldoi8 RHS, 
<3,5,5,5> + 2712070776U, // <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7> + 2301414728U, // <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS + 1638329118U, // <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638329234U, // <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712071114U, // <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712071221U, // <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2309382300U, // <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS + 1638329552U, // <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564587831U, // <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712071545U, // <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2> + 2309385544U, // <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS + 564588073U, // <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS + 2712071752U, // <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2> + 2714062479U, // <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712071934U, // <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 2299437212U, // <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS + 2712072116U, // <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6> + 1638330372U, // <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1698715802U, // <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2299440456U, // <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS + 1698715820U, // <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 1583808614U, // <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 1181161262U, // <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS + 1638330874U, // <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 1248264348U, // <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS + 1583811894U, // <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 1181161626U, // <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS + 363253046U, // <6,u,6,6>: Cost 1 vspltisw2 RHS + 1638331214U, // <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 363253046U, // <6,u,6,u>: Cost 1 vspltisw2 RHS + 1560076390U, // <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS + 1235664969U, // <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1> + 1560078311U, // <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7> + 161923228U, // <6,u,7,3>: Cost 1 vmrglw RHS, LHS + 1560079670U, // 
<6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS + 1235665297U, // <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5> + 1235667485U, // <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6> + 161926472U, // <6,u,7,7>: Cost 1 vmrglw RHS, RHS + 161923233U, // <6,u,7,u>: Cost 1 vmrglw RHS, LHS + 1560084582U, // <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS + 564590382U, // <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS + 1560086504U, // <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u> + 161931420U, // <6,u,u,3>: Cost 1 vmrglw RHS, LHS + 1560087862U, // <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS + 564590746U, // <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS + 363253046U, // <6,u,u,6>: Cost 1 vspltisw2 RHS + 161934664U, // <6,u,u,7>: Cost 1 vmrglw RHS, RHS + 161931425U, // <6,u,u,u>: Cost 1 vmrglw RHS, LHS + 1705426944U, // <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0> + 1705426954U, // <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1> + 3713550266U, // <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7> + 2316063892U, // <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3> + 2779168805U, // <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1> + 2663698530U, // <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0> + 2657727309U, // <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0> + 2316064220U, // <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7> + 1705427017U, // <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1> + 1583988838U, // <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS + 2779168859U, // <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1> + 631685222U, // <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS + 2639817411U, // <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1> + 1583992118U, // <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS + 2657734660U, // <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5> + 1583993678U, // <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1> + 2657735672U, // <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0> + 631685276U, // <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS + 2779168933U, // <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3> + 2767667377U, // <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6> + 2718713448U, // 
<7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2> + 2718713510U, // <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1> + 3841409228U, // <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6> + 3852910802U, // <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3> + 2718713786U, // <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7> + 3847160036U, // <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3> + 2767667440U, // <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6> + 2718714006U, // <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2> + 2779169020U, // <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0> + 3852910853U, // <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0> + 2718714268U, // <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3> + 2718714370U, // <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6> + 2718714461U, // <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7> + 2706770608U, // <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0> + 3847160114U, // <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0> + 2779169083U, // <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0> + 2718714770U, // <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1> + 1705427282U, // <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5> + 3713583034U, // <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7> + 3713583814U, // <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4> + 2779169133U, // <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5> + 1644973366U, // <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS + 2657760081U, // <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4> + 2259468868U, // <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4> + 1705427345U, // <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5> + 2718715508U, // <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1> + 2260123750U, // <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS + 3792457451U, // <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3> + 3852911024U, // <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0> + 2718715836U, // <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5> + 2718715908U, // <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5> + 1644974178U, // <7,0,5,6>: Cost 2 vsldoi8 
<5,6,7,0>, <5,6,7,0> + 3792457853U, // <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0> + 1646301444U, // <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0> + 2720706901U, // <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0> + 2779169270U, // <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7> + 2718716410U, // <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3> + 2722697800U, // <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0> + 3852911121U, // <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7> + 3852911130U, // <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7> + 2718716728U, // <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6> + 2718716750U, // <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1> + 2779169333U, // <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7> + 2718716922U, // <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2> + 1187872870U, // <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS + 2718717076U, // <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3> + 3847160408U, // <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6> + 2718717286U, // <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6> + 2718717377U, // <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7> + 2718717404U, // <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7> + 2718717478U, // <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0> + 1187873437U, // <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS + 1584046182U, // <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS + 1705427602U, // <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1> + 631685789U, // <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS + 2639874762U, // <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u> + 1584049462U, // <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS + 1644976282U, // <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS + 1584051029U, // <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u> + 2718718208U, // <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1> + 631685843U, // <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS + 2721374218U, // <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1> + 2779169507U, // <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1> + 2779169516U, // <7,1,0,2>: Cost 3 vsldoi12 
RHS, <1,0,2,1> + 3852911348U, // <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0> + 2669743414U, // <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS + 2316058962U, // <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5> + 2316059044U, // <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6> + 2669745146U, // <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2> + 2779169570U, // <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1> + 2779169579U, // <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1> + 1705427764U, // <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779169598U, // <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2> + 3713632972U, // <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1> + 2779169619U, // <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5> + 2779169628U, // <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5> + 2657809239U, // <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1> + 3835290474U, // <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1> + 1705427764U, // <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779169660U, // <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1> + 2779169671U, // <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3> + 2779169680U, // <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3> + 1705427862U, // <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0> + 2779169700U, // <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5> + 2779169707U, // <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3> + 2657817432U, // <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2> + 2803057594U, // <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0> + 1705427907U, // <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0> + 3776538827U, // <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1> + 2319400970U, // <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1> + 2316085398U, // <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2> + 3852911591U, // <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0> + 3852911600U, // <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0> + 2319401298U, // <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5> + 3833668617U, // <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7> + 3367265487U, // <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7> + 
2319400977U, // <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u> + 2724031378U, // <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1> + 2779169835U, // <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5> + 2779169844U, // <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5> + 3852911672U, // <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0> + 2669776182U, // <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS + 2779169872U, // <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6> + 3835290712U, // <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5> + 2669778278U, // <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6> + 2779169898U, // <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5> + 2779169903U, // <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1> + 3835585661U, // <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6> + 3841410182U, // <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6> + 3852911753U, // <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0> + 2779169943U, // <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5> + 2318754130U, // <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5> + 2718724195U, // <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1> + 3859178670U, // <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1> + 2779169975U, // <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1> + 2720715094U, // <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1> + 2761549007U, // <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7> + 2779170008U, // <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7> + 3835438305U, // <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7> + 3835512042U, // <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7> + 2761843955U, // <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7> + 3835659516U, // <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7> + 2803057918U, // <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0> + 2762065166U, // <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7> + 2669797478U, // <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS + 2322087946U, // <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1> + 2317448186U, // <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2> + 3395829934U, // <7,1,7,3>: Cost 4 vmrglw 
<6,6,7,7>, <0,2,1,3> + 2669800758U, // <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS + 2322088274U, // <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5> + 3375923377U, // <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6> + 2731996780U, // <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7> + 2322087953U, // <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u> + 2779170146U, // <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1> + 1705427764U, // <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779170164U, // <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1> + 1705428348U, // <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0> + 2779170186U, // <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5> + 2763171221U, // <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7> + 2657866590U, // <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u> + 2803058080U, // <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0> + 1705428393U, // <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0> + 3713695846U, // <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS + 2779170237U, // <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2> + 2779170245U, // <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1> + 1242316902U, // <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS + 3713699126U, // <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS + 3852912096U, // <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1> + 2767668713U, // <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1> + 2256488426U, // <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1> + 1242316907U, // <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS + 3852912132U, // <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1> + 3852912141U, // <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1> + 3852912149U, // <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0> + 2779170335U, // <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1> + 3852912172U, // <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5> + 3840747062U, // <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6> + 3841410617U, // <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0> + 3795125538U, // <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0> + 2779170380U, // <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1> + 
2779170389U, // <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1> + 3852912222U, // <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1> + 1705428584U, // <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1705428594U, // <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3> + 2779170429U, // <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5> + 3852912259U, // <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2> + 2767668880U, // <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6> + 3841336981U, // <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2> + 1705428639U, // <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3> + 1705428646U, // <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1> + 2779170479U, // <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1> + 2767668925U, // <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6> + 1245659238U, // <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS + 1705428686U, // <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5> + 2779170519U, // <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5> + 2657899362U, // <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3> + 2319406574U, // <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7> + 1705428718U, // <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1> + 3713728614U, // <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS + 3852912388U, // <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5> + 2779170573U, // <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5> + 1242349670U, // <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS + 3713731894U, // <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS + 2779170601U, // <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6> + 2767669041U, // <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5> + 3389834456U, // <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7> + 1242349675U, // <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS + 3852912456U, // <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1> + 3852912466U, // <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2> + 3852912475U, // <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2> + 2779170664U, // <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6> + 3852912496U, // <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5> + 3792474116U, // <7,2,5,5>: Cost 4 vsldoi8 
<5,6,7,2>, <5,5,5,5> + 2718732388U, // <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2> + 3841337228U, // <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6> + 2779170709U, // <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6> + 2640003174U, // <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS + 2721386920U, // <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2> + 2767595441U, // <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7> + 1693927354U, // <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7> + 2640006454U, // <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS + 3841558476U, // <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7> + 2657923941U, // <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6> + 3841337310U, // <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7> + 1694296039U, // <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7> + 2803058666U, // <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1> + 3852912632U, // <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6> + 2322089576U, // <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2> + 1248346214U, // <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS + 3841337362U, // <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5> + 3395830836U, // <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5> + 2261616570U, // <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7> + 3371943857U, // <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7> + 1248346219U, // <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS + 1705429051U, // <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1> + 2779170884U, // <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1> + 1705428584U, // <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1695254620U, // <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7> + 1705429091U, // <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5> + 2779170924U, // <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5> + 2767669361U, // <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1> + 2803058809U, // <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0> + 1695623305U, // <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7> + 2779170955U, // <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0> + 1705429142U, // <7,3,0,1>: Cost 
2 vsldoi12 RHS, <3,0,1,2> + 2634057732U, // <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0> + 2779170983U, // <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1> + 2779170992U, // <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1> + 3852912829U, // <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5> + 2657948520U, // <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0> + 2316060602U, // <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7> + 1705429205U, // <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2> + 3852912860U, // <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0> + 2779171046U, // <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1> + 2779171057U, // <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3> + 3852912887U, // <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0> + 3852912896U, // <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0> + 3852912905U, // <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0> + 3835291923U, // <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1> + 3841411356U, // <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1> + 2779171111U, // <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3> + 2779171120U, // <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3> + 3852912952U, // <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2> + 2779171137U, // <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2> + 2779171144U, // <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0> + 2779171156U, // <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3> + 3852912989U, // <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3> + 2767669606U, // <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3> + 2767669615U, // <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3> + 2779171189U, // <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0> + 2779171198U, // <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0> + 3852913032U, // <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1> + 2704140655U, // <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3> + 1705429404U, // <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2779171238U, // <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4> + 3852913070U, // <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3> + 2657973099U, // <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3> + 
2767669700U, // <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7> + 1705429404U, // <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2779171280U, // <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1> + 2779171290U, // <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2> + 2634090504U, // <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4> + 2779171311U, // <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5> + 2779171319U, // <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4> + 1705429506U, // <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6> + 2722057593U, // <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2> + 2316093370U, // <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7> + 1705429533U, // <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6> + 3852913185U, // <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1> + 3795799695U, // <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1> + 3852913203U, // <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1> + 3852913214U, // <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3> + 3852913225U, // <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5> + 2779171410U, // <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5> + 2718740581U, // <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3> + 3841411685U, // <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6> + 2720067847U, // <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3> + 2773420664U, // <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7> + 3847236225U, // <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7> + 1648316922U, // <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3> + 2773641875U, // <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7> + 2773715612U, // <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7> + 3847531173U, // <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7> + 2722059024U, // <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2> + 2767669943U, // <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7> + 1652298720U, // <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3> + 2767669955U, // <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1> + 3841411788U, // <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1> + 2767669978U, // <7,3,7,2>: Cost 3 
vsldoi12 <2,6,3,7>, <3,7,2,6> + 2722059546U, // <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2> + 2767669995U, // <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5> + 3852913396U, // <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5> + 2722059758U, // <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7> + 2302183354U, // <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7> + 2767670027U, // <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1> + 2774747930U, // <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7> + 1705429790U, // <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2> + 1660262316U, // <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3> + 1705429404U, // <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2775042878U, // <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7> + 1705429830U, // <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6> + 2779171660U, // <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3> + 2767670101U, // <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3> + 1705429853U, // <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2> + 2718744576U, // <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0> + 1645002854U, // <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS + 3852913527U, // <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1> + 3852913536U, // <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1> + 2316061904U, // <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4> + 1705429906U, // <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1> + 2658022257U, // <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0> + 2256489928U, // <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0> + 1707420589U, // <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1> + 3852913590U, // <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1> + 2718745396U, // <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1> + 2779171786U, // <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3> + 3852913616U, // <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0> + 3852913627U, // <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2> + 2779171810U, // <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0> + 3792487631U, // <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7> + 3394456220U, // <7,4,1,7>: Cost 
4 vmrglw <6,4,7,1>, <3,6,4,7> + 2779171837U, // <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0> + 3852913673U, // <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3> + 3852913682U, // <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3> + 2718746216U, // <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2> + 2718746278U, // <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1> + 2779171885U, // <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3> + 2779171893U, // <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2> + 2718746554U, // <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7> + 3847457864U, // <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3> + 2779171921U, // <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3> + 2718746774U, // <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2> + 3852913762U, // <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2> + 3852913772U, // <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3> + 2718747036U, // <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3> + 2718747138U, // <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6> + 2779171972U, // <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0> + 2706803380U, // <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4> + 3847457946U, // <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4> + 2781162655U, // <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0> + 2718747538U, // <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1> + 3852913842U, // <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1> + 3852913852U, // <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2> + 2316096696U, // <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3> + 1705430224U, // <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4> + 1705430234U, // <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5> + 2658055029U, // <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4> + 2316097024U, // <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7> + 1707420917U, // <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5> + 1584316518U, // <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS + 2658059060U, // <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1> + 2640144314U, // <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7> + 2640145131U, // <7,4,5,3>: Cost 3 
vsldoi4 <3,7,4,5>, <3,7,4,5> + 1584319798U, // <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS + 2779172134U, // <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0> + 631688502U, // <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS + 2658063354U, // <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2> + 631688520U, // <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS + 3852914001U, // <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7> + 3852914010U, // <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7> + 2718749178U, // <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3> + 2722730572U, // <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4> + 2723394205U, // <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4> + 2779172221U, // <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6> + 2718749496U, // <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6> + 2718749518U, // <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1> + 2779172249U, // <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7> + 2718749690U, // <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2> + 3847458214U, // <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2> + 2718749880U, // <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3> + 3847458236U, // <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6> + 2718750004U, // <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1> + 1187876150U, // <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS + 2718750208U, // <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7> + 2718750286U, // <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4> + 1187876393U, // <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS + 1584341094U, // <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS + 1645008686U, // <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS + 2640168890U, // <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7> + 2640169710U, // <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u> + 1584344374U, // <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS + 1705430554U, // <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1> + 631688745U, // <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS + 2718750976U, // <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1> + 631688763U, // <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS + 
2646147174U, // <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS + 2779172424U, // <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2> + 3852914258U, // <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3> + 3852914268U, // <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4> + 2779172450U, // <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1> + 2316061914U, // <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5> + 2316061186U, // <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6> + 2646152186U, // <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2> + 2779172486U, // <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1> + 2781163151U, // <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1> + 2321378194U, // <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1> + 3852914339U, // <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3> + 3852914350U, // <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5> + 2781163191U, // <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5> + 3852914363U, // <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0> + 3835588297U, // <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5> + 3835588306U, // <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5> + 2781163223U, // <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1> + 3852914400U, // <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1> + 2781163243U, // <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3> + 3852914419U, // <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2> + 2779172606U, // <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4> + 3780552497U, // <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5> + 2781163279U, // <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3> + 2779172632U, // <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3> + 3835588385U, // <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3> + 2779172650U, // <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3> + 3852914481U, // <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1> + 2319403922U, // <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1> + 2319404409U, // <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2> + 3852914510U, // <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3> + 3779226131U, // <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5> + 2319404250U, // <7,5,3,5>: 
Cost 3 vmrglw <6,2,7,3>, <4,4,5,5> + 2319403522U, // <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6> + 3852914547U, // <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4> + 2319403524U, // <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u> + 2646179942U, // <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS + 2316094354U, // <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1> + 3852914582U, // <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3> + 3852914592U, // <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4> + 2646183372U, // <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4> + 2779172788U, // <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6> + 2316093954U, // <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6> + 2646185318U, // <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6> + 2779172815U, // <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6> + 2781163475U, // <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1> + 2781163484U, // <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1> + 3852914662U, // <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2> + 3852914672U, // <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3> + 2781163515U, // <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5> + 1705431044U, // <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779172878U, // <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6> + 3835588632U, // <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7> + 1705431044U, // <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779172900U, // <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1> + 2781163571U, // <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7> + 3852914743U, // <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2> + 2779172930U, // <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4> + 2779172940U, // <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5> + 2781163607U, // <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7> + 2779172960U, // <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7> + 1705431138U, // <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0> + 1705578603U, // <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0> + 2646204518U, // <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS + 2322090898U, // <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1> + 
3719947880U, // <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2> + 3719948438U, // <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2> + 2646207951U, // <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7> + 2322091226U, // <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5> + 2322090498U, // <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6> + 2646210156U, // <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7> + 2646210350U, // <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS + 2779173062U, // <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1> + 2779173072U, // <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2> + 2319404409U, // <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2> + 2779173092U, // <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4> + 2779173101U, // <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4> + 1705431044U, // <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779173118U, // <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3> + 1705578756U, // <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0> + 1707421965U, // <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0> + 3852914966U, // <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0> + 2779173153U, // <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2> + 2256491002U, // <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3> + 3852914994U, // <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1> + 3852915003U, // <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1> + 2316062652U, // <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5> + 2316063544U, // <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6> + 1242320182U, // <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS + 1242320183U, // <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS + 3852915048U, // <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1> + 3377866217U, // <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1> + 3852915068U, // <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3> + 3833672072U, // <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6> + 3852915088U, // <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5> + 3395122056U, // <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5> + 3389813560U, // <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6> + 2779173287U, // 
<7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1> + 2779320752U, // <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1> + 2658181222U, // <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS + 3852915140U, // <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3> + 2257973754U, // <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3> + 3841413589U, // <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2> + 2658184502U, // <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS + 3852915176U, // <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3> + 2658186117U, // <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2> + 1705431546U, // <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3> + 1705579011U, // <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3> + 3714015334U, // <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS + 3777243425U, // <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6> + 2319405957U, // <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2> + 3375229286U, // <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3> + 2779173426U, // <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5> + 3375228721U, // <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5> + 2319405880U, // <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6> + 1245662518U, // <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS + 1245662519U, // <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS + 3852915291U, // <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1> + 3389834729U, // <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1> + 2259472890U, // <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3> + 3852915321U, // <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4> + 3852915330U, // <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4> + 2779173517U, // <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6> + 2316096312U, // <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6> + 1242352950U, // <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS + 1242352951U, // <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS + 3852915372U, // <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1> + 3835294392U, // <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4> + 3852915395U, // <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6> + 3852915404U, // <7,6,5,3>: Cost 4 vsldoi12 RHS, 
<6,5,3,6> + 3852915412U, // <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5> + 3377899313U, // <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5> + 2718765160U, // <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6> + 2779173611U, // <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1> + 2779321076U, // <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1> + 2658213990U, // <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS + 3852915462U, // <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1> + 2718765562U, // <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3> + 3714042622U, // <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6> + 2658217270U, // <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS + 2724074224U, // <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6> + 1705431864U, // <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6> + 1705431874U, // <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7> + 1705579339U, // <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7> + 1705431886U, // <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1> + 2779173719U, // <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1> + 2779173729U, // <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2> + 2779173736U, // <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0> + 1705431926U, // <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5> + 2779173759U, // <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5> + 2779173765U, // <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2> + 1248349494U, // <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS + 1705431958U, // <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1> + 1705579423U, // <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1> + 2779173801U, // <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2> + 2779321266U, // <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2> + 2779321273U, // <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0> + 1705579463U, // <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5> + 2779173841U, // <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6> + 1705431864U, // <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6> + 1705432032U, // <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3> + 1705579495U, // <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1> + 1242320994U, // <7,7,0,0>: Cost 2 vmrglw 
<5,6,7,0>, <5,6,7,0> + 1705432058U, // <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2> + 3841414146U, // <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1> + 2316063226U, // <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3> + 2779173908U, // <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1> + 2658242658U, // <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0> + 2658243468U, // <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0> + 2316063554U, // <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7> + 1705432121U, // <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2> + 3852915777U, // <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1> + 2779173962U, // <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1> + 2779173973U, // <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3> + 3389813242U, // <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3> + 3852915813U, // <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1> + 3852915821U, // <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0> + 3835294839U, // <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1> + 2329343596U, // <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7> + 2779174027U, // <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3> + 2803061908U, // <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3> + 3852915869U, // <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3> + 2779174053U, // <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2> + 2779174060U, // <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0> + 2803061944U, // <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3> + 3852915905U, // <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3> + 2767672522U, // <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3> + 2791855315U, // <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3> + 2768999644U, // <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3> + 2779174115U, // <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1> + 3852915948U, // <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1> + 3841414394U, // <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6> + 1245663738U, // <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 2779174155U, // <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5> + 3852915988U, // <7,7,3,5>: Cost 4 vsldoi12 RHS, 
<7,3,5,5> + 2706827959U, // <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7> + 2319405890U, // <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7> + 1245663738U, // <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 2779174200U, // <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5> + 3852916030U, // <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2> + 3714099130U, // <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7> + 2316095994U, // <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3> + 1242353766U, // <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4> + 1705432422U, // <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6> + 2658276240U, // <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4> + 2316096322U, // <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7> + 1705432449U, // <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6> + 3852916101U, // <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1> + 3854906765U, // <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0> + 3852916121U, // <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3> + 3389846010U, // <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3> + 3852916141U, // <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5> + 2779174326U, // <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5> + 2779174337U, // <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7> + 2329376364U, // <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7> + 2779321811U, // <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7> + 2658287718U, // <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS + 3852916197U, // <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7> + 2779174382U, // <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7> + 2316112378U, // <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3> + 2658290998U, // <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS + 3852916233U, // <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7> + 1651004226U, // <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7> + 2779174420U, // <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0> + 1652331492U, // <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7> + 1590526054U, // <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS + 2328728623U, // <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1> + 
2724746451U, // <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3> + 2322092538U, // <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3> + 1590529334U, // <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS + 2328728951U, // <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5> + 2724746770U, // <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7> + 430361910U, // <7,7,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,7,7,u>: Cost 1 vspltisw3 RHS + 1242320994U, // <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0> + 1705580162U, // <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2> + 2779321996U, // <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3> + 1245663738U, // <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 1242353766U, // <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4> + 1705580202U, // <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6> + 1662949620U, // <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7> + 430361910U, // <7,7,u,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,7,u,u>: Cost 1 vspltisw3 RHS + 1705426944U, // <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0> + 1705432787U, // <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2> + 2316060885U, // <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2> + 1242316956U, // <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS + 2779174637U, // <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1> + 1182750874U, // <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS + 2316061213U, // <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6> + 1242320200U, // <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS + 1705432850U, // <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2> + 1584578662U, // <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS + 1705427764U, // <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 631691054U, // <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS + 2640407307U, // <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1> + 1584581942U, // <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS + 2779174726U, // <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0> + 1584583574U, // <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1> + 2779322201U, // <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1> + 631691108U, // <7,u,1,u>: 
Cost 1 vsldoi12 RHS, LHS + 2779174763U, // <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1> + 2779174774U, // <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3> + 1705428584U, // <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1705432965U, // <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0> + 2779174801U, // <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3> + 2779174810U, // <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3> + 2767673251U, // <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3> + 1705580460U, // <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3> + 1705433010U, // <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0> + 1705433020U, // <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1> + 2779174853U, // <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1> + 2767673299U, // <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6> + 1245659292U, // <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS + 1705433060U, // <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5> + 2779174893U, // <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5> + 2706836152U, // <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u> + 1245662536U, // <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS + 1705433092U, // <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1> + 2779174925U, // <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1> + 1185732398U, // <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS + 2316093653U, // <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2> + 1242349724U, // <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS + 1705430224U, // <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4> + 1705433151U, // <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6> + 2316093981U, // <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6> + 1242352968U, // <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS + 1705433178U, // <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6> + 1584611430U, // <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS + 2781165670U, // <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0> + 2640439226U, // <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7> + 2640440079U, // <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5> + 1584614710U, // <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS + 1705431044U, // <7,u,5,5>: 
Cost 2 vsldoi12 RHS, <5,5,5,5> + 631691418U, // <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS + 2779322525U, // <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1> + 631691436U, // <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS + 2779175087U, // <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1> + 2779175102U, // <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7> + 1648357887U, // <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u> + 1705433296U, // <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7> + 2779175127U, // <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5> + 2779175138U, // <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7> + 1651012419U, // <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u> + 1705580788U, // <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7> + 1705433341U, // <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7> + 1705580800U, // <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1> + 1187878702U, // <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS + 2768042263U, // <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6> + 1248346268U, // <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS + 1705580840U, // <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5> + 1187879066U, // <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS + 2779322679U, // <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2> + 430361910U, // <7,u,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,u,7,u>: Cost 1 vspltisw3 RHS + 1705433425U, // <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1> + 1705433435U, // <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2> + 631691621U, // <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS + 1705433451U, // <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0> + 1705433465U, // <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5> + 1705433475U, // <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6> + 631691661U, // <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS + 430361910U, // <7,u,u,7>: Cost 1 vspltisw3 RHS + 631691675U, // <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS + 202162278U, // <u,0,0,0>: Cost 1 vspltisw0 LHS + 1678598154U, // <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1> + 2634500154U, // <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0> + 2289596269U, // <u,0,0,3>: Cost 3 vmrglw 
<1,2,u,0>, <u,2,0,3> + 1548815670U, // <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS + 2663698530U, // <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0> + 2658390942U, // <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0> + 2289596597U, // <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7> + 202162278U, // <u,0,0,u>: Cost 1 vspltisw0 LHS + 1560764518U, // <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS + 115720294U, // <u,0,1,1>: Cost 1 vmrghw LHS, LHS + 604856427U, // <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS + 2634508438U, // <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2> + 1560767798U, // <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS + 2652426438U, // <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1> + 1584657311U, // <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1> + 2658399226U, // <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2> + 604856476U, // <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696889850U, // <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0> + 1190174822U, // <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS + 2692245096U, // <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2> + 2692245158U, // <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1> + 2263916882U, // <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5> + 2299709908U, // <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5> + 2692245434U, // <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7> + 2701535281U, // <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0> + 1190175389U, // <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS + 1209237504U, // <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1209239206U, // <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 2704189813U, // <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0> + 2692245916U, // <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3> + 2282981033U, // <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4> + 2664386658U, // <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0> + 2691877496U, // <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7> + 2664388218U, // <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3> + 1209239213U, // <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 2289623040U, 
// <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0> + 1678598482U, // <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5> + 2634532926U, // <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4> + 2235580672U, // <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4> + 1143619922U, // <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1618505014U, // <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS + 2658423714U, // <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4> + 2713259464U, // <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0> + 1683243409U, // <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5> + 1192443904U, // <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 118702182U, // <u,0,5,1>: Cost 1 vmrghw RHS, LHS + 2266185901U, // <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2> + 2640513816U, // <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5> + 1192444242U, // <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2718789636U, // <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5> + 1645047915U, // <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0> + 2664404604U, // <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5> + 118702749U, // <u,0,5,u>: Cost 1 vmrghw RHS, LHS + 2302910464U, // <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0> + 1192886374U, // <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS + 2718790138U, // <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3> + 2722771537U, // <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0> + 2266628434U, // <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5> + 2248950180U, // <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6> + 2718790456U, // <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6> + 2718790478U, // <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1> + 1192886941U, // <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS + 1235812352U, // <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235814054U, // <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 2728080601U, // <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0> + 2640530202U, // <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7> + 2640530742U, // <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS + 2309556692U, // <u,0,7,5>: Cost 3 
vmrglw RHS, <3,4,0,5> + 2730735133U, // <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0> + 2309556856U, // <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1235814061U, // <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u> + 202162278U, // <u,0,u,0>: Cost 1 vspltisw0 LHS + 120365158U, // <u,0,u,1>: Cost 1 vmrghw LHS, LHS + 604856989U, // <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS + 2692249532U, // <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1> + 1560825142U, // <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS + 1618507930U, // <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS + 1584714662U, // <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u> + 2309565048U, // <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 604857043U, // <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS + 1611210825U, // <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1> + 1616519270U, // <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS + 2287605459U, // <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2> + 2640546588U, // <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0> + 2622631222U, // <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS + 2289590610U, // <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5> + 2664436630U, // <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1> + 2664437376U, // <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0> + 1616519889U, // <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1> + 1548894866U, // <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1> + 269271142U, // <u,1,1,1>: Cost 1 vspltisw1 LHS + 1189462934U, // <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 2622638230U, // <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2> + 1548897590U, // <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS + 2756985692U, // <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5> + 2658472872U, // <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1> + 2287614142U, // <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7> + 269271142U, // <u,1,1,u>: Cost 1 vspltisw1 LHS + 1566818406U, // <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS + 2756985735U, // <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3> + 1148371862U, // <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, 
<1,2,3,0> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1566821686U, // <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS + 2756985771U, // <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3> + 2690262970U, // <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7> + 1590711938U, // <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 2282979337U, // <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1209237514U, // <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1209239702U, // <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2282979502U, // <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2282979341U, // <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4> + 1209237842U, // <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2282979505U, // <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287625423U, // <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1209237521U, // <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 1635101613U, // <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1> + 2289623050U, // <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1> + 2289625238U, // <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2> + 2640579360U, // <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4> + 2622663990U, // <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS + 1616522550U, // <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS + 2664469398U, // <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1> + 2664470148U, // <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4> + 1616522793U, // <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS + 1548927638U, // <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5> + 1192444724U, // <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1192444822U, // <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 2622670998U, // <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2> + 1548930358U, // <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS + 1210728786U, // <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 2714153058U, // <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0> + 2670449658U, // <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2> + 1548932910U, // <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS + 
2622677655U, // <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6> + 2756986063U, // <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7> + 2302912662U, // <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2> + 3696421014U, // <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2> + 2622680374U, // <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS + 2756986099U, // <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7> + 2714153784U, // <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6> + 1651692438U, // <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1> + 1652356071U, // <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1> + 2628657254U, // <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS + 1235812362U, // <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235814550U, // <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2309554350U, // <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3> + 2628660534U, // <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS + 1235812690U, // <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309554353U, // <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309554678U, // <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235812369U, // <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 1548952217U, // <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u> + 269271142U, // <u,1,u,1>: Cost 1 vspltisw1 LHS + 1209280662U, // <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1548954934U, // <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS + 1209278802U, // <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2283020465U, // <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 1590761096U, // <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 2702876672U, // <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0> + 1629134950U, // <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS + 2289591912U, // <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2> + 1215848550U, // <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS + 2702877010U, // <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5> + 2289222708U, // <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5> + 2779178473U, // <u,2,0,6>: Cost 3 
vsldoi12 RHS, <2,0,6,1> + 2726249024U, // <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0> + 1215848555U, // <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS + 2690933539U, // <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2> + 2628683124U, // <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1> + 1189463656U, // <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1213866086U, // <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS + 2628685110U, // <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS + 2263205736U, // <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6> + 1189463994U, // <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2263205866U, // <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1213866091U, // <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS + 1556938854U, // <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 2697569869U, // <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2> + 336380006U, // <u,2,2,2>: Cost 1 vspltisw2 LHS + 1678599794U, // <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3> + 1556942134U, // <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 2295138061U, // <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5> + 2702878650U, // <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7> + 2300229831U, // <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7> + 336380006U, // <u,2,2,u>: Cost 1 vspltisw2 LHS + 475243165U, // <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS + 1548985140U, // <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1209239144U, // <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2> + 135495782U, // <u,2,3,3>: Cost 1 vmrglw LHS, LHS + 475245878U, // <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS + 1596764164U, // <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1596764666U, // <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1596765178U, // <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 135495787U, // <u,2,3,u>: Cost 1 vmrglw LHS, LHS + 2708851630U, // <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2> + 2217362979U, // <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5> + 2289624680U, // <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2> + 1215881318U, // <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS + 2726767824U, // <u,2,4,4>: 
Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4> + 1629138230U, // <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS + 2779178801U, // <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5> + 2726251976U, // <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0> + 1215881323U, // <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS + 2628714598U, // <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS + 2628715896U, // <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5> + 1192445544U, // <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1213898854U, // <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS + 2628717878U, // <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS + 2726768644U, // <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5> + 1192445882U, // <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2266187754U, // <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1> + 1213898859U, // <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS + 2634694758U, // <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS + 2721460657U, // <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2> + 2296940136U, // <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2> + 1678600122U, // <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7> + 2634698038U, // <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS + 3370682125U, // <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5> + 1157056442U, // <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2725442455U, // <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2> + 1678600167U, // <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7> + 1653027897U, // <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2> + 2309554924U, // <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235813992U, // <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 162070630U, // <u,2,7,3>: Cost 1 vmrglw RHS, LHS + 2634706230U, // <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS + 2309555252U, // <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5> + 2309555901U, // <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309555416U, // <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 162070635U, // <u,2,7,u>: Cost 1 vmrglw RHS, LHS + 475284130U, // <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS + 1549026100U, // <u,2,u,1>: Cost 2 vsldoi4 LHS, 
<1,1,1,1> + 336380006U, // <u,2,u,2>: Cost 1 vspltisw2 LHS + 135536742U, // <u,2,u,3>: Cost 1 vmrglw LHS, LHS + 475286838U, // <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS + 1629141146U, // <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS + 1194108858U, // <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 1596806138U, // <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 135536747U, // <u,2,u,u>: Cost 1 vmrglw LHS, LHS + 1611890688U, // <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 538149020U, // <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685632685U, // <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2685632764U, // <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611891026U, // <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2733408722U, // <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7> + 2658612153U, // <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0> + 2289592250U, // <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7> + 538149533U, // <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS + 1189464214U, // <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 1611891508U, // <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611891606U, // <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 1189464476U, // <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 1189464578U, // <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2690278511U, // <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2690278607U, // <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 2287609786U, // <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7> + 1611892092U, // <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 2685634042U, // <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0> + 2685634079U, // <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1> + 1611892328U, // <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2> + 1611892390U, // <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 2685634371U, // <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5> + 2685634453U, // <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6> + 1611892666U, // <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 2300225466U, // <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7> + 1611892795U, // <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1> + 
1209238422U, // <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 2282980247U, // <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1> + 1561004120U, // <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3> + 403488870U, // <u,3,3,3>: Cost 1 vspltisw3 LHS + 1209238426U, // <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 2282980899U, // <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5> + 2282985598U, // <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6> + 1209239482U, // <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 403488870U, // <u,3,3,u>: Cost 1 vspltisw3 LHS + 1555038310U, // <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS + 1555039616U, // <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4> + 2628781672U, // <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2> + 2289624690U, // <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3> + 1555041590U, // <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS + 538152246U, // <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS + 2658644925U, // <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4> + 2289625018U, // <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7> + 538152489U, // <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS + 1192446102U, // <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2733411983U, // <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1> + 2634762330U, // <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5> + 1192446364U, // <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1192446466U, // <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 1659670532U, // <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1659670626U, // <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0> + 2287642554U, // <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7> + 1659670788U, // <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0> + 2634768486U, // <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS + 2733412775U, // <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1> + 1648390659U, // <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3> + 2634770973U, // <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6> + 2634771766U, // <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS + 2733413099U, // <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1> + 1659671352U, // <u,3,6,6>: Cost 2 
vsldoi8 LHS, <6,6,6,6> + 1659671374U, // <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1652372457U, // <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3> + 1561034854U, // <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS + 2634777396U, // <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1> + 1561036892U, // <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7> + 1235814002U, // <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1561038134U, // <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS + 2309555747U, // <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5> + 2309556072U, // <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6> + 1235814330U, // <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1561040686U, // <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS + 1611896531U, // <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2> + 538154798U, // <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS + 1611896712U, // <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3> + 403488870U, // <u,3,u,3>: Cost 1 vspltisw3 LHS + 1611896895U, // <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6> + 538155162U, // <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS + 1611897040U, // <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7> + 1209280442U, // <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 538155365U, // <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS + 1165118354U, // <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1618534502U, // <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 2634795102U, // <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0> + 2686451968U, // <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4> + 2692276562U, // <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5> + 1705438098U, // <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1> + 2658685890U, // <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0> + 2256489928U, // <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0> + 1618535069U, // <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 1189464978U, // <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2692277044U, // <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1> + 1618535367U, // <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4> + 2640775992U, // <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, 
<3,u,4,1> + 1189465296U, // <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 115723574U, // <u,4,1,5>: Cost 1 vmrghw LHS, RHS + 2263207289U, // <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2> + 2664666780U, // <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1> + 115723817U, // <u,4,1,u>: Cost 1 vmrghw LHS, RHS + 2263919506U, // <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1> + 2222115812U, // <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2> + 2692277864U, // <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2> + 2692277926U, // <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1> + 2324114640U, // <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4> + 1190178102U, // <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS + 2692278202U, // <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7> + 2701568053U, // <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4> + 1190178345U, // <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS + 2692278422U, // <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2> + 2282981552U, // <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1> + 2704222585U, // <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4> + 2692278684U, // <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3> + 1257016528U, // <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1209239246U, // <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 2691910300U, // <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7> + 2664683166U, // <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3> + 1209239249U, // <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 1573027942U, // <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS + 2634826695U, // <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4> + 2634827874U, // <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4> + 2289629073U, // <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3> + 229035318U, // <u,4,4,4>: Cost 1 vspltisw0 RHS + 1618537782U, // <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS + 2658718662U, // <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4> + 2289629401U, // <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7> + 229035318U, // <u,4,4,u>: Cost 1 vspltisw0 RHS + 1561092198U, // <u,4,5,0>: Cost 2 
vsldoi4 <2,u,4,5>, LHS + 2628863370U, // <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5> + 1561094243U, // <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5> + 2634836118U, // <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2> + 1561095478U, // <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS + 118705462U, // <u,4,5,5>: Cost 1 vmrghw RHS, RHS + 604859702U, // <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS + 2658726906U, // <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2> + 604859720U, // <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS + 2266631058U, // <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1> + 2302692152U, // <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1> + 2718822906U, // <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3> + 2722804309U, // <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4> + 2723467942U, // <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4> + 1192889654U, // <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS + 2718823224U, // <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6> + 2718823246U, // <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1> + 1192889897U, // <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS + 2640822374U, // <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS + 2640823194U, // <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4> + 2728113373U, // <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4> + 2640825150U, // <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7> + 1235815632U, // <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235814094U, // <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 2730767905U, // <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4> + 2309556892U, // <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1235814097U, // <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u> + 1561116774U, // <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS + 1618540334U, // <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 1561118822U, // <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u> + 2692282300U, // <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1> + 229035318U, // <u,4,u,4>: Cost 1 vspltisw0 RHS + 120368438U, // <u,4,u,5>: Cost 1 vmrghw LHS, RHS + 604859945U, // <u,4,u,6>: Cost 
1 vsldoi12 LHS, RHS + 2309565084U, // <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 604859963U, // <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS + 2690293760U, // <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0> + 1616552038U, // <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS + 2640840434U, // <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5> + 2640841536U, // <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0> + 1613381970U, // <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5> + 2316135642U, // <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5> + 2289592834U, // <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6> + 2664732324U, // <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0> + 1616552661U, // <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5> + 1573077094U, // <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS + 1237536282U, // <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 2690294678U, // <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0> + 2646821014U, // <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2> + 1573080602U, // <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1> + 1189466116U, // <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1189466210U, // <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 2646823930U, // <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2> + 1573082926U, // <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS + 2640855142U, // <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS + 2697594448U, // <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5> + 2690295400U, // <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2> + 1625179890U, // <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5> + 2699585347U, // <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5> + 2781171471U, // <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3> + 2690295738U, // <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7> + 3775318070U, // <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5> + 1628498055U, // <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5> + 2287627234U, // <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1257016210U, // <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2646836942U, // <u,5,3,2>: Cost 3 vsldoi4 
<4,u,5,3>, <2,3,4,5> + 2287625131U, // <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287627238U, // <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1257016538U, // <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1209240066U, // <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287625459U, // <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1209240068U, // <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 2640871526U, // <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS + 2316168082U, // <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1> + 2640873202U, // <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5> + 2640874308U, // <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4> + 1637788917U, // <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5> + 1616555318U, // <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS + 2287638591U, // <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6> + 2664765096U, // <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4> + 1616555561U, // <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS + 1573109862U, // <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS + 2646852404U, // <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1> + 2646853224U, // <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2> + 2287646618U, // <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3> + 1573113374U, // <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5> + 296144182U, // <u,5,5,5>: Cost 1 vspltisw1 RHS + 1192448098U, // <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 2287646946U, // <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7> + 296144182U, // <u,5,5,u>: Cost 1 vspltisw1 RHS + 1567146086U, // <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS + 2628945300U, // <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6> + 2634917997U, // <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6> + 1567148870U, // <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6> + 1567149366U, // <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS + 2781171799U, // <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7> + 1228950018U, // <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 
2628952166U, // <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS + 1235815314U, // <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309556734U, // <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309555115U, // <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2628955446U, // <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS + 1235815642U, // <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235814914U, // <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309555443U, // <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235814916U, // <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 1567162470U, // <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS + 1616557870U, // <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS + 2690299781U, // <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0> + 1567165256U, // <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u> + 1567165750U, // <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS + 296144182U, // <u,5,u,5>: Cost 1 vspltisw1 RHS + 1209281026U, // <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2705563648U, // <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0> + 1631821926U, // <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS + 2262462970U, // <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3> + 2646886941U, // <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6> + 2705563986U, // <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5> + 2316062652U, // <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5> + 2316137272U, // <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6> + 1215851830U, // <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS + 1215851831U, // <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS + 2634948710U, // <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS + 2705564468U, // <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1> + 1189466618U, // <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2263208498U, // <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5> + 2693620843U, // <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6> + 2652868860U, // <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1> + 1189466936U, // <u,6,1,6>: Cost 2 vmrghw 
LHS, <6,6,6,6> + 1213869366U, // <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS + 1213869367U, // <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS + 2658844774U, // <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS + 3771344465U, // <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6> + 1178554874U, // <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2698929907U, // <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6> + 2699593540U, // <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6> + 2700257173U, // <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6> + 2705565626U, // <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7> + 1226485046U, // <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS + 1226485047U, // <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS + 2705565846U, // <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2> + 2330756585U, // <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1> + 2330756829U, // <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2> + 2282981734U, // <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 1631824413U, // <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6> + 2652885246U, // <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3> + 1257018168U, // <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135499062U, // <u,6,3,7>: Cost 1 vmrglw LHS, RHS + 135499063U, // <u,6,3,u>: Cost 1 vmrglw LHS, RHS + 2646917222U, // <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS + 2217365931U, // <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5> + 2790167156U, // <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u> + 2646919709U, // <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6> + 2711538934U, // <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6> + 1631825206U, // <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS + 2316170040U, // <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6> + 1215884598U, // <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS + 1215884599U, // <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS + 2634981478U, // <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS + 2266190247U, // <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1192448506U, // <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2266190386U, // <u,6,5,3>: Cost 3 vmrghw 
RHS, <6,3,4,5> + 2634984758U, // <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS + 2652901632U, // <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5> + 1192448824U, // <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1213902134U, // <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS + 1213902135U, // <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS + 1583808614U, // <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 2322010445U, // <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1> + 2718839290U, // <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3> + 2670823965U, // <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6> + 1583811894U, // <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 2724147961U, // <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6> + 363253046U, // <u,6,6,6>: Cost 1 vspltisw2 RHS + 1229172022U, // <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS + 363253046U, // <u,6,6,u>: Cost 1 vspltisw2 RHS + 499458150U, // <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS + 1573200692U, // <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1573201512U, // <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1573202070U, // <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 499461673U, // <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS + 1573203972U, // <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1235817272U, // <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6> + 162073910U, // <u,6,7,7>: Cost 1 vmrglw RHS, RHS + 162073911U, // <u,6,7,u>: Cost 1 vmrglw RHS, RHS + 499466342U, // <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631827758U, // <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS + 1573209704U, // <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1573210262U, // <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 499469866U, // <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS + 1631828122U, // <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS + 363253046U, // <u,6,u,6>: Cost 1 vspltisw2 RHS + 135540022U, // <u,6,u,7>: Cost 1 vmrglw LHS, RHS + 135540023U, // <u,6,u,u>: Cost 1 vmrglw LHS, RHS + 1638465536U, // <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564723814U, // <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712207533U, // <u,7,0,2>: Cost 3 vsldoi8 RHS, 
<0,2,1,2> + 2712207612U, // <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0> + 1638465874U, // <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1579192580U, // <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0> + 2712207862U, // <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2316137282U, // <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7> + 564724381U, // <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS + 1189467130U, // <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 1638466356U, // <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1638466454U, // <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0> + 2311500282U, // <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3> + 1189467494U, // <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2712208495U, // <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2694956302U, // <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7> + 1189467756U, // <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1638466940U, // <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0> + 2712208829U, // <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 2712208927U, // <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1> + 1638467176U, // <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638467238U, // <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712209165U, // <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 2712209256U, // <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6> + 1627187175U, // <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7> + 2324116290U, // <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7> + 1628514441U, // <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7> + 1638467734U, // <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712209638U, // <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2700929387U, // <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u> + 1638467996U, // <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638468098U, // <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712210002U, // <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 1585189856U, // <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3> + 1257018178U, // <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1638468382U, // <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 
1638468498U, // <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712210378U, // <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712210485U, // <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2712210564U, // <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0> + 1638468816U, // <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564727112U, // <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712210809U, // <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2> + 2712210888U, // <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0> + 564727337U, // <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS + 1192449018U, // <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2714201743U, // <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712211198U, // <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 2311533050U, // <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3> + 1192449382U, // <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 1638469636U, // <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1638469730U, // <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0> + 1192449644U, // <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1638469892U, // <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0> + 2712211745U, // <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2> + 2712211879U, // <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1> + 1638470138U, // <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 2712212018U, // <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5> + 2712212109U, // <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6> + 2712212203U, // <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1> + 1638470456U, // <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6> + 1638470478U, // <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 1638470559U, // <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1> + 1235816546U, // <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0> + 2309558371U, // <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1> + 2641045434U, // <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7> + 1235816954U, // <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3> + 1235816550U, // <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4> + 2309558375U, // <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5> + 1585222628U, // <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7> 
+ 430361910U, // <u,7,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <u,7,7,u>: Cost 1 vspltisw3 RHS + 1638471379U, // <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2> + 564729646U, // <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS + 1638471557U, // <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0> + 1638471612U, // <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1> + 1638471743U, // <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6> + 564730010U, // <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS + 1638471888U, // <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7> + 430361910U, // <u,7,u,7>: Cost 1 vspltisw3 RHS + 564730213U, // <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS + 202162278U, // <u,u,0,0>: Cost 1 vspltisw0 LHS + 538189985U, // <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685673645U, // <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 1215848604U, // <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS + 1611931986U, // <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 1579266317U, // <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0> + 2289592861U, // <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6> + 1215851848U, // <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS + 538190493U, // <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS + 1549411025U, // <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1> + 115726126U, // <u,u,1,1>: Cost 1 vmrghw LHS, LHS + 604862254U, // <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS + 1213866140U, // <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS + 1549413686U, // <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS + 115726490U, // <u,u,1,5>: Cost 1 vmrghw LHS, RHS + 1585247207U, // <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1> + 1213869384U, // <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS + 604862308U, // <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS + 1567334502U, // <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS + 1190180654U, // <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS + 336380006U, // <u,u,2,2>: Cost 1 vspltisw2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1567337782U, // <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS + 1190181018U, // <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS + 1611933626U, // <u,u,2,6>: Cost 2 vsldoi8 LHS, 
<2,6,3,7> + 1226485064U, // <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 475685587U, // <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS + 1209239278U, // <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1> + 1209239765U, // <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2> + 135495836U, // <u,u,3,3>: Cost 1 vmrglw LHS, LHS + 475688246U, // <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS + 1209239282U, // <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5> + 1209240093U, // <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135499080U, // <u,u,3,7>: Cost 1 vmrglw LHS, RHS + 135495841U, // <u,u,3,u>: Cost 1 vmrglw LHS, LHS + 1555406950U, // <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS + 1555408301U, // <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4> + 2289625301U, // <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2> + 1215881372U, // <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS + 229035318U, // <u,u,4,4>: Cost 1 vspltisw0 RHS + 538193206U, // <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS + 2289625629U, // <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6> + 1215884616U, // <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS + 538193449U, // <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS + 1549443797U, // <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5> + 118708014U, // <u,u,5,1>: Cost 1 vmrghw RHS, LHS + 1561389191U, // <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5> + 1213898908U, // <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS + 1549446454U, // <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS + 118708378U, // <u,u,5,5>: Cost 1 vmrghw RHS, RHS + 604862618U, // <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS + 1213902152U, // <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS + 604862636U, // <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS + 1567367270U, // <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS + 1192892206U, // <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS + 1638478330U, // <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 1679046864U, // <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7> + 1567370550U, // <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS + 1192892570U, // <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS + 363253046U, 
// <u,u,6,6>: Cost 1 vspltisw2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 499605606U, // <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS + 1235812425U, // <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1> + 1561405577U, // <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7> + 162070684U, // <u,u,7,3>: Cost 1 vmrglw RHS, LHS + 499609147U, // <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS + 1235812753U, // <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5> + 1235814941U, // <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6> + 162073928U, // <u,u,7,7>: Cost 1 vmrglw RHS, RHS + 162070689U, // <u,u,7,u>: Cost 1 vmrglw RHS, LHS + 475726552U, // <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS + 538195758U, // <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS + 604862821U, // <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 475729206U, // <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS + 538196122U, // <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS + 604862861U, // <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; diff --git a/lib/Target/PowerPC/PPCPredicates.cpp b/lib/Target/PowerPC/PPCPredicates.cpp new file mode 100644 index 0000000..ccda5c0 --- /dev/null +++ b/lib/Target/PowerPC/PPCPredicates.cpp @@ -0,0 +1,30 @@ +//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PowerPC branch predicates. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCPredicates.h" +#include <cassert> +using namespace llvm; + +PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) { + switch (Opcode) { + default: assert(0 && "Unknown PPC branch opcode!"); + case PPC::PRED_EQ: return PPC::PRED_NE; + case PPC::PRED_NE: return PPC::PRED_EQ; + case PPC::PRED_LT: return PPC::PRED_GE; + case PPC::PRED_GE: return PPC::PRED_LT; + case PPC::PRED_GT: return PPC::PRED_LE; + case PPC::PRED_LE: return PPC::PRED_GT; + case PPC::PRED_NU: return PPC::PRED_UN; + case PPC::PRED_UN: return PPC::PRED_NU; + } +} diff --git a/lib/Target/PowerPC/PPCPredicates.h b/lib/Target/PowerPC/PPCPredicates.h new file mode 100644 index 0000000..ba1bb74 --- /dev/null +++ b/lib/Target/PowerPC/PPCPredicates.h @@ -0,0 +1,39 @@ +//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PowerPC branch predicates. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H +#define LLVM_TARGET_POWERPC_PPCPREDICATES_H + +#include "PPC.h" + +namespace llvm { +namespace PPC { + /// Predicate - These are "(BI << 5) | BO" for various predicates. + enum Predicate { + PRED_ALWAYS = (0 << 5) | 20, + PRED_LT = (0 << 5) | 12, + PRED_LE = (1 << 5) | 4, + PRED_EQ = (2 << 5) | 12, + PRED_GE = (0 << 5) | 4, + PRED_GT = (1 << 5) | 12, + PRED_NE = (2 << 5) | 4, + PRED_UN = (3 << 5) | 12, + PRED_NU = (3 << 5) | 4 + }; + + /// Invert the specified predicate. != -> ==, < -> >=. 
+ Predicate InvertPredicate(Predicate Opcode); +} +} + +#endif diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp new file mode 100644 index 0000000..19780a8 --- /dev/null +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -0,0 +1,1153 @@ +//===- PPCRegisterInfo.cpp - PowerPC Register Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "reginfo" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCRegisterInfo.h" +#include "PPCFrameInfo.h" +#include "PPCSubtarget.h" +#include "llvm/Constants.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include <cstdlib> +using namespace llvm; + +/// getRegisterNumbering - Given the enum value for some register, e.g. +/// PPC::F14, return the number that it corresponds to (e.g. 14). 
+unsigned PPCRegisterInfo::getRegisterNumbering(unsigned RegEnum) { + using namespace PPC; + switch (RegEnum) { + case R0 : case X0 : case F0 : case V0 : case CR0: return 0; + case R1 : case X1 : case F1 : case V1 : case CR1: return 1; + case R2 : case X2 : case F2 : case V2 : case CR2: return 2; + case R3 : case X3 : case F3 : case V3 : case CR3: return 3; + case R4 : case X4 : case F4 : case V4 : case CR4: return 4; + case R5 : case X5 : case F5 : case V5 : case CR5: return 5; + case R6 : case X6 : case F6 : case V6 : case CR6: return 6; + case R7 : case X7 : case F7 : case V7 : case CR7: return 7; + case R8 : case X8 : case F8 : case V8 : return 8; + case R9 : case X9 : case F9 : case V9 : return 9; + case R10: case X10: case F10: case V10: return 10; + case R11: case X11: case F11: case V11: return 11; + case R12: case X12: case F12: case V12: return 12; + case R13: case X13: case F13: case V13: return 13; + case R14: case X14: case F14: case V14: return 14; + case R15: case X15: case F15: case V15: return 15; + case R16: case X16: case F16: case V16: return 16; + case R17: case X17: case F17: case V17: return 17; + case R18: case X18: case F18: case V18: return 18; + case R19: case X19: case F19: case V19: return 19; + case R20: case X20: case F20: case V20: return 20; + case R21: case X21: case F21: case V21: return 21; + case R22: case X22: case F22: case V22: return 22; + case R23: case X23: case F23: case V23: return 23; + case R24: case X24: case F24: case V24: return 24; + case R25: case X25: case F25: case V25: return 25; + case R26: case X26: case F26: case V26: return 26; + case R27: case X27: case F27: case V27: return 27; + case R28: case X28: case F28: case V28: return 28; + case R29: case X29: case F29: case V29: return 29; + case R30: case X30: case F30: case V30: return 30; + case R31: case X31: case F31: case V31: return 31; + default: + cerr << "Unhandled reg in PPCRegisterInfo::getRegisterNumbering!\n"; + abort(); + } +} + 
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST, + const TargetInstrInfo &tii) + : PPCGenRegisterInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), + Subtarget(ST), TII(tii) { + ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX; + ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX; + ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX; + ImmToIdxMap[PPC::LWZ] = PPC::LWZX; ImmToIdxMap[PPC::LWA] = PPC::LWAX; + ImmToIdxMap[PPC::LFS] = PPC::LFSX; ImmToIdxMap[PPC::LFD] = PPC::LFDX; + ImmToIdxMap[PPC::STH] = PPC::STHX; ImmToIdxMap[PPC::STW] = PPC::STWX; + ImmToIdxMap[PPC::STFS] = PPC::STFSX; ImmToIdxMap[PPC::STFD] = PPC::STFDX; + ImmToIdxMap[PPC::ADDI] = PPC::ADD4; + ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; +} + +void +PPCRegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, int FrameIdx, + const TargetRegisterClass *RC) const { + if (RC == PPC::GPRCRegisterClass) { + if (SrcReg != PPC::LR) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STW)) + .addReg(SrcReg, false, false, true), FrameIdx); + } else { + // FIXME: this spills LR immediately to memory in one step. To do this, + // we use R11, which we know cannot be used in the prolog/epilog. This is + // a hack. + BuildMI(MBB, MI, TII.get(PPC::MFLR), PPC::R11); + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STW)) + .addReg(PPC::R11, false, false, true), FrameIdx); + } + } else if (RC == PPC::G8RCRegisterClass) { + if (SrcReg != PPC::LR8) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STD)) + .addReg(SrcReg, false, false, true), FrameIdx); + } else { + // FIXME: this spills LR immediately to memory in one step. To do this, + // we use R11, which we know cannot be used in the prolog/epilog. This is + // a hack. 
+ BuildMI(MBB, MI, TII.get(PPC::MFLR8), PPC::X11); + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STD)) + .addReg(PPC::X11, false, false, true), FrameIdx); + } + } else if (RC == PPC::F8RCRegisterClass) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STFD)) + .addReg(SrcReg, false, false, true), FrameIdx); + } else if (RC == PPC::F4RCRegisterClass) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STFS)) + .addReg(SrcReg, false, false, true), FrameIdx); + } else if (RC == PPC::CRRCRegisterClass) { + // FIXME: We use R0 here, because it isn't available for RA. + // We need to store the CR in the low 4-bits of the saved value. First, + // issue a MFCR to save all of the CRBits. + BuildMI(MBB, MI, TII.get(PPC::MFCR), PPC::R0); + + // If the saved register wasn't CR0, shift the bits left so that they are in + // CR0's slot. + if (SrcReg != PPC::CR0) { + unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(SrcReg)*4; + // rlwinm r0, r0, ShiftBits, 0, 31. + BuildMI(MBB, MI, TII.get(PPC::RLWINM), PPC::R0) + .addReg(PPC::R0).addImm(ShiftBits).addImm(0).addImm(31); + } + + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::STW)) + .addReg(PPC::R0, false, false, true), FrameIdx); + } else if (RC == PPC::VRRCRegisterClass) { + // We don't have indexed addressing for vector loads. Emit: + // R11 = ADDI FI# + // Dest = LVX R0, R11 + // + // FIXME: We use R0 here, because it isn't available for RA. 
+ addFrameReference(BuildMI(MBB, MI, TII.get(PPC::ADDI), PPC::R0), + FrameIdx, 0, 0); + BuildMI(MBB, MI, TII.get(PPC::STVX)) + .addReg(SrcReg, false, false, true).addReg(PPC::R0).addReg(PPC::R0); + } else { + assert(0 && "Unknown regclass!"); + abort(); + } +} + +void +PPCRegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC) const { + if (RC == PPC::GPRCRegisterClass) { + if (DestReg != PPC::LR) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LWZ), DestReg), FrameIdx); + } else { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LWZ), PPC::R11),FrameIdx); + BuildMI(MBB, MI, TII.get(PPC::MTLR)).addReg(PPC::R11); + } + } else if (RC == PPC::G8RCRegisterClass) { + if (DestReg != PPC::LR8) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LD), DestReg), FrameIdx); + } else { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LD), PPC::R11), FrameIdx); + BuildMI(MBB, MI, TII.get(PPC::MTLR8)).addReg(PPC::R11); + } + } else if (RC == PPC::F8RCRegisterClass) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LFD), DestReg), FrameIdx); + } else if (RC == PPC::F4RCRegisterClass) { + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LFS), DestReg), FrameIdx); + } else if (RC == PPC::CRRCRegisterClass) { + // FIXME: We use R0 here, because it isn't available for RA. + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::LWZ), PPC::R0), FrameIdx); + + // If the reloaded register isn't CR0, shift the bits right so that they are + // in the right CR's slot. + if (DestReg != PPC::CR0) { + unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(DestReg)*4; + // rlwinm r11, r11, 32-ShiftBits, 0, 31. 
+ BuildMI(MBB, MI, TII.get(PPC::RLWINM), PPC::R0) + .addReg(PPC::R0).addImm(32-ShiftBits).addImm(0).addImm(31); + } + + BuildMI(MBB, MI, TII.get(PPC::MTCRF), DestReg).addReg(PPC::R0); + } else if (RC == PPC::VRRCRegisterClass) { + // We don't have indexed addressing for vector loads. Emit: + // R11 = ADDI FI# + // Dest = LVX R0, R11 + // + // FIXME: We use R0 here, because it isn't available for RA. + addFrameReference(BuildMI(MBB, MI, TII.get(PPC::ADDI), PPC::R0), + FrameIdx, 0, 0); + BuildMI(MBB, MI, TII.get(PPC::LVX),DestReg).addReg(PPC::R0).addReg(PPC::R0); + } else { + assert(0 && "Unknown regclass!"); + abort(); + } +} + +void PPCRegisterInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const { + if (RC == PPC::GPRCRegisterClass) { + BuildMI(MBB, MI, TII.get(PPC::OR), DestReg).addReg(SrcReg).addReg(SrcReg); + } else if (RC == PPC::G8RCRegisterClass) { + BuildMI(MBB, MI, TII.get(PPC::OR8), DestReg).addReg(SrcReg).addReg(SrcReg); + } else if (RC == PPC::F4RCRegisterClass) { + BuildMI(MBB, MI, TII.get(PPC::FMRS), DestReg).addReg(SrcReg); + } else if (RC == PPC::F8RCRegisterClass) { + BuildMI(MBB, MI, TII.get(PPC::FMRD), DestReg).addReg(SrcReg); + } else if (RC == PPC::CRRCRegisterClass) { + BuildMI(MBB, MI, TII.get(PPC::MCRF), DestReg).addReg(SrcReg); + } else if (RC == PPC::VRRCRegisterClass) { + BuildMI(MBB, MI, TII.get(PPC::VOR), DestReg).addReg(SrcReg).addReg(SrcReg); + } else { + cerr << "Attempt to copy register that is not GPR or FPR"; + abort(); + } +} + +void PPCRegisterInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, + const MachineInstr *Orig) const { + MachineInstr *MI = Orig->clone(); + MI->getOperand(0).setReg(DestReg); + MBB.insert(I, MI); +} + +const unsigned* PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) + const { + // 32-bit Darwin calling convention. 
+ static const unsigned Macho32_CalleeSavedRegs[] = { + PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + PPC::R28, PPC::R29, PPC::R30, PPC::R31, + + PPC::F14, PPC::F15, PPC::F16, PPC::F17, + PPC::F18, PPC::F19, PPC::F20, PPC::F21, + PPC::F22, PPC::F23, PPC::F24, PPC::F25, + PPC::F26, PPC::F27, PPC::F28, PPC::F29, + PPC::F30, PPC::F31, + + PPC::CR2, PPC::CR3, PPC::CR4, + PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, + PPC::V28, PPC::V29, PPC::V30, PPC::V31, + + PPC::LR, 0 + }; + + static const unsigned ELF32_CalleeSavedRegs[] = { + PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + PPC::R28, PPC::R29, PPC::R30, PPC::R31, + + PPC::F9, + PPC::F10, PPC::F11, PPC::F12, PPC::F13, + PPC::F14, PPC::F15, PPC::F16, PPC::F17, + PPC::F18, PPC::F19, PPC::F20, PPC::F21, + PPC::F22, PPC::F23, PPC::F24, PPC::F25, + PPC::F26, PPC::F27, PPC::F28, PPC::F29, + PPC::F30, PPC::F31, + + PPC::CR2, PPC::CR3, PPC::CR4, + PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, + PPC::V28, PPC::V29, PPC::V30, PPC::V31, + + PPC::LR, 0 + }; + // 64-bit Darwin calling convention. 
+ static const unsigned Macho64_CalleeSavedRegs[] = { + PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31, + + PPC::F14, PPC::F15, PPC::F16, PPC::F17, + PPC::F18, PPC::F19, PPC::F20, PPC::F21, + PPC::F22, PPC::F23, PPC::F24, PPC::F25, + PPC::F26, PPC::F27, PPC::F28, PPC::F29, + PPC::F30, PPC::F31, + + PPC::CR2, PPC::CR3, PPC::CR4, + PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, + PPC::V28, PPC::V29, PPC::V30, PPC::V31, + + PPC::LR8, 0 + }; + + if (Subtarget.isMachoABI()) + return Subtarget.isPPC64() ? Macho64_CalleeSavedRegs : + Macho32_CalleeSavedRegs; + + // ELF 32. + return ELF32_CalleeSavedRegs; +} + +const TargetRegisterClass* const* +PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { + // 32-bit Macho calling convention. + static const TargetRegisterClass * const Macho32_CalleeSavedRegClasses[] = { + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass, + + &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass, + + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + 
&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + + &PPC::GPRCRegClass, 0 + }; + + static const TargetRegisterClass * const ELF32_CalleeSavedRegClasses[] = { + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass, + + &PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass, + + &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass, + + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + + &PPC::GPRCRegClass, 0 + }; + + // 64-bit Macho calling convention. 
+ static const TargetRegisterClass * const Macho64_CalleeSavedRegClasses[] = { + &PPC::G8RCRegClass,&PPC::G8RCRegClass, + &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, + &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, + &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, + &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass, + + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass, + &PPC::F8RCRegClass,&PPC::F8RCRegClass, + + &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass, + + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass, + + &PPC::G8RCRegClass, 0 + }; + + if (Subtarget.isMachoABI()) + return Subtarget.isPPC64() ? Macho64_CalleeSavedRegClasses : + Macho32_CalleeSavedRegClasses; + + // ELF 32. + return ELF32_CalleeSavedRegClasses; +} + +// needsFP - Return true if the specified function should have a dedicated frame +// pointer register. This is true if the function has variable sized allocas or +// if frame pointer elimination is disabled. +// +static bool needsFP(const MachineFunction &MF) { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + return NoFramePointerElim || MFI->hasVarSizedObjects(); +} + +BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(PPC::R0); + Reserved.set(PPC::R1); + Reserved.set(PPC::LR); + // In Linux, r2 is reserved for the OS. 
+ if (!Subtarget.isDarwin()) + Reserved.set(PPC::R2); + // On PPC64, r13 is the thread pointer. Never allocate this register. + // Note that this is overconservative, as it also prevents allocation of + // R31 when the FP is not needed. + if (Subtarget.isPPC64()) { + Reserved.set(PPC::R13); + Reserved.set(PPC::R31); + } + if (needsFP(MF)) + Reserved.set(PPC::R31); + return Reserved; +} + +/// foldMemoryOperand - PowerPC (like most RISC's) can only fold spills into +/// copy instructions, turning them into load/store instructions. +MachineInstr *PPCRegisterInfo::foldMemoryOperand(MachineInstr *MI, + unsigned OpNum, + int FrameIndex) const { + // Make sure this is a reg-reg copy. Note that we can't handle MCRF, because + // it takes more than one instruction to store it. + unsigned Opc = MI->getOpcode(); + + MachineInstr *NewMI = NULL; + if ((Opc == PPC::OR && + MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) { + if (OpNum == 0) { // move -> store + unsigned InReg = MI->getOperand(1).getReg(); + NewMI = addFrameReference(BuildMI(TII.get(PPC::STW)).addReg(InReg), + FrameIndex); + } else { // move -> load + unsigned OutReg = MI->getOperand(0).getReg(); + NewMI = addFrameReference(BuildMI(TII.get(PPC::LWZ), OutReg), + FrameIndex); + } + } else if ((Opc == PPC::OR8 && + MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) { + if (OpNum == 0) { // move -> store + unsigned InReg = MI->getOperand(1).getReg(); + NewMI = addFrameReference(BuildMI(TII.get(PPC::STD)).addReg(InReg), + FrameIndex); + } else { // move -> load + unsigned OutReg = MI->getOperand(0).getReg(); + NewMI = addFrameReference(BuildMI(TII.get(PPC::LD), OutReg), FrameIndex); + } + } else if (Opc == PPC::FMRD) { + if (OpNum == 0) { // move -> store + unsigned InReg = MI->getOperand(1).getReg(); + NewMI = addFrameReference(BuildMI(TII.get(PPC::STFD)).addReg(InReg), + FrameIndex); + } else { // move -> load + unsigned OutReg = MI->getOperand(0).getReg(); + NewMI = 
addFrameReference(BuildMI(TII.get(PPC::LFD), OutReg), FrameIndex); + } + } else if (Opc == PPC::FMRS) { + if (OpNum == 0) { // move -> store + unsigned InReg = MI->getOperand(1).getReg(); + NewMI = addFrameReference(BuildMI(TII.get(PPC::STFS)).addReg(InReg), + FrameIndex); + } else { // move -> load + unsigned OutReg = MI->getOperand(0).getReg(); + NewMI = addFrameReference(BuildMI(TII.get(PPC::LFS), OutReg), FrameIndex); + } + } + + if (NewMI) + NewMI->copyKillDeadInfo(MI); + return NewMI; +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +// hasFP - Return true if the specified function actually has a dedicated frame +// pointer register. This is true if the function needs a frame pointer and has +// a non-zero stack size. +bool PPCRegisterInfo::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + return MFI->getStackSize() && needsFP(MF); +} + +/// usesLR - Returns if the link registers (LR) has been used in the function. +/// +bool PPCRegisterInfo::usesLR(MachineFunction &MF) const { + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + return FI->usesLR(); +} + +void PPCRegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. + MBB.erase(I); +} + +/// LowerDynamicAlloc - Generate the code for allocating an object in the +/// current frame. The sequence of code with be in the general form +/// +/// addi R0, SP, #frameSize ; get the address of the previous frame +/// stwxu R0, SP, Rnegsize ; add and update the SP with the negated size +/// addi Rnew, SP, #maxCalFrameSize ; get the top of the allocation +/// +void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { + // Get the instruction. 
+ MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + // Determine whether 64-bit pointers are used. + bool LP64 = Subtarget.isPPC64(); + + // Get the maximum call stack size. + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + // Get the total frame size. + unsigned FrameSize = MFI->getStackSize(); + + // Get stack alignments. + unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned MaxAlign = MFI->getMaxAlignment(); + assert(MaxAlign <= TargetAlign && + "Dynamic alloca with large aligns not supported"); + + // Determine the previous frame's address. If FrameSize can't be + // represented as 16 bits or we need special alignment, then we load the + // previous frame's address from 0(SP). Why not do an addis of the hi? + // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. + // Constructing the constant and adding would take 3 instructions. + // Fortunately, a frame greater than 32K is rare. + if (MaxAlign < TargetAlign && isInt16(FrameSize)) { + BuildMI(MBB, II, TII.get(PPC::ADDI), PPC::R0) + .addReg(PPC::R31) + .addImm(FrameSize); + } else if (LP64) { + BuildMI(MBB, II, TII.get(PPC::LD), PPC::X0) + .addImm(0) + .addReg(PPC::X1); + } else { + BuildMI(MBB, II, TII.get(PPC::LWZ), PPC::R0) + .addImm(0) + .addReg(PPC::R1); + } + + // Grow the stack and update the stack pointer link, then + // determine the address of new allocated space. 
+ if (LP64) { + BuildMI(MBB, II, TII.get(PPC::STDUX)) + .addReg(PPC::X0) + .addReg(PPC::X1) + .addReg(MI.getOperand(1).getReg()); + BuildMI(MBB, II, TII.get(PPC::ADDI8), MI.getOperand(0).getReg()) + .addReg(PPC::X1) + .addImm(maxCallFrameSize); + } else { + BuildMI(MBB, II, TII.get(PPC::STWUX)) + .addReg(PPC::R0) + .addReg(PPC::R1) + .addReg(MI.getOperand(1).getReg()); + BuildMI(MBB, II, TII.get(PPC::ADDI), MI.getOperand(0).getReg()) + .addReg(PPC::R1) + .addImm(maxCallFrameSize); + } + + // Discard the DYNALLOC instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Find out which operand is the frame index. + unsigned i = 0; + while (!MI.getOperand(i).isFrameIndex()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + // Take into account whether it's an add or mem instruction + unsigned OffIdx = (i == 2) ? 1 : 2; + if (MI.getOpcode() == TargetInstrInfo::INLINEASM) + OffIdx = i-1; + + // Get the frame index. + int FrameIndex = MI.getOperand(i).getFrameIndex(); + + // Get the frame pointer save index. Users of this index are primarily + // DYNALLOC instructions. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + int FPSI = FI->getFramePointerSaveIndex(); + // Get the instruction opcode. + unsigned OpC = MI.getOpcode(); + + // Special case for dynamic alloca. + if (FPSI && FrameIndex == FPSI && + (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { + lowerDynamicAlloc(II); + return; + } + + // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP). 
+ MI.getOperand(i).ChangeToRegister(hasFP(MF) ? PPC::R31 : PPC::R1, false); + + // Figure out if the offset in the instruction is shifted right two bits. This + // is true for instructions like "STD", which the machine implicitly adds two + // low zeros to. + bool isIXAddr = false; + switch (OpC) { + case PPC::LWA: + case PPC::LD: + case PPC::STD: + case PPC::STD_32: + isIXAddr = true; + break; + } + + // Now add the frame object offset to the offset from r1. + int Offset = MFI->getObjectOffset(FrameIndex); + + if (!isIXAddr) + Offset += MI.getOperand(OffIdx).getImmedValue(); + else + Offset += MI.getOperand(OffIdx).getImmedValue() << 2; + + // If we're not using a Frame Pointer that has been set to the value of the + // SP before having the stack size subtracted from it, then add the stack size + // to Offset to get the correct offset. + Offset += MFI->getStackSize(); + + if (!isInt16(Offset)) { + // Insert a set of r0 with the full offset value before the ld, st, or add + BuildMI(MBB, II, TII.get(PPC::LIS), PPC::R0).addImm(Offset >> 16); + BuildMI(MBB, II, TII.get(PPC::ORI), PPC::R0).addReg(PPC::R0).addImm(Offset); + + // convert into indexed form of the instruction + // sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0 + // addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0 + assert(ImmToIdxMap.count(OpC) && + "No indexed form of load or store available!"); + unsigned NewOpcode = ImmToIdxMap.find(OpC)->second; + MI.setInstrDescriptor(TII.get(NewOpcode)); + MI.getOperand(1).ChangeToRegister(MI.getOperand(i).getReg(), false); + MI.getOperand(2).ChangeToRegister(PPC::R0, false); + } else { + if (isIXAddr) { + assert((Offset & 3) == 0 && "Invalid frame offset!"); + Offset >>= 2; // The actual encoded value has the low two bits zero. + } + MI.getOperand(OffIdx).ChangeToImmediate(Offset); + } +} + +/// VRRegNo - Map from a numbered VR register to its enum value. 
+/// +static const unsigned short VRRegNo[] = { + PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , + PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, + PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31 +}; + +/// RemoveVRSaveCode - We have found that this function does not need any code +/// to manipulate the VRSAVE register, even though it uses vector registers. +/// This can happen when the only registers used are known to be live in or out +/// of the function. Remove all of the VRSAVE related code from the function. +static void RemoveVRSaveCode(MachineInstr *MI) { + MachineBasicBlock *Entry = MI->getParent(); + MachineFunction *MF = Entry->getParent(); + + // We know that the MTVRSAVE instruction immediately follows MI. Remove it. + MachineBasicBlock::iterator MBBI = MI; + ++MBBI; + assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE); + MBBI->eraseFromParent(); + + bool RemovedAllMTVRSAVEs = true; + // See if we can find and remove the MTVRSAVE instruction from all of the + // epilog blocks. + const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo(); + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { + // If last instruction is a return instruction, add an epilogue + if (!I->empty() && TII.isReturn(I->back().getOpcode())) { + bool FoundIt = false; + for (MBBI = I->end(); MBBI != I->begin(); ) { + --MBBI; + if (MBBI->getOpcode() == PPC::MTVRSAVE) { + MBBI->eraseFromParent(); // remove it. + FoundIt = true; + break; + } + } + RemovedAllMTVRSAVEs &= FoundIt; + } + } + + // If we found and removed all MTVRSAVE instructions, remove the read of + // VRSAVE as well. 
+ if (RemovedAllMTVRSAVEs) { + MBBI = MI; + assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?"); + --MBBI; + assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?"); + MBBI->eraseFromParent(); + } + + // Finally, nuke the UPDATE_VRSAVE. + MI->eraseFromParent(); +} + +// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the +// instruction selector. Based on the vector registers that have been used, +// transform this into the appropriate ORI instruction. +static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { + MachineFunction *MF = MI->getParent()->getParent(); + + unsigned UsedRegMask = 0; + for (unsigned i = 0; i != 32; ++i) + if (MF->isPhysRegUsed(VRRegNo[i])) + UsedRegMask |= 1 << (31-i); + + // Live in and live out values already must be in the mask, so don't bother + // marking them. + for (MachineFunction::livein_iterator I = + MF->livein_begin(), E = MF->livein_end(); I != E; ++I) { + unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(I->first); + if (VRRegNo[RegNo] == I->first) // If this really is a vector reg. + UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked. + } + for (MachineFunction::liveout_iterator I = + MF->liveout_begin(), E = MF->liveout_end(); I != E; ++I) { + unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(*I); + if (VRRegNo[RegNo] == *I) // If this really is a vector reg. + UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked. + } + + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + // If no registers are used, turn this into a copy. + if (UsedRegMask == 0) { + // Remove all VRSAVE code. 
+ RemoveVRSaveCode(MI); + return; + } else if ((UsedRegMask & 0xFFFF) == UsedRegMask) { + BuildMI(*MI->getParent(), MI, TII.get(PPC::ORI), DstReg) + .addReg(SrcReg).addImm(UsedRegMask); + } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) { + BuildMI(*MI->getParent(), MI, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg).addImm(UsedRegMask >> 16); + } else { + BuildMI(*MI->getParent(), MI, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg).addImm(UsedRegMask >> 16); + BuildMI(*MI->getParent(), MI, TII.get(PPC::ORI), DstReg) + .addReg(DstReg).addImm(UsedRegMask & 0xFFFF); + } + + // Remove the old UPDATE_VRSAVE instruction. + MI->eraseFromParent(); +} + +/// determineFrameLayout - Determine the size of the frame and maximum call +/// frame size. +void PPCRegisterInfo::determineFrameLayout(MachineFunction &MF) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo + unsigned FrameSize = MFI->getStackSize(); + + // Get the alignments provided by the target, and the maximum alignment + // (if any) of the fixed frame objects. + unsigned MaxAlign = MFI->getMaxAlignment(); + unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned AlignMask = TargetAlign - 1; // + + // If we are a leaf function, and use up to 224 bytes of stack space, + // don't have a frame pointer, calls, or dynamic alloca then we do not need + // to adjust the stack pointer (we fit in the Red Zone). + if (FrameSize <= 224 && // Fits in red zone. + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->hasCalls() && // No calls. + MaxAlign <= TargetAlign) { // No special alignment. + // No need for frame + MFI->setStackSize(0); + return; + } + + // Get the maximum call frame size of all the calls. + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + + // Maximum call frame needs to be at least big enough for linkage and 8 args. 
+ unsigned minCallFrameSize = + PPCFrameInfo::getMinCallFrameSize(Subtarget.isPPC64(), + Subtarget.isMachoABI()); + maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize); + + // If we have dynamic alloca then maxCallFrameSize needs to be aligned so + // that allocations will be aligned. + if (MFI->hasVarSizedObjects()) + maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; + + // Update maximum call frame size. + MFI->setMaxCallFrameSize(maxCallFrameSize); + + // Include call frame size in total. + FrameSize += maxCallFrameSize; + + // Make sure the frame is aligned. + FrameSize = (FrameSize + AlignMask) & ~AlignMask; + + // Update frame info. + MFI->setStackSize(FrameSize); +} + +void PPCRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) + const { + // Save and clear the LR state. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned LR = getRARegister(); + FI->setUsesLR(MF.isPhysRegUsed(LR)); + MF.setPhysRegUnused(LR); + + // Save R31 if necessary + int FPSI = FI->getFramePointerSaveIndex(); + bool IsPPC64 = Subtarget.isPPC64(); + bool IsELF32_ABI = Subtarget.isELF32_ABI(); + bool IsMachoABI = Subtarget.isMachoABI(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // If the frame pointer save index hasn't been defined yet. + if (!FPSI && (NoFramePointerElim || MFI->hasVarSizedObjects()) + && IsELF32_ABI) { + // Find out what the fix offset of the frame pointer save area. + int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, + IsMachoABI); + // Allocate the frame index for frame pointer save area. + FPSI = MF.getFrameInfo()->CreateFixedObject(IsPPC64? 8 : 4, FPOffset); + // Save the result. 
+ FI->setFramePointerSaveIndex(FPSI); + } + +} + +void PPCRegisterInfo::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo *MMI = MFI->getMachineModuleInfo(); + + // Prepare for frame info. + unsigned FrameLabelId = 0; + + // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it, + // process it. + for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) { + if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) { + HandleVRSaveUpdate(MBBI, TII); + break; + } + } + + // Move MBBI back to the beginning of the function. + MBBI = MBB.begin(); + + // Work out frame sizes. + determineFrameLayout(MF); + unsigned FrameSize = MFI->getStackSize(); + + int NegFrameSize = -FrameSize; + + // Get processor type. + bool IsPPC64 = Subtarget.isPPC64(); + // Get operating system + bool IsMachoABI = Subtarget.isMachoABI(); + // Check if the link register (LR) has been used. + bool UsesLR = MFI->hasCalls() || usesLR(MF); + // Do we have a frame pointer for this function? + bool HasFP = hasFP(MF) && FrameSize; + + int LROffset = PPCFrameInfo::getReturnSaveOffset(IsPPC64, IsMachoABI); + int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, IsMachoABI); + + if (IsPPC64) { + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::MFLR8), PPC::X0); + + if (HasFP) + BuildMI(MBB, MBBI, TII.get(PPC::STD)) + .addReg(PPC::X31).addImm(FPOffset/4).addReg(PPC::X1); + + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::STD)) + .addReg(PPC::X0).addImm(LROffset/4).addReg(PPC::X1); + } else { + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::MFLR), PPC::R0); + + if (HasFP) + BuildMI(MBB, MBBI, TII.get(PPC::STW)) + .addReg(PPC::R31).addImm(FPOffset).addReg(PPC::R1); + + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::STW)) + .addReg(PPC::R0).addImm(LROffset).addReg(PPC::R1); + } + + // Skip if a leaf routine. 
+ if (!FrameSize) return; + + // Get stack alignments. + unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned MaxAlign = MFI->getMaxAlignment(); + + if (MMI && MMI->needsFrameInfo()) { + // Mark effective beginning of when frame pointer becomes valid. + FrameLabelId = MMI->NextLabelID(); + BuildMI(MBB, MBBI, TII.get(PPC::LABEL)).addImm(FrameLabelId); + } + + // Adjust stack pointer: r1 += NegFrameSize. + // If there is a preferred stack alignment, align R1 now + if (!IsPPC64) { + // PPC32. + if (MaxAlign > TargetAlign) { + assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); + assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + BuildMI(MBB, MBBI, TII.get(PPC::RLWINM), PPC::R0) + .addReg(PPC::R1).addImm(0).addImm(32-Log2_32(MaxAlign)).addImm(31); + BuildMI(MBB, MBBI, TII.get(PPC::SUBFIC) ,PPC::R0).addReg(PPC::R0) + .addImm(NegFrameSize); + BuildMI(MBB, MBBI, TII.get(PPC::STWUX)) + .addReg(PPC::R1).addReg(PPC::R1).addReg(PPC::R0); + } else if (isInt16(NegFrameSize)) { + BuildMI(MBB, MBBI, TII.get(PPC::STWU), + PPC::R1).addReg(PPC::R1).addImm(NegFrameSize).addReg(PPC::R1); + } else { + BuildMI(MBB, MBBI, TII.get(PPC::LIS), PPC::R0).addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, TII.get(PPC::ORI), PPC::R0).addReg(PPC::R0) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, TII.get(PPC::STWUX)).addReg(PPC::R1).addReg(PPC::R1) + .addReg(PPC::R0); + } + } else { // PPC64. 
+ if (MaxAlign > TargetAlign) { + assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!"); + assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!"); + BuildMI(MBB, MBBI, TII.get(PPC::RLDICL), PPC::X0) + .addReg(PPC::X1).addImm(0).addImm(64-Log2_32(MaxAlign)); + BuildMI(MBB, MBBI, TII.get(PPC::SUBFIC8), PPC::X0).addReg(PPC::X0) + .addImm(NegFrameSize); + BuildMI(MBB, MBBI, TII.get(PPC::STDUX)) + .addReg(PPC::X1).addReg(PPC::X1).addReg(PPC::X0); + } else if (isInt16(NegFrameSize)) { + BuildMI(MBB, MBBI, TII.get(PPC::STDU), PPC::X1) + .addReg(PPC::X1).addImm(NegFrameSize/4).addReg(PPC::X1); + } else { + BuildMI(MBB, MBBI, TII.get(PPC::LIS8), PPC::X0).addImm(NegFrameSize >>16); + BuildMI(MBB, MBBI, TII.get(PPC::ORI8), PPC::X0).addReg(PPC::X0) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, TII.get(PPC::STDUX)).addReg(PPC::X1).addReg(PPC::X1) + .addReg(PPC::X0); + } + } + + if (MMI && MMI->needsFrameInfo()) { + std::vector<MachineMove> &Moves = MMI->getFrameMoves(); + + if (NegFrameSize) { + // Show update of SP. + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, NegFrameSize); + Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc)); + } else { + MachineLocation SP(IsPPC64 ? PPC::X31 : PPC::R31); + Moves.push_back(MachineMove(FrameLabelId, SP, SP)); + } + + if (HasFP) { + MachineLocation FPDst(MachineLocation::VirtualFP, FPOffset); + MachineLocation FPSrc(IsPPC64 ? PPC::X31 : PPC::R31); + Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc)); + } + + // Add callee saved registers to move list. 
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); + unsigned Reg = CSI[I].getReg(); + if (Reg == PPC::LR || Reg == PPC::LR8) continue; + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); + MachineLocation CSSrc(Reg); + Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc)); + } + + MachineLocation LRDst(MachineLocation::VirtualFP, LROffset); + MachineLocation LRSrc(IsPPC64 ? PPC::LR8 : PPC::LR); + Moves.push_back(MachineMove(FrameLabelId, LRDst, LRSrc)); + + // Mark effective beginning of when frame pointer is ready. + unsigned ReadyLabelId = MMI->NextLabelID(); + BuildMI(MBB, MBBI, TII.get(PPC::LABEL)).addImm(ReadyLabelId); + + MachineLocation FPDst(HasFP ? (IsPPC64 ? PPC::X31 : PPC::R31) : + (IsPPC64 ? PPC::X1 : PPC::R1)); + MachineLocation FPSrc(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc)); + } + + // If there is a frame pointer, copy R1 into R31 + if (HasFP) { + if (!IsPPC64) { + BuildMI(MBB, MBBI, TII.get(PPC::OR), PPC::R31).addReg(PPC::R1) + .addReg(PPC::R1); + } else { + BuildMI(MBB, MBBI, TII.get(PPC::OR8), PPC::X31).addReg(PPC::X1) + .addReg(PPC::X1); + } + } +} + +void PPCRegisterInfo::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + assert(MBBI->getOpcode() == PPC::BLR && + "Can only insert epilog into returning blocks"); + + // Get alignment info so we know how to restore r1 + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned MaxAlign = MFI->getMaxAlignment(); + + // Get the number of bytes allocated from the FrameInfo. + unsigned FrameSize = MFI->getStackSize(); + + // Get processor type. 
+ bool IsPPC64 = Subtarget.isPPC64(); + // Get operating system + bool IsMachoABI = Subtarget.isMachoABI(); + // Check if the link register (LR) has been used. + bool UsesLR = MFI->hasCalls() || usesLR(MF); + // Do we have a frame pointer for this function? + bool HasFP = hasFP(MF) && FrameSize; + + int LROffset = PPCFrameInfo::getReturnSaveOffset(IsPPC64, IsMachoABI); + int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(IsPPC64, IsMachoABI); + + if (FrameSize) { + // The loaded (or persistent) stack pointer value is offset by the 'stwu' + // on entry to the function. Add this offset back now. + if (!Subtarget.isPPC64()) { + if (isInt16(FrameSize) && TargetAlign >= MaxAlign && + !MFI->hasVarSizedObjects()) { + BuildMI(MBB, MBBI, TII.get(PPC::ADDI), PPC::R1) + .addReg(PPC::R1).addImm(FrameSize); + } else { + BuildMI(MBB, MBBI, TII.get(PPC::LWZ),PPC::R1).addImm(0).addReg(PPC::R1); + } + } else { + if (isInt16(FrameSize) && TargetAlign >= MaxAlign && + !MFI->hasVarSizedObjects()) { + BuildMI(MBB, MBBI, TII.get(PPC::ADDI8), PPC::X1) + .addReg(PPC::X1).addImm(FrameSize); + } else { + BuildMI(MBB, MBBI, TII.get(PPC::LD), PPC::X1).addImm(0).addReg(PPC::X1); + } + } + } + + if (IsPPC64) { + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::LD), PPC::X0) + .addImm(LROffset/4).addReg(PPC::X1); + + if (HasFP) + BuildMI(MBB, MBBI, TII.get(PPC::LD), PPC::X31) + .addImm(FPOffset/4).addReg(PPC::X1); + + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::MTLR8)).addReg(PPC::X0); + } else { + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::LWZ), PPC::R0) + .addImm(LROffset).addReg(PPC::R1); + + if (HasFP) + BuildMI(MBB, MBBI, TII.get(PPC::LWZ), PPC::R31) + .addImm(FPOffset).addReg(PPC::R1); + + if (UsesLR) + BuildMI(MBB, MBBI, TII.get(PPC::MTLR)).addReg(PPC::R0); + } +} + +unsigned PPCRegisterInfo::getRARegister() const { + return !Subtarget.isPPC64() ? 
PPC::LR : PPC::LR8; +} + +unsigned PPCRegisterInfo::getFrameRegister(MachineFunction &MF) const { + if (!Subtarget.isPPC64()) + return hasFP(MF) ? PPC::R31 : PPC::R1; + else + return hasFP(MF) ? PPC::X31 : PPC::X1; +} + +void PPCRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) + const { + // Initial state of the frame pointer is R1. + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(PPC::R1, 0); + Moves.push_back(MachineMove(0, Dst, Src)); +} + +unsigned PPCRegisterInfo::getEHExceptionRegister() const { + return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3; +} + +unsigned PPCRegisterInfo::getEHHandlerRegister() const { + return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4; +} + +#include "PPCGenRegisterInfo.inc" + diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h new file mode 100644 index 0000000..4112034 --- /dev/null +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -0,0 +1,107 @@ +//===- PPCRegisterInfo.h - PowerPC Register Information Impl -----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the MRegisterInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC32_REGISTERINFO_H +#define POWERPC32_REGISTERINFO_H + +#include "PPC.h" +#include "PPCGenRegisterInfo.h.inc" +#include <map> + +namespace llvm { +class PPCSubtarget; +class TargetInstrInfo; +class Type; + +class PPCRegisterInfo : public PPCGenRegisterInfo { + std::map<unsigned, unsigned> ImmToIdxMap; + const PPCSubtarget &Subtarget; + const TargetInstrInfo &TII; +public: + PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii); + + /// getRegisterNumbering - Given the enum value for some register, e.g. + /// PPC::F14, return the number that it corresponds to (e.g. 14). + static unsigned getRegisterNumbering(unsigned RegEnum); + + /// Code Generation virtual methods... + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, const MachineInstr *Orig) const; + + /// foldMemoryOperand - PowerPC (like most RISC's) can only fold spills into + /// copy instructions, turning them into load/store instructions. 
+ virtual MachineInstr* foldMemoryOperand(MachineInstr* MI, unsigned OpNum, + int FrameIndex) const; + + const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + + const TargetRegisterClass* const* getCalleeSavedRegClasses( + const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + /// targetHandlesStackFrameRounding - Returns true if the target is + /// responsible for rounding up the stack frame (probably at emitPrologue + /// time). + bool targetHandlesStackFrameRounding() const { return true; } + + bool hasFP(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + /// usesLR - Returns if the link registers (LR) has been used in the function. + /// + bool usesLR(MachineFunction &MF) const; + + void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS = NULL) const; + + /// determineFrameLayout - Determine the size of the frame and maximum call + /// frame size. + void determineFrameLayout(MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + void getInitialFrameState(std::vector<MachineMove> &Moves) const; + + // Exception handling queries. 
+ unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td new file mode 100644 index 0000000..0b3b4ca --- /dev/null +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -0,0 +1,333 @@ +//===- PowerPCRegisterInfo.td - The PowerPC Register File --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +class PPCReg<string n> : Register<n> { + let Namespace = "PPC"; +} + +// We identify all our registers with a 5-bit ID, for consistency's sake. + +// GPR - One of the 32 32-bit general-purpose registers +class GPR<bits<5> num, string n> : PPCReg<n> { + field bits<5> Num = num; +} + +// GP8 - One of the 32 64-bit general-purpose registers +class GP8<GPR SubReg> : PPCReg<SubReg.Name> { + field bits<5> Num = SubReg.Num; + let SubRegs = [SubReg]; +} + +// SPR - One of the 32-bit special-purpose registers +class SPR<bits<10> num, string n> : PPCReg<n> { + field bits<10> Num = num; +} + +// FPR - One of the 32 64-bit floating-point registers +class FPR<bits<5> num, string n> : PPCReg<n> { + field bits<5> Num = num; +} + +// VR - One of the 32 128-bit vector registers +class VR<bits<5> num, string n> : PPCReg<n> { + field bits<5> Num = num; +} + +// CR - One of the 8 4-bit condition registers +class CR<bits<3> num, string n> : PPCReg<n> { + field bits<3> Num = num; +} + +// CRBIT - One of the 32 1-bit condition register fields +class CRBIT<bits<5> num, string n> : PPCReg<n> { + field bits<5> Num = num; +} + + +// General-purpose registers +def R0 : GPR< 0, "r0">, DwarfRegNum<0>; +def 
R1 : GPR< 1, "r1">, DwarfRegNum<1>; +def R2 : GPR< 2, "r2">, DwarfRegNum<2>; +def R3 : GPR< 3, "r3">, DwarfRegNum<3>; +def R4 : GPR< 4, "r4">, DwarfRegNum<4>; +def R5 : GPR< 5, "r5">, DwarfRegNum<5>; +def R6 : GPR< 6, "r6">, DwarfRegNum<6>; +def R7 : GPR< 7, "r7">, DwarfRegNum<7>; +def R8 : GPR< 8, "r8">, DwarfRegNum<8>; +def R9 : GPR< 9, "r9">, DwarfRegNum<9>; +def R10 : GPR<10, "r10">, DwarfRegNum<10>; +def R11 : GPR<11, "r11">, DwarfRegNum<11>; +def R12 : GPR<12, "r12">, DwarfRegNum<12>; +def R13 : GPR<13, "r13">, DwarfRegNum<13>; +def R14 : GPR<14, "r14">, DwarfRegNum<14>; +def R15 : GPR<15, "r15">, DwarfRegNum<15>; +def R16 : GPR<16, "r16">, DwarfRegNum<16>; +def R17 : GPR<17, "r17">, DwarfRegNum<17>; +def R18 : GPR<18, "r18">, DwarfRegNum<18>; +def R19 : GPR<19, "r19">, DwarfRegNum<19>; +def R20 : GPR<20, "r20">, DwarfRegNum<20>; +def R21 : GPR<21, "r21">, DwarfRegNum<21>; +def R22 : GPR<22, "r22">, DwarfRegNum<22>; +def R23 : GPR<23, "r23">, DwarfRegNum<23>; +def R24 : GPR<24, "r24">, DwarfRegNum<24>; +def R25 : GPR<25, "r25">, DwarfRegNum<25>; +def R26 : GPR<26, "r26">, DwarfRegNum<26>; +def R27 : GPR<27, "r27">, DwarfRegNum<27>; +def R28 : GPR<28, "r28">, DwarfRegNum<28>; +def R29 : GPR<29, "r29">, DwarfRegNum<29>; +def R30 : GPR<30, "r30">, DwarfRegNum<30>; +def R31 : GPR<31, "r31">, DwarfRegNum<31>; + +// 64-bit General-purpose registers +def X0 : GP8< R0>, DwarfRegNum<0>; +def X1 : GP8< R1>, DwarfRegNum<1>; +def X2 : GP8< R2>, DwarfRegNum<2>; +def X3 : GP8< R3>, DwarfRegNum<3>; +def X4 : GP8< R4>, DwarfRegNum<4>; +def X5 : GP8< R5>, DwarfRegNum<5>; +def X6 : GP8< R6>, DwarfRegNum<6>; +def X7 : GP8< R7>, DwarfRegNum<7>; +def X8 : GP8< R8>, DwarfRegNum<8>; +def X9 : GP8< R9>, DwarfRegNum<9>; +def X10 : GP8<R10>, DwarfRegNum<10>; +def X11 : GP8<R11>, DwarfRegNum<11>; +def X12 : GP8<R12>, DwarfRegNum<12>; +def X13 : GP8<R13>, DwarfRegNum<13>; +def X14 : GP8<R14>, DwarfRegNum<14>; +def X15 : GP8<R15>, DwarfRegNum<15>; +def X16 : GP8<R16>, DwarfRegNum<16>; 
+def X17 : GP8<R17>, DwarfRegNum<17>; +def X18 : GP8<R18>, DwarfRegNum<18>; +def X19 : GP8<R19>, DwarfRegNum<19>; +def X20 : GP8<R20>, DwarfRegNum<20>; +def X21 : GP8<R21>, DwarfRegNum<21>; +def X22 : GP8<R22>, DwarfRegNum<22>; +def X23 : GP8<R23>, DwarfRegNum<23>; +def X24 : GP8<R24>, DwarfRegNum<24>; +def X25 : GP8<R25>, DwarfRegNum<25>; +def X26 : GP8<R26>, DwarfRegNum<26>; +def X27 : GP8<R27>, DwarfRegNum<27>; +def X28 : GP8<R28>, DwarfRegNum<28>; +def X29 : GP8<R29>, DwarfRegNum<29>; +def X30 : GP8<R30>, DwarfRegNum<30>; +def X31 : GP8<R31>, DwarfRegNum<31>; + +// Floating-point registers +def F0 : FPR< 0, "f0">, DwarfRegNum<32>; +def F1 : FPR< 1, "f1">, DwarfRegNum<33>; +def F2 : FPR< 2, "f2">, DwarfRegNum<34>; +def F3 : FPR< 3, "f3">, DwarfRegNum<35>; +def F4 : FPR< 4, "f4">, DwarfRegNum<36>; +def F5 : FPR< 5, "f5">, DwarfRegNum<37>; +def F6 : FPR< 6, "f6">, DwarfRegNum<38>; +def F7 : FPR< 7, "f7">, DwarfRegNum<39>; +def F8 : FPR< 8, "f8">, DwarfRegNum<40>; +def F9 : FPR< 9, "f9">, DwarfRegNum<41>; +def F10 : FPR<10, "f10">, DwarfRegNum<42>; +def F11 : FPR<11, "f11">, DwarfRegNum<43>; +def F12 : FPR<12, "f12">, DwarfRegNum<44>; +def F13 : FPR<13, "f13">, DwarfRegNum<45>; +def F14 : FPR<14, "f14">, DwarfRegNum<46>; +def F15 : FPR<15, "f15">, DwarfRegNum<47>; +def F16 : FPR<16, "f16">, DwarfRegNum<48>; +def F17 : FPR<17, "f17">, DwarfRegNum<49>; +def F18 : FPR<18, "f18">, DwarfRegNum<50>; +def F19 : FPR<19, "f19">, DwarfRegNum<51>; +def F20 : FPR<20, "f20">, DwarfRegNum<52>; +def F21 : FPR<21, "f21">, DwarfRegNum<53>; +def F22 : FPR<22, "f22">, DwarfRegNum<54>; +def F23 : FPR<23, "f23">, DwarfRegNum<55>; +def F24 : FPR<24, "f24">, DwarfRegNum<56>; +def F25 : FPR<25, "f25">, DwarfRegNum<57>; +def F26 : FPR<26, "f26">, DwarfRegNum<58>; +def F27 : FPR<27, "f27">, DwarfRegNum<59>; +def F28 : FPR<28, "f28">, DwarfRegNum<60>; +def F29 : FPR<29, "f29">, DwarfRegNum<61>; +def F30 : FPR<30, "f30">, DwarfRegNum<62>; +def F31 : FPR<31, "f31">, DwarfRegNum<63>; + +// 
Vector registers +def V0 : VR< 0, "v0">, DwarfRegNum<77>; +def V1 : VR< 1, "v1">, DwarfRegNum<78>; +def V2 : VR< 2, "v2">, DwarfRegNum<79>; +def V3 : VR< 3, "v3">, DwarfRegNum<80>; +def V4 : VR< 4, "v4">, DwarfRegNum<81>; +def V5 : VR< 5, "v5">, DwarfRegNum<82>; +def V6 : VR< 6, "v6">, DwarfRegNum<83>; +def V7 : VR< 7, "v7">, DwarfRegNum<84>; +def V8 : VR< 8, "v8">, DwarfRegNum<85>; +def V9 : VR< 9, "v9">, DwarfRegNum<86>; +def V10 : VR<10, "v10">, DwarfRegNum<87>; +def V11 : VR<11, "v11">, DwarfRegNum<88>; +def V12 : VR<12, "v12">, DwarfRegNum<89>; +def V13 : VR<13, "v13">, DwarfRegNum<90>; +def V14 : VR<14, "v14">, DwarfRegNum<91>; +def V15 : VR<15, "v15">, DwarfRegNum<92>; +def V16 : VR<16, "v16">, DwarfRegNum<93>; +def V17 : VR<17, "v17">, DwarfRegNum<94>; +def V18 : VR<18, "v18">, DwarfRegNum<95>; +def V19 : VR<19, "v19">, DwarfRegNum<96>; +def V20 : VR<20, "v20">, DwarfRegNum<97>; +def V21 : VR<21, "v21">, DwarfRegNum<98>; +def V22 : VR<22, "v22">, DwarfRegNum<99>; +def V23 : VR<23, "v23">, DwarfRegNum<100>; +def V24 : VR<24, "v24">, DwarfRegNum<101>; +def V25 : VR<25, "v25">, DwarfRegNum<102>; +def V26 : VR<26, "v26">, DwarfRegNum<103>; +def V27 : VR<27, "v27">, DwarfRegNum<104>; +def V28 : VR<28, "v28">, DwarfRegNum<105>; +def V29 : VR<29, "v29">, DwarfRegNum<106>; +def V30 : VR<30, "v30">, DwarfRegNum<107>; +def V31 : VR<31, "v31">, DwarfRegNum<108>; + +// Condition registers +def CR0 : CR<0, "cr0">, DwarfRegNum<68>; +def CR1 : CR<1, "cr1">, DwarfRegNum<69>; +def CR2 : CR<2, "cr2">, DwarfRegNum<70>; +def CR3 : CR<3, "cr3">, DwarfRegNum<71>; +def CR4 : CR<4, "cr4">, DwarfRegNum<72>; +def CR5 : CR<5, "cr5">, DwarfRegNum<73>; +def CR6 : CR<6, "cr6">, DwarfRegNum<74>; +def CR7 : CR<7, "cr7">, DwarfRegNum<75>; + +// Condition register bits +def CR0LT : CRBIT< 0, "0">, DwarfRegNum<0>; +def CR0GT : CRBIT< 1, "1">, DwarfRegNum<0>; +def CR0EQ : CRBIT< 2, "2">, DwarfRegNum<0>; +def CR0UN : CRBIT< 3, "3">, DwarfRegNum<0>; +def CR1LT : CRBIT< 4, "4">, DwarfRegNum<0>; 
+def CR1GT : CRBIT< 5, "5">, DwarfRegNum<0>; +def CR1EQ : CRBIT< 6, "6">, DwarfRegNum<0>; +def CR1UN : CRBIT< 7, "7">, DwarfRegNum<0>; +def CR2LT : CRBIT< 8, "8">, DwarfRegNum<0>; +def CR2GT : CRBIT< 9, "9">, DwarfRegNum<0>; +def CR2EQ : CRBIT<10, "10">, DwarfRegNum<0>; +def CR2UN : CRBIT<11, "11">, DwarfRegNum<0>; +def CR3LT : CRBIT<12, "12">, DwarfRegNum<0>; +def CR3GT : CRBIT<13, "13">, DwarfRegNum<0>; +def CR3EQ : CRBIT<14, "14">, DwarfRegNum<0>; +def CR3UN : CRBIT<15, "15">, DwarfRegNum<0>; +def CR4LT : CRBIT<16, "16">, DwarfRegNum<0>; +def CR4GT : CRBIT<17, "17">, DwarfRegNum<0>; +def CR4EQ : CRBIT<18, "18">, DwarfRegNum<0>; +def CR4UN : CRBIT<19, "19">, DwarfRegNum<0>; +def CR5LT : CRBIT<20, "20">, DwarfRegNum<0>; +def CR5GT : CRBIT<21, "21">, DwarfRegNum<0>; +def CR5EQ : CRBIT<22, "22">, DwarfRegNum<0>; +def CR5UN : CRBIT<23, "23">, DwarfRegNum<0>; +def CR6LT : CRBIT<24, "24">, DwarfRegNum<0>; +def CR6GT : CRBIT<25, "25">, DwarfRegNum<0>; +def CR6EQ : CRBIT<26, "26">, DwarfRegNum<0>; +def CR6UN : CRBIT<27, "27">, DwarfRegNum<0>; +def CR7LT : CRBIT<28, "28">, DwarfRegNum<0>; +def CR7GT : CRBIT<29, "29">, DwarfRegNum<0>; +def CR7EQ : CRBIT<30, "30">, DwarfRegNum<0>; +def CR7UN : CRBIT<31, "31">, DwarfRegNum<0>; + +def : SubRegSet<1, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7], + [CR0LT, CR1LT, CR2LT, CR3LT, CR4LT, CR5LT, CR6LT, CR7LT]>; +def : SubRegSet<2, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7], + [CR0GT, CR1GT, CR2GT, CR3GT, CR4GT, CR5GT, CR6GT, CR7GT]>; +def : SubRegSet<3, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7], + [CR0EQ, CR1EQ, CR2EQ, CR3EQ, CR4EQ, CR5EQ, CR6EQ, CR7EQ]>; +def : SubRegSet<4, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7], + [CR0UN, CR1UN, CR2UN, CR3UN, CR4UN, CR5UN, CR6UN, CR7UN]>; + +// Link register +def LR : SPR<8, "lr">, DwarfRegNum<65>; +//let Aliases = [LR] in +def LR8 : SPR<8, "lr">, DwarfRegNum<65>; + +// Count register +def CTR : SPR<9, "ctr">, DwarfRegNum<66>; +def CTR8 : SPR<9, "ctr">, DwarfRegNum<66>; + +// VRsave register +def 
VRSAVE: SPR<256, "VRsave">, DwarfRegNum<107>; + +/// Register classes +// Allocate volatiles first +// then nonvolatiles in reverse order since stmw/lmw save from rN to r31 +def GPRC : RegisterClass<"PPC", [i32], 32, + [R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, + R30, R29, R28, R27, R26, R25, R24, R23, R22, R21, R20, R19, R18, R17, + R16, R15, R14, R13, R31, R0, R1, LR]> +{ + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + GPRCClass::iterator + GPRCClass::allocation_order_begin(const MachineFunction &MF) const { + // In Linux, r2 is reserved for the OS. + if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin()) + return begin()+1; + + return begin(); + } + GPRCClass::iterator + GPRCClass::allocation_order_end(const MachineFunction &MF) const { + // On PPC64, r13 is the thread pointer. Never allocate this register. + // Note that this is overconservative, as it also prevents allocation of + // R31 when the FP is not needed. 
+ if (MF.getTarget().getSubtarget<PPCSubtarget>().isPPC64()) + return end()-5; // don't allocate R13, R31, R0, R1, LR + + if (needsFP(MF)) + return end()-4; // don't allocate R31, R0, R1, LR + else + return end()-3; // don't allocate R0, R1, LR + } + }]; +} +def G8RC : RegisterClass<"PPC", [i64], 64, + [X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, + X30, X29, X28, X27, X26, X25, X24, X23, X22, X21, X20, X19, X18, X17, + X16, X15, X14, X31, X13, X0, X1, LR8]> +{ + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + G8RCClass::iterator + G8RCClass::allocation_order_begin(const MachineFunction &MF) const { + return begin(); + } + G8RCClass::iterator + G8RCClass::allocation_order_end(const MachineFunction &MF) const { + if (needsFP(MF)) + return end()-5; + else + return end()-4; + } + }]; +} + + + +def F8RC : RegisterClass<"PPC", [f64], 64, [F0, F1, F2, F3, F4, F5, F6, F7, + F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, + F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]>; +def F4RC : RegisterClass<"PPC", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, + F8, F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, + F22, F23, F24, F25, F26, F27, F28, F29, F30, F31]>; + +def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128, + [V2, V3, V4, V5, V0, V1, + V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, + V22, V23, V24, V25, V26, V27, V28, V29, V30, V31]>; + +def CRRC : RegisterClass<"PPC", [i32], 32, [CR0, CR1, CR5, CR6, CR7, CR2, + CR3, CR4]>; + diff --git a/lib/Target/PowerPC/PPCRelocations.h b/lib/Target/PowerPC/PPCRelocations.h new file mode 100644 index 0000000..261622f --- /dev/null +++ b/lib/Target/PowerPC/PPCRelocations.h @@ -0,0 +1,56 @@ +//===- PPCRelocations.h - PPC32 Code Relocations ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// 
This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the PowerPC 32-bit target-specific relocation types. +// +//===----------------------------------------------------------------------===// + +#ifndef PPC32RELOCATIONS_H +#define PPC32RELOCATIONS_H + +#include "llvm/CodeGen/MachineRelocation.h" + +// Hack to rid us of a PPC pre-processor symbol which is erroneously +// defined in a PowerPC header file (bug in Linux/PPC) +#ifdef PPC +#undef PPC +#endif + +namespace llvm { + namespace PPC { + enum RelocationType { + // reloc_vanilla - A standard relocation, where the address of the + // relocated object completely overwrites the address of the relocation. + reloc_vanilla, + + // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions. + reloc_pcrel_bx, + + // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE, + // and other bcx instructions. + reloc_pcrel_bcx, + + // reloc_absolute_high - Absolute relocation, for the loadhi instruction + // (which is really addis). Add the high 16-bits of the specified global + // address into the low 16-bits of the instruction. + reloc_absolute_high, + + // reloc_absolute_low - Absolute relocation, for the la instruction (which + // is really an addi). Add the low 16-bits of the specified global + // address into the low 16-bits of the instruction. + reloc_absolute_low, + + // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store + // instruction which have two implicit zero bits. 
+ reloc_absolute_low_ix + }; + } +} + +#endif diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td new file mode 100644 index 0000000..0e0fd82 --- /dev/null +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -0,0 +1,508 @@ +//===- PPCSchedule.td - PowerPC Scheduling Definitions -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Functional units across PowerPC chips sets +// +def BPU : FuncUnit; // Branch unit +def SLU : FuncUnit; // Store/load unit +def SRU : FuncUnit; // special register unit +def IU1 : FuncUnit; // integer unit 1 (simple) +def IU2 : FuncUnit; // integer unit 2 (complex) +def IU3 : FuncUnit; // integer unit 3 (7450 simple) +def IU4 : FuncUnit; // integer unit 4 (7450 simple) +def FPU1 : FuncUnit; // floating point unit 1 +def FPU2 : FuncUnit; // floating point unit 2 +def VPU : FuncUnit; // vector permutation unit +def VIU1 : FuncUnit; // vector integer unit 1 (simple) +def VIU2 : FuncUnit; // vector integer unit 2 (complex) +def VFPU : FuncUnit; // vector floating point unit + + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for PowerPC +// +def IntGeneral : InstrItinClass; +def IntCompare : InstrItinClass; +def IntDivD : InstrItinClass; +def IntDivW : InstrItinClass; +def IntMFFS : InstrItinClass; +def IntMFVSCR : InstrItinClass; +def IntMTFSB0 : InstrItinClass; +def IntMTSRD : InstrItinClass; +def IntMulHD : InstrItinClass; +def IntMulHW : InstrItinClass; +def IntMulHWU : InstrItinClass; +def IntMulLI : InstrItinClass; +def IntRFID : InstrItinClass; +def IntRotateD : InstrItinClass; +def IntRotate : 
InstrItinClass; +def IntShift : InstrItinClass; +def IntTrapD : InstrItinClass; +def IntTrapW : InstrItinClass; +def BrB : InstrItinClass; +def BrCR : InstrItinClass; +def BrMCR : InstrItinClass; +def BrMCRX : InstrItinClass; +def LdStDCBA : InstrItinClass; +def LdStDCBF : InstrItinClass; +def LdStDCBI : InstrItinClass; +def LdStGeneral : InstrItinClass; +def LdStDSS : InstrItinClass; +def LdStICBI : InstrItinClass; +def LdStUX : InstrItinClass; +def LdStLD : InstrItinClass; +def LdStLDARX : InstrItinClass; +def LdStLFD : InstrItinClass; +def LdStLFDU : InstrItinClass; +def LdStLHA : InstrItinClass; +def LdStLMW : InstrItinClass; +def LdStLVecX : InstrItinClass; +def LdStLWA : InstrItinClass; +def LdStLWARX : InstrItinClass; +def LdStSLBIA : InstrItinClass; +def LdStSLBIE : InstrItinClass; +def LdStSTD : InstrItinClass; +def LdStSTDCX : InstrItinClass; +def LdStSTVEBX : InstrItinClass; +def LdStSTWCX : InstrItinClass; +def LdStSync : InstrItinClass; +def SprISYNC : InstrItinClass; +def SprMFSR : InstrItinClass; +def SprMTMSR : InstrItinClass; +def SprMTSR : InstrItinClass; +def SprTLBSYNC : InstrItinClass; +def SprMFCR : InstrItinClass; +def SprMFMSR : InstrItinClass; +def SprMFSPR : InstrItinClass; +def SprMFTB : InstrItinClass; +def SprMTSPR : InstrItinClass; +def SprMTSRIN : InstrItinClass; +def SprRFI : InstrItinClass; +def SprSC : InstrItinClass; +def FPGeneral : InstrItinClass; +def FPCompare : InstrItinClass; +def FPDivD : InstrItinClass; +def FPDivS : InstrItinClass; +def FPFused : InstrItinClass; +def FPRes : InstrItinClass; +def FPSqrt : InstrItinClass; +def VecGeneral : InstrItinClass; +def VecFP : InstrItinClass; +def VecFPCompare : InstrItinClass; +def VecComplex : InstrItinClass; +def VecPerm : InstrItinClass; +def VecFPRound : InstrItinClass; +def VecVSL : InstrItinClass; +def VecVSR : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. 
+ +include "PPCScheduleG3.td" +include "PPCScheduleG4.td" +include "PPCScheduleG4Plus.td" +include "PPCScheduleG5.td" + +//===----------------------------------------------------------------------===// +// Instruction to itinerary class map - When add new opcodes to the supported +// set, refer to the following table to determine which itinerary class the +// opcode belongs. +// +// opcode itinerary class +// ====== =============== +// add IntGeneral +// addc IntGeneral +// adde IntGeneral +// addi IntGeneral +// addic IntGeneral +// addic. IntGeneral +// addis IntGeneral +// addme IntGeneral +// addze IntGeneral +// and IntGeneral +// andc IntGeneral +// andi. IntGeneral +// andis. IntGeneral +// b BrB +// bc BrB +// bcctr BrB +// bclr BrB +// cmp IntCompare +// cmpi IntCompare +// cmpl IntCompare +// cmpli IntCompare +// cntlzd IntRotateD +// cntlzw IntGeneral +// crand BrCR +// crandc BrCR +// creqv BrCR +// crnand BrCR +// crnor BrCR +// cror BrCR +// crorc BrCR +// crxor BrCR +// dcba LdStDCBA +// dcbf LdStDCBF +// dcbi LdStDCBI +// dcbst LdStDCBF +// dcbt LdStGeneral +// dcbtst LdStGeneral +// dcbz LdStDCBF +// divd IntDivD +// divdu IntDivD +// divw IntDivW +// divwu IntDivW +// dss LdStDSS +// dst LdStDSS +// dstst LdStDSS +// eciwx LdStGeneral +// ecowx LdStGeneral +// eieio LdStGeneral +// eqv IntGeneral +// extsb IntGeneral +// extsh IntGeneral +// extsw IntRotateD +// fabs FPGeneral +// fadd FPGeneral +// fadds FPGeneral +// fcfid FPGeneral +// fcmpo FPCompare +// fcmpu FPCompare +// fctid FPGeneral +// fctidz FPGeneral +// fctiw FPGeneral +// fctiwz FPGeneral +// fdiv FPDivD +// fdivs FPDivS +// fmadd FPFused +// fmadds FPGeneral +// fmr FPGeneral +// fmsub FPFused +// fmsubs FPGeneral +// fmul FPFused +// fmuls FPGeneral +// fnabs FPGeneral +// fneg FPGeneral +// fnmadd FPFused +// fnmadds FPGeneral +// fnmsub FPFused +// fnmsubs FPGeneral +// fres FPRes +// frsp FPGeneral +// frsqrte FPGeneral +// fsel FPGeneral +// fsqrt FPSqrt +// fsqrts FPSqrt +// 
fsub FPGeneral +// fsubs FPGeneral +// icbi LdStICBI +// isync SprISYNC +// lbz LdStGeneral +// lbzu LdStGeneral +// lbzux LdStUX +// lbzx LdStGeneral +// ld LdStLD +// ldarx LdStLDARX +// ldu LdStLD +// ldux LdStLD +// ldx LdStLD +// lfd LdStLFD +// lfdu LdStLFDU +// lfdux LdStLFDU +// lfdx LdStLFDU +// lfs LdStLFDU +// lfsu LdStLFDU +// lfsux LdStLFDU +// lfsx LdStLFDU +// lha LdStLHA +// lhau LdStLHA +// lhaux LdStLHA +// lhax LdStLHA +// lhbrx LdStGeneral +// lhz LdStGeneral +// lhzu LdStGeneral +// lhzux LdStUX +// lhzx LdStGeneral +// lmw LdStLMW +// lswi LdStLMW +// lswx LdStLMW +// lvebx LdStLVecX +// lvehx LdStLVecX +// lvewx LdStLVecX +// lvsl LdStLVecX +// lvsr LdStLVecX +// lvx LdStLVecX +// lvxl LdStLVecX +// lwa LdStLWA +// lwarx LdStLWARX +// lwaux LdStLHA +// lwax LdStLHA +// lwbrx LdStGeneral +// lwz LdStGeneral +// lwzu LdStGeneral +// lwzux LdStUX +// lwzx LdStGeneral +// mcrf BrMCR +// mcrfs FPGeneral +// mcrxr BrMCRX +// mfcr SprMFCR +// mffs IntMFFS +// mfmsr SprMFMSR +// mfspr SprMFSPR +// mfsr SprMFSR +// mfsrin SprMFSR +// mftb SprMFTB +// mfvscr IntMFVSCR +// mtcrf BrMCRX +// mtfsb0 IntMTFSB0 +// mtfsb1 IntMTFSB0 +// mtfsf IntMTFSB0 +// mtfsfi IntMTFSB0 +// mtmsr SprMTMSR +// mtmsrd LdStLD +// mtspr SprMTSPR +// mtsr SprMTSR +// mtsrd IntMTSRD +// mtsrdin IntMTSRD +// mtsrin SprMTSRIN +// mtvscr IntMFVSCR +// mulhd IntMulHD +// mulhdu IntMulHD +// mulhw IntMulHW +// mulhwu IntMulHWU +// mulld IntMulHD +// mulli IntMulLI +// mullw IntMulHW +// nand IntGeneral +// neg IntGeneral +// nor IntGeneral +// or IntGeneral +// orc IntGeneral +// ori IntGeneral +// oris IntGeneral +// rfi SprRFI +// rfid IntRFID +// rldcl IntRotateD +// rldcr IntRotateD +// rldic IntRotateD +// rldicl IntRotateD +// rldicr IntRotateD +// rldimi IntRotateD +// rlwimi IntRotate +// rlwinm IntGeneral +// rlwnm IntGeneral +// sc SprSC +// slbia LdStSLBIA +// slbie LdStSLBIE +// sld IntRotateD +// slw IntGeneral +// srad IntRotateD +// sradi IntRotateD +// sraw IntShift 
+// srawi IntShift +// srd IntRotateD +// srw IntGeneral +// stb LdStGeneral +// stbu LdStGeneral +// stbux LdStGeneral +// stbx LdStGeneral +// std LdStSTD +// stdcx. LdStSTDCX +// stdu LdStSTD +// stdux LdStSTD +// stdx LdStSTD +// stfd LdStUX +// stfdu LdStUX +// stfdux LdStUX +// stfdx LdStUX +// stfiwx LdStUX +// stfs LdStUX +// stfsu LdStUX +// stfsux LdStUX +// stfsx LdStUX +// sth LdStGeneral +// sthbrx LdStGeneral +// sthu LdStGeneral +// sthux LdStGeneral +// sthx LdStGeneral +// stmw LdStLMW +// stswi LdStLMW +// stswx LdStLMW +// stvebx LdStSTVEBX +// stvehx LdStSTVEBX +// stvewx LdStSTVEBX +// stvx LdStSTVEBX +// stvxl LdStSTVEBX +// stw LdStGeneral +// stwbrx LdStGeneral +// stwcx. LdStSTWCX +// stwu LdStGeneral +// stwux LdStGeneral +// stwx LdStGeneral +// subf IntGeneral +// subfc IntGeneral +// subfe IntGeneral +// subfic IntGeneral +// subfme IntGeneral +// subfze IntGeneral +// sync LdStSync +// td IntTrapD +// tdi IntTrapD +// tlbia LdStSLBIA +// tlbie LdStDCBF +// tlbsync SprTLBSYNC +// tw IntTrapW +// twi IntTrapW +// vaddcuw VecGeneral +// vaddfp VecFP +// vaddsbs VecGeneral +// vaddshs VecGeneral +// vaddsws VecGeneral +// vaddubm VecGeneral +// vaddubs VecGeneral +// vadduhm VecGeneral +// vadduhs VecGeneral +// vadduwm VecGeneral +// vadduws VecGeneral +// vand VecGeneral +// vandc VecGeneral +// vavgsb VecGeneral +// vavgsh VecGeneral +// vavgsw VecGeneral +// vavgub VecGeneral +// vavguh VecGeneral +// vavguw VecGeneral +// vcfsx VecFP +// vcfux VecFP +// vcmpbfp VecFPCompare +// vcmpeqfp VecFPCompare +// vcmpequb VecGeneral +// vcmpequh VecGeneral +// vcmpequw VecGeneral +// vcmpgefp VecFPCompare +// vcmpgtfp VecFPCompare +// vcmpgtsb VecGeneral +// vcmpgtsh VecGeneral +// vcmpgtsw VecGeneral +// vcmpgtub VecGeneral +// vcmpgtuh VecGeneral +// vcmpgtuw VecGeneral +// vctsxs VecFP +// vctuxs VecFP +// vexptefp VecFP +// vlogefp VecFP +// vmaddfp VecFP +// vmaxfp VecFPCompare +// vmaxsb VecGeneral +// vmaxsh VecGeneral +// vmaxsw 
VecGeneral +// vmaxub VecGeneral +// vmaxuh VecGeneral +// vmaxuw VecGeneral +// vmhaddshs VecComplex +// vmhraddshs VecComplex +// vminfp VecFPCompare +// vminsb VecGeneral +// vminsh VecGeneral +// vminsw VecGeneral +// vminub VecGeneral +// vminuh VecGeneral +// vminuw VecGeneral +// vmladduhm VecComplex +// vmrghb VecPerm +// vmrghh VecPerm +// vmrghw VecPerm +// vmrglb VecPerm +// vmrglh VecPerm +// vmrglw VecPerm +// vmsubfp VecFP +// vmsummbm VecComplex +// vmsumshm VecComplex +// vmsumshs VecComplex +// vmsumubm VecComplex +// vmsumuhm VecComplex +// vmsumuhs VecComplex +// vmulesb VecComplex +// vmulesh VecComplex +// vmuleub VecComplex +// vmuleuh VecComplex +// vmulosb VecComplex +// vmulosh VecComplex +// vmuloub VecComplex +// vmulouh VecComplex +// vnor VecGeneral +// vor VecGeneral +// vperm VecPerm +// vpkpx VecPerm +// vpkshss VecPerm +// vpkshus VecPerm +// vpkswss VecPerm +// vpkswus VecPerm +// vpkuhum VecPerm +// vpkuhus VecPerm +// vpkuwum VecPerm +// vpkuwus VecPerm +// vrefp VecFPRound +// vrfim VecFPRound +// vrfin VecFPRound +// vrfip VecFPRound +// vrfiz VecFPRound +// vrlb VecGeneral +// vrlh VecGeneral +// vrlw VecGeneral +// vrsqrtefp VecFP +// vsel VecGeneral +// vsl VecVSL +// vslb VecGeneral +// vsldoi VecPerm +// vslh VecGeneral +// vslo VecPerm +// vslw VecGeneral +// vspltb VecPerm +// vsplth VecPerm +// vspltisb VecPerm +// vspltish VecPerm +// vspltisw VecPerm +// vspltw VecPerm +// vsr VecVSR +// vsrab VecGeneral +// vsrah VecGeneral +// vsraw VecGeneral +// vsrb VecGeneral +// vsrh VecGeneral +// vsro VecPerm +// vsrw VecGeneral +// vsubcuw VecGeneral +// vsubfp VecFP +// vsubsbs VecGeneral +// vsubshs VecGeneral +// vsubsws VecGeneral +// vsububm VecGeneral +// vsububs VecGeneral +// vsubuhm VecGeneral +// vsubuhs VecGeneral +// vsubuwm VecGeneral +// vsubuws VecGeneral +// vsum2sws VecComplex +// vsum4sbs VecComplex +// vsum4shs VecComplex +// vsum4ubs VecComplex +// vsumsws VecComplex +// vupkhpx VecPerm +// vupkhsb 
VecPerm +// vupkhsh VecPerm +// vupklpx VecPerm +// vupklsb VecPerm +// vupklsh VecPerm +// vxor VecGeneral +// xor IntGeneral +// xori IntGeneral +// xoris IntGeneral +// diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td new file mode 100644 index 0000000..fbb9f6f --- /dev/null +++ b/lib/Target/PowerPC/PPCScheduleG3.td @@ -0,0 +1,63 @@ +//===- PPCScheduleG3.td - PPC G3 Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G3 (750) processor. +// +//===----------------------------------------------------------------------===// + + +def G3Itineraries : ProcessorItineraries<[ + InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>, + InstrItinData<IntMFFS , [InstrStage<1, [FPU1]>]>, + InstrItinData<IntMTFSB0 , [InstrStage<3, [FPU1]>]>, + InstrItinData<IntMulHW , [InstrStage<5, [IU1]>]>, + InstrItinData<IntMulHWU , [InstrStage<6, [IU1]>]>, + InstrItinData<IntMulLI , [InstrStage<3, [IU1]>]>, + InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntShift , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2]>]>, + InstrItinData<BrB , [InstrStage<1, [BPU]>]>, + InstrItinData<BrCR , [InstrStage<1, [SRU]>]>, + InstrItinData<BrMCR , [InstrStage<1, [SRU]>]>, + InstrItinData<BrMCRX , [InstrStage<1, [SRU]>]>, + InstrItinData<LdStDCBA , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStICBI , 
[InstrStage<3, [SLU]>]>, + InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>, + InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTWCX , [InstrStage<8, [SLU]>]>, + InstrItinData<LdStSync , [InstrStage<3, [SLU]>]>, + InstrItinData<SprISYNC , [InstrStage<2, [SRU]>]>, + InstrItinData<SprMFSR , [InstrStage<3, [SRU]>]>, + InstrItinData<SprMTMSR , [InstrStage<1, [SRU]>]>, + InstrItinData<SprMTSR , [InstrStage<2, [SRU]>]>, + InstrItinData<SprTLBSYNC , [InstrStage<3, [SRU]>]>, + InstrItinData<SprMFCR , [InstrStage<1, [SRU]>]>, + InstrItinData<SprMFMSR , [InstrStage<1, [SRU]>]>, + InstrItinData<SprMFSPR , [InstrStage<3, [SRU]>]>, + InstrItinData<SprMFTB , [InstrStage<3, [SRU]>]>, + InstrItinData<SprMTSPR , [InstrStage<2, [SRU]>]>, + InstrItinData<SprMTSRIN , [InstrStage<2, [SRU]>]>, + InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>, + InstrItinData<SprSC , [InstrStage<2, [SRU]>]>, + InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>, + InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>, + InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>, + InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>, + InstrItinData<FPFused , [InstrStage<2, [FPU1]>]>, + InstrItinData<FPRes , [InstrStage<10, [FPU1]>]> +]>; diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td new file mode 100644 index 0000000..d0e4456 --- /dev/null +++ b/lib/Target/PowerPC/PPCScheduleG4.td @@ -0,0 +1,73 @@ +//===- PPCScheduleG4.td - PPC G4 Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G4 (7400) processor. +// +//===----------------------------------------------------------------------===// + +def G4Itineraries : ProcessorItineraries<[ + InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntDivW , [InstrStage<19, [IU1]>]>, + InstrItinData<IntMFFS , [InstrStage<3, [FPU1]>]>, + InstrItinData<IntMFVSCR , [InstrStage<1, [VIU1]>]>, + InstrItinData<IntMTFSB0 , [InstrStage<3, [FPU1]>]>, + InstrItinData<IntMulHW , [InstrStage<5, [IU1]>]>, + InstrItinData<IntMulHWU , [InstrStage<6, [IU1]>]>, + InstrItinData<IntMulLI , [InstrStage<3, [IU1]>]>, + InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntShift , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2]>]>, + InstrItinData<BrB , [InstrStage<1, [BPU]>]>, + InstrItinData<BrCR , [InstrStage<1, [SRU]>]>, + InstrItinData<BrMCR , [InstrStage<1, [SRU]>]>, + InstrItinData<BrMCRX , [InstrStage<1, [SRU]>]>, + InstrItinData<LdStDCBF , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStDCBI , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStDSS , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStICBI , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>, + InstrItinData<LdStLVecX , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTVEBX , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStSTWCX , [InstrStage<5, [SLU]>]>, + InstrItinData<LdStSync , [InstrStage<8, [SLU]>]>, + InstrItinData<SprISYNC , [InstrStage<2, [SRU]>]>, + 
InstrItinData<SprMFSR , [InstrStage<3, [SRU]>]>, + InstrItinData<SprMTMSR , [InstrStage<1, [SRU]>]>, + InstrItinData<SprMTSR , [InstrStage<2, [SRU]>]>, + InstrItinData<SprTLBSYNC , [InstrStage<8, [SRU]>]>, + InstrItinData<SprMFCR , [InstrStage<1, [SRU]>]>, + InstrItinData<SprMFMSR , [InstrStage<1, [SRU]>]>, + InstrItinData<SprMFSPR , [InstrStage<3, [SRU]>]>, + InstrItinData<SprMFTB , [InstrStage<1, [SRU]>]>, + InstrItinData<SprMTSPR , [InstrStage<2, [SRU]>]>, + InstrItinData<SprMTSRIN , [InstrStage<2, [SRU]>]>, + InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>, + InstrItinData<SprSC , [InstrStage<2, [SRU]>]>, + InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>, + InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>, + InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>, + InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>, + InstrItinData<FPFused , [InstrStage<1, [FPU1]>]>, + InstrItinData<FPRes , [InstrStage<10, [FPU1]>]>, + InstrItinData<VecGeneral , [InstrStage<1, [VIU1]>]>, + InstrItinData<VecFP , [InstrStage<4, [VFPU]>]>, + InstrItinData<VecFPCompare, [InstrStage<1, [VIU1]>]>, + InstrItinData<VecComplex , [InstrStage<3, [VIU2]>]>, + InstrItinData<VecPerm , [InstrStage<1, [VPU]>]>, + InstrItinData<VecFPRound , [InstrStage<4, [VFPU]>]>, + InstrItinData<VecVSL , [InstrStage<1, [VIU1]>]>, + InstrItinData<VecVSR , [InstrStage<1, [VIU1]>]> +]>; diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td new file mode 100644 index 0000000..b40a8a5 --- /dev/null +++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -0,0 +1,76 @@ +//===- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G4+ (7450) processor. +// +//===----------------------------------------------------------------------===// + +def G4PlusItineraries : ProcessorItineraries<[ + InstrItinData<IntGeneral , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<IntCompare , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<IntDivW , [InstrStage<23, [IU2]>]>, + InstrItinData<IntMFFS , [InstrStage<5, [FPU1]>]>, + InstrItinData<IntMFVSCR , [InstrStage<2, [VFPU]>]>, + InstrItinData<IntMTFSB0 , [InstrStage<5, [FPU1]>]>, + InstrItinData<IntMulHW , [InstrStage<4, [IU2]>]>, + InstrItinData<IntMulHWU , [InstrStage<4, [IU2]>]>, + InstrItinData<IntMulLI , [InstrStage<3, [IU2]>]>, + InstrItinData<IntRotate , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<IntShift , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<IntTrapW , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<BrB , [InstrStage<1, [BPU]>]>, + InstrItinData<BrCR , [InstrStage<2, [IU2]>]>, + InstrItinData<BrMCR , [InstrStage<2, [IU2]>]>, + InstrItinData<BrMCRX , [InstrStage<2, [IU2]>]>, + InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStDSS , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStICBI , [InstrStage<3, [IU2]>]>, + InstrItinData<LdStUX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLFD , [InstrStage<4, [SLU]>]>, + InstrItinData<LdStLFDU , [InstrStage<4, [SLU]>]>, + InstrItinData<LdStLHA , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLMW , [InstrStage<37, [SLU]>]>, + InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLWA , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTDCX , [InstrStage<3, [SLU]>]>, + 
InstrItinData<LdStSTVEBX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTWCX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>, + InstrItinData<SprISYNC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<SprMFSR , [InstrStage<4, [IU2]>]>, + InstrItinData<SprMTMSR , [InstrStage<2, [IU2]>]>, + InstrItinData<SprMTSR , [InstrStage<2, [IU2]>]>, + InstrItinData<SprTLBSYNC , [InstrStage<3, [SLU]>]>, + InstrItinData<SprMFCR , [InstrStage<2, [IU2]>]>, + InstrItinData<SprMFMSR , [InstrStage<3, [IU2]>]>, + InstrItinData<SprMFSPR , [InstrStage<4, [IU2]>]>, + InstrItinData<SprMFTB , [InstrStage<5, [IU2]>]>, + InstrItinData<SprMTSPR , [InstrStage<2, [IU2]>]>, + InstrItinData<SprMTSRIN , [InstrStage<2, [IU2]>]>, + InstrItinData<SprRFI , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<SprSC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>, + InstrItinData<FPGeneral , [InstrStage<5, [FPU1]>]>, + InstrItinData<FPCompare , [InstrStage<5, [FPU1]>]>, + InstrItinData<FPDivD , [InstrStage<35, [FPU1]>]>, + InstrItinData<FPDivS , [InstrStage<21, [FPU1]>]>, + InstrItinData<FPFused , [InstrStage<5, [FPU1]>]>, + InstrItinData<FPRes , [InstrStage<14, [FPU1]>]>, + InstrItinData<VecGeneral , [InstrStage<1, [VIU1]>]>, + InstrItinData<VecFP , [InstrStage<4, [VFPU]>]>, + InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>, + InstrItinData<VecComplex , [InstrStage<4, [VIU2]>]>, + InstrItinData<VecPerm , [InstrStage<2, [VPU]>]>, + InstrItinData<VecFPRound , [InstrStage<4, [VIU1]>]>, + InstrItinData<VecVSL , [InstrStage<2, [VPU]>]>, + InstrItinData<VecVSR , [InstrStage<2, [VPU]>]> +]>; diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td new file mode 100644 index 0000000..ff4be2c --- /dev/null +++ b/lib/Target/PowerPC/PPCScheduleG5.td @@ -0,0 +1,83 @@ +//===- PPCScheduleG5.td - PPC G5 Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. 
Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G5 (970) processor. +// +//===----------------------------------------------------------------------===// + +def G5Itineraries : ProcessorItineraries<[ + InstrItinData<IntGeneral , [InstrStage<2, [IU1, IU2]>]>, + InstrItinData<IntCompare , [InstrStage<3, [IU1, IU2]>]>, + InstrItinData<IntDivD , [InstrStage<68, [IU1]>]>, + InstrItinData<IntDivW , [InstrStage<36, [IU1]>]>, + InstrItinData<IntMFFS , [InstrStage<6, [IU2]>]>, + InstrItinData<IntMFVSCR , [InstrStage<1, [VFPU]>]>, + InstrItinData<IntMTFSB0 , [InstrStage<6, [FPU1, FPU2]>]>, + InstrItinData<IntMulHD , [InstrStage<7, [IU1, IU2]>]>, + InstrItinData<IntMulHW , [InstrStage<5, [IU1, IU2]>]>, + InstrItinData<IntMulHWU , [InstrStage<5, [IU1, IU2]>]>, + InstrItinData<IntMulLI , [InstrStage<4, [IU1, IU2]>]>, + InstrItinData<IntRFID , [InstrStage<1, [IU2]>]>, + InstrItinData<IntRotateD , [InstrStage<2, [IU1, IU2]>]>, + InstrItinData<IntRotate , [InstrStage<4, [IU1, IU2]>]>, + InstrItinData<IntShift , [InstrStage<2, [IU1, IU2]>]>, + InstrItinData<IntTrapD , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<IntTrapW , [InstrStage<1, [IU1, IU2]>]>, + InstrItinData<BrB , [InstrStage<1, [BPU]>]>, + InstrItinData<BrCR , [InstrStage<4, [BPU]>]>, + InstrItinData<BrMCR , [InstrStage<2, [BPU]>]>, + InstrItinData<BrMCRX , [InstrStage<3, [BPU]>]>, + InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStDSS , [InstrStage<10, [SLU]>]>, + InstrItinData<LdStICBI , [InstrStage<40, [SLU]>]>, + InstrItinData<LdStUX , [InstrStage<4, [SLU]>]>, + InstrItinData<LdStLD , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLDARX , [InstrStage<11, [SLU]>]>, + InstrItinData<LdStLFD , [InstrStage<3, [SLU]>]>, + 
InstrItinData<LdStLFDU , [InstrStage<5, [SLU]>]>, + InstrItinData<LdStLHA , [InstrStage<5, [SLU]>]>, + InstrItinData<LdStLMW , [InstrStage<64, [SLU]>]>, + InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLWA , [InstrStage<5, [SLU]>]>, + InstrItinData<LdStLWARX , [InstrStage<11, [SLU]>]>, + InstrItinData<LdStSLBIA , [InstrStage<40, [SLU]>]>, // needs work + InstrItinData<LdStSLBIE , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStSTDCX , [InstrStage<11, [SLU]>]>, + InstrItinData<LdStSTVEBX , [InstrStage<5, [SLU]>]>, + InstrItinData<LdStSTWCX , [InstrStage<11, [SLU]>]>, + InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>, + InstrItinData<SprISYNC , [InstrStage<40, [SLU]>]>, // needs work + InstrItinData<SprMFSR , [InstrStage<3, [SLU]>]>, + InstrItinData<SprMTMSR , [InstrStage<3, [SLU]>]>, + InstrItinData<SprMTSR , [InstrStage<3, [SLU]>]>, + InstrItinData<SprTLBSYNC , [InstrStage<3, [SLU]>]>, + InstrItinData<SprMFCR , [InstrStage<2, [IU2]>]>, + InstrItinData<SprMFMSR , [InstrStage<3, [IU2]>]>, + InstrItinData<SprMFSPR , [InstrStage<3, [IU2]>]>, + InstrItinData<SprMFTB , [InstrStage<10, [IU2]>]>, + InstrItinData<SprMTSPR , [InstrStage<8, [IU2]>]>, + InstrItinData<SprSC , [InstrStage<1, [IU2]>]>, + InstrItinData<FPGeneral , [InstrStage<6, [FPU1, FPU2]>]>, + InstrItinData<FPCompare , [InstrStage<8, [FPU1, FPU2]>]>, + InstrItinData<FPDivD , [InstrStage<33, [FPU1, FPU2]>]>, + InstrItinData<FPDivS , [InstrStage<33, [FPU1, FPU2]>]>, + InstrItinData<FPFused , [InstrStage<6, [FPU1, FPU2]>]>, + InstrItinData<FPRes , [InstrStage<6, [FPU1, FPU2]>]>, + InstrItinData<FPSqrt , [InstrStage<40, [FPU1, FPU2]>]>, + InstrItinData<VecGeneral , [InstrStage<2, [VIU1]>]>, + InstrItinData<VecFP , [InstrStage<8, [VFPU]>]>, + InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>, + InstrItinData<VecComplex , [InstrStage<5, [VIU2]>]>, + InstrItinData<VecPerm , [InstrStage<3, [VPU]>]>, + InstrItinData<VecFPRound , [InstrStage<8, 
[VFPU]>]>, + InstrItinData<VecVSL , [InstrStage<2, [VIU1]>]>, + InstrItinData<VecVSR , [InstrStage<3, [VPU]>]> +]>; diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp new file mode 100644 index 0000000..4419d20 --- /dev/null +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -0,0 +1,141 @@ +//===- PowerPCSubtarget.cpp - PPC Subtarget Information -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Nate Begeman and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPC specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "PPCSubtarget.h" +#include "PPC.h" +#include "llvm/Module.h" +#include "llvm/Target/TargetMachine.h" +#include "PPCGenSubtarget.inc" +using namespace llvm; + +#if defined(__APPLE__) +#include <mach/mach.h> +#include <mach/mach_host.h> +#include <mach/host_info.h> +#include <mach/machine.h> + +/// GetCurrentPowerPCCPU - Returns the name of the current CPU. 
+static const char *GetCurrentPowerPCCPU() { + host_basic_info_data_t hostInfo; + mach_msg_type_number_t infoCount; + + infoCount = HOST_BASIC_INFO_COUNT; + host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, + &infoCount); + + if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic"; + + switch(hostInfo.cpu_subtype) { + case CPU_SUBTYPE_POWERPC_601: return "601"; + case CPU_SUBTYPE_POWERPC_602: return "602"; + case CPU_SUBTYPE_POWERPC_603: return "603"; + case CPU_SUBTYPE_POWERPC_603e: return "603e"; + case CPU_SUBTYPE_POWERPC_603ev: return "603ev"; + case CPU_SUBTYPE_POWERPC_604: return "604"; + case CPU_SUBTYPE_POWERPC_604e: return "604e"; + case CPU_SUBTYPE_POWERPC_620: return "620"; + case CPU_SUBTYPE_POWERPC_750: return "750"; + case CPU_SUBTYPE_POWERPC_7400: return "7400"; + case CPU_SUBTYPE_POWERPC_7450: return "7450"; + case CPU_SUBTYPE_POWERPC_970: return "970"; + default: ; + } + + return "generic"; +} +#endif + + +PPCSubtarget::PPCSubtarget(const TargetMachine &tm, const Module &M, + const std::string &FS, bool is64Bit) + : TM(tm) + , StackAlignment(16) + , IsGigaProcessor(false) + , Has64BitSupport(false) + , Use64BitRegs(false) + , IsPPC64(is64Bit) + , HasAltivec(false) + , HasFSQRT(false) + , HasSTFIWX(false) + , IsDarwin(false) + , HasLazyResolverStubs(false) { + + // Determine default and user specified characteristics + std::string CPU = "generic"; +#if defined(__APPLE__) + CPU = GetCurrentPowerPCCPU(); +#endif + + // Parse features string. + ParseSubtargetFeatures(FS, CPU); + + // If we are generating code for ppc64, verify that options make sense. + if (is64Bit) { + if (!has64BitSupport()) { + cerr << "PPC: Generation of 64-bit code for a 32-bit processor " + << "requested. Ignoring 32-bit processor feature.\n"; + Has64BitSupport = true; + } + // Silently force 64-bit register use on ppc64. 
+ Use64BitRegs = true; + } + + // If the user requested use of 64-bit regs, but the cpu selected doesn't + // support it, warn and ignore. + if (use64BitRegs() && !has64BitSupport()) { + cerr << "PPC: 64-bit registers requested on CPU without support. " + << "Disabling 64-bit register use.\n"; + Use64BitRegs = false; + } + + // Set the boolean corresponding to the current target triple, or the default + // if one cannot be determined, to true. + const std::string& TT = M.getTargetTriple(); + if (TT.length() > 5) { + IsDarwin = TT.find("-darwin") != std::string::npos; + } else if (TT.empty()) { +#if defined(__APPLE__) + IsDarwin = true; +#endif + } + + // Set up darwin-specific properties. + if (IsDarwin) { + HasLazyResolverStubs = true; + AsmFlavor = NewMnemonic; + } else { + AsmFlavor = OldMnemonic; + } +} + +/// SetJITMode - This is called to inform the subtarget info that we are +/// producing code for the JIT. +void PPCSubtarget::SetJITMode() { + // JIT mode doesn't want lazy resolver stubs, it knows exactly where + // everything is. This matters for PPC64, which codegens in PIC mode without + // stubs. + HasLazyResolverStubs = false; +} + + +/// hasLazyResolverStub - Return true if accesses to the specified global have +/// to go through a dyld lazy resolution stub. This means that an extra load +/// is required to get the address of the global. +bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const { + // We never have stubs if HasLazyResolverStubs=false or if in static mode. 
+ if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static) + return false; + + return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() || + (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode()); +} diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h new file mode 100644 index 0000000..d1e135c --- /dev/null +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -0,0 +1,146 @@ +//=====-- PPCSubtarget.h - Define Subtarget for the PPC -------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Nate Begeman and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPCSUBTARGET_H +#define POWERPCSUBTARGET_H + +#include "llvm/Target/TargetInstrItineraries.h" +#include "llvm/Target/TargetSubtarget.h" + +#include <string> + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +namespace llvm { + +namespace PPC { + // -m directive values. + enum { + DIR_32, + DIR_601, + DIR_602, + DIR_603, + DIR_7400, + DIR_750, + DIR_970, + DIR_64 + }; +} + +class Module; +class GlobalValue; +class TargetMachine; + +class PPCSubtarget : public TargetSubtarget { +public: + enum AsmWriterFlavorTy { + OldMnemonic, NewMnemonic, Unset + }; +protected: + const TargetMachine &TM; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned StackAlignment; + + /// Selected instruction itineraries (one entry per itinerary class.) + InstrItineraryData InstrItins; + + /// Which cpu directive was used. + unsigned DarwinDirective; + + /// AsmFlavor - Which PPC asm dialect to use. 
+ AsmWriterFlavorTy AsmFlavor; + + /// Used by the ISel to turn on optimizations for POWER4-derived architectures + bool IsGigaProcessor; + bool Has64BitSupport; + bool Use64BitRegs; + bool IsPPC64; + bool HasAltivec; + bool HasFSQRT; + bool HasSTFIWX; + bool IsDarwin; + bool HasLazyResolverStubs; +public: + /// This constructor initializes the data members to match that + /// of the specified module. + /// + PPCSubtarget(const TargetMachine &TM, const Module &M, + const std::string &FS, bool is64Bit); + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + + /// SetJITMode - This is called to inform the subtarget info that we are + /// producing code for the JIT. + void SetJITMode(); + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return StackAlignment; } + + /// getDarwinDirective - Returns the -m directive specified for the cpu. + /// + unsigned getDarwinDirective() const { return DarwinDirective; } + + /// getInstrItins - Return the instruction itineraries based on subtarget + /// selection. + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + /// getTargetDataString - Return the pointer size and type alignment + /// properties of this subtarget. + const char *getTargetDataString() const { + return isPPC64() ? "E-p:64:64-f64:32:64-i64:32:64" + : "E-p:32:32-f64:32:64-i64:32:64"; + } + + /// isPPC64 - Return true if we are generating code for 64-bit pointer mode. + /// + bool isPPC64() const { return IsPPC64; } + + /// has64BitSupport - Return true if the selected CPU supports 64-bit + /// instructions, regardless of whether we are in 32-bit or 64-bit mode. 
+ bool has64BitSupport() const { return Has64BitSupport; } + + /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit + /// registers in 32-bit mode when possible. This can only be true if + /// has64BitSupport() returns true. + bool use64BitRegs() const { return Use64BitRegs; } + + /// hasLazyResolverStub - Return true if accesses to the specified global have + /// to go through a dyld lazy resolution stub. This means that an extra load + /// is required to get the address of the global. + bool hasLazyResolverStub(const GlobalValue *GV) const; + + // Specific obvious features. + bool hasFSQRT() const { return HasFSQRT; } + bool hasSTFIWX() const { return HasSTFIWX; } + bool hasAltivec() const { return HasAltivec; } + bool isGigaProcessor() const { return IsGigaProcessor; } + + bool isDarwin() const { return IsDarwin; } + + bool isMachoABI() const { return IsDarwin || IsPPC64; } + bool isELF32_ABI() const { return !IsDarwin && !IsPPC64; } + + unsigned getAsmFlavor() const { + return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0; + } +}; +} // End llvm namespace + +#endif diff --git a/lib/Target/PowerPC/PPCTargetAsmInfo.cpp b/lib/Target/PowerPC/PPCTargetAsmInfo.cpp new file mode 100644 index 0000000..01c78b7 --- /dev/null +++ b/lib/Target/PowerPC/PPCTargetAsmInfo.cpp @@ -0,0 +1,96 @@ +//===-- PPCTargetAsmInfo.cpp - PPC asm properties ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the DarwinTargetAsmInfo properties. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCTargetAsmInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/Function.h" +using namespace llvm; + +PPCTargetAsmInfo::PPCTargetAsmInfo(const PPCTargetMachine &TM) { + bool isPPC64 = TM.getSubtargetImpl()->isPPC64(); + + ZeroDirective = "\t.space\t"; + SetDirective = "\t.set"; + Data64bitsDirective = isPPC64 ? "\t.quad\t" : 0; + AlignmentIsInBytes = false; + LCOMMDirective = "\t.lcomm\t"; + InlineAsmStart = "# InlineAsm Start"; + InlineAsmEnd = "# InlineAsm End"; + AssemblerDialect = TM.getSubtargetImpl()->getAsmFlavor(); + + NeedsSet = true; + AddressSize = isPPC64 ? 8 : 4; + DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug"; + DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug"; + DwarfLineSection = ".section __DWARF,__debug_line,regular,debug"; + DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug"; + DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug"; + DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug"; + DwarfStrSection = ".section __DWARF,__debug_str,regular,debug"; + DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug"; + DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug"; + DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug"; + DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug"; + DwarfEHFrameSection = + ".section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support"; + DwarfExceptionSection = ".section __DATA,__gcc_except_tab"; +} + +DarwinTargetAsmInfo::DarwinTargetAsmInfo(const PPCTargetMachine &TM) +: PPCTargetAsmInfo(TM) +{ + PCSymbol = "."; + CommentString = ";"; + GlobalPrefix = "_"; + PrivateGlobalPrefix = "L"; + ConstantPoolSection = "\t.const\t"; + JumpTableDataSection = ".const"; + GlobalDirective = "\t.globl\t"; + CStringSection = "\t.cstring"; + FourByteConstantSection = 
"\t.literal4\n"; + EightByteConstantSection = "\t.literal8\n"; + ReadOnlySection = "\t.const\n"; + if (TM.getRelocationModel() == Reloc::Static) { + StaticCtorsSection = ".constructor"; + StaticDtorsSection = ".destructor"; + } else { + StaticCtorsSection = ".mod_init_func"; + StaticDtorsSection = ".mod_term_func"; + } + UsedDirective = "\t.no_dead_strip\t"; + WeakRefDirective = "\t.weak_reference\t"; + HiddenDirective = "\t.private_extern\t"; + SupportsExceptionHandling = true; + + // In non-PIC modes, emit a special label before jump tables so that the + // linker can perform more accurate dead code stripping. + if (TM.getRelocationModel() != Reloc::PIC_) { + // Emit a local label that is preserved until the linker runs. + JumpTableSpecialLabelPrefix = "l"; + } +} + +LinuxTargetAsmInfo::LinuxTargetAsmInfo(const PPCTargetMachine &TM) +: PPCTargetAsmInfo(TM) +{ + CommentString = "#"; + GlobalPrefix = ""; + PrivateGlobalPrefix = ""; + ConstantPoolSection = "\t.section .rodata.cst4\t"; + JumpTableDataSection = ".section .rodata.cst4"; + CStringSection = "\t.section\t.rodata"; + StaticCtorsSection = ".section\t.ctors,\"aw\",@progbits"; + StaticDtorsSection = ".section\t.dtors,\"aw\",@progbits"; + UsedDirective = "\t# .no_dead_strip\t"; + WeakRefDirective = "\t.weak\t"; +} diff --git a/lib/Target/PowerPC/PPCTargetAsmInfo.h b/lib/Target/PowerPC/PPCTargetAsmInfo.h new file mode 100644 index 0000000..6a680e2 --- /dev/null +++ b/lib/Target/PowerPC/PPCTargetAsmInfo.h @@ -0,0 +1,38 @@ +//=====-- PPCTargetAsmInfo.h - PPC asm properties -------------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the DarwinTargetAsmInfo class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef PPCTARGETASMINFO_H +#define PPCTARGETASMINFO_H + +#include "llvm/Target/TargetAsmInfo.h" + +namespace llvm { + + // Forward declaration. + class PPCTargetMachine; + + struct PPCTargetAsmInfo : public TargetAsmInfo { + PPCTargetAsmInfo(const PPCTargetMachine &TM); + }; + + struct DarwinTargetAsmInfo : public PPCTargetAsmInfo { + DarwinTargetAsmInfo(const PPCTargetMachine &TM); + }; + + struct LinuxTargetAsmInfo : public PPCTargetAsmInfo { + LinuxTargetAsmInfo(const PPCTargetMachine &TM); + }; + +} // namespace llvm + +#endif diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp new file mode 100644 index 0000000..57c8437 --- /dev/null +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -0,0 +1,166 @@ +//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Top-level implementation for the PowerPC target. 
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "PPCTargetAsmInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/Target/TargetMachineRegistry.h" +using namespace llvm; + +namespace { + // Register the targets + RegisterTarget<PPC32TargetMachine> + X("ppc32", " PowerPC 32"); + RegisterTarget<PPC64TargetMachine> + Y("ppc64", " PowerPC 64"); +} + +const TargetAsmInfo *PPCTargetMachine::createTargetAsmInfo() const { + if (Subtarget.isDarwin()) + return new DarwinTargetAsmInfo(*this); + else + return new LinuxTargetAsmInfo(*this); +} + +unsigned PPC32TargetMachine::getJITMatchQuality() { +#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__) + if (sizeof(void*) == 4) + return 10; +#endif + return 0; +} +unsigned PPC64TargetMachine::getJITMatchQuality() { +#if defined(__POWERPC__) || defined (__ppc__) || defined(_POWER) || defined(__PPC__) + if (sizeof(void*) == 8) + return 10; +#endif + return 0; +} + +unsigned PPC32TargetMachine::getModuleMatchQuality(const Module &M) { + // We strongly match "powerpc-*". + std::string TT = M.getTargetTriple(); + if (TT.size() >= 8 && std::string(TT.begin(), TT.begin()+8) == "powerpc-") + return 20; + + // If the target triple is something non-powerpc, we don't match. + if (!TT.empty()) return 0; + + if (M.getEndianness() == Module::BigEndian && + M.getPointerSize() == Module::Pointer32) + return 10; // Weak match + else if (M.getEndianness() != Module::AnyEndianness || + M.getPointerSize() != Module::AnyPointerSize) + return 0; // Match for some other target + + return getJITMatchQuality()/2; +} + +unsigned PPC64TargetMachine::getModuleMatchQuality(const Module &M) { + // We strongly match "powerpc64-*". 
+ std::string TT = M.getTargetTriple(); + if (TT.size() >= 10 && std::string(TT.begin(), TT.begin()+10) == "powerpc64-") + return 20; + + if (M.getEndianness() == Module::BigEndian && + M.getPointerSize() == Module::Pointer64) + return 10; // Weak match + else if (M.getEndianness() != Module::AnyEndianness || + M.getPointerSize() != Module::AnyPointerSize) + return 0; // Match for some other target + + return getJITMatchQuality()/2; +} + + +PPCTargetMachine::PPCTargetMachine(const Module &M, const std::string &FS, + bool is64Bit) + : Subtarget(*this, M, FS, is64Bit), + DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this), + FrameInfo(*this, is64Bit), JITInfo(*this, is64Bit), TLInfo(*this), + InstrItins(Subtarget.getInstrItineraryData()), MachOWriterInfo(*this) { + + if (getRelocationModel() == Reloc::Default) + if (Subtarget.isDarwin()) + setRelocationModel(Reloc::DynamicNoPIC); + else + setRelocationModel(Reloc::Static); +} + +/// Override this for PowerPC. Tail merging happily breaks up instruction issue +/// groups, which typically degrades performance. +const bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; } + +PPC32TargetMachine::PPC32TargetMachine(const Module &M, const std::string &FS) + : PPCTargetMachine(M, FS, false) { +} + + +PPC64TargetMachine::PPC64TargetMachine(const Module &M, const std::string &FS) + : PPCTargetMachine(M, FS, true) { +} + + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +bool PPCTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) { + // Install an instruction selector. + PM.add(createPPCISelDag(*this)); + return false; +} + +bool PPCTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) { + + // Must run branch selection immediately preceding the asm printer. 
+ PM.add(createPPCBranchSelectionPass()); + return false; +} + +bool PPCTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out) { + PM.add(createPPCAsmPrinterPass(Out, *this)); + return false; +} + +bool PPCTargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE) { + // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64. + // FIXME: This should be moved to TargetJITInfo!! + if (Subtarget.isPPC64()) { + // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many + // instructions to materialize arbitrary global variable + function + + // constant pool addresses. + setRelocationModel(Reloc::PIC_); + } else { + setRelocationModel(Reloc::Static); + } + + // Inform the subtarget that we are in JIT mode. FIXME: does this break macho + // writing? + Subtarget.SetJITMode(); + + // Machine code emitter pass for PowerPC. + PM.add(createPPCCodeEmitterPass(*this, MCE)); + return false; +} + +bool PPCTargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE) { + // Machine code emitter pass for PowerPC. + PM.add(createPPCCodeEmitterPass(*this, MCE)); + return false; +} diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h new file mode 100644 index 0000000..10c5b7b --- /dev/null +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -0,0 +1,101 @@ +//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC -----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef PPC_TARGETMACHINE_H +#define PPC_TARGETMACHINE_H + +#include "PPCFrameInfo.h" +#include "PPCSubtarget.h" +#include "PPCJITInfo.h" +#include "PPCInstrInfo.h" +#include "PPCISelLowering.h" +#include "PPCMachOWriterInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" + +namespace llvm { +class PassManager; +class GlobalValue; + +/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets. +/// +class PPCTargetMachine : public LLVMTargetMachine { + PPCSubtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + PPCInstrInfo InstrInfo; + PPCFrameInfo FrameInfo; + PPCJITInfo JITInfo; + PPCTargetLowering TLInfo; + InstrItineraryData InstrItins; + PPCMachOWriterInfo MachOWriterInfo; + +protected: + virtual const TargetAsmInfo *createTargetAsmInfo() const; + +public: + PPCTargetMachine(const Module &M, const std::string &FS, bool is64Bit); + + virtual const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; } + virtual TargetJITInfo *getJITInfo() { return &JITInfo; } + virtual PPCTargetLowering *getTargetLowering() const { + return const_cast<PPCTargetLowering*>(&TLInfo); + } + virtual const MRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const PPCSubtarget *getSubtargetImpl() const { return &Subtarget; } + virtual const InstrItineraryData getInstrItineraryData() const { + return InstrItins; + } + virtual const PPCMachOWriterInfo *getMachOWriterInfo() const { + return &MachOWriterInfo; + } + + // Pass Pipeline Configuration + virtual bool addInstSelector(FunctionPassManager &PM, bool Fast); + virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast); + virtual bool 
addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out); + virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); + virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); + virtual const bool getEnableTailMergeDefault() const; +}; + +/// PPC32TargetMachine - PowerPC 32-bit target machine. +/// +class PPC32TargetMachine : public PPCTargetMachine { +public: + PPC32TargetMachine(const Module &M, const std::string &FS); + + static unsigned getJITMatchQuality(); + static unsigned getModuleMatchQuality(const Module &M); +}; + +/// PPC64TargetMachine - PowerPC 64-bit target machine. +/// +class PPC64TargetMachine : public PPCTargetMachine { +public: + PPC64TargetMachine(const Module &M, const std::string &FS); + + static unsigned getJITMatchQuality(); + static unsigned getModuleMatchQuality(const Module &M); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt new file mode 100644 index 0000000..69e60fc --- /dev/null +++ b/lib/Target/PowerPC/README.txt @@ -0,0 +1,664 @@ +//===- README.txt - Notes for improving PowerPC-specific code gen ---------===// + +TODO: +* gpr0 allocation +* implement do-loop -> bdnz transform +* __builtin_return_address not supported on PPC + +===-------------------------------------------------------------------------=== + +Support 'update' load/store instructions. These are cracked on the G5, but are +still a codesize win. 
+ +With preinc enabled, this: + +long *%test4(long *%X, long *%dest) { + %Y = getelementptr long* %X, int 4 + %A = load long* %Y + store long %A, long* %dest + ret long* %Y +} + +compiles to: + +_test4: + mr r2, r3 + lwzu r5, 32(r2) + lwz r3, 36(r3) + stw r5, 0(r4) + stw r3, 4(r4) + mr r3, r2 + blr + +with -sched=list-burr, I get: + +_test4: + lwz r2, 36(r3) + lwzu r5, 32(r3) + stw r2, 4(r4) + stw r5, 0(r4) + blr + +===-------------------------------------------------------------------------=== + +We compile the hottest inner loop of viterbi to: + + li r6, 0 + b LBB1_84 ;bb432.i +LBB1_83: ;bb420.i + lbzx r8, r5, r7 + addi r6, r7, 1 + stbx r8, r4, r7 +LBB1_84: ;bb432.i + mr r7, r6 + cmplwi cr0, r7, 143 + bne cr0, LBB1_83 ;bb420.i + +The CBE manages to produce: + + li r0, 143 + mtctr r0 +loop: + lbzx r2, r2, r11 + stbx r0, r2, r9 + addi r2, r2, 1 + bdz later + b loop + +This could be much better (bdnz instead of bdz) but it still beats us. If we +produced this with bdnz, the loop would be a single dispatch group. + +===-------------------------------------------------------------------------=== + +Compile: + +void foo(int *P) { + if (P) *P = 0; +} + +into: + +_foo: + cmpwi cr0,r3,0 + beqlr cr0 + li r0,0 + stw r0,0(r3) + blr + +This is effectively a simple form of predication. + +===-------------------------------------------------------------------------=== + +Lump the constant pool for each function into ONE pic object, and reference +pieces of it as offsets from the start. 
For functions like this (contrived +to have lots of constants obviously): + +double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; } + +We generate: + +_X: + lis r2, ha16(.CPI_X_0) + lfd f0, lo16(.CPI_X_0)(r2) + lis r2, ha16(.CPI_X_1) + lfd f2, lo16(.CPI_X_1)(r2) + fmadd f0, f1, f0, f2 + lis r2, ha16(.CPI_X_2) + lfd f1, lo16(.CPI_X_2)(r2) + lis r2, ha16(.CPI_X_3) + lfd f2, lo16(.CPI_X_3)(r2) + fmadd f1, f0, f1, f2 + blr + +It would be better to materialize .CPI_X into a register, then use immediates +off of the register to avoid the lis's. This is even more important in PIC +mode. + +Note that this (and the static variable version) is discussed here for GCC: +http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html + +===-------------------------------------------------------------------------=== + +PIC Code Gen IPO optimization: + +Squish small scalar globals together into a single global struct, allowing the +address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size +of the GOT on targets with one). + +Note that this is discussed here for GCC: +http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html + +===-------------------------------------------------------------------------=== + +Implement Newton-Raphson method for improving estimate instructions to the +correct accuracy, and implementing divide as multiply by reciprocal when it has +more than one use. Itanium will want this too. + +===-------------------------------------------------------------------------=== + +Compile this: + +int %f1(int %a, int %b) { + %tmp.1 = and int %a, 15 ; <int> [#uses=1] + %tmp.3 = and int %b, 240 ; <int> [#uses=1] + %tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1] + ret int %tmp.4 +} + +without a copy. We make this currently: + +_f1: + rlwinm r2, r4, 0, 24, 27 + rlwimi r2, r3, 0, 28, 31 + or r3, r2, r2 + blr + +The two-addr pass or RA needs to learn when it is profitable to commute an +instruction to avoid a copy AFTER the 2-addr instruction. 
The 2-addr pass +currently only commutes to avoid inserting a copy BEFORE the two addr instr. + +===-------------------------------------------------------------------------=== + +Compile offsets from allocas: + +int *%test() { + %X = alloca { int, int } + %Y = getelementptr {int,int}* %X, int 0, uint 1 + ret int* %Y +} + +into a single add, not two: + +_test: + addi r2, r1, -8 + addi r3, r2, 4 + blr + +--> important for C++. + +===-------------------------------------------------------------------------=== + +No loads or stores of the constants should be needed: + +struct foo { double X, Y; }; +void xxx(struct foo F); +void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); } + +===-------------------------------------------------------------------------=== + +Darwin Stub LICM optimization: + +Loops like this: + + for (...) bar(); + +Have to go through an indirect stub if bar is external or linkonce. It would +be better to compile it as: + + fp = &bar; + for (...) fp(); + +which only computes the address of bar once (instead of each time through the +stub). This is Darwin specific and would have to be done in the code generator. +Probably not a win on x86. + +===-------------------------------------------------------------------------=== + +Simple IPO for argument passing, change: + void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y) + +the Darwin ABI specifies that any integer arguments in the first 32 bytes worth +of arguments get assigned to r3 through r10. That is, if you have a function +foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the +argument bytes for r4 and r5. The trick then would be to shuffle the argument +order for functions we can internalize so that the maximum number of +integers/pointers get passed in regs before you see any of the fp arguments. 
+ +Instead of implementing this, it would actually probably be easier to just +implement a PPC fastcc, where we could do whatever we wanted to the CC, +including having this work sanely. + +===-------------------------------------------------------------------------=== + +Fix Darwin FP-In-Integer Registers ABI + +Darwin passes doubles in structures in integer registers, which is very very +bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation +that percolates these things out of functions. + +Check out how horrible this is: +http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html + +This is an extension of "interprocedural CC unmunging" that can't be done with +just fastcc. + +===-------------------------------------------------------------------------=== + +Compile this: + +int foo(int a) { + int b = (a < 8); + if (b) { + return b * 3; // ignore the fact that this is always 3. + } else { + return 2; + } +} + +into something not this: + +_foo: +1) cmpwi cr7, r3, 8 + mfcr r2, 1 + rlwinm r2, r2, 29, 31, 31 +1) cmpwi cr0, r3, 7 + bgt cr0, LBB1_2 ; UnifiedReturnBlock +LBB1_1: ; then + rlwinm r2, r2, 0, 31, 31 + mulli r3, r2, 3 + blr +LBB1_2: ; UnifiedReturnBlock + li r3, 2 + blr + +In particular, the two compares (marked 1) could be shared by reversing one. +This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the +same operands (but backwards) exists. In this case, this wouldn't save us +anything though, because the compares still wouldn't be shared. + +===-------------------------------------------------------------------------=== + +We should custom expand setcc instead of pretending that we have it. That +would allow us to expose the access of the crbit after the mfcr, allowing +that access to be trivially folded into other ops. 
A simple example: + +int foo(int a, int b) { return (a < b) << 4; } + +compiles into: + +_foo: + cmpw cr7, r3, r4 + mfcr r2, 1 + rlwinm r2, r2, 29, 31, 31 + slwi r3, r2, 4 + blr + +===-------------------------------------------------------------------------=== + +Fold add and sub with constant into non-extern, non-weak addresses so this: + +static int a; +void bar(int b) { a = b; } +void foo(unsigned char *c) { + *c = a; +} + +So that + +_foo: + lis r2, ha16(_a) + la r2, lo16(_a)(r2) + lbz r2, 3(r2) + stb r2, 0(r3) + blr + +Becomes + +_foo: + lis r2, ha16(_a+3) + lbz r2, lo16(_a+3)(r2) + stb r2, 0(r3) + blr + +===-------------------------------------------------------------------------=== + +We generate really bad code for this: + +int f(signed char *a, _Bool b, _Bool c) { + signed char t = 0; + if (b) t = *a; + if (c) *a = t; +} + +===-------------------------------------------------------------------------=== + +This: +int test(unsigned *P) { return *P >> 24; } + +Should compile to: + +_test: + lbz r3,0(r3) + blr + +not: + +_test: + lwz r2, 0(r3) + srwi r3, r2, 24 + blr + +===-------------------------------------------------------------------------=== + +On the G5, logical CR operations are more expensive in their three +address form: ops that read/write the same register are half as expensive as +those that read from two registers that are different from their destination. + +We should model this with two separate instructions. The isel should generate +the "two address" form of the instructions. When the register allocator +detects that it needs to insert a copy due to the two-addresness of the CR +logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point +we can convert to the "three address" instruction, to save code space. + +This only matters when we start generating cr logical ops. 
+ +===-------------------------------------------------------------------------=== + +We should compile these two functions to the same thing: + +#include <stdlib.h> +void f(int a, int b, int *P) { + *P = (a-b)>=0?(a-b):(b-a); +} +void g(int a, int b, int *P) { + *P = abs(a-b); +} + +Further, they should compile to something better than: + +_g: + subf r2, r4, r3 + subfic r3, r2, 0 + cmpwi cr0, r2, -1 + bgt cr0, LBB2_2 ; entry +LBB2_1: ; entry + mr r2, r3 +LBB2_2: ; entry + stw r2, 0(r5) + blr + +GCC produces: + +_g: + subf r4,r4,r3 + srawi r2,r4,31 + xor r0,r2,r4 + subf r0,r2,r0 + stw r0,0(r5) + blr + +... which is much nicer. + +This theoretically may help improve twolf slightly (used in dimbox.c:142?). + +===-------------------------------------------------------------------------=== + +int foo(int N, int ***W, int **TK, int X) { + int t, i; + + for (t = 0; t < N; ++t) + for (i = 0; i < 4; ++i) + W[t / X][i][t % X] = TK[i][t]; + + return 5; +} + +We generate relatively atrocious code for this loop compared to gcc. + +We could also strength reduce the rem and the div: +http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf + +===-------------------------------------------------------------------------=== + +float foo(float X) { return (int)(X); } + +Currently produces: + +_foo: + fctiwz f0, f1 + stfd f0, -8(r1) + lwz r2, -4(r1) + extsw r2, r2 + std r2, -16(r1) + lfd f0, -16(r1) + fcfid f0, f0 + frsp f1, f0 + blr + +We could use a target dag combine to turn the lwz/extsw into an lwa when the +lwz has a single use. Since LWA is cracked anyway, this would be a codesize +win only. 
+ +===-------------------------------------------------------------------------=== + +We generate ugly code for this: + +void func(unsigned int *ret, float dx, float dy, float dz, float dw) { + unsigned code = 0; + if(dx < -dw) code |= 1; + if(dx > dw) code |= 2; + if(dy < -dw) code |= 4; + if(dy > dw) code |= 8; + if(dz < -dw) code |= 16; + if(dz > dw) code |= 32; + *ret = code; +} + +===-------------------------------------------------------------------------=== + +Complete the signed i32 to FP conversion code using 64-bit registers +transformation, good for PI. See PPCISelLowering.cpp, this comment: + + // FIXME: disable this lowered code. This generates 64-bit register values, + // and we don't model the fact that the top part is clobbered by calls. We + // need to flag these together so that the value isn't live across a call. + //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + +Also, if the registers are spilled to the stack, we have to ensure that all +64-bits of them are save/restored, otherwise we will miscompile the code. It +sounds like we need to get the 64-bit register classes going. 
+ +===-------------------------------------------------------------------------=== + +%struct.B = type { i8, [3 x i8] } + +define void @bar(%struct.B* %b) { +entry: + %tmp = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1] + %tmp = load i32* %tmp ; <uint> [#uses=1] + %tmp3 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1] + %tmp4 = load i32* %tmp3 ; <uint> [#uses=1] + %tmp8 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=2] + %tmp9 = load i32* %tmp8 ; <uint> [#uses=1] + %tmp4.mask17 = shl i32 %tmp4, i8 1 ; <uint> [#uses=1] + %tmp1415 = and i32 %tmp4.mask17, 2147483648 ; <uint> [#uses=1] + %tmp.masked = and i32 %tmp, 2147483648 ; <uint> [#uses=1] + %tmp11 = or i32 %tmp1415, %tmp.masked ; <uint> [#uses=1] + %tmp12 = and i32 %tmp9, 2147483647 ; <uint> [#uses=1] + %tmp13 = or i32 %tmp12, %tmp11 ; <uint> [#uses=1] + store i32 %tmp13, i32* %tmp8 + ret void +} + +We emit: + +_foo: + lwz r2, 0(r3) + slwi r4, r2, 1 + or r4, r4, r2 + rlwimi r2, r4, 0, 0, 0 + stw r2, 0(r3) + blr + +We could collapse a bunch of those ORs and ANDs and generate the following +equivalent code: + +_foo: + lwz r2, 0(r3) + rlwinm r4, r2, 1, 0, 0 + or r2, r2, r4 + stw r2, 0(r3) + blr + +===-------------------------------------------------------------------------=== + +We compile: + +unsigned test6(unsigned x) { + return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16); +} + +into: + +_test6: + lis r2, 255 + rlwinm r3, r3, 16, 0, 31 + ori r2, r2, 255 + and r3, r3, r2 + blr + +GCC gets it down to: + +_test6: + rlwinm r0,r3,16,8,15 + rlwinm r3,r3,16,24,31 + or r3,r3,r0 + blr + + +===-------------------------------------------------------------------------=== + +Consider a function like this: + +float foo(float X) { return X + 1234.4123f; } + +The FP constant ends up in the constant pool, so we need to get the LR register. 
+ This ends up producing code like this: + +_foo: +.LBB_foo_0: ; entry + mflr r11 +*** stw r11, 8(r1) + bl "L00000$pb" +"L00000$pb": + mflr r2 + addis r2, r2, ha16(.CPI_foo_0-"L00000$pb") + lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2) + fadds f1, f1, f0 +*** lwz r11, 8(r1) + mtlr r11 + blr + +This is functional, but there is no reason to spill the LR register all the way +to the stack (the two marked instrs): spilling it to a GPR is quite enough. + +Implementing this will require some codegen improvements. Nate writes: + +"So basically what we need to support the "no stack frame save and restore" is a +generalization of the LR optimization to "callee-save regs". + +Currently, we have LR marked as a callee-save reg. The register allocator sees +that it's callee save, and spills it directly to the stack. + +Ideally, something like this would happen: + +LR would be in a separate register class from the GPRs. The class of LR would be +marked "unspillable". When the register allocator came across an unspillable +reg, it would ask "what is the best class to copy this into that I *can* spill" +If it gets a class back, which it will in this case (the gprs), it grabs a free +register of that class. If it is then later necessary to spill that reg, so be +it. + +===-------------------------------------------------------------------------=== + +We compile this: +int test(_Bool X) { + return X ? 524288 : 0; +} + +to: +_test: + cmplwi cr0, r3, 0 + lis r2, 8 + li r3, 0 + beq cr0, LBB1_2 ;entry +LBB1_1: ;entry + mr r3, r2 +LBB1_2: ;entry + blr + +instead of: +_test: + addic r2,r3,-1 + subfe r0,r2,r3 + slwi r3,r0,19 + blr + +This sort of thing occurs a lot due to globalopt. 
+ +===-------------------------------------------------------------------------=== + +We currently compile 32-bit bswap: + +declare i32 @llvm.bswap.i32(i32 %A) +define i32 @test(i32 %A) { + %B = call i32 @llvm.bswap.i32(i32 %A) + ret i32 %B +} + +to: + +_test: + rlwinm r2, r3, 24, 16, 23 + slwi r4, r3, 24 + rlwimi r2, r3, 8, 24, 31 + rlwimi r4, r3, 8, 8, 15 + rlwimi r4, r2, 0, 16, 31 + mr r3, r4 + blr + +it would be more efficient to produce: + +_foo: mr r0,r3 + rlwinm r3,r3,8,0xffffffff + rlwimi r3,r0,24,0,7 + rlwimi r3,r0,24,16,23 + blr + +===-------------------------------------------------------------------------=== + +test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to: + +__ZNK4llvm5APInt17countLeadingZerosEv: + ld r2, 0(r3) + cntlzd r2, r2 + or r2, r2, r2 <<-- silly. + addi r3, r2, -64 + blr + +The dead or is a 'truncate' from 64- to 32-bits. + +===-------------------------------------------------------------------------=== + +We generate horrible ppc code for this: + +#define N 2000000 +double a[N],c[N]; +void simpleloop() { + int j; + for (j=0; j<N; j++) + c[j] = a[j]; +} + +LBB1_1: ;bb + lfdx f0, r3, r4 + addi r5, r5, 1 ;; Extra IV for the exit value compare. + stfdx f0, r2, r4 + addi r4, r4, 8 + + xoris r6, r5, 30 ;; This is due to a large immediate. + cmplwi cr0, r6, 33920 + bne cr0, LBB1_1 + +===-------------------------------------------------------------------------=== + diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt new file mode 100644 index 0000000..143804d --- /dev/null +++ b/lib/Target/PowerPC/README_ALTIVEC.txt @@ -0,0 +1,179 @@ +//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===// + +Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector +registers, to generate better spill code. 
+ +//===----------------------------------------------------------------------===// + +The first should be a single lvx from the constant pool, the second should be +a xor/stvx: + +void foo(void) { + int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 }; + bar (x); +} + +#include <string.h> +void foo(void) { + int x[8] __attribute__((aligned(128))); + memset (x, 0, sizeof (x)); + bar (x); +} + +//===----------------------------------------------------------------------===// + +Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763 + +When -ffast-math is on, we can use 0.0. + +//===----------------------------------------------------------------------===// + + Consider this: + v4f32 Vector; + v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X }; + +Since we know that "Vector" is 16-byte aligned and we know the element offset +of ".X", we should change the load into a lve*x instruction, instead of doing +a load/store/lve*x sequence. + +//===----------------------------------------------------------------------===// + +For functions that use altivec AND have calls, we are VRSAVE'ing all call +clobbered regs. + +//===----------------------------------------------------------------------===// + +Implement passing vectors by value into calls and receiving them as arguments. + +//===----------------------------------------------------------------------===// + +GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load +of C1/C2/C3, then a load and vperm of Variable. + +//===----------------------------------------------------------------------===// + +We need a way to teach tblgen that some operands of an intrinsic are required to +be constants. The verifier should enforce this constraint. 
+ +//===----------------------------------------------------------------------===// + +We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte +aligned stack slot, followed by a load/vperm. We should probably just store it +to a scalar stack slot, then use lvsl/vperm to load it. If the value is already +in memory this is a big win. + +//===----------------------------------------------------------------------===// + +extract_vector_elt of an arbitrary constant vector can be done with the +following instructions: + +vTemp = vec_splat(v0,2); // 2 is the element the src is in. +vec_ste(&destloc,0,vTemp); + +We can do an arbitrary non-constant value by using lvsr/perm/ste. + +//===----------------------------------------------------------------------===// + +If we want to tie instruction selection into the scheduler, we can do some +constant formation with different instructions. For example, we can generate +"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with +"vsplti 0" or "vxor", each of which use different execution units, thus could +help scheduling. + +This is probably only reasonable for a post-pass scheduler. + +//===----------------------------------------------------------------------===// + +For this function: + +void test(vector float *A, vector float *B) { + vector float C = (vector float)vec_cmpeq(*A, *B); + if (!vec_any_eq(*A, *B)) + *B = (vector float){0,0,0,0}; + *A = C; +} + +we get the following basic block: + + ... + lvx v2, 0, r4 + lvx v3, 0, r3 + vcmpeqfp v4, v3, v2 + vcmpeqfp. v2, v3, v2 + bne cr6, LBB1_2 ; cond_next + +The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the +vcmpeqfp. result is used by a branch. This can be improved. 
+ +//===----------------------------------------------------------------------===// + +The code generated for this is truly aweful: + +vector float test(float a, float b) { + return (vector float){ 0.0, a, 0.0, 0.0}; +} + +LCPI1_0: ; float + .space 4 + .text + .globl _test + .align 4 +_test: + mfspr r2, 256 + oris r3, r2, 4096 + mtspr 256, r3 + lis r3, ha16(LCPI1_0) + addi r4, r1, -32 + stfs f1, -16(r1) + addi r5, r1, -16 + lfs f0, lo16(LCPI1_0)(r3) + stfs f0, -32(r1) + lvx v2, 0, r4 + lvx v3, 0, r5 + vmrghw v3, v3, v2 + vspltw v2, v2, 0 + vmrghw v2, v2, v3 + mtspr 256, r2 + blr + +//===----------------------------------------------------------------------===// + +int foo(vector float *x, vector float *y) { + if (vec_all_eq(*x,*y)) return 3245; + else return 12; +} + +A predicate compare being used in a select_cc should have the same peephole +applied to it as a predicate compare used by a br_cc. There should be no +mfcr here: + +_foo: + mfspr r2, 256 + oris r5, r2, 12288 + mtspr 256, r5 + li r5, 12 + li r6, 3245 + lvx v2, 0, r4 + lvx v3, 0, r3 + vcmpeqfp. v2, v3, v2 + mfcr r3, 2 + rlwinm r3, r3, 25, 31, 31 + cmpwi cr0, r3, 0 + bne cr0, LBB1_2 ; entry +LBB1_1: ; entry + mr r6, r5 +LBB1_2: ; entry + mr r3, r6 + mtspr 256, r2 + blr + +//===----------------------------------------------------------------------===// + +CodeGen/PowerPC/vec_constants.ll has an and operation that should be +codegen'd to andc. The issue is that the 'all ones' build vector is +SelectNodeTo'd a VSPLTISB instruction node before the and/xor is selected +which prevents the vnot pattern from matching. 
+ + +//===----------------------------------------------------------------------===// diff --git a/lib/Target/README.txt b/lib/Target/README.txt new file mode 100644 index 0000000..37b671f --- /dev/null +++ b/lib/Target/README.txt @@ -0,0 +1,451 @@ +Target Independent Opportunities: + +//===---------------------------------------------------------------------===// + +With the recent changes to make the implicit def/use set explicit in +machineinstrs, we should change the target descriptions for 'call' instructions +so that the .td files don't list all the call-clobbered registers as implicit +defs. Instead, these should be added by the code generator (e.g. on the dag). + +This has a number of uses: + +1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions + for their different impdef sets. +2. Targets with multiple calling convs (e.g. x86) which have different clobber + sets don't need copies of call instructions. +3. 'Interprocedural register allocation' can be done to reduce the clobber sets + of calls. + +//===---------------------------------------------------------------------===// + +Make the PPC branch selector target independant + +//===---------------------------------------------------------------------===// + +Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and +precision don't matter (ffastmath). Misc/mandel will like this. :) + +//===---------------------------------------------------------------------===// + +Solve this DAG isel folding deficiency: + +int X, Y; + +void fn1(void) +{ + X = X | (Y << 3); +} + +compiles to + +fn1: + movl Y, %eax + shll $3, %eax + orl X, %eax + movl %eax, X + ret + +The problem is the store's chain operand is not the load X but rather +a TokenFactor of the load X and load Y, which prevents the folding. + +There are two ways to fix this: + +1. 
The dag combiner can start using alias analysis to realize that y/x + don't alias, making the store to X not dependent on the load from Y. +2. The generated isel could be made smarter in the case it can't + disambiguate the pointers. + +Number 1 is the preferred solution. + +This has been "fixed" by a TableGen hack. But that is a short term workaround +which will be removed once the proper fix is made. + +//===---------------------------------------------------------------------===// + +On targets with expensive 64-bit multiply, we could LSR this: + +for (i = ...; ++i) { + x = 1ULL << i; + +into: + long long tmp = 1; + for (i = ...; ++i, tmp+=tmp) + x = tmp; + +This would be a win on ppc32, but not x86 or ppc64. + +//===---------------------------------------------------------------------===// + +Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0) + +//===---------------------------------------------------------------------===// + +Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply. + +//===---------------------------------------------------------------------===// + +Interesting? testcase for add/shift/mul reassoc: + +int bar(int x, int y) { + return x*x*x+y+x*x*x*x*x*y*y*y*y; +} +int foo(int z, int n) { + return bar(z, n) + bar(2*z, 2*n); +} + +Reassociate should handle the example in GCC PR16157. + +//===---------------------------------------------------------------------===// + +These two functions should generate the same code on big-endian systems: + +int g(int *j,int *l) { return memcmp(j,l,4); } +int h(int *j, int *l) { return *j - *l; } + +this could be done in SelectionDAGISel.cpp, along with other special cases, +for 1,2,4,8 bytes. + +//===---------------------------------------------------------------------===// + +It would be nice to revert this patch: +http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html + +And teach the dag combiner enough to simplify the code expanded before +legalize. 
It seems plausible that this knowledge would let it simplify other +stuff too. + +//===---------------------------------------------------------------------===// + +For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal +to the type size. It works but can be overly conservative as the alignment of +specific vector types are target dependent. + +//===---------------------------------------------------------------------===// + +We should add 'unaligned load/store' nodes, and produce them from code like +this: + +v4sf example(float *P) { + return (v4sf){P[0], P[1], P[2], P[3] }; +} + +//===---------------------------------------------------------------------===// + +We should constant fold vector type casts at the LLVM level, regardless of the +cast. Currently we cannot fold some casts because we don't have TargetData +information in the constant folder, so we don't know the endianness of the +target! + +//===---------------------------------------------------------------------===// + +Add support for conditional increments, and other related patterns. Instead +of: + + movl 136(%esp), %eax + cmpl $0, %eax + je LBB16_2 #cond_next +LBB16_1: #cond_true + incl _foo +LBB16_2: #cond_next + +emit: + movl _foo, %eax + cmpl $1, %edi + sbbl $-1, %eax + movl %eax, _foo + +//===---------------------------------------------------------------------===// + +Combine: a = sin(x), b = cos(x) into a,b = sincos(x). + +Expand these to calls of sin/cos and stores: + double sincos(double x, double *sin, double *cos); + float sincosf(float x, float *sin, float *cos); + long double sincosl(long double x, long double *sin, long double *cos); + +Doing so could allow SROA of the destination pointers. 
See also: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687 + +//===---------------------------------------------------------------------===// + +Scalar Repl cannot currently promote this testcase to 'ret long cst': + + %struct.X = type { i32, i32 } + %struct.Y = type { %struct.X } + +define i64 @bar() { + %retval = alloca %struct.Y, align 8 + %tmp12 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 0 + store i32 0, i32* %tmp12 + %tmp15 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 1 + store i32 1, i32* %tmp15 + %retval.upgrd.1 = bitcast %struct.Y* %retval to i64* + %retval.upgrd.2 = load i64* %retval.upgrd.1 + ret i64 %retval.upgrd.2 +} + +it should be extended to do so. + +//===---------------------------------------------------------------------===// + +-scalarrepl should promote this to be a vector scalar. + + %struct..0anon = type { <4 x float> } + +define void @test1(<4 x float> %V, float* %P) { + %u = alloca %struct..0anon, align 16 + %tmp = getelementptr %struct..0anon* %u, i32 0, i32 0 + store <4 x float> %V, <4 x float>* %tmp + %tmp1 = bitcast %struct..0anon* %u to [4 x float]* + %tmp.upgrd.1 = getelementptr [4 x float]* %tmp1, i32 0, i32 1 + %tmp.upgrd.2 = load float* %tmp.upgrd.1 + %tmp3 = mul float %tmp.upgrd.2, 2.000000e+00 + store float %tmp3, float* %P + ret void +} + +//===---------------------------------------------------------------------===// + +Turn this into a single byte store with no load (the other 3 bytes are +unmodified): + +void %test(uint* %P) { + %tmp = load uint* %P + %tmp14 = or uint %tmp, 3305111552 + %tmp15 = and uint %tmp14, 3321888767 + store uint %tmp15, uint* %P + ret void +} + +//===---------------------------------------------------------------------===// + +dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x. 
+ +Compile: + +int bar(int x) +{ + int t = __builtin_clz(x); + return -(t>>5); +} + +to: + +_bar: addic r3,r3,-1 + subfe r3,r3,r3 + blr + +//===---------------------------------------------------------------------===// + +Legalize should lower ctlz like this: + ctlz(x) = popcnt((x-1) & ~x) + +on targets that have popcnt but not ctlz. itanium, what else? + +//===---------------------------------------------------------------------===// + +quantum_sigma_x in 462.libquantum contains the following loop: + + for(i=0; i<reg->size; i++) + { + /* Flip the target bit of each basis state */ + reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target); + } + +Where MAX_UNSIGNED/state is a 64-bit int. On a 32-bit platform it would be just +so cool to turn it into something like: + + long long Res = ((MAX_UNSIGNED) 1 << target); + if (target < 32) { + for(i=0; i<reg->size; i++) + reg->node[i].state ^= Res & 0xFFFFFFFFULL; + } else { + for(i=0; i<reg->size; i++) + reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL + } + +... which would only do one 32-bit XOR per loop iteration instead of two. + +It would also be nice to recognize the reg->size doesn't alias reg->node[i], but +alas... + +//===---------------------------------------------------------------------===// + +This isn't recognized as bswap by instcombine: + +unsigned int swap_32(unsigned int v) { + v = ((v & 0x00ff00ffU) << 8) | ((v & 0xff00ff00U) >> 8); + v = ((v & 0x0000ffffU) << 16) | ((v & 0xffff0000U) >> 16); + return v; +} + +Nor is this (yes, it really is bswap): + +unsigned long reverse(unsigned v) { + unsigned t; + t = v ^ ((v << 16) | (v >> 16)); + t &= ~0xff0000; + v = (v << 24) | (v >> 8); + return v ^ (t >> 8); +} + +//===---------------------------------------------------------------------===// + +These should turn into single 16-bit (unaligned?) loads on little/big endian +processors. 
+ +unsigned short read_16_le(const unsigned char *adr) { + return adr[0] | (adr[1] << 8); +} +unsigned short read_16_be(const unsigned char *adr) { + return (adr[0] << 8) | adr[1]; +} + +//===---------------------------------------------------------------------===// + +-instcombine should handle this transform: + icmp pred (sdiv X / C1 ), C2 +when X, C1, and C2 are unsigned. Similarly for udiv and signed operands. + +Currently InstCombine avoids this transform but will do it when the signs of +the operands and the sign of the divide match. See the FIXME in +InstructionCombining.cpp in the visitSetCondInst method after the switch case +for Instruction::UDiv (around line 4447) for more details. + +The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of +this construct. + +//===---------------------------------------------------------------------===// + +Instcombine misses several of these cases (see the testcase in the patch): +http://gcc.gnu.org/ml/gcc-patches/2006-10/msg01519.html + +//===---------------------------------------------------------------------===// + +viterbi speeds up *significantly* if the various "history" related copy loops +are turned into memcpy calls at the source level. We need a "loops to memcpy" +pass. + +//===---------------------------------------------------------------------===// + +Consider: + +typedef unsigned U32; +typedef unsigned long long U64; +int test (U32 *inst, U64 *regs) { + U64 effective_addr2; + U32 temp = *inst; + int r1 = (temp >> 20) & 0xf; + int b2 = (temp >> 16) & 0xf; + effective_addr2 = temp & 0xfff; + if (b2) effective_addr2 += regs[b2]; + b2 = (temp >> 12) & 0xf; + if (b2) effective_addr2 += regs[b2]; + effective_addr2 &= regs[4]; + if ((effective_addr2 & 3) == 0) + return 1; + return 0; +} + +Note that only the low 2 bits of effective_addr2 are used. On 32-bit systems, +we don't eliminate the computation of the top half of effective_addr2 because +we don't have whole-function selection dags. 
On x86, this means we use one +extra register for the function when effective_addr2 is declared as U64 than +when it is declared U32. + +//===---------------------------------------------------------------------===// + +Promote for i32 bswap can use i64 bswap + shr. Useful on targets with 64-bit +regs and bswap, like itanium. + +//===---------------------------------------------------------------------===// + +LSR should know what GPR types a target has. This code: + +volatile short X, Y; // globals + +void foo(int N) { + int i; + for (i = 0; i < N; i++) { X = i; Y = i*4; } +} + +produces two identical IV's (after promotion) on PPC/ARM: + +LBB1_1: @bb.preheader + mov r3, #0 + mov r2, r3 + mov r1, r3 +LBB1_2: @bb + ldr r12, LCPI1_0 + ldr r12, [r12] + strh r2, [r12] + ldr r12, LCPI1_1 + ldr r12, [r12] + strh r3, [r12] + add r1, r1, #1 <- [0,+,1] + add r3, r3, #4 + add r2, r2, #1 <- [0,+,1] + cmp r1, r0 + bne LBB1_2 @bb + + +//===---------------------------------------------------------------------===// + +Tail call elim should be more aggressive, checking to see if the call is +followed by an uncond branch to an exit block. + +; This testcase is due to tail-duplication not wanting to copy the return +; instruction into the terminating blocks because there was other code +; optimized out of the function after the taildup happened. 
+;RUN: llvm-upgrade < %s | llvm-as | opt -tailcallelim | llvm-dis | not grep call + +int %t4(int %a) { +entry: + %tmp.1 = and int %a, 1 + %tmp.2 = cast int %tmp.1 to bool + br bool %tmp.2, label %then.0, label %else.0 + +then.0: + %tmp.5 = add int %a, -1 + %tmp.3 = call int %t4( int %tmp.5 ) + br label %return + +else.0: + %tmp.7 = setne int %a, 0 + br bool %tmp.7, label %then.1, label %return + +then.1: + %tmp.11 = add int %a, -2 + %tmp.9 = call int %t4( int %tmp.11 ) + br label %return + +return: + %result.0 = phi int [ 0, %else.0 ], [ %tmp.3, %then.0 ], + [ %tmp.9, %then.1 ] + ret int %result.0 +} + +//===---------------------------------------------------------------------===// + +Argument promotion should promote arguments for recursive functions, like +this: + +; RUN: llvm-upgrade < %s | llvm-as | opt -argpromotion | llvm-dis | grep x.val + +implementation ; Functions: + +internal int %foo(int* %x) { +entry: + %tmp = load int* %x + %tmp.foo = call int %foo(int *%x) + ret int %tmp.foo +} + +int %bar(int* %x) { +entry: + %tmp3 = call int %foo( int* %x) ; <int>[#uses=1] + ret int %tmp3 +} + + + diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp new file mode 100644 index 0000000..784f1bd --- /dev/null +++ b/lib/Target/Sparc/DelaySlotFiller.cpp @@ -0,0 +1,76 @@ +//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a simple local pass that fills delay slots with NOPs. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "delayslotfiller" +#include "Sparc.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(FilledSlots, "Number of delay slots filled"); + +namespace { + struct Filler : public MachineFunctionPass { + /// Target machine description which we query for reg. names, data + /// layout, etc. + /// + TargetMachine &TM; + const TargetInstrInfo *TII; + + static char ID; + Filler(TargetMachine &tm) + : MachineFunctionPass((intptr_t)&ID), TM(tm), TII(tm.getInstrInfo()) { } + + virtual const char *getPassName() const { + return "SPARC Delay Slot Filler"; + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); + bool runOnMachineFunction(MachineFunction &F) { + bool Changed = false; + for (MachineFunction::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) + Changed |= runOnMachineBasicBlock(*FI); + return Changed; + } + + }; + char Filler::ID = 0; +} // end of anonymous namespace + +/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay +/// slots in Sparc MachineFunctions +/// +FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) { + return new Filler(tm); +} + +/// runOnMachineBasicBlock - Fill in delay slots for the given basic block. +/// Currently, we fill delay slots with NOPs. We assume there is only one +/// delay slot per delayed instruction. 
+/// +bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) + if (TII->hasDelaySlot(I->getOpcode())) { + MachineBasicBlock::iterator J = I; + ++J; + BuildMI(MBB, J, TII->get(SP::NOP)); + ++FilledSlots; + Changed = true; + } + return Changed; +} diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp new file mode 100644 index 0000000..e1c9966 --- /dev/null +++ b/lib/Target/Sparc/FPMover.cpp @@ -0,0 +1,138 @@ +//===-- FPMover.cpp - Sparc double-precision floating point move fixer ----===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Expand FpMOVD/FpABSD/FpNEGD instructions into their single-precision pieces. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "fpmover" +#include "Sparc.h" +#include "SparcSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Debug.h" +using namespace llvm; + +STATISTIC(NumFpDs , "Number of instructions translated"); +STATISTIC(NoopFpDs, "Number of noop instructions removed"); + +namespace { + struct FPMover : public MachineFunctionPass { + /// Target machine description which we query for reg. names, data + /// layout, etc. 
+ /// + TargetMachine &TM; + + static char ID; + FPMover(TargetMachine &tm) + : MachineFunctionPass((intptr_t)&ID), TM(tm) { } + + virtual const char *getPassName() const { + return "Sparc Double-FP Move Fixer"; + } + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); + bool runOnMachineFunction(MachineFunction &F); + }; + char FPMover::ID = 0; +} // end of anonymous namespace + +/// createSparcFPMoverPass - Returns a pass that turns FpMOVD +/// instructions into FMOVS instructions +/// +FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) { + return new FPMover(tm); +} + +/// getDoubleRegPair - Given a DFP register, return the even and odd FP +/// registers that correspond to it. +static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg, + unsigned &OddReg) { + static const unsigned EvenHalvesOfPairs[] = { + SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14, + SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30 + }; + static const unsigned OddHalvesOfPairs[] = { + SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15, + SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31 + }; + static const unsigned DoubleRegsInOrder[] = { + SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8, + SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15 + }; + for (unsigned i = 0; i < sizeof(DoubleRegsInOrder)/sizeof(unsigned); ++i) + if (DoubleRegsInOrder[i] == DoubleReg) { + EvenReg = EvenHalvesOfPairs[i]; + OddReg = OddHalvesOfPairs[i]; + return; + } + assert(0 && "Can't find reg"); +} + +/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB. 
+/// +bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) { + bool Changed = false; + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { + MachineInstr *MI = I++; + if (MI->getOpcode() == SP::FpMOVD || MI->getOpcode() == SP::FpABSD || + MI->getOpcode() == SP::FpNEGD) { + Changed = true; + unsigned DestDReg = MI->getOperand(0).getReg(); + unsigned SrcDReg = MI->getOperand(1).getReg(); + if (DestDReg == SrcDReg && MI->getOpcode() == SP::FpMOVD) { + MBB.erase(MI); // Eliminate the noop copy. + ++NoopFpDs; + continue; + } + + unsigned EvenSrcReg = 0, OddSrcReg = 0, EvenDestReg = 0, OddDestReg = 0; + getDoubleRegPair(DestDReg, EvenDestReg, OddDestReg); + getDoubleRegPair(SrcDReg, EvenSrcReg, OddSrcReg); + + const TargetInstrInfo *TII = TM.getInstrInfo(); + if (MI->getOpcode() == SP::FpMOVD) + MI->setInstrDescriptor(TII->get(SP::FMOVS)); + else if (MI->getOpcode() == SP::FpNEGD) + MI->setInstrDescriptor(TII->get(SP::FNEGS)); + else if (MI->getOpcode() == SP::FpABSD) + MI->setInstrDescriptor(TII->get(SP::FABSS)); + else + assert(0 && "Unknown opcode!"); + + MI->getOperand(0).setReg(EvenDestReg); + MI->getOperand(1).setReg(EvenSrcReg); + DOUT << "FPMover: the modified instr is: " << *MI; + // Insert copy for the other half of the double. + if (DestDReg != SrcDReg) { + MI = BuildMI(MBB, I, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg) + .addReg(OddSrcReg); + DOUT << "FPMover: the inserted instr is: " << *MI; + } + ++NumFpDs; + } + } + return Changed; +} + +bool FPMover::runOnMachineFunction(MachineFunction &F) { + // If the target has V9 instructions, the fp-mover pseudos will never be + // emitted. Avoid a scan of the instructions to improve compile time. 
+ if (TM.getSubtarget<SparcSubtarget>().isV9()) + return false; + + bool Changed = false; + for (MachineFunction::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) + Changed |= runOnMachineBasicBlock(*FI); + return Changed; +} diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile new file mode 100644 index 0000000..8cc4add --- /dev/null +++ b/lib/Target/Sparc/Makefile @@ -0,0 +1,20 @@ +##===- lib/Target/Sparc/Makefile ---------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by the LLVM research group and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMSparc +TARGET = Sparc + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = SparcGenRegisterInfo.h.inc SparcGenRegisterNames.inc \ + SparcGenRegisterInfo.inc SparcGenInstrNames.inc \ + SparcGenInstrInfo.inc SparcGenAsmWriter.inc \ + SparcGenDAGISel.inc SparcGenSubtarget.inc + +include $(LEVEL)/Makefile.common + diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt new file mode 100644 index 0000000..f7cb9b8 --- /dev/null +++ b/lib/Target/Sparc/README.txt @@ -0,0 +1,57 @@ + +To-do +----- + +* Keep the address of the constant pool in a register instead of forming its + address all of the time. +* We can fold small constant offsets into the %hi/%lo references to constant + pool addresses as well. +* When in V9 mode, register allocate %icc[0-3]. +* Emit the 'Branch on Integer Register with Prediction' instructions. It's + not clear how to write a pattern for this though: + +float %t1(int %a, int* %p) { + %C = seteq int %a, 0 + br bool %C, label %T, label %F +T: + store int 123, int* %p + br label %F +F: + ret float undef +} + +codegens to this: + +t1: + save -96, %o6, %o6 +1) subcc %i0, 0, %l0 +1) bne .LBBt1_2 ! F + nop +.LBBt1_1: ! 
T + or %g0, 123, %l0 + st %l0, [%i1] +.LBBt1_2: ! F + restore %g0, %g0, %g0 + retl + nop + +1) should be replaced with a brz in V9 mode. + +* Same as above, but emit conditional move on register zero (p192) in V9 + mode. Testcase: + +int %t1(int %a, int %b) { + %C = seteq int %a, 0 + %D = select bool %C, int %a, int %b + ret int %D +} + +* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling + with the Y register, if they are faster. + +* Codegen bswap(load)/store(bswap) -> load/store ASI + +* Implement frame pointer elimination, e.g. eliminate save/restore for + leaf fns. +* Fill delay slots + diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h new file mode 100644 index 0000000..8936afa --- /dev/null +++ b/lib/Target/Sparc/Sparc.h @@ -0,0 +1,116 @@ +//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// Sparc back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_SPARC_H +#define TARGET_SPARC_H + +#include <iosfwd> +#include <cassert> + +namespace llvm { + class FunctionPass; + class TargetMachine; + + FunctionPass *createSparcISelDag(TargetMachine &TM); + FunctionPass *createSparcCodePrinterPass(std::ostream &OS, TargetMachine &TM); + FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM); + FunctionPass *createSparcFPMoverPass(TargetMachine &TM); +} // end namespace llvm; + +// Defines symbolic names for Sparc registers. This defines a mapping from +// register name to register number. 
+// +#include "SparcGenRegisterNames.inc" + +// Defines symbolic names for the Sparc instructions. +// +#include "SparcGenInstrNames.inc" + + +namespace llvm { + // Enums corresponding to Sparc condition codes, both icc's and fcc's. These + // values must be kept in sync with the ones in the .td file. + namespace SPCC { + enum CondCodes { + //ICC_A = 8 , // Always + //ICC_N = 0 , // Never + ICC_NE = 9 , // Not Equal + ICC_E = 1 , // Equal + ICC_G = 10 , // Greater + ICC_LE = 2 , // Less or Equal + ICC_GE = 11 , // Greater or Equal + ICC_L = 3 , // Less + ICC_GU = 12 , // Greater Unsigned + ICC_LEU = 4 , // Less or Equal Unsigned + ICC_CC = 13 , // Carry Clear/Great or Equal Unsigned + ICC_CS = 5 , // Carry Set/Less Unsigned + ICC_POS = 14 , // Positive + ICC_NEG = 6 , // Negative + ICC_VC = 15 , // Overflow Clear + ICC_VS = 7 , // Overflow Set + + //FCC_A = 8+16, // Always + //FCC_N = 0+16, // Never + FCC_U = 7+16, // Unordered + FCC_G = 6+16, // Greater + FCC_UG = 5+16, // Unordered or Greater + FCC_L = 4+16, // Less + FCC_UL = 3+16, // Unordered or Less + FCC_LG = 2+16, // Less or Greater + FCC_NE = 1+16, // Not Equal + FCC_E = 9+16, // Equal + FCC_UE = 10+16, // Unordered or Equal + FCC_GE = 11+16, // Greater or Equal + FCC_UGE = 12+16, // Unordered or Greater or Equal + FCC_LE = 13+16, // Less or Equal + FCC_ULE = 14+16, // Unordered or Less or Equal + FCC_O = 15+16 // Ordered + }; + } + + inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) { + switch (CC) { + default: assert(0 && "Unknown condition code"); + case SPCC::ICC_NE: return "ne"; + case SPCC::ICC_E: return "e"; + case SPCC::ICC_G: return "g"; + case SPCC::ICC_LE: return "le"; + case SPCC::ICC_GE: return "ge"; + case SPCC::ICC_L: return "l"; + case SPCC::ICC_GU: return "gu"; + case SPCC::ICC_LEU: return "leu"; + case SPCC::ICC_CC: return "cc"; + case SPCC::ICC_CS: return "cs"; + case SPCC::ICC_POS: return "pos"; + case SPCC::ICC_NEG: return "neg"; + case SPCC::ICC_VC: return "vc"; + 
case SPCC::ICC_VS: return "vs"; + case SPCC::FCC_U: return "u"; + case SPCC::FCC_G: return "g"; + case SPCC::FCC_UG: return "ug"; + case SPCC::FCC_L: return "l"; + case SPCC::FCC_UL: return "ul"; + case SPCC::FCC_LG: return "lg"; + case SPCC::FCC_NE: return "ne"; + case SPCC::FCC_E: return "e"; + case SPCC::FCC_UE: return "ue"; + case SPCC::FCC_GE: return "ge"; + case SPCC::FCC_UGE: return "uge"; + case SPCC::FCC_LE: return "le"; + case SPCC::FCC_ULE: return "ule"; + case SPCC::FCC_O: return "o"; + } + } +} // end namespace llvm +#endif diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td new file mode 100644 index 0000000..1646e0e --- /dev/null +++ b/lib/Target/Sparc/Sparc.td @@ -0,0 +1,80 @@ +//===- Sparc.td - Describe the Sparc Target Machine -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "../Target.td" + +//===----------------------------------------------------------------------===// +// SPARC Subtarget features. 
+// + +def FeatureV9 + : SubtargetFeature<"v9", "IsV9", "true", + "Enable SPARC-V9 instructions">; +def FeatureV8Deprecated + : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true", + "Enable deprecated V8 instructions in V9 mode">; +def FeatureVIS + : SubtargetFeature<"vis", "IsVIS", "true", + "Enable UltraSPARC Visual Instruction Set extensions">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "SparcRegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "SparcInstrInfo.td" + +def SparcInstrInfo : InstrInfo { + // Define how we want to layout our target-specific information field. + let TSFlagsFields = []; + let TSFlagsShifts = []; +} + +//===----------------------------------------------------------------------===// +// SPARC processors supported. 
+//===----------------------------------------------------------------------===// + +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +def : Proc<"generic", []>; +def : Proc<"v8", []>; +def : Proc<"supersparc", []>; +def : Proc<"sparclite", []>; +def : Proc<"f934", []>; +def : Proc<"hypersparc", []>; +def : Proc<"sparclite86x", []>; +def : Proc<"sparclet", []>; +def : Proc<"tsc701", []>; +def : Proc<"v9", [FeatureV9]>; +def : Proc<"ultrasparc", [FeatureV9, FeatureV8Deprecated]>; +def : Proc<"ultrasparc3", [FeatureV9, FeatureV8Deprecated]>; +def : Proc<"ultrasparc3-vis", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>; + + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def Sparc : Target { + // Pull in Instruction Info: + let InstructionSet = SparcInstrInfo; +} diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp new file mode 100644 index 0000000..1f82326 --- /dev/null +++ b/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -0,0 +1,291 @@ +//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format SPARC assembly language. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "Sparc.h" +#include "SparcInstrInfo.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Mangler.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/MathExtras.h" +#include <cctype> +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +namespace { + struct VISIBILITY_HIDDEN SparcAsmPrinter : public AsmPrinter { + SparcAsmPrinter(std::ostream &O, TargetMachine &TM, const TargetAsmInfo *T) + : AsmPrinter(O, TM, T) { + } + + /// We name each basic block in a Function with a unique number, so + /// that we can consistently refer to them later. This is cleared + /// at the beginning of each call to runOnMachineFunction(). + /// + typedef std::map<const Value *, unsigned> ValueMapTy; + ValueMapTy NumberForBB; + + virtual const char *getPassName() const { + return "Sparc Assembly Printer"; + } + + void printOperand(const MachineInstr *MI, int opNum); + void printMemOperand(const MachineInstr *MI, int opNum, + const char *Modifier = 0); + void printCCOperand(const MachineInstr *MI, int opNum); + + bool printInstruction(const MachineInstr *MI); // autogenerated. 
+ bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + }; +} // end of anonymous namespace + +#include "SparcGenAsmWriter.inc" + +/// createSparcCodePrinterPass - Returns a pass that prints the SPARC +/// assembly code for a MachineFunction to the given output stream, +/// using the given target machine description. This should work +/// regardless of whether the function is in SSA form. +/// +FunctionPass *llvm::createSparcCodePrinterPass(std::ostream &o, + TargetMachine &tm) { + return new SparcAsmPrinter(o, tm, tm.getTargetAsmInfo()); +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +/// +bool SparcAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SetupMachineFunction(MF); + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + // BBNumber is used here so that a given Printer will never give two + // BBs the same name. (If you have a better way, please let me know!) + static unsigned BBNumber = 0; + + O << "\n\n"; + // What's my mangled name? + CurrentFnName = Mang->getValueName(MF.getFunction()); + + // Print out the label for the function. + const Function *F = MF.getFunction(); + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + EmitAlignment(4, F); + O << "\t.globl\t" << CurrentFnName << "\n"; + O << "\t.type\t" << CurrentFnName << ", #function\n"; + O << CurrentFnName << ":\n"; + + // Number each basic block so that we can consistently refer to them + // in PC-relative references. + // FIXME: Why not use the MBB numbers? + NumberForBB.clear(); + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + NumberForBB[I->getBasicBlock()] = BBNumber++; + } + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block. 
+ if (I != MF.begin()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printInstruction(II); + ++EmittedInsts; + } + } + + // We didn't modify anything. + return false; +} + +void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum) { + const MachineOperand &MO = MI->getOperand (opNum); + const MRegisterInfo &RI = *TM.getRegisterInfo(); + bool CloseParen = false; + if (MI->getOpcode() == SP::SETHIi && !MO.isRegister() && !MO.isImmediate()) { + O << "%hi("; + CloseParen = true; + } else if ((MI->getOpcode() == SP::ORri || MI->getOpcode() == SP::ADDri) + && !MO.isRegister() && !MO.isImmediate()) { + O << "%lo("; + CloseParen = true; + } + switch (MO.getType()) { + case MachineOperand::MO_Register: + if (MRegisterInfo::isPhysicalRegister(MO.getReg())) + O << "%" << LowercaseString (RI.get(MO.getReg()).Name); + else + O << "%reg" << MO.getReg(); + break; + + case MachineOperand::MO_Immediate: + O << (int)MO.getImmedValue(); + break; + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + case MachineOperand::MO_GlobalAddress: + O << Mang->getValueName(MO.getGlobal()); + break; + case MachineOperand::MO_ExternalSymbol: + O << MO.getSymbolName(); + break; + case MachineOperand::MO_ConstantPoolIndex: + O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" + << MO.getConstantPoolIndex(); + break; + default: + O << "<unknown operand type>"; abort (); break; + } + if (CloseParen) O << ")"; +} + +void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, + const char *Modifier) { + printOperand(MI, opNum); + + // If this is an ADD operand, emit it like normal operands. 
+ if (Modifier && !strcmp(Modifier, "arith")) { + O << ", "; + printOperand(MI, opNum+1); + return; + } + + if (MI->getOperand(opNum+1).isRegister() && + MI->getOperand(opNum+1).getReg() == SP::G0) + return; // don't print "+%g0" + if (MI->getOperand(opNum+1).isImmediate() && + MI->getOperand(opNum+1).getImmedValue() == 0) + return; // don't print "+0" + + O << "+"; + if (MI->getOperand(opNum+1).isGlobalAddress() || + MI->getOperand(opNum+1).isConstantPoolIndex()) { + O << "%lo("; + printOperand(MI, opNum+1); + O << ")"; + } else { + printOperand(MI, opNum+1); + } +} + +void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) { + int CC = (int)MI->getOperand(opNum).getImmedValue(); + O << SPARCCondCodeToString((SPCC::CondCodes)CC); +} + + + +bool SparcAsmPrinter::doInitialization(Module &M) { + Mang = new Mangler(M); + return false; // success +} + +bool SparcAsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) + if (I->hasInitializer()) { // External globals require no code + // Check to see if this is a special global used by LLVM, if so, emit it. + if (EmitSpecialLLVMGlobal(I)) + continue; + + O << "\n\n"; + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + unsigned Size = TD->getTypeSize(C->getType()); + unsigned Align = TD->getPrefTypeAlignment(C->getType()); + + if (C->isNullValue() && + (I->hasLinkOnceLinkage() || I->hasInternalLinkage() || + I->hasWeakLinkage() /* FIXME: Verify correct */)) { + SwitchToDataSection(".data", I); + if (I->hasInternalLinkage()) + O << "\t.local " << name << "\n"; + + O << "\t.comm " << name << "," << TD->getTypeSize(C->getType()) + << "," << Align; + O << "\n"; + } else { + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: // FIXME: Verify correct for weak. 
+ // Nonnull linkonce -> weak + O << "\t.weak " << name << "\n"; + SwitchToDataSection("", I); + O << "\t.section\t\".llvm.linkonce.d." << name + << "\",\"aw\",@progbits\n"; + break; + + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. + case GlobalValue::ExternalLinkage: + // If external or appending, declare as a global symbol + O << "\t.globl " << name << "\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: + if (C->isNullValue()) + SwitchToDataSection(".bss", I); + else + SwitchToDataSection(".data", I); + break; + case GlobalValue::GhostLinkage: + cerr << "Should not have any unmaterialized functions!\n"; + abort(); + case GlobalValue::DLLImportLinkage: + cerr << "DLLImport linkage is not supported by this target!\n"; + abort(); + case GlobalValue::DLLExportLinkage: + cerr << "DLLExport linkage is not supported by this target!\n"; + abort(); + default: + assert(0 && "Unknown linkage type!"); + } + + O << "\t.align " << Align << "\n"; + O << "\t.type " << name << ",#object\n"; + O << "\t.size " << name << "," << Size << "\n"; + O << name << ":\n"; + EmitGlobalConstant(C); + } + } + + AsmPrinter::doFinalization(M); + return false; // success +} diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp new file mode 100644 index 0000000..8c8b3f8 --- /dev/null +++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -0,0 +1,1140 @@ +//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the SPARC target. 
+// +//===----------------------------------------------------------------------===// + +#include "Sparc.h" +#include "SparcTargetMachine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/CodeGen/SSARegMap.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Support/Debug.h" +#include <queue> +#include <set> +using namespace llvm; + +//===----------------------------------------------------------------------===// +// TargetLowering Implementation +//===----------------------------------------------------------------------===// + +namespace SPISD { + enum { + FIRST_NUMBER = ISD::BUILTIN_OP_END+SP::INSTRUCTION_LIST_END, + CMPICC, // Compare two GPR operands, set icc. + CMPFCC, // Compare two FP operands, set fcc. + BRICC, // Branch to dest on icc condition + BRFCC, // Branch to dest on fcc condition + SELECT_ICC, // Select between two values using the current ICC flags. + SELECT_FCC, // Select between two values using the current FCC flags. + + Hi, Lo, // Hi/Lo operations, typically on a global address. + + FTOI, // FP to Int within a FP register. + ITOF, // Int to FP within a FP register. + + CALL, // A call instruction. + RET_FLAG // Return with a flag operand. + }; +} + +/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC +/// condition. 
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) { + switch (CC) { + default: assert(0 && "Unknown integer condition code!"); + case ISD::SETEQ: return SPCC::ICC_E; + case ISD::SETNE: return SPCC::ICC_NE; + case ISD::SETLT: return SPCC::ICC_L; + case ISD::SETGT: return SPCC::ICC_G; + case ISD::SETLE: return SPCC::ICC_LE; + case ISD::SETGE: return SPCC::ICC_GE; + case ISD::SETULT: return SPCC::ICC_CS; + case ISD::SETULE: return SPCC::ICC_LEU; + case ISD::SETUGT: return SPCC::ICC_GU; + case ISD::SETUGE: return SPCC::ICC_CC; + } +} + +/// FPCondCCodeToFCC - Convert a DAG floating point condition code to a SPARC +/// FCC condition. +static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) { + switch (CC) { + default: assert(0 && "Unknown fp condition code!"); + case ISD::SETEQ: + case ISD::SETOEQ: return SPCC::FCC_E; + case ISD::SETNE: + case ISD::SETUNE: return SPCC::FCC_NE; + case ISD::SETLT: + case ISD::SETOLT: return SPCC::FCC_L; + case ISD::SETGT: + case ISD::SETOGT: return SPCC::FCC_G; + case ISD::SETLE: + case ISD::SETOLE: return SPCC::FCC_LE; + case ISD::SETGE: + case ISD::SETOGE: return SPCC::FCC_GE; + case ISD::SETULT: return SPCC::FCC_UL; + case ISD::SETULE: return SPCC::FCC_ULE; + case ISD::SETUGT: return SPCC::FCC_UG; + case ISD::SETUGE: return SPCC::FCC_UGE; + case ISD::SETUO: return SPCC::FCC_U; + case ISD::SETO: return SPCC::FCC_O; + case ISD::SETONE: return SPCC::FCC_LG; + case ISD::SETUEQ: return SPCC::FCC_UE; + } +} + +namespace { + class SparcTargetLowering : public TargetLowering { + int VarArgsFrameOffset; // Frame offset to start of varargs area. + public: + SparcTargetLowering(TargetMachine &TM); + virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG); + + /// computeMaskedBitsForTargetNode - Determine which of the bits specified + /// in Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. 
+ virtual void computeMaskedBitsForTargetNode(const SDOperand Op, + uint64_t Mask, + uint64_t &KnownZero, + uint64_t &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + virtual std::vector<SDOperand> + LowerArguments(Function &F, SelectionDAG &DAG); + virtual std::pair<SDOperand, SDOperand> + LowerCallTo(SDOperand Chain, const Type *RetTy, bool RetTyIsSigned, + bool isVarArg, unsigned CC, bool isTailCall, SDOperand Callee, + ArgListTy &Args, SelectionDAG &DAG); + virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *MBB); + + virtual const char *getTargetNodeName(unsigned Opcode) const; + }; +} + +SparcTargetLowering::SparcTargetLowering(TargetMachine &TM) + : TargetLowering(TM) { + + // Set up the register classes. + addRegisterClass(MVT::i32, SP::IntRegsRegisterClass); + addRegisterClass(MVT::f32, SP::FPRegsRegisterClass); + addRegisterClass(MVT::f64, SP::DFPRegsRegisterClass); + + // Turn FP extload into load/fextend + setLoadXAction(ISD::EXTLOAD, MVT::f32, Expand); + + // Custom legalize GlobalAddress nodes into LO/HI parts. + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool , MVT::i32, Custom); + + // Sparc doesn't have sext_inreg, replace them with shl/sra + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + + // Sparc has no REM operation. 
+ setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + + // Custom expand fp<->sint + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + + // Expand fp<->uint + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); + + setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand); + setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand); + + // Sparc has no select or setcc: expand to SELECT_CC. + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::SETCC, MVT::f64, Expand); + + // Sparc doesn't have BRCOND either, it has BR_CC. + setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::f64, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + + // SPARC has no intrinsics for these particular operations. 
+ setOperationAction(ISD::MEMMOVE, MVT::Other, Expand); + setOperationAction(ISD::MEMSET, MVT::Other, Expand); + setOperationAction(ISD::MEMCPY, MVT::Other, Expand); + + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FREM , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FREM , MVT::f32, Expand); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::CTTZ , MVT::i32, Expand); + setOperationAction(ISD::CTLZ , MVT::i32, Expand); + setOperationAction(ISD::ROTL , MVT::i32, Expand); + setOperationAction(ISD::ROTR , MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + + setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); + + // We don't have line number support yet. + setOperationAction(ISD::LOCATION, MVT::Other, Expand); + setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand); + setOperationAction(ISD::LABEL, MVT::Other, Expand); + + // RET must be custom lowered, to meet ABI requirements + setOperationAction(ISD::RET , MVT::Other, Custom); + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex. + setOperationAction(ISD::VASTART , MVT::Other, Custom); + // VAARG needs to be lowered to not do unaligned accesses for doubles. + setOperationAction(ISD::VAARG , MVT::Other, Custom); + + // Use the default implementation. 
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+
+  setStackPointerRegisterToSaveRestore(SP::O6);
+
+  if (TM.getSubtarget<SparcSubtarget>().isV9()) {
+    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+  }
+
+  computeRegisterProperties();
+}
+
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case SPISD::CMPICC:     return "SPISD::CMPICC";
+  case SPISD::CMPFCC:     return "SPISD::CMPFCC";
+  case SPISD::BRICC:      return "SPISD::BRICC";
+  case SPISD::BRFCC:      return "SPISD::BRFCC";
+  case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
+  case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
+  case SPISD::Hi:         return "SPISD::Hi";
+  case SPISD::Lo:         return "SPISD::Lo";
+  case SPISD::FTOI:       return "SPISD::FTOI";
+  case SPISD::ITOF:       return "SPISD::ITOF";
+  case SPISD::CALL:       return "SPISD::CALL";
+  case SPISD::RET_FLAG:   return "SPISD::RET_FLAG";
+  }
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets. Op is expected to be a target specific node.
+/// Used by the DAG combiner.
+void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                         uint64_t Mask,
+                                                         uint64_t &KnownZero,
+                                                         uint64_t &KnownOne,
+                                                         const SelectionDAG &DAG,
+                                                         unsigned Depth) const {
+  uint64_t KnownZero2, KnownOne2;
+  KnownZero = KnownOne = 0;   // Don't know anything.
+ + switch (Op.getOpcode()) { + default: break; + case SPISD::SELECT_ICC: + case SPISD::SELECT_FCC: + DAG.ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, + Depth+1); + DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2, + Depth+1); + assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + + // Only known if known in both the LHS and RHS. + KnownOne &= KnownOne2; + KnownZero &= KnownZero2; + break; + } +} + +/// LowerArguments - V8 uses a very simple ABI, where all values are passed in +/// either one or two GPRs, including FP values. TODO: we should pass FP values +/// in FP registers for fastcc functions. +std::vector<SDOperand> +SparcTargetLowering::LowerArguments(Function &F, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + SSARegMap *RegMap = MF.getSSARegMap(); + std::vector<SDOperand> ArgValues; + + static const unsigned ArgRegs[] = { + SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 + }; + + const unsigned *CurArgReg = ArgRegs, *ArgRegEnd = ArgRegs+6; + unsigned ArgOffset = 68; + + SDOperand Root = DAG.getRoot(); + std::vector<SDOperand> OutChains; + + for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) { + MVT::ValueType ObjectVT = getValueType(I->getType()); + + switch (ObjectVT) { + default: assert(0 && "Unhandled argument type!"); + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + if (I->use_empty()) { // Argument is dead. 
+ if (CurArgReg < ArgRegEnd) ++CurArgReg; + ArgValues.push_back(DAG.getNode(ISD::UNDEF, ObjectVT)); + } else if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR + unsigned VReg = RegMap->createVirtualRegister(&SP::IntRegsRegClass); + MF.addLiveIn(*CurArgReg++, VReg); + SDOperand Arg = DAG.getCopyFromReg(Root, VReg, MVT::i32); + if (ObjectVT != MVT::i32) { + unsigned AssertOp = ISD::AssertSext; + Arg = DAG.getNode(AssertOp, MVT::i32, Arg, + DAG.getValueType(ObjectVT)); + Arg = DAG.getNode(ISD::TRUNCATE, ObjectVT, Arg); + } + ArgValues.push_back(Arg); + } else { + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset); + SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); + SDOperand Load; + if (ObjectVT == MVT::i32) { + Load = DAG.getLoad(MVT::i32, Root, FIPtr, NULL, 0); + } else { + ISD::LoadExtType LoadOp = ISD::SEXTLOAD; + + // Sparc is big endian, so add an offset based on the ObjectVT. + unsigned Offset = 4-std::max(1U, MVT::getSizeInBits(ObjectVT)/8); + FIPtr = DAG.getNode(ISD::ADD, MVT::i32, FIPtr, + DAG.getConstant(Offset, MVT::i32)); + Load = DAG.getExtLoad(LoadOp, MVT::i32, Root, FIPtr, + NULL, 0, ObjectVT); + Load = DAG.getNode(ISD::TRUNCATE, ObjectVT, Load); + } + ArgValues.push_back(Load); + } + + ArgOffset += 4; + break; + case MVT::f32: + if (I->use_empty()) { // Argument is dead. + if (CurArgReg < ArgRegEnd) ++CurArgReg; + ArgValues.push_back(DAG.getNode(ISD::UNDEF, ObjectVT)); + } else if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR + // FP value is passed in an integer register. 
+ unsigned VReg = RegMap->createVirtualRegister(&SP::IntRegsRegClass); + MF.addLiveIn(*CurArgReg++, VReg); + SDOperand Arg = DAG.getCopyFromReg(Root, VReg, MVT::i32); + + Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Arg); + ArgValues.push_back(Arg); + } else { + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset); + SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); + SDOperand Load = DAG.getLoad(MVT::f32, Root, FIPtr, NULL, 0); + ArgValues.push_back(Load); + } + ArgOffset += 4; + break; + + case MVT::i64: + case MVT::f64: + if (I->use_empty()) { // Argument is dead. + if (CurArgReg < ArgRegEnd) ++CurArgReg; + if (CurArgReg < ArgRegEnd) ++CurArgReg; + ArgValues.push_back(DAG.getNode(ISD::UNDEF, ObjectVT)); + } else if (/* FIXME: Apparently this isn't safe?? */ + 0 && CurArgReg == ArgRegEnd && ObjectVT == MVT::f64 && + ((CurArgReg-ArgRegs) & 1) == 0) { + // If this is a double argument and the whole thing lives on the stack, + // and the argument is aligned, load the double straight from the stack. + // We can't do a load in cases like void foo([6ints], int,double), + // because the double wouldn't be aligned! 
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset); + SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); + ArgValues.push_back(DAG.getLoad(MVT::f64, Root, FIPtr, NULL, 0)); + } else { + SDOperand HiVal; + if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR + unsigned VRegHi = RegMap->createVirtualRegister(&SP::IntRegsRegClass); + MF.addLiveIn(*CurArgReg++, VRegHi); + HiVal = DAG.getCopyFromReg(Root, VRegHi, MVT::i32); + } else { + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset); + SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); + HiVal = DAG.getLoad(MVT::i32, Root, FIPtr, NULL, 0); + } + + SDOperand LoVal; + if (CurArgReg < ArgRegEnd) { // Lives in an incoming GPR + unsigned VRegLo = RegMap->createVirtualRegister(&SP::IntRegsRegClass); + MF.addLiveIn(*CurArgReg++, VRegLo); + LoVal = DAG.getCopyFromReg(Root, VRegLo, MVT::i32); + } else { + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset+4); + SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); + LoVal = DAG.getLoad(MVT::i32, Root, FIPtr, NULL, 0); + } + + // Compose the two halves together into an i64 unit. + SDOperand WholeValue = + DAG.getNode(ISD::BUILD_PAIR, MVT::i64, LoVal, HiVal); + + // If we want a double, do a bit convert. + if (ObjectVT == MVT::f64) + WholeValue = DAG.getNode(ISD::BIT_CONVERT, MVT::f64, WholeValue); + + ArgValues.push_back(WholeValue); + } + ArgOffset += 8; + break; + } + } + + // Store remaining ArgRegs to the stack if this is a varargs function. + if (F.getFunctionType()->isVarArg()) { + // Remember the vararg offset for the va_start implementation. 
+ VarArgsFrameOffset = ArgOffset; + + for (; CurArgReg != ArgRegEnd; ++CurArgReg) { + unsigned VReg = RegMap->createVirtualRegister(&SP::IntRegsRegClass); + MF.addLiveIn(*CurArgReg, VReg); + SDOperand Arg = DAG.getCopyFromReg(DAG.getRoot(), VReg, MVT::i32); + + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset); + SDOperand FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32); + + OutChains.push_back(DAG.getStore(DAG.getRoot(), Arg, FIPtr, NULL, 0)); + ArgOffset += 4; + } + } + + if (!OutChains.empty()) + DAG.setRoot(DAG.getNode(ISD::TokenFactor, MVT::Other, + &OutChains[0], OutChains.size())); + + // Finally, inform the code generator which regs we return values in. + switch (getValueType(F.getReturnType())) { + default: assert(0 && "Unknown type!"); + case MVT::isVoid: break; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + MF.addLiveOut(SP::I0); + break; + case MVT::i64: + MF.addLiveOut(SP::I0); + MF.addLiveOut(SP::I1); + break; + case MVT::f32: + MF.addLiveOut(SP::F0); + break; + case MVT::f64: + MF.addLiveOut(SP::D0); + break; + } + + return ArgValues; +} + +std::pair<SDOperand, SDOperand> +SparcTargetLowering::LowerCallTo(SDOperand Chain, const Type *RetTy, + bool RetTyIsSigned, bool isVarArg, unsigned CC, + bool isTailCall, SDOperand Callee, + ArgListTy &Args, SelectionDAG &DAG) { + // Count the size of the outgoing arguments. + unsigned ArgsSize = 0; + for (unsigned i = 0, e = Args.size(); i != e; ++i) { + switch (getValueType(Args[i].Ty)) { + default: assert(0 && "Unknown value type!"); + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::f32: + ArgsSize += 4; + break; + case MVT::i64: + case MVT::f64: + ArgsSize += 8; + break; + } + } + if (ArgsSize > 4*6) + ArgsSize -= 4*6; // Space for first 6 arguments is prereserved. + else + ArgsSize = 0; + + // Keep stack frames 8-byte aligned. 
+ ArgsSize = (ArgsSize+7) & ~7; + + Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(ArgsSize, getPointerTy())); + + SDOperand StackPtr; + std::vector<SDOperand> Stores; + std::vector<SDOperand> RegValuesToPass; + unsigned ArgOffset = 68; + for (unsigned i = 0, e = Args.size(); i != e; ++i) { + SDOperand Val = Args[i].Node; + MVT::ValueType ObjectVT = Val.getValueType(); + SDOperand ValToStore(0, 0); + unsigned ObjSize; + switch (ObjectVT) { + default: assert(0 && "Unhandled argument type!"); + case MVT::i1: + case MVT::i8: + case MVT::i16: { + // Promote the integer to 32-bits. If the input type is signed, use a + // sign extend, otherwise use a zero extend. + ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + if (Args[i].isSExt) + ExtendKind = ISD::SIGN_EXTEND; + else if (Args[i].isZExt) + ExtendKind = ISD::ZERO_EXTEND; + Val = DAG.getNode(ExtendKind, MVT::i32, Val); + // FALL THROUGH + } + case MVT::i32: + ObjSize = 4; + + if (RegValuesToPass.size() >= 6) { + ValToStore = Val; + } else { + RegValuesToPass.push_back(Val); + } + break; + case MVT::f32: + ObjSize = 4; + if (RegValuesToPass.size() >= 6) { + ValToStore = Val; + } else { + // Convert this to a FP value in an int reg. + Val = DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Val); + RegValuesToPass.push_back(Val); + } + break; + case MVT::f64: + ObjSize = 8; + // If we can store this directly into the outgoing slot, do so. We can + // do this when all ArgRegs are used and if the outgoing slot is aligned. + // FIXME: McGill/misr fails with this. + if (0 && RegValuesToPass.size() >= 6 && ((ArgOffset-68) & 7) == 0) { + ValToStore = Val; + break; + } + + // Otherwise, convert this to a FP value in int regs. + Val = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Val); + // FALL THROUGH + case MVT::i64: + ObjSize = 8; + if (RegValuesToPass.size() >= 6) { + ValToStore = Val; // Whole thing is passed in memory. + break; + } + + // Split the value into top and bottom part. Top part goes in a reg. 
+      SDOperand Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, getPointerTy(), Val,
+                                 DAG.getConstant(1, MVT::i32));
+      SDOperand Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, getPointerTy(), Val,
+                                 DAG.getConstant(0, MVT::i32));
+      RegValuesToPass.push_back(Hi);
+
+      if (RegValuesToPass.size() >= 6) {
+        ValToStore = Lo;
+        ArgOffset += 4;
+        ObjSize = 4;
+      } else {
+        RegValuesToPass.push_back(Lo);
+      }
+      break;
+    }
+
+    if (ValToStore.Val) {
+      if (!StackPtr.Val) {
+        StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+      }
+      SDOperand PtrOff = DAG.getConstant(ArgOffset, getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
+      Stores.push_back(DAG.getStore(Chain, ValToStore, PtrOff, NULL, 0));
+    }
+    ArgOffset += ObjSize;
+  }
+
+  // Emit all stores, make sure they occur before any copies into physregs.
+  if (!Stores.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Stores[0],Stores.size());
+
+  static const unsigned ArgRegs[] = {
+    SP::O0, SP::O1, SP::O2, SP::O3, SP::O4, SP::O5
+  };
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into O[0-5].
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegValuesToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, ArgRegs[i], RegValuesToPass[i], InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  std::vector<MVT::ValueType> NodeTys;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+ SDOperand Ops[] = { Chain, Callee, InFlag }; + Chain = DAG.getNode(SPISD::CALL, NodeTys, Ops, InFlag.Val ? 3 : 2); + InFlag = Chain.getValue(1); + + MVT::ValueType RetTyVT = getValueType(RetTy); + SDOperand RetVal; + if (RetTyVT != MVT::isVoid) { + switch (RetTyVT) { + default: assert(0 && "Unknown value type to return!"); + case MVT::i1: + case MVT::i8: + case MVT::i16: { + RetVal = DAG.getCopyFromReg(Chain, SP::O0, MVT::i32, InFlag); + Chain = RetVal.getValue(1); + + // Add a note to keep track of whether it is sign or zero extended. + ISD::NodeType AssertKind = ISD::AssertZext; + if (RetTyIsSigned) + AssertKind = ISD::AssertSext; + RetVal = DAG.getNode(AssertKind, MVT::i32, RetVal, + DAG.getValueType(RetTyVT)); + RetVal = DAG.getNode(ISD::TRUNCATE, RetTyVT, RetVal); + break; + } + case MVT::i32: + RetVal = DAG.getCopyFromReg(Chain, SP::O0, MVT::i32, InFlag); + Chain = RetVal.getValue(1); + break; + case MVT::f32: + RetVal = DAG.getCopyFromReg(Chain, SP::F0, MVT::f32, InFlag); + Chain = RetVal.getValue(1); + break; + case MVT::f64: + RetVal = DAG.getCopyFromReg(Chain, SP::D0, MVT::f64, InFlag); + Chain = RetVal.getValue(1); + break; + case MVT::i64: + SDOperand Lo = DAG.getCopyFromReg(Chain, SP::O1, MVT::i32, InFlag); + SDOperand Hi = DAG.getCopyFromReg(Lo.getValue(1), SP::O0, MVT::i32, + Lo.getValue(2)); + RetVal = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi); + Chain = Hi.getValue(1); + break; + } + } + + Chain = DAG.getNode(ISD::CALLSEQ_END, MVT::Other, Chain, + DAG.getConstant(ArgsSize, getPointerTy())); + + return std::make_pair(RetVal, Chain); +} + +// Look at LHS/RHS/CC and see if they are a lowered setcc instruction. If so +// set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition. 
+static void LookThroughSetCC(SDOperand &LHS, SDOperand &RHS, + ISD::CondCode CC, unsigned &SPCC) { + if (isa<ConstantSDNode>(RHS) && cast<ConstantSDNode>(RHS)->getValue() == 0 && + CC == ISD::SETNE && + ((LHS.getOpcode() == SPISD::SELECT_ICC && + LHS.getOperand(3).getOpcode() == SPISD::CMPICC) || + (LHS.getOpcode() == SPISD::SELECT_FCC && + LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) && + isa<ConstantSDNode>(LHS.getOperand(0)) && + isa<ConstantSDNode>(LHS.getOperand(1)) && + cast<ConstantSDNode>(LHS.getOperand(0))->getValue() == 1 && + cast<ConstantSDNode>(LHS.getOperand(1))->getValue() == 0) { + SDOperand CMPCC = LHS.getOperand(3); + SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getValue(); + LHS = CMPCC.getOperand(0); + RHS = CMPCC.getOperand(1); + } +} + + +SDOperand SparcTargetLowering:: +LowerOperation(SDOperand Op, SelectionDAG &DAG) { + switch (Op.getOpcode()) { + default: assert(0 && "Should not custom lower this!"); + case ISD::GlobalTLSAddress: + assert(0 && "TLS not implemented for Sparc."); + case ISD::GlobalAddress: { + GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + SDOperand GA = DAG.getTargetGlobalAddress(GV, MVT::i32); + SDOperand Hi = DAG.getNode(SPISD::Hi, MVT::i32, GA); + SDOperand Lo = DAG.getNode(SPISD::Lo, MVT::i32, GA); + return DAG.getNode(ISD::ADD, MVT::i32, Lo, Hi); + } + case ISD::ConstantPool: { + Constant *C = cast<ConstantPoolSDNode>(Op)->getConstVal(); + SDOperand CP = DAG.getTargetConstantPool(C, MVT::i32, + cast<ConstantPoolSDNode>(Op)->getAlignment()); + SDOperand Hi = DAG.getNode(SPISD::Hi, MVT::i32, CP); + SDOperand Lo = DAG.getNode(SPISD::Lo, MVT::i32, CP); + return DAG.getNode(ISD::ADD, MVT::i32, Lo, Hi); + } + case ISD::FP_TO_SINT: + // Convert the fp value to integer in an FP register. 
+    assert(Op.getValueType() == MVT::i32);
+    Op = DAG.getNode(SPISD::FTOI, MVT::f32, Op.getOperand(0));
+    return DAG.getNode(ISD::BIT_CONVERT, MVT::i32, Op);
+  case ISD::SINT_TO_FP: {
+    assert(Op.getOperand(0).getValueType() == MVT::i32);
+    SDOperand Tmp = DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Op.getOperand(0));
+    // Convert the int value to FP in an FP register.
+    return DAG.getNode(SPISD::ITOF, Op.getValueType(), Tmp);
+  }
+  case ISD::BR_CC: {
+    SDOperand Chain = Op.getOperand(0);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+    SDOperand LHS = Op.getOperand(2);
+    SDOperand RHS = Op.getOperand(3);
+    SDOperand Dest = Op.getOperand(4);
+    unsigned Opc, SPCC = ~0U;
+
+    // If this is a br_cc of a "setcc", and if the setcc got lowered into
+    // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+    LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+    // Get the condition flag.
+    SDOperand CompareFlag;
+    if (LHS.getValueType() == MVT::i32) {
+      std::vector<MVT::ValueType> VTs;
+      VTs.push_back(MVT::i32);
+      VTs.push_back(MVT::Flag);
+      SDOperand Ops[2] = { LHS, RHS };
+      CompareFlag = DAG.getNode(SPISD::CMPICC, VTs, Ops, 2).getValue(1);
+      if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+      Opc = SPISD::BRICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, MVT::Flag, LHS, RHS);
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      Opc = SPISD::BRFCC;
+    }
+    return DAG.getNode(Opc, MVT::Other, Chain, Dest,
+                       DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+  }
+  case ISD::SELECT_CC: {
+    SDOperand LHS = Op.getOperand(0);
+    SDOperand RHS = Op.getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+    SDOperand TrueVal = Op.getOperand(2);
+    SDOperand FalseVal = Op.getOperand(3);
+    unsigned Opc, SPCC = ~0U;
+
+    // If this is a select_cc of a "setcc", and if the setcc got lowered into
+    // a CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+ LookThroughSetCC(LHS, RHS, CC, SPCC); + + SDOperand CompareFlag; + if (LHS.getValueType() == MVT::i32) { + std::vector<MVT::ValueType> VTs; + VTs.push_back(LHS.getValueType()); // subcc returns a value + VTs.push_back(MVT::Flag); + SDOperand Ops[2] = { LHS, RHS }; + CompareFlag = DAG.getNode(SPISD::CMPICC, VTs, Ops, 2).getValue(1); + Opc = SPISD::SELECT_ICC; + if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC); + } else { + CompareFlag = DAG.getNode(SPISD::CMPFCC, MVT::Flag, LHS, RHS); + Opc = SPISD::SELECT_FCC; + if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC); + } + return DAG.getNode(Opc, TrueVal.getValueType(), TrueVal, FalseVal, + DAG.getConstant(SPCC, MVT::i32), CompareFlag); + } + case ISD::VASTART: { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + SDOperand Offset = DAG.getNode(ISD::ADD, MVT::i32, + DAG.getRegister(SP::I6, MVT::i32), + DAG.getConstant(VarArgsFrameOffset, MVT::i32)); + SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2)); + return DAG.getStore(Op.getOperand(0), Offset, + Op.getOperand(1), SV->getValue(), SV->getOffset()); + } + case ISD::VAARG: { + SDNode *Node = Op.Val; + MVT::ValueType VT = Node->getValueType(0); + SDOperand InChain = Node->getOperand(0); + SDOperand VAListPtr = Node->getOperand(1); + SrcValueSDNode *SV = cast<SrcValueSDNode>(Node->getOperand(2)); + SDOperand VAList = DAG.getLoad(getPointerTy(), InChain, VAListPtr, + SV->getValue(), SV->getOffset()); + // Increment the pointer, VAList, to the next vaarg + SDOperand NextPtr = DAG.getNode(ISD::ADD, getPointerTy(), VAList, + DAG.getConstant(MVT::getSizeInBits(VT)/8, + getPointerTy())); + // Store the incremented VAList to the legalized pointer + InChain = DAG.getStore(VAList.getValue(1), NextPtr, + VAListPtr, SV->getValue(), SV->getOffset()); + // Load the actual argument out of the pointer VAList, unless this is an + // f64 load. 
+ if (VT != MVT::f64) { + return DAG.getLoad(VT, InChain, VAList, NULL, 0); + } else { + // Otherwise, load it as i64, then do a bitconvert. + SDOperand V = DAG.getLoad(MVT::i64, InChain, VAList, NULL, 0); + std::vector<MVT::ValueType> Tys; + Tys.push_back(MVT::f64); + Tys.push_back(MVT::Other); + // Bit-Convert the value to f64. + SDOperand Ops[2] = { DAG.getNode(ISD::BIT_CONVERT, MVT::f64, V), + V.getValue(1) }; + return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2); + } + } + case ISD::DYNAMIC_STACKALLOC: { + SDOperand Chain = Op.getOperand(0); // Legalize the chain. + SDOperand Size = Op.getOperand(1); // Legalize the size. + + unsigned SPReg = SP::O6; + SDOperand SP = DAG.getCopyFromReg(Chain, SPReg, MVT::i32); + SDOperand NewSP = DAG.getNode(ISD::SUB, MVT::i32, SP, Size); // Value + Chain = DAG.getCopyToReg(SP.getValue(1), SPReg, NewSP); // Output chain + + // The resultant pointer is actually 16 words from the bottom of the stack, + // to provide a register spill area. + SDOperand NewVal = DAG.getNode(ISD::ADD, MVT::i32, NewSP, + DAG.getConstant(96, MVT::i32)); + std::vector<MVT::ValueType> Tys; + Tys.push_back(MVT::i32); + Tys.push_back(MVT::Other); + SDOperand Ops[2] = { NewVal, Chain }; + return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2); + } + case ISD::RET: { + SDOperand Copy; + + switch(Op.getNumOperands()) { + default: + assert(0 && "Do not know how to return this many arguments!"); + abort(); + case 1: + return SDOperand(); // ret void is legal + case 3: { + unsigned ArgReg; + switch(Op.getOperand(1).getValueType()) { + default: assert(0 && "Unknown type to return!"); + case MVT::i32: ArgReg = SP::I0; break; + case MVT::f32: ArgReg = SP::F0; break; + case MVT::f64: ArgReg = SP::D0; break; + } + Copy = DAG.getCopyToReg(Op.getOperand(0), ArgReg, Op.getOperand(1), + SDOperand()); + break; + } + case 5: + Copy = DAG.getCopyToReg(Op.getOperand(0), SP::I0, Op.getOperand(3), + SDOperand()); + Copy = DAG.getCopyToReg(Copy, SP::I1, Op.getOperand(1), 
Copy.getValue(1)); + break; + } + return DAG.getNode(SPISD::RET_FLAG, MVT::Other, Copy, Copy.getValue(1)); + } + // Frame & Return address. Currently unimplemented + case ISD::RETURNADDR: break; + case ISD::FRAMEADDR: break; + } + return SDOperand(); +} + +MachineBasicBlock * +SparcTargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *BB) { + const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo(); + unsigned BROpcode; + unsigned CC; + // Figure out the conditional branch opcode to use for this select_cc. + switch (MI->getOpcode()) { + default: assert(0 && "Unknown SELECT_CC!"); + case SP::SELECT_CC_Int_ICC: + case SP::SELECT_CC_FP_ICC: + case SP::SELECT_CC_DFP_ICC: + BROpcode = SP::BCOND; + break; + case SP::SELECT_CC_Int_FCC: + case SP::SELECT_CC_FP_FCC: + case SP::SELECT_CC_DFP_FCC: + BROpcode = SP::FBCOND; + break; + } + + CC = (SPCC::CondCodes)MI->getOperand(3).getImmedValue(); + + // To "insert" a SELECT_CC instruction, we actually have to insert the diamond + // control-flow pattern. The incoming instruction knows the destination vreg + // to set, the condition code register to branch on, the true/false values to + // select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + ilist<MachineBasicBlock>::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // [f]bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB); + BuildMI(BB, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC); + MachineFunction *F = BB->getParent(); + F->getBasicBlockList().insert(It, copy0MBB); + F->getBasicBlockList().insert(It, sinkMBB); + // Update machine-CFG edges by first adding all successors of the current + // block to the new block which will contain the Phi node for the select. 
+ for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), + e = BB->succ_end(); i != e; ++i) + sinkMBB->addSuccessor(*i); + // Next, remove all successors of the current block, and add the true + // and fallthrough blocks as its successors. + while(!BB->succ_empty()) + BB->removeSuccessor(BB->succ_begin()); + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(BB, TII.get(SP::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB); + + delete MI; // The pseudo instruction is gone now. + return BB; +} + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------===// +/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine +/// instructions for SelectionDAG operations. +/// +namespace { +class SparcDAGToDAGISel : public SelectionDAGISel { + SparcTargetLowering Lowering; + + /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can + /// make the right decision when generating code for different targets. + const SparcSubtarget &Subtarget; +public: + SparcDAGToDAGISel(TargetMachine &TM) + : SelectionDAGISel(Lowering), Lowering(TM), + Subtarget(TM.getSubtarget<SparcSubtarget>()) { + } + + SDNode *Select(SDOperand Op); + + // Complex Pattern Selectors. 
+ bool SelectADDRrr(SDOperand Op, SDOperand N, SDOperand &R1, SDOperand &R2); + bool SelectADDRri(SDOperand Op, SDOperand N, SDOperand &Base, + SDOperand &Offset); + + /// InstructionSelectBasicBlock - This callback is invoked by + /// SelectionDAGISel when it has created a SelectionDAG for us to codegen. + virtual void InstructionSelectBasicBlock(SelectionDAG &DAG); + + virtual const char *getPassName() const { + return "SPARC DAG->DAG Pattern Instruction Selection"; + } + + // Include the pieces autogenerated from the target description. +#include "SparcGenDAGISel.inc" +}; +} // end anonymous namespace + +/// InstructionSelectBasicBlock - This callback is invoked by +/// SelectionDAGISel when it has created a SelectionDAG for us to codegen. +void SparcDAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) { + DEBUG(BB->dump()); + + // Select target instructions for the DAG. + DAG.setRoot(SelectRoot(DAG.getRoot())); + DAG.RemoveDeadNodes(); + + // Emit machine code to BB. + ScheduleAndEmitDAG(DAG); +} + +bool SparcDAGToDAGISel::SelectADDRri(SDOperand Op, SDOperand Addr, + SDOperand &Base, SDOperand &Offset) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; + } + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // direct calls. + + if (Addr.getOpcode() == ISD::ADD) { + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { + if (Predicate_simm13(CN)) { + if (FrameIndexSDNode *FIN = + dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) { + // Constant offset from frame ref. 
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + } else { + Base = Addr.getOperand(0); + } + Offset = CurDAG->getTargetConstant(CN->getValue(), MVT::i32); + return true; + } + } + if (Addr.getOperand(0).getOpcode() == SPISD::Lo) { + Base = Addr.getOperand(1); + Offset = Addr.getOperand(0).getOperand(0); + return true; + } + if (Addr.getOperand(1).getOpcode() == SPISD::Lo) { + Base = Addr.getOperand(0); + Offset = Addr.getOperand(1).getOperand(0); + return true; + } + } + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool SparcDAGToDAGISel::SelectADDRrr(SDOperand Op, SDOperand Addr, + SDOperand &R1, SDOperand &R2) { + if (Addr.getOpcode() == ISD::FrameIndex) return false; + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) + return false; // direct calls. + + if (Addr.getOpcode() == ISD::ADD) { + if (isa<ConstantSDNode>(Addr.getOperand(1)) && + Predicate_simm13(Addr.getOperand(1).Val)) + return false; // Let the reg+imm pattern catch this! + if (Addr.getOperand(0).getOpcode() == SPISD::Lo || + Addr.getOperand(1).getOpcode() == SPISD::Lo) + return false; // Let the reg+imm pattern catch this! + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + return true; + } + + R1 = Addr; + R2 = CurDAG->getRegister(SP::G0, MVT::i32); + return true; +} + +SDNode *SparcDAGToDAGISel::Select(SDOperand Op) { + SDNode *N = Op.Val; + if (N->getOpcode() >= ISD::BUILTIN_OP_END && + N->getOpcode() < SPISD::FIRST_NUMBER) + return NULL; // Already selected. + + switch (N->getOpcode()) { + default: break; + case ISD::SDIV: + case ISD::UDIV: { + // FIXME: should use a custom expander to expose the SRA to the dag. + SDOperand DivLHS = N->getOperand(0); + SDOperand DivRHS = N->getOperand(1); + AddToISelQueue(DivLHS); + AddToISelQueue(DivRHS); + + // Set the Y register to the high-part. 
+ SDOperand TopPart; + if (N->getOpcode() == ISD::SDIV) { + TopPart = SDOperand(CurDAG->getTargetNode(SP::SRAri, MVT::i32, DivLHS, + CurDAG->getTargetConstant(31, MVT::i32)), 0); + } else { + TopPart = CurDAG->getRegister(SP::G0, MVT::i32); + } + TopPart = SDOperand(CurDAG->getTargetNode(SP::WRYrr, MVT::Flag, TopPart, + CurDAG->getRegister(SP::G0, MVT::i32)), 0); + + // FIXME: Handle div by immediate. + unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr; + return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS, + TopPart); + } + case ISD::MULHU: + case ISD::MULHS: { + // FIXME: Handle mul by immediate. + SDOperand MulLHS = N->getOperand(0); + SDOperand MulRHS = N->getOperand(1); + AddToISelQueue(MulLHS); + AddToISelQueue(MulRHS); + unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr; + SDNode *Mul = CurDAG->getTargetNode(Opcode, MVT::i32, MVT::Flag, + MulLHS, MulRHS); + // The high part is in the Y register. + return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDOperand(Mul, 1)); + return NULL; + } + } + + return SelectCode(Op); +} + + +/// createSparcISelDag - This pass converts a legalized DAG into a +/// SPARC-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createSparcISelDag(TargetMachine &TM) { + return new SparcDAGToDAGISel(TM); +} diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td new file mode 100644 index 0000000..f463ab8 --- /dev/null +++ b/lib/Target/Sparc/SparcInstrFormats.td @@ -0,0 +1,113 @@ +//===- SparcInstrFormats.td - Sparc Instruction Formats ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +class InstSP<dag ops, string asmstr, list<dag> pattern> : Instruction { + field bits<32> Inst; + + let Namespace = "SP"; + + bits<2> op; + let Inst{31-30} = op; // Top two bits are the 'op' field + + dag OperandList = ops; + let AsmString = asmstr; + let Pattern = pattern; +} + +//===----------------------------------------------------------------------===// +// Format #2 instruction classes in the Sparc +//===----------------------------------------------------------------------===// + +// Format 2 instructions +class F2<dag ops, string asmstr, list<dag> pattern> + : InstSP<ops, asmstr, pattern> { + bits<3> op2; + bits<22> imm22; + let op = 0; // op = 0 + let Inst{24-22} = op2; + let Inst{21-0} = imm22; +} + +// Specific F2 classes: SparcV8 manual, page 44 +// +class F2_1<bits<3> op2Val, dag ops, string asmstr, list<dag> pattern> + : F2<ops, asmstr, pattern> { + bits<5> rd; + + let op2 = op2Val; + + let Inst{29-25} = rd; +} + +class F2_2<bits<4> condVal, bits<3> op2Val, dag ops, string asmstr, + list<dag> pattern> : F2<ops, asmstr, pattern> { + bits<4> cond; + bit annul = 0; // currently unused + + let cond = condVal; + let op2 = op2Val; + + let Inst{29} = annul; + let Inst{28-25} = cond; +} + +//===----------------------------------------------------------------------===// +// Format #3 instruction classes in the Sparc +//===----------------------------------------------------------------------===// + +class F3<dag ops, string asmstr, list<dag> pattern> + : InstSP<ops, asmstr, pattern> { + bits<5> rd; + bits<6> op3; + bits<5> rs1; + let op{1} = 1; // Op = 2 or 3 + let Inst{29-25} = rd; + let Inst{24-19} = op3; + let Inst{18-14} = rs1; +} + +// Specific F3 classes: SparcV8 manual, page 44 +// +class F3_1<bits<2> opVal, bits<6> op3val, dag ops, + string asmstr, list<dag> pattern> : F3<ops, asmstr, pattern> { + bits<8> asi = 0; // asi not currently used + bits<5> rs2; + + let op 
= opVal; + let op3 = op3val; + + let Inst{13} = 0; // i field = 0 + let Inst{12-5} = asi; // address space identifier + let Inst{4-0} = rs2; +} + +class F3_2<bits<2> opVal, bits<6> op3val, dag ops, + string asmstr, list<dag> pattern> : F3<ops, asmstr, pattern> { + bits<13> simm13; + + let op = opVal; + let op3 = op3val; + + let Inst{13} = 1; // i field = 1 + let Inst{12-0} = simm13; +} + +// floating-point +class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag ops, + string asmstr, list<dag> pattern> : F3<ops, asmstr, pattern> { + bits<5> rs2; + + let op = opVal; + let op3 = op3val; + + let Inst{13-5} = opfval; // fp opcode + let Inst{4-0} = rs2; +} + + diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp new file mode 100644 index 0000000..a8c822a --- /dev/null +++ b/lib/Target/Sparc/SparcInstrInfo.cpp @@ -0,0 +1,108 @@ +//===- SparcInstrInfo.cpp - Sparc Instruction Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Sparc implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "SparcInstrInfo.h" +#include "Sparc.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "SparcGenInstrInfo.inc" +using namespace llvm; + +SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST) + : TargetInstrInfo(SparcInsts, sizeof(SparcInsts)/sizeof(SparcInsts[0])), + RI(ST, *this) { +} + +static bool isZeroImm(const MachineOperand &op) { + return op.isImmediate() && op.getImmedValue() == 0; +} + +/// Return true if the instruction is a register to register move and +/// leave the source and dest operands in the passed parameters. 
+/// +bool SparcInstrInfo::isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg) const { + // We look for 3 kinds of patterns here: + // or with G0 or 0 + // add with G0 or 0 + // fmovs or FpMOVD (pseudo double move). + if (MI.getOpcode() == SP::ORrr || MI.getOpcode() == SP::ADDrr) { + if (MI.getOperand(1).getReg() == SP::G0) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(2).getReg(); + return true; + } else if (MI.getOperand(2).getReg() == SP::G0) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); + return true; + } + } else if ((MI.getOpcode() == SP::ORri || MI.getOpcode() == SP::ADDri) && + isZeroImm(MI.getOperand(2)) && MI.getOperand(1).isRegister()) { + DstReg = MI.getOperand(0).getReg(); + SrcReg = MI.getOperand(1).getReg(); + return true; + } else if (MI.getOpcode() == SP::FMOVS || MI.getOpcode() == SP::FpMOVD || + MI.getOpcode() == SP::FMOVD) { + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + return true; + } + return false; +} + +/// isLoadFromStackSlot - If the specified machine instruction is a direct +/// load from a stack slot, return the virtual or physical register number of +/// the destination along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than loading from the stack slot. 
+unsigned SparcInstrInfo::isLoadFromStackSlot(MachineInstr *MI, + int &FrameIndex) const { + if (MI->getOpcode() == SP::LDri || + MI->getOpcode() == SP::LDFri || + MI->getOpcode() == SP::LDDFri) { + if (MI->getOperand(1).isFrameIndex() && MI->getOperand(2).isImmediate() && + MI->getOperand(2).getImmedValue() == 0) { + FrameIndex = MI->getOperand(1).getFrameIndex(); + return MI->getOperand(0).getReg(); + } + } + return 0; +} + +/// isStoreToStackSlot - If the specified machine instruction is a direct +/// store to a stack slot, return the virtual or physical register number of +/// the source reg along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than storing to the stack slot. +unsigned SparcInstrInfo::isStoreToStackSlot(MachineInstr *MI, + int &FrameIndex) const { + if (MI->getOpcode() == SP::STri || + MI->getOpcode() == SP::STFri || + MI->getOpcode() == SP::STDFri) { + if (MI->getOperand(0).isFrameIndex() && MI->getOperand(1).isImmediate() && + MI->getOperand(1).getImmedValue() == 0) { + FrameIndex = MI->getOperand(0).getFrameIndex(); + return MI->getOperand(2).getReg(); + } + } + return 0; +} + +unsigned +SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond)const{ + // Can only insert uncond branches so far. 
+ assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!"); + BuildMI(&MBB, get(SP::BA)).addMBB(TBB); + return 1; +} diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h new file mode 100644 index 0000000..3fb50ff --- /dev/null +++ b/lib/Target/Sparc/SparcInstrInfo.h @@ -0,0 +1,73 @@ +//===- SparcInstrInfo.h - Sparc Instruction Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Sparc implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef SPARCINSTRUCTIONINFO_H +#define SPARCINSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "SparcRegisterInfo.h" + +namespace llvm { + +/// SPII - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace SPII { + enum { + Pseudo = (1<<0), + Load = (1<<1), + Store = (1<<2), + DelaySlot = (1<<3) + }; +} + +class SparcInstrInfo : public TargetInstrInfo { + const SparcRegisterInfo RI; +public: + SparcInstrInfo(SparcSubtarget &ST); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const MRegisterInfo &getRegisterInfo() const { return RI; } + + /// Return true if the instruction is a register to register move and + /// leave the source and dest operands in the passed parameters. 
+ /// + virtual bool isMoveInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg) const; + + /// isLoadFromStackSlot - If the specified machine instruction is a direct + /// load from a stack slot, return the virtual or physical register number of + /// the destination along with the FrameIndex of the loaded stack slot. If + /// not, return 0. This predicate must return 0 if the instruction has + /// any side effects other than loading from the stack slot. + virtual unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const; + + /// isStoreToStackSlot - If the specified machine instruction is a direct + /// store to a stack slot, return the virtual or physical register number of + /// the source reg along with the FrameIndex of the loaded stack slot. If + /// not, return 0. This predicate must return 0 if the instruction has + /// any side effects other than storing to the stack slot. + virtual unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const; + + + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const std::vector<MachineOperand> &Cond) const; +}; + +} + +#endif diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td new file mode 100644 index 0000000..434e8d7 --- /dev/null +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -0,0 +1,776 @@ +//===- SparcInstrInfo.td - Target Description for Sparc Target ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Sparc instructions in TableGen format. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// + +include "SparcInstrFormats.td" + +//===----------------------------------------------------------------------===// +// Feature predicates. +//===----------------------------------------------------------------------===// + +// HasV9 - This predicate is true when the target processor supports V9 +// instructions. Note that the machine may be running in 32-bit mode. +def HasV9 : Predicate<"Subtarget.isV9()">; + +// HasNoV9 - This predicate is true when the target doesn't have V9 +// instructions. Use of this is just a hack for the isel not having proper +// costs for V8 instructions that are more expensive than their V9 ones. +def HasNoV9 : Predicate<"!Subtarget.isV9()">; + +// HasVIS - This is true when the target processor has VIS extensions. +def HasVIS : Predicate<"Subtarget.isVIS()">; + +// UseDeprecatedInsts - This predicate is true when the target processor is a +// V8, or when it is V9 but the V8 deprecated instructions are efficient enough +// to use when appropriate. In either of these cases, the instruction selector +// will pick deprecated instructions. +def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">; + +//===----------------------------------------------------------------------===// +// Instruction Pattern Stuff +//===----------------------------------------------------------------------===// + +def simm11 : PatLeaf<(imm), [{ + // simm11 predicate - True if the imm fits in a 11-bit sign extended field. + return (((int)N->getValue() << (32-11)) >> (32-11)) == (int)N->getValue(); +}]>; + +def simm13 : PatLeaf<(imm), [{ + // simm13 predicate - True if the imm fits in a 13-bit sign extended field. 
+ return (((int)N->getValue() << (32-13)) >> (32-13)) == (int)N->getValue(); +}]>; + +def LO10 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((unsigned)N->getValue() & 1023, MVT::i32); +}]>; + +def HI22 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + return CurDAG->getTargetConstant((unsigned)N->getValue() >> 10, MVT::i32); +}]>; + +def SETHIimm : PatLeaf<(imm), [{ + return (((unsigned)N->getValue() >> 10) << 10) == (unsigned)N->getValue(); +}], HI22>; + +// Addressing modes. +def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>; +def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>; + +// Address operands +def MEMrr : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops IntRegs, IntRegs); +} +def MEMri : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops IntRegs, i32imm); +} + +// Branch targets have OtherVT type. +def brtarget : Operand<OtherVT>; +def calltarget : Operand<i32>; + +// Operand for printing out a condition code. 
+let PrintMethod = "printCCOperand" in + def CCOp : Operand<i32>; + +def SDTSPcmpfcc : +SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>; +def SDTSPbrcc : +SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>; +def SDTSPselectcc : +SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>; +def SDTSPFTOI : +SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>; +def SDTSPITOF : +SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>; + +def SPcmpicc : SDNode<"SPISD::CMPICC", SDTIntBinOp, [SDNPOutFlag]>; +def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutFlag]>; +def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>; +def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>; + +def SPhi : SDNode<"SPISD::Hi", SDTIntUnaryOp>; +def SPlo : SDNode<"SPISD::Lo", SDTIntUnaryOp>; + +def SPftoi : SDNode<"SPISD::FTOI", SDTSPFTOI>; +def SPitof : SDNode<"SPISD::ITOF", SDTSPITOF>; + +def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInFlag]>; +def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInFlag]>; + +// These are target-independent nodes, but have target-specific formats. 
+def SDT_SPCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>; +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeq, + [SDNPHasChain, SDNPOutFlag]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPCallSeq, + [SDNPHasChain, SDNPOutFlag]>; + +def SDT_SPCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; +def call : SDNode<"SPISD::CALL", SDT_SPCall, + [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>; + +def SDT_SPRetFlag : SDTypeProfile<0, 0, []>; +def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRetFlag, + [SDNPHasChain, SDNPOptInFlag]>; + +//===----------------------------------------------------------------------===// +// SPARC Flag Conditions +//===----------------------------------------------------------------------===// + +// Note that these values must be kept in sync with the CCOp::CondCode enum +// values. +class ICC_VAL<int N> : PatLeaf<(i32 N)>; +def ICC_NE : ICC_VAL< 9>; // Not Equal +def ICC_E : ICC_VAL< 1>; // Equal +def ICC_G : ICC_VAL<10>; // Greater +def ICC_LE : ICC_VAL< 2>; // Less or Equal +def ICC_GE : ICC_VAL<11>; // Greater or Equal +def ICC_L : ICC_VAL< 3>; // Less +def ICC_GU : ICC_VAL<12>; // Greater Unsigned +def ICC_LEU : ICC_VAL< 4>; // Less or Equal Unsigned +def ICC_CC : ICC_VAL<13>; // Carry Clear/Greater or Equal Unsigned +def ICC_CS : ICC_VAL< 5>; // Carry Set/Less Unsigned +def ICC_POS : ICC_VAL<14>; // Positive +def ICC_NEG : ICC_VAL< 6>; // Negative +def ICC_VC : ICC_VAL<15>; // Overflow Clear +def ICC_VS : ICC_VAL< 7>; // Overflow Set + +class FCC_VAL<int N> : PatLeaf<(i32 N)>; +def FCC_U : FCC_VAL<23>; // Unordered +def FCC_G : FCC_VAL<22>; // Greater +def FCC_UG : FCC_VAL<21>; // Unordered or Greater +def FCC_L : FCC_VAL<20>; // Less +def FCC_UL : FCC_VAL<19>; // Unordered or Less +def FCC_LG : FCC_VAL<18>; // Less or Greater +def FCC_NE : FCC_VAL<17>; // Not Equal +def FCC_E : FCC_VAL<25>; // Equal +def FCC_UE : FCC_VAL<26>; // Unordered or Equal +def FCC_GE : FCC_VAL<27>; // Greater or Equal +def FCC_UGE : FCC_VAL<28>; //
Unordered or Greater or Equal +def FCC_LE : FCC_VAL<29>; // Less or Equal +def FCC_ULE : FCC_VAL<30>; // Unordered or Less or Equal +def FCC_O : FCC_VAL<31>; // Ordered + +//===----------------------------------------------------------------------===// +// Instruction Class Templates +//===----------------------------------------------------------------------===// + +/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot. +multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> { + def rr : F3_1<2, Op3Val, + (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c), + !strconcat(OpcStr, " $b, $c, $dst"), + [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>; + def ri : F3_2<2, Op3Val, + (ops IntRegs:$dst, IntRegs:$b, i32imm:$c), + !strconcat(OpcStr, " $b, $c, $dst"), + [(set IntRegs:$dst, (OpNode IntRegs:$b, simm13:$c))]>; +} + +/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no +/// pattern. +multiclass F3_12np<string OpcStr, bits<6> Op3Val> { + def rr : F3_1<2, Op3Val, + (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c), + !strconcat(OpcStr, " $b, $c, $dst"), []>; + def ri : F3_2<2, Op3Val, + (ops IntRegs:$dst, IntRegs:$b, i32imm:$c), + !strconcat(OpcStr, " $b, $c, $dst"), []>; +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +// Pseudo instructions.
+class Pseudo<dag ops, string asmstr, list<dag> pattern> + : InstSP<ops, asmstr, pattern>; + +def ADJCALLSTACKDOWN : Pseudo<(ops i32imm:$amt), + "!ADJCALLSTACKDOWN $amt", + [(callseq_start imm:$amt)]>, Imp<[O6],[O6]>; +def ADJCALLSTACKUP : Pseudo<(ops i32imm:$amt), + "!ADJCALLSTACKUP $amt", + [(callseq_end imm:$amt)]>, Imp<[O6],[O6]>; +def IMPLICIT_DEF_Int : Pseudo<(ops IntRegs:$dst), + "!IMPLICIT_DEF $dst", + [(set IntRegs:$dst, (undef))]>; +def IMPLICIT_DEF_FP : Pseudo<(ops FPRegs:$dst), "!IMPLICIT_DEF $dst", + [(set FPRegs:$dst, (undef))]>; +def IMPLICIT_DEF_DFP : Pseudo<(ops DFPRegs:$dst), "!IMPLICIT_DEF $dst", + [(set DFPRegs:$dst, (undef))]>; + +// FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the +// fpmover pass. +let Predicates = [HasNoV9] in { // Only emit these in V8 mode. + def FpMOVD : Pseudo<(ops DFPRegs:$dst, DFPRegs:$src), + "!FpMOVD $src, $dst", []>; + def FpNEGD : Pseudo<(ops DFPRegs:$dst, DFPRegs:$src), + "!FpNEGD $src, $dst", + [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>; + def FpABSD : Pseudo<(ops DFPRegs:$dst, DFPRegs:$src), + "!FpABSD $src, $dst", + [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>; +} + +// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded by the +// scheduler into a branch sequence. This has to handle all permutations of +// selection between i32/f32/f64 on ICC and FCC. +let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler. 
+ def SELECT_CC_Int_ICC + : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, i32imm:$Cond), + "; SELECT_CC_Int_ICC PSEUDO!", + [(set IntRegs:$dst, (SPselecticc IntRegs:$T, IntRegs:$F, + imm:$Cond))]>; + def SELECT_CC_Int_FCC + : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, i32imm:$Cond), + "; SELECT_CC_Int_FCC PSEUDO!", + [(set IntRegs:$dst, (SPselectfcc IntRegs:$T, IntRegs:$F, + imm:$Cond))]>; + def SELECT_CC_FP_ICC + : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, i32imm:$Cond), + "; SELECT_CC_FP_ICC PSEUDO!", + [(set FPRegs:$dst, (SPselecticc FPRegs:$T, FPRegs:$F, + imm:$Cond))]>; + def SELECT_CC_FP_FCC + : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, i32imm:$Cond), + "; SELECT_CC_FP_FCC PSEUDO!", + [(set FPRegs:$dst, (SPselectfcc FPRegs:$T, FPRegs:$F, + imm:$Cond))]>; + def SELECT_CC_DFP_ICC + : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, i32imm:$Cond), + "; SELECT_CC_DFP_ICC PSEUDO!", + [(set DFPRegs:$dst, (SPselecticc DFPRegs:$T, DFPRegs:$F, + imm:$Cond))]>; + def SELECT_CC_DFP_FCC + : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, i32imm:$Cond), + "; SELECT_CC_DFP_FCC PSEUDO!", + [(set DFPRegs:$dst, (SPselectfcc DFPRegs:$T, DFPRegs:$F, + imm:$Cond))]>; +} + + +// Section A.3 - Synthetic Instructions, p. 85 +// special cases of JMPL: +let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, noResults = 1 in { + let rd = O7.Num, rs1 = G0.Num, simm13 = 8 in + def RETL: F3_2<2, 0b111000, (ops), "retl", [(retflag)]>; +} + +// Section B.1 - Load Integer Instructions, p. 
90 +def LDSBrr : F3_1<3, 0b001001, + (ops IntRegs:$dst, MEMrr:$addr), + "ldsb [$addr], $dst", + [(set IntRegs:$dst, (sextloadi8 ADDRrr:$addr))]>; +def LDSBri : F3_2<3, 0b001001, + (ops IntRegs:$dst, MEMri:$addr), + "ldsb [$addr], $dst", + [(set IntRegs:$dst, (sextloadi8 ADDRri:$addr))]>; +def LDSHrr : F3_1<3, 0b001010, + (ops IntRegs:$dst, MEMrr:$addr), + "ldsh [$addr], $dst", + [(set IntRegs:$dst, (sextloadi16 ADDRrr:$addr))]>; +def LDSHri : F3_2<3, 0b001010, + (ops IntRegs:$dst, MEMri:$addr), + "ldsh [$addr], $dst", + [(set IntRegs:$dst, (sextloadi16 ADDRri:$addr))]>; +def LDUBrr : F3_1<3, 0b000001, + (ops IntRegs:$dst, MEMrr:$addr), + "ldub [$addr], $dst", + [(set IntRegs:$dst, (zextloadi8 ADDRrr:$addr))]>; +def LDUBri : F3_2<3, 0b000001, + (ops IntRegs:$dst, MEMri:$addr), + "ldub [$addr], $dst", + [(set IntRegs:$dst, (zextloadi8 ADDRri:$addr))]>; +def LDUHrr : F3_1<3, 0b000010, + (ops IntRegs:$dst, MEMrr:$addr), + "lduh [$addr], $dst", + [(set IntRegs:$dst, (zextloadi16 ADDRrr:$addr))]>; +def LDUHri : F3_2<3, 0b000010, + (ops IntRegs:$dst, MEMri:$addr), + "lduh [$addr], $dst", + [(set IntRegs:$dst, (zextloadi16 ADDRri:$addr))]>; +def LDrr : F3_1<3, 0b000000, + (ops IntRegs:$dst, MEMrr:$addr), + "ld [$addr], $dst", + [(set IntRegs:$dst, (load ADDRrr:$addr))]>; +def LDri : F3_2<3, 0b000000, + (ops IntRegs:$dst, MEMri:$addr), + "ld [$addr], $dst", + [(set IntRegs:$dst, (load ADDRri:$addr))]>; + +// Section B.2 - Load Floating-point Instructions, p. 
92 +def LDFrr : F3_1<3, 0b100000, + (ops FPRegs:$dst, MEMrr:$addr), + "ld [$addr], $dst", + [(set FPRegs:$dst, (load ADDRrr:$addr))]>; +def LDFri : F3_2<3, 0b100000, + (ops FPRegs:$dst, MEMri:$addr), + "ld [$addr], $dst", + [(set FPRegs:$dst, (load ADDRri:$addr))]>; +def LDDFrr : F3_1<3, 0b100011, + (ops DFPRegs:$dst, MEMrr:$addr), + "ldd [$addr], $dst", + [(set DFPRegs:$dst, (load ADDRrr:$addr))]>; +def LDDFri : F3_2<3, 0b100011, + (ops DFPRegs:$dst, MEMri:$addr), + "ldd [$addr], $dst", + [(set DFPRegs:$dst, (load ADDRri:$addr))]>; + +// Section B.4 - Store Integer Instructions, p. 95 +def STBrr : F3_1<3, 0b000101, + (ops MEMrr:$addr, IntRegs:$src), + "stb $src, [$addr]", + [(truncstorei8 IntRegs:$src, ADDRrr:$addr)]>; +def STBri : F3_2<3, 0b000101, + (ops MEMri:$addr, IntRegs:$src), + "stb $src, [$addr]", + [(truncstorei8 IntRegs:$src, ADDRri:$addr)]>; +def STHrr : F3_1<3, 0b000110, + (ops MEMrr:$addr, IntRegs:$src), + "sth $src, [$addr]", + [(truncstorei16 IntRegs:$src, ADDRrr:$addr)]>; +def STHri : F3_2<3, 0b000110, + (ops MEMri:$addr, IntRegs:$src), + "sth $src, [$addr]", + [(truncstorei16 IntRegs:$src, ADDRri:$addr)]>; +def STrr : F3_1<3, 0b000100, + (ops MEMrr:$addr, IntRegs:$src), + "st $src, [$addr]", + [(store IntRegs:$src, ADDRrr:$addr)]>; +def STri : F3_2<3, 0b000100, + (ops MEMri:$addr, IntRegs:$src), + "st $src, [$addr]", + [(store IntRegs:$src, ADDRri:$addr)]>; + +// Section B.5 - Store Floating-point Instructions, p. 
97 +def STFrr : F3_1<3, 0b100100, + (ops MEMrr:$addr, FPRegs:$src), + "st $src, [$addr]", + [(store FPRegs:$src, ADDRrr:$addr)]>; +def STFri : F3_2<3, 0b100100, + (ops MEMri:$addr, FPRegs:$src), + "st $src, [$addr]", + [(store FPRegs:$src, ADDRri:$addr)]>; +def STDFrr : F3_1<3, 0b100111, + (ops MEMrr:$addr, DFPRegs:$src), + "std $src, [$addr]", + [(store DFPRegs:$src, ADDRrr:$addr)]>; +def STDFri : F3_2<3, 0b100111, + (ops MEMri:$addr, DFPRegs:$src), + "std $src, [$addr]", + [(store DFPRegs:$src, ADDRri:$addr)]>; + +// Section B.9 - SETHI Instruction, p. 104 +def SETHIi: F2_1<0b100, + (ops IntRegs:$dst, i32imm:$src), + "sethi $src, $dst", + [(set IntRegs:$dst, SETHIimm:$src)]>; + +// Section B.10 - NOP Instruction, p. 105 +// (It's a special case of SETHI) +let rd = 0, imm22 = 0 in + def NOP : F2_1<0b100, (ops), "nop", []>; + +// Section B.11 - Logical Instructions, p. 106 +defm AND : F3_12<"and", 0b000001, and>; + +def ANDNrr : F3_1<2, 0b000101, + (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c), + "andn $b, $c, $dst", + [(set IntRegs:$dst, (and IntRegs:$b, (not IntRegs:$c)))]>; +def ANDNri : F3_2<2, 0b000101, + (ops IntRegs:$dst, IntRegs:$b, i32imm:$c), + "andn $b, $c, $dst", []>; + +defm OR : F3_12<"or", 0b000010, or>; + +def ORNrr : F3_1<2, 0b000110, + (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c), + "orn $b, $c, $dst", + [(set IntRegs:$dst, (or IntRegs:$b, (not IntRegs:$c)))]>; +def ORNri : F3_2<2, 0b000110, + (ops IntRegs:$dst, IntRegs:$b, i32imm:$c), + "orn $b, $c, $dst", []>; +defm XOR : F3_12<"xor", 0b000011, xor>; + +def XNORrr : F3_1<2, 0b000111, + (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c), + "xnor $b, $c, $dst", + [(set IntRegs:$dst, (not (xor IntRegs:$b, IntRegs:$c)))]>; +def XNORri : F3_2<2, 0b000111, + (ops IntRegs:$dst, IntRegs:$b, i32imm:$c), + "xnor $b, $c, $dst", []>; + +// Section B.12 - Shift Instructions, p. 
107 +defm SLL : F3_12<"sll", 0b100101, shl>; +defm SRL : F3_12<"srl", 0b100110, srl>; +defm SRA : F3_12<"sra", 0b100111, sra>; + +// Section B.13 - Add Instructions, p. 108 +defm ADD : F3_12<"add", 0b000000, add>; + +// "LEA" forms of add (patterns to make tblgen happy) +def LEA_ADDri : F3_2<2, 0b000000, + (ops IntRegs:$dst, MEMri:$addr), + "add ${addr:arith}, $dst", + [(set IntRegs:$dst, ADDRri:$addr)]>; + +defm ADDCC : F3_12<"addcc", 0b010000, addc>; +defm ADDX : F3_12<"addx", 0b001000, adde>; + +// Section B.15 - Subtract Instructions, p. 110 +defm SUB : F3_12 <"sub" , 0b000100, sub>; +defm SUBX : F3_12 <"subx" , 0b001100, sube>; +defm SUBCC : F3_12 <"subcc", 0b010100, SPcmpicc>; + +def SUBXCCrr: F3_1<2, 0b011100, + (ops IntRegs:$dst, IntRegs:$b, IntRegs:$c), + "subxcc $b, $c, $dst", []>; + +// Section B.18 - Multiply Instructions, p. 113 +defm UMUL : F3_12np<"umul", 0b001010>; +defm SMUL : F3_12 <"smul", 0b001011, mul>; + + +// Section B.19 - Divide Instructions, p. 115 +defm UDIV : F3_12np<"udiv", 0b001110>; +defm SDIV : F3_12np<"sdiv", 0b001111>; + +// Section B.20 - SAVE and RESTORE, p. 117 +defm SAVE : F3_12np<"save" , 0b111100>; +defm RESTORE : F3_12np<"restore", 0b111101>; + +// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119 + +// conditional branch class: +class BranchSP<bits<4> cc, dag ops, string asmstr, list<dag> pattern> + : F2_2<cc, 0b010, ops, asmstr, pattern> { + let isBranch = 1; + let isTerminator = 1; + let hasDelaySlot = 1; + let noResults = 1; +} + +let isBarrier = 1 in + def BA : BranchSP<0b1000, (ops brtarget:$dst), + "ba $dst", + [(br bb:$dst)]>; + +// FIXME: the encoding for the JIT should look at the condition field. +def BCOND : BranchSP<0, (ops brtarget:$dst, CCOp:$cc), + "b$cc $dst", + [(SPbricc bb:$dst, imm:$cc)]>; + + +// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 
121 + +// floating-point conditional branch class: +class FPBranchSP<bits<4> cc, dag ops, string asmstr, list<dag> pattern> + : F2_2<cc, 0b110, ops, asmstr, pattern> { + let isBranch = 1; + let isTerminator = 1; + let hasDelaySlot = 1; + let noResults = 1; +} + +// FIXME: the encoding for the JIT should look at the condition field. +def FBCOND : FPBranchSP<0, (ops brtarget:$dst, CCOp:$cc), + "fb$cc $dst", + [(SPbrfcc bb:$dst, imm:$cc)]>; + + +// Section B.24 - Call and Link Instruction, p. 125 +// This is the only Format 1 instruction +let Uses = [O0, O1, O2, O3, O4, O5], + hasDelaySlot = 1, isCall = 1, noResults = 1, + Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7, + D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15] in { + def CALL : InstSP<(ops calltarget:$dst), + "call $dst", []> { + bits<30> disp; + let op = 1; + let Inst{29-0} = disp; + } + + // indirect calls + def JMPLrr : F3_1<2, 0b111000, + (ops MEMrr:$ptr), + "call $ptr", + [(call ADDRrr:$ptr)]>; + def JMPLri : F3_2<2, 0b111000, + (ops MEMri:$ptr), + "call $ptr", + [(call ADDRri:$ptr)]>; +} + +// Section B.28 - Read State Register Instructions +def RDY : F3_1<2, 0b101000, + (ops IntRegs:$dst), + "rd %y, $dst", []>; + +// Section B.29 - Write State Register Instructions +def WRYrr : F3_1<2, 0b110000, + (ops IntRegs:$b, IntRegs:$c), + "wr $b, $c, %y", []>; +def WRYri : F3_2<2, 0b110000, + (ops IntRegs:$b, i32imm:$c), + "wr $b, $c, %y", []>; + +// Convert Integer to Floating-point Instructions, p. 141 +def FITOS : F3_3<2, 0b110100, 0b011000100, + (ops FPRegs:$dst, FPRegs:$src), + "fitos $src, $dst", + [(set FPRegs:$dst, (SPitof FPRegs:$src))]>; +def FITOD : F3_3<2, 0b110100, 0b011001000, + (ops DFPRegs:$dst, FPRegs:$src), + "fitod $src, $dst", + [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>; + +// Convert Floating-point to Integer Instructions, p. 
142 +def FSTOI : F3_3<2, 0b110100, 0b011010001, + (ops FPRegs:$dst, FPRegs:$src), + "fstoi $src, $dst", + [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>; +def FDTOI : F3_3<2, 0b110100, 0b011010010, + (ops FPRegs:$dst, DFPRegs:$src), + "fdtoi $src, $dst", + [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>; + +// Convert between Floating-point Formats Instructions, p. 143 +def FSTOD : F3_3<2, 0b110100, 0b011001001, + (ops DFPRegs:$dst, FPRegs:$src), + "fstod $src, $dst", + [(set DFPRegs:$dst, (fextend FPRegs:$src))]>; +def FDTOS : F3_3<2, 0b110100, 0b011000110, + (ops FPRegs:$dst, DFPRegs:$src), + "fdtos $src, $dst", + [(set FPRegs:$dst, (fround DFPRegs:$src))]>; + +// Floating-point Move Instructions, p. 144 +def FMOVS : F3_3<2, 0b110100, 0b000000001, + (ops FPRegs:$dst, FPRegs:$src), + "fmovs $src, $dst", []>; +def FNEGS : F3_3<2, 0b110100, 0b000000101, + (ops FPRegs:$dst, FPRegs:$src), + "fnegs $src, $dst", + [(set FPRegs:$dst, (fneg FPRegs:$src))]>; +def FABSS : F3_3<2, 0b110100, 0b000001001, + (ops FPRegs:$dst, FPRegs:$src), + "fabss $src, $dst", + [(set FPRegs:$dst, (fabs FPRegs:$src))]>; + + +// Floating-point Square Root Instructions, p.145 +def FSQRTS : F3_3<2, 0b110100, 0b000101001, + (ops FPRegs:$dst, FPRegs:$src), + "fsqrts $src, $dst", + [(set FPRegs:$dst, (fsqrt FPRegs:$src))]>; +def FSQRTD : F3_3<2, 0b110100, 0b000101010, + (ops DFPRegs:$dst, DFPRegs:$src), + "fsqrtd $src, $dst", + [(set DFPRegs:$dst, (fsqrt DFPRegs:$src))]>; + + + +// Floating-point Add and Subtract Instructions, p. 
146 +def FADDS : F3_3<2, 0b110100, 0b001000001, + (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2), + "fadds $src1, $src2, $dst", + [(set FPRegs:$dst, (fadd FPRegs:$src1, FPRegs:$src2))]>; +def FADDD : F3_3<2, 0b110100, 0b001000010, + (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2), + "faddd $src1, $src2, $dst", + [(set DFPRegs:$dst, (fadd DFPRegs:$src1, DFPRegs:$src2))]>; +def FSUBS : F3_3<2, 0b110100, 0b001000101, + (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2), + "fsubs $src1, $src2, $dst", + [(set FPRegs:$dst, (fsub FPRegs:$src1, FPRegs:$src2))]>; +def FSUBD : F3_3<2, 0b110100, 0b001000110, + (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2), + "fsubd $src1, $src2, $dst", + [(set DFPRegs:$dst, (fsub DFPRegs:$src1, DFPRegs:$src2))]>; + +// Floating-point Multiply and Divide Instructions, p. 147 +def FMULS : F3_3<2, 0b110100, 0b001001001, + (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2), + "fmuls $src1, $src2, $dst", + [(set FPRegs:$dst, (fmul FPRegs:$src1, FPRegs:$src2))]>; +def FMULD : F3_3<2, 0b110100, 0b001001010, + (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2), + "fmuld $src1, $src2, $dst", + [(set DFPRegs:$dst, (fmul DFPRegs:$src1, DFPRegs:$src2))]>; +def FSMULD : F3_3<2, 0b110100, 0b001101001, + (ops DFPRegs:$dst, FPRegs:$src1, FPRegs:$src2), + "fsmuld $src1, $src2, $dst", + [(set DFPRegs:$dst, (fmul (fextend FPRegs:$src1), + (fextend FPRegs:$src2)))]>; +def FDIVS : F3_3<2, 0b110100, 0b001001101, + (ops FPRegs:$dst, FPRegs:$src1, FPRegs:$src2), + "fdivs $src1, $src2, $dst", + [(set FPRegs:$dst, (fdiv FPRegs:$src1, FPRegs:$src2))]>; +def FDIVD : F3_3<2, 0b110100, 0b001001110, + (ops DFPRegs:$dst, DFPRegs:$src1, DFPRegs:$src2), + "fdivd $src1, $src2, $dst", + [(set DFPRegs:$dst, (fdiv DFPRegs:$src1, DFPRegs:$src2))]>; + +// Floating-point Compare Instructions, p. 148 +// Note: the 2nd template arg is different for these guys. +// Note 2: the result of a FCMP is not available until the 2nd cycle +// after the instr is retired, but there is no interlock. 
This behavior +// is modelled with a forced noop after the instruction. +def FCMPS : F3_3<2, 0b110101, 0b001010001, + (ops FPRegs:$src1, FPRegs:$src2), + "fcmps $src1, $src2\n\tnop", + [(SPcmpfcc FPRegs:$src1, FPRegs:$src2)]>; +def FCMPD : F3_3<2, 0b110101, 0b001010010, + (ops DFPRegs:$src1, DFPRegs:$src2), + "fcmpd $src1, $src2\n\tnop", + [(SPcmpfcc DFPRegs:$src1, DFPRegs:$src2)]>; + + +//===----------------------------------------------------------------------===// +// V9 Instructions +//===----------------------------------------------------------------------===// + +// V9 Conditional Moves. +let Predicates = [HasV9], isTwoAddress = 1 in { + // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual. + // FIXME: Add instruction encodings for the JIT some day. + def MOVICCrr + : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, CCOp:$cc), + "mov$cc %icc, $F, $dst", + [(set IntRegs:$dst, + (SPselecticc IntRegs:$F, IntRegs:$T, imm:$cc))]>; + def MOVICCri + : Pseudo<(ops IntRegs:$dst, IntRegs:$T, i32imm:$F, CCOp:$cc), + "mov$cc %icc, $F, $dst", + [(set IntRegs:$dst, + (SPselecticc simm11:$F, IntRegs:$T, imm:$cc))]>; + + def MOVFCCrr + : Pseudo<(ops IntRegs:$dst, IntRegs:$T, IntRegs:$F, CCOp:$cc), + "mov$cc %fcc0, $F, $dst", + [(set IntRegs:$dst, + (SPselectfcc IntRegs:$F, IntRegs:$T, imm:$cc))]>; + def MOVFCCri + : Pseudo<(ops IntRegs:$dst, IntRegs:$T, i32imm:$F, CCOp:$cc), + "mov$cc %fcc0, $F, $dst", + [(set IntRegs:$dst, + (SPselectfcc simm11:$F, IntRegs:$T, imm:$cc))]>; + + def FMOVS_ICC + : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, CCOp:$cc), + "fmovs$cc %icc, $F, $dst", + [(set FPRegs:$dst, + (SPselecticc FPRegs:$F, FPRegs:$T, imm:$cc))]>; + def FMOVD_ICC + : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, CCOp:$cc), + "fmovd$cc %icc, $F, $dst", + [(set DFPRegs:$dst, + (SPselecticc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>; + def FMOVS_FCC + : Pseudo<(ops FPRegs:$dst, FPRegs:$T, FPRegs:$F, CCOp:$cc), + "fmovs$cc %fcc0, $F, $dst", + [(set 
FPRegs:$dst, + (SPselectfcc FPRegs:$F, FPRegs:$T, imm:$cc))]>; + def FMOVD_FCC + : Pseudo<(ops DFPRegs:$dst, DFPRegs:$T, DFPRegs:$F, CCOp:$cc), + "fmovd$cc %fcc0, $F, $dst", + [(set DFPRegs:$dst, + (SPselectfcc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>; + +} + +// Floating-Point Move Instructions, p. 164 of the V9 manual. +let Predicates = [HasV9] in { + def FMOVD : F3_3<2, 0b110100, 0b000000010, + (ops DFPRegs:$dst, DFPRegs:$src), + "fmovd $src, $dst", []>; + def FNEGD : F3_3<2, 0b110100, 0b000000110, + (ops DFPRegs:$dst, DFPRegs:$src), + "fnegd $src, $dst", + [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>; + def FABSD : F3_3<2, 0b110100, 0b000001010, + (ops DFPRegs:$dst, DFPRegs:$src), + "fabsd $src, $dst", + [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>; +} + +// POPCrr - This does a ctpop of a 64-bit register. As such, we have to clear +// the top 32-bits before using it. To do this clearing, we use a SLLri X,0. +def POPCrr : F3_1<2, 0b101110, + (ops IntRegs:$dst, IntRegs:$src), + "popc $src, $dst", []>, Requires<[HasV9]>; +def : Pat<(ctpop IntRegs:$src), + (POPCrr (SLLri IntRegs:$src, 0))>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// Small immediates. +def : Pat<(i32 simm13:$val), + (ORri G0, imm:$val)>; +// Arbitrary immediates. +def : Pat<(i32 imm:$val), + (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>; + +// subc +def : Pat<(subc IntRegs:$b, IntRegs:$c), + (SUBCCrr IntRegs:$b, IntRegs:$c)>; +def : Pat<(subc IntRegs:$b, simm13:$val), + (SUBCCri IntRegs:$b, imm:$val)>; + +// Global addresses, constant pool entries +def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>; +def : Pat<(SPlo tglobaladdr:$in), (ORri G0, tglobaladdr:$in)>; +def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>; +def : Pat<(SPlo tconstpool:$in), (ORri G0, tconstpool:$in)>; + +// Add reg, lo. 
This is used when taking the addr of a global/constpool entry. +def : Pat<(add IntRegs:$r, (SPlo tglobaladdr:$in)), + (ADDri IntRegs:$r, tglobaladdr:$in)>; +def : Pat<(add IntRegs:$r, (SPlo tconstpool:$in)), + (ADDri IntRegs:$r, tconstpool:$in)>; + +// Calls: +def : Pat<(call tglobaladdr:$dst), + (CALL tglobaladdr:$dst)>; +def : Pat<(call texternalsym:$dst), + (CALL texternalsym:$dst)>; + +def : Pat<(ret), (RETL)>; + +// Map integer extload's to zextloads. +def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>; +def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>; +def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>; +def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>; +def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>; +def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>; + +// zextload bool -> zextload byte +def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>; +def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>; + +// truncstore bool -> truncstore byte. +def : Pat<(truncstorei1 IntRegs:$src, ADDRrr:$addr), + (STBrr ADDRrr:$addr, IntRegs:$src)>; +def : Pat<(truncstorei1 IntRegs:$src, ADDRri:$addr), + (STBri ADDRri:$addr, IntRegs:$src)>; diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp new file mode 100644 index 0000000..1981b4f --- /dev/null +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -0,0 +1,276 @@ +//===- SparcRegisterInfo.cpp - SPARC Register Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the SPARC implementation of the MRegisterInfo class. 
+// +//===----------------------------------------------------------------------===// + +#include "Sparc.h" +#include "SparcRegisterInfo.h" +#include "SparcSubtarget.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineLocation.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Type.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +using namespace llvm; + +SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st, + const TargetInstrInfo &tii) + : SparcGenRegisterInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), + Subtarget(st), TII(tii) { +} + +void SparcRegisterInfo:: +storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned SrcReg, int FI, + const TargetRegisterClass *RC) const { + // On the order of operands here: think "[FrameIdx + 0] = SrcReg". + if (RC == SP::IntRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::STri)).addFrameIndex(FI).addImm(0) + .addReg(SrcReg, false, false, true); + else if (RC == SP::FPRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::STFri)).addFrameIndex(FI).addImm(0) + .addReg(SrcReg, false, false, true); + else if (RC == SP::DFPRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::STDFri)).addFrameIndex(FI).addImm(0) + .addReg(SrcReg, false, false, true); + else + assert(0 && "Can't store this register to stack slot"); +} + +void SparcRegisterInfo:: +loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + unsigned DestReg, int FI, + const TargetRegisterClass *RC) const { + if (RC == SP::IntRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0); + else if (RC == SP::FPRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0); + else if (RC == SP::DFPRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0); + else + assert(0 && "Can't load this 
register from stack slot"); +} + +void SparcRegisterInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const { + if (RC == SP::IntRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::ORrr), DestReg).addReg(SP::G0).addReg(SrcReg); + else if (RC == SP::FPRegsRegisterClass) + BuildMI(MBB, I, TII.get(SP::FMOVS), DestReg).addReg(SrcReg); + else if (RC == SP::DFPRegsRegisterClass) + BuildMI(MBB, I, TII.get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD),DestReg) + .addReg(SrcReg); + else + assert (0 && "Can't copy this register"); +} + +void SparcRegisterInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, + const MachineInstr *Orig) const { + MachineInstr *MI = Orig->clone(); + MI->getOperand(0).setReg(DestReg); + MBB.insert(I, MI); +} + +MachineInstr *SparcRegisterInfo::foldMemoryOperand(MachineInstr* MI, + unsigned OpNum, + int FI) const { + bool isFloat = false; + MachineInstr *NewMI = NULL; + switch (MI->getOpcode()) { + case SP::ORrr: + if (MI->getOperand(1).isRegister() && MI->getOperand(1).getReg() == SP::G0&& + MI->getOperand(0).isRegister() && MI->getOperand(2).isRegister()) { + if (OpNum == 0) // COPY -> STORE + NewMI = BuildMI(TII.get(SP::STri)).addFrameIndex(FI).addImm(0) + .addReg(MI->getOperand(2).getReg()); + else // COPY -> LOAD + NewMI = BuildMI(TII.get(SP::LDri), MI->getOperand(0).getReg()) + .addFrameIndex(FI).addImm(0); + } + break; + case SP::FMOVS: + isFloat = true; + // FALLTHROUGH + case SP::FMOVD: + if (OpNum == 0) // COPY -> STORE + NewMI = BuildMI(TII.get(isFloat ? SP::STFri : SP::STDFri)) + .addFrameIndex(FI).addImm(0).addReg(MI->getOperand(1).getReg()); + else // COPY -> LOAD + NewMI = BuildMI(TII.get(isFloat ? 
SP::LDFri : SP::LDDFri), + MI->getOperand(0).getReg()).addFrameIndex(FI).addImm(0); + break; + } + + if (NewMI) + NewMI->copyKillDeadInfo(MI); + return NewMI; +} + +const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) + const { + static const unsigned CalleeSavedRegs[] = { 0 }; + return CalleeSavedRegs; +} + +BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(SP::G2); + Reserved.set(SP::G3); + Reserved.set(SP::G4); + Reserved.set(SP::O6); + Reserved.set(SP::I6); + Reserved.set(SP::I7); + Reserved.set(SP::G0); + Reserved.set(SP::G5); + Reserved.set(SP::G6); + Reserved.set(SP::G7); + return Reserved; +} + + +const TargetRegisterClass* const* +SparcRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { + static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 }; + return CalleeSavedRegClasses; +} + +bool SparcRegisterInfo::hasFP(const MachineFunction &MF) const { + return false; +} + +void SparcRegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + MachineInstr &MI = *I; + int Size = MI.getOperand(0).getImmedValue(); + if (MI.getOpcode() == SP::ADJCALLSTACKDOWN) + Size = -Size; + if (Size) + BuildMI(MBB, I, TII.get(SP::ADDri), SP::O6).addReg(SP::O6).addImm(Size); + MBB.erase(I); +} + +void SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + unsigned i = 0; + MachineInstr &MI = *II; + while (!MI.getOperand(i).isFrameIndex()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + int FrameIndex = MI.getOperand(i).getFrameIndex(); + + // Addressable stack objects are accessed using neg. 
offsets from %fp + MachineFunction &MF = *MI.getParent()->getParent(); + int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + + MI.getOperand(i+1).getImmedValue(); + + // Replace frame index with a frame pointer reference. + if (Offset >= -4096 && Offset <= 4095) { + // If the offset is small enough to fit in the immediate field, directly + // encode it. + MI.getOperand(i).ChangeToRegister(SP::I6, false); + MI.getOperand(i+1).ChangeToImmediate(Offset); + } else { + // Otherwise, emit a G1 = SETHI %hi(offset). FIXME: it would be better to + // scavenge a register here instead of reserving G1 all of the time. + unsigned OffHi = (unsigned)Offset >> 10U; + BuildMI(*MI.getParent(), II, TII.get(SP::SETHIi), SP::G1).addImm(OffHi); + // Emit G1 = G1 + I6 + BuildMI(*MI.getParent(), II, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1) + .addReg(SP::I6); + // Insert: G1+%lo(offset) into the user. + MI.getOperand(i).ChangeToRegister(SP::G1, false); + MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1)); + } +} + +void SparcRegisterInfo:: +processFunctionBeforeFrameFinalized(MachineFunction &MF) const {} + +void SparcRegisterInfo::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo + int NumBytes = (int) MFI->getStackSize(); + + // Emit the correct save instruction based on the number of bytes in + // the frame. Minimum stack frame size according to V8 ABI is: + // 16 words for register window spill + // 1 word for address of returned aggregate-value + // + 6 words for passing parameters on the stack + // ---------- + // 23 words * 4 bytes per word = 92 bytes + NumBytes += 92; + // Round up to next doubleword boundary -- a double-word boundary + // is required by the ABI. 
+ NumBytes = (NumBytes + 7) & ~7; + NumBytes = -NumBytes; + + if (NumBytes >= -4096) { + BuildMI(MBB, MBB.begin(), TII.get(SP::SAVEri), + SP::O6).addImm(NumBytes).addReg(SP::O6); + } else { + MachineBasicBlock::iterator InsertPt = MBB.begin(); + // Emit this the hard way. This clobbers G1 which we always know is + // available here. + unsigned OffHi = (unsigned)NumBytes >> 10U; + BuildMI(MBB, InsertPt, TII.get(SP::SETHIi), SP::G1).addImm(OffHi); + // Emit G1 = G1 + I6 + BuildMI(MBB, InsertPt, TII.get(SP::ORri), SP::G1) + .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1)); + BuildMI(MBB, InsertPt, TII.get(SP::SAVErr), SP::O6) + .addReg(SP::O6).addReg(SP::G1); + } +} + +void SparcRegisterInfo::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + assert(MBBI->getOpcode() == SP::RETL && + "Can only put epilog before 'retl' instruction!"); + BuildMI(MBB, MBBI, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0) + .addReg(SP::G0); +} + +unsigned SparcRegisterInfo::getRARegister() const { + assert(0 && "What is the return address register"); + return 0; +} + +unsigned SparcRegisterInfo::getFrameRegister(MachineFunction &MF) const { + assert(0 && "What is the frame register"); + return SP::G1; +} + +unsigned SparcRegisterInfo::getEHExceptionRegister() const { + assert(0 && "What is the exception register"); + return 0; +} + +unsigned SparcRegisterInfo::getEHHandlerRegister() const { + assert(0 && "What is the exception handler register"); + return 0; +} + +#include "SparcGenRegisterInfo.inc" + diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h new file mode 100644 index 0000000..451964b --- /dev/null +++ b/lib/Target/Sparc/SparcRegisterInfo.h @@ -0,0 +1,86 @@ +//===- SparcRegisterInfo.h - Sparc Register Information Impl ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the 
University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Sparc implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef SPARCREGISTERINFO_H +#define SPARCREGISTERINFO_H + +#include "llvm/Target/MRegisterInfo.h" +#include "SparcGenRegisterInfo.h.inc" + +namespace llvm { + +class SparcSubtarget; +class TargetInstrInfo; +class Type; + +struct SparcRegisterInfo : public SparcGenRegisterInfo { + SparcSubtarget &Subtarget; + const TargetInstrInfo &TII; + + SparcRegisterInfo(SparcSubtarget &st, const TargetInstrInfo &tii); + + /// Code Generation virtual methods... + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, const MachineInstr *Orig) const; + + virtual MachineInstr* foldMemoryOperand(MachineInstr* MI, + unsigned OpNum, + int FrameIndex) const; + + const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const; + + const TargetRegisterClass* const* getCalleeSavedRegClasses( + const MachineFunction *MF = 0) const; + + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, 
RegScavenger *RS = NULL) const; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; + + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td new file mode 100644 index 0000000..8e2f444 --- /dev/null +++ b/lib/Target/Sparc/SparcRegisterInfo.td @@ -0,0 +1,158 @@ +//===- SparcRegisterInfo.td - Sparc Register defs ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the Sparc register file +//===----------------------------------------------------------------------===// + +class SparcReg<string n> : Register<n> { + field bits<5> Num; + let Namespace = "SP"; +} + +// Registers are identified with 5-bit ID numbers. +// Ri - 32-bit integer registers +class Ri<bits<5> num, string n> : SparcReg<n> { + let Num = num; +} +// Rf - 32-bit floating-point registers +class Rf<bits<5> num, string n> : SparcReg<n> { + let Num = num; +} +// Rd - Slots in the FP register file for 64-bit floating-point values. 
+class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> { + let Num = num; + let SubRegs = subregs; +} + +// Integer registers +def G0 : Ri< 0, "G0">, DwarfRegNum<0>; +def G1 : Ri< 1, "G1">, DwarfRegNum<1>; +def G2 : Ri< 2, "G2">, DwarfRegNum<2>; +def G3 : Ri< 3, "G3">, DwarfRegNum<3>; +def G4 : Ri< 4, "G4">, DwarfRegNum<4>; +def G5 : Ri< 5, "G5">, DwarfRegNum<5>; +def G6 : Ri< 6, "G6">, DwarfRegNum<6>; +def G7 : Ri< 7, "G7">, DwarfRegNum<7>; +def O0 : Ri< 8, "O0">, DwarfRegNum<8>; +def O1 : Ri< 9, "O1">, DwarfRegNum<9>; +def O2 : Ri<10, "O2">, DwarfRegNum<10>; +def O3 : Ri<11, "O3">, DwarfRegNum<11>; +def O4 : Ri<12, "O4">, DwarfRegNum<12>; +def O5 : Ri<13, "O5">, DwarfRegNum<13>; +def O6 : Ri<14, "O6">, DwarfRegNum<14>; +def O7 : Ri<15, "O7">, DwarfRegNum<15>; +def L0 : Ri<16, "L0">, DwarfRegNum<16>; +def L1 : Ri<17, "L1">, DwarfRegNum<17>; +def L2 : Ri<18, "L2">, DwarfRegNum<18>; +def L3 : Ri<19, "L3">, DwarfRegNum<19>; +def L4 : Ri<20, "L4">, DwarfRegNum<20>; +def L5 : Ri<21, "L5">, DwarfRegNum<21>; +def L6 : Ri<22, "L6">, DwarfRegNum<22>; +def L7 : Ri<23, "L7">, DwarfRegNum<23>; +def I0 : Ri<24, "I0">, DwarfRegNum<24>; +def I1 : Ri<25, "I1">, DwarfRegNum<25>; +def I2 : Ri<26, "I2">, DwarfRegNum<26>; +def I3 : Ri<27, "I3">, DwarfRegNum<27>; +def I4 : Ri<28, "I4">, DwarfRegNum<28>; +def I5 : Ri<29, "I5">, DwarfRegNum<29>; +def I6 : Ri<30, "I6">, DwarfRegNum<30>; +def I7 : Ri<31, "I7">, DwarfRegNum<31>; + +// Floating-point registers +def F0 : Rf< 0, "F0">, DwarfRegNum<32>; +def F1 : Rf< 1, "F1">, DwarfRegNum<33>; +def F2 : Rf< 2, "F2">, DwarfRegNum<34>; +def F3 : Rf< 3, "F3">, DwarfRegNum<35>; +def F4 : Rf< 4, "F4">, DwarfRegNum<36>; +def F5 : Rf< 5, "F5">, DwarfRegNum<37>; +def F6 : Rf< 6, "F6">, DwarfRegNum<38>; +def F7 : Rf< 7, "F7">, DwarfRegNum<39>; +def F8 : Rf< 8, "F8">, DwarfRegNum<40>; +def F9 : Rf< 9, "F9">, DwarfRegNum<41>; +def F10 : Rf<10, "F10">, DwarfRegNum<42>; +def F11 : Rf<11, "F11">, DwarfRegNum<43>; +def F12 : Rf<12, "F12">, 
DwarfRegNum<44>; +def F13 : Rf<13, "F13">, DwarfRegNum<45>; +def F14 : Rf<14, "F14">, DwarfRegNum<46>; +def F15 : Rf<15, "F15">, DwarfRegNum<47>; +def F16 : Rf<16, "F16">, DwarfRegNum<48>; +def F17 : Rf<17, "F17">, DwarfRegNum<49>; +def F18 : Rf<18, "F18">, DwarfRegNum<50>; +def F19 : Rf<19, "F19">, DwarfRegNum<51>; +def F20 : Rf<20, "F20">, DwarfRegNum<52>; +def F21 : Rf<21, "F21">, DwarfRegNum<53>; +def F22 : Rf<22, "F22">, DwarfRegNum<54>; +def F23 : Rf<23, "F23">, DwarfRegNum<55>; +def F24 : Rf<24, "F24">, DwarfRegNum<56>; +def F25 : Rf<25, "F25">, DwarfRegNum<57>; +def F26 : Rf<26, "F26">, DwarfRegNum<58>; +def F27 : Rf<27, "F27">, DwarfRegNum<59>; +def F28 : Rf<28, "F28">, DwarfRegNum<60>; +def F29 : Rf<29, "F29">, DwarfRegNum<61>; +def F30 : Rf<30, "F30">, DwarfRegNum<62>; +def F31 : Rf<31, "F31">, DwarfRegNum<63>; + +// Aliases of the F* registers used to hold 64-bit fp values (doubles) +def D0 : Rd< 0, "F0", [F0, F1]>, DwarfRegNum<32>; +def D1 : Rd< 2, "F2", [F2, F3]>, DwarfRegNum<34>; +def D2 : Rd< 4, "F4", [F4, F5]>, DwarfRegNum<36>; +def D3 : Rd< 6, "F6", [F6, F7]>, DwarfRegNum<38>; +def D4 : Rd< 8, "F8", [F8, F9]>, DwarfRegNum<40>; +def D5 : Rd<10, "F10", [F10, F11]>, DwarfRegNum<42>; +def D6 : Rd<12, "F12", [F12, F13]>, DwarfRegNum<44>; +def D7 : Rd<14, "F14", [F14, F15]>, DwarfRegNum<46>; +def D8 : Rd<16, "F16", [F16, F17]>, DwarfRegNum<48>; +def D9 : Rd<18, "F18", [F18, F19]>, DwarfRegNum<50>; +def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<52>; +def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<54>; +def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<56>; +def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<58>; +def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<60>; +def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<62>; + +// Register classes. +// +// FIXME: the register order should be defined in terms of the preferred +// allocation order... 
+// +def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7, + I0, I1, I2, I3, I4, I5, + O0, O1, O2, O3, O4, O5, O7, + + // FIXME: G1 reserved for now for large imm generation by frame code. + G1, + // Non-allocatable regs: + G2, G3, G4, // FIXME: OK for use only in + // applications, not libraries. + O6, // stack ptr + I6, // frame ptr + I7, // return address + G0, // constant zero + G5, G6, G7 // reserved for kernel + ]> { + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + IntRegsClass::iterator + IntRegsClass::allocation_order_end(const MachineFunction &MF) const { + // FIXME: These special regs should be taken out of the regclass! + return end()-10 // Don't allocate special registers + -1; // FIXME: G1 reserved for large imm generation by frame code. + } + }]; +} + +def FPRegs : RegisterClass<"SP", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8, + F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22, + F23, F24, F25, F26, F27, F28, F29, F30, F31]>; + +def DFPRegs : RegisterClass<"SP", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15]>; diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp new file mode 100644 index 0000000..9940fcf --- /dev/null +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -0,0 +1,43 @@ +//===- SparcSubtarget.cpp - SPARC Subtarget Information -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SPARC specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#include "SparcSubtarget.h" +#include "SparcGenSubtarget.inc" +using namespace llvm; + +// FIXME: temporary. +#include "llvm/Support/CommandLine.h" +namespace { + cl::opt<bool> EnableV9("enable-sparc-v9-insts", cl::Hidden, + cl::desc("Enable V9 instructions in the V8 target")); +} + +SparcSubtarget::SparcSubtarget(const Module &M, const std::string &FS) { + // Set the default features. + IsV9 = false; + V8DeprecatedInsts = false; + IsVIS = false; + + // Determine default and user specified characteristics + std::string CPU = "generic"; + + // FIXME: autodetect host here! + CPU = "v9"; // What is a good way to detect V9? + + // Parse features string. + ParseSubtargetFeatures(FS, CPU); + + // Unless explicitly enabled, disable the V9 instructions. + if (!EnableV9) + IsV9 = false; +} diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h new file mode 100644 index 0000000..59da95a --- /dev/null +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -0,0 +1,42 @@ +//=====-- SparcSubtarget.h - Define Subtarget for the SPARC ----*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the SPARC specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#ifndef SPARC_SUBTARGET_H +#define SPARC_SUBTARGET_H + +#include "llvm/Target/TargetSubtarget.h" +#include <string> + +namespace llvm { + class Module; + +class SparcSubtarget : public TargetSubtarget { + bool IsV9; + bool V8DeprecatedInsts; + bool IsVIS; +public: + SparcSubtarget(const Module &M, const std::string &FS); + + bool isV9() const { return IsV9; } + bool isVIS() const { return IsVIS; } + bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/Sparc/SparcTargetAsmInfo.cpp b/lib/Target/Sparc/SparcTargetAsmInfo.cpp new file mode 100644 index 0000000..01f7f11 --- /dev/null +++ b/lib/Target/Sparc/SparcTargetAsmInfo.cpp @@ -0,0 +1,25 @@ +//===-- SparcTargetAsmInfo.cpp - Sparc asm properties -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the SparcTargetAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "SparcTargetAsmInfo.h" + +using namespace llvm; + +SparcTargetAsmInfo::SparcTargetAsmInfo(const SparcTargetMachine &TM) { + Data16bitsDirective = "\t.half\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = 0; // .xword is only supported by V9. 
+ ZeroDirective = "\t.skip\t"; + CommentString = "!"; + ConstantPoolSection = "\t.section \".rodata\",#alloc\n"; +} diff --git a/lib/Target/Sparc/SparcTargetAsmInfo.h b/lib/Target/Sparc/SparcTargetAsmInfo.h new file mode 100644 index 0000000..6b2dc59 --- /dev/null +++ b/lib/Target/Sparc/SparcTargetAsmInfo.h @@ -0,0 +1,31 @@ +//=====-- SparcTargetAsmInfo.h - Sparc asm properties ---------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the SparcTargetAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef SPARCTARGETASMINFO_H +#define SPARCTARGETASMINFO_H + +#include "llvm/Target/TargetAsmInfo.h" + +namespace llvm { + + // Forward declaration. + class SparcTargetMachine; + + struct SparcTargetAsmInfo : public TargetAsmInfo { + SparcTargetAsmInfo(const SparcTargetMachine &TM); + }; + + +} // namespace llvm + +#endif diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp new file mode 100644 index 0000000..e0206d8 --- /dev/null +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -0,0 +1,83 @@ +//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "SparcTargetAsmInfo.h" +#include "SparcTargetMachine.h" +#include "Sparc.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/Target/TargetMachineRegistry.h" +using namespace llvm; + +namespace { + // Register the target. + RegisterTarget<SparcTargetMachine> X("sparc", " SPARC"); +} + +const TargetAsmInfo *SparcTargetMachine::createTargetAsmInfo() const { + return new SparcTargetAsmInfo(*this); +} + +/// SparcTargetMachine ctor - Create an ILP32 architecture model +/// +SparcTargetMachine::SparcTargetMachine(const Module &M, const std::string &FS) + : DataLayout("E-p:32:32"), + Subtarget(M, FS), InstrInfo(Subtarget), + FrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0) { +} + +unsigned SparcTargetMachine::getModuleMatchQuality(const Module &M) { + std::string TT = M.getTargetTriple(); + if (TT.size() >= 6 && std::string(TT.begin(), TT.begin()+6) == "sparc-") + return 20; + + // If the target triple is something non-sparc, we don't match. + if (!TT.empty()) return 0; + + if (M.getEndianness() == Module::BigEndian && + M.getPointerSize() == Module::Pointer32) +#ifdef __sparc__ + return 20; // BE/32 ==> Prefer sparc on sparc +#else + return 5; // BE/32 ==> Prefer ppc elsewhere +#endif + else if (M.getEndianness() != Module::AnyEndianness || + M.getPointerSize() != Module::AnyPointerSize) + return 0; // Match for some other target + +#if defined(__sparc__) + return 10; +#else + return 0; +#endif +} + +bool SparcTargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) { + PM.add(createSparcISelDag(*this)); + return false; +} + +/// addPreEmitPass - This pass may be implemented by targets that want to run +/// passes immediately before machine code is emitted. This should return +/// true if -print-machineinstrs should print out the code after the passes. 
+bool SparcTargetMachine::addPreEmitPass(FunctionPassManager &PM, bool Fast) { + PM.add(createSparcFPMoverPass(*this)); + PM.add(createSparcDelaySlotFillerPass(*this)); + return true; +} + +bool SparcTargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out) { + // Output assembly language. + PM.add(createSparcCodePrinterPass(Out, *this)); + return false; +} diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h new file mode 100644 index 0000000..cec0a44 --- /dev/null +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -0,0 +1,57 @@ +//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Sparc specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef SPARCTARGETMACHINE_H +#define SPARCTARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "SparcInstrInfo.h" +#include "SparcSubtarget.h" + +namespace llvm { + +class Module; + +class SparcTargetMachine : public LLVMTargetMachine { + const TargetData DataLayout; // Calculates type size & alignment + SparcSubtarget Subtarget; + SparcInstrInfo InstrInfo; + TargetFrameInfo FrameInfo; + +protected: + virtual const TargetAsmInfo *createTargetAsmInfo() const; + +public: + SparcTargetMachine(const Module &M, const std::string &FS); + + virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; } + virtual const TargetSubtarget *getSubtargetImpl() const{ return &Subtarget; } + virtual const MRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + virtual const TargetData *getTargetData() const { return &DataLayout; } + static unsigned getModuleMatchQuality(const Module &M); + + // Pass Pipeline Configuration + virtual bool addInstSelector(FunctionPassManager &PM, bool Fast); + virtual bool addPreEmitPass(FunctionPassManager &PM, bool Fast); + virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp new file mode 100644 index 0000000..d783f8b --- /dev/null +++ b/lib/Target/SubtargetFeature.cpp @@ -0,0 +1,357 @@ +//===- SubtargetFeature.cpp - CPU characteristics Implementation ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the SubtargetFeature interface. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/SubtargetFeature.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Streams.h" +#include <algorithm> +#include <ostream> +#include <cassert> +#include <cctype> +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Static Helper Functions +//===----------------------------------------------------------------------===// + +/// hasFlag - Determine if a feature has a flag; '+' or '-' +/// +static inline bool hasFlag(const std::string &Feature) { + assert(!Feature.empty() && "Empty string"); + // Get first character + char Ch = Feature[0]; + // Check if first character is '+' or '-' flag + return Ch == '+' || Ch =='-'; +} + +/// StripFlag - Return string stripped of flag. +/// +static inline std::string StripFlag(const std::string &Feature) { + return hasFlag(Feature) ? Feature.substr(1) : Feature; +} + +/// isEnabled - Return true if enable flag; '+'. +/// +static inline bool isEnabled(const std::string &Feature) { + assert(!Feature.empty() && "Empty string"); + // Get first character + char Ch = Feature[0]; + // Check if first character is '+' for enabled + return Ch == '+'; +} + +/// PrependFlag - Return a string with a prepended flag; '+' or '-'. +/// +static inline std::string PrependFlag(const std::string &Feature, + bool IsEnabled) { + assert(!Feature.empty() && "Empty string"); + if (hasFlag(Feature)) return Feature; + return std::string(IsEnabled ? "+" : "-") + Feature; +} + +/// Split - Splits a string of comma separated items in to a vector of strings. +/// +static void Split(std::vector<std::string> &V, const std::string &S) { + // Start at beginning of string. 
+ size_t Pos = 0; + while (true) { + // Find the next comma + size_t Comma = S.find(',', Pos); + // If no comma found then the the rest of the string is used + if (Comma == std::string::npos) { + // Add string to vector + V.push_back(S.substr(Pos)); + break; + } + // Otherwise add substring to vector + V.push_back(S.substr(Pos, Comma - Pos)); + // Advance to next item + Pos = Comma + 1; + } +} + +/// Join a vector of strings to a string with a comma separating each element. +/// +static std::string Join(const std::vector<std::string> &V) { + // Start with empty string. + std::string Result; + // If the vector is not empty + if (!V.empty()) { + // Start with the CPU feature + Result = V[0]; + // For each successive feature + for (size_t i = 1; i < V.size(); i++) { + // Add a comma + Result += ","; + // Add the feature + Result += V[i]; + } + } + // Return the features string + return Result; +} + +/// Adding features. +void SubtargetFeatures::AddFeature(const std::string &String, + bool IsEnabled) { + // Don't add empty features + if (!String.empty()) { + // Convert to lowercase, prepend flag and add to vector + Features.push_back(PrependFlag(LowercaseString(String), IsEnabled)); + } +} + +/// Find KV in array using binary search. +template<typename T> const T *Find(const std::string &S, const T *A, size_t L) { + // Make the lower bound element we're looking for + T KV; + KV.Key = S.c_str(); + // Determine the end of the array + const T *Hi = A + L; + // Binary search the array + const T *F = std::lower_bound(A, Hi, KV); + // If not found then return NULL + if (F == Hi || std::string(F->Key) != S) return NULL; + // Return the found array item + return F; +} + +/// getLongestEntryLength - Return the length of the longest entry in the table. 
+/// +static size_t getLongestEntryLength(const SubtargetFeatureKV *Table, + size_t Size) { + size_t MaxLen = 0; + for (size_t i = 0; i < Size; i++) + MaxLen = std::max(MaxLen, std::strlen(Table[i].Key)); + return MaxLen; +} + +/// Display help for feature choices. +/// +static void Help(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize, + const SubtargetFeatureKV *FeatTable, size_t FeatTableSize) { + // Determine the length of the longest CPU and Feature entries. + unsigned MaxCPULen = getLongestEntryLength(CPUTable, CPUTableSize); + unsigned MaxFeatLen = getLongestEntryLength(FeatTable, FeatTableSize); + + // Print the CPU table. + cerr << "Available CPUs for this target:\n\n"; + for (size_t i = 0; i != CPUTableSize; i++) + cerr << " " << CPUTable[i].Key + << std::string(MaxCPULen - std::strlen(CPUTable[i].Key), ' ') + << " - " << CPUTable[i].Desc << ".\n"; + cerr << "\n"; + + // Print the Feature table. + cerr << "Available features for this target:\n\n"; + for (size_t i = 0; i != FeatTableSize; i++) + cerr << " " << FeatTable[i].Key + << std::string(MaxFeatLen - std::strlen(FeatTable[i].Key), ' ') + << " - " << FeatTable[i].Desc << ".\n"; + cerr << "\n"; + + cerr << "Use +feature to enable a feature, or -feature to disable it.\n" + << "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n"; + exit(1); +} + +//===----------------------------------------------------------------------===// +// SubtargetFeatures Implementation +//===----------------------------------------------------------------------===// + +SubtargetFeatures::SubtargetFeatures(const std::string &Initial) { + // Break up string into separate features + Split(Features, Initial); +} + + +std::string SubtargetFeatures::getString() const { + return Join(Features); +} +void SubtargetFeatures::setString(const std::string &Initial) { + // Throw out old features + Features.clear(); + // Break up string into separate features + Split(Features, LowercaseString(Initial)); +} + + +/// setCPU - Set 
the CPU string. Replaces previous setting. Setting to "" +/// clears CPU. +void SubtargetFeatures::setCPU(const std::string &String) { + Features[0] = LowercaseString(String); +} + + +/// setCPUIfNone - Setting CPU string only if no string is set. +/// +void SubtargetFeatures::setCPUIfNone(const std::string &String) { + if (Features[0].empty()) setCPU(String); +} + +/// SetImpliedBits - For each feature that is (transitively) implied by this +/// feature, set it. +/// +static +void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, + const SubtargetFeatureKV *FeatureTable, + size_t FeatureTableSize) { + for (size_t i = 0; i < FeatureTableSize; ++i) { + const SubtargetFeatureKV &FE = FeatureTable[i]; + + if (FeatureEntry->Value == FE.Value) continue; + + if (FeatureEntry->Implies & FE.Value) { + Bits |= FE.Value; + SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize); + } + } +} + +/// ClearImpliedBits - For each feature that (transitively) implies this +/// feature, clear it. +/// +static +void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry, + const SubtargetFeatureKV *FeatureTable, + size_t FeatureTableSize) { + for (size_t i = 0; i < FeatureTableSize; ++i) { + const SubtargetFeatureKV &FE = FeatureTable[i]; + + if (FeatureEntry->Value == FE.Value) continue; + + if (FE.Implies & FeatureEntry->Value) { + Bits &= ~FE.Value; + ClearImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize); + } + } +} + +/// getBits - Get feature bits. 
+/// +uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable, + size_t CPUTableSize, + const SubtargetFeatureKV *FeatureTable, + size_t FeatureTableSize) { + assert(CPUTable && "missing CPU table"); + assert(FeatureTable && "missing features table"); +#ifndef NDEBUG + for (size_t i = 1; i < CPUTableSize; i++) { + assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 && + "CPU table is not sorted"); + } + for (size_t i = 1; i < FeatureTableSize; i++) { + assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 && + "CPU features table is not sorted"); + } +#endif + uint32_t Bits = 0; // Resulting bits + + // Check if help is needed + if (Features[0] == "help") + Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); + + // Find CPU entry + const SubtargetFeatureKV *CPUEntry = + Find(Features[0], CPUTable, CPUTableSize); + // If there is a match + if (CPUEntry) { + // Set base feature bits + Bits = CPUEntry->Value; + + // Set the feature implied by this CPU feature, if any. + for (size_t i = 0; i < FeatureTableSize; ++i) { + const SubtargetFeatureKV &FE = FeatureTable[i]; + if (CPUEntry->Value & FE.Value) + SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize); + } + } else { + cerr << "'" << Features[0] + << "' is not a recognized processor for this target" + << " (ignoring processor)" + << "\n"; + } + // Iterate through each feature + for (size_t i = 1; i < Features.size(); i++) { + const std::string &Feature = Features[i]; + + // Check for help + if (Feature == "+help") + Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize); + + // Find feature in table. + const SubtargetFeatureKV *FeatureEntry = + Find(StripFlag(Feature), FeatureTable, FeatureTableSize); + // If there is a match + if (FeatureEntry) { + // Enable/disable feature in bits + if (isEnabled(Feature)) { + Bits |= FeatureEntry->Value; + + // For each feature that this implies, set it. 
+ SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize); + } else { + Bits &= ~FeatureEntry->Value; + + // For each feature that implies this, clear it. + ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize); + } + } else { + cerr << "'" << Feature + << "' is not a recognized feature for this target" + << " (ignoring feature)" + << "\n"; + } + } + + return Bits; +} + +/// Get info pointer +void *SubtargetFeatures::getInfo(const SubtargetInfoKV *Table, + size_t TableSize) { + assert(Table && "missing table"); +#ifndef NDEBUG + for (size_t i = 1; i < TableSize; i++) { + assert(strcmp(Table[i - 1].Key, Table[i].Key) < 0 && "Table is not sorted"); + } +#endif + + // Find entry + const SubtargetInfoKV *Entry = Find(Features[0], Table, TableSize); + + if (Entry) { + return Entry->Value; + } else { + cerr << "'" << Features[0] + << "' is not a recognized processor for this target" + << " (ignoring processor)" + << "\n"; + return NULL; + } +} + +/// print - Print feature string. +/// +void SubtargetFeatures::print(std::ostream &OS) const { + for (size_t i = 0; i < Features.size(); i++) { + OS << Features[i] << " "; + } + OS << "\n"; +} + +/// dump - Dump feature info. +/// +void SubtargetFeatures::dump() const { + print(*cerr.stream()); +} diff --git a/lib/Target/Target.td b/lib/Target/Target.td new file mode 100644 index 0000000..48af218 --- /dev/null +++ b/lib/Target/Target.td @@ -0,0 +1,413 @@ +//===- Target.td - Target Independent TableGen interface ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the target-independent interfaces which should be +// implemented by each target which is using a TableGen based code generator. 
+// +//===----------------------------------------------------------------------===// + +// Include all information about LLVM intrinsics. +include "llvm/Intrinsics.td" + +//===----------------------------------------------------------------------===// +// Register file description - These classes are used to fill in the target +// description classes. + +class RegisterClass; // Forward def + +// Register - You should define one instance of this class for each register +// in the target machine. String n will become the "name" of the register. +class Register<string n> { + string Namespace = ""; + string Name = n; + + // SpillSize - If this value is set to a non-zero value, it is the size in + // bits of the spill slot required to hold this register. If this value is + // set to zero, the information is inferred from any register classes the + // register belongs to. + int SpillSize = 0; + + // SpillAlignment - This value is used to specify the alignment required for + // spilling the register. Like SpillSize, this should only be explicitly + // specified if the register is not in a register class. + int SpillAlignment = 0; + + // Aliases - A list of registers that this register overlaps with. A read or + // modification of this register can potentially read or modify the aliased + // registers. + list<Register> Aliases = []; + + // SubRegs - A list of registers that are parts of this register. Note these + // are "immediate" sub-registers and the registers within the list do not + // themselves overlap. e.g. For X86, EAX's SubRegs list contains only [AX], + // not [AX, AH, AL]. + list<Register> SubRegs = []; + + // DwarfNumber - Number used internally by gcc/gdb to identify the register. + // These values can be determined by locating the <target>.h file in the + // directory llvmgcc/gcc/config/<target>/ and looking for REGISTER_NAMES. The + // order of these names correspond to the enumeration used by gcc. 
A value of + // -1 indicates that the gcc number is undefined. + int DwarfNumber = -1; +} + +// RegisterWithSubRegs - This can be used to define instances of Register which +// need to specify sub-registers. +// List "subregs" specifies which registers are sub-registers to this one. This +// is used to populate the SubRegs and AliasSet fields of TargetRegisterDesc. +// This allows the code generator to be careful not to put two values with +// overlapping live ranges into registers which alias. +class RegisterWithSubRegs<string n, list<Register> subregs> : Register<n> { + let SubRegs = subregs; +} + +// SubRegSet - This can be used to define a specific mapping of registers to +// indices, for use as named subregs of a particular physical register. Each +// register in 'subregs' becomes an addressable subregister at index 'n' of the +// corresponding register in 'regs'. +class SubRegSet<int n, list<Register> regs, list<Register> subregs> { + int index = n; + + list<Register> From = regs; + list<Register> To = subregs; +} + +// RegisterClass - Now that all of the registers are defined, and aliases +// between registers are defined, specify which registers belong to which +// register classes. This also defines the default allocation order of +// registers by register allocators. +// +class RegisterClass<string namespace, list<ValueType> regTypes, int alignment, + list<Register> regList> { + string Namespace = namespace; + + // RegType - Specify the list ValueType of the registers in this register + // class. Note that all registers in a register class must have the same + // ValueTypes. This is a list because some targets permit storing different + // types in same register, for example vector values with 128-bit total size, + // but different count/size of items, like SSE on x86. + // + list<ValueType> RegTypes = regTypes; + + // Size - Specify the spill size in bits of the registers. A default value of + // zero lets tablgen pick an appropriate size. 
+ int Size = 0; + + // Alignment - Specify the alignment required of the registers when they are + // stored or loaded to memory. + // + int Alignment = alignment; + + // MemberList - Specify which registers are in this class. If the + // allocation_order_* method are not specified, this also defines the order of + // allocation used by the register allocator. + // + list<Register> MemberList = regList; + + // SubClassList - Specify which register classes correspond to subregisters + // of this class. The order should be by subregister set index. + list<RegisterClass> SubRegClassList = []; + + // MethodProtos/MethodBodies - These members can be used to insert arbitrary + // code into a generated register class. The normal usage of this is to + // overload virtual methods. + code MethodProtos = [{}]; + code MethodBodies = [{}]; +} + + +//===----------------------------------------------------------------------===// +// DwarfRegNum - This class provides a mapping of the llvm register enumeration +// to the register numbering used by gcc and gdb. These values are used by a +// debug information writer (ex. DwarfWriter) to describe where values may be +// located during execution. +class DwarfRegNum<int N> { + // DwarfNumber - Number used internally by gcc/gdb to identify the register. + // These values can be determined by locating the <target>.h file in the + // directory llvmgcc/gcc/config/<target>/ and looking for REGISTER_NAMES. The + // order of these names correspond to the enumeration used by gcc. A value of + // -1 indicates that the gcc number is undefined. 
+ int DwarfNumber = N; +} + +//===----------------------------------------------------------------------===// +// Pull in the common support for scheduling +// +include "TargetSchedule.td" + +class Predicate; // Forward def + +//===----------------------------------------------------------------------===// +// Instruction set description - These classes correspond to the C++ classes in +// the Target/TargetInstrInfo.h file. +// +class Instruction { + string Name = ""; // The opcode string for this instruction + string Namespace = ""; + + dag OperandList; // An dag containing the MI operand list. + string AsmString = ""; // The .s format to print the instruction with. + + // Pattern - Set to the DAG pattern for this instruction, if we know of one, + // otherwise, uninitialized. + list<dag> Pattern; + + // The follow state will eventually be inferred automatically from the + // instruction pattern. + + list<Register> Uses = []; // Default to using no non-operand registers + list<Register> Defs = []; // Default to modifying no non-operand registers + + // Predicates - List of predicates which will be turned into isel matching + // code. + list<Predicate> Predicates = []; + + // Code size. + int CodeSize = 0; + + // Added complexity passed onto matching pattern. + int AddedComplexity = 0; + + // These bits capture information about the high-level semantics of the + // instruction. + bit isReturn = 0; // Is this instruction a return instruction? + bit isBranch = 0; // Is this instruction a branch instruction? + bit isBarrier = 0; // Can control flow fall through this instruction? + bit isCall = 0; // Is this instruction a call instruction? + bit isLoad = 0; // Is this instruction a load instruction? + bit isStore = 0; // Is this instruction a store instruction? + bit isTwoAddress = 0; // Is this a two address instruction? + bit isConvertibleToThreeAddress = 0; // Can this 2-addr instruction promote? + bit isCommutable = 0; // Is this 3 operand instruction commutable? 
+ bit isTerminator = 0; // Is this part of the terminator for a basic block? + bit isReMaterializable = 0; // Is this instruction re-materializable? + bit isPredicable = 0; // Is this instruction predicable? + bit hasDelaySlot = 0; // Does this instruction have an delay slot? + bit usesCustomDAGSchedInserter = 0; // Pseudo instr needing special help. + bit hasCtrlDep = 0; // Does this instruction r/w ctrl-flow chains? + bit noResults = 0; // Does this instruction produce no results? + bit isNotDuplicable = 0; // Is it unsafe to duplicate this instruction? + + InstrItinClass Itinerary = NoItinerary;// Execution steps used for scheduling. + + string Constraints = ""; // OperandConstraint, e.g. $src = $dst. + + /// DisableEncoding - List of operand names (e.g. "$op1,$op2") that should not + /// be encoded into the output machineinstr. + string DisableEncoding = ""; +} + +/// Imp - Helper class for specifying the implicit uses/defs set for an +/// instruction. +class Imp<list<Register> uses, list<Register> defs> { + list<Register> Uses = uses; + list<Register> Defs = defs; +} + +/// Predicates - These are extra conditionals which are turned into instruction +/// selector matching code. Currently each predicate is just a string. +class Predicate<string cond> { + string CondString = cond; +} + +/// NoHonorSignDependentRounding - This predicate is true if support for +/// sign-dependent-rounding is not enabled. +def NoHonorSignDependentRounding + : Predicate<"!HonorSignDependentRoundingFPMath()">; + +class Requires<list<Predicate> preds> { + list<Predicate> Predicates = preds; +} + +/// ops definition - This is just a simple marker used to identify the operands +/// list for an instruction. This should be used like this: +/// (ops R32:$dst, R32:$src) or something similar. +def ops; + +/// variable_ops definition - Mark this instruction as taking a variable number +/// of operands. 
+def variable_ops; + +/// ptr_rc definition - Mark this operand as being a pointer value whose +/// register class is resolved dynamically via a callback to TargetInstrInfo. +/// FIXME: We should probably change this to a class which contain a list of +/// flags. But currently we have but one flag. +def ptr_rc; + +/// Operand Types - These provide the built-in operand types that may be used +/// by a target. Targets can optionally provide their own operand types as +/// needed, though this should not be needed for RISC targets. +class Operand<ValueType ty> { + ValueType Type = ty; + string PrintMethod = "printOperand"; + dag MIOperandInfo = (ops); +} + +def i1imm : Operand<i1>; +def i8imm : Operand<i8>; +def i16imm : Operand<i16>; +def i32imm : Operand<i32>; +def i64imm : Operand<i64>; + +/// zero_reg definition - Special node to stand for the zero register. +/// +def zero_reg; + +/// PredicateOperand - This can be used to define a predicate operand for an +/// instruction. OpTypes specifies the MIOperandInfo for the operand, and +/// AlwaysVal specifies the value of this predicate when set to "always +/// execute". +class PredicateOperand<ValueType ty, dag OpTypes, dag AlwaysVal> + : Operand<ty> { + let MIOperandInfo = OpTypes; + dag DefaultOps = AlwaysVal; +} + +/// OptionalDefOperand - This is used to define a optional definition operand +/// for an instruction. DefaultOps is the register the operand represents if none +/// is supplied, e.g. zero_reg. +class OptionalDefOperand<ValueType ty, dag OpTypes, dag defaultops> + : Operand<ty> { + let MIOperandInfo = OpTypes; + dag DefaultOps = defaultops; +} + + +// InstrInfo - This class should only be instantiated once to provide parameters +// which are global to the the target machine. 
+// +class InstrInfo { + // If the target wants to associate some target-specific information with each + // instruction, it should provide these two lists to indicate how to assemble + // the target specific information into the 32 bits available. + // + list<string> TSFlagsFields = []; + list<int> TSFlagsShifts = []; + + // Target can specify its instructions in either big or little-endian formats. + // For instance, while both Sparc and PowerPC are big-endian platforms, the + // Sparc manual specifies its instructions in the format [31..0] (big), while + // PowerPC specifies them using the format [0..31] (little). + bit isLittleEndianEncoding = 0; +} + +// Standard Instructions. +def PHI : Instruction { + let OperandList = (ops variable_ops); + let AsmString = "PHINODE"; + let Namespace = "TargetInstrInfo"; +} +def INLINEASM : Instruction { + let OperandList = (ops variable_ops); + let AsmString = ""; + let Namespace = "TargetInstrInfo"; +} +def LABEL : Instruction { + let OperandList = (ops i32imm:$id); + let AsmString = ""; + let Namespace = "TargetInstrInfo"; + let hasCtrlDep = 1; +} + +//===----------------------------------------------------------------------===// +// AsmWriter - This class can be implemented by targets that need to customize +// the format of the .s file writer. +// +// Subtargets can have multiple different asmwriters (e.g. AT&T vs Intel syntax +// on X86 for example). +// +class AsmWriter { + // AsmWriterClassName - This specifies the suffix to use for the asmwriter + // class. Generated AsmWriter classes are always prefixed with the target + // name. + string AsmWriterClassName = "AsmPrinter"; + + // InstFormatName - AsmWriters can specify the name of the format string to + // print instructions with. + string InstFormatName = "AsmString"; + + // Variant - AsmWriters can be of multiple different variants. 
Variants are + // used to support targets that need to emit assembly code in ways that are + // mostly the same for different targets, but have minor differences in + // syntax. If the asmstring contains {|} characters in them, this integer + // will specify which alternative to use. For example "{x|y|z}" with Variant + // == 1, will expand to "y". + int Variant = 0; +} +def DefaultAsmWriter : AsmWriter; + + +//===----------------------------------------------------------------------===// +// Target - This class contains the "global" target information +// +class Target { + // InstructionSet - Instruction set description for this target. + InstrInfo InstructionSet; + + // AssemblyWriters - The AsmWriter instances available for this target. + list<AsmWriter> AssemblyWriters = [DefaultAsmWriter]; +} + +//===----------------------------------------------------------------------===// +// SubtargetFeature - A characteristic of the chip set. +// +class SubtargetFeature<string n, string a, string v, string d, + list<SubtargetFeature> i = []> { + // Name - Feature name. Used by command line (-mattr=) to determine the + // appropriate target chip. + // + string Name = n; + + // Attribute - Attribute to be set by feature. + // + string Attribute = a; + + // Value - Value the attribute to be set to by feature. + // + string Value = v; + + // Desc - Feature description. Used by command line (-mattr=) to display help + // information. + // + string Desc = d; + + // Implies - Features that this feature implies are present. If one of those + // features isn't set, then this one shouldn't be set either. + // + list<SubtargetFeature> Implies = i; +} + +//===----------------------------------------------------------------------===// +// Processor chip sets - These values represent each of the chip sets supported +// by the scheduler. Each Processor definition requires corresponding +// instruction itineraries. 
+// +class Processor<string n, ProcessorItineraries pi, list<SubtargetFeature> f> { + // Name - Chip set name. Used by command line (-mcpu=) to determine the + // appropriate target chip. + // + string Name = n; + + // ProcItin - The scheduling information for the target processor. + // + ProcessorItineraries ProcItin = pi; + + // Features - list of + list<SubtargetFeature> Features = f; +} + +//===----------------------------------------------------------------------===// +// Pull in the common support for calling conventions. +// +include "TargetCallingConv.td" + +//===----------------------------------------------------------------------===// +// Pull in the common support for DAG isel generation. +// +include "TargetSelectionDAG.td" diff --git a/lib/Target/TargetAsmInfo.cpp b/lib/Target/TargetAsmInfo.cpp new file mode 100644 index 0000000..df7a2ec --- /dev/null +++ b/lib/Target/TargetAsmInfo.cpp @@ -0,0 +1,131 @@ +//===-- TargetAsmInfo.cpp - Asm Info ---------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines target asm properties related what form asm statements +// should take. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetAsmInfo.h" +#include <cctype> +#include <cstring> + +using namespace llvm; + +TargetAsmInfo::TargetAsmInfo() : + TextSection("\t.text"), + DataSection("\t.data"), + BSSSection("\t.bss"), + TLSDataSection("\t.section .tdata,\"awT\",@progbits"), + TLSBSSSection("\t.section .tbss,\"awT\",@nobits"), + ZeroFillDirective(0), + AddressSize(4), + NeedsSet(false), + MaxInstLength(4), + PCSymbol("$"), + SeparatorChar(';'), + CommentString("#"), + GlobalPrefix(""), + PrivateGlobalPrefix("."), + JumpTableSpecialLabelPrefix(0), + GlobalVarAddrPrefix(""), + GlobalVarAddrSuffix(""), + FunctionAddrPrefix(""), + FunctionAddrSuffix(""), + InlineAsmStart("#APP"), + InlineAsmEnd("#NO_APP"), + AssemblerDialect(0), + ZeroDirective("\t.zero\t"), + ZeroDirectiveSuffix(0), + AsciiDirective("\t.ascii\t"), + AscizDirective("\t.asciz\t"), + Data8bitsDirective("\t.byte\t"), + Data16bitsDirective("\t.short\t"), + Data32bitsDirective("\t.long\t"), + Data64bitsDirective("\t.quad\t"), + AlignDirective("\t.align\t"), + AlignmentIsInBytes(true), + SwitchToSectionDirective("\t.section\t"), + TextSectionStartSuffix(""), + DataSectionStartSuffix(""), + SectionEndDirectiveSuffix(0), + ConstantPoolSection("\t.section .rodata"), + JumpTableDataSection("\t.section .rodata"), + JumpTableDirective(0), + CStringSection(0), + StaticCtorsSection("\t.section .ctors,\"aw\",@progbits"), + StaticDtorsSection("\t.section .dtors,\"aw\",@progbits"), + FourByteConstantSection(0), + EightByteConstantSection(0), + SixteenByteConstantSection(0), + ReadOnlySection(0), + GlobalDirective(0), + SetDirective(0), + LCOMMDirective(0), + COMMDirective("\t.comm\t"), + COMMDirectiveTakesAlignment(true), + HasDotTypeDotSizeDirective(true), + UsedDirective(0), + WeakRefDirective(0), + HiddenDirective("\t.hidden\t"), + ProtectedDirective("\t.protected\t"), + AbsoluteDebugSectionOffsets(false), + 
AbsoluteEHSectionOffsets(false), + HasLEB128(false), + HasDotLoc(false), + HasDotFile(false), + SupportsDebugInformation(false), + SupportsExceptionHandling(false), + DwarfRequiresFrameSection(true), + DwarfSectionOffsetDirective(0), + DwarfAbbrevSection(".debug_abbrev"), + DwarfInfoSection(".debug_info"), + DwarfLineSection(".debug_line"), + DwarfFrameSection(".debug_frame"), + DwarfPubNamesSection(".debug_pubnames"), + DwarfPubTypesSection(".debug_pubtypes"), + DwarfStrSection(".debug_str"), + DwarfLocSection(".debug_loc"), + DwarfARangesSection(".debug_aranges"), + DwarfRangesSection(".debug_ranges"), + DwarfMacInfoSection(".debug_macinfo"), + DwarfEHFrameSection(".eh_frame"), + DwarfExceptionSection(".gcc_except_table"), + AsmTransCBE(0) { +} + +TargetAsmInfo::~TargetAsmInfo() { +} + +/// Measure the specified inline asm to determine an approximation of its +/// length. +/// Comments (which run till the next SeparatorChar or newline) do not +/// count as an instruction. +/// Any other non-whitespace text is considered an instruction, with +/// multiple instructions separated by SeparatorChar or newlines. +/// Variable-length instructions are not handled here; this function +/// may be overloaded in the target code to do that. +unsigned TargetAsmInfo::getInlineAsmLength(const char *Str) const { + // Count the number of instructions in the asm. 
+ bool atInsnStart = true; + unsigned Length = 0; + for (; *Str; ++Str) { + if (*Str == '\n' || *Str == SeparatorChar) + atInsnStart = true; + if (atInsnStart && !isspace(*Str)) { + Length += MaxInstLength; + atInsnStart = false; + } + if (atInsnStart && strncmp(Str, CommentString, strlen(CommentString))==0) + atInsnStart = false; + } + + return Length; +} + diff --git a/lib/Target/TargetCallingConv.td b/lib/Target/TargetCallingConv.td new file mode 100644 index 0000000..9419320 --- /dev/null +++ b/lib/Target/TargetCallingConv.td @@ -0,0 +1,88 @@ +//===- TargetCallingConv.td - Target Calling Conventions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the target-independent interfaces with which targets +// describe their calling conventions. +// +//===----------------------------------------------------------------------===// + +class CCAction; +class CallingConv; + +/// CCPredicateAction - Instances of this class check some predicate, then +/// delegate to another action if the predicate is true. +class CCPredicateAction<CCAction A> : CCAction { + CCAction SubAction = A; +} + +/// CCIfType - If the current argument is one of the specified types, apply +/// Action A. +class CCIfType<list<ValueType> vts, CCAction A> : CCPredicateAction<A> { + list<ValueType> VTs = vts; +} + +/// CCIf - If the predicate matches, apply A. +class CCIf<string predicate, CCAction A> : CCPredicateAction<A> { + string Predicate = predicate; +} + +/// CCIfStruct - If the current argument is a struct, apply +/// Action A. +class CCIfStruct<CCAction A> : CCIf<"ArgFlags & ISD::ParamFlags::ByVal", A> { +} + +/// CCIfCC - Match of the current calling convention is 'CC'. 
+class CCIfCC<string CC, CCAction A> + : CCIf<!strconcat("State.getCallingConv() == ", CC), A> {} + +/// CCIfInReg - If this argument is marked with the 'inreg' attribute, apply +/// the specified action. +class CCIfInReg<CCAction A> : CCIf<"ArgFlags & ISD::ParamFlags::InReg", A> {} + +/// CCIfNotVarArg - If the current function is not vararg - apply the action +class CCIfNotVarArg<CCAction A> : CCIf<"!State.isVarArg()", A> {} + +/// CCAssignToReg - This action matches if there is a register in the specified +/// list that is still available. If so, it assigns the value to the first +/// available register and succeeds. +class CCAssignToReg<list<Register> regList> : CCAction { + list<Register> RegList = regList; +} + +/// CCAssignToStack - This action always matches: it assigns the value to a +/// stack slot of the specified size and alignment on the stack. +class CCAssignToStack<int size, int align> : CCAction { + int Size = size; + int Align = align; +} + +/// CCStructAssign - This action always matches: it will use the C ABI and +/// the register availability to decide whether to assign to a set of +/// registers or to a stack slot. +class CCStructAssign<list<Register> regList> : CCAction { + list<Register> RegList = regList; +} + +/// CCPromoteToType - If applied, this promotes the specified current value to +/// the specified type. +class CCPromoteToType<ValueType destTy> : CCAction { + ValueType DestTy = destTy; +} + +/// CCDelegateTo - This action invokes the specified sub-calling-convention. It +/// is successful if the specified CC matches. +class CCDelegateTo<CallingConv cc> : CCAction { + CallingConv CC = cc; +} + +/// CallingConv - An instance of this is used to define each calling convention +/// that the target supports.
+class CallingConv<list<CCAction> actions> { + list<CCAction> Actions = actions; +} diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp new file mode 100644 index 0000000..301e8c1 --- /dev/null +++ b/lib/Target/TargetData.cpp @@ -0,0 +1,595 @@ +//===-- TargetData.cpp - Data size & alignment routines --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines target properties related to datatype size/offset/alignment +// information. +// +// This structure should be created once, filled in if the defaults are not +// correct and then passed around by const&. None of the member functions +// require modification to the object. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetData.h" +#include "llvm/Module.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Constants.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringExtras.h" +#include <algorithm> +#include <cstdlib> +#include <sstream> +using namespace llvm; + +// Handle the Pass registration stuff necessary to use TargetData's. +namespace { + // Register the TargetData pass with the pass registry...
+ RegisterPass<TargetData> X("targetdata", "Target Data Layout"); +} +char TargetData::ID = 0; + +//===----------------------------------------------------------------------===// +// Support for StructLayout +//===----------------------------------------------------------------------===// + +StructLayout::StructLayout(const StructType *ST, const TargetData &TD) { + StructAlignment = 0; + StructSize = 0; + NumElements = ST->getNumElements(); + + // Loop over each of the elements, placing them in memory... + for (unsigned i = 0, e = NumElements; i != e; ++i) { + const Type *Ty = ST->getElementType(i); + unsigned TyAlign; + uint64_t TySize; + TyAlign = (ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty)); + TySize = TD.getTypeSize(Ty); + + // Add padding if necessary to make the data element aligned properly... + if (StructSize % TyAlign != 0) + StructSize = (StructSize/TyAlign + 1) * TyAlign; // Add padding... + + // Keep track of maximum alignment constraint + StructAlignment = std::max(TyAlign, StructAlignment); + + MemberOffsets[i] = StructSize; + StructSize += TySize; // Consume space for this data item + } + + // Empty structures have alignment of 1 byte. + if (StructAlignment == 0) StructAlignment = 1; + + // Add padding to the end of the struct so that it could be put in an array + // and all array elements would be aligned correctly. + if (StructSize % StructAlignment != 0) + StructSize = (StructSize/StructAlignment + 1) * StructAlignment; +} + + +/// getElementContainingOffset - Given a valid offset into the structure, +/// return the structure index that contains it. 
+unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const { + const uint64_t *SI = + std::upper_bound(&MemberOffsets[0], &MemberOffsets[NumElements], Offset); + assert(SI != &MemberOffsets[0] && "Offset not in structure type!"); + --SI; + assert(*SI <= Offset && "upper_bound didn't work"); + assert((SI == &MemberOffsets[0] || *(SI-1) < Offset) && + (SI+1 == &MemberOffsets[NumElements] || *(SI+1) > Offset) && + "Upper bound didn't work!"); + return SI-&MemberOffsets[0]; +} + +//===----------------------------------------------------------------------===// +// TargetAlignElem, TargetAlign support +//===----------------------------------------------------------------------===// + +TargetAlignElem +TargetAlignElem::get(AlignTypeEnum align_type, unsigned char abi_align, + unsigned char pref_align, uint32_t bit_width) { + TargetAlignElem retval; + retval.AlignType = align_type; + retval.ABIAlign = abi_align; + retval.PrefAlign = pref_align; + retval.TypeBitWidth = bit_width; + return retval; +} + +bool +TargetAlignElem::operator==(const TargetAlignElem &rhs) const { + return (AlignType == rhs.AlignType + && ABIAlign == rhs.ABIAlign + && PrefAlign == rhs.PrefAlign + && TypeBitWidth == rhs.TypeBitWidth); +} + +std::ostream & +TargetAlignElem::dump(std::ostream &os) const { + return os << AlignType + << TypeBitWidth + << ":" << (int) (ABIAlign * 8) + << ":" << (int) (PrefAlign * 8); +} + +const TargetAlignElem TargetData::InvalidAlignmentElem = + TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0); + +//===----------------------------------------------------------------------===// +// TargetData Class Implementation +//===----------------------------------------------------------------------===// + +/*! + A TargetDescription string consists of a sequence of hyphen-delimited + specifiers for target endianness, pointer size and alignments, and various + primitive type sizes and alignments. 
A typical string looks something like: + <br><br> + "E-p:32:32:32-i1:8:8-i8:8:8-i32:32:32-i64:32:64-f32:32:32-f64:32:64" + <br><br> + (note: this string is not fully specified and is only an example.) + \p + Alignments come in two flavors: ABI and preferred. ABI alignment (abi_align, + below) dictates how a type will be aligned within an aggregate and when used + as an argument. Preferred alignment (pref_align, below) determines a type's + alignment when emitted as a global. + \p + Specifier string details: + <br><br> + <i>[E|e]</i>: Endianness. "E" specifies a big-endian target data model, "e" + specifies a little-endian target data model. + <br><br> + <i>p:<size>:<abi_align>:<pref_align></i>: Pointer size, ABI and preferred + alignment. + <br><br> + <i><type><size>:<abi_align>:<pref_align></i>: Numeric type alignment. Type is + one of <i>i|f|v|a</i>, corresponding to integer, floating point, vector (aka + packed) or aggregate. Size indicates the size, e.g., 32 or 64 bits. + \p + The default string, fully specified is: + <br><br> + "E-p:64:64:64-a0:0:0-f32:32:32-f64:0:64" + "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:0:64" + "-v64:64:64-v128:128:128" + <br><br> + Note that in the case of aggregates, 0 is the default ABI and preferred + alignment. This is a special case, where the aggregate's computed worst-case + alignment will be used. 
+ */ +void TargetData::init(const std::string &TargetDescription) { + std::string temp = TargetDescription; + + LittleEndian = false; + PointerMemSize = 8; + PointerABIAlign = 8; + PointerPrefAlign = PointerABIAlign; + + // Default alignments + setAlignment(INTEGER_ALIGN, 1, 1, 1); // Bool + setAlignment(INTEGER_ALIGN, 1, 1, 8); // Byte + setAlignment(INTEGER_ALIGN, 2, 2, 16); // short + setAlignment(INTEGER_ALIGN, 4, 4, 32); // int + setAlignment(INTEGER_ALIGN, 4, 8, 64); // long + setAlignment(FLOAT_ALIGN, 4, 4, 32); // float + setAlignment(FLOAT_ALIGN, 8, 8, 64); // double + setAlignment(VECTOR_ALIGN, 8, 8, 64); // v2i32 + setAlignment(VECTOR_ALIGN, 16, 16, 128); // v16i8, v8i16, v4i32, ... + setAlignment(AGGREGATE_ALIGN, 0, 8, 0); // struct, union, class, ... + + while (!temp.empty()) { + std::string token = getToken(temp, "-"); + std::string arg0 = getToken(token, ":"); + const char *p = arg0.c_str(); + switch(*p) { + case 'E': + LittleEndian = false; + break; + case 'e': + LittleEndian = true; + break; + case 'p': + PointerMemSize = atoi(getToken(token,":").c_str()) / 8; + PointerABIAlign = atoi(getToken(token,":").c_str()) / 8; + PointerPrefAlign = atoi(getToken(token,":").c_str()) / 8; + if (PointerPrefAlign == 0) + PointerPrefAlign = PointerABIAlign; + break; + case 'i': + case 'v': + case 'f': + case 'a': { + AlignTypeEnum align_type = + (*p == 'i' ? INTEGER_ALIGN : (*p == 'f' ? FLOAT_ALIGN : + (*p == 'v' ? 
VECTOR_ALIGN : AGGREGATE_ALIGN))); + uint32_t size = (uint32_t) atoi(++p); + unsigned char abi_align = atoi(getToken(token, ":").c_str()) / 8; + unsigned char pref_align = atoi(getToken(token, ":").c_str()) / 8; + if (pref_align == 0) + pref_align = abi_align; + setAlignment(align_type, abi_align, pref_align, size); + break; + } + default: + break; + } + } +} + +TargetData::TargetData(const Module *M) + : ImmutablePass((intptr_t)&ID) { + init(M->getDataLayout()); +} + +void +TargetData::setAlignment(AlignTypeEnum align_type, unsigned char abi_align, + unsigned char pref_align, uint32_t bit_width) { + for (unsigned i = 0, e = Alignments.size(); i != e; ++i) { + if (Alignments[i].AlignType == align_type && + Alignments[i].TypeBitWidth == bit_width) { + // Update the abi, preferred alignments. + Alignments[i].ABIAlign = abi_align; + Alignments[i].PrefAlign = pref_align; + return; + } + } + + Alignments.push_back(TargetAlignElem::get(align_type, abi_align, + pref_align, bit_width)); +} + +/// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or +/// preferred if ABIInfo = false) the target wants for the specified datatype. +unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType, + uint32_t BitWidth, bool ABIInfo) const { + // Check to see if we have an exact match and remember the best match we see. + int BestMatchIdx = -1; + int LargestInt = -1; + for (unsigned i = 0, e = Alignments.size(); i != e; ++i) { + if (Alignments[i].AlignType == AlignType && + Alignments[i].TypeBitWidth == BitWidth) + return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign; + + // The best match so far depends on what we're looking for. + if (AlignType == VECTOR_ALIGN) { + // If this is a specification for a smaller vector type, we will fall back + // to it. This happens because <128 x double> can be implemented in terms + // of 64 <2 x double>. 
+      if (Alignments[i].AlignType == VECTOR_ALIGN && + Alignments[i].TypeBitWidth < BitWidth) { + // Verify that we pick the biggest of the fallbacks: compare the + // candidate's width against the best match found so far, not against + // the requested BitWidth (every candidate is already < BitWidth, so + // the latter test would always succeed and keep the LAST match). + if (BestMatchIdx == -1 || + Alignments[BestMatchIdx].TypeBitWidth < Alignments[i].TypeBitWidth) + BestMatchIdx = i; + } + } else if (AlignType == INTEGER_ALIGN && + Alignments[i].AlignType == INTEGER_ALIGN) { + // The "best match" for integers is the smallest size that is larger than + // the BitWidth requested. + if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 || + Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth)) + BestMatchIdx = i; + // However, if there isn't one that's larger, then we must use the + // largest one we have (see below) + if (LargestInt == -1 || + Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth) + LargestInt = i; + } + } + + // For integers, if we didn't find a best match, use the largest one found. + if (BestMatchIdx == -1) + BestMatchIdx = LargestInt; + + // Okay, we didn't find an exact solution. Fall back here depending on what + // is being looked for. + assert(BestMatchIdx != -1 && "Didn't find alignment info for this datatype!"); + + // Since we got a "best match" index, just return it. + return ABIInfo ? Alignments[BestMatchIdx].ABIAlign + : Alignments[BestMatchIdx].PrefAlign; +} + +/// LayoutInfo - The lazy cache of structure layout information maintained by +/// TargetData. Note that the struct types must have been free'd before +/// llvm_shutdown is called (and thus this is deallocated) because all the +/// targets with cached elements should have been destroyed.
+/// +typedef std::pair<const TargetData*,const StructType*> LayoutKey; + +struct DenseMapLayoutKeyInfo { + static inline LayoutKey getEmptyKey() { return LayoutKey(0, 0); } + static inline LayoutKey getTombstoneKey() { + return LayoutKey((TargetData*)(intptr_t)-1, 0); + } + static unsigned getHashValue(const LayoutKey &Val) { + return DenseMapKeyInfo<void*>::getHashValue(Val.first) ^ + DenseMapKeyInfo<void*>::getHashValue(Val.second); + } + static bool isPod() { return true; } +}; + +typedef DenseMap<LayoutKey, StructLayout*, DenseMapLayoutKeyInfo> LayoutInfoTy; +static ManagedStatic<LayoutInfoTy> LayoutInfo; + + +TargetData::~TargetData() { + if (LayoutInfo.isConstructed()) { + // Remove any layouts for this TD. + LayoutInfoTy &TheMap = *LayoutInfo; + for (LayoutInfoTy::iterator I = TheMap.begin(), E = TheMap.end(); + I != E; ) { + if (I->first.first == this) { + I->second->~StructLayout(); + free(I->second); + TheMap.erase(I++); + } else { + ++I; + } + } + } +} + +const StructLayout *TargetData::getStructLayout(const StructType *Ty) const { + LayoutInfoTy &TheMap = *LayoutInfo; + + StructLayout *&SL = TheMap[LayoutKey(this, Ty)]; + if (SL) return SL; + + // Otherwise, create the struct layout. Because it is variable length, we + // malloc it, then use placement new. + int NumElts = Ty->getNumElements(); + StructLayout *L = + (StructLayout *)malloc(sizeof(StructLayout)+(NumElts-1)*sizeof(uint64_t)); + + // Set SL before calling StructLayout's ctor. The ctor could cause other + // entries to be added to TheMap, invalidating our reference. + SL = L; + + new (L) StructLayout(Ty, *this); + return L; +} + +/// InvalidateStructLayoutInfo - TargetData speculatively caches StructLayout +/// objects. If a TargetData object is alive when types are being refined and +/// removed, this method must be called whenever a StructType is removed to +/// avoid a dangling pointer in this cache. 
+void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const { + if (!LayoutInfo.isConstructed()) return; // No cache. + + LayoutInfoTy::iterator I = LayoutInfo->find(LayoutKey(this, Ty)); + if (I != LayoutInfo->end()) { + I->second->~StructLayout(); + free(I->second); + LayoutInfo->erase(I); + } +} + + +std::string TargetData::getStringRepresentation() const { + std::string repr; + repr.append(LittleEndian ? "e" : "E"); + repr.append("-p:").append(itostr((int64_t) (PointerMemSize * 8))). + append(":").append(itostr((int64_t) (PointerABIAlign * 8))). + append(":").append(itostr((int64_t) (PointerPrefAlign * 8))); + for (align_const_iterator I = Alignments.begin(); + I != Alignments.end(); + ++I) { + repr.append("-").append(1, (char) I->AlignType). + append(utostr((int64_t) I->TypeBitWidth)). + append(":").append(utostr((uint64_t) (I->ABIAlign * 8))). + append(":").append(utostr((uint64_t) (I->PrefAlign * 8))); + } + return repr; +} + + +uint64_t TargetData::getTypeSize(const Type *Ty) const { + assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!"); + switch (Ty->getTypeID()) { + case Type::LabelTyID: + case Type::PointerTyID: + return getPointerSize(); + case Type::ArrayTyID: { + const ArrayType *ATy = cast<ArrayType>(Ty); + uint64_t Size; + unsigned char Alignment; + Size = getTypeSize(ATy->getElementType()); + Alignment = getABITypeAlignment(ATy->getElementType()); + uint64_t AlignedSize = (Size + Alignment - 1)/Alignment*Alignment; + return AlignedSize*ATy->getNumElements(); + } + case Type::StructTyID: { + // Get the layout annotation... which is lazily created on demand. 
+ const StructLayout *Layout = getStructLayout(cast<StructType>(Ty)); + return Layout->getSizeInBytes(); + } + case Type::IntegerTyID: { + unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth(); + if (BitWidth <= 8) { + return 1; + } else if (BitWidth <= 16) { + return 2; + } else if (BitWidth <= 32) { + return 4; + } else if (BitWidth <= 64) { + return 8; + } else { + // The size of this > 64 bit type is chosen as a multiple of the + // preferred alignment of the largest "native" size the target supports. + // We first obtain the the alignment info for this type and then compute + // the next largest multiple of that size. + uint64_t size = getAlignmentInfo(INTEGER_ALIGN, BitWidth, false) * 8; + return (((BitWidth / (size)) + (BitWidth % size != 0)) * size) / 8; + } + break; + } + case Type::VoidTyID: + return 1; + case Type::FloatTyID: + return 4; + case Type::DoubleTyID: + return 8; + case Type::VectorTyID: { + const VectorType *PTy = cast<VectorType>(Ty); + return PTy->getBitWidth() / 8; + } + default: + assert(0 && "TargetData::getTypeSize(): Unsupported type"); + break; + } + return 0; +} + +uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const { + if (Ty->isInteger()) + return cast<IntegerType>(Ty)->getBitWidth(); + else + return getTypeSize(Ty) * 8; +} + + +/*! + \param abi_or_pref Flag that determines which alignment is returned. true + returns the ABI alignment, false returns the preferred alignment. + \param Ty The underlying type for which alignment is determined. + + Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref + == false) for the requested type \a Ty. + */ +unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const { + int AlignType = -1; + + assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!"); + switch (Ty->getTypeID()) { + /* Early escape for the non-numeric types */ + case Type::LabelTyID: + case Type::PointerTyID: + return (abi_or_pref + ? 
getPointerABIAlignment() + : getPointerPrefAlignment()); + case Type::ArrayTyID: + return getAlignment(cast<ArrayType>(Ty)->getElementType(), abi_or_pref); + + case Type::StructTyID: { + // Packed structure types always have an ABI alignment of one. + if (cast<StructType>(Ty)->isPacked() && abi_or_pref) + return 1; + + // Get the layout annotation... which is lazily created on demand. + const StructLayout *Layout = getStructLayout(cast<StructType>(Ty)); + unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref); + return std::max(Align, (unsigned)Layout->getAlignment()); + } + case Type::IntegerTyID: + case Type::VoidTyID: + AlignType = INTEGER_ALIGN; + break; + case Type::FloatTyID: + case Type::DoubleTyID: + AlignType = FLOAT_ALIGN; + break; + case Type::VectorTyID: { + const VectorType *VTy = cast<VectorType>(Ty); + // Degenerate vectors are assumed to be scalar-ized + if (VTy->getNumElements() == 1) + return getAlignment(VTy->getElementType(), abi_or_pref); + else + AlignType = VECTOR_ALIGN; + break; + } + default: + assert(0 && "Bad type for getAlignment!!!"); + break; + } + + return getAlignmentInfo((AlignTypeEnum)AlignType, getTypeSize(Ty) * 8, + abi_or_pref); +} + +unsigned char TargetData::getABITypeAlignment(const Type *Ty) const { + return getAlignment(Ty, true); +} + +unsigned char TargetData::getPrefTypeAlignment(const Type *Ty) const { + return getAlignment(Ty, false); +} + +unsigned char TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const { + unsigned Align = (unsigned) getPrefTypeAlignment(Ty); + assert(!(Align & (Align-1)) && "Alignment is not a power of two!"); + return Log2_32(Align); +} + +/// getIntPtrType - Return an unsigned integer type that is the same size or +/// greater to the host pointer size. 
+const Type *TargetData::getIntPtrType() const { + switch (getPointerSize()) { + default: assert(0 && "Unknown pointer size!"); + case 2: return Type::Int16Ty; + case 4: return Type::Int32Ty; + case 8: return Type::Int64Ty; + } +} + + +uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices, + unsigned NumIndices) const { + const Type *Ty = ptrTy; + assert(isa<PointerType>(Ty) && "Illegal argument for getIndexedOffset()"); + uint64_t Result = 0; + + generic_gep_type_iterator<Value* const*> + TI = gep_type_begin(ptrTy, Indices, Indices+NumIndices); + for (unsigned CurIDX = 0; CurIDX != NumIndices; ++CurIDX, ++TI) { + if (const StructType *STy = dyn_cast<StructType>(*TI)) { + assert(Indices[CurIDX]->getType() == Type::Int32Ty && + "Illegal struct idx"); + unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue(); + + // Get structure layout information... + const StructLayout *Layout = getStructLayout(STy); + + // Add in the offset, as calculated by the structure layout info... + Result += Layout->getElementOffset(FieldNo); + + // Update Ty to refer to current element + Ty = STy->getElementType(FieldNo); + } else { + // Update Ty to refer to current element + Ty = cast<SequentialType>(Ty)->getElementType(); + + // Get the array index and the size of each array element. + int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue(); + Result += arrayIdx * (int64_t)getTypeSize(Ty); + } + } + + return Result; +} + +/// getPreferredAlignmentLog - Return the preferred alignment of the +/// specified global, returned in log form. This includes an explicitly +/// requested alignment (if the global has one). 
+unsigned TargetData::getPreferredAlignmentLog(const GlobalVariable *GV) const { + const Type *ElemType = GV->getType()->getElementType(); + unsigned Alignment = getPreferredTypeAlignmentShift(ElemType); + if (GV->getAlignment() > (1U << Alignment)) + Alignment = Log2_32(GV->getAlignment()); + + if (GV->hasInitializer()) { + if (Alignment < 4) { + // If the global is not external, see if it is large. If so, give it a + // larger alignment. + if (getTypeSize(ElemType) > 128) + Alignment = 4; // 16-byte alignment. + } + } + return Alignment; +} diff --git a/lib/Target/TargetFrameInfo.cpp b/lib/Target/TargetFrameInfo.cpp new file mode 100644 index 0000000..e6f15b1 --- /dev/null +++ b/lib/Target/TargetFrameInfo.cpp @@ -0,0 +1,19 @@ +//===-- TargetFrameInfo.cpp - Implement machine frame interface -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements the layout of a stack frame on the target machine. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetFrameInfo.h" +#include <cstdlib> +using namespace llvm; + +TargetFrameInfo::~TargetFrameInfo() { +} diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp new file mode 100644 index 0000000..11b5b63 --- /dev/null +++ b/lib/Target/TargetInstrInfo.cpp @@ -0,0 +1,99 @@ +//===-- TargetInstrInfo.cpp - Target Instruction Information --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Constant.h" +#include "llvm/DerivedTypes.h" +using namespace llvm; + +/// findTiedToSrcOperand - Returns the operand that is tied to the specified +/// dest operand. Returns -1 if there isn't one. +int TargetInstrDescriptor::findTiedToSrcOperand(unsigned OpNum) const { + for (unsigned i = 0, e = numOperands; i != e; ++i) { + if (i == OpNum) + continue; + if (getOperandConstraint(i, TOI::TIED_TO) == (int)OpNum) + return i; + } + return -1; +} + + +TargetInstrInfo::TargetInstrInfo(const TargetInstrDescriptor* Desc, + unsigned numOpcodes) + : desc(Desc), NumOpcodes(numOpcodes) { +} + +TargetInstrInfo::~TargetInstrInfo() { +} + +// commuteInstruction - The default implementation of this method just exchanges +// operand 1 and 2. 
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI) const { + assert(MI->getOperand(1).isRegister() && MI->getOperand(2).isRegister() && + "This only knows how to commute register operands so far"); + unsigned Reg1 = MI->getOperand(1).getReg(); + unsigned Reg2 = MI->getOperand(2).getReg(); + bool Reg1IsKill = MI->getOperand(1).isKill(); + bool Reg2IsKill = MI->getOperand(2).isKill(); + MI->getOperand(2).setReg(Reg1); + MI->getOperand(1).setReg(Reg2); + if (Reg1IsKill) + MI->getOperand(2).setIsKill(); + else + MI->getOperand(2).unsetIsKill(); + if (Reg2IsKill) + MI->getOperand(1).setIsKill(); + else + MI->getOperand(1).unsetIsKill(); + return MI; +} + +bool TargetInstrInfo::PredicateInstruction(MachineInstr *MI, + const std::vector<MachineOperand> &Pred) const { + bool MadeChange = false; + const TargetInstrDescriptor *TID = MI->getInstrDescriptor(); + if (TID->Flags & M_PREDICABLE) { + for (unsigned j = 0, i = 0, e = MI->getNumOperands(); i != e; ++i) { + if ((TID->OpInfo[i].Flags & M_PREDICATE_OPERAND)) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + MO.setReg(Pred[j].getReg()); + MadeChange = true; + } else if (MO.isImm()) { + MO.setImm(Pred[j].getImmedValue()); + MadeChange = true; + } else if (MO.isMBB()) { + MO.setMachineBasicBlock(Pred[j].getMachineBasicBlock()); + MadeChange = true; + } + ++j; + } + } + } + return MadeChange; +} + +bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + const TargetInstrDescriptor *TID = MI->getInstrDescriptor(); + if (TID->Flags & M_TERMINATOR_FLAG) { + // Conditional branch is a special case. 
+ if ((TID->Flags & M_BRANCH_FLAG) != 0 && (TID->Flags & M_BARRIER_FLAG) == 0) + return true; + if ((TID->Flags & M_PREDICABLE) == 0) + return true; + return !isPredicated(MI); + } + return false; +} diff --git a/lib/Target/TargetMachOWriterInfo.cpp b/lib/Target/TargetMachOWriterInfo.cpp new file mode 100644 index 0000000..9a9d0d3 --- /dev/null +++ b/lib/Target/TargetMachOWriterInfo.cpp @@ -0,0 +1,25 @@ +//===-- llvm/Target/TargetMachOWriterInfo.h - MachO Writer Info -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bill Wendling and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the TargetMachOWriterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetMachOWriterInfo.h" +#include "llvm/CodeGen/MachineRelocation.h" +using namespace llvm; + +TargetMachOWriterInfo::~TargetMachOWriterInfo() {} + +MachineRelocation +TargetMachOWriterInfo::GetJTRelocation(unsigned Offset, + MachineBasicBlock *MBB) const { + // FIXME: do something about PIC + return MachineRelocation::getBB(Offset, MachineRelocation::VANILLA, MBB); +} diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp new file mode 100644 index 0000000..6c00a3f --- /dev/null +++ b/lib/Target/TargetMachine.cpp @@ -0,0 +1,164 @@ +//===-- TargetMachine.cpp - General Target Information ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the general parts of a Target machine. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +//--------------------------------------------------------------------------- +// Command-line options that tend to be useful on more than one back-end. +// + +namespace llvm { + bool PrintMachineCode; + bool NoFramePointerElim; + bool NoExcessFPPrecision; + bool UnsafeFPMath; + bool FiniteOnlyFPMathOption; + bool HonorSignDependentRoundingFPMathOption; + bool UseSoftFloat; + bool NoZerosInBSS; + bool ExceptionHandling; + Reloc::Model RelocationModel; + CodeModel::Model CMModel; +} +namespace { + cl::opt<bool, true> PrintCode("print-machineinstrs", + cl::desc("Print generated machine code"), + cl::location(PrintMachineCode), cl::init(false)); + + cl::opt<bool, true> + DisableFPElim("disable-fp-elim", + cl::desc("Disable frame pointer elimination optimization"), + cl::location(NoFramePointerElim), + cl::init(false)); + cl::opt<bool, true> + DisableExcessPrecision("disable-excess-fp-precision", + cl::desc("Disable optimizations that may increase FP precision"), + cl::location(NoExcessFPPrecision), + cl::init(false)); + cl::opt<bool, true> + EnableUnsafeFPMath("enable-unsafe-fp-math", + cl::desc("Enable optimizations that may decrease FP precision"), + cl::location(UnsafeFPMath), + cl::init(false)); + cl::opt<bool, true> + EnableFiniteOnlyFPMath("enable-finite-only-fp-math", + cl::desc("Enable optimizations that assumes non- NaNs / +-Infs"), + cl::location(FiniteOnlyFPMathOption), + cl::init(false)); + cl::opt<bool, true> + EnableHonorSignDependentRoundingFPMath(cl::Hidden, + "enable-sign-dependent-rounding-fp-math", + cl::desc("Force codegen to assume rounding mode can change dynamically"), + cl::location(HonorSignDependentRoundingFPMathOption), + cl::init(false)); + + cl::opt<bool, true> + 
GenerateSoftFloatCalls("soft-float", + cl::desc("Generate software floating point library calls"), + cl::location(UseSoftFloat), + cl::init(false)); + cl::opt<bool, true> + DontPlaceZerosInBSS("nozero-initialized-in-bss", + cl::desc("Don't place zero-initialized symbols into bss section"), + cl::location(NoZerosInBSS), + cl::init(false)); + cl::opt<bool, true> + EnableExceptionHandling("enable-eh", + cl::desc("Exception handling should be emitted."), + cl::location(ExceptionHandling), + cl::init(false)); + + cl::opt<llvm::Reloc::Model, true> + DefRelocationModel( + "relocation-model", + cl::desc("Choose relocation model"), + cl::location(RelocationModel), + cl::init(Reloc::Default), + cl::values( + clEnumValN(Reloc::Default, "default", + " Target default relocation model"), + clEnumValN(Reloc::Static, "static", + " Non-relocatable code"), + clEnumValN(Reloc::PIC_, "pic", + " Fully relocatable, position independent code"), + clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic", + " Relocatable external references, non-relocatable code"), + clEnumValEnd)); + cl::opt<llvm::CodeModel::Model, true> + DefCodeModel( + "code-model", + cl::desc("Choose code model"), + cl::location(CMModel), + cl::init(CodeModel::Default), + cl::values( + clEnumValN(CodeModel::Default, "default", + " Target default code model"), + clEnumValN(CodeModel::Small, "small", + " Small code model"), + clEnumValN(CodeModel::Kernel, "kernel", + " Kernel code model"), + clEnumValN(CodeModel::Medium, "medium", + " Medium code model"), + clEnumValN(CodeModel::Large, "large", + " Large code model"), + clEnumValEnd)); +} + +//--------------------------------------------------------------------------- +// TargetMachine Class +// + +TargetMachine::~TargetMachine() { + delete AsmInfo; +} + +/// getRelocationModel - Returns the code generation relocation model. The +/// choices are static, PIC, and dynamic-no-pic, and target default. 
+Reloc::Model TargetMachine::getRelocationModel() { + return RelocationModel; +} + +/// setRelocationModel - Sets the code generation relocation model. +void TargetMachine::setRelocationModel(Reloc::Model Model) { + RelocationModel = Model; +} + +/// getCodeModel - Returns the code model. The choices are small, kernel, +/// medium, large, and target default. +CodeModel::Model TargetMachine::getCodeModel() { + return CMModel; +} + +/// setCodeModel - Sets the code model. +void TargetMachine::setCodeModel(CodeModel::Model Model) { + CMModel = Model; +} + +namespace llvm { + /// FiniteOnlyFPMath - This returns true when the -enable-finite-only-fp-math + /// option is specified on the command line. If this returns false (default), + /// the code generator is not allowed to assume that FP arithmetic arguments + /// and results are never NaNs or +-Infs. + bool FiniteOnlyFPMath() { return UnsafeFPMath || FiniteOnlyFPMathOption; } + + /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume + /// that the rounding mode of the FPU can change from its default. + bool HonorSignDependentRoundingFPMath() { + return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; + } +} + diff --git a/lib/Target/TargetMachineRegistry.cpp b/lib/Target/TargetMachineRegistry.cpp new file mode 100644 index 0000000..2ab8f51 --- /dev/null +++ b/lib/Target/TargetMachineRegistry.cpp @@ -0,0 +1,107 @@ +//===-- TargetMachineRegistry.cpp - Target Auto Registration Impl ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes the RegisterTarget class, which TargetMachine +// implementations should use to register themselves with the system. 
This file +// also exposes the TargetMachineRegistry class, which allows tools to inspect +// all of registered targets. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetMachineRegistry.h" +#include <algorithm> +using namespace llvm; + +/// List - This is the main list of all of the registered target machines. +const TargetMachineRegistry::Entry *TargetMachineRegistry::List = 0; + +/// Listeners - All of the listeners registered to get notified when new targets +/// are loaded. +static TargetRegistrationListener *Listeners = 0; + +TargetMachineRegistry::Entry::Entry(const char *N, const char *SD, + TargetMachine *(*CF)(const Module &,const std::string &), + unsigned (*MMF)(const Module &M), unsigned (*JMF)()) + : Name(N), ShortDesc(SD), CtorFn(CF), ModuleMatchQualityFn(MMF), + JITMatchQualityFn(JMF), Next(List) { + List = this; + for (TargetRegistrationListener *L = Listeners; L; L = L->getNext()) + L->targetRegistered(this); +} + +TargetRegistrationListener::TargetRegistrationListener() { + Next = Listeners; + if (Next) Next->Prev = &Next; + Prev = &Listeners; + Listeners = this; +} + +TargetRegistrationListener::~TargetRegistrationListener() { + *Prev = Next; +} + +/// getClosestStaticTargetForModule - Given an LLVM module, pick the best target +/// that is compatible with the module. If no close target can be found, this +/// returns null and sets the Error string to a reason. 
+const TargetMachineRegistry::Entry * +TargetMachineRegistry::getClosestStaticTargetForModule(const Module &M, + std::string &Error) { + std::vector<std::pair<unsigned, const Entry *> > UsableTargets; + for (const Entry *E = getList(); E; E = E->getNext()) + if (unsigned Qual = E->ModuleMatchQualityFn(M)) + UsableTargets.push_back(std::make_pair(Qual, E)); + + if (UsableTargets.empty()) { + Error = "No available targets are compatible with this module"; + return 0; + } else if (UsableTargets.size() == 1) + return UsableTargets.back().second; + + // Otherwise, take the best target, but make sure we don't have to equally + // good best targets. + std::sort(UsableTargets.begin(), UsableTargets.end()); + if (UsableTargets.back().first ==UsableTargets[UsableTargets.size()-2].first){ + Error = "Cannot choose between targets \"" + + std::string(UsableTargets.back().second->Name) + "\" and \"" + + std::string(UsableTargets[UsableTargets.size()-2].second->Name) + "\""; + return 0; + } + return UsableTargets.back().second; +} + +/// getClosestTargetForJIT - Given an LLVM module, pick the best target that +/// is compatible with the current host and the specified module. If no +/// close target can be found, this returns null and sets the Error string +/// to a reason. +const TargetMachineRegistry::Entry * +TargetMachineRegistry::getClosestTargetForJIT(std::string &Error) { + std::vector<std::pair<unsigned, const Entry *> > UsableTargets; + for (const Entry *E = getList(); E; E = E->getNext()) + if (unsigned Qual = E->JITMatchQualityFn()) + UsableTargets.push_back(std::make_pair(Qual, E)); + + if (UsableTargets.empty()) { + Error = "No JIT is available for this host"; + return 0; + } else if (UsableTargets.size() == 1) + return UsableTargets.back().second; + + // Otherwise, take the best target. If there is a tie, just pick one. 
+ unsigned MaxQual = UsableTargets.front().first; + const Entry *MaxQualTarget = UsableTargets.front().second; + + for (unsigned i = 1, e = UsableTargets.size(); i != e; ++i) + if (UsableTargets[i].first > MaxQual) { + MaxQual = UsableTargets[i].first; + MaxQualTarget = UsableTargets[i].second; + } + + return MaxQualTarget; +} + diff --git a/lib/Target/TargetSchedule.td b/lib/Target/TargetSchedule.td new file mode 100644 index 0000000..8ac537f --- /dev/null +++ b/lib/Target/TargetSchedule.td @@ -0,0 +1,72 @@ +//===- TargetSchedule.td - Target Independent Scheduling ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the target-independent scheduling interfaces which should +// be implemented by each target which is using TableGen based scheduling. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Processor functional unit - These values represent the function units +// available across all chip sets for the target. Eg., IntUnit, FPUnit, ... +// These may be independent values for each chip set or may be shared across +// all chip sets of the target. Each functional unit is treated as a resource +// during scheduling and has an affect instruction order based on availability +// during a time interval. +// +class FuncUnit; + +//===----------------------------------------------------------------------===// +// Instruction stage - These values represent a step in the execution of an +// instruction. The latency represents the number of discrete time slots used +// need to complete the stage. 
Units represent the choice of functional units +// that can be used to complete the stage. Eg. IntUnit1, IntUnit2. +// +class InstrStage<int cycles, list<FuncUnit> units> { + int Cycles = cycles; // length of stage in machine cycles + list<FuncUnit> Units = units; // choice of functional units +} + +//===----------------------------------------------------------------------===// +// Instruction itinerary - An itinerary represents a sequential series of steps +// required to complete an instruction. Itineraries are represented as lists of +// instruction stages. +// + +//===----------------------------------------------------------------------===// +// Instruction itinerary classes - These values represent 'named' instruction +// itinerary. Using named itineraries simplifies managing groups of +// instructions across chip sets. An instruction uses the same itinerary class +// across all chip sets. Thus a new chip set can be added without modifying +// instruction information. +// +class InstrItinClass; +def NoItinerary : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Instruction itinerary data - These values provide a runtime map of an +// instruction itinerary class (name) to it's itinerary data. +// +class InstrItinData<InstrItinClass Class, list<InstrStage> stages> { + InstrItinClass TheClass = Class; + list<InstrStage> Stages = stages; +} + +//===----------------------------------------------------------------------===// +// Processor itineraries - These values represent the set of all itinerary +// classes for a given chip set. +// +class ProcessorItineraries<list<InstrItinData> iid> { + list<InstrItinData> IID = iid; +} + +// NoItineraries - A marker that can be used by processors without schedule +// info. 
+def NoItineraries : ProcessorItineraries<[]>; + diff --git a/lib/Target/TargetSelectionDAG.td b/lib/Target/TargetSelectionDAG.td new file mode 100644 index 0000000..491bb02 --- /dev/null +++ b/lib/Target/TargetSelectionDAG.td @@ -0,0 +1,763 @@ +//===- TargetSelectionDAG.td - Common code for DAG isels ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the target-independent interfaces used by SelectionDAG +// instruction selection generators. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Selection DAG Type Constraint definitions. +// +// Note that the semantics of these constraints are hard coded into tblgen. To +// modify or add constraints, you have to hack tblgen. +// + +class SDTypeConstraint<int opnum> { + int OperandNum = opnum; +} + +// SDTCisVT - The specified operand has exactly this VT. +class SDTCisVT<int OpNum, ValueType vt> : SDTypeConstraint<OpNum> { + ValueType VT = vt; +} + +class SDTCisPtrTy<int OpNum> : SDTypeConstraint<OpNum>; + +// SDTCisInt - The specified operand is has integer type. +class SDTCisInt<int OpNum> : SDTypeConstraint<OpNum>; + +// SDTCisFP - The specified operand is has floating point type. +class SDTCisFP<int OpNum> : SDTypeConstraint<OpNum>; + +// SDTCisSameAs - The two specified operands have identical types. +class SDTCisSameAs<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> { + int OtherOperandNum = OtherOp; +} + +// SDTCisVTSmallerThanOp - The specified operand is a VT SDNode, and its type is +// smaller than the 'Other' operand. 
+class SDTCisVTSmallerThanOp<int OpNum, int OtherOp> : SDTypeConstraint<OpNum> { + int OtherOperandNum = OtherOp; +} + +class SDTCisOpSmallerThanOp<int SmallOp, int BigOp> : SDTypeConstraint<SmallOp>{ + int BigOperandNum = BigOp; +} + +/// SDTCisIntVectorOfSameSize - This indicates that ThisOp and OtherOp are +/// vector types, and that ThisOp is the result of +/// MVT::getIntVectorWithNumElements with the number of elements that ThisOp +/// has. +class SDTCisIntVectorOfSameSize<int ThisOp, int OtherOp> + : SDTypeConstraint<ThisOp> { + int OtherOpNum = OtherOp; +} + +//===----------------------------------------------------------------------===// +// Selection DAG Type Profile definitions. +// +// These use the constraints defined above to describe the type requirements of +// the various nodes. These are not hard coded into tblgen, allowing targets to +// add their own if needed. +// + +// SDTypeProfile - This profile describes the type requirements of a Selection +// DAG node. +class SDTypeProfile<int numresults, int numoperands, + list<SDTypeConstraint> constraints> { + int NumResults = numresults; + int NumOperands = numoperands; + list<SDTypeConstraint> Constraints = constraints; +} + +// Builtin profiles. +def SDTIntLeaf: SDTypeProfile<1, 0, [SDTCisInt<0>]>; // for 'imm'. +def SDTFPLeaf : SDTypeProfile<1, 0, [SDTCisFP<0>]>; // for 'fpimm'. +def SDTPtrLeaf: SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; // for '&g'. +def SDTOther : SDTypeProfile<1, 0, [SDTCisVT<0, OtherVT>]>; // for 'vt'. +def SDTUNDEF : SDTypeProfile<1, 0, []>; // for 'undef'. +def SDTUnaryOp : SDTypeProfile<1, 1, []>; // bitconvert + +def SDTIntBinOp : SDTypeProfile<1, 2, [ // add, and, or, xor, udiv, etc. + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0> +]>; +def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl + SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2> +]>; +def SDTFPBinOp : SDTypeProfile<1, 2, [ // fadd, fmul, etc. 
+ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> +]>; +def SDTFPSignOp : SDTypeProfile<1, 2, [ // fcopysign. + SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisFP<2> +]>; +def SDTFPTernaryOp : SDTypeProfile<1, 3, [ // fmadd, fnmsub, etc. + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0> +]>; +def SDTIntUnaryOp : SDTypeProfile<1, 1, [ // ctlz + SDTCisSameAs<0, 1>, SDTCisInt<0> +]>; +def SDTIntExtendOp : SDTypeProfile<1, 1, [ // sext, zext, anyext + SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0> +]>; +def SDTIntTruncOp : SDTypeProfile<1, 1, [ // trunc + SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1> +]>; +def SDTFPUnaryOp : SDTypeProfile<1, 1, [ // fneg, fsqrt, etc + SDTCisSameAs<0, 1>, SDTCisFP<0> +]>; +def SDTFPRoundOp : SDTypeProfile<1, 1, [ // fround + SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1> +]>; +def SDTFPExtendOp : SDTypeProfile<1, 1, [ // fextend + SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0> +]>; +def SDTIntToFPOp : SDTypeProfile<1, 1, [ // [su]int_to_fp + SDTCisFP<0>, SDTCisInt<1> +]>; +def SDTFPToIntOp : SDTypeProfile<1, 1, [ // fp_to_[su]int + SDTCisInt<0>, SDTCisFP<1> +]>; +def SDTExtInreg : SDTypeProfile<1, 2, [ // sext_inreg + SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisVT<2, OtherVT>, + SDTCisVTSmallerThanOp<2, 1> +]>; + +def SDTSetCC : SDTypeProfile<1, 3, [ // setcc + SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> +]>; + +def SDTSelect : SDTypeProfile<1, 3, [ // select + SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3> +]>; + +def SDTSelectCC : SDTypeProfile<1, 5, [ // select_cc + SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, SDTCisSameAs<0, 3>, + SDTCisVT<5, OtherVT> +]>; + +def SDTBr : SDTypeProfile<0, 1, [ // br + SDTCisVT<0, OtherVT> +]>; + +def SDTBrcond : SDTypeProfile<0, 2, [ // brcond + SDTCisInt<0>, SDTCisVT<1, OtherVT> +]>; + +def SDTBrind : SDTypeProfile<0, 1, [ // brind + SDTCisPtrTy<0> +]>; + +def SDTRet : SDTypeProfile<0, 0, []>; // ret + +def SDTLoad : SDTypeProfile<1, 1, [ 
// load + SDTCisPtrTy<1> +]>; + +def SDTStore : SDTypeProfile<0, 2, [ // store + SDTCisPtrTy<1> +]>; + +def SDTIStore : SDTypeProfile<1, 3, [ // indexed store + SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3> +]>; + +def SDTVecShuffle : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisIntVectorOfSameSize<3, 0> +]>; + +//===----------------------------------------------------------------------===// +// Selection DAG Node Properties. +// +// Note: These are hard coded into tblgen. +// +class SDNodeProperty; +def SDNPCommutative : SDNodeProperty; // X op Y == Y op X +def SDNPAssociative : SDNodeProperty; // (X op Y) op Z == X op (Y op Z) +def SDNPHasChain : SDNodeProperty; // R/W chain operand and result +def SDNPOutFlag : SDNodeProperty; // Write a flag result +def SDNPInFlag : SDNodeProperty; // Read a flag operand +def SDNPOptInFlag : SDNodeProperty; // Optionally read a flag operand + +//===----------------------------------------------------------------------===// +// Selection DAG Node definitions. 
+// +class SDNode<string opcode, SDTypeProfile typeprof, + list<SDNodeProperty> props = [], string sdclass = "SDNode"> { + string Opcode = opcode; + string SDClass = sdclass; + list<SDNodeProperty> Properties = props; + SDTypeProfile TypeProfile = typeprof; +} + +def set; +def node; +def srcvalue; + +def imm : SDNode<"ISD::Constant" , SDTIntLeaf , [], "ConstantSDNode">; +def fpimm : SDNode<"ISD::TargetConstantFP", + SDTFPLeaf, [], "ConstantFPSDNode">; +def vt : SDNode<"ISD::VALUETYPE" , SDTOther , [], "VTSDNode">; +def bb : SDNode<"ISD::BasicBlock", SDTOther , [], "BasicBlockSDNode">; +def cond : SDNode<"ISD::CONDCODE" , SDTOther , [], "CondCodeSDNode">; +def undef : SDNode<"ISD::UNDEF" , SDTUNDEF , []>; +def globaladdr : SDNode<"ISD::GlobalAddress", SDTPtrLeaf, [], + "GlobalAddressSDNode">; +def tglobaladdr : SDNode<"ISD::TargetGlobalAddress", SDTPtrLeaf, [], + "GlobalAddressSDNode">; +def globaltlsaddr : SDNode<"ISD::GlobalTLSAddress", SDTPtrLeaf, [], + "GlobalAddressSDNode">; +def tglobaltlsaddr : SDNode<"ISD::TargetGlobalTLSAddress", SDTPtrLeaf, [], + "GlobalAddressSDNode">; +def constpool : SDNode<"ISD::ConstantPool", SDTPtrLeaf, [], + "ConstantPoolSDNode">; +def tconstpool : SDNode<"ISD::TargetConstantPool", SDTPtrLeaf, [], + "ConstantPoolSDNode">; +def jumptable : SDNode<"ISD::JumpTable", SDTPtrLeaf, [], + "JumpTableSDNode">; +def tjumptable : SDNode<"ISD::TargetJumpTable", SDTPtrLeaf, [], + "JumpTableSDNode">; +def frameindex : SDNode<"ISD::FrameIndex", SDTPtrLeaf, [], + "FrameIndexSDNode">; +def tframeindex : SDNode<"ISD::TargetFrameIndex", SDTPtrLeaf, [], + "FrameIndexSDNode">; +def externalsym : SDNode<"ISD::ExternalSymbol", SDTPtrLeaf, [], + "ExternalSymbolSDNode">; +def texternalsym: SDNode<"ISD::TargetExternalSymbol", SDTPtrLeaf, [], + "ExternalSymbolSDNode">; + +def add : SDNode<"ISD::ADD" , SDTIntBinOp , + [SDNPCommutative, SDNPAssociative]>; +def sub : SDNode<"ISD::SUB" , SDTIntBinOp>; +def mul : SDNode<"ISD::MUL" , SDTIntBinOp, + [SDNPCommutative, 
SDNPAssociative]>; +def mulhs : SDNode<"ISD::MULHS" , SDTIntBinOp, [SDNPCommutative]>; +def mulhu : SDNode<"ISD::MULHU" , SDTIntBinOp, [SDNPCommutative]>; +def sdiv : SDNode<"ISD::SDIV" , SDTIntBinOp>; +def udiv : SDNode<"ISD::UDIV" , SDTIntBinOp>; +def srem : SDNode<"ISD::SREM" , SDTIntBinOp>; +def urem : SDNode<"ISD::UREM" , SDTIntBinOp>; +def srl : SDNode<"ISD::SRL" , SDTIntShiftOp>; +def sra : SDNode<"ISD::SRA" , SDTIntShiftOp>; +def shl : SDNode<"ISD::SHL" , SDTIntShiftOp>; +def rotl : SDNode<"ISD::ROTL" , SDTIntShiftOp>; +def rotr : SDNode<"ISD::ROTR" , SDTIntShiftOp>; +def and : SDNode<"ISD::AND" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; +def or : SDNode<"ISD::OR" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; +def xor : SDNode<"ISD::XOR" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; +def addc : SDNode<"ISD::ADDC" , SDTIntBinOp, + [SDNPCommutative, SDNPOutFlag]>; +def adde : SDNode<"ISD::ADDE" , SDTIntBinOp, + [SDNPCommutative, SDNPOutFlag, SDNPInFlag]>; +def subc : SDNode<"ISD::SUBC" , SDTIntBinOp, + [SDNPOutFlag]>; +def sube : SDNode<"ISD::SUBE" , SDTIntBinOp, + [SDNPOutFlag, SDNPInFlag]>; + +def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; +def bswap : SDNode<"ISD::BSWAP" , SDTIntUnaryOp>; +def ctlz : SDNode<"ISD::CTLZ" , SDTIntUnaryOp>; +def cttz : SDNode<"ISD::CTTZ" , SDTIntUnaryOp>; +def ctpop : SDNode<"ISD::CTPOP" , SDTIntUnaryOp>; +def sext : SDNode<"ISD::SIGN_EXTEND", SDTIntExtendOp>; +def zext : SDNode<"ISD::ZERO_EXTEND", SDTIntExtendOp>; +def anyext : SDNode<"ISD::ANY_EXTEND" , SDTIntExtendOp>; +def trunc : SDNode<"ISD::TRUNCATE" , SDTIntTruncOp>; +def bitconvert : SDNode<"ISD::BIT_CONVERT", SDTUnaryOp>; + +def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; +def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; +def fmul : SDNode<"ISD::FMUL" , SDTFPBinOp, [SDNPCommutative]>; +def fdiv : SDNode<"ISD::FDIV" , SDTFPBinOp>; +def frem : SDNode<"ISD::FREM" , SDTFPBinOp>; +def fabs : SDNode<"ISD::FABS" , 
SDTFPUnaryOp>; +def fneg : SDNode<"ISD::FNEG" , SDTFPUnaryOp>; +def fsqrt : SDNode<"ISD::FSQRT" , SDTFPUnaryOp>; +def fsin : SDNode<"ISD::FSIN" , SDTFPUnaryOp>; +def fcos : SDNode<"ISD::FCOS" , SDTFPUnaryOp>; + +def fround : SDNode<"ISD::FP_ROUND" , SDTFPRoundOp>; +def fextend : SDNode<"ISD::FP_EXTEND" , SDTFPExtendOp>; +def fcopysign : SDNode<"ISD::FCOPYSIGN" , SDTFPSignOp>; + +def sint_to_fp : SDNode<"ISD::SINT_TO_FP" , SDTIntToFPOp>; +def uint_to_fp : SDNode<"ISD::UINT_TO_FP" , SDTIntToFPOp>; +def fp_to_sint : SDNode<"ISD::FP_TO_SINT" , SDTFPToIntOp>; +def fp_to_uint : SDNode<"ISD::FP_TO_UINT" , SDTFPToIntOp>; + +def setcc : SDNode<"ISD::SETCC" , SDTSetCC>; +def select : SDNode<"ISD::SELECT" , SDTSelect>; +def selectcc : SDNode<"ISD::SELECT_CC" , SDTSelectCC>; + +def brcond : SDNode<"ISD::BRCOND" , SDTBrcond, [SDNPHasChain]>; +def brind : SDNode<"ISD::BRIND" , SDTBrind, [SDNPHasChain]>; +def br : SDNode<"ISD::BR" , SDTBr, [SDNPHasChain]>; +def ret : SDNode<"ISD::RET" , SDTRet, [SDNPHasChain]>; + +// Do not use ld, st directly. Use load, extload, sextload, zextload, store, +// and truncst (see below). +def ld : SDNode<"ISD::LOAD" , SDTLoad, [SDNPHasChain]>; +def st : SDNode<"ISD::STORE" , SDTStore, [SDNPHasChain]>; +def ist : SDNode<"ISD::STORE" , SDTIStore, [SDNPHasChain]>; + +def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>; +def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, 0, []>, []>; +def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>, + []>; +def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT", + SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>, []>; +def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT", + SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; + +// Nodes for intrinsics, you should use the intrinsic itself and let tblgen use +// these internally. Don't reference these directly. 
+def intrinsic_void : SDNode<"ISD::INTRINSIC_VOID", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain]>; +def intrinsic_w_chain : SDNode<"ISD::INTRINSIC_W_CHAIN", + SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>, + [SDNPHasChain]>; +def intrinsic_wo_chain : SDNode<"ISD::INTRINSIC_WO_CHAIN", + SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>, []>; + + +//===----------------------------------------------------------------------===// +// Selection DAG Condition Codes + +class CondCode; // ISD::CondCode enums +def SETOEQ : CondCode; def SETOGT : CondCode; +def SETOGE : CondCode; def SETOLT : CondCode; def SETOLE : CondCode; +def SETONE : CondCode; def SETO : CondCode; def SETUO : CondCode; +def SETUEQ : CondCode; def SETUGT : CondCode; def SETUGE : CondCode; +def SETULT : CondCode; def SETULE : CondCode; def SETUNE : CondCode; + +def SETEQ : CondCode; def SETGT : CondCode; def SETGE : CondCode; +def SETLT : CondCode; def SETLE : CondCode; def SETNE : CondCode; + + +//===----------------------------------------------------------------------===// +// Selection DAG Node Transformation Functions. +// +// This mechanism allows targets to manipulate nodes in the output DAG once a +// match has been formed. This is typically used to manipulate immediate +// values. +// +class SDNodeXForm<SDNode opc, code xformFunction> { + SDNode Opcode = opc; + code XFormFunction = xformFunction; +} + +def NOOP_SDNodeXForm : SDNodeXForm<imm, [{}]>; + + +//===----------------------------------------------------------------------===// +// Selection DAG Pattern Fragments. +// +// Pattern fragments are reusable chunks of dags that match specific things. +// They can take arguments and have C++ predicates that control whether they +// match. They are intended to make the patterns for common instructions more +// compact and readable. +// + +/// PatFrag - Represents a pattern fragment. This can match something on the +/// DAG, frame a single node to multiply nested other fragments. 
+/// +class PatFrag<dag ops, dag frag, code pred = [{}], + SDNodeXForm xform = NOOP_SDNodeXForm> { + dag Operands = ops; + dag Fragment = frag; + code Predicate = pred; + SDNodeXForm OperandTransform = xform; +} + +// PatLeaf's are pattern fragments that have no operands. This is just a helper +// to define immediates and other common things concisely. +class PatLeaf<dag frag, code pred = [{}], SDNodeXForm xform = NOOP_SDNodeXForm> + : PatFrag<(ops), frag, pred, xform>; + +// Leaf fragments. + +def vtInt : PatLeaf<(vt), [{ return MVT::isInteger(N->getVT()); }]>; +def vtFP : PatLeaf<(vt), [{ return MVT::isFloatingPoint(N->getVT()); }]>; + +def immAllOnes : PatLeaf<(imm), [{ return N->isAllOnesValue(); }]>; +def immAllOnesV: PatLeaf<(build_vector), [{ + return ISD::isBuildVectorAllOnes(N); +}]>; +def immAllZerosV: PatLeaf<(build_vector), [{ + return ISD::isBuildVectorAllZeros(N); +}]>; + +def immAllOnesV_bc: PatLeaf<(bitconvert), [{ + return ISD::isBuildVectorAllOnes(N); +}]>; + + +// Other helper fragments. +def not : PatFrag<(ops node:$in), (xor node:$in, immAllOnes)>; +def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>; +def vnot_conv : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV_bc)>; +def ineg : PatFrag<(ops node:$in), (sub 0, node:$in)>; + +// load fragments. +def load : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::NON_EXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED; + return false; +}]>; + +// extending load fragments. 
+def extloadi1 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::EXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i1; + return false; +}]>; +def extloadi8 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::EXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i8; + return false; +}]>; +def extloadi16 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::EXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i16; + return false; +}]>; +def extloadi32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::EXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i32; + return false; +}]>; +def extloadf32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::EXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::f32; + return false; +}]>; + +def sextloadi1 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::SEXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i1; + return false; +}]>; +def sextloadi8 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::SEXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i8; + return false; +}]>; +def sextloadi16 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::SEXTLOAD 
&& + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i16; + return false; +}]>; +def sextloadi32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::SEXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i32; + return false; +}]>; + +def zextloadi1 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::ZEXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i1; + return false; +}]>; +def zextloadi8 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::ZEXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i8; + return false; +}]>; +def zextloadi16 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::ZEXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i16; + return false; +}]>; +def zextloadi32 : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) + return LD->getExtensionType() == ISD::ZEXTLOAD && + LD->getAddressingMode() == ISD::UNINDEXED && + LD->getLoadedVT() == MVT::i32; + return false; +}]>; + +// store fragments. +def store : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return !ST->isTruncatingStore() && + ST->getAddressingMode() == ISD::UNINDEXED; + return false; +}]>; + +// truncstore fragments. 
+def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i1 && + ST->getAddressingMode() == ISD::UNINDEXED; + return false; +}]>; +def truncstorei8 : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i8 && + ST->getAddressingMode() == ISD::UNINDEXED; + return false; +}]>; +def truncstorei16 : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i16 && + ST->getAddressingMode() == ISD::UNINDEXED; + return false; +}]>; +def truncstorei32 : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isTruncatingStore() && ST->getStoredVT() == MVT::i32 && + ST->getAddressingMode() == ISD::UNINDEXED; + return false; +}]>; +def truncstoref32 : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isTruncatingStore() && ST->getStoredVT() == MVT::f32 && + ST->getAddressingMode() == ISD::UNINDEXED; + return false; +}]>; + +// indexed store fragments. 
+def pre_store : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) && + !ST->isTruncatingStore(); + } + return false; +}]>; + +def pre_truncsti1 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i1; + } + return false; +}]>; +def pre_truncsti8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i8; + } + return false; +}]>; +def pre_truncsti16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i16; + } + return false; +}]>; +def pre_truncsti32 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i32; + } + return false; +}]>; +def pre_truncstf32 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = 
ST->getAddressingMode(); + return (AM == ISD::PRE_INC || AM == ISD::PRE_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::f32; + } + return false; +}]>; + +def post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), + (ist node:$val, node:$ptr, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return !ST->isTruncatingStore() && + (AM == ISD::POST_INC || AM == ISD::POST_DEC); + } + return false; +}]>; + +def post_truncsti1 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::POST_INC || AM == ISD::POST_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i1; + } + return false; +}]>; +def post_truncsti8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::POST_INC || AM == ISD::POST_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i8; + } + return false; +}]>; +def post_truncsti16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::POST_INC || AM == ISD::POST_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i16; + } + return false; +}]>; +def post_truncsti32 : PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::POST_INC || AM == ISD::POST_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::i32; + } + return false; +}]>; +def post_truncstf32 : 
PatFrag<(ops node:$val, node:$base, node:$offset), + (ist node:$val, node:$base, node:$offset), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { + ISD::MemIndexedMode AM = ST->getAddressingMode(); + return (AM == ISD::POST_INC || AM == ISD::POST_DEC) && + ST->isTruncatingStore() && ST->getStoredVT() == MVT::f32; + } + return false; +}]>; + +// setcc convenience fragments. +def setoeq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOEQ)>; +def setogt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOGT)>; +def setoge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOGE)>; +def setolt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOLT)>; +def setole : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETOLE)>; +def setone : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETONE)>; +def seto : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETO)>; +def setuo : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUO)>; +def setueq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUEQ)>; +def setugt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGT)>; +def setuge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGE)>; +def setult : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETULT)>; +def setule : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETULE)>; +def setune : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUNE)>; +def seteq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETEQ)>; +def setgt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGT)>; +def setge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGE)>; +def setlt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, 
SETLT)>; +def setle : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETLE)>; +def setne : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETNE)>; + +//===----------------------------------------------------------------------===// +// Selection DAG Pattern Support. +// +// Patterns are what are actually matched against the target-flavored +// instruction selection DAG. Instructions defined by the target implicitly +// define patterns in most cases, but patterns can also be explicitly added when +// an operation is defined by a sequence of instructions (e.g. loading a large +// immediate value on RISC targets that do not support immediates as large as +// their GPRs). +// + +class Pattern<dag patternToMatch, list<dag> resultInstrs> { + dag PatternToMatch = patternToMatch; + list<dag> ResultInstrs = resultInstrs; + list<Predicate> Predicates = []; // See class Instruction in Target.td. + int AddedComplexity = 0; // See class Instruction in Target.td. +} + +// Pat - A simple (but common) form of a pattern, which produces a simple result +// not needing a full list. +class Pat<dag pattern, dag result> : Pattern<pattern, [result]>; + +//===----------------------------------------------------------------------===// +// Complex pattern definitions. +// +// Complex patterns, e.g. X86 addressing mode, requires pattern matching code +// in C++. NumOperands is the number of operands returned by the select function; +// SelectFunc is the name of the function used to pattern match the max. pattern; +// RootNodes are the list of possible root nodes of the sub-dags to match. +// e.g. 
X86 addressing mode - def addr : ComplexPattern<4, "SelectAddr", [add]>; +// +class ComplexPattern<ValueType ty, int numops, string fn, + list<SDNode> roots = [], list<SDNodeProperty> props = []> { + ValueType Ty = ty; + int NumOperands = numops; + string SelectFunc = fn; + list<SDNode> RootNodes = roots; + list<SDNodeProperty> Properties = props; +} + +//===----------------------------------------------------------------------===// +// Dwarf support. +// +def SDT_dwarf_loc : SDTypeProfile<0, 3, + [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>; +def dwarf_loc : SDNode<"ISD::DEBUG_LOC", SDT_dwarf_loc,[SDNPHasChain]>; + + + diff --git a/lib/Target/TargetSubtarget.cpp b/lib/Target/TargetSubtarget.cpp new file mode 100644 index 0000000..a8bb9b9 --- /dev/null +++ b/lib/Target/TargetSubtarget.cpp @@ -0,0 +1,22 @@ +//===-- TargetSubtarget.cpp - General Target Information -------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Nate Begeman and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the general parts of a Subtarget. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetSubtarget.h" +using namespace llvm; + +//--------------------------------------------------------------------------- +// TargetSubtarget Class +// +TargetSubtarget::TargetSubtarget() {} + +TargetSubtarget::~TargetSubtarget() {} diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile new file mode 100644 index 0000000..5416cdb --- /dev/null +++ b/lib/Target/X86/Makefile @@ -0,0 +1,20 @@ +##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file was developed by the LLVM research group and is distributed under +# the University of Illinois Open Source License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMX86 +TARGET = X86 + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \ + X86GenRegisterInfo.inc X86GenInstrNames.inc \ + X86GenInstrInfo.inc X86GenAsmWriter.inc \ + X86GenAsmWriter1.inc X86GenDAGISel.inc \ + X86GenCallingConv.inc X86GenSubtarget.inc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt new file mode 100644 index 0000000..d94fa02 --- /dev/null +++ b/lib/Target/X86/README-FPStack.txt @@ -0,0 +1,99 @@ +//===---------------------------------------------------------------------===// +// Random ideas for the X86 backend: FP stack related stuff +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// + +Some targets (e.g. 
athlons) prefer freep to fstp ST(0): +http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html + +//===---------------------------------------------------------------------===// + +On darwin/x86, we should codegen: + + ret double 0.000000e+00 + +as fld0/ret, not as: + + movl $0, 4(%esp) + movl $0, (%esp) + fldl (%esp) + ... + ret + +//===---------------------------------------------------------------------===// + +This should use fiadd on chips where it is profitable: +double foo(double P, int *I) { return P+*I; } + +We have fiadd patterns now but the followings have the same cost and +complexity. We need a way to specify the later is more profitable. + +def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP:$dst, (fadd RFP:$src1, + (extloadf64f32 addr:$src2)))]>; + // ST(0) = ST(0) + [mem32] + +def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW, + [(set RFP:$dst, (fadd RFP:$src1, + (X86fild addr:$src2, i32)))]>; + // ST(0) = ST(0) + [mem32int] + +//===---------------------------------------------------------------------===// + +The FP stackifier needs to be global. Also, it should handle simple permutates +to reduce number of shuffle instructions, e.g. turning: + +fld P -> fld Q +fld Q fld P +fxch + +or: + +fxch -> fucomi +fucomi jl X +jg X + +Ideas: +http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html + + +//===---------------------------------------------------------------------===// + +Add a target specific hook to DAG combiner to handle SINT_TO_FP and +FP_TO_SINT when the source operand is already in memory. + +//===---------------------------------------------------------------------===// + +Open code rint,floor,ceil,trunc: +http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html +http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html + +Opencode the sincos[f] libcall. 
+ +//===---------------------------------------------------------------------===// + +None of the FPStack instructions are handled in +X86RegisterInfo::foldMemoryOperand, which prevents the spiller from +folding spill code into the instructions. + +//===---------------------------------------------------------------------===// + +Currently the x86 codegen isn't very good at mixing SSE and FPStack +code: + +unsigned int foo(double x) { return x; } + +foo: + subl $20, %esp + movsd 24(%esp), %xmm0 + movsd %xmm0, 8(%esp) + fldl 8(%esp) + fisttpll (%esp) + movl (%esp), %eax + addl $20, %esp + ret + +This will be solved when we go to a dynamic programming based isel. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt new file mode 100644 index 0000000..57c7c3f --- /dev/null +++ b/lib/Target/X86/README-MMX.txt @@ -0,0 +1,69 @@ +//===---------------------------------------------------------------------===// +// Random ideas for the X86 backend: MMX-specific stuff. +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// + +This: + +#include <mmintrin.h> + +__v2si qux(int A) { + return (__v2si){ 0, A }; +} + +is compiled into: + +_qux: + subl $28, %esp + movl 32(%esp), %eax + movd %eax, %mm0 + movq %mm0, (%esp) + movl (%esp), %eax + movl %eax, 20(%esp) + movq %mm0, 8(%esp) + movl 12(%esp), %eax + movl %eax, 16(%esp) + movq 16(%esp), %mm0 + addl $28, %esp + ret + +Yuck! 
+ +GCC gives us: + +_qux: + subl $12, %esp + movl 16(%esp), %eax + movl 20(%esp), %edx + movl $0, (%eax) + movl %edx, 4(%eax) + addl $12, %esp + ret $4 + +//===---------------------------------------------------------------------===// + +int main() { + __m64 A[1] = { _mm_cvtsi32_si64(1) }; + __m64 B[1] = { _mm_cvtsi32_si64(10) }; + __m64 sum = _mm_cvtsi32_si64(0); + + sum = __builtin_ia32_paddq(__builtin_ia32_paddq(A[0], B[0]), sum); + + printf("Sum = %d\n", _mm_cvtsi64_si32(sum)); + return 0; +} + +Generates: + + movl $11, %eax +### movd %eax, %mm0 +### movq %mm0, 8(%esp) +### movl 8(%esp), %eax + movl %eax, 4(%esp) + movl $_str, (%esp) + call L_printf$stub + xorl %eax, %eax + addl $28, %esp + +These instructions are unnecessary. diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt new file mode 100644 index 0000000..f4b54c4 --- /dev/null +++ b/lib/Target/X86/README-SSE.txt @@ -0,0 +1,629 @@ +//===---------------------------------------------------------------------===// +// Random ideas for the X86 backend: SSE-specific stuff. +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// + +Expand libm rounding functions inline: Significant speedups possible. +http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html + +//===---------------------------------------------------------------------===// + +When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and +other fast SSE modes. + +//===---------------------------------------------------------------------===// + +Think about doing i64 math in SSE regs. 
+ +//===---------------------------------------------------------------------===// + +This testcase should have no SSE instructions in it, and only one load from +a constant pool: + +double %test3(bool %B) { + %C = select bool %B, double 123.412, double 523.01123123 + ret double %C +} + +Currently, the select is being lowered, which prevents the dag combiner from +turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' + +The pattern isel got this one right. + +//===---------------------------------------------------------------------===// + +SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction +like this: + + X += y + +and the register allocator decides to spill X, it is cheaper to emit this as: + +Y += [xslot] +store Y -> [xslot] + +than as: + +tmp = [xslot] +tmp += y +store tmp -> [xslot] + +..and this uses one fewer register (so this should be done at load folding +time, not at spiller time). *Note* however that this can only be done +if Y is dead. Here's a testcase: + +%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0] +implementation ; Functions: +declare void %printf(int, ...) 
+void %main() { +build_tree.exit: + br label %no_exit.i7 +no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit + %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; <double> [#uses=1] + %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; <double> [#uses=1] + %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00 + %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00 + br bool false, label %Compute_Tree.exit23, label %no_exit.i7 +Compute_Tree.exit23: ; preds = %no_exit.i7 + tail call void (int, ...)* %printf( int 0 ) + store double %tmp.34.i18, double* null + ret void +} + +We currently emit: + +.BBmain_1: + xorpd %XMM1, %XMM1 + addsd %XMM0, %XMM1 +*** movsd %XMM2, QWORD PTR [%ESP + 8] +*** addsd %XMM2, %XMM1 +*** movsd QWORD PTR [%ESP + 8], %XMM2 + jmp .BBmain_1 # no_exit.i7 + +This is a bugpoint reduced testcase, which is why the testcase doesn't make +much sense (e.g. its an infinite loop). :) + +//===---------------------------------------------------------------------===// + +SSE should implement 'select_cc' using 'emulated conditional moves' that use +pcmp/pand/pandn/por to do a selection instead of a conditional branch: + +double %X(double %Y, double %Z, double %A, double %B) { + %C = setlt double %A, %B + %z = add double %Z, 0.0 ;; select operand is not a load + %D = select bool %C, double %Y, double %z + ret double %D +} + +We currently emit: + +_X: + subl $12, %esp + xorpd %xmm0, %xmm0 + addsd 24(%esp), %xmm0 + movsd 32(%esp), %xmm1 + movsd 16(%esp), %xmm2 + ucomisd 40(%esp), %xmm1 + jb LBB_X_2 +LBB_X_1: + movsd %xmm0, %xmm2 +LBB_X_2: + movsd %xmm2, (%esp) + fldl (%esp) + addl $12, %esp + ret + +//===---------------------------------------------------------------------===// + +It's not clear whether we should use pxor or xorps / xorpd to clear XMM +registers. The choice may depend on subtarget information. We should do some +more experiments on different x86 machines. 
+ +//===---------------------------------------------------------------------===// + +Currently the x86 codegen isn't very good at mixing SSE and FPStack +code: + +unsigned int foo(double x) { return x; } + +foo: + subl $20, %esp + movsd 24(%esp), %xmm0 + movsd %xmm0, 8(%esp) + fldl 8(%esp) + fisttpll (%esp) + movl (%esp), %eax + addl $20, %esp + ret + +This will be solved when we go to a dynamic programming based isel. + +//===---------------------------------------------------------------------===// + +Lower memcpy / memset to a series of SSE 128 bit move instructions when it's +feasible. + +//===---------------------------------------------------------------------===// + +Teach the coalescer to commute 2-addr instructions, allowing us to eliminate +the reg-reg copy in this example: + +float foo(int *x, float *y, unsigned c) { + float res = 0.0; + unsigned i; + for (i = 0; i < c; i++) { + float xx = (float)x[i]; + xx = xx * y[i]; + xx += res; + res = xx; + } + return res; +} + +LBB_foo_3: # no_exit + cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI] + mulss %XMM0, DWORD PTR [%EAX + 4*%ESI] + addss %XMM0, %XMM1 + inc %ESI + cmp %ESI, %ECX +**** movaps %XMM1, %XMM0 + jb LBB_foo_3 # no_exit + +//===---------------------------------------------------------------------===// + +Codegen: + if (copysign(1.0, x) == copysign(1.0, y)) +into: + if (x^y & mask) +when using SSE. + +//===---------------------------------------------------------------------===// + +Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half +of a v4sf value. + +//===---------------------------------------------------------------------===// + +Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. +Perhaps use pxor / xorp* to clear a XMM register first? + +//===---------------------------------------------------------------------===// + +How to decide when to use the "floating point version" of logical ops? 
Here are +some code fragments: + + movaps LCPI5_5, %xmm2 + divps %xmm1, %xmm2 + mulps %xmm2, %xmm3 + mulps 8656(%ecx), %xmm3 + addps 8672(%ecx), %xmm3 + andps LCPI5_6, %xmm2 + andps LCPI5_1, %xmm3 + por %xmm2, %xmm3 + movdqa %xmm3, (%edi) + + movaps LCPI5_5, %xmm1 + divps %xmm0, %xmm1 + mulps %xmm1, %xmm3 + mulps 8656(%ecx), %xmm3 + addps 8672(%ecx), %xmm3 + andps LCPI5_6, %xmm1 + andps LCPI5_1, %xmm3 + orps %xmm1, %xmm3 + movaps %xmm3, 112(%esp) + movaps %xmm3, (%ebx) + +Due to some minor source change, the later case ended up using orps and movaps +instead of por and movdqa. Does it matter? + +//===---------------------------------------------------------------------===// + +X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible +to choose between movaps, movapd, and movdqa based on types of source and +destination? + +How about andps, andpd, and pand? Do we really care about the type of the packed +elements? If not, why not always use the "ps" variants which are likely to be +shorter. + +//===---------------------------------------------------------------------===// + +External test Nurbs exposed some problems. Look for +__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. 
This is what icc +emits: + + movaps (%edx), %xmm2 #59.21 + movaps (%edx), %xmm5 #60.21 + movaps (%edx), %xmm4 #61.21 + movaps (%edx), %xmm3 #62.21 + movl 40(%ecx), %ebp #69.49 + shufps $0, %xmm2, %xmm5 #60.21 + movl 100(%esp), %ebx #69.20 + movl (%ebx), %edi #69.20 + imull %ebp, %edi #69.49 + addl (%eax), %edi #70.33 + shufps $85, %xmm2, %xmm4 #61.21 + shufps $170, %xmm2, %xmm3 #62.21 + shufps $255, %xmm2, %xmm2 #63.21 + lea (%ebp,%ebp,2), %ebx #69.49 + negl %ebx #69.49 + lea -3(%edi,%ebx), %ebx #70.33 + shll $4, %ebx #68.37 + addl 32(%ecx), %ebx #68.37 + testb $15, %bl #91.13 + jne L_B1.24 # Prob 5% #91.13 + +This is the llvm code after instruction scheduling: + +cond_next140 (0xa910740, LLVM BB @0xa90beb0): + %reg1078 = MOV32ri -3 + %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 + %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 + %reg1080 = IMUL32rr %reg1079, %reg1037 + %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 + %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 + %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 + %reg1082 = SHL32ri %reg1038, 4 + %reg1039 = ADD32rr %reg1036, %reg1082 + %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 + %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 + %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 + %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 + %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 + %reg1040 = MOV32rr %reg1039 + %reg1084 = AND32ri8 %reg1039, 15 + CMP32ri8 %reg1084, 0 + JE mbb<cond_next204,0xa914d30> + +Still ok. 
After register allocation: + +cond_next140 (0xa910740, LLVM BB @0xa90beb0): + %EAX = MOV32ri -3 + %EDX = MOV32rm <fi#3>, 1, %NOREG, 0 + ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0 + %EDX = MOV32rm <fi#7>, 1, %NOREG, 0 + %EDX = MOV32rm %EDX, 1, %NOREG, 40 + IMUL32rr %EAX<def&use>, %EDX + %ESI = MOV32rm <fi#5>, 1, %NOREG, 0 + %ESI = MOV32rm %ESI, 1, %NOREG, 0 + MOV32mr <fi#4>, 1, %NOREG, 0, %ESI + %EAX = LEA32r %ESI, 1, %EAX, -3 + %ESI = MOV32rm <fi#7>, 1, %NOREG, 0 + %ESI = MOV32rm %ESI, 1, %NOREG, 32 + %EDI = MOV32rr %EAX + SHL32ri %EDI<def&use>, 4 + ADD32rr %EDI<def&use>, %ESI + %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0 + %XMM1 = MOVAPSrr %XMM0 + SHUFPSrr %XMM1<def&use>, %XMM1, 170 + %XMM2 = MOVAPSrr %XMM0 + SHUFPSrr %XMM2<def&use>, %XMM2, 0 + %XMM3 = MOVAPSrr %XMM0 + SHUFPSrr %XMM3<def&use>, %XMM3, 255 + SHUFPSrr %XMM0<def&use>, %XMM0, 85 + %EBX = MOV32rr %EDI + AND32ri8 %EBX<def&use>, 15 + CMP32ri8 %EBX, 0 + JE mbb<cond_next204,0xa914d30> + +This looks really bad. The problem is shufps is a destructive opcode. Since it +appears as operand two in more than one shufps ops. It resulted in a number of +copies. Note icc also suffers from the same problem. Either the instruction +selector should select pshufd or The register allocator can made the two-address +to three-address transformation. + +It also exposes some other problems. See MOV32ri -3 and the spills. + +//===---------------------------------------------------------------------===// + +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500 + +LLVM is producing bad code. + +LBB_main_4: # cond_true44 + addps %xmm1, %xmm2 + subps %xmm3, %xmm2 + movaps (%ecx), %xmm4 + movaps %xmm2, %xmm1 + addps %xmm4, %xmm1 + addl $16, %ecx + incl %edx + cmpl $262144, %edx + movaps %xmm3, %xmm2 + movaps %xmm4, %xmm3 + jne LBB_main_4 # cond_true44 + +There are two problems. 1) No need to two loop induction variables. We can +compare against 262144 * 16. 2) Known register coalescer issue. 
We should +be able eliminate one of the movaps: + + addps %xmm2, %xmm1 <=== Commute! + subps %xmm3, %xmm1 + movaps (%ecx), %xmm4 + movaps %xmm1, %xmm1 <=== Eliminate! + addps %xmm4, %xmm1 + addl $16, %ecx + incl %edx + cmpl $262144, %edx + movaps %xmm3, %xmm2 + movaps %xmm4, %xmm3 + jne LBB_main_4 # cond_true44 + +//===---------------------------------------------------------------------===// + +Consider: + +__m128 test(float a) { + return _mm_set_ps(0.0, 0.0, 0.0, a*a); +} + +This compiles into: + +movss 4(%esp), %xmm1 +mulss %xmm1, %xmm1 +xorps %xmm0, %xmm0 +movss %xmm1, %xmm0 +ret + +Because mulss doesn't modify the top 3 elements, the top elements of +xmm1 are already zero'd. We could compile this to: + +movss 4(%esp), %xmm0 +mulss %xmm0, %xmm0 +ret + +//===---------------------------------------------------------------------===// + +Here's a sick and twisted idea. Consider code like this: + +__m128 test(__m128 a) { + float b = *(float*)&A; + ... + return _mm_set_ps(0.0, 0.0, 0.0, b); +} + +This might compile to this code: + +movaps c(%esp), %xmm1 +xorps %xmm0, %xmm0 +movss %xmm1, %xmm0 +ret + +Now consider if the ... code caused xmm1 to get spilled. This might produce +this code: + +movaps c(%esp), %xmm1 +movaps %xmm1, c2(%esp) +... + +xorps %xmm0, %xmm0 +movaps c2(%esp), %xmm1 +movss %xmm1, %xmm0 +ret + +However, since the reload is only used by these instructions, we could +"fold" it into the uses, producing something like this: + +movaps c(%esp), %xmm1 +movaps %xmm1, c2(%esp) +... + +movss c2(%esp), %xmm0 +ret + +... saving two instructions. + +The basic idea is that a reload from a spill slot, can, if only one 4-byte +chunk is used, bring in 3 zeros the the one element instead of 4 elements. +This can be used to simplify a variety of shuffle operations, where the +elements are fixed zeros. 
+ +//===---------------------------------------------------------------------===// + +For this: + +#include <emmintrin.h> +void test(__m128d *r, __m128d *A, double B) { + *r = _mm_loadl_pd(*A, &B); +} + +We generates: + + subl $12, %esp + movsd 24(%esp), %xmm0 + movsd %xmm0, (%esp) + movl 20(%esp), %eax + movapd (%eax), %xmm0 + movlpd (%esp), %xmm0 + movl 16(%esp), %eax + movapd %xmm0, (%eax) + addl $12, %esp + ret + +icc generates: + + movl 4(%esp), %edx #3.6 + movl 8(%esp), %eax #3.6 + movapd (%eax), %xmm0 #4.22 + movlpd 12(%esp), %xmm0 #4.8 + movapd %xmm0, (%edx) #4.3 + ret #5.1 + +So icc is smart enough to know that B is in memory so it doesn't load it and +store it back to stack. + +//===---------------------------------------------------------------------===// + +__m128d test1( __m128d A, __m128d B) { + return _mm_shuffle_pd(A, B, 0x3); +} + +compiles to + +shufpd $3, %xmm1, %xmm0 + +Perhaps it's better to use unpckhpd instead? + +unpckhpd %xmm1, %xmm0 + +Don't know if unpckhpd is faster. But it is shorter. + +//===---------------------------------------------------------------------===// + +This code generates ugly code, probably due to costs being off or something: + +void %test(float* %P, <4 x float>* %P2 ) { + %xFloat0.688 = load float* %P + %loadVector37.712 = load <4 x float>* %P2 + %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3 + store <4 x float> %inFloat3.713, <4 x float>* %P2 + ret void +} + +Generates: + +_test: + pxor %xmm0, %xmm0 + movd %xmm0, %eax ;; EAX = 0! + movl 8(%esp), %ecx + movaps (%ecx), %xmm0 + pinsrw $6, %eax, %xmm0 + shrl $16, %eax ;; EAX = 0 again! + pinsrw $7, %eax, %xmm0 + movaps %xmm0, (%ecx) + ret + +It would be better to generate: + +_test: + movl 8(%esp), %ecx + movaps (%ecx), %xmm0 + xor %eax, %eax + pinsrw $6, %eax, %xmm0 + pinsrw $7, %eax, %xmm0 + movaps %xmm0, (%ecx) + ret + +or use pxor (to make a zero vector) and shuffle (to insert it). 
+ +//===---------------------------------------------------------------------===// + +Some useful information in the Apple Altivec / SSE Migration Guide: + +http://developer.apple.com/documentation/Performance/Conceptual/ +Accelerate_sse_migration/index.html + +e.g. SSE select using and, andnot, or. Various SSE compare translations. + +//===---------------------------------------------------------------------===// + +Add hooks to commute some CMPP operations. + +//===---------------------------------------------------------------------===// + +Apply the same transformation that merged four float into a single 128-bit load +to loads from constant pool. + +//===---------------------------------------------------------------------===// + +Floating point max / min are commutable when -enable-unsafe-fp-path is +specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other +nodes which are selected to max / min instructions that are marked commutable. + +//===---------------------------------------------------------------------===// + +We should compile this: +#include <xmmintrin.h> +typedef union { + int i[4]; + float f[4]; + __m128 v; +} vector4_t; +void swizzle (const void *a, vector4_t * b, vector4_t * c) { + b->v = _mm_loadl_pi (b->v, (__m64 *) a); + c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1); +} + +to: + +_swizzle: + movl 4(%esp), %eax + movl 8(%esp), %edx + movl 12(%esp), %ecx + movlps (%eax), %xmm0 + movlps %xmm0, (%edx) + movlps 8(%eax), %xmm0 + movlps %xmm0, (%ecx) + ret + +not: + +swizzle: + movl 8(%esp), %eax + movaps (%eax), %xmm0 + movl 4(%esp), %ecx + movlps (%ecx), %xmm0 + movaps %xmm0, (%eax) + movl 12(%esp), %eax + movaps (%eax), %xmm0 + movlps 8(%ecx), %xmm0 + movaps %xmm0, (%eax) + ret + +//===---------------------------------------------------------------------===// + +This code: + +#include <emmintrin.h> +__m128i test(long long i) { return _mm_cvtsi64x_si128(i); } + +Should turn into a single 'movq %rdi, %xmm0' instruction. 
Instead, we +get this (on x86-64): + +_test: + movd %rdi, %xmm1 + xorps %xmm0, %xmm0 + movsd %xmm1, %xmm0 + ret + +The LLVM IR is: + +target triple = "x86_64-apple-darwin8" +define <2 x i64> @test(i64 %i) { +entry: + %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0 + %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1 + ret <2 x i64> %tmp11 +} + +//===---------------------------------------------------------------------===// + +These functions should produce the same code: + +#include <emmintrin.h> + +typedef long long __m128i __attribute__ ((__vector_size__ (16))); + +int foo(__m128i* val) { + return __builtin_ia32_vec_ext_v4si(*val, 1); +} +int bar(__m128i* val) { + union vs { + __m128i *_v; + int* _s; + } v = {val}; + return v._s[1]; +} + +We currently produce (with -m64): + +_foo: + pshufd $1, (%rdi), %xmm0 + movd %xmm0, %eax + ret +_bar: + movl 4(%rdi), %eax + ret + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt new file mode 100644 index 0000000..191904a --- /dev/null +++ b/lib/Target/X86/README-X86-64.txt @@ -0,0 +1,223 @@ +//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===// + +Implement different PIC models? Right now we only support Mac OS X with small +PIC code model. + +//===---------------------------------------------------------------------===// + +Make use of "Red Zone". + +//===---------------------------------------------------------------------===// + +Implement __int128 and long double support. + +//===---------------------------------------------------------------------===// + +For this: + +extern void xx(void); +void bar(void) { + xx(); +} + +gcc compiles to: + +.globl _bar +_bar: + jmp _xx + +We need to do the tailcall optimization as well. 
+ +//===---------------------------------------------------------------------===// + +AMD64 Optimization Manual 8.2 has some nice information about optimizing integer +multiplication by a constant. How much of it applies to Intel's X86-64 +implementation? There are definite trade-offs to consider: latency vs. register +pressure vs. code size. + +//===---------------------------------------------------------------------===// + +Are we better off using branches instead of cmove to implement FP to +unsigned i64? + +_conv: + ucomiss LC0(%rip), %xmm0 + cvttss2siq %xmm0, %rdx + jb L3 + subss LC0(%rip), %xmm0 + movabsq $-9223372036854775808, %rax + cvttss2siq %xmm0, %rdx + xorq %rax, %rdx +L3: + movq %rdx, %rax + ret + +instead of + +_conv: + movss LCPI1_0(%rip), %xmm1 + cvttss2siq %xmm0, %rcx + movaps %xmm0, %xmm2 + subss %xmm1, %xmm2 + cvttss2siq %xmm2, %rax + movabsq $-9223372036854775808, %rdx + xorq %rdx, %rax + ucomiss %xmm1, %xmm0 + cmovb %rcx, %rax + ret + +Seems like the jb branch has high likelihood of being taken. It would have +saved a few instructions. + +//===---------------------------------------------------------------------===// + +Poor codegen: + +int X[2]; +int b; +void test(void) { + memset(X, b, 2*sizeof(X[0])); +} + +llc: + movq _b@GOTPCREL(%rip), %rax + movzbq (%rax), %rax + movq %rax, %rcx + shlq $8, %rcx + orq %rax, %rcx + movq %rcx, %rax + shlq $16, %rax + orq %rcx, %rax + movq %rax, %rcx + shlq $32, %rcx + movq _X@GOTPCREL(%rip), %rdx + orq %rax, %rcx + movq %rcx, (%rdx) + ret + +gcc: + movq _b@GOTPCREL(%rip), %rax + movabsq $72340172838076673, %rdx + movzbq (%rax), %rax + imulq %rdx, %rax + movq _X@GOTPCREL(%rip), %rdx + movq %rax, (%rdx) + ret + +//===---------------------------------------------------------------------===// + +Vararg function prologue can be further optimized. Currently all XMM registers +are stored into register save area.
Most of them can be eliminated since the +upper bound of the number of XMM registers used is passed in %al. gcc produces +something like the following: + + movzbl %al, %edx + leaq 0(,%rdx,4), %rax + leaq 4+L2(%rip), %rdx + leaq 239(%rsp), %rax + jmp *%rdx + movaps %xmm7, -15(%rax) + movaps %xmm6, -31(%rax) + movaps %xmm5, -47(%rax) + movaps %xmm4, -63(%rax) + movaps %xmm3, -79(%rax) + movaps %xmm2, -95(%rax) + movaps %xmm1, -111(%rax) + movaps %xmm0, -127(%rax) +L2: + +It jumps over the movaps that do not need to be stored. Hard to see this being +significant as it added 5 instructions (including an indirect branch) to avoid +executing 0 to 8 stores in the function prologue. + +Perhaps we can optimize for the common case where no XMM registers are used for +parameter passing. i.e. if %al == 0 jump over all stores. Or in the case of a +leaf function where we can determine that no XMM input parameter is needed, avoid +emitting the stores at all. + +//===---------------------------------------------------------------------===// + +AMD64 has a complex calling convention for aggregate passing by value: + +1. If the size of an object is larger than two eightbytes, or in C++, is a non- + POD structure or union type, or contains unaligned fields, it has class + MEMORY. +2. Both eightbytes get initialized to class NO_CLASS. +3. Each field of an object is classified recursively so that always two fields + are considered. The resulting class is calculated according to the classes + of the fields in the eightbyte: + (a) If both classes are equal, this is the resulting class. + (b) If one of the classes is NO_CLASS, the resulting class is the other + class. + (c) If one of the classes is MEMORY, the result is the MEMORY class. + (d) If one of the classes is INTEGER, the result is the INTEGER. + (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as + class. + (f) Otherwise class SSE is used. +4. 
Then a post merger cleanup is done: + (a) If one of the classes is MEMORY, the whole argument is passed in memory. + (b) If SSEUP is not preceded by SSE, it is converted to SSE. + +Currently llvm frontend does not handle this correctly. + +Problem 1: + typedef struct { int i; double d; } QuadWordS; +It is currently passed in two i64 integer registers. However, gcc compiled +callee expects the second element 'd' to be passed in XMM0. + +Problem 2: + typedef struct { int32_t i; float j; double d; } QuadWordS; +The size of the first two fields == i64 so they will be combined and passed in +an integer register RDI. The third field is still passed in XMM0. + +Problem 3: + typedef struct { int64_t i; int8_t j; int64_t d; } S; + void test(S s) +The size of this aggregate is greater than two i64 so it should be passed in +memory. Currently llvm breaks this down and passes it in three integer +registers. + +Problem 4: +Taking problem 3 one step ahead where a function expects an aggregate value +in memory followed by more parameter(s) passed in register(s). + void test(S s, int b) + +LLVM IR does not allow parameter passing by aggregates, therefore it must break +the aggregate value (in problem 3 and 4) into a number of scalar values: + void %test(long %s.i, byte %s.j, long %s.d); + +However, if the backend were to lower this code literally it would pass the 3 +values in integer registers. To force it to be passed in memory, the frontend +should change the function signature to: + void %test(long %undef1, long %undef2, long %undef3, long %undef4, + long %undef5, long %undef6, + long %s.i, byte %s.j, long %s.d); +And the callee would look something like this: + call void %test( undef, undef, undef, undef, undef, undef, + %tmp.s.i, %tmp.s.j, %tmp.s.d ); +The first 6 undef parameters would exhaust the 6 integer registers used for +parameter passing. The following three integer values would then be forced into +memory. 
+ +For problem 4, the parameter 'd' would be moved to the front of the parameter +list so it will be passed in register: + void %test(int %d, + long %undef1, long %undef2, long %undef3, long %undef4, + long %undef5, long %undef6, + long %s.i, byte %s.j, long %s.d); + +//===---------------------------------------------------------------------===// + +Right now the asm printer assumes GlobalAddress are accessed via RIP relative +addressing. Therefore, it is not possible to generate this: + movabsq $__ZTV10polynomialIdE+16, %rax + +That is ok for now since we currently only support small model. So the above +is selected as + leaq __ZTV10polynomialIdE+16(%rip), %rax + +This is probably slightly slower but is much shorter than movabsq. However, if +we were to support medium or larger code models, we need to use the movabs +instruction. We should probably introduce something like AbsoluteAddress to +distinguish it from GlobalAddress so the asm printer and JIT code emitter can +do the right thing. diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt new file mode 100644 index 0000000..f15090a --- /dev/null +++ b/lib/Target/X86/README.txt @@ -0,0 +1,1150 @@ +//===---------------------------------------------------------------------===// +// Random ideas for the X86 backend. +//===---------------------------------------------------------------------===// + +Missing features: + - Support for SSE4: http://www.intel.com/software/penryn +http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf + - support for 3DNow! + - weird abis? + +//===---------------------------------------------------------------------===// + +Add a MUL2U and MUL2S nodes to represent a multiply that returns both the +Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to +X86, & make the dag combiner produce it when needed. 
This will eliminate one +imul from the code generated for: + +long long test(long long X, long long Y) { return X*Y; } + +by using the EAX result from the mul. We should add a similar node for +DIVREM. + +another case is: + +long long test(int X, int Y) { return (long long)X*Y; } + +... which should only be one imul instruction. + +or: + +unsigned long long int t2(unsigned int a, unsigned int b) { + return (unsigned long long)a * b; +} + +... which should be one mul instruction. + + +This can be done with a custom expander, but it would be nice to move this to +generic code. + +//===---------------------------------------------------------------------===// + +CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 +backend knows how to three-addressify this shift, but it appears the register +allocator isn't even asking it to do so in this case. We should investigate +why this isn't happening, it could have significant impact on other important +cases for X86 as well. + +//===---------------------------------------------------------------------===// + +This should be one DIV/IDIV instruction, not a libcall: + +unsigned test(unsigned long long X, unsigned Y) { + return X/Y; +} + +This can be done trivially with a custom legalizer. What about overflow +though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224 + +//===---------------------------------------------------------------------===// + +Improvements to the multiply -> shift/add algorithm: +http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html + +//===---------------------------------------------------------------------===// + +Improve code like this (occurs fairly frequently, e.g. in LLVM): +long long foo(int x) { return 1LL << x; } + +http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html +http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html +http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html + +Another useful one would be ~0ULL >> X and ~0ULL << X. 
+ +One better solution for 1LL << x is: + xorl %eax, %eax + xorl %edx, %edx + testb $32, %cl + sete %al + setne %dl + sall %cl, %eax + sall %cl, %edx + +But that requires good 8-bit subreg support. + +64-bit shifts (in general) expand to really bad code. Instead of using +cmovs, we should expand to a conditional branch like GCC produces. + +//===---------------------------------------------------------------------===// + +Compile this: +_Bool f(_Bool a) { return a!=1; } + +into: + movzbl %dil, %eax + xorl $1, %eax + ret + +//===---------------------------------------------------------------------===// + +Some isel ideas: + +1. Dynamic programming based approach when compile time if not an + issue. +2. Code duplication (addressing mode) during isel. +3. Other ideas from "Register-Sensitive Selection, Duplication, and + Sequencing of Instructions". +4. Scheduling for reduced register pressure. E.g. "Minimum Register + Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" + and other related papers. + http://citeseer.ist.psu.edu/govindarajan01minimum.html + +//===---------------------------------------------------------------------===// + +Should we promote i16 to i32 to avoid partial register update stalls? + +//===---------------------------------------------------------------------===// + +Leave any_extend as pseudo instruction and hint to register +allocator. Delay codegen until post register allocation. + +//===---------------------------------------------------------------------===// + +Count leading zeros and count trailing zeros: + +int clz(int X) { return __builtin_clz(X); } +int ctz(int X) { return __builtin_ctz(X); } + +$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel +clz: + bsr %eax, DWORD PTR [%esp+4] + xor %eax, 31 + ret +ctz: + bsf %eax, DWORD PTR [%esp+4] + ret + +however, check that these are defined for 0 and 32. Our intrinsics are, GCC's +aren't. 
+ +Another example (use predsimplify to eliminate a select): + +int foo (unsigned long j) { + if (j) + return __builtin_ffs (j) - 1; + else + return 0; +} + +//===---------------------------------------------------------------------===// + +It appears icc use push for parameter passing. Need to investigate. + +//===---------------------------------------------------------------------===// + +Only use inc/neg/not instructions on processors where they are faster than +add/sub/xor. They are slower on the P4 due to only updating some processor +flags. + +//===---------------------------------------------------------------------===// + +The instruction selector sometimes misses folding a load into a compare. The +pattern is written as (cmp reg, (load p)). Because the compare isn't +commutative, it is not matched with the load on both sides. The dag combiner +should be made smart enough to cannonicalize the load into the RHS of a compare +when it can invert the result of the compare for free. + +//===---------------------------------------------------------------------===// + +How about intrinsics? An example is: + *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C)); + +compiles to + pmuludq (%eax), %xmm0 + movl 8(%esp), %eax + movdqa (%eax), %xmm1 + pmulhuw %xmm0, %xmm1 + +The transformation probably requires a X86 specific pass or a DAG combiner +target specific hook. + +//===---------------------------------------------------------------------===// + +In many cases, LLVM generates code like this: + +_test: + movl 8(%esp), %eax + cmpl %eax, 4(%esp) + setl %al + movzbl %al, %eax + ret + +on some processors (which ones?), it is more efficient to do this: + +_test: + movl 8(%esp), %ebx + xor %eax, %eax + cmpl %ebx, 4(%esp) + setl %al + ret + +Doing this correctly is tricky though, as the xor clobbers the flags. 
+ +//===---------------------------------------------------------------------===// + +We should generate bts/btr/etc instructions on targets where they are cheap or +when codesize is important. e.g., for: + +void setbit(int *target, int bit) { + *target |= (1 << bit); +} +void clearbit(int *target, int bit) { + *target &= ~(1 << bit); +} + +//===---------------------------------------------------------------------===// + +Instead of the following for memset char*, 1, 10: + + movl $16843009, 4(%edx) + movl $16843009, (%edx) + movw $257, 8(%edx) + +It might be better to generate + + movl $16843009, %eax + movl %eax, 4(%edx) + movl %eax, (%edx) + movw al, 8(%edx) + +when we can spare a register. It reduces code size. + +//===---------------------------------------------------------------------===// + +Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently +get this: + +int %test1(int %X) { + %Y = div int %X, 8 + ret int %Y +} + +_test1: + movl 4(%esp), %eax + movl %eax, %ecx + sarl $31, %ecx + shrl $29, %ecx + addl %ecx, %eax + sarl $3, %eax + ret + +GCC knows several different ways to codegen it, one of which is this: + +_test1: + movl 4(%esp), %eax + cmpl $-1, %eax + leal 7(%eax), %ecx + cmovle %ecx, %eax + sarl $3, %eax + ret + +which is probably slower, but it's interesting at least :) + +//===---------------------------------------------------------------------===// + +The first BB of this code: + +declare bool %foo() +int %bar() { + %V = call bool %foo() + br bool %V, label %T, label %F +T: + ret int 1 +F: + call bool %foo() + ret int 12 +} + +compiles to: + +_bar: + subl $12, %esp + call L_foo$stub + xorb $1, %al + testb %al, %al + jne LBB_bar_2 # F + +It would be better to emit "cmp %al, 1" than a xor and test. + +//===---------------------------------------------------------------------===// + +Enable X86InstrInfo::convertToThreeAddress(). 
+ +//===---------------------------------------------------------------------===// + +We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl +We should leave these as libcalls for everything over a much lower threshold, +since libc is hand tuned for medium and large mem ops (avoiding RFO for large +stores, TLB preheating, etc) + +//===---------------------------------------------------------------------===// + +Optimize this into something reasonable: + x * copysign(1.0, y) * copysign(1.0, z) + +//===---------------------------------------------------------------------===// + +Optimize copysign(x, *y) to use an integer load from y. + +//===---------------------------------------------------------------------===// + +%X = weak global int 0 + +void %foo(int %N) { + %N = cast int %N to uint + %tmp.24 = setgt int %N, 0 + br bool %tmp.24, label %no_exit, label %return + +no_exit: + %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ] + %i.0.0 = cast uint %indvar to int + volatile store int %i.0.0, int* %X + %indvar.next = add uint %indvar, 1 + %exitcond = seteq uint %indvar.next, %N + br bool %exitcond, label %return, label %no_exit + +return: + ret void +} + +compiles into: + + .text + .align 4 + .globl _foo +_foo: + movl 4(%esp), %eax + cmpl $1, %eax + jl LBB_foo_4 # return +LBB_foo_1: # no_exit.preheader + xorl %ecx, %ecx +LBB_foo_2: # no_exit + movl L_X$non_lazy_ptr, %edx + movl %ecx, (%edx) + incl %ecx + cmpl %eax, %ecx + jne LBB_foo_2 # no_exit +LBB_foo_3: # return.loopexit +LBB_foo_4: # return + ret + +We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after +remateralization is implemented. This can be accomplished with 1) a target +dependent LICM pass or 2) makeing SelectDAG represent the whole function. + +//===---------------------------------------------------------------------===// + +The following tests perform worse with LSR: + +lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. 
+ +//===---------------------------------------------------------------------===// + +We are generating far worse code than gcc: + +volatile short X, Y; + +void foo(int N) { + int i; + for (i = 0; i < N; i++) { X = i; Y = i*4; } +} + +LBB1_1: #bb.preheader + xorl %ecx, %ecx + xorw %dx, %dx +LBB1_2: #bb + movl L_X$non_lazy_ptr, %esi + movw %dx, (%esi) + movw %dx, %si + shlw $2, %si + movl L_Y$non_lazy_ptr, %edi + movw %si, (%edi) + incl %ecx + incw %dx + cmpl %eax, %ecx + jne LBB1_2 #bb + +vs. + + xorl %edx, %edx + movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi + movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx +L4: + movw %dx, (%esi) + leal 0(,%edx,4), %eax + movw %ax, (%ecx) + addl $1, %edx + cmpl %edx, %edi + jne L4 + +There are 3 issues: + +1. Lack of post regalloc LICM. +2. Poor sub-regclass support. That leads to inability to promote the 16-bit + arithmetic op to 32-bit and making use of leal. +3. LSR unable to reused IV for a different type (i16 vs. i32) even though + the cast would be free. + +//===---------------------------------------------------------------------===// + +Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / +FR64 to VR128. + +//===---------------------------------------------------------------------===// + +mov $reg, 48(%esp) +... +leal 48(%esp), %eax +mov %eax, (%esp) +call _foo + +Obviously it would have been better for the first mov (or any op) to store +directly %esp[0] if there are no other uses. 
+ +//===---------------------------------------------------------------------===// + +Adding to the list of cmp / test poor codegen issues: + +int test(__m128 *A, __m128 *B) { + if (_mm_comige_ss(*A, *B)) + return 3; + else + return 4; +} + +_test: + movl 8(%esp), %eax + movaps (%eax), %xmm0 + movl 4(%esp), %eax + movaps (%eax), %xmm1 + comiss %xmm0, %xmm1 + setae %al + movzbl %al, %ecx + movl $3, %eax + movl $4, %edx + cmpl $0, %ecx + cmove %edx, %eax + ret + +Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There +are a number of issues. 1) We are introducing a setcc between the result of the +intrisic call and select. 2) The intrinsic is expected to produce a i32 value +so a any extend (which becomes a zero extend) is added. + +We probably need some kind of target DAG combine hook to fix this. + +//===---------------------------------------------------------------------===// + +We generate significantly worse code for this than GCC: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150 +http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701 + +There is also one case we do worse on PPC. + +//===---------------------------------------------------------------------===// + +If shorter, we should use things like: +movzwl %ax, %eax +instead of: +andl $65535, %EAX + +The former can also be used when the two-addressy nature of the 'and' would +require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). + +//===---------------------------------------------------------------------===// + +Bad codegen: + +char foo(int x) { return x; } + +_foo: + movl 4(%esp), %eax + shll $24, %eax + sarl $24, %eax + ret + +SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of +sub-registers. 
+ +//===---------------------------------------------------------------------===// + +Consider this: + +typedef struct pair { float A, B; } pair; +void pairtest(pair P, float *FP) { + *FP = P.A+P.B; +} + +We currently generate this code with llvmgcc4: + +_pairtest: + movl 8(%esp), %eax + movl 4(%esp), %ecx + movd %eax, %xmm0 + movd %ecx, %xmm1 + addss %xmm0, %xmm1 + movl 12(%esp), %eax + movss %xmm1, (%eax) + ret + +we should be able to generate: +_pairtest: + movss 4(%esp), %xmm0 + movl 12(%esp), %eax + addss 8(%esp), %xmm0 + movss %xmm0, (%eax) + ret + +The issue is that llvmgcc4 is forcing the struct to memory, then passing it as +integer chunks. It does this so that structs like {short,short} are passed in +a single 32-bit integer stack slot. We should handle the safe cases above much +nicer, while still handling the hard cases. + +While true in general, in this specific case we could do better by promoting +load int + bitcast to float -> load fload. This basically needs alignment info, +the code is already implemented (but disabled) in dag combine). + +//===---------------------------------------------------------------------===// + +Another instruction selector deficiency: + +void %bar() { + %tmp = load int (int)** %foo + %tmp = tail call int %tmp( int 3 ) + ret void +} + +_bar: + subl $12, %esp + movl L_foo$non_lazy_ptr, %eax + movl (%eax), %eax + call *%eax + addl $12, %esp + ret + +The current isel scheme will not allow the load to be folded in the call since +the load's chain result is read by the callseq_start. + +//===---------------------------------------------------------------------===// + +Don't forget to find a way to squash noop truncates in the JIT environment. + +//===---------------------------------------------------------------------===// + +Implement anyext in the same manner as truncate that would allow them to be +eliminated. 
+ +//===---------------------------------------------------------------------===// + +How about implementing truncate / anyext as a property of machine instruction +operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register. +Do this for the cases where a truncate / anyext is guaranteed to be eliminated. +For IA32 that is truncate from 32 to 16 and anyext from 16 to 32. + +//===---------------------------------------------------------------------===// + +For this: + +int test(int a) +{ + return a * 3; +} + +We currently emits + imull $3, 4(%esp), %eax + +Perhaps this is what we really should generate is? Is imull three or four +cycles? Note: ICC generates this: + movl 4(%esp), %eax + leal (%eax,%eax,2), %eax + +The current instruction priority is based on pattern complexity. The former is +more "complex" because it folds a load so the latter will not be emitted. + +Perhaps we should use AddedComplexity to give LEA32r a higher priority? We +should always try to match LEA first since the LEA matching code does some +estimate to determine whether the match is profitable. + +However, if we care more about code size, then imull is better. It's two bytes +shorter than movl + leal. + +//===---------------------------------------------------------------------===// + +Implement CTTZ, CTLZ with bsf and bsr. + +//===---------------------------------------------------------------------===// + +It appears gcc place string data with linkonce linkage in +.section __TEXT,__const_coal,coalesced instead of +.section __DATA,__const_coal,coalesced. +Take a look at darwin.h, there are other Darwin assembler directives that we +do not make use of. 
+ +//===---------------------------------------------------------------------===// + +int %foo(int* %a, int %t) { +entry: + br label %cond_true + +cond_true: ; preds = %cond_true, %entry + %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ] + %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] + %tmp2 = getelementptr int* %a, int %x.0.0 + %tmp3 = load int* %tmp2 ; <int> [#uses=1] + %tmp5 = add int %t_addr.0.0, %x.0.0 ; <int> [#uses=1] + %tmp7 = add int %tmp5, %tmp3 ; <int> [#uses=2] + %tmp9 = add int %x.0.0, 1 ; <int> [#uses=2] + %tmp = setgt int %tmp9, 39 ; <bool> [#uses=1] + br bool %tmp, label %bb12, label %cond_true + +bb12: ; preds = %cond_true + ret int %tmp7 +} + +is pessimized by -loop-reduce and -indvars + +//===---------------------------------------------------------------------===// + +u32 to float conversion improvement: + +float uint32_2_float( unsigned u ) { + float fl = (int) (u & 0xffff); + float fh = (int) (u >> 16); + fh *= 0x1.0p16f; + return fh + fl; +} + +00000000 subl $0x04,%esp +00000003 movl 0x08(%esp,1),%eax +00000007 movl %eax,%ecx +00000009 shrl $0x10,%ecx +0000000c cvtsi2ss %ecx,%xmm0 +00000010 andl $0x0000ffff,%eax +00000015 cvtsi2ss %eax,%xmm1 +00000019 mulss 0x00000078,%xmm0 +00000021 addss %xmm1,%xmm0 +00000025 movss %xmm0,(%esp,1) +0000002a flds (%esp,1) +0000002d addl $0x04,%esp +00000030 ret + +//===---------------------------------------------------------------------===// + +When using fastcc abi, align stack slot of argument of type double on 8 byte +boundary to improve performance. + +//===---------------------------------------------------------------------===// + +Codegen: + +int f(int a, int b) { + if (a == 4 || a == 6) + b++; + return b; +} + + +as: + +or eax, 2 +cmp eax, 6 +jz label + +//===---------------------------------------------------------------------===// + +GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting +simplifications for integer "x cmp y ? a : b". 
For example, instead of: + +int G; +void f(int X, int Y) { + G = X < 0 ? 14 : 13; +} + +compiling to: + +_f: + movl $14, %eax + movl $13, %ecx + movl 4(%esp), %edx + testl %edx, %edx + cmovl %eax, %ecx + movl %ecx, _G + ret + +it could be: +_f: + movl 4(%esp), %eax + sarl $31, %eax + notl %eax + addl $14, %eax + movl %eax, _G + ret + +etc. + +//===---------------------------------------------------------------------===// + +Currently we don't have elimination of redundant stack manipulations. Consider +the code: + +int %main() { +entry: + call fastcc void %test1( ) + call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) ) + ret int 0 +} + +declare fastcc void %test1() + +declare fastcc void %test2(sbyte*) + + +This currently compiles to: + + subl $16, %esp + call _test5 + addl $12, %esp + subl $16, %esp + movl $_test5, (%esp) + call _test6 + addl $12, %esp + +The add\sub pair is really unneeded here. + +//===---------------------------------------------------------------------===// + +We currently compile sign_extend_inreg into two shifts: + +long foo(long X) { + return (long)(signed char)X; +} + +becomes: + +_foo: + movl 4(%esp), %eax + shll $24, %eax + sarl $24, %eax + ret + +This could be: + +_foo: + movsbl 4(%esp),%eax + ret + +//===---------------------------------------------------------------------===// + +Consider the expansion of: + +uint %test3(uint %X) { + %tmp1 = rem uint %X, 255 + ret uint %tmp1 +} + +Currently it compiles to: + +... + movl $2155905153, %ecx + movl 8(%esp), %esi + movl %esi, %eax + mull %ecx +... + +This could be "reassociated" into: + + movl $2155905153, %eax + movl 8(%esp), %ecx + mull %ecx + +to avoid the copy. In fact, the existing two-address stuff would do this +except that mul isn't a commutative 2-addr instruction. I guess this has +to be done at isel time based on the #uses to mul? 
+ +//===---------------------------------------------------------------------===// + +Make sure the instruction which starts a loop does not cross a cacheline +boundary. This requires knowing the exact length of each machine instruction. +That is somewhat complicated, but doable. Example 256.bzip2: + +In the new trace, the hot loop has an instruction which crosses a cacheline +boundary. In addition to potential cache misses, this can't help decoding as I +imagine there has to be some kind of complicated decoder reset and realignment +to grab the bytes from the next cacheline. + +532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines +942 942 0x3d03 movl %dh, (1809(%esp, %esi) +937 937 0x3d0a incl %esi +3 3 0x3d0b cmpb %bl, %dl +27 27 0x3d0d jnz 0x000062db <main+11707> + +//===---------------------------------------------------------------------===// + +In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. + +//===---------------------------------------------------------------------===// + +This could be a single 16-bit load. + +int f(char *p) { + if ((p[0] == 1) & (p[1] == 2)) return 1; + return 0; +} + +//===---------------------------------------------------------------------===// + +We should inline lrintf and probably other libc functions. + +//===---------------------------------------------------------------------===// + +Start using the flags more. 
For example, compile: + +int add_zf(int *x, int y, int a, int b) { + if ((*x += y) == 0) + return a; + else + return b; +} + +to: + addl %esi, (%rdi) + movl %edx, %eax + cmovne %ecx, %eax + ret +instead of: + +_add_zf: + addl (%rdi), %esi + movl %esi, (%rdi) + testl %esi, %esi + cmove %edx, %ecx + movl %ecx, %eax + ret + +and: + +int add_zf(int *x, int y, int a, int b) { + if ((*x + y) < 0) + return a; + else + return b; +} + +to: + +add_zf: + addl (%rdi), %esi + movl %edx, %eax + cmovns %ecx, %eax + ret + +instead of: + +_add_zf: + addl (%rdi), %esi + testl %esi, %esi + cmovs %edx, %ecx + movl %ecx, %eax + ret + +//===---------------------------------------------------------------------===// + +This: +#include <math.h> +int foo(double X) { return isnan(X); } + +compiles to (-m64): + +_foo: + pxor %xmm1, %xmm1 + ucomisd %xmm1, %xmm0 + setp %al + movzbl %al, %eax + ret + +the pxor is not needed, we could compare the value against itself. + +//===---------------------------------------------------------------------===// + +These two functions have identical effects: + +unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} +unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} + +We currently compile them to: + +_f: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + movl 8(%esp), %edx + cmpl %edx, %ecx + jne LBB1_2 #UnifiedReturnBlock +LBB1_1: #cond_true + addl $2, %eax + ret +LBB1_2: #UnifiedReturnBlock + movl %ecx, %eax + ret +_f2: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + cmpl 8(%esp), %ecx + sete %cl + movzbl %cl, %ecx + leal 1(%ecx,%eax), %eax + ret + +both of which are inferior to GCC's: + +_f: + movl 4(%esp), %edx + leal 1(%edx), %eax + addl $2, %edx + cmpl 8(%esp), %eax + cmove %edx, %eax + ret +_f2: + movl 4(%esp), %eax + addl $1, %eax + xorl %edx, %edx + cmpl 8(%esp), %eax + sete %dl + addl %edx, %eax + ret + +//===---------------------------------------------------------------------===// + 
+This code: + +void test(int X) { + if (X) abort(); +} + +is currently compiled to: + +_test: + subl $12, %esp + cmpl $0, 16(%esp) + jne LBB1_1 + addl $12, %esp + ret +LBB1_1: + call L_abort$stub + +It would be better to produce: + +_test: + subl $12, %esp + cmpl $0, 16(%esp) + jne L_abort$stub + addl $12, %esp + ret + +This can be applied to any no-return function call that takes no arguments etc. +Alternatively, the stack save/restore logic could be shrink-wrapped, producing +something like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + subl $12, %esp + call L_abort$stub + +Both are useful in different situations. Finally, it could be shrink-wrapped +and tail called, like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + pop %eax # realign stack. + call L_abort$stub + +Though this probably isn't worth it. + +//===---------------------------------------------------------------------===// + +We need to teach the codegen to convert two-address INC instructions to LEA +when the flags are dead. For example, on X86-64, compile: + +int foo(int A, int B) { + return A+1; +} + +to: + +_foo: + leal 1(%edi), %eax + ret + +instead of: + +_foo: + incl %edi + movl %edi, %eax + ret + +Another example is: + +;; X's live range extends beyond the shift, so the register allocator +;; cannot coalesce it with Y. Because of this, a copy needs to be +;; emitted before the shift to save the register value before it is +;; clobbered. However, this copy is not needed if the register +;; allocator turns the shift into an LEA. This also occurs for ADD. + +; Check that the shift gets turned into an LEA. 
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \ +; RUN: not grep {mov E.X, E.X} + +%G = external global int + +int %test1(int %X, int %Y) { + %Z = add int %X, %Y + volatile store int %Y, int* %G + volatile store int %Z, int* %G + ret int %X +} + +int %test2(int %X) { + %Z = add int %X, 1 ;; inc + volatile store int %Z, int* %G + ret int %X +} + +//===---------------------------------------------------------------------===// + +We use push/pop of stack space around calls in situations where we don't have to. +Call to f below produces: + subl $16, %esp <<<<< + movl %eax, (%esp) + call L_f$stub + addl $16, %esp <<<<< +The stack push/pop can be moved into the prolog/epilog. It does this because it's +building the frame pointer, but this should not be sufficient, only the use of alloca +should cause it to do this. +(There are other issues shown by this code, but this is one.) + +typedef struct _range_t { + float fbias; + float fscale; + int ibias; + int iscale; + int ishift; + unsigned char lut[]; +} range_t; + +struct _decode_t { + int type:4; + int unit:4; + int alpha:8; + int N:8; + int bpc:8; + int bpp:16; + int skip:8; + int swap:8; + const range_t*const*range; +}; + +typedef struct _decode_t decode_t; + +extern int f(const decode_t* decode); + +int decode_byte (const decode_t* decode) { + if (decode->swap != 0) + return f(decode); + return 0; +} + + +//===---------------------------------------------------------------------===// + +This: +#include <xmmintrin.h> +unsigned test(float f) { + return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f )); +} + +Compiles to: +_test: + movss 4(%esp), %xmm0 + movd %xmm0, %eax + ret + +it should compile to a move from the stack slot directly into eax. DAGCombine +has this xform, but it is currently disabled until the alignment fields of +the load/store nodes are trustworthy. 
+ +//===---------------------------------------------------------------------===// + +Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with +a neg instead of a sub instruction. Consider: + +int test(char X) { return 7-X; } + +we currently produce: +_test: + movl $7, %eax + movsbl 4(%esp), %ecx + subl %ecx, %eax + ret + +We would use one fewer register if codegen'd as: + + movsbl 4(%esp), %eax + neg %eax + add $7, %eax + ret + +Note that this isn't beneficial if the load can be folded into the sub. In +this case, we want a sub: + +int test(int X) { return 7-X; } +_test: + movl $7, %eax + subl 4(%esp), %eax + ret + +//===---------------------------------------------------------------------===// + +For code like: +phi (undef, x) + +We get an implicit def on the undef side. If the phi is spilled, we then get: +implicitdef xmm1 +store xmm1 -> stack + +It should be possible to teach the x86 backend to "fold" the store into the +implicitdef, which just deletes the implicit def. + +These instructions should go away: +#IMPLICIT_DEF %xmm1 +movaps %xmm1, 192(%esp) +movaps %xmm1, 224(%esp) +movaps %xmm1, 176(%esp) diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h new file mode 100644 index 0000000..c7663be --- /dev/null +++ b/lib/Target/X86/X86.h @@ -0,0 +1,66 @@ +//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the x86 +// target library, as used by the LLVM JIT. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_X86_H +#define TARGET_X86_H + +#include <iosfwd> + +namespace llvm { + +class X86TargetMachine; +class FunctionPassManager; +class FunctionPass; +class MachineCodeEmitter; + +/// createX86ISelDag - This pass converts a legalized DAG into a +/// X86-specific DAG, ready for instruction scheduling. +/// +FunctionPass *createX86ISelDag(X86TargetMachine &TM, bool Fast); + +/// createX86FloatingPointStackifierPass - This function returns a pass which +/// converts floating point register references and pseudo instructions into +/// floating point stack references and physical instructions. +/// +FunctionPass *createX86FloatingPointStackifierPass(); + +/// createX86CodePrinterPass - Returns a pass that prints the X86 +/// assembly code for a MachineFunction to the given output stream, +/// using the given target machine description. +/// +FunctionPass *createX86CodePrinterPass(std::ostream &o, X86TargetMachine &tm); + +/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code +/// to the specified MCE object. +FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM, + MachineCodeEmitter &MCE); + +/// createX86EmitCodeToMemory - Returns a pass that converts a register +/// allocated function into raw machine code in a dynamically +/// allocated chunk of memory. +/// +FunctionPass *createEmitX86CodeToMemory(); + +} // End llvm namespace + +// Defines symbolic names for X86 registers. This defines a mapping from +// register name to register number. +// +#include "X86GenRegisterNames.inc" + +// Defines symbolic names for the X86 instructions. 
+// +#include "X86GenInstrNames.inc" + +#endif diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td new file mode 100644 index 0000000..98362c8 --- /dev/null +++ b/lib/Target/X86/X86.td @@ -0,0 +1,150 @@ +//===- X86.td - Target definition file for the Intel X86 arch ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel i386 architecture, refered to +// here as the "X86" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... +// +include "../Target.td" + +//===----------------------------------------------------------------------===// +// X86 Subtarget features. +//===----------------------------------------------------------------------===// + +def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", + "Enable MMX instructions">; +def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", + "Enable SSE instructions", + [FeatureMMX]>; +def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", + "Enable SSE2 instructions", + [FeatureSSE1]>; +def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", + "Enable SSE3 instructions", + [FeatureSSE2]>; +def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", + "Enable SSSE3 instructions", + [FeatureSSE3]>; +def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", + "Enable 3DNow! instructions">; +def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", + "Enable 3DNow! 
Athlon instructions", + [Feature3DNow]>; +def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", + "Support 64-bit instructions", + [FeatureSSE2]>; + +//===----------------------------------------------------------------------===// +// X86 processors supported. +//===----------------------------------------------------------------------===// + +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +def : Proc<"generic", []>; +def : Proc<"i386", []>; +def : Proc<"i486", []>; +def : Proc<"pentium", []>; +def : Proc<"pentium-mmx", [FeatureMMX]>; +def : Proc<"i686", []>; +def : Proc<"pentiumpro", []>; +def : Proc<"pentium2", [FeatureMMX]>; +def : Proc<"pentium3", [FeatureSSE1]>; +def : Proc<"pentium-m", [FeatureSSE2]>; +def : Proc<"pentium4", [FeatureSSE2]>; +def : Proc<"x86-64", [Feature64Bit]>; +def : Proc<"yonah", [FeatureSSE3]>; +def : Proc<"prescott", [FeatureSSE3]>; +def : Proc<"nocona", [FeatureSSE3]>; +def : Proc<"core2", [FeatureSSSE3]>; + +def : Proc<"k6", [FeatureMMX]>; +def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>; +def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>; +def : Proc<"athlon", [FeatureMMX, Feature3DNowA]>; +def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA]>; +def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA]>; +def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA]>; +def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA]>; +def : Proc<"k8", [Feature3DNowA, Feature64Bit]>; +def : Proc<"opteron", [Feature3DNowA, Feature64Bit]>; +def : Proc<"athlon64", [Feature3DNowA, Feature64Bit]>; +def : Proc<"athlon-fx", [Feature3DNowA, Feature64Bit]>; + +def : Proc<"winchip-c6", [FeatureMMX]>; +def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>; +def : Proc<"c3", [FeatureMMX, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSSE1]>; + +//===----------------------------------------------------------------------===// +// Register File Description 
+//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "X86InstrInfo.td" + +def X86InstrInfo : InstrInfo { + + // Define how we want to layout our TargetSpecific information field... This + // should be kept up-to-date with the fields in the X86InstrInfo.h file. + let TSFlagsFields = ["FormBits", + "hasOpSizePrefix", + "hasAdSizePrefix", + "Prefix", + "hasREX_WPrefix", + "ImmTypeBits", + "FPFormBits", + "Opcode"]; + let TSFlagsShifts = [0, + 6, + 7, + 8, + 12, + 13, + 16, + 24]; +} + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "X86CallingConv.td" + + +//===----------------------------------------------------------------------===// +// Assembly Printers +//===----------------------------------------------------------------------===// + +// The X86 target supports two different syntaxes for emitting machine code. +// This is controlled by the -x86-asm-syntax={att|intel} +def ATTAsmWriter : AsmWriter { + string AsmWriterClassName = "ATTAsmPrinter"; + int Variant = 0; +} +def IntelAsmWriter : AsmWriter { + string AsmWriterClassName = "IntelAsmPrinter"; + int Variant = 1; +} + + +def X86 : Target { + // Information about the instructions... 
+ let InstructionSet = X86InstrInfo; + + let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; +} diff --git a/lib/Target/X86/X86ATTAsmPrinter.cpp b/lib/Target/X86/X86ATTAsmPrinter.cpp new file mode 100755 index 0000000..e97babe --- /dev/null +++ b/lib/Target/X86/X86ATTAsmPrinter.cpp @@ -0,0 +1,607 @@ +//===-- X86ATTAsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly -----===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to AT&T format assembly +// language. This printer is the output mechanism used by `llc'. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "X86ATTAsmPrinter.h" +#include "X86.h" +#include "X86COFF.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" +#include "X86TargetAsmInfo.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CallingConv.h" +#include "llvm/Module.h" +#include "llvm/Support/Mangler.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +static std::string computePICLabel(unsigned FnNum, + const TargetAsmInfo *TAI, + const X86Subtarget* Subtarget) { + std::string label; + if (Subtarget->isTargetDarwin()) + label = "\"L" + utostr_32(FnNum) + "$pb\""; + else if (Subtarget->isTargetELF()) + label = ".Lllvm$" + utostr_32(FnNum) + "$piclabel"; + else + assert(0 && "Don't know how to print PIC label!\n"); + + return label; +} + +/// getSectionForFunction - Return the section that we should emit the +/// specified function 
body into. +std::string X86ATTAsmPrinter::getSectionForFunction(const Function &F) const { + switch (F.getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::InternalLinkage: + case Function::DLLExportLinkage: + case Function::ExternalLinkage: + return TAI->getTextSection(); + case Function::WeakLinkage: + case Function::LinkOnceLinkage: + if (Subtarget->isTargetDarwin()) { + return ".section __TEXT,__textcoal_nt,coalesced,pure_instructions"; + } else if (Subtarget->isTargetCygMing()) { + return "\t.section\t.text$linkonce." + CurrentFnName + ",\"ax\""; + } else { + return "\t.section\t.llvm.linkonce.t." + CurrentFnName + + ",\"ax\",@progbits"; + } + } +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +/// +bool X86ATTAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + if (TAI->doesSupportDebugInformation()) { + // Let PassManager know we need debug information and relay + // the MachineModuleInfo address on to DwarfWriter. + DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>()); + } + + SetupMachineFunction(MF); + O << "\n\n"; + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + // Print out labels for the function. + const Function *F = MF.getFunction(); + unsigned CC = F->getCallingConv(); + + // Populate function information map. Actually, We don't want to populate + // non-stdcall or non-fastcall functions' information right now. + if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall) + FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>(); + + X86SharedAsmPrinter::decorateName(CurrentFnName, F); + + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + + switch (F->getLinkage()) { + default: assert(0 && "Unknown linkage type!"); + case Function::InternalLinkage: // Symbols default to internal. + EmitAlignment(4, F); // FIXME: This should be parameterized somewhere. 
+ break; + case Function::DLLExportLinkage: + DLLExportedFns.insert(Mang->makeNameProper(F->getName(), "")); + //FALLS THROUGH + case Function::ExternalLinkage: + EmitAlignment(4, F); // FIXME: This should be parameterized somewhere. + O << "\t.globl\t" << CurrentFnName << "\n"; + break; + case Function::LinkOnceLinkage: + case Function::WeakLinkage: + if (Subtarget->isTargetDarwin()) { + O << "\t.globl\t" << CurrentFnName << "\n"; + O << "\t.weak_definition\t" << CurrentFnName << "\n"; + } else if (Subtarget->isTargetCygMing()) { + EmitAlignment(4, F); // FIXME: This should be parameterized somewhere. + O << "\t.globl " << CurrentFnName << "\n"; + O << "\t.linkonce discard\n"; + } else { + EmitAlignment(4, F); // FIXME: This should be parameterized somewhere. + O << "\t.weak " << CurrentFnName << "\n"; + } + break; + } + if (F->hasHiddenVisibility()) { + if (const char *Directive = TAI->getHiddenDirective()) + O << Directive << CurrentFnName << "\n"; + } else if (F->hasProtectedVisibility()) { + if (const char *Directive = TAI->getProtectedDirective()) + O << Directive << CurrentFnName << "\n"; + } + + if (Subtarget->isTargetELF()) + O << "\t.type " << CurrentFnName << ",@function\n"; + else if (Subtarget->isTargetCygMing()) { + O << "\t.def\t " << CurrentFnName + << ";\t.scl\t" << + (F->getLinkage() == Function::InternalLinkage ? COFF::C_STAT : COFF::C_EXT) + << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT) + << ";\t.endef\n"; + } + + O << CurrentFnName << ":\n"; + // Add some workaround for linkonce linkage on Cygwin\MinGW + if (Subtarget->isTargetCygMing() && + (F->getLinkage() == Function::LinkOnceLinkage || + F->getLinkage() == Function::WeakLinkage)) + O << "Lllvm$workaround$fake$stub$" << CurrentFnName << ":\n"; + + if (TAI->doesSupportDebugInformation()) { + // Emit pre-function debug information. + DW.BeginFunction(&MF); + } + + // Print out code for the function. 
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block. + if (I->pred_begin() != I->pred_end()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printMachineInstruction(II); + } + } + + if (TAI->hasDotTypeDotSizeDirective()) + O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n"; + + if (TAI->doesSupportDebugInformation()) { + // Emit post-function debug information. + DW.EndFunction(); + } + + // Print out jump tables referenced by the function. + EmitJumpTableInfo(MF.getJumpTableInfo(), MF); + + // We didn't modify anything. + return false; +} + +static inline bool printGOT(TargetMachine &TM, const X86Subtarget* ST) { + return ST->isPICStyleGOT() && TM.getRelocationModel() == Reloc::PIC_; +} + +static inline bool printStub(TargetMachine &TM, const X86Subtarget* ST) { + return ST->isPICStyleStub() && TM.getRelocationModel() != Reloc::Static; +} + +void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, + const char *Modifier, bool NotRIPRel) { + const MachineOperand &MO = MI->getOperand(OpNo); + const MRegisterInfo &RI = *TM.getRegisterInfo(); + switch (MO.getType()) { + case MachineOperand::MO_Register: { + assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) && + "Virtual registers should not make it this far!"); + O << '%'; + unsigned Reg = MO.getReg(); + if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + MVT::ValueType VT = (strcmp(Modifier+6,"64") == 0) ? + MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : + ((strcmp(Modifier+6,"16") == 0) ? 
MVT::i16 : MVT::i8)); + Reg = getX86SubSuperRegister(Reg, VT); + } + for (const char *Name = RI.get(Reg).Name; *Name; ++Name) + O << (char)tolower(*Name); + return; + } + + case MachineOperand::MO_Immediate: + if (!Modifier || + (strcmp(Modifier, "debug") && strcmp(Modifier, "mem"))) + O << '$'; + O << MO.getImmedValue(); + return; + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + case MachineOperand::MO_JumpTableIndex: { + bool isMemOp = Modifier && !strcmp(Modifier, "mem"); + if (!isMemOp) O << '$'; + O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << "_" + << MO.getJumpTableIndex(); + + if (TM.getRelocationModel() == Reloc::PIC_) { + if (Subtarget->isPICStyleStub()) + O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber() + << "$pb\""; + else if (Subtarget->isPICStyleGOT()) + O << "@GOTOFF"; + } + + if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel) + O << "(%rip)"; + return; + } + case MachineOperand::MO_ConstantPoolIndex: { + bool isMemOp = Modifier && !strcmp(Modifier, "mem"); + if (!isMemOp) O << '$'; + O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" + << MO.getConstantPoolIndex(); + + if (TM.getRelocationModel() == Reloc::PIC_) { + if (Subtarget->isPICStyleStub()) + O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber() + << "$pb\""; + else if (Subtarget->isPICStyleGOT()) + O << "@GOTOFF"; + } + + int Offset = MO.getOffset(); + if (Offset > 0) + O << "+" << Offset; + else if (Offset < 0) + O << Offset; + + if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel) + O << "(%rip)"; + return; + } + case MachineOperand::MO_GlobalAddress: { + bool isCallOp = Modifier && !strcmp(Modifier, "call"); + bool isMemOp = Modifier && !strcmp(Modifier, "mem"); + bool needCloseParen = false; + + GlobalValue *GV = MO.getGlobal(); + GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + bool isThreadLocal = GVar && GVar->isThreadLocal(); + 
+ std::string Name = Mang->getValueName(GV); + X86SharedAsmPrinter::decorateName(Name, GV); + + if (!isMemOp && !isCallOp) + O << '$'; + else if (Name[0] == '$') { + // The name begins with a dollar-sign. In order to avoid having it look + // like an integer immediate to the assembler, enclose it in parens. + O << '('; + needCloseParen = true; + } + + if (printStub(TM, Subtarget)) { + // Link-once, declaration, or Weakly-linked global variables need + // non-lazily-resolved stubs + if (GV->isDeclaration() || + GV->hasWeakLinkage() || + GV->hasLinkOnceLinkage()) { + // Dynamically-resolved functions need a stub for the function. + if (isCallOp && isa<Function>(GV)) { + FnStubs.insert(Name); + O << TAI->getPrivateGlobalPrefix() << Name << "$stub"; + } else { + GVStubs.insert(Name); + O << TAI->getPrivateGlobalPrefix() << Name << "$non_lazy_ptr"; + } + } else { + if (GV->hasDLLImportLinkage()) + O << "__imp_"; + O << Name; + } + + if (!isCallOp && TM.getRelocationModel() == Reloc::PIC_) + O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber() + << "$pb\""; + } else { + if (GV->hasDLLImportLinkage()) { + O << "__imp_"; + } + O << Name; + + if (isCallOp && isa<Function>(GV)) { + if (printGOT(TM, Subtarget)) { + // Assemble call via PLT for non-local symbols + if (!(GV->hasHiddenVisibility() || GV->hasProtectedVisibility()) || + GV->isDeclaration()) + O << "@PLT"; + } + if (Subtarget->isTargetCygMing() && GV->isDeclaration()) + // Save function name for later type emission + FnStubs.insert(Name); + } + } + + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + + int Offset = MO.getOffset(); + if (Offset > 0) + O << "+" << Offset; + else if (Offset < 0) + O << Offset; + + if (isThreadLocal) { + if (TM.getRelocationModel() == Reloc::PIC_) + O << "@TLSGD"; // general dynamic TLS model + else + if (GV->isDeclaration()) + O << "@INDNTPOFF"; // initial exec TLS model + else + O << "@NTPOFF"; // local exec TLS model + } else if (isMemOp) { + if 
(printGOT(TM, Subtarget)) { + if (Subtarget->GVRequiresExtraLoad(GV, TM, false)) + O << "@GOT"; + else + O << "@GOTOFF"; + } else if (Subtarget->isPICStyleRIPRel() && !NotRIPRel) { + if ((GV->isDeclaration() || + GV->hasWeakLinkage() || + GV->hasLinkOnceLinkage()) && + TM.getRelocationModel() != Reloc::Static) + O << "@GOTPCREL"; + + if (needCloseParen) { + needCloseParen = false; + O << ')'; + } + + // Use rip when possible to reduce code size, except when + // index or base register are also part of the address. e.g. + // foo(%rip)(%rcx,%rax,4) is not legal + O << "(%rip)"; + } + } + + if (needCloseParen) + O << ')'; + + return; + } + case MachineOperand::MO_ExternalSymbol: { + bool isCallOp = Modifier && !strcmp(Modifier, "call"); + bool needCloseParen = false; + std::string Name(TAI->getGlobalPrefix()); + Name += MO.getSymbolName(); + if (isCallOp && printStub(TM, Subtarget)) { + FnStubs.insert(Name); + O << TAI->getPrivateGlobalPrefix() << Name << "$stub"; + return; + } + if (!isCallOp) + O << '$'; + else if (Name[0] == '$') { + // The name begins with a dollar-sign. In order to avoid having it look + // like an integer immediate to the assembler, enclose it in parens. + O << '('; + needCloseParen = true; + } + + O << Name; + + if (printGOT(TM, Subtarget)) { + std::string GOTName(TAI->getGlobalPrefix()); + GOTName+="_GLOBAL_OFFSET_TABLE_"; + if (Name == GOTName) + // HACK! Emit extra offset to PC during printing GOT offset to + // compensate for the size of popl instruction. 
The resulting code + // should look like: + // call .piclabel + // piclabel: + // popl %some_register + // addl $_GLOBAL_ADDRESS_TABLE_ + [.-piclabel], %some_register + O << " + [.-" + << computePICLabel(getFunctionNumber(), TAI, Subtarget) << "]"; + + if (isCallOp) + O << "@PLT"; + } + + if (needCloseParen) + O << ')'; + + if (!isCallOp && Subtarget->isPICStyleRIPRel()) + O << "(%rip)"; + + return; + } + default: + O << "<unknown operand type>"; return; + } +} + +void X86ATTAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) { + unsigned char value = MI->getOperand(Op).getImmedValue(); + assert(value <= 7 && "Invalid ssecc argument!"); + switch (value) { + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + } +} + +void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op, + const char *Modifier){ + assert(isMem(MI, Op) && "Invalid memory reference!"); + MachineOperand BaseReg = MI->getOperand(Op); + MachineOperand IndexReg = MI->getOperand(Op+2); + const MachineOperand &DispSpec = MI->getOperand(Op+3); + + bool NotRIPRel = IndexReg.getReg() || BaseReg.getReg(); + if (DispSpec.isGlobalAddress() || + DispSpec.isConstantPoolIndex() || + DispSpec.isJumpTableIndex()) { + printOperand(MI, Op+3, "mem", NotRIPRel); + } else { + int DispVal = DispSpec.getImmedValue(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) + O << DispVal; + } + + if (IndexReg.getReg() || BaseReg.getReg()) { + unsigned ScaleVal = MI->getOperand(Op+1).getImmedValue(); + unsigned BaseRegOperand = 0, IndexRegOperand = 2; + + // There are cases where we can end up with ESP/RSP in the indexreg slot. + // If this happens, swap the base/index register to support assemblers that + // don't work when the index is *SP. 
+ if (IndexReg.getReg() == X86::ESP || IndexReg.getReg() == X86::RSP) { + assert(ScaleVal == 1 && "Scale not supported for stack pointer!"); + std::swap(BaseReg, IndexReg); + std::swap(BaseRegOperand, IndexRegOperand); + } + + O << "("; + if (BaseReg.getReg()) + printOperand(MI, Op+BaseRegOperand, Modifier); + + if (IndexReg.getReg()) { + O << ","; + printOperand(MI, Op+IndexRegOperand, Modifier); + if (ScaleVal != 1) + O << "," << ScaleVal; + } + O << ")"; + } +} + +void X86ATTAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) { + std::string label = computePICLabel(getFunctionNumber(), TAI, Subtarget); + O << label << "\n" << label << ":"; +} + + +bool X86ATTAsmPrinter::printAsmMRegister(const MachineOperand &MO, + const char Mode) { + const MRegisterInfo &RI = *TM.getRegisterInfo(); + unsigned Reg = MO.getReg(); + switch (Mode) { + default: return true; // Unknown mode. + case 'b': // Print QImode register + Reg = getX86SubSuperRegister(Reg, MVT::i8); + break; + case 'h': // Print QImode high register + Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + break; + case 'w': // Print HImode register + Reg = getX86SubSuperRegister(Reg, MVT::i16); + break; + case 'k': // Print SImode register + Reg = getX86SubSuperRegister(Reg, MVT::i32); + break; + } + + O << '%'; + for (const char *Name = RI.get(Reg).Name; *Name; ++Name) + O << (char)tolower(*Name); + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool X86ATTAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'c': // Don't print "$" before a global var name or constant. 
+ printOperand(MI, OpNo, "mem"); + return false; + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + if (MI->getOperand(OpNo).isReg()) + return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]); + printOperand(MI, OpNo); + return false; + + case 'P': // Don't print @PLT, but do print as memory. + printOperand(MI, OpNo, "mem"); + return false; + } + } + + printOperand(MI, OpNo); + return false; +} + +bool X86ATTAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + printMemReference(MI, OpNo); + return false; +} + +/// printMachineInstruction -- Print out a single X86 LLVM instruction +/// MI in AT&T syntax to the current output stream. +/// +void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) { + ++EmittedInsts; + + // See if a truncate instruction can be turned into a nop. + switch (MI->getOpcode()) { + default: break; + case X86::TRUNC_64to32: + case X86::TRUNC_64to16: + case X86::TRUNC_32to16: + case X86::TRUNC_32to8: + case X86::TRUNC_16to8: + case X86::TRUNC_32_to8: + case X86::TRUNC_16_to8: { + const MachineOperand &MO0 = MI->getOperand(0); + const MachineOperand &MO1 = MI->getOperand(1); + unsigned Reg0 = MO0.getReg(); + unsigned Reg1 = MO1.getReg(); + unsigned Opc = MI->getOpcode(); + if (Opc == X86::TRUNC_64to32) + Reg1 = getX86SubSuperRegister(Reg1, MVT::i32); + else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16) + Reg1 = getX86SubSuperRegister(Reg1, MVT::i16); + else + Reg1 = getX86SubSuperRegister(Reg1, MVT::i8); + O << TAI->getCommentString() << " TRUNCATE "; + if (Reg0 != Reg1) + O << "\n\t"; + break; + } + case X86::PsMOVZX64rr32: + O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t"; + break; + } + + // Call the autogenerated instruction printer routines. 
+ printInstruction(MI); +} + +// Include the auto-generated portion of the assembly writer. +#include "X86GenAsmWriter.inc" + diff --git a/lib/Target/X86/X86ATTAsmPrinter.h b/lib/Target/X86/X86ATTAsmPrinter.h new file mode 100755 index 0000000..a3bdce9 --- /dev/null +++ b/lib/Target/X86/X86ATTAsmPrinter.h @@ -0,0 +1,87 @@ +//===-- X86ATTAsmPrinter.h - Convert X86 LLVM code to AT&T assembly -------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AT&T assembly code printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86ATTASMPRINTER_H +#define X86ATTASMPRINTER_H + +#include "X86AsmPrinter.h" +#include "llvm/CodeGen/ValueTypes.h" + +namespace llvm { + +struct X86ATTAsmPrinter : public X86SharedAsmPrinter { + X86ATTAsmPrinter(std::ostream &O, X86TargetMachine &TM, const TargetAsmInfo *T) + : X86SharedAsmPrinter(O, TM, T) { } + + virtual const char *getPassName() const { + return "X86 AT&T-Style Assembly Printer"; + } + + /// printInstruction - This method is automatically generated by tablegen + /// from the instruction set description. This method returns true if the + /// machine instruction was sufficiently described to print it, otherwise it + /// returns false. + bool printInstruction(const MachineInstr *MI); + + // These methods are used by the tablegen'erated instruction printer. 
+ void printOperand(const MachineInstr *MI, unsigned OpNo, + const char *Modifier = 0, bool NotRIPRel = false); + void printi8mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printi16mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printi32mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printi64mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printi128mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printf32mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printf64mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printf128mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo); + } + void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) { + printMemReference(MI, OpNo, "subreg64"); + } + + bool printAsmMRegister(const MachineOperand &MO, const char Mode); + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + + void printMachineInstruction(const MachineInstr *MI); + void printSSECC(const MachineInstr *MI, unsigned Op); + void printMemReference(const MachineInstr *MI, unsigned Op, + const char *Modifier=NULL); + void printPICLabel(const MachineInstr *MI, unsigned Op); + bool runOnMachineFunction(MachineFunction &F); + + /// getSectionForFunction - Return the section that we should emit the + /// specified function body into. 
+ virtual std::string getSectionForFunction(const Function &F) const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp new file mode 100644 index 0000000..59b9b1f --- /dev/null +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -0,0 +1,409 @@ +//===-- X86AsmPrinter.cpp - Convert X86 LLVM IR to X86 assembly -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file the shared super class printer that converts from our internal +// representation of machine-dependent LLVM code to Intel and AT&T format +// assembly language. +// This printer is the output mechanism used by `llc'. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "X86ATTAsmPrinter.h" +#include "X86COFF.h" +#include "X86IntelAsmPrinter.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Type.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Support/Mangler.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static X86MachineFunctionInfo calculateFunctionInfo(const Function *F, + const TargetData *TD) { + X86MachineFunctionInfo Info; + uint64_t Size = 0; + + switch (F->getCallingConv()) { + case CallingConv::X86_StdCall: + Info.setDecorationStyle(StdCall); + break; + case CallingConv::X86_FastCall: + Info.setDecorationStyle(FastCall); + break; + default: + return Info; + } + + for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + AI != AE; 
++AI) + // Size should be aligned to DWORD boundary + Size += ((TD->getTypeSize(AI->getType()) + 3)/4)*4; + + // We're not supporting tooooo huge arguments :) + Info.setBytesToPopOnReturn((unsigned int)Size); + return Info; +} + + +/// decorateName - Query FunctionInfoMap and use this information for various +/// name decoration. +void X86SharedAsmPrinter::decorateName(std::string &Name, + const GlobalValue *GV) { + const Function *F = dyn_cast<Function>(GV); + if (!F) return; + + // We don't want to decorate non-stdcall or non-fastcall functions right now + unsigned CC = F->getCallingConv(); + if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall) + return; + + // Decorate names only when we're targeting Cygwin/Mingw32 targets + if (!Subtarget->isTargetCygMing()) + return; + + FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F); + + const X86MachineFunctionInfo *Info; + if (info_item == FunctionInfoMap.end()) { + // Calculate apropriate function info and populate map + FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData()); + Info = &FunctionInfoMap[F]; + } else { + Info = &info_item->second; + } + + const FunctionType *FT = F->getFunctionType(); + switch (Info->getDecorationStyle()) { + case None: + break; + case StdCall: + // "Pure" variadic functions do not receive @0 suffix. + if (!FT->isVarArg() || (FT->getNumParams() == 0) || + (FT->getNumParams() == 1 && FT->isStructReturn())) + Name += '@' + utostr_32(Info->getBytesToPopOnReturn()); + break; + case FastCall: + // "Pure" variadic functions do not receive @0 suffix. 
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) || + (FT->getNumParams() == 1 && FT->isStructReturn())) + Name += '@' + utostr_32(Info->getBytesToPopOnReturn()); + + if (Name[0] == '_') { + Name[0] = '@'; + } else { + Name = '@' + Name; + } + break; + default: + assert(0 && "Unsupported DecorationStyle"); + } +} + +/// doInitialization +bool X86SharedAsmPrinter::doInitialization(Module &M) { + if (TAI->doesSupportDebugInformation()) { + // Emit initial debug information. + DW.BeginModule(&M); + } + + AsmPrinter::doInitialization(M); + + // Darwin wants symbols to be quoted if they have complex names. + if (Subtarget->isTargetDarwin()) + Mang->setUseQuotes(true); + + return false; +} + +bool X86SharedAsmPrinter::doFinalization(Module &M) { + // Note: this code is not shared by the Intel printer as it is too different + // from how MASM does things. When making changes here don't forget to look + // at X86IntelAsmPrinter::doFinalization(). + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (!I->hasInitializer()) + continue; // External global require no code + + // Check to see if this is a special global used by LLVM, if so, emit it. 
+ if (EmitSpecialLLVMGlobal(I)) { + if (Subtarget->isTargetDarwin() && + TM.getRelocationModel() == Reloc::Static) { + if (I->getName() == "llvm.global_ctors") + O << ".reference .constructors_used\n"; + else if (I->getName() == "llvm.global_dtors") + O << ".reference .destructors_used\n"; + } + continue; + } + + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + const Type *Type = C->getType(); + unsigned Size = TD->getTypeSize(Type); + unsigned Align = TD->getPreferredAlignmentLog(I); + + if (I->hasHiddenVisibility()) { + if (const char *Directive = TAI->getHiddenDirective()) + O << Directive << name << "\n"; + } else if (I->hasProtectedVisibility()) { + if (const char *Directive = TAI->getProtectedDirective()) + O << Directive << name << "\n"; + } + + if (Subtarget->isTargetELF()) + O << "\t.type " << name << ",@object\n"; + + if (C->isNullValue()) { + if (I->hasExternalLinkage()) { + if (const char *Directive = TAI->getZeroFillDirective()) { + O << "\t.globl\t" << name << "\n"; + O << Directive << "__DATA__, __common, " << name << ", " + << Size << ", " << Align << "\n"; + continue; + } + } + + if (!I->hasSection() && !I->isThreadLocal() && + (I->hasInternalLinkage() || I->hasWeakLinkage() || + I->hasLinkOnceLinkage())) { + if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it. 
+ if (!NoZerosInBSS && TAI->getBSSSection()) + SwitchToDataSection(TAI->getBSSSection(), I); + else + SwitchToDataSection(TAI->getDataSection(), I); + if (TAI->getLCOMMDirective() != NULL) { + if (I->hasInternalLinkage()) { + O << TAI->getLCOMMDirective() << name << "," << Size; + if (Subtarget->isTargetDarwin()) + O << "," << Align; + } else + O << TAI->getCOMMDirective() << name << "," << Size; + } else { + if (!Subtarget->isTargetCygMing()) { + if (I->hasInternalLinkage()) + O << "\t.local\t" << name << "\n"; + } + O << TAI->getCOMMDirective() << name << "," << Size; + if (TAI->getCOMMDirectiveTakesAlignment()) + O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align); + } + O << "\t\t" << TAI->getCommentString() << " " << I->getName() << "\n"; + continue; + } + } + + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + if (Subtarget->isTargetDarwin()) { + O << "\t.globl " << name << "\n" + << "\t.weak_definition " << name << "\n"; + SwitchToDataSection(".section __DATA,__const_coal,coalesced", I); + } else if (Subtarget->isTargetCygMing()) { + std::string SectionName(".section\t.data$linkonce." + + name + + ",\"aw\""); + SwitchToDataSection(SectionName.c_str(), I); + O << "\t.globl " << name << "\n" + << "\t.linkonce same_size\n"; + } else { + std::string SectionName("\t.section\t.llvm.linkonce.d." + + name + + ",\"aw\",@progbits"); + SwitchToDataSection(SectionName.c_str(), I); + O << "\t.weak " << name << "\n"; + } + break; + case GlobalValue::AppendingLinkage: + // FIXME: appending linkage variables should go into a section of + // their name or something. For now, just emit them as external. 
+ case GlobalValue::DLLExportLinkage: + DLLExportedGVs.insert(Mang->makeNameProper(I->getName(),"")); + // FALL THROUGH + case GlobalValue::ExternalLinkage: + // If external or appending, declare as a global symbol + O << "\t.globl " << name << "\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: { + if (I->isConstant()) { + const ConstantArray *CVA = dyn_cast<ConstantArray>(C); + if (TAI->getCStringSection() && CVA && CVA->isCString()) { + SwitchToDataSection(TAI->getCStringSection(), I); + break; + } + } + // FIXME: special handling for ".ctors" & ".dtors" sections + if (I->hasSection() && + (I->getSection() == ".ctors" || + I->getSection() == ".dtors")) { + std::string SectionName = ".section " + I->getSection(); + + if (Subtarget->isTargetCygMing()) { + SectionName += ",\"aw\""; + } else { + assert(!Subtarget->isTargetDarwin()); + SectionName += ",\"aw\",@progbits"; + } + + SwitchToDataSection(SectionName.c_str()); + } else { + if (C->isNullValue() && !NoZerosInBSS && TAI->getBSSSection()) + SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSBSSSection() : + TAI->getBSSSection(), I); + else if (!I->isConstant()) + SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSDataSection() : + TAI->getDataSection(), I); + else if (I->isThreadLocal()) + SwitchToDataSection(TAI->getTLSDataSection()); + else { + // Read-only data. 
+ bool HasReloc = C->ContainsRelocations(); + if (HasReloc && + Subtarget->isTargetDarwin() && + TM.getRelocationModel() != Reloc::Static) + SwitchToDataSection("\t.const_data\n"); + else if (!HasReloc && Size == 4 && + TAI->getFourByteConstantSection()) + SwitchToDataSection(TAI->getFourByteConstantSection(), I); + else if (!HasReloc && Size == 8 && + TAI->getEightByteConstantSection()) + SwitchToDataSection(TAI->getEightByteConstantSection(), I); + else if (!HasReloc && Size == 16 && + TAI->getSixteenByteConstantSection()) + SwitchToDataSection(TAI->getSixteenByteConstantSection(), I); + else if (TAI->getReadOnlySection()) + SwitchToDataSection(TAI->getReadOnlySection(), I); + else + SwitchToDataSection(TAI->getDataSection(), I); + } + } + + break; + } + default: + assert(0 && "Unknown linkage type!"); + } + + EmitAlignment(Align, I); + O << name << ":\t\t\t\t" << TAI->getCommentString() << " " << I->getName() + << "\n"; + if (TAI->hasDotTypeDotSizeDirective()) + O << "\t.size " << name << ", " << Size << "\n"; + // If the initializer is a extern weak symbol, remember to emit the weak + // reference! 
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + if (GV->hasExternalWeakLinkage()) + ExtWeakSymbols.insert(GV); + + EmitGlobalConstant(C); + } + + // Output linker support code for dllexported globals + if (DLLExportedGVs.begin() != DLLExportedGVs.end()) { + SwitchToDataSection(".section .drectve"); + } + + for (std::set<std::string>::iterator i = DLLExportedGVs.begin(), + e = DLLExportedGVs.end(); + i != e; ++i) { + O << "\t.ascii \" -export:" << *i << ",data\"\n"; + } + + if (DLLExportedFns.begin() != DLLExportedFns.end()) { + SwitchToDataSection(".section .drectve"); + } + + for (std::set<std::string>::iterator i = DLLExportedFns.begin(), + e = DLLExportedFns.end(); + i != e; ++i) { + O << "\t.ascii \" -export:" << *i << "\"\n"; + } + + if (Subtarget->isTargetDarwin()) { + SwitchToDataSection(""); + + // Output stubs for dynamically-linked functions + unsigned j = 1; + for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end(); + i != e; ++i, ++j) { + SwitchToDataSection(".section __IMPORT,__jump_table,symbol_stubs," + "self_modifying_code+pure_instructions,5", 0); + O << "L" << *i << "$stub:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\thlt ; hlt ; hlt ; hlt ; hlt\n"; + } + + O << "\n"; + + // Output stubs for external and common global variables. + if (GVStubs.begin() != GVStubs.end()) + SwitchToDataSection( + ".section __IMPORT,__pointers,non_lazy_symbol_pointers"); + for (std::set<std::string>::iterator i = GVStubs.begin(), e = GVStubs.end(); + i != e; ++i) { + O << "L" << *i << "$non_lazy_ptr:\n"; + O << "\t.indirect_symbol " << *i << "\n"; + O << "\t.long\t0\n"; + } + + // Emit final debug information. + DW.EndModule(); + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. 
Since LLVM never + // generates code that does this, it is always safe to set. + O << "\t.subsections_via_symbols\n"; + } else if (Subtarget->isTargetCygMing()) { + // Emit type information for external functions + for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end(); + i != e; ++i) { + O << "\t.def\t " << *i + << ";\t.scl\t" << COFF::C_EXT + << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT) + << ";\t.endef\n"; + } + + // Emit final debug information. + DW.EndModule(); + } else if (Subtarget->isTargetELF()) { + // Emit final debug information. + DW.EndModule(); + } + + AsmPrinter::doFinalization(M); + return false; // success +} + +/// createX86CodePrinterPass - Returns a pass that prints the X86 assembly code +/// for a MachineFunction to the given output stream, using the given target +/// machine description. +/// +FunctionPass *llvm::createX86CodePrinterPass(std::ostream &o, + X86TargetMachine &tm) { + const X86Subtarget *Subtarget = &tm.getSubtarget<X86Subtarget>(); + + if (Subtarget->isFlavorIntel()) { + return new X86IntelAsmPrinter(o, tm, tm.getTargetAsmInfo()); + } else { + return new X86ATTAsmPrinter(o, tm, tm.getTargetAsmInfo()); + } +} diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h new file mode 100755 index 0000000..45be89e --- /dev/null +++ b/lib/Target/X86/X86AsmPrinter.h @@ -0,0 +1,97 @@ +//===-- X86AsmPrinter.h - Convert X86 LLVM code to Intel assembly ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file the shared super class printer that converts from our internal +// representation of machine-dependent LLVM code to Intel and AT&T format +// assembly language. This printer is the output mechanism used by `llc'. 
+// +//===----------------------------------------------------------------------===// + +#ifndef X86ASMPRINTER_H +#define X86ASMPRINTER_H + +#include "X86.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DwarfWriter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/Support/Compiler.h" +#include <set> + + +namespace llvm { + +struct VISIBILITY_HIDDEN X86SharedAsmPrinter : public AsmPrinter { + DwarfWriter DW; + + X86SharedAsmPrinter(std::ostream &O, X86TargetMachine &TM, + const TargetAsmInfo *T) + : AsmPrinter(O, TM, T), DW(O, this, T) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + } + + // We have to propagate some information about MachineFunction to + // AsmPrinter. It's ok, when we're printing the function, since we have + // access to MachineFunction and can get the appropriate MachineFunctionInfo. + // Unfortunately, this is not possible when we're printing reference to + // Function (e.g. calling it and so on). Even more, there is no way to get the + // corresponding MachineFunctions: it can even be not created at all. That's + // why we should use additional structure, when we're collecting all necessary + // information. + // + // This structure is using e.g. for name decoration for stdcall & fastcall'ed + // function, since we have to use arguments' size for decoration. 
+ typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap; + FMFInfoMap FunctionInfoMap; + + void decorateName(std::string& Name, const GlobalValue* GV); + + bool doInitialization(Module &M); + bool doFinalization(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + if (Subtarget->isTargetDarwin() || + Subtarget->isTargetELF() || + Subtarget->isTargetCygMing()) { + AU.addRequired<MachineModuleInfo>(); + } + MachineFunctionPass::getAnalysisUsage(AU); + } + + const X86Subtarget *Subtarget; + + // Necessary for Darwin to print out the apprioriate types of linker stubs + std::set<std::string> FnStubs, GVStubs, LinkOnceStubs; + + // Necessary for dllexport support + std::set<std::string> DLLExportedFns, DLLExportedGVs; + + inline static bool isScale(const MachineOperand &MO) { + return MO.isImmediate() && + (MO.getImmedValue() == 1 || MO.getImmedValue() == 2 || + MO.getImmedValue() == 4 || MO.getImmedValue() == 8); + } + + inline static bool isMem(const MachineInstr *MI, unsigned Op) { + if (MI->getOperand(Op).isFrameIndex()) return true; + return Op+4 <= MI->getNumOperands() && + MI->getOperand(Op ).isRegister() && isScale(MI->getOperand(Op+1)) && + MI->getOperand(Op+2).isRegister() && + (MI->getOperand(Op+3).isImmediate() || + MI->getOperand(Op+3).isGlobalAddress() || + MI->getOperand(Op+3).isConstantPoolIndex() || + MI->getOperand(Op+3).isJumpTableIndex()); + } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/X86/X86COFF.h b/lib/Target/X86/X86COFF.h new file mode 100644 index 0000000..75892ef --- /dev/null +++ b/lib/Target/X86/X86COFF.h @@ -0,0 +1,95 @@ +//===--- X86COFF.h - Some definitions from COFF documentations ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Anton Korobeynikov and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file just defines some symbols found in COFF documentation. They are +// used to emit function type information for COFF targets (Cygwin/Mingw32). +// +//===----------------------------------------------------------------------===// + +#ifndef X86COFF_H +#define X86COFF_H + +namespace COFF +{ +/// Storage class tells where and what the symbol represents +enum StorageClass { + C_EFCN = -1, ///< Physical end of function + C_NULL = 0, ///< No symbol + C_AUTO = 1, ///< External definition + C_EXT = 2, ///< External symbol + C_STAT = 3, ///< Static + C_REG = 4, ///< Register variable + C_EXTDEF = 5, ///< External definition + C_LABEL = 6, ///< Label + C_ULABEL = 7, ///< Undefined label + C_MOS = 8, ///< Member of structure + C_ARG = 9, ///< Function argument + C_STRTAG = 10, ///< Structure tag + C_MOU = 11, ///< Member of union + C_UNTAG = 12, ///< Union tag + C_TPDEF = 13, ///< Type definition + C_USTATIC = 14, ///< Undefined static + C_ENTAG = 15, ///< Enumeration tag + C_MOE = 16, ///< Member of enumeration + C_REGPARM = 17, ///< Register parameter + C_FIELD = 18, ///< Bit field + + C_BLOCK = 100, ///< ".bb" or ".eb" - beginning or end of block + C_FCN = 101, ///< ".bf" or ".ef" - beginning or end of function + C_EOS = 102, ///< End of structure + C_FILE = 103, ///< File name + C_LINE = 104, ///< Line number, reformatted as symbol + C_ALIAS = 105, ///< Duplicate tag + C_HIDDEN = 106 ///< External symbol in dmert public lib +}; + +/// The type of the symbol. This is made up of a base type and a derived type. +/// For example, pointer to int is "pointer to T" and "int" +enum SymbolType { + T_NULL = 0, ///< No type info + T_ARG = 1, ///< Void function argument (only used by compiler) + T_VOID = 1, ///< The same as above. Just named differently in some specs. 
+ T_CHAR = 2, ///< Character + T_SHORT = 3, ///< Short integer + T_INT = 4, ///< Integer + T_LONG = 5, ///< Long integer + T_FLOAT = 6, ///< Floating point + T_DOUBLE = 7, ///< Double word + T_STRUCT = 8, ///< Structure + T_UNION = 9, ///< Union + T_ENUM = 10, ///< Enumeration + T_MOE = 11, ///< Member of enumeration + T_UCHAR = 12, ///< Unsigned character + T_USHORT = 13, ///< Unsigned short + T_UINT = 14, ///< Unsigned integer + T_ULONG = 15 ///< Unsigned long +}; + +/// Derived type of symbol +enum SymbolDerivedType { + DT_NON = 0, ///< No derived type + DT_PTR = 1, ///< Pointer to T + DT_FCN = 2, ///< Function returning T + DT_ARY = 3 ///< Array of T +}; + +/// Masks for extracting parts of type +enum SymbolTypeMasks { + N_BTMASK = 017, ///< Mask for base type + N_TMASK = 060 ///< Mask for derived type +}; + +/// Offsets of parts of type +enum Shifts { + N_BTSHFT = 4 ///< Type is formed as (base + derived << N_BTSHIFT) +}; + +} + +#endif // X86COFF_H diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td new file mode 100644 index 0000000..39811bd7 --- /dev/null +++ b/lib/Target/X86/X86CallingConv.td @@ -0,0 +1,172 @@ +//===- X86CallingConv.td - Calling Conventions for X86 32/64 ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the X86-32 and X86-64 +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. 
+class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Conventions +//===----------------------------------------------------------------------===// + +// Return-value conventions common to all X86 CC's. +def RetCC_X86Common : CallingConv<[ + // Scalar values are returned in AX first, then DX. + CCIfType<[i8] , CCAssignToReg<[AL]>>, + CCIfType<[i16], CCAssignToReg<[AX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>, + + // Vector types are returned in XMM0 and XMM1, when they fit. If the target + // doesn't have XMM registers, it won't have vector types. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1]>>, + + // MMX vector types are always returned in MM0. If the target doesn't have + // MM0, it doesn't support these vector types. + CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[MM0]>> +]>; + +// X86-32 C return-value convention. +def RetCC_X86_32_C : CallingConv<[ + // The X86-32 calling convention returns FP values in ST0, otherwise it is the + // same as the common X86 calling conv. + CCIfType<[f32], CCAssignToReg<[ST0]>>, + CCIfType<[f64], CCAssignToReg<[ST0]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 FastCC return-value convention. +def RetCC_X86_32_Fast : CallingConv<[ + // The X86-32 fastcc returns FP values in XMM0 if the target has SSE2, + // otherwise it is the the C calling conventions. + CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>, + CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-64 C return-value convention. +def RetCC_X86_64_C : CallingConv<[ + // The X86-64 calling convention always returns FP values in XMM0. 
+ CCIfType<[f32], CCAssignToReg<[XMM0]>>, + CCIfType<[f64], CCAssignToReg<[XMM0]>>, + CCDelegateTo<RetCC_X86Common> +]>; + + + +// This is the root return-value convention for the X86-32 backend. +def RetCC_X86_32 : CallingConv<[ + // If FastCC, use RetCC_X86_32_Fast. + CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + // Otherwise, use RetCC_X86_32_C. + CCDelegateTo<RetCC_X86_32_C> +]>; + +// This is the root return-value convention for the X86-64 backend. +def RetCC_X86_64 : CallingConv<[ + // Always just the same as C calling conv for X86-64. + CCDelegateTo<RetCC_X86_64_C> +]>; + +// This is the return-value convention used for the entire X86 backend. +def RetCC_X86 : CallingConv<[ + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, + CCDelegateTo<RetCC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// X86-64 Argument Calling Conventions +//===----------------------------------------------------------------------===// + +def CC_X86_64_C : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8, R9 ]>>, + + // The first 6 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, + + // The first 8 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>, + + // The first 8 MMX vector arguments are passed in GPRs. + CCIfType<[v8i8, v4i16, v2i32, v1i64], + CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. 
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // __m64 vectors get 8-byte stack slots that are 8-byte aligned. + CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>> +]>; + + +//===----------------------------------------------------------------------===// +// X86 C Calling Convention +//===----------------------------------------------------------------------===// + +/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP +/// values are spilled on the stack, and the first 4 vector values go in XMM +/// regs. +def CC_X86_32_Common : CallingConv<[ + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Doubles get 8-byte slots that are 4-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 4>>, + + // The first 4 vector arguments are passed in XMM registers. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // Other vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // __m64 vectors get 8-byte stack slots that are 8-byte aligned. They are + // passed in the parameter area. + CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>> +]>; + +def CC_X86_32_C : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The first 3 integer arguments, if marked 'inreg' and if the call is not + // a vararg call, are passed in integer registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + + +def CC_X86_32_FastCall : CallingConv<[ + // Promote i8/i16 arguments to i32. 
+ CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp new file mode 100644 index 0000000..8b22634 --- /dev/null +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -0,0 +1,824 @@ +//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the pass that transforms the X86 machine instructions into +// relocatable machine code. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-emitter" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "X86Relocations.h" +#include "X86.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/MachineCodeEmitter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Function.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +STATISTIC(NumEmitted, "Number of machine instructions emitted"); + +namespace { + class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass { + const X86InstrInfo *II; + const TargetData *TD; + TargetMachine &TM; + MachineCodeEmitter &MCE; + bool Is64BitMode; + public: + static char ID; + explicit Emitter(TargetMachine &tm, MachineCodeEmitter &mce) + : MachineFunctionPass((intptr_t)&ID), II(0), TD(0), TM(tm), + MCE(mce), Is64BitMode(false) {} + 
Emitter(TargetMachine &tm, MachineCodeEmitter &mce, + const X86InstrInfo &ii, const TargetData &td, bool is64) + : MachineFunctionPass((intptr_t)&ID), II(&ii), TD(&td), TM(tm), + MCE(mce), Is64BitMode(is64) {} + + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "X86 Machine Code Emitter"; + } + + void emitInstruction(const MachineInstr &MI); + + private: + void emitPCRelativeBlockAddress(MachineBasicBlock *MBB); + void emitPCRelativeValue(intptr_t Address); + void emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub); + void emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc, + int Disp = 0, unsigned PCAdj = 0); + void emitExternalSymbolAddress(const char *ES, unsigned Reloc); + void emitConstPoolAddress(unsigned CPI, unsigned Reloc, int Disp = 0, + unsigned PCAdj = 0); + void emitJumpTableAddress(unsigned JTI, unsigned Reloc, unsigned PCAdj = 0); + + void emitDisplacementField(const MachineOperand *RelocOp, int DispVal, + unsigned PCAdj = 0); + + void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField); + void emitSIBByte(unsigned SS, unsigned Index, unsigned Base); + void emitConstant(uint64_t Val, unsigned Size); + + void emitMemModRMByte(const MachineInstr &MI, + unsigned Op, unsigned RegOpcodeField, + unsigned PCAdj = 0); + + unsigned getX86RegNum(unsigned RegNo); + bool isX86_64ExtendedReg(const MachineOperand &MO); + unsigned determineREX(const MachineInstr &MI); + }; + char Emitter::ID = 0; +} + +/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code +/// to the specified MCE object. 
+FunctionPass *llvm::createX86CodeEmitterPass(X86TargetMachine &TM, + MachineCodeEmitter &MCE) { + return new Emitter(TM, MCE); +} + +bool Emitter::runOnMachineFunction(MachineFunction &MF) { + assert((MF.getTarget().getRelocationModel() != Reloc::Default || + MF.getTarget().getRelocationModel() != Reloc::Static) && + "JIT relocation model must be set to static or default!"); + II = ((X86TargetMachine&)MF.getTarget()).getInstrInfo(); + TD = ((X86TargetMachine&)MF.getTarget()).getTargetData(); + Is64BitMode = + ((X86TargetMachine&)MF.getTarget()).getSubtarget<X86Subtarget>().is64Bit(); + + do { + MCE.startFunction(MF); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + MBB != E; ++MBB) { + MCE.StartMachineBasicBlock(MBB); + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) + emitInstruction(*I); + } + } while (MCE.finishFunction(MF)); + + return false; +} + +/// emitPCRelativeValue - Emit a PC relative address. +/// +void Emitter::emitPCRelativeValue(intptr_t Address) { + MCE.emitWordLE(Address-MCE.getCurrentPCValue()-4); +} + +/// emitPCRelativeBlockAddress - This method keeps track of the information +/// necessary to resolve the address of this block later and emits a dummy +/// value. +/// +void Emitter::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) { + // Remember where this reference was and where it is to so we can + // deal with it later. + MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), + X86::reloc_pcrel_word, MBB)); + MCE.emitWordLE(0); +} + +/// emitGlobalAddressForCall - Emit the specified address to the code stream +/// assuming this is part of a function call, which is PC relative. 
+/// +void Emitter::emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub) { + MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), + X86::reloc_pcrel_word, GV, 0, + DoesntNeedStub)); + MCE.emitWordLE(0); +} + +/// emitGlobalAddress - Emit the specified address to the code stream assuming +/// this is part of a "take the address of a global" instruction. +/// +void Emitter::emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc, + int Disp /* = 0 */, + unsigned PCAdj /* = 0 */) { + MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, + GV, PCAdj)); + if (Reloc == X86::reloc_absolute_dword) + MCE.emitWordLE(0); + MCE.emitWordLE(Disp); // The relocated value will be added to the displacement +} + +/// emitExternalSymbolAddress - Arrange for the address of an external symbol to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. +void Emitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) { + MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), + Reloc, ES)); + if (Reloc == X86::reloc_absolute_dword) + MCE.emitWordLE(0); + MCE.emitWordLE(0); +} + +/// emitConstPoolAddress - Arrange for the address of an constant pool +/// to be emitted to the current location in the function, and allow it to be PC +/// relative. +void Emitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc, + int Disp /* = 0 */, + unsigned PCAdj /* = 0 */) { + MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), + Reloc, CPI, PCAdj)); + if (Reloc == X86::reloc_absolute_dword) + MCE.emitWordLE(0); + MCE.emitWordLE(Disp); // The relocated value will be added to the displacement +} + +/// emitJumpTableAddress - Arrange for the address of a jump table to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. 
void Emitter::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
                                   unsigned PCAdj /* = 0 */) {
  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
                                                    Reloc, JTI, PCAdj));
  // A 64-bit absolute relocation occupies eight bytes; emit the extra word.
  if (Reloc == X86::reloc_absolute_dword)
    MCE.emitWordLE(0);
  MCE.emitWordLE(0); // The relocated value will be added to the displacement
}

/// N86 namespace - Native X86 Register numbers... used by X86 backend.
///
namespace N86 {
  enum {
    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
  };
}

// getX86RegNum - This function maps LLVM register identifiers to their X86
// specific numbering, which is used in various places encoding instructions.
//
unsigned Emitter::getX86RegNum(unsigned RegNo) {
  switch(RegNo) {
  case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
  case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
  case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
  case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
  // AH/CH/DH/BH share the 4-7 encodings with SPL/BPL/SIL/DIL (which of the
  // two is meant is selected by the presence of a REX prefix).
  case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
    return N86::ESP;
  case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
    return N86::EBP;
  case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
    return N86::ESI;
  case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
    return N86::EDI;

  // The x86-64 extended registers reuse the 3-bit encodings 0-7; the fourth
  // bit is supplied separately via the REX prefix (see determineREX).
  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
    return N86::EAX;
  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
    return N86::ECX;
  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
    return N86::EDX;
  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
    return N86::EBX;
  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
    return N86::ESP;
  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
    return N86::EBP;
  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
    return N86::ESI;
  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
    return N86::EDI;

  // x87 stack registers are numbered consecutively from ST0.
  case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
  case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
    return RegNo-X86::ST0;

  // XMM registers: derive the 0-7 encoding from their consecutive Dwarf
  // register numbers relative to the first register of each bank.
  case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
  case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7:
    return II->getRegisterInfo().getDwarfRegNum(RegNo) -
           II->getRegisterInfo().getDwarfRegNum(X86::XMM0);
  case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
    return II->getRegisterInfo().getDwarfRegNum(RegNo) -
           II->getRegisterInfo().getDwarfRegNum(X86::XMM8);

  default:
    // Reaching here with a physical register is a bug; a virtual register
    // means the allocator failed to rewrite it before emission.
    assert(MRegisterInfo::isVirtualRegister(RegNo) &&
           "Unknown physical register!");
    assert(0 && "Register allocator hasn't allocated reg correctly yet!");
    return 0;
  }
}

// ModRMByte - Pack the Mod (2 bits), Reg/Opcode (3 bits) and R/M (3 bits)
// fields into a single ModR/M byte.
inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
                                      unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
  return RM | (RegOpcode << 3) | (Mod << 6);
}

// emitRegModRMByte - Emit a register-direct ModR/M byte (Mod == 3).
void Emitter::emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeFld){
  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
}

void Emitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base) {
  // SIB byte is in the same format as the ModRMByte...
  MCE.emitByte(ModRMByte(SS, Index, Base));
}

// emitConstant - Emit the low Size bytes of Val.
void Emitter::emitConstant(uint64_t Val, unsigned Size) {
  // Output the constant in little endian byte order...
  for (unsigned i = 0; i != Size; ++i) {
    MCE.emitByte(Val & 255);
    Val >>= 8;
  }
}

/// isDisp8 - Return true if this signed displacement fits in a 8-bit
/// sign-extended field.
static bool isDisp8(int Value) {
  // Round-trip through signed char; equality holds iff the value is in
  // [-128, 127].
  return Value == (signed char)Value;
}

// emitDisplacementField - Emit the 32-bit displacement of a memory operand,
// either as a plain constant or as a relocated placeholder.
void Emitter::emitDisplacementField(const MachineOperand *RelocOp,
                                    int DispVal, unsigned PCAdj) {
  // If this is a simple integer displacement that doesn't require a relocation,
  // emit it now.
  if (!RelocOp) {
    emitConstant(DispVal, 4);
    return;
  }

  // Otherwise, this is something that requires a relocation.  Emit it as such
  // now.
  if (RelocOp->isGlobalAddress()) {
    // In 64-bit static small code model, we could potentially emit absolute.
    // But it's probably not beneficial.
    //  89 05 00 00 00 00     mov %eax,0(%rip)  # PC-relative
    //  89 04 25 00 00 00 00  mov %eax,0x0      # Absolute
    unsigned rt= Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_absolute_word;
    emitGlobalAddressForPtr(RelocOp->getGlobal(), rt,
                            RelocOp->getOffset(), PCAdj);
  } else if (RelocOp->isConstantPoolIndex()) {
    // Must be in 64-bit mode.
    emitConstPoolAddress(RelocOp->getConstantPoolIndex(), X86::reloc_pcrel_word,
                         RelocOp->getOffset(), PCAdj);
  } else if (RelocOp->isJumpTableIndex()) {
    // Must be in 64-bit mode.
    emitJumpTableAddress(RelocOp->getJumpTableIndex(), X86::reloc_pcrel_word,
                         PCAdj);
  } else {
    assert(0 && "Unknown value to relocate!");
  }
}

// emitMemModRMByte - Emit the ModR/M (and, when needed, SIB) byte plus
// displacement for the memory operand starting at operand index Op.  The
// X86 memory operand layout is: base reg, scale imm, index reg, displacement.
void Emitter::emitMemModRMByte(const MachineInstr &MI,
                               unsigned Op, unsigned RegOpcodeField,
                               unsigned PCAdj) {
  const MachineOperand &Op3 = MI.getOperand(Op+3);
  int DispVal = 0;
  const MachineOperand *DispForReloc = 0;

  // Figure out what sort of displacement we have to handle here.
  if (Op3.isGlobalAddress()) {
    DispForReloc = &Op3;
  } else if (Op3.isConstantPoolIndex()) {
    if (Is64BitMode) {
      // 64-bit: leave for a PC-relative relocation.
      DispForReloc = &Op3;
    } else {
      // 32-bit: the pool entry address is already known; fold it in directly.
      DispVal += MCE.getConstantPoolEntryAddress(Op3.getConstantPoolIndex());
      DispVal += Op3.getOffset();
    }
  } else if (Op3.isJumpTableIndex()) {
    if (Is64BitMode) {
      DispForReloc = &Op3;
    } else {
      DispVal += MCE.getJumpTableEntryAddress(Op3.getJumpTableIndex());
    }
  } else {
    DispVal = Op3.getImm();
  }

  const MachineOperand &Base = MI.getOperand(Op);
  const MachineOperand &Scale = MI.getOperand(Op+1);
  const MachineOperand &IndexReg = MI.getOperand(Op+2);

  unsigned BaseReg = Base.getReg();

  // Is a SIB byte needed?  (An ESP-encoded base always needs one because
  // R/M == 4 is the SIB escape.)
  if (IndexReg.getReg() == 0 &&
      (BaseReg == 0 || getX86RegNum(BaseReg) != N86::ESP)) {
    if (BaseReg == 0) {  // Just a displacement?
      // Emit special case [disp32] encoding
      MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));

      emitDisplacementField(DispForReloc, DispVal, PCAdj);
    } else {
      unsigned BaseRegNo = getX86RegNum(BaseReg);
      // (Mod=0, R/M=EBP means [disp32], so [EBP] must use a disp8/disp32 form.)
      if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
        // Emit simple indirect register encoding... [EAX] f.e.
        MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
      } else if (!DispForReloc && isDisp8(DispVal)) {
        // Emit the disp8 encoding... [REG+disp8]
        MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
        emitConstant(DispVal, 1);
      } else {
        // Emit the most general non-SIB encoding: [REG+disp32]
        MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
        emitDisplacementField(DispForReloc, DispVal, PCAdj);
      }
    }

  } else {  // We need a SIB byte, so start by outputting the ModR/M byte first
    assert(IndexReg.getReg() != X86::ESP &&
           IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");

    bool ForceDisp32 = false;
    bool ForceDisp8  = false;
    if (BaseReg == 0) {
      // If there is no base register, we emit the special case SIB byte with
      // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
      MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
      ForceDisp32 = true;
    } else if (DispForReloc) {
      // Emit the normal disp32 encoding.
      MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
      ForceDisp32 = true;
    } else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
      // Emit no displacement ModR/M byte
      MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
    } else if (isDisp8(DispVal)) {
      // Emit the disp8 encoding...
      MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
      ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
    } else {
      // Emit the normal disp32 encoding...
      MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
    }

    // Calculate what the SS field value should be...
    // Maps scale 1/2/4/8 to SS 0/1/2/3; other indices are invalid (~0).
    static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
    unsigned SS = SSTable[Scale.getImm()];

    if (BaseReg == 0) {
      // Handle the SIB byte for the case where there is no base.  The
      // displacement has already been output.
      assert(IndexReg.getReg() && "Index register must be specified!");
      emitSIBByte(SS, getX86RegNum(IndexReg.getReg()), 5);
    } else {
      unsigned BaseRegNo = getX86RegNum(BaseReg);
      unsigned IndexRegNo;
      if (IndexReg.getReg())
        IndexRegNo = getX86RegNum(IndexReg.getReg());
      else
        IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
      emitSIBByte(SS, IndexRegNo, BaseRegNo);
    }

    // Do we need to output a displacement?
    if (ForceDisp8) {
      emitConstant(DispVal, 1);
    } else if (DispVal != 0 || ForceDisp32) {
      emitDisplacementField(DispForReloc, DispVal, PCAdj);
    }
  }
}

// sizeOfImm - Return the size in bytes of this instruction's immediate field,
// as encoded in the TSFlags immediate-size bits.
static unsigned sizeOfImm(const TargetInstrDescriptor *Desc) {
  switch (Desc->TSFlags & X86II::ImmMask) {
  case X86II::Imm8:   return 1;
  case X86II::Imm16:  return 2;
  case X86II::Imm32:  return 4;
  case X86II::Imm64:  return 8;
  default: assert(0 && "Immediate size not set!");
    return 0;
  }
}

/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended register?
/// e.g. r8, xmm8, etc.
+bool Emitter::isX86_64ExtendedReg(const MachineOperand &MO) { + if (!MO.isRegister()) return false; + unsigned RegNo = MO.getReg(); + int DWNum = II->getRegisterInfo().getDwarfRegNum(RegNo); + if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::R8) && + DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::R15)) + return true; + if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::XMM8) && + DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::XMM15)) + return true; + return false; +} + +inline static bool isX86_64TruncToByte(unsigned oc) { + return (oc == X86::TRUNC_64to8 || oc == X86::TRUNC_32to8 || + oc == X86::TRUNC_16to8); +} + + +inline static bool isX86_64NonExtLowByteReg(unsigned reg) { + return (reg == X86::SPL || reg == X86::BPL || + reg == X86::SIL || reg == X86::DIL); +} + +/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64 +/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand +/// size, and 3) use of X86-64 extended registers. +unsigned Emitter::determineREX(const MachineInstr &MI) { + unsigned REX = 0; + const TargetInstrDescriptor *Desc = MI.getInstrDescriptor(); + unsigned Opcode = Desc->Opcode; + + // Pseudo instructions do not need REX prefix byte. + if ((Desc->TSFlags & X86II::FormMask) == X86II::Pseudo) + return 0; + if (Desc->TSFlags & X86II::REX_W) + REX |= 1 << 3; + + unsigned NumOps = Desc->numOperands; + if (NumOps) { + bool isTwoAddr = NumOps > 1 && + Desc->getOperandConstraint(1, TOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + bool isTrunc8 = isX86_64TruncToByte(Opcode); + unsigned i = isTwoAddr ? 1 : 0; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isRegister()) { + unsigned Reg = MO.getReg(); + // Trunc to byte are actually movb. The real source operand is the low + // byte of the register. 
+ if (isTrunc8 && i == 1) + Reg = getX86SubSuperRegister(Reg, MVT::i8); + if (isX86_64NonExtLowByteReg(Reg)) + REX |= 0x40; + } + } + + switch (Desc->TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + if (isX86_64ExtendedReg(MI.getOperand(0))) + REX |= (1 << 0) | (1 << 2); + break; + case X86II::MRMSrcReg: { + if (isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (isX86_64ExtendedReg(MO)) + REX |= 1 << 0; + } + break; + } + case X86II::MRMSrcMem: { + if (isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isRegister()) { + if (isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = isTwoAddr ? 5 : 4; + i = isTwoAddr ? 1 : 0; + if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e))) + REX |= 1 << 2; + unsigned Bit = 0; + for (; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isRegister()) { + if (isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + default: { + if (isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 0; + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (isX86_64ExtendedReg(MO)) + REX |= 1 << 2; + } + break; + } + } + } + return REX; +} + +void Emitter::emitInstruction(const MachineInstr &MI) { + NumEmitted++; // Keep track of the # of mi's emitted + + const TargetInstrDescriptor *Desc = MI.getInstrDescriptor(); + unsigned Opcode = Desc->Opcode; + + // Emit the repeat opcode prefix as needed. 
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3); + + // Emit the operand size opcode prefix as needed. + if (Desc->TSFlags & X86II::OpSize) MCE.emitByte(0x66); + + // Emit the address size opcode prefix as needed. + if (Desc->TSFlags & X86II::AdSize) MCE.emitByte(0x67); + + bool Need0FPrefix = false; + switch (Desc->TSFlags & X86II::Op0Mask) { + case X86II::TB: + Need0FPrefix = true; // Two-byte opcode prefix + break; + case X86II::T8: + MCE.emitByte(0x0F); + MCE.emitByte(0x38); + break; + case X86II::TA: + MCE.emitByte(0x0F); + MCE.emitByte(0x3A); + break; + case X86II::REP: break; // already handled. + case X86II::XS: // F3 0F + MCE.emitByte(0xF3); + Need0FPrefix = true; + break; + case X86II::XD: // F2 0F + MCE.emitByte(0xF2); + Need0FPrefix = true; + break; + case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: + case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: + MCE.emitByte(0xD8+ + (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8) + >> X86II::Op0Shift)); + break; // Two-byte opcode prefix + default: assert(0 && "Invalid prefix!"); + case 0: break; // No prefix! + } + + if (Is64BitMode) { + // REX prefix + unsigned REX = determineREX(MI); + if (REX) + MCE.emitByte(0x40 | REX); + } + + // 0x0F escape code must be emitted just before the opcode. + if (Need0FPrefix) + MCE.emitByte(0x0F); + + // If this is a two-address instruction, skip one of the register operands. 
+ unsigned NumOps = Desc->numOperands; + unsigned CurOp = 0; + if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1) + CurOp++; + + unsigned char BaseOpcode = II->getBaseOpcodeFor(Desc); + switch (Desc->TSFlags & X86II::FormMask) { + default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!"); + case X86II::Pseudo: +#ifndef NDEBUG + switch (Opcode) { + default: + assert(0 && "psuedo instructions should be removed before code emission"); + case TargetInstrInfo::INLINEASM: + assert(0 && "JIT does not support inline asm!\n"); + case TargetInstrInfo::LABEL: + assert(0 && "JIT does not support meta labels!\n"); + case X86::IMPLICIT_USE: + case X86::IMPLICIT_DEF: + case X86::IMPLICIT_DEF_GR8: + case X86::IMPLICIT_DEF_GR16: + case X86::IMPLICIT_DEF_GR32: + case X86::IMPLICIT_DEF_GR64: + case X86::IMPLICIT_DEF_FR32: + case X86::IMPLICIT_DEF_FR64: + case X86::IMPLICIT_DEF_VR64: + case X86::IMPLICIT_DEF_VR128: + case X86::FP_REG_KILL: + break; + } +#endif + CurOp = NumOps; + break; + + case X86II::RawFrm: + MCE.emitByte(BaseOpcode); + if (CurOp != NumOps) { + const MachineOperand &MO = MI.getOperand(CurOp++); + if (MO.isMachineBasicBlock()) { + emitPCRelativeBlockAddress(MO.getMachineBasicBlock()); + } else if (MO.isGlobalAddress()) { + bool NeedStub = Is64BitMode || + Opcode == X86::TAILJMPd || + Opcode == X86::TAILJMPr || Opcode == X86::TAILJMPm; + emitGlobalAddressForCall(MO.getGlobal(), !NeedStub); + } else if (MO.isExternalSymbol()) { + emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word); + } else if (MO.isImmediate()) { + emitConstant(MO.getImm(), sizeOfImm(Desc)); + } else { + assert(0 && "Unknown RawFrm operand!"); + } + } + break; + + case X86II::AddRegFrm: + MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg())); + + if (CurOp != NumOps) { + const MachineOperand &MO1 = MI.getOperand(CurOp++); + unsigned Size = sizeOfImm(Desc); + if (MO1.isImmediate()) + emitConstant(MO1.getImm(), Size); + else { + 
unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_absolute_word; + if (Opcode == X86::MOV64ri) + rt = X86::reloc_absolute_dword; // FIXME: add X86II flag? + if (MO1.isGlobalAddress()) + emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset()); + else if (MO1.isExternalSymbol()) + emitExternalSymbolAddress(MO1.getSymbolName(), rt); + else if (MO1.isConstantPoolIndex()) + emitConstPoolAddress(MO1.getConstantPoolIndex(), rt); + else if (MO1.isJumpTableIndex()) + emitJumpTableAddress(MO1.getJumpTableIndex(), rt); + } + } + break; + + case X86II::MRMDestReg: { + MCE.emitByte(BaseOpcode); + emitRegModRMByte(MI.getOperand(CurOp).getReg(), + getX86RegNum(MI.getOperand(CurOp+1).getReg())); + CurOp += 2; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc)); + break; + } + case X86II::MRMDestMem: { + MCE.emitByte(BaseOpcode); + emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(CurOp+4).getReg())); + CurOp += 5; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc)); + break; + } + + case X86II::MRMSrcReg: + MCE.emitByte(BaseOpcode); + emitRegModRMByte(MI.getOperand(CurOp+1).getReg(), + getX86RegNum(MI.getOperand(CurOp).getReg())); + CurOp += 2; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc)); + break; + + case X86II::MRMSrcMem: { + unsigned PCAdj = (CurOp+5 != NumOps) ? 
sizeOfImm(Desc) : 0; + + MCE.emitByte(BaseOpcode); + emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()), + PCAdj); + CurOp += 5; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc)); + break; + } + + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + MCE.emitByte(BaseOpcode); + emitRegModRMByte(MI.getOperand(CurOp++).getReg(), + (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r); + + if (CurOp != NumOps) { + const MachineOperand &MO1 = MI.getOperand(CurOp++); + unsigned Size = sizeOfImm(Desc); + if (MO1.isImmediate()) + emitConstant(MO1.getImm(), Size); + else { + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word + : X86::reloc_absolute_word; + if (Opcode == X86::MOV64ri32) + rt = X86::reloc_absolute_word; // FIXME: add X86II flag? + if (MO1.isGlobalAddress()) + emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset()); + else if (MO1.isExternalSymbol()) + emitExternalSymbolAddress(MO1.getSymbolName(), rt); + else if (MO1.isConstantPoolIndex()) + emitConstPoolAddress(MO1.getConstantPoolIndex(), rt); + else if (MO1.isJumpTableIndex()) + emitJumpTableAddress(MO1.getJumpTableIndex(), rt); + } + } + break; + + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + unsigned PCAdj = (CurOp+4 != NumOps) ? + (MI.getOperand(CurOp+4).isImmediate() ? sizeOfImm(Desc) : 4) : 0; + + MCE.emitByte(BaseOpcode); + emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m, + PCAdj); + CurOp += 4; + + if (CurOp != NumOps) { + const MachineOperand &MO = MI.getOperand(CurOp++); + unsigned Size = sizeOfImm(Desc); + if (MO.isImmediate()) + emitConstant(MO.getImm(), Size); + else { + unsigned rt = Is64BitMode ? 
X86::reloc_pcrel_word + : X86::reloc_absolute_word; + if (Opcode == X86::MOV64mi32) + rt = X86::reloc_absolute_word; // FIXME: add X86II flag? + if (MO.isGlobalAddress()) + emitGlobalAddressForPtr(MO.getGlobal(), rt, MO.getOffset()); + else if (MO.isExternalSymbol()) + emitExternalSymbolAddress(MO.getSymbolName(), rt); + else if (MO.isConstantPoolIndex()) + emitConstPoolAddress(MO.getConstantPoolIndex(), rt); + else if (MO.isJumpTableIndex()) + emitJumpTableAddress(MO.getJumpTableIndex(), rt); + } + } + break; + } + + case X86II::MRMInitReg: + MCE.emitByte(BaseOpcode); + // Duplicate register, used by things like MOV8r0 (aka xor reg,reg). + emitRegModRMByte(MI.getOperand(CurOp).getReg(), + getX86RegNum(MI.getOperand(CurOp).getReg())); + ++CurOp; + break; + } + + assert((Desc->Flags & M_VARIABLE_OPS) != 0 || + CurOp == NumOps && "Unknown encoding!"); +} diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp new file mode 100644 index 0000000..f8f8d48 --- /dev/null +++ b/lib/Target/X86/X86ELFWriterInfo.cpp @@ -0,0 +1,18 @@ +//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bill Wendling and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the X86 backend. 
+// +//===----------------------------------------------------------------------===// + +#include "X86ELFWriterInfo.h" +using namespace llvm; + +X86ELFWriterInfo::X86ELFWriterInfo() : TargetELFWriterInfo(EM_386) {} +X86ELFWriterInfo::~X86ELFWriterInfo() {} diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h new file mode 100644 index 0000000..eb564fb --- /dev/null +++ b/lib/Target/X86/X86ELFWriterInfo.h @@ -0,0 +1,29 @@ +//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Bill Wendling and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the X86 backend. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_ELF_WRITER_INFO_H +#define X86_ELF_WRITER_INFO_H + +#include "llvm/Target/TargetELFWriterInfo.h" + +namespace llvm { + + class X86ELFWriterInfo : public TargetELFWriterInfo { + public: + X86ELFWriterInfo(); + virtual ~X86ELFWriterInfo(); + }; + +} // end llvm namespace + +#endif // X86_ELF_WRITER_INFO_H diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp new file mode 100644 index 0000000..c293a32 --- /dev/null +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -0,0 +1,882 @@ +//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions.  This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code.  In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basic block
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges.  Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or which
+// branches to a critical basic block.
+//
+// FIXME: this is not implemented yet.  The stackifier pass only works on local
+// basic blocks.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-codegen" +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Compiler.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include <algorithm> +#include <set> +using namespace llvm; + +STATISTIC(NumFXCH, "Number of fxch instructions inserted"); +STATISTIC(NumFP , "Number of floating point instructions"); + +namespace { + struct VISIBILITY_HIDDEN FPS : public MachineFunctionPass { + static char ID; + FPS() : MachineFunctionPass((intptr_t)&ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { return "X86 FP Stackifier"; } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LiveVariables>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + private: + const TargetInstrInfo *TII; // Machine instruction info. + LiveVariables *LV; // Live variable info for current function... + MachineBasicBlock *MBB; // Current basic block + unsigned Stack[8]; // FP<n> Registers in each stack slot... + unsigned RegMap[8]; // Track which stack slot contains each register + unsigned StackTop; // The current top of the FP stack. + + void dumpStack() const { + cerr << "Stack contents:"; + for (unsigned i = 0; i != StackTop; ++i) { + cerr << " FP" << Stack[i]; + assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!"); + } + cerr << "\n"; + } + private: + // getSlot - Return the stack slot number a particular register number is + // in... 
+ unsigned getSlot(unsigned RegNo) const { + assert(RegNo < 8 && "Regno out of range!"); + return RegMap[RegNo]; + } + + // getStackEntry - Return the X86::FP<n> register in register ST(i) + unsigned getStackEntry(unsigned STi) const { + assert(STi < StackTop && "Access past stack top!"); + return Stack[StackTop-1-STi]; + } + + // getSTReg - Return the X86::ST(i) register which contains the specified + // FP<RegNo> register + unsigned getSTReg(unsigned RegNo) const { + return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0; + } + + // pushReg - Push the specified FP<n> register onto the stack + void pushReg(unsigned Reg) { + assert(Reg < 8 && "Register number out of range!"); + assert(StackTop < 8 && "Stack overflow!"); + Stack[StackTop] = Reg; + RegMap[Reg] = StackTop++; + } + + bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; } + void moveToTop(unsigned RegNo, MachineBasicBlock::iterator &I) { + if (!isAtTop(RegNo)) { + unsigned STReg = getSTReg(RegNo); + unsigned RegOnTop = getStackEntry(0); + + // Swap the slots the regs are in + std::swap(RegMap[RegNo], RegMap[RegOnTop]); + + // Swap stack slot contents + assert(RegMap[RegOnTop] < StackTop); + std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]); + + // Emit an fxch to update the runtime processors version of the state + BuildMI(*MBB, I, TII->get(X86::XCH_F)).addReg(STReg); + NumFXCH++; + } + } + + void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) { + unsigned STReg = getSTReg(RegNo); + pushReg(AsReg); // New register on top of stack + + BuildMI(*MBB, I, TII->get(X86::LD_Frr)).addReg(STReg); + } + + // popStackAfter - Pop the current value off of the top of the FP stack + // after the specified instruction. + void popStackAfter(MachineBasicBlock::iterator &I); + + // freeStackSlotAfter - Free the specified register from the register stack, + // so that it is no longer in a register. 
If the register is currently at + // the top of the stack, we just pop the current instruction, otherwise we + // store the current top-of-stack into the specified slot, then pop the top + // of stack. + void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); + + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + + void handleZeroArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFPRW(MachineBasicBlock::iterator &I); + void handleTwoArgFP(MachineBasicBlock::iterator &I); + void handleCompareFP(MachineBasicBlock::iterator &I); + void handleCondMovFP(MachineBasicBlock::iterator &I); + void handleSpecialFP(MachineBasicBlock::iterator &I); + }; + char FPS::ID = 0; +} + +FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } + +/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP +/// register references into FP stack references. +/// +bool FPS::runOnMachineFunction(MachineFunction &MF) { + // We only need to run this pass if there are any FP registers used in this + // function. If it is all integer, there is nothing for us to do! + bool FPIsUsed = false; + + assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!"); + for (unsigned i = 0; i <= 6; ++i) + if (MF.isPhysRegUsed(X86::FP0+i)) { + FPIsUsed = true; + break; + } + + // Early exit. + if (!FPIsUsed) return false; + + TII = MF.getTarget().getInstrInfo(); + LV = &getAnalysis<LiveVariables>(); + StackTop = 0; + + // Process the function in depth first order so that we process at least one + // of the predecessors for every reachable block in the function. 
+ std::set<MachineBasicBlock*> Processed; + MachineBasicBlock *Entry = MF.begin(); + + bool Changed = false; + for (df_ext_iterator<MachineBasicBlock*, std::set<MachineBasicBlock*> > + I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed); + I != E; ++I) + Changed |= processBasicBlock(MF, **I); + + return Changed; +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// transforming FP instructions into their stack form. +/// +bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { + bool Changed = false; + MBB = &BB; + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { + MachineInstr *MI = I; + unsigned Flags = MI->getInstrDescriptor()->TSFlags; + if ((Flags & X86II::FPTypeMask) == X86II::NotFP) + continue; // Efficiently ignore non-fp insts! + + MachineInstr *PrevMI = 0; + if (I != BB.begin()) + PrevMI = prior(I); + + ++NumFP; // Keep track of # of pseudo instrs + DOUT << "\nFPInst:\t" << *MI; + + // Get dead variables list now because the MI pointer may be deleted as part + // of processing! + SmallVector<unsigned, 8> DeadRegs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDead()) + DeadRegs.push_back(MO.getReg()); + } + + switch (Flags & X86II::FPTypeMask) { + case X86II::ZeroArgFP: handleZeroArgFP(I); break; + case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0) + case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0)) + case X86II::TwoArgFP: handleTwoArgFP(I); break; + case X86II::CompareFP: handleCompareFP(I); break; + case X86II::CondMovFP: handleCondMovFP(I); break; + case X86II::SpecialFP: handleSpecialFP(I); break; + default: assert(0 && "Unknown FP Type!"); + } + + // Check to see if any of the values defined by this instruction are dead + // after definition. If so, pop them. 
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { + unsigned Reg = DeadRegs[i]; + if (Reg >= X86::FP0 && Reg <= X86::FP6) { + DOUT << "Register FP#" << Reg-X86::FP0 << " is dead!\n"; + freeStackSlotAfter(I, Reg-X86::FP0); + } + } + + // Print out all of the instructions expanded to if -debug + DEBUG( + MachineBasicBlock::iterator PrevI(PrevMI); + if (I == PrevI) { + cerr << "Just deleted pseudo instruction\n"; + } else { + MachineBasicBlock::iterator Start = I; + // Rewind to first instruction newly inserted. + while (Start != BB.begin() && prior(Start) != PrevI) --Start; + cerr << "Inserted instructions:\n\t"; + Start->print(*cerr.stream(), &MF.getTarget()); + while (++Start != next(I)); + } + dumpStack(); + ); + + Changed = true; + } + + assert(StackTop == 0 && "Stack not empty at end of basic block?"); + return Changed; +} + +//===----------------------------------------------------------------------===// +// Efficient Lookup Table Support +//===----------------------------------------------------------------------===// + +namespace { + struct TableEntry { + unsigned from; + unsigned to; + bool operator<(const TableEntry &TE) const { return from < TE.from; } + friend bool operator<(const TableEntry &TE, unsigned V) { + return TE.from < V; + } + friend bool operator<(unsigned V, const TableEntry &TE) { + return V < TE.from; + } + }; +} + +static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { + for (unsigned i = 0; i != NumEntries-1; ++i) + if (!(Table[i] < Table[i+1])) return false; + return true; +} + +static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table, Table+N, Opcode); + if (I != Table+N && I->from == Opcode) + return I->to; + return -1; +} + +#define ARRAY_SIZE(TABLE) \ + (sizeof(TABLE)/sizeof(TABLE[0])) + +#ifdef NDEBUG +#define ASSERT_SORTED(TABLE) +#else +#define ASSERT_SORTED(TABLE) \ + { static bool TABLE##Checked = false; \ + if (!TABLE##Checked) { \ + 
      assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) &&                   \
             "All lookup tables must be sorted for efficient access!");   \
      TABLE##Checked = true;                                              \
    }                                                                     \
  }
#endif

//===----------------------------------------------------------------------===//
// Register File -> Register Stack Mapping Methods
//===----------------------------------------------------------------------===//

// OpcodeTable - Sorted map of register instructions to their stack version.
// The first element is an register file pseudo instruction, the second is the
// concrete X86 instruction which uses the register stack.  NOTE: must stay
// sorted by the first column (see ASSERT_SORTED / Lookup above).
//
static const TableEntry OpcodeTable[] = {
  { X86::ABS_Fp32     , X86::ABS_F     },
  { X86::ABS_Fp64     , X86::ABS_F     },
  { X86::ADD_Fp32m    , X86::ADD_F32m  },
  { X86::ADD_Fp64m    , X86::ADD_F64m  },
  { X86::ADD_Fp64m32  , X86::ADD_F32m  },
  { X86::ADD_FpI16m32 , X86::ADD_FI16m },
  { X86::ADD_FpI16m64 , X86::ADD_FI16m },
  { X86::ADD_FpI32m32 , X86::ADD_FI32m },
  { X86::ADD_FpI32m64 , X86::ADD_FI32m },
  { X86::CHS_Fp32     , X86::CHS_F     },
  { X86::CHS_Fp64     , X86::CHS_F     },
  { X86::CMOVBE_Fp32  , X86::CMOVBE_F  },
  { X86::CMOVBE_Fp64  , X86::CMOVBE_F  },
  { X86::CMOVB_Fp32   , X86::CMOVB_F   },
  { X86::CMOVB_Fp64   , X86::CMOVB_F   },
  { X86::CMOVE_Fp32   , X86::CMOVE_F   },
  { X86::CMOVE_Fp64   , X86::CMOVE_F   },
  { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
  { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
  { X86::CMOVNB_Fp32  , X86::CMOVNB_F  },
  { X86::CMOVNB_Fp64  , X86::CMOVNB_F  },
  { X86::CMOVNE_Fp32  , X86::CMOVNE_F  },
  { X86::CMOVNE_Fp64  , X86::CMOVNE_F  },
  { X86::CMOVNP_Fp32  , X86::CMOVNP_F  },
  { X86::CMOVNP_Fp64  , X86::CMOVNP_F  },
  { X86::CMOVP_Fp32   , X86::CMOVP_F   },
  { X86::CMOVP_Fp64   , X86::CMOVP_F   },
  { X86::COS_Fp32     , X86::COS_F     },
  { X86::COS_Fp64     , X86::COS_F     },
  { X86::DIVR_Fp32m   , X86::DIVR_F32m },
  { X86::DIVR_Fp64m   , X86::DIVR_F64m },
  { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
  { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
  { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
  { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
  { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
  { X86::DIV_Fp32m    , X86::DIV_F32m  },
  { X86::DIV_Fp64m    , X86::DIV_F64m  },
  { X86::DIV_Fp64m32  , X86::DIV_F32m  },
  { X86::DIV_FpI16m32 , X86::DIV_FI16m },
  { X86::DIV_FpI16m64 , X86::DIV_FI16m },
  { X86::DIV_FpI32m32 , X86::DIV_FI32m },
  { X86::DIV_FpI32m64 , X86::DIV_FI32m },
  { X86::ILD_Fp16m32  , X86::ILD_F16m  },
  { X86::ILD_Fp16m64  , X86::ILD_F16m  },
  { X86::ILD_Fp32m32  , X86::ILD_F32m  },
  { X86::ILD_Fp32m64  , X86::ILD_F32m  },
  { X86::ILD_Fp64m32  , X86::ILD_F64m  },
  { X86::ILD_Fp64m64  , X86::ILD_F64m  },
  { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
  { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
  { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
  { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
  { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
  { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
  { X86::IST_Fp16m32  , X86::IST_F16m  },
  { X86::IST_Fp16m64  , X86::IST_F16m  },
  { X86::IST_Fp32m32  , X86::IST_F32m  },
  { X86::IST_Fp32m64  , X86::IST_F32m  },
  // Note: the 64-bit integer stores map to popping (FP) forms; there is no
  // non-popping fistp64 (handleOneArgFP compensates for this).
  { X86::IST_Fp64m32  , X86::IST_FP64m },
  { X86::IST_Fp64m64  , X86::IST_FP64m },
  { X86::LD_Fp032     , X86::LD_F0     },
  { X86::LD_Fp064     , X86::LD_F0     },
  { X86::LD_Fp132     , X86::LD_F1     },
  { X86::LD_Fp164     , X86::LD_F1     },
  { X86::LD_Fp32m     , X86::LD_F32m   },
  { X86::LD_Fp64m     , X86::LD_F64m   },
  { X86::MUL_Fp32m    , X86::MUL_F32m  },
  { X86::MUL_Fp64m    , X86::MUL_F64m  },
  { X86::MUL_Fp64m32  , X86::MUL_F32m  },
  { X86::MUL_FpI16m32 , X86::MUL_FI16m },
  { X86::MUL_FpI16m64 , X86::MUL_FI16m },
  { X86::MUL_FpI32m32 , X86::MUL_FI32m },
  { X86::MUL_FpI32m64 , X86::MUL_FI32m },
  { X86::SIN_Fp32     , X86::SIN_F     },
  { X86::SIN_Fp64     , X86::SIN_F     },
  { X86::SQRT_Fp32    , X86::SQRT_F    },
  { X86::SQRT_Fp64    , X86::SQRT_F    },
  { X86::ST_Fp32m     , X86::ST_F32m   },
  { X86::ST_Fp64m     , X86::ST_F64m   },
  { X86::ST_Fp64m32   , X86::ST_F32m   },
  { X86::SUBR_Fp32m   , X86::SUBR_F32m },
  { X86::SUBR_Fp64m   , X86::SUBR_F64m },
  { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
  { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
  { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
  { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
  { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
  { X86::SUB_Fp32m    , X86::SUB_F32m  },
  { X86::SUB_Fp64m    , X86::SUB_F64m  },
  { X86::SUB_Fp64m32  , X86::SUB_F32m  },
  { X86::SUB_FpI16m32 , X86::SUB_FI16m },
  { X86::SUB_FpI16m64 , X86::SUB_FI16m },
  { X86::SUB_FpI32m32 , X86::SUB_FI32m },
  { X86::SUB_FpI32m64 , X86::SUB_FI32m },
  { X86::TST_Fp32     , X86::TST_F     },
  { X86::TST_Fp64     , X86::TST_F     },
  { X86::UCOM_FpIr32  , X86::UCOM_FIr  },
  { X86::UCOM_FpIr64  , X86::UCOM_FIr  },
  { X86::UCOM_Fpr32   , X86::UCOM_Fr   },
  { X86::UCOM_Fpr64   , X86::UCOM_Fr   },
};

/// getConcreteOpcode - Map an FP-stack pseudo opcode to its concrete x87
/// opcode via OpcodeTable; asserts if the opcode has no mapping.
static unsigned getConcreteOpcode(unsigned Opcode) {
  ASSERT_SORTED(OpcodeTable);
  int Opc = Lookup(OpcodeTable, ARRAY_SIZE(OpcodeTable), Opcode);
  assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
  return Opc;
}

//===----------------------------------------------------------------------===//
// Helper Methods
//===----------------------------------------------------------------------===//

// PopTable - Sorted map of instructions to their popping version.  The first
// element is an instruction, the second is the version which pops.  Used by
// popStackAfter to fold a pop into the preceding instruction when possible.
//
static const TableEntry PopTable[] = {
  { X86::ADD_FrST0 , X86::ADD_FPrST0  },

  { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
  { X86::DIV_FrST0 , X86::DIV_FPrST0  },

  { X86::IST_F16m  , X86::IST_FP16m   },
  { X86::IST_F32m  , X86::IST_FP32m   },

  { X86::MUL_FrST0 , X86::MUL_FPrST0  },

  { X86::ST_F32m   , X86::ST_FP32m    },
  { X86::ST_F64m   , X86::ST_FP64m    },
  { X86::ST_Frr    , X86::ST_FPrr     },

  { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
  { X86::SUB_FrST0 , X86::SUB_FPrST0  },

  { X86::UCOM_FIr  , X86::UCOM_FIPr   },

  { X86::UCOM_FPr  , X86::UCOM_FPPr   },
  { X86::UCOM_Fr   , X86::UCOM_FPr    },
};

/// popStackAfter - Pop the current value off of the top of the FP stack after
/// the specified instruction.
/// This attempts to be sneaky and combine the pop
/// into the instruction itself if possible.  The iterator is left pointing to
/// the last instruction, be it a new pop instruction inserted, or the old
/// instruction if it was modified in place.
///
void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
  ASSERT_SORTED(PopTable);
  assert(StackTop > 0 && "Cannot pop empty stack!");
  // Update the simulated stack state first: the popped virtual FP register no
  // longer lives in any slot (~0 marks "not on the stack").
  RegMap[Stack[--StackTop]] = ~0;

  // Check to see if there is a popping version of this instruction...
  int Opcode = Lookup(PopTable, ARRAY_SIZE(PopTable), I->getOpcode());
  if (Opcode != -1) {
    I->setInstrDescriptor(TII->get(Opcode));
    // UCOM_FPPr implicitly pops both operands, so drop the explicit one.
    if (Opcode == X86::UCOM_FPPr)
      I->RemoveOperand(0);
  } else {    // Insert an explicit pop
    // fstp ST(0) — stores top-of-stack over itself and pops it.
    I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
  }
}

/// freeStackSlotAfter - Free the specified register from the register stack, so
/// that it is no longer in a register.  If the register is currently at the top
/// of the stack, we just pop the current instruction, otherwise we store the
/// current top-of-stack into the specified slot, then pop the top of stack.
void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
  if (getStackEntry(0) == FPRegNo) {  // already at the top of stack? easy.
    popStackAfter(I);
    return;
  }

  // Otherwise, store the top of stack into the dead slot, killing the operand
  // without having to add in an explicit xchg then pop.
  //
  unsigned STReg    = getSTReg(FPRegNo);
  unsigned OldSlot  = getSlot(FPRegNo);
  unsigned TopReg   = Stack[StackTop-1];
  // Retarget the dead slot to hold what was on top, then shrink the stack.
  Stack[OldSlot]    = TopReg;
  RegMap[TopReg]    = OldSlot;
  RegMap[FPRegNo]   = ~0;     // FPRegNo is gone from the stack.
  Stack[--StackTop] = ~0;
  // fstp ST(i): store top into the freed physical slot and pop.
  I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(STReg);
}


/// getFPReg - Return the virtual x87 register number (0..6) encoded in the
/// given machine operand (operand registers are X86::FP0..X86::FP6).
static unsigned getFPReg(const MachineOperand &MO) {
  assert(MO.isRegister() && "Expected an FP register!");
  unsigned Reg = MO.getReg();
  assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
  return Reg - X86::FP0;
}


//===----------------------------------------------------------------------===//
// Instruction transformation implementation
//===----------------------------------------------------------------------===//

/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
///
void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
  MachineInstr *MI = I;
  unsigned DestReg = getFPReg(MI->getOperand(0));

  // Change from the pseudo instruction to the concrete instruction.
  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));

  // Result gets pushed on the stack.
  pushReg(DestReg);
}

/// handleOneArgFP - fst <mem>, ST(0)
///
void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
  MachineInstr *MI = I;
  unsigned NumOps = MI->getInstrDescriptor()->numOperands;
  // 5 operands: fst* with a full memory reference; 1 operand: ftst.
  assert((NumOps == 5 || NumOps == 1) &&
         "Can only handle fst* & ftst instructions!");

  // Is this the last use of the source register?
  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
  bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);

  // FISTP64m is strange because there isn't a non-popping version.
  // If we have one _and_ we don't want to pop the operand, duplicate the value
  // on the stack instead of moving it.  This ensures that popping the value is
  // always ok.
  // Ditto FISTTP16m, FISTTP32m, FISTTP64m.
  //
  if (!KillsSrc &&
      (MI->getOpcode() == X86::IST_Fp64m32 ||
       MI->getOpcode() == X86::ISTT_Fp16m32 ||
       MI->getOpcode() == X86::ISTT_Fp32m32 ||
       MI->getOpcode() == X86::ISTT_Fp64m32 ||
       MI->getOpcode() == X86::IST_Fp64m64 ||
       MI->getOpcode() == X86::ISTT_Fp16m64 ||
       MI->getOpcode() == X86::ISTT_Fp32m64 ||
       MI->getOpcode() == X86::ISTT_Fp64m64)) {
    // Duplicate into the scratch slot so the mandatory pop below destroys
    // only the copy, not the still-live value.
    duplicateToTop(Reg, 7 /*temp register*/, I);
  } else {
    moveToTop(Reg, I);            // Move to the top of the stack...
  }

  // Convert from the pseudo instruction to the concrete instruction.
  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));

  if (MI->getOpcode() == X86::IST_FP64m ||
      MI->getOpcode() == X86::ISTT_FP16m ||
      MI->getOpcode() == X86::ISTT_FP32m ||
      MI->getOpcode() == X86::ISTT_FP64m) {
    // These concrete forms always pop; mirror that in the simulated stack.
    assert(StackTop > 0 && "Stack empty??");
    --StackTop;
  } else if (KillsSrc) { // Last use of operand?
    popStackAfter(I);
  }
}


/// handleOneArgFPRW: Handle instructions that read from the top of stack and
/// replace the value with a newly computed value.  These instructions may have
/// non-fp operands after their FP operands.
///
/// Examples:
///     R1 = fchs R2
///     R1 = fadd R2, [mem]
///
void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
  MachineInstr *MI = I;
  unsigned NumOps = MI->getInstrDescriptor()->numOperands;
  assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");

  // Is this the last use of the source register?
  unsigned Reg = getFPReg(MI->getOperand(1));
  bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);

  if (KillsSrc) {
    // If this is the last use of the source register, just make sure it's on
    // the top of the stack.
    moveToTop(Reg, I);
    assert(StackTop > 0 && "Stack cannot be empty!");
    // Pop the source and push the destination: the net effect is renaming
    // the top-of-stack slot to the result register.
    --StackTop;
    pushReg(getFPReg(MI->getOperand(0)));
  } else {
    // If this is not the last use of the source register, _copy_ it to the top
    // of the stack.
    duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
  }

  // Change from the pseudo instruction to the concrete instruction.
  MI->RemoveOperand(1);   // Drop the source operand.
  MI->RemoveOperand(0);   // Drop the destination operand.
  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
}


//===----------------------------------------------------------------------===//
// Define tables of various ways to map pseudo instructions
//
// The four tables below cover the four combinations of (which operand is on
// top of the stack) x (which operand is overwritten with the result); all
// four must stay the same length and sorted (see handleTwoArgFP).

// ForwardST0Table - Map: A = B op C  into: ST(0) = ST(0) op ST(i)
static const TableEntry ForwardST0Table[] = {
  { X86::ADD_Fp32  , X86::ADD_FST0r },
  { X86::ADD_Fp64  , X86::ADD_FST0r },
  { X86::DIV_Fp32  , X86::DIV_FST0r },
  { X86::DIV_Fp64  , X86::DIV_FST0r },
  { X86::MUL_Fp32  , X86::MUL_FST0r },
  { X86::MUL_Fp64  , X86::MUL_FST0r },
  { X86::SUB_Fp32  , X86::SUB_FST0r },
  { X86::SUB_Fp64  , X86::SUB_FST0r },
};

// ReverseST0Table - Map: A = B op C  into: ST(0) = ST(i) op ST(0)
static const TableEntry ReverseST0Table[] = {
  { X86::ADD_Fp32  , X86::ADD_FST0r  },   // commutative
  { X86::ADD_Fp64  , X86::ADD_FST0r  },   // commutative
  { X86::DIV_Fp32  , X86::DIVR_FST0r },
  { X86::DIV_Fp64  , X86::DIVR_FST0r },
  { X86::MUL_Fp32  , X86::MUL_FST0r  },   // commutative
  { X86::MUL_Fp64  , X86::MUL_FST0r  },   // commutative
  { X86::SUB_Fp32  , X86::SUBR_FST0r },
  { X86::SUB_Fp64  , X86::SUBR_FST0r },
};

// ForwardSTiTable - Map: A = B op C  into: ST(i) = ST(0) op ST(i)
static const TableEntry ForwardSTiTable[] = {
  { X86::ADD_Fp32  , X86::ADD_FrST0  },   // commutative
  { X86::ADD_Fp64  , X86::ADD_FrST0  },   // commutative
  { X86::DIV_Fp32  , X86::DIVR_FrST0 },
  { X86::DIV_Fp64  , X86::DIVR_FrST0 },
  { X86::MUL_Fp32  , X86::MUL_FrST0  },   // commutative
  { X86::MUL_Fp64  , X86::MUL_FrST0  },   // commutative
  { X86::SUB_Fp32  , X86::SUBR_FrST0 },
  { X86::SUB_Fp64  , X86::SUBR_FrST0 },
};

// ReverseSTiTable - Map: A = B op C  into: ST(i) = ST(i) op ST(0)
static const TableEntry ReverseSTiTable[] = {
  { X86::ADD_Fp32
                   , X86::ADD_FrST0 },
  { X86::ADD_Fp64  , X86::ADD_FrST0 },
  { X86::DIV_Fp32  , X86::DIV_FrST0 },
  { X86::DIV_Fp64  , X86::DIV_FrST0 },
  { X86::MUL_Fp32  , X86::MUL_FrST0 },
  { X86::MUL_Fp64  , X86::MUL_FrST0 },
  { X86::SUB_Fp32  , X86::SUB_FrST0 },
  { X86::SUB_Fp64  , X86::SUB_FrST0 },
};


/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
/// instructions which need to be simplified and possibly transformed.
///
/// Result: ST(0) = fsub  ST(0), ST(i)
///         ST(i) = fsub  ST(0), ST(i)
///         ST(0) = fsubr ST(0), ST(i)
///         ST(i) = fsubr ST(0), ST(i)
///
void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
  MachineInstr *MI = I;

  unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
  assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
  unsigned Dest = getFPReg(MI->getOperand(0));
  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
  bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);

  unsigned TOS = getStackEntry(0);

  // One of our operands must be on the top of the stack.  If neither is yet, we
  // need to move one.
  if (Op0 != TOS && Op1 != TOS) {   // No operand at TOS?
    // We can choose to move either operand to the top of the stack.  If one of
    // the operands is killed by this instruction, we want that one so that we
    // can update right on top of the old version.
    if (KillsOp0) {
      moveToTop(Op0, I);         // Move dead operand to TOS.
      TOS = Op0;
    } else if (KillsOp1) {
      moveToTop(Op1, I);
      TOS = Op1;
    } else {
      // All of the operands are live after this instruction executes, so we
      // cannot update on top of any operand.  Because of this, we must
      // duplicate one of the stack elements to the top.  It doesn't matter
      // which one we pick.
      //
      duplicateToTop(Op0, Dest, I);
      Op0 = TOS = Dest;
      KillsOp0 = true;   // The duplicate (named Dest) is consumed instead.
    }
  } else if (!KillsOp0 && !KillsOp1) {
    // If we DO have one of our operands at the top of the stack, but we don't
    // have a dead operand, we must duplicate one of the operands to a new slot
    // on the stack.
    duplicateToTop(Op0, Dest, I);
    Op0 = TOS = Dest;
    KillsOp0 = true;
  }

  // Now we know that one of our operands is on the top of the stack, and at
  // least one of our operands is killed by this instruction.
  assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
         "Stack conditions not set up right!");

  // We decide which form to use based on what is on the top of the stack, and
  // which operand is killed by this instruction.
  const TableEntry *InstTable;
  bool isForward = TOS == Op0;
  // Write the result over whichever operand survives (the non-killed one stays
  // where it is; the killed one's slot is reused).
  bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
  if (updateST0) {
    if (isForward)
      InstTable = ForwardST0Table;
    else
      InstTable = ReverseST0Table;
  } else {
    if (isForward)
      InstTable = ForwardSTiTable;
    else
      InstTable = ReverseSTiTable;
  }

  // All four tables have identical length (8 entries), so ForwardST0Table's
  // size is valid for whichever table was selected above.
  int Opcode = Lookup(InstTable, ARRAY_SIZE(ForwardST0Table), MI->getOpcode());
  assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");

  // NotTOS - The register which is not on the top of stack...
  unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;

  // Replace the old instruction with a new instruction
  MBB->remove(I++);
  I = BuildMI(*MBB, I, TII->get(Opcode)).addReg(getSTReg(NotTOS));

  // If both operands are killed, pop one off of the stack in addition to
  // overwriting the other one.
  if (KillsOp0 && KillsOp1 && Op0 != Op1) {
    assert(!updateST0 && "Should have updated other operand!");
    popStackAfter(I);   // Pop the top of stack
  }

  // Update stack information so that we know the destination register is now on
  // the stack.
  unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
  assert(UpdatedSlot < StackTop && Dest < 7);
  Stack[UpdatedSlot] = Dest;
  RegMap[Dest]       = UpdatedSlot;
  delete MI;   // Remove the old instruction (it was unlinked by remove above).
}

/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
/// register arguments and no explicit destinations.
///
void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
  MachineInstr *MI = I;

  unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
  bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);

  // Make sure the first operand is on the top of stack, the other one can be
  // anywhere.
  moveToTop(Op0, I);

  // Change from the pseudo instruction to the concrete instruction.
  // The single remaining explicit operand is Op1's physical ST register.
  MI->getOperand(0).setReg(getSTReg(Op1));
  MI->RemoveOperand(1);
  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));

  // If any of the operands are killed by this instruction, free them.
  if (KillsOp0) freeStackSlotAfter(I, Op0);
  if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
}

/// handleCondMovFP - Handle two address conditional move instructions.  These
/// instructions move a st(i) register to st(0) iff a condition is true.  These
/// instructions require that the first operand is at the top of the stack, but
/// otherwise don't modify the stack at all.
void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
  MachineInstr *MI = I;

  unsigned Op0 = getFPReg(MI->getOperand(0));
  unsigned Op1 = getFPReg(MI->getOperand(2));
  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);

  // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I); + + // Change the second operand to the stack register that the operand is in. + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(0); + MI->RemoveOperand(1); + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If we kill the second operand, make sure to pop it from the stack. + if (Op0 != Op1 && KillsOp1) { + // Get this value off of the register stack. + freeStackSlotAfter(I, Op1); + } +} + + +/// handleSpecialFP - Handle special instructions which behave unlike other +/// floating point instructions. This is primarily intended for use by pseudo +/// instructions. +/// +void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + switch (MI->getOpcode()) { + default: assert(0 && "Unknown SpecialFP instruction!"); + case X86::FpGETRESULT32: // Appears immediately after a call returning FP type! + case X86::FpGETRESULT64: // Appears immediately after a call returning FP type! + assert(StackTop == 0 && "Stack should be empty after a call!"); + pushReg(getFPReg(MI->getOperand(0))); + break; + case X86::FpSETRESULT32: + case X86::FpSETRESULT64: + assert(StackTop == 1 && "Stack should have one element on it to return!"); + --StackTop; // "Forget" we have something on the top of stack! + break; + case X86::MOV_Fp3232: + case X86::MOV_Fp3264: + case X86::MOV_Fp6432: + case X86::MOV_Fp6464: { + unsigned SrcReg = getFPReg(MI->getOperand(1)); + unsigned DestReg = getFPReg(MI->getOperand(0)); + + if (LV->KillsRegister(MI, X86::FP0+SrcReg)) { + // If the input operand is killed, we can just change the owner of the + // incoming stack slot into the result. + unsigned Slot = getSlot(SrcReg); + assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!"); + Stack[Slot] = DestReg; + RegMap[DestReg] = Slot; + + } else { + // For FMOV we just duplicate the specified value to a new stack slot. 
+ // This could be made better, but would require substantial changes. + duplicateToTop(SrcReg, DestReg, I); + } + break; + } + } + + I = MBB->erase(I); // Remove the pseudo instruction + --I; +} diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp new file mode 100644 index 0000000..8b1690c --- /dev/null +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -0,0 +1,1342 @@ +//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the Evan Cheng and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a DAG pattern matching instruction selector for X86, +// converting from a legalized dag to a X86 dag. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-isel" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86ISelLowering.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/CFG.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SSARegMap.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/ADT/Statistic.h" +#include <queue> +#include <set> +using namespace llvm; + +STATISTIC(NumFPKill , "Number of FP_REG_KILL instructions added"); +STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); + + 
//===----------------------------------------------------------------------===//
//                      Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
  /// SDOperand's instead of register numbers for the leaves of the matched
  /// tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType;

    struct {            // This is really a union, discriminated by BaseType!
      SDOperand Reg;
      int FrameIndex;
    } Base;

    bool isRIPRel;      // RIP relative?
    unsigned Scale;
    SDOperand IndexReg;
    unsigned Disp;
    // At most one of the following symbolic displacements is set; see
    // getAddressOperands below for the priority order (GV, CP, ES, JT).
    GlobalValue *GV;
    Constant *CP;
    const char *ES;
    int JT;             // -1 means "no jump table".
    unsigned Align;     // CP alignment.

    X86ISelAddressMode()
      : BaseType(RegBase), isRIPRel(false), Scale(1), IndexReg(), Disp(0),
        GV(0), CP(0), ES(0), JT(-1), Align(0) {
    }
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86 specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class VISIBILITY_HIDDEN X86DAGToDAGISel : public SelectionDAGISel {
    /// ContainsFPCode - Every instruction we select that uses or defines a FP
    /// register should set this to true.
    bool ContainsFPCode;

    /// FastISel - Enable fast(er) instruction selection.
    ///
    bool FastISel;

    /// TM - Keep a reference to X86TargetMachine.
    ///
    X86TargetMachine &TM;

    /// X86Lowering - This object fully describes how to lower LLVM code to an
    /// X86-specific SelectionDAG.
    X86TargetLowering X86Lowering;

    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// GlobalBaseReg - keeps track of the virtual register mapped onto global
    /// base register.  0 means "not yet materialized for this function".
    unsigned GlobalBaseReg;

  public:
    X86DAGToDAGISel(X86TargetMachine &tm, bool fast)
      : SelectionDAGISel(X86Lowering),
        ContainsFPCode(false), FastISel(fast), TM(tm),
        X86Lowering(*TM.getTargetLowering()),
        Subtarget(&TM.getSubtarget<X86Subtarget>()) {}

    virtual bool runOnFunction(Function &Fn) {
      // Make sure we re-emit a set of the global base reg if necessary
      GlobalBaseReg = 0;
      return SelectionDAGISel::runOnFunction(Fn);
    }

    virtual const char *getPassName() const {
      return "X86 DAG->DAG Instruction Selection";
    }

    /// InstructionSelectBasicBlock - This callback is invoked by
    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
    virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);

    virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);

    virtual bool CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root);

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    SDNode *Select(SDOperand N);

    bool MatchAddress(SDOperand N, X86ISelAddressMode &AM,
                      bool isRoot = true, unsigned Depth = 0);
    bool SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
                    SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
    bool SelectLEAAddr(SDOperand Op, SDOperand N, SDOperand &Base,
                       SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
    bool SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
                             SDOperand N, SDOperand &Base, SDOperand &Scale,
                             SDOperand &Index, SDOperand &Disp,
                             SDOperand &InChain, SDOperand &OutChain);
    bool TryFoldLoad(SDOperand P, SDOperand N,
                     SDOperand &Base, SDOperand &Scale,
                     SDOperand &Index, SDOperand &Disp);
    void InstructionSelectPreprocess(SelectionDAG &DAG);

    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
    /// inline asm expressions.
    virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op,
                                              char ConstraintCode,
                                              std::vector<SDOperand> &OutOps,
                                              SelectionDAG &DAG);

    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);

    /// getAddressOperands - Decompose a matched address mode into the four
    /// operands (Base, Scale, Index, Disp) the instruction patterns expect.
    inline void getAddressOperands(X86ISelAddressMode &AM, SDOperand &Base,
                                   SDOperand &Scale, SDOperand &Index,
                                   SDOperand &Disp) {
      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
        CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
        AM.Base.Reg;
      Scale = getI8Imm(AM.Scale);
      Index = AM.IndexReg;
      // These are 32-bit even in 64-bit mode since RIP relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp);
      else if (AM.ES)
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32);
      else if (AM.JT != -1)
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32);
      else
        Disp = getI32Imm(AM.Disp);
    }

    /// getI8Imm - Return a target constant with the specified value, of type
    /// i8.
    inline SDOperand getI8Imm(unsigned Imm) {
      return CurDAG->getTargetConstant(Imm, MVT::i8);
    }

    /// getI16Imm - Return a target constant with the specified value, of type
    /// i16.
    inline SDOperand getI16Imm(unsigned Imm) {
      return CurDAG->getTargetConstant(Imm, MVT::i16);
    }

    /// getI32Imm - Return a target constant with the specified value, of type
    /// i32.
    inline SDOperand getI32Imm(unsigned Imm) {
      return CurDAG->getTargetConstant(Imm, MVT::i32);
    }

    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
    /// base register.  Return the virtual register that holds this value.
    SDNode *getGlobalBaseReg();

#ifndef NDEBUG
    unsigned Indent;   // Nesting depth for -debug selection tracing.
#endif
  };
}

/// findFlagUse - Return the user of N's flag result (its last value), or NULL
/// if no node uses it.
static SDNode *findFlagUse(SDNode *N) {
  unsigned FlagResNo = N->getNumValues()-1;
  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
    SDNode *User = *I;
    for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
      SDOperand Op = User->getOperand(i);
      if (Op.Val == N && Op.ResNo == FlagResNo)
        return User;
    }
  }
  return NULL;
}

/// findNonImmUse - Recursive worker for isNonImmUse: walk Use's operands
/// looking for a path to Def that is not the direct ImmedUse/Root edge.
/// The NodeId comparison prunes nodes topologically later than Def; Visited
/// prevents exponential re-exploration.
static void findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
                          SDNode *Root, SDNode *Skip, bool &found,
                          std::set<SDNode *> &Visited) {
  if (found ||
      Use->getNodeId() > Def->getNodeId() ||
      !Visited.insert(Use).second)
    return;

  for (unsigned i = 0, e = Use->getNumOperands(); !found && i != e; ++i) {
    SDNode *N = Use->getOperand(i).Val;
    if (N == Skip)
      continue;
    if (N == Def) {
      if (Use == ImmedUse)
        continue;   // Immediate use is ok.
      if (Use == Root) {
        // Only the store/cmp roots may reach Def directly via their chain.
        assert(Use->getOpcode() == ISD::STORE ||
               Use->getOpcode() == X86ISD::CMP);
        continue;
      }
      found = true;
      break;
    }
    findNonImmUse(N, Def, ImmedUse, Root, Skip, found, Visited);
  }
}

/// isNonImmUse - Start searching from Root up the DAG to check if Def can
/// be reached. Return true if that's the case. However, ignore direct uses
/// by ImmedUse (which would be U in the example illustrated in
/// CanBeFoldedBy) and by Root (which can happen in the store case).
/// FIXME: to be really generic, we should allow direct use by any node
/// that is being folded. But realistically since we only fold loads which
/// have one non-chain use, we only need to watch out for load/op/store
/// and load/op/cmp case where the root (store / cmp) may reach the load via
/// its chain operand.
+static inline bool isNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse, + SDNode *Skip = NULL) { + std::set<SDNode *> Visited; + bool found = false; + findNonImmUse(Root, Def, ImmedUse, Root, Skip, found, Visited); + return found; +} + + +bool X86DAGToDAGISel::CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root) { + if (FastISel) return false; + + // If U use can somehow reach N through another path then U can't fold N or + // it will create a cycle. e.g. In the following diagram, U can reach N + // through X. If N is folded into into U, then X is both a predecessor and + // a successor of U. + // + // [ N ] + // ^ ^ + // | | + // / \--- + // / [X] + // | ^ + // [U]--------| + + if (isNonImmUse(Root, N, U)) + return false; + + // If U produces a flag, then it gets (even more) interesting. Since it + // would have been "glued" together with its flag use, we need to check if + // it might reach N: + // + // [ N ] + // ^ ^ + // | | + // [U] \-- + // ^ [TF] + // | ^ + // | | + // \ / + // [FU] + // + // If FU (flag use) indirectly reach N (the load), and U fold N (call it + // NU), then TF is a predecessor of FU and a successor of NU. But since + // NU and FU are flagged together, this effectively creates a cycle. + bool HasFlagUse = false; + MVT::ValueType VT = Root->getValueType(Root->getNumValues()-1); + while ((VT == MVT::Flag && !Root->use_empty())) { + SDNode *FU = findFlagUse(Root); + if (FU == NULL) + break; + else { + Root = FU; + HasFlagUse = true; + } + VT = Root->getValueType(Root->getNumValues()-1); + } + + if (HasFlagUse) + return !isNonImmUse(Root, N, Root, U); + return true; +} + +/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand +/// and move load below the TokenFactor. Replace store's chain operand with +/// load's chain result. 
/// MoveBelowTokenFactor - Rewire the DAG so that Load sits below TF instead
/// of above it: TF's operand that was Load becomes Load's incoming chain,
/// Load's chain operand becomes TF, and Store's chain operand becomes Load's
/// chain result. Nodes are updated in place; no new nodes are created.
static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
                                 SDOperand Store, SDOperand TF) {
  std::vector<SDOperand> Ops;
  for (unsigned i = 0, e = TF.Val->getNumOperands(); i != e; ++i)
    if (Load.Val == TF.Val->getOperand(i).Val)
      Ops.push_back(Load.Val->getOperand(0));   // Load's old chain operand.
    else
      Ops.push_back(TF.Val->getOperand(i));
  DAG.UpdateNodeOperands(TF, &Ops[0], Ops.size());
  DAG.UpdateNodeOperands(Load, TF, Load.getOperand(1), Load.getOperand(2));
  DAG.UpdateNodeOperands(Store, Load.getValue(1), Store.getOperand(1),
                         Store.getOperand(2), Store.getOperand(3));
}

/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction
/// selector to pick more load-modify-store instructions. This is a common
/// case:
///
///     [Load chain]
///         ^
///         |
///       [Load]
///       ^    ^
///       |    |
///      /      \-
///     /         |
/// [TokenFactor] [Op]
///     ^          ^
///     |          |
///      \        /
///       \      /
///       [Store]
///
/// The fact the store's chain operand != load's chain will prevent the
/// (store (op (load))) instruction from being selected. We can transform it
/// to:
///
///     [Load chain]
///         ^
///         |
///    [TokenFactor]
///         ^
///         |
///       [Load]
///       ^    ^
///       |    |
///       |     \-
///       |       |
///       |     [Op]
///       |       ^
///       |       |
///        \     /
///         \   /
///        [Store]
void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
         E = DAG.allnodes_end(); I != E; ++I) {
    // Only plain (non-truncating) stores whose chain is a TokenFactor are
    // candidates for the transformation.
    if (!ISD::isNON_TRUNCStore(I))
      continue;
    SDOperand Chain = I->getOperand(0);
    if (Chain.Val->getOpcode() != ISD::TokenFactor)
      continue;

    SDOperand N1 = I->getOperand(1);   // Stored value (the Op).
    SDOperand N2 = I->getOperand(2);   // Store address.
    if (MVT::isFloatingPoint(N1.getValueType()) ||
        MVT::isVector(N1.getValueType()) ||
        !N1.hasOneUse())
      continue;

    bool RModW = false;
    SDOperand Load;
    unsigned Opcode = N1.Val->getOpcode();
    switch (Opcode) {
      // Commutative operators: the load may be either operand.
      case ISD::ADD:
      case ISD::MUL:
      case ISD::AND:
      case ISD::OR:
      case ISD::XOR:
      case ISD::ADDC:
      case ISD::ADDE: {
        SDOperand N10 = N1.getOperand(0);
        SDOperand N11 = N1.getOperand(1);
        if (ISD::isNON_EXTLoad(N10.Val))
          RModW = true;
        else if (ISD::isNON_EXTLoad(N11.Val)) {
          RModW = true;
          std::swap(N10, N11);
        }
        // The load must feed the TokenFactor, have a single use, load from
        // the same address the store writes to, and match the value type.
        RModW = RModW && N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
          (N10.getOperand(1) == N2) &&
          (N10.Val->getValueType(0) == N1.getValueType());
        if (RModW)
          Load = N10;
        break;
      }
      // Non-commutative operators: only the LHS may be the load.
      case ISD::SUB:
      case ISD::SHL:
      case ISD::SRA:
      case ISD::SRL:
      case ISD::ROTL:
      case ISD::ROTR:
      case ISD::SUBC:
      case ISD::SUBE:
      case X86ISD::SHLD:
      case X86ISD::SHRD: {
        SDOperand N10 = N1.getOperand(0);
        if (ISD::isNON_EXTLoad(N10.Val))
          RModW = N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
            (N10.getOperand(1) == N2) &&
            (N10.Val->getValueType(0) == N1.getValueType());
        if (RModW)
          Load = N10;
        break;
      }
    }

    if (RModW) {
      MoveBelowTokenFactor(DAG, Load, SDOperand(I, 0), Chain);
      ++NumLoadMoved;
    }
  }
}

/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
/// when it has created a
SelectionDAG for us to codegen. +void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) { + DEBUG(BB->dump()); + MachineFunction::iterator FirstMBB = BB; + + if (!FastISel) + InstructionSelectPreprocess(DAG); + + // Codegen the basic block. +#ifndef NDEBUG + DOUT << "===== Instruction selection begins:\n"; + Indent = 0; +#endif + DAG.setRoot(SelectRoot(DAG.getRoot())); +#ifndef NDEBUG + DOUT << "===== Instruction selection ends:\n"; +#endif + + DAG.RemoveDeadNodes(); + + // Emit machine code to BB. + ScheduleAndEmitDAG(DAG); + + // If we are emitting FP stack code, scan the basic block to determine if this + // block defines any FP values. If so, put an FP_REG_KILL instruction before + // the terminator of the block. + if (!Subtarget->hasSSE2()) { + // Note that FP stack instructions *are* used in SSE code when returning + // values, but these are not live out of the basic block, so we don't need + // an FP_REG_KILL in this case either. + bool ContainsFPCode = false; + + // Scan all of the machine instructions in these MBBs, checking for FP + // stores. + MachineFunction::iterator MBBI = FirstMBB; + do { + for (MachineBasicBlock::iterator I = MBBI->begin(), E = MBBI->end(); + !ContainsFPCode && I != E; ++I) { + if (I->getNumOperands() != 0 && I->getOperand(0).isRegister()) { + const TargetRegisterClass *clas; + for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) { + if (I->getOperand(op).isRegister() && I->getOperand(op).isDef() && + MRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) && + ((clas = RegMap->getRegClass(I->getOperand(0).getReg())) == + X86::RFP32RegisterClass || + clas == X86::RFP64RegisterClass)) { + ContainsFPCode = true; + break; + } + } + } + } + } while (!ContainsFPCode && &*(MBBI++) != BB); + + // Check PHI nodes in successor blocks. These PHI's will be lowered to have + // a copy of the input value in this block. 
+ if (!ContainsFPCode) { + // Final check, check LLVM BB's that are successors to the LLVM BB + // corresponding to BB for FP PHI nodes. + const BasicBlock *LLVMBB = BB->getBasicBlock(); + const PHINode *PN; + for (succ_const_iterator SI = succ_begin(LLVMBB), E = succ_end(LLVMBB); + !ContainsFPCode && SI != E; ++SI) { + for (BasicBlock::const_iterator II = SI->begin(); + (PN = dyn_cast<PHINode>(II)); ++II) { + if (PN->getType()->isFloatingPoint()) { + ContainsFPCode = true; + break; + } + } + } + } + + // Finally, if we found any FP code, emit the FP_REG_KILL instruction. + if (ContainsFPCode) { + BuildMI(*BB, BB->getFirstTerminator(), + TM.getInstrInfo()->get(X86::FP_REG_KILL)); + ++NumFPKill; + } + } +} + +/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in +/// the main function. +void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, + MachineFrameInfo *MFI) { + const TargetInstrInfo *TII = TM.getInstrInfo(); + if (Subtarget->isTargetCygMing()) + BuildMI(BB, TII->get(X86::CALLpcrel32)).addExternalSymbol("__main"); + + // Switch the FPU to 64-bit precision mode for better compatibility and speed. + int CWFrameIdx = MFI->CreateStackObject(2, 2); + addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx); + + // Set the high part to be 64-bit precision. + addFrameReference(BuildMI(BB, TII->get(X86::MOV8mi)), + CWFrameIdx, 1).addImm(2); + + // Reload the modified control word now. + addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx); +} + +void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) { + // If this is main, emit special code for main. + MachineBasicBlock *BB = MF.begin(); + if (Fn.hasExternalLinkage() && Fn.getName() == "main") + EmitSpecialCodeForMain(BB, MF.getFrameInfo()); +} + +/// MatchAddress - Add the specified node to the specified addressing mode, +/// returning true if it cannot be done. 
/// This just pattern matches for the
/// addressing mode.
bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
                                   bool isRoot, unsigned Depth) {
  // Limit recursion to keep compile time bounded.
  if (Depth > 5) {
    // Default, generate it as a register.
    AM.BaseType = X86ISelAddressMode::RegBase;
    AM.Base.Reg = N;
    return false;
  }

  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRel) {
    // NOTE(review): the AM.JT != -1 guard looks suspicious -- one would
    // expect constant-displacement folding when *no* jump table has been
    // folded yet (AM.JT == -1); confirm against upstream.
    if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
      int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
      if (isInt32(AM.Disp + Val)) {
        AM.Disp += Val;
        return false;
      }
    }
    return true;
  }

  // Nodes already selected are "available" in a register and generally
  // should not be re-decomposed into addressing-mode pieces.
  int id = N.Val->getNodeId();
  bool Available = isSelected(id);

  switch (N.getOpcode()) {
  default: break;
  case ISD::Constant: {
    int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
    if (isInt32(AM.Disp + Val)) {
      AM.Disp += Val;
      return false;
    }
    break;
  }

  case X86ISD::Wrapper: {
    bool is64Bit = Subtarget->is64Bit();
    // Under X86-64 non-small code model, GV (and friends) are 64-bits.
    if (is64Bit && TM.getCodeModel() != CodeModel::Small)
      break;
    // Only one symbolic displacement fits in an addressing mode.
    if (AM.GV != 0 || AM.CP != 0 || AM.ES != 0 || AM.JT != -1)
      break;
    // If value is available in a register both base and index components have
    // been picked, we can't fit the result available in the register in the
    // addressing mode. Duplicate GlobalAddress or ConstantPool as displacement.
    if (!Available || (AM.Base.Reg.Val && AM.IndexReg.Val)) {
      bool isStatic = TM.getRelocationModel() == Reloc::Static;
      SDOperand N0 = N.getOperand(0);
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
        GlobalValue *GV = G->getGlobal();
        bool isAbs32 = !is64Bit || isStatic;
        if (isAbs32 || isRoot) {
          AM.GV = GV;
          AM.Disp += G->getOffset();
          AM.isRIPRel = !isAbs32;
          return false;
        }
      } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
        if (!is64Bit || isStatic || isRoot) {
          AM.CP = CP->getConstVal();
          AM.Align = CP->getAlignment();
          AM.Disp += CP->getOffset();
          AM.isRIPRel = !isStatic;
          return false;
        }
      } else if (ExternalSymbolSDNode *S =dyn_cast<ExternalSymbolSDNode>(N0)) {
        if (isStatic || isRoot) {
          AM.ES = S->getSymbol();
          AM.isRIPRel = !isStatic;
          return false;
        }
      } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
        if (isStatic || isRoot) {
          AM.JT = J->getIndex();
          AM.isRIPRel = !isStatic;
          return false;
        }
      }
    }
    break;
  }

  case ISD::FrameIndex:
    if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base.Reg.Val == 0) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    // shl by 1/2/3 becomes the scale (2/4/8) of the index register.
    if (!Available && AM.IndexReg.Val == 0 && AM.Scale == 1)
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1))) {
        unsigned Val = CN->getValue();
        if (Val == 1 || Val == 2 || Val == 3) {
          AM.Scale = 1 << Val;
          SDOperand ShVal = N.Val->getOperand(0);

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (ShVal.Val->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
              isa<ConstantSDNode>(ShVal.Val->getOperand(1))) {
            AM.IndexReg = ShVal.Val->getOperand(0);
            ConstantSDNode *AddVal =
              cast<ConstantSDNode>(ShVal.Val->getOperand(1));
            // The addend is scaled too: (x + c) << s == (x << s) + (c << s).
            uint64_t Disp = AM.Disp + (AddVal->getValue() << Val);
            if (isInt32(Disp))
              AM.Disp = Disp;
            else
              AM.IndexReg = ShVal;
          } else {
            AM.IndexReg = ShVal;
          }
          return false;
        }
      }
    break;

  case ISD::MUL:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (!Available &&
        AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base.Reg.Val == 0 &&
        AM.IndexReg.Val == 0) {
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1)))
        if (CN->getValue() == 3 || CN->getValue() == 5 || CN->getValue() == 9) {
          AM.Scale = unsigned(CN->getValue())-1;

          SDOperand MulVal = N.Val->getOperand(0);
          SDOperand Reg;

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.Val->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.Val->getOperand(1))) {
            Reg = MulVal.Val->getOperand(0);
            ConstantSDNode *AddVal =
              cast<ConstantSDNode>(MulVal.Val->getOperand(1));
            uint64_t Disp = AM.Disp + AddVal->getValue() * CN->getValue();
            if (isInt32(Disp))
              AM.Disp = Disp;
            else
              Reg = N.Val->getOperand(0);
          } else {
            Reg = N.Val->getOperand(0);
          }

          // Same register serves as base and index: X + X*(scale).
          AM.IndexReg = AM.Base.Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::ADD:
    if (!Available) {
      // Try matching the operands in both orders, restoring AM on failure.
      X86ISelAddressMode Backup = AM;
      if (!MatchAddress(N.Val->getOperand(0), AM, false, Depth+1) &&
          !MatchAddress(N.Val->getOperand(1), AM, false, Depth+1))
        return false;
      AM = Backup;
      if (!MatchAddress(N.Val->getOperand(1), AM, false, Depth+1) &&
          !MatchAddress(N.Val->getOperand(0), AM, false, Depth+1))
        return false;
      AM = Backup;
    }
    break;

  case ISD::OR:
    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
    if (!Available) {
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
        X86ISelAddressMode Backup = AM;
        // Start with the LHS as an addr mode.
        if (!MatchAddress(N.getOperand(0), AM, false) &&
            // Address could not have picked a GV address for the displacement.
            AM.GV == NULL &&
            // On x86-64, the resultant disp must fit in 32-bits.
            isInt32(AM.Disp + CN->getSignExtended()) &&
            // Check to see if the LHS & C is zero.
            CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getValue())) {
          AM.Disp += CN->getValue();
          return false;
        }
        AM = Backup;
      }
    }
    break;
  }

  // Is the base register already occupied?
  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.Val) {
    // If so, check to see if the scale index register is set.
    if (AM.IndexReg.Val == 0) {
      AM.IndexReg = N;
      AM.Scale = 1;
      return false;
    }

    // Otherwise, we cannot select it.
    return true;
  }

  // Default, generate it as a register.
  AM.BaseType = X86ISelAddressMode::RegBase;
  AM.Base.Reg = N;
  return false;
}

/// SelectAddr - returns true if it is able to pattern match an addressing
/// mode. It returns the operands which make up the maximal addressing mode it
/// can match by reference.
bool X86DAGToDAGISel::SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
                                 SDOperand &Scale, SDOperand &Index,
                                 SDOperand &Disp) {
  X86ISelAddressMode AM;
  // MatchAddress returns true on failure.
  if (MatchAddress(N, AM))
    return false;

  MVT::ValueType VT = N.getValueType();
  // Register 0 encodes "no base/index register".
  if (AM.BaseType == X86ISelAddressMode::RegBase) {
    if (!AM.Base.Reg.Val)
      AM.Base.Reg = CurDAG->getRegister(0, VT);
  }

  if (!AM.IndexReg.Val)
    AM.IndexReg = CurDAG->getRegister(0, VT);

  getAddressOperands(AM, Base, Scale, Index, Disp);
  return true;
}

/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
static inline bool isZeroNode(SDOperand Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->getValue() == 0) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
}


/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
/// match a load whose top elements are either undef or zeros.  The load flavor
/// is derived from the type of N, which is either v4f32 or v2f64.
bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
                                          SDOperand N, SDOperand &Base,
                                          SDOperand &Scale, SDOperand &Index,
                                          SDOperand &Disp, SDOperand &InChain,
                                          SDOperand &OutChain) {
  // Case 1: (scalar_to_vector (load ...)) -- fold the load directly if it is
  // only used here and folding cannot create a cycle.
  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    InChain = N.getOperand(0).getValue(1);
    if (ISD::isNON_EXTLoad(InChain.Val) &&
        InChain.getValue(0).hasOneUse() &&
        N.hasOneUse() &&
        CanBeFoldedBy(N.Val, Pred.Val, Op.Val)) {
      LoadSDNode *LD = cast<LoadSDNode>(InChain);
      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
        return false;
      OutChain = LD->getChain();
      return true;
    }
  }

  // Also handle the case where we explicitly require zeros in the top
  // elements.  This is a vector shuffle from the zero vector.
  if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
      N.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
      N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      N.getOperand(1).Val->hasOneUse() &&
      ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
      N.getOperand(1).getOperand(0).hasOneUse()) {
    // Check to see if the BUILD_VECTOR is building a zero vector.
    SDOperand BV = N.getOperand(0);
    for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i)
      if (!isZeroNode(BV.getOperand(i)) &&
          BV.getOperand(i).getOpcode() != ISD::UNDEF)
        return false;  // Not a zero/undef vector.
    // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
    // from the LHS.
    unsigned VecWidth = BV.getNumOperands();
    SDOperand ShufMask = N.getOperand(2);
    assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
      if (C->getValue() == VecWidth) {
        for (unsigned i = 1; i != VecWidth; ++i) {
          if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
            // ok.
          } else {
            ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
            if (C->getValue() >= VecWidth) return false;
          }
        }
      }

      // Okay, this is a zero extending load.  Fold it.
      // NOTE(review): this fold is reached even when the first mask element
      // is not VecWidth (only the inner element checks are guarded above);
      // confirm that is intended.
      LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
        return false;
      OutChain = LD->getChain();
      InChain = SDOperand(LD, 1);
      return true;
    }
  }
  return false;
}


/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::SelectLEAAddr(SDOperand Op, SDOperand N,
                                    SDOperand &Base, SDOperand &Scale,
                                    SDOperand &Index, SDOperand &Disp) {
  X86ISelAddressMode AM;
  // MatchAddress returns true on failure.
  if (MatchAddress(N, AM))
    return false;

  // Heuristic cost: an LEA is only worthwhile when the matched mode combines
  // enough components (Complexity > 2 below).
  MVT::ValueType VT = N.getValueType();
  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase)
    if (AM.Base.Reg.Val)
      Complexity = 1;
    else
      AM.Base.Reg = CurDAG->getRegister(0, VT);
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.Val)
    Complexity++;
  else
    AM.IndexReg = CurDAG->getRegister(0, VT);

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.GV || AM.CP || AM.ES || AM.JT != -1) {
    // For X86-64, we should always use lea to materialize RIP relative
    // addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  if (AM.Disp && (AM.Base.Reg.Val || AM.IndexReg.Val))
    Complexity++;

  if (Complexity > 2) {
    getAddressOperands(AM, Base, Scale, Index, Disp);
    return true;
  }
  return false;
}

/// TryFoldLoad - Returns true (and the address operands) if N is a plain
/// single-use load that can be folded into user P without creating a cycle.
bool X86DAGToDAGISel::TryFoldLoad(SDOperand P, SDOperand N,
                                  SDOperand &Base, SDOperand &Scale,
                                  SDOperand &Index, SDOperand &Disp) {
  if (ISD::isNON_EXTLoad(N.Val) &&
      N.hasOneUse() &&
      CanBeFoldedBy(N.Val, P.Val, P.Val))
    return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp);
  return false;
}

/// getGlobalBaseReg - Output the instructions required to put the
/// base address to use for accessing globals into a register.
///
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  assert(!Subtarget->is64Bit() && "X86-64 PIC uses RIP relative addressing");
  if (!GlobalBaseReg) {
    // Insert the set of GlobalBaseReg into the first MBB of the function
    MachineBasicBlock &FirstMBB = BB->getParent()->front();
    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
    SSARegMap *RegMap = BB->getParent()->getSSARegMap();
    unsigned PC = RegMap->createVirtualRegister(X86::GR32RegisterClass);

    // call-next-insn / pop sequence materializes the current PC.
    const TargetInstrInfo *TII = TM.getInstrInfo();
    BuildMI(FirstMBB, MBBI, TII->get(X86::MovePCtoStack));
    BuildMI(FirstMBB, MBBI, TII->get(X86::POP32r), PC);

    // If we're using vanilla 'GOT' PIC style, we should use relative addressing
    // not to pc, but to _GLOBAL_ADDRESS_TABLE_ external
    if (TM.getRelocationModel() == Reloc::PIC_ &&
        Subtarget->isPICStyleGOT()) {
      GlobalBaseReg = RegMap->createVirtualRegister(X86::GR32RegisterClass);
      BuildMI(FirstMBB, MBBI, TII->get(X86::ADD32ri), GlobalBaseReg).
        addReg(PC).
        addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
    } else {
      GlobalBaseReg = PC;
    }

  }
  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).Val;
}

/// FindCallStartFromCall - Walk token-chain operands back to the
/// CALLSEQ_START node that opens this call sequence.
static SDNode *FindCallStartFromCall(SDNode *Node) {
  if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
  assert(Node->getOperand(0).getValueType() == MVT::Other &&
         "Node doesn't have a token chain argument!");
  return FindCallStartFromCall(Node->getOperand(0).Val);
}

/// Select - Main custom-selection hook: handles the node kinds the
/// table-generated matcher cannot, then defers to SelectCode for the rest.
SDNode *X86DAGToDAGISel::Select(SDOperand N) {
  SDNode *Node = N.Val;
  MVT::ValueType NVT = Node->getValueType(0);
  unsigned Opc, MOpc;
  unsigned Opcode = Node->getOpcode();

#ifndef NDEBUG
  DOUT << std::string(Indent, ' ') << "Selecting: ";
  DEBUG(Node->dump(CurDAG));
  DOUT << "\n";
  Indent += 2;
#endif

  if (Opcode >= ISD::BUILTIN_OP_END && Opcode < X86ISD::FIRST_NUMBER) {
#ifndef NDEBUG
    DOUT << std::string(Indent-2, ' ') << "== ";
    DEBUG(Node->dump(CurDAG));
    DOUT << "\n";
    Indent -= 2;
#endif
    return NULL;   // Already selected.
  }

  switch (Opcode) {
    default: break;
    case X86ISD::GlobalBaseReg:
      return getGlobalBaseReg();

    case ISD::ADD: {
      // Turn ADD X, c to MOV32ri X+c. This cannot be done with tblgen'd
      // code and is matched first so to prevent it from being turned into
      // LEA32r X+c.
      // In 64-bit mode, use LEA to take advantage of RIP-relative addressing.
      MVT::ValueType PtrVT = TLI.getPointerTy();
      SDOperand N0 = N.getOperand(0);
      SDOperand N1 = N.getOperand(1);
      if (N.Val->getValueType(0) == PtrVT &&
          N0.getOpcode() == X86ISD::Wrapper &&
          N1.getOpcode() == ISD::Constant) {
        unsigned Offset = (unsigned)cast<ConstantSDNode>(N1)->getValue();
        SDOperand C(0, 0);
        // TODO: handle ExternalSymbolSDNode.
        if (GlobalAddressSDNode *G =
            dyn_cast<GlobalAddressSDNode>(N0.getOperand(0))) {
          C = CurDAG->getTargetGlobalAddress(G->getGlobal(), PtrVT,
                                             G->getOffset() + Offset);
        } else if (ConstantPoolSDNode *CP =
                   dyn_cast<ConstantPoolSDNode>(N0.getOperand(0))) {
          C = CurDAG->getTargetConstantPool(CP->getConstVal(), PtrVT,
                                            CP->getAlignment(),
                                            CP->getOffset()+Offset);
        }

        if (C.Val) {
          if (Subtarget->is64Bit()) {
            SDOperand Ops[] = { CurDAG->getRegister(0, PtrVT), getI8Imm(1),
                                CurDAG->getRegister(0, PtrVT), C };
            return CurDAG->SelectNodeTo(N.Val, X86::LEA64r, MVT::i64, Ops, 4);
          } else
            return CurDAG->SelectNodeTo(N.Val, X86::MOV32ri, PtrVT, C);
        }
      }

      // Other cases are handled by auto-generated code.
      break;
    }

    case ISD::MULHU:
    case ISD::MULHS: {
      if (Opcode == ISD::MULHU)
        switch (NVT) {
        default: assert(0 && "Unsupported VT!");
        case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
        case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
        case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
        case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
        }
      else
        switch (NVT) {
        default: assert(0 && "Unsupported VT!");
        case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
        case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
        case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
        case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
        }

      // One-operand MUL/IMUL: implicit LoReg input, Hi:Lo result pair.
      unsigned LoReg, HiReg;
      switch (NVT) {
      default: assert(0 && "Unsupported VT!");
      case MVT::i8:  LoReg = X86::AL;  HiReg = X86::AH;  break;
      case MVT::i16: LoReg = X86::AX;  HiReg = X86::DX;  break;
      case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
      case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
      }

      SDOperand N0 = Node->getOperand(0);
      SDOperand N1 = Node->getOperand(1);

      bool foldedLoad = false;
      SDOperand Tmp0, Tmp1, Tmp2, Tmp3;
      foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
      // MULHU and MULHS are commutative
      if (!foldedLoad) {
        foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3);
        if (foldedLoad) {
          N0 = Node->getOperand(1);
          N1 = Node->getOperand(0);
        }
      }

      SDOperand Chain;
      if (foldedLoad) {
        Chain = N1.getOperand(0);
        AddToISelQueue(Chain);
      } else
        Chain = CurDAG->getEntryNode();

      SDOperand InFlag(0, 0);
      AddToISelQueue(N0);
      Chain  = CurDAG->getCopyToReg(Chain, CurDAG->getRegister(LoReg, NVT),
                                    N0, InFlag);
      InFlag = Chain.getValue(1);

      if (foldedLoad) {
        AddToISelQueue(Tmp0);
        AddToISelQueue(Tmp1);
        AddToISelQueue(Tmp2);
        AddToISelQueue(Tmp3);
        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Chain, InFlag };
        SDNode *CNode =
          CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
        Chain  = SDOperand(CNode, 0);
        InFlag = SDOperand(CNode, 1);
      } else {
        AddToISelQueue(N1);
        InFlag =
          SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
      }

      // The high half of the product is what MULH* wants.
      SDOperand Result = CurDAG->getCopyFromReg(Chain, HiReg, NVT, InFlag);
      ReplaceUses(N.getValue(0), Result);
      if (foldedLoad)
        ReplaceUses(N1.getValue(1), Result.getValue(1));

#ifndef NDEBUG
      DOUT << std::string(Indent-2, ' ') << "=> ";
      DEBUG(Result.Val->dump(CurDAG));
      DOUT << "\n";
      Indent -= 2;
#endif
      return NULL;
    }

    case ISD::SDIV:
    case ISD::UDIV:
    case ISD::SREM:
    case ISD::UREM: {
      bool isSigned = Opcode == ISD::SDIV || Opcode == ISD::SREM;
      bool isDiv    = Opcode == ISD::SDIV || Opcode == ISD::UDIV;
      if (!isSigned)
        switch (NVT) {
        default: assert(0 && "Unsupported VT!");
        case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
        case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
        case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
        case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
        }
      else
        switch (NVT) {
        default: assert(0 && "Unsupported VT!");
        case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
        case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
        case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
        case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
        }

      // DIV/IDIV divide Hi:Lo; quotient lands in LoReg, remainder in HiReg.
      // ClrOpcode zeroes the high part for unsigned, SExtOpcode sign-extends
      // for signed.
      unsigned LoReg, HiReg;
      unsigned ClrOpcode, SExtOpcode;
      switch (NVT) {
      default: assert(0 && "Unsupported VT!");
      case MVT::i8:
        LoReg = X86::AL;  HiReg = X86::AH;
        ClrOpcode  = 0;   // i8 unsigned is handled by the MOVZX path below.
        SExtOpcode = X86::CBW;
        break;
      case MVT::i16:
        LoReg = X86::AX;  HiReg = X86::DX;
        ClrOpcode  = X86::MOV16r0;
        SExtOpcode = X86::CWD;
        break;
      case MVT::i32:
        LoReg = X86::EAX; HiReg = X86::EDX;
        ClrOpcode  = X86::MOV32r0;
        SExtOpcode = X86::CDQ;
        break;
      case MVT::i64:
        LoReg = X86::RAX; HiReg = X86::RDX;
        ClrOpcode  = X86::MOV64r0;
        SExtOpcode = X86::CQO;
        break;
      }

      SDOperand N0 = Node->getOperand(0);
      SDOperand N1 = Node->getOperand(1);
      SDOperand InFlag(0, 0);
      if (NVT == MVT::i8 && !isSigned) {
        // Special case for div8, just use a move with zero extension to AX to
        // clear the upper 8 bits (AH).
        SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Move, Chain;
        if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3)) {
          SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N0.getOperand(0) };
          AddToISelQueue(N0.getOperand(0));
          AddToISelQueue(Tmp0);
          AddToISelQueue(Tmp1);
          AddToISelQueue(Tmp2);
          AddToISelQueue(Tmp3);
          Move =
            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rm8, MVT::i16, MVT::Other,
                                            Ops, 5), 0);
          Chain = Move.getValue(1);
          ReplaceUses(N0.getValue(1), Chain);
        } else {
          AddToISelQueue(N0);
          Move =
            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rr8, MVT::i16, N0), 0);
          Chain = CurDAG->getEntryNode();
        }
        Chain  = CurDAG->getCopyToReg(Chain, X86::AX, Move, InFlag);
        InFlag = Chain.getValue(1);
      } else {
        AddToISelQueue(N0);
        InFlag =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), LoReg, N0,
                               InFlag).getValue(1);
        if (isSigned) {
          // Sign extend the low part into the high part.
          InFlag =
            SDOperand(CurDAG->getTargetNode(SExtOpcode, MVT::Flag, InFlag), 0);
        } else {
          // Zero out the high part, effectively zero extending the input.
          SDOperand ClrNode = SDOperand(CurDAG->getTargetNode(ClrOpcode, NVT), 0);
          InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), HiReg, ClrNode,
                                        InFlag).getValue(1);
        }
      }

      SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Chain;
      bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
      if (foldedLoad) {
        AddToISelQueue(N1.getOperand(0));
        AddToISelQueue(Tmp0);
        AddToISelQueue(Tmp1);
        AddToISelQueue(Tmp2);
        AddToISelQueue(Tmp3);
        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N1.getOperand(0), InFlag };
        SDNode *CNode =
          CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
        Chain  = SDOperand(CNode, 0);
        InFlag = SDOperand(CNode, 1);
      } else {
        AddToISelQueue(N1);
        Chain = CurDAG->getEntryNode();
        InFlag =
          SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
      }

      // Quotient from LoReg for div, remainder from HiReg for rem.
      SDOperand Result =
        CurDAG->getCopyFromReg(Chain, isDiv ? LoReg : HiReg, NVT, InFlag);
      ReplaceUses(N.getValue(0), Result);
      if (foldedLoad)
        ReplaceUses(N1.getValue(1), Result.getValue(1));

#ifndef NDEBUG
      DOUT << std::string(Indent-2, ' ') << "=> ";
      DEBUG(Result.Val->dump(CurDAG));
      DOUT << "\n";
      Indent -= 2;
#endif

      return NULL;
    }

    case ISD::TRUNCATE: {
      if (!Subtarget->is64Bit() && NVT == MVT::i8) {
        // On 32-bit x86 only AL/BL/CL/DL are addressable as 8-bit
        // sub-registers, so first copy into a restricted 16/32-bit class
        // (MOV*to*_), then truncate -- presumably the '_' classes are the
        // low-8-accessible registers; confirm against X86RegisterInfo.td.
        unsigned Opc2;
        MVT::ValueType VT;
        switch (Node->getOperand(0).getValueType()) {
        default: assert(0 && "Unknown truncate!");
        case MVT::i16:
          Opc = X86::MOV16to16_;
          VT = MVT::i16;
          Opc2 = X86::TRUNC_16_to8;
          break;
        case MVT::i32:
          Opc = X86::MOV32to32_;
          VT = MVT::i32;
          Opc2 = X86::TRUNC_32_to8;
          break;
        }

        AddToISelQueue(Node->getOperand(0));
        SDOperand Tmp =
          SDOperand(CurDAG->getTargetNode(Opc, VT, Node->getOperand(0)), 0);
        SDNode *ResNode = CurDAG->getTargetNode(Opc2, NVT, Tmp);

#ifndef NDEBUG
        DOUT << std::string(Indent-2, ' ') << "=> ";
        DEBUG(ResNode->dump(CurDAG));
        DOUT << "\n";
        Indent -= 2;
#endif
        return ResNode;
      }

      break;
    }
  }

  // Everything else goes through the table-generated matcher.
  SDNode *ResNode = SelectCode(N);

#ifndef NDEBUG
  DOUT << std::string(Indent-2, ' ') << "=> ";
  if (ResNode == NULL || ResNode == N.Val)
    DEBUG(N.Val->dump(CurDAG));
  else
    DEBUG(ResNode->dump(CurDAG));
  DOUT << "\n";
  Indent -= 2;
#endif

  return ResNode;
}

/// SelectInlineAsmMemoryOperand - Match a memory operand for an inline asm
/// constraint; returns false on success with the four address operands in
/// OutOps, true on failure.
bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDOperand &Op, char ConstraintCode,
                             std::vector<SDOperand> &OutOps, SelectionDAG &DAG){
  SDOperand Op0, Op1, Op2, Op3;
  switch (ConstraintCode) {
  case 'o':   // offsetable        ??
  case 'v':   // not offsetable    ??
  default: return true;
  case 'm':   // memory
    if (!SelectAddr(Op, Op, Op0, Op1, Op2, Op3))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  AddToISelQueue(Op0);
  AddToISelQueue(Op1);
  AddToISelQueue(Op2);
  AddToISelQueue(Op3);
  return false;
}

/// createX86ISelDag - This pass converts a legalized DAG into a
/// X86-specific DAG, ready for instruction scheduling.
///
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, bool Fast) {
  return new X86DAGToDAGISel(TM, Fast);
}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..37dea79
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,5094 @@
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by Chris Lattner and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+/// Constructor: records subtarget facts (SSE level, 32- vs 64-bit stack
+/// pointer) and then describes to the target-independent legalizer, for every
+/// (operation, value type) pair, whether it is Legal on X86, should be
+/// Promoted to a wider type, Expanded into other operations, or Custom
+/// lowered by this class.
+X86TargetLowering::X86TargetLowering(TargetMachine &TM)
+  : TargetLowering(TM) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  // NOTE: "ScalarSSE" means scalar FP goes through SSE2 XMM registers rather
+  // than the x87 stack.
+  X86ScalarSSE = Subtarget->hasSSE2();
+  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+  RegInfo = TM.getRegisterInfo();
+
+  // Set up the TargetLowering object.
+
+  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  setShiftAmountType(MVT::i8);
+  setSetCCResultType(MVT::i8);
+  setSetCCResultContents(ZeroOrOneSetCCResult);
+  setSchedulingPreference(SchedulingForRegPressure);
+  setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
+  setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+    setUseUnderscoreSetJmp(false);
+    setUseUnderscoreLongJmp(false);
+  } else if (Subtarget->isTargetMingw()) {
+    // MS runtime is weird: it exports _setjmp, but longjmp!
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(false);
+  } else {
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(true);
+  }
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+  // operation.
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+  } else {
+    if (X86ScalarSSE)
+      // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Expand);
+    else
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
+  }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
+  // SSE has no i16 to fp conversion, only i32
+  if (X86ScalarSSE)
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+  else {
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+  }
+
+  if (!Subtarget->is64Bit()) {
+    // Custom lower SINT_TO_FP and FP_TO_SINT from/to i64 in 32-bit mode.
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+  }
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
+
+  if (X86ScalarSSE) {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+  } else {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  }
+
+  // Handle FP_TO_UINT by promoting the destination to a larger signed
+  // conversion.
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
+  } else {
+    if (X86ScalarSSE && !Subtarget->hasSSE3())
+      // Expand FP_TO_UINT into a select.
+      // FIXME: We would like to use a Custom expander here eventually to do
+      // the optimal thing for SSE vs. the default expansion in the legalizer.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
+    else
+      // With SSE3 we can use fisttpll to convert to a signed i64.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
+  }
+
+  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+  if (!X86ScalarSSE) {
+    setOperationAction(ISD::BIT_CONVERT    , MVT::f32  , Expand);
+    setOperationAction(ISD::BIT_CONVERT    , MVT::i32  , Expand);
+  }
+
+  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
+  setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+
+  // X86 has no native bit-count instructions for these widths (pre-popcnt),
+  // so let the legalizer expand CTPOP/CTTZ/CTLZ.
+  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i8   , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i8   , Expand);
+  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i16  , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i16  , Expand);
+  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i32  , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i32  , Expand);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
+    setOperationAction(ISD::CTTZ           , MVT::i64  , Expand);
+    setOperationAction(ISD::CTLZ           , MVT::i64  , Expand);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
+  setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
+  // X86 wants to expand cmov itself.
+  setOperationAction(ISD::SELECT           , MVT::i16  , Custom);
+  setOperationAction(ISD::SELECT           , MVT::i32  , Custom);
+  setOperationAction(ISD::SELECT           , MVT::f32  , Custom);
+  setOperationAction(ISD::SELECT           , MVT::f64  , Custom);
+  setOperationAction(ISD::SETCC            , MVT::i8   , Custom);
+  setOperationAction(ISD::SETCC            , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCC            , MVT::i32  , Custom);
+  setOperationAction(ISD::SETCC            , MVT::f32  , Custom);
+  setOperationAction(ISD::SETCC            , MVT::f64  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SELECT         , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCC          , MVT::i64  , Custom);
+  }
+  // X86 ret instruction may pop stack.
+  setOperationAction(ISD::RET              , MVT::Other, Custom);
+  if (!Subtarget->is64Bit())
+    setOperationAction(ISD::EH_RETURN      , MVT::Other, Custom);
+
+  // Darwin ABI issue.
+  setOperationAction(ISD::ConstantPool     , MVT::i32  , Custom);
+  setOperationAction(ISD::JumpTable        , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalAddress    , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalTLSAddress , MVT::i32  , Custom);
+  setOperationAction(ISD::ExternalSymbol   , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ConstantPool   , MVT::i64  , Custom);
+    setOperationAction(ISD::JumpTable      , MVT::i64  , Custom);
+    setOperationAction(ISD::GlobalAddress  , MVT::i64  , Custom);
+    setOperationAction(ISD::ExternalSymbol , MVT::i64  , Custom);
+  }
+  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+  setOperationAction(ISD::SHL_PARTS        , MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS        , MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS        , MVT::i32  , Custom);
+  // X86 wants to expand memset / memcpy itself.
+  setOperationAction(ISD::MEMSET           , MVT::Other, Custom);
+  setOperationAction(ISD::MEMCPY           , MVT::Other, Custom);
+
+  // We don't have line number support yet.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+  // FIXME - use subtarget debug flags
+  if (!Subtarget->isTargetDarwin() &&
+      !Subtarget->isTargetELF() &&
+      !Subtarget->isTargetCygMing())
+    setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+  if (Subtarget->is64Bit()) {
+    // FIXME: Verify
+    setExceptionPointerRegister(X86::RAX);
+    setExceptionSelectorRegister(X86::RDX);
+  } else {
+    setExceptionPointerRegister(X86::EAX);
+    setExceptionSelectorRegister(X86::EDX);
+  }
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART          , MVT::Other, Custom);
+  setOperationAction(ISD::VAARG            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND            , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::VACOPY         , MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VACOPY         , MVT::Other, Expand);
+
+  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  if (Subtarget->isTargetCygMing())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  if (X86ScalarSSE) {
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+    // Use ANDPD to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f64, Custom);
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f64, Custom);
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    // Use ANDPD and ORPD to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN , MVT::f64, Expand);
+    setOperationAction(ISD::FCOS , MVT::f64, Expand);
+    setOperationAction(ISD::FREM , MVT::f64, Expand);
+    setOperationAction(ISD::FSIN , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::f32, Expand);
+    setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+    addLegalFPImmediate(+0.0); // xorps / xorpd
+  } else {
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+    setOperationAction(ISD::FP_ROUND,  MVT::f32, Expand);
+
+    // x87 fsin/fcos are only accurate over a limited range, so they are only
+    // usable when the user has opted into unsafe FP math.
+    if (!UnsafeFPMath) {
+      setOperationAction(ISD::FSIN         , MVT::f64  , Expand);
+      setOperationAction(ISD::FCOS         , MVT::f64  , Expand);
+    }
+
+    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+    addLegalFPImmediate(+0.0); // FLD0
+    addLegalFPImmediate(+1.0); // FLD1
+    addLegalFPImmediate(-0.0); // FLD0/FCHS
+    addLegalFPImmediate(-1.0); // FLD1/FCHS
+  }
+
+  // First set operation action for all vector types to expand. Then we
+  // will selectively turn on ones that can be effectively codegen'd.
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+    setOperationAction(ISD::ADD , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SUB , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FADD, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FNEG, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSUB, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FMUL, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::LOAD, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FABS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSIN, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FPOWI, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSQRT, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, (MVT::ValueType)VT, Expand);
+  }
+
+  if (Subtarget->hasMMX()) {
+    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
+    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
+
+    // FIXME: add MMX packed arithmetics
+
+    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
+
+    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
+
+    // MMX has only 64-bit wide logical/load ops, so narrower MMX vector types
+    // are promoted to v1i64 for AND/OR/XOR/LOAD.
+    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
+
+    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
+
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
+  }
+
+  if (Subtarget->hasSSE1()) {
+    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+  }
+
+  if (Subtarget->hasSSE2()) {
+    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    // Implement v4f32 insert_vector_elt in terms of SSE2 v8i16 ones.
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    // NOTE(review): the half-open loop [v16i8, v2i64) relies on the MVT enum
+    // ordering of the 128-bit integer vector types.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::BUILD_VECTOR,        (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,      (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  (MVT::ValueType)VT, Custom);
+    }
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::AND,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::AND,    (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::OR,     (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::XOR,    (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::LOAD,   (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
+    }
+
+    // Custom lower v2i64 and v2f64 selects.
+    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::SELECT);
+
+  computeRegisterProperties();
+
+  // FIXME: These should be based on subtarget info. Plus, the values should
+  // be smaller when we are in optimizing for size mode.
+  maxStoresPerMemset = 16; // For %llvm.memset -> sequence of stores
+  maxStoresPerMemcpy = 16; // For %llvm.memcpy -> sequence of stores
+  maxStoresPerMemmove = 16; // For %llvm.memmove -> sequence of stores
+  allowUnalignedMemoryAccesses = true; // x86 supports it!
+}
+
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+/// LowerRET - Lower an ISD::RET node.
+SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
+  // ISD::RET operands: chain, then (value, signness-flag) pairs -> odd count.
+  assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
+
+  // Run the tablegen'd return calling-convention rules to find a location
+  // (register) for every returned value.
+  SmallVector<CCValAssign, 16> RVLocs;
+  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+  CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
+
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Flag;
+
+  // Copy the result values into the output registers.
+  if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
+      RVLocs[0].getLocReg() != X86::ST0) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i) {
+      CCValAssign &VA = RVLocs[i];
+      assert(VA.isRegLoc() && "Can only return in registers!");
+      Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1),
+                               Flag);
+      Flag = Chain.getValue(1);
+    }
+  } else {
+    // We need to handle a destination of ST0 specially, because it isn't really
+    // a register.
+    SDOperand Value = Op.getOperand(1);
+
+    // If this is an FP return with ScalarSSE, we need to move the value from
+    // an XMM register onto the fp-stack.
+    if (X86ScalarSSE) {
+      SDOperand MemLoc;
+
+      // If this is a load into a scalarsse value, don't store the loaded value
+      // back to the stack, only to reload it: just replace the scalar-sse load.
+      if (ISD::isNON_EXTLoad(Value.Val) &&
+          (Chain == Value.getValue(1) || Chain == Value.getOperand(0))) {
+        Chain  = Value.getOperand(0);
+        MemLoc = Value.getOperand(1);
+      } else {
+        // Spill the value to memory and reload it into top of stack.
+        unsigned Size = MVT::getSizeInBits(RVLocs[0].getValVT())/8;
+        MachineFunction &MF = DAG.getMachineFunction();
+        int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+        MemLoc = DAG.getFrameIndex(SSFI, getPointerTy());
+        Chain = DAG.getStore(Op.getOperand(0), Value, MemLoc, NULL, 0);
+      }
+      // Reload the spilled/loaded value onto the x87 stack with X86ISD::FLD.
+      SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other);
+      SDOperand Ops[] = {Chain, MemLoc, DAG.getValueType(RVLocs[0].getValVT())};
+      Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
+      Chain = Value.getValue(1);
+    }
+
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDOperand Ops[] = { Chain, Value };
+    Chain = DAG.getNode(X86ISD::FP_SET_RESULT, Tys, Ops, 2);
+    Flag = Chain.getValue(1);
+  }
+
+  SDOperand BytesToPop = DAG.getConstant(getBytesToPopOnReturn(), MVT::i16);
+  if (Flag.Val)
+    return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop, Flag);
+  else
+    return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop);
+}
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers.  This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered.  The returns a SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode *X86TargetLowering::
+LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
+                unsigned CallingConv, SelectionDAG &DAG) {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  bool isVarArg = cast<ConstantSDNode>(TheCall->getOperand(2))->getValue() != 0;
+  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
+
+  SmallVector<SDOperand, 8> ResultVals;
+
+  // Copy all of the result registers out of their specified physreg.
+  if (RVLocs.size() != 1 || RVLocs[0].getLocReg() != X86::ST0) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i) {
+      Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(),
+                                 RVLocs[i].getValVT(), InFlag).getValue(1);
+      InFlag = Chain.getValue(2);
+      ResultVals.push_back(Chain.getValue(0));
+    }
+  } else {
+    // Copies from the FP stack are special, as ST0 isn't a valid register
+    // before the fp stackifier runs.
+
+    // Copy ST0 into an RFP register with FP_GET_RESULT.
+    SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other, MVT::Flag);
+    SDOperand GROps[] = { Chain, InFlag };
+    SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2);
+    Chain  = RetVal.getValue(1);
+    InFlag = RetVal.getValue(2);
+
+    // If we are using ScalarSSE, store ST(0) to the stack and reload it into
+    // an XMM register.
+    if (X86ScalarSSE) {
+      // FIXME: Currently the FST is flagged to the FP_GET_RESULT.  This
+      // shouldn't be necessary except that RFP cannot be live across
+      // multiple blocks. When stackifier is fixed, they can be uncoupled.
+      MachineFunction &MF = DAG.getMachineFunction();
+      int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+      SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+      SDOperand Ops[] = {
+        Chain, RetVal, StackSlot, DAG.getValueType(RVLocs[0].getValVT()), InFlag
+      };
+      Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
+      RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, StackSlot, NULL, 0);
+      Chain = RetVal.getValue(1);
+    }
+    ResultVals.push_back(RetVal);
+  }
+
+  // Merge everything together with a MERGE_VALUES node.
+  ResultVals.push_back(Chain);
+  return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(),
+                     &ResultVals[0], ResultVals.size()).Val;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  StdCall calling convention seems to be standard for many Windows' API
+//  routines and around. It differs from C calling convention just a little:
+//  callee should clean up the stack, not caller. Symbols should be also
+//  decorated in some fancy way :) It doesn't support any vector arguments.
+
+/// AddLiveIn - This helper function adds the specified physical register to the
+/// MachineFunction as a live in value.  It also creates a corresponding virtual
+/// register for it.
+static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
+                          const TargetRegisterClass *RC) {
+  assert(RC->contains(PReg) && "Not the correct regclass!");
+  unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC);
+  MF.addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+/// LowerCCCArguments - Lower the incoming (formal) arguments of a function
+/// using the C or StdCall 32-bit calling convention: assign each argument a
+/// register or a fixed stack slot, materialize loads/copies for them, and
+/// record how many bytes the callee must pop on return (everything for
+/// non-vararg StdCall; 4 bytes for an sret hidden pointer; otherwise 0).
+SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
+                                               bool isStdCall) {
+  unsigned NumArgs = Op.Val->getNumValues() - 1;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+                 getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+
+  SmallVector<SDOperand, 8> ArgValues;
+  unsigned LastVal = ~0U;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else {
+        assert(MVT::isVector(RegVT));
+        RC = X86::VR128RegisterClass;
+      }
+
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+
+      ArgValues.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc());
+
+      // Create the nodes corresponding to a load from this parameter slot.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      VA.getLocMemOffset());
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+
+  unsigned StackSize = CCInfo.getNextStackOffset();
+
+  ArgValues.push_back(Root);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg)
+    VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+
+  if (isStdCall && !isVarArg) {
+    BytesToPopOnReturn  = StackSize;    // Callee pops everything..
+    BytesCallerReserves = 0;
+  } else {
+    BytesToPopOnReturn  = 0; // Callee pops nothing.
+
+    // If this is an sret function, the return should pop the hidden pointer.
+    if (NumArgs &&
+        (cast<ConstantSDNode>(Op.getOperand(3))->getValue() &
+         ISD::ParamFlags::StructReturn))
+      BytesToPopOnReturn = 4;
+
+    BytesCallerReserves = StackSize;
+  }
+
+  RegSaveFrameIndex = 0xAAAAAAA;  // X86-64 only.
+  ReturnAddrIndex = 0;            // No return address slot generated yet.
+
+  MF.getInfo<X86MachineFunctionInfo>()
+    ->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+/// LowerCCCCallTo - Lower an outgoing ISD::CALL under the 32-bit C / StdCall
+/// conventions: push stack arguments, copy register arguments, emit the
+/// CALLSEQ_START / CALL / CALLSEQ_END sequence (including the GOT pointer in
+/// EBX for ELF PIC), and finally copy the results back via LowerCallResult.
+SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
+                                            unsigned CC) {
+  // ISD::CALL operands: chain, cc, isVarArg, isTailCall, callee, then
+  // (argument, flags) pairs.
+  SDOperand Chain     = Op.getOperand(0);
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+  unsigned NumOps     = (Op.getNumOperands() - 5) / 2;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: assert(0 && "Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+
+  // If the first argument is an sret pointer, remember it.
+  bool isSRet = NumOps &&
+    (cast<ConstantSDNode>(Op.getOperand(6))->getValue() &
+     ISD::ParamFlags::StructReturn);
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // ELF / PIC requires GOT in the EBX register before function calls via PLT
+  // GOT pointer.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT()) {
+    Chain = DAG.getCopyToReg(Chain, X86::EBX,
+                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                        getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use GOT pointer in EBX.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+                      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPush = 0;
+
+  if (CC == CallingConv::X86_StdCall) {
+    if (isVarArg)
+      NumBytesForCalleeToPush = isSRet ? 4 : 0;
+    else
+      NumBytesForCalleeToPush = NumBytes;
+  } else {
+    // If this is is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    NumBytesForCalleeToPush = isSRet ? 4 : 0;
+  }
+
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                   FastCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//
+// The X86 'fastcall' calling convention passes up to two integer arguments in
+// registers (an appropriate portion of ECX/EDX), passes arguments in C order,
+// and requires that the callee pop its arguments off the stack (allowing proper
+// tail calls), and has the same return value conventions as C calling convs.
+//
+// This calling convention always arranges for the callee pop value to be 8n+4
+// bytes, which is needed for tail recursion elimination and stack alignment
+// reasons.
+SDOperand
+X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
                 getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_FastCall);

  SmallVector<SDOperand, 8> ArgValues;
  unsigned LastVal = ~0U;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      MVT::ValueType RegVT = VA.getLocVT();
      TargetRegisterClass *RC;
      // fastcall register args are either i32 (ECX/EDX portion) or a vector
      // type in an XMM register.
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else {
        assert(MVT::isVector(RegVT));
        RC = X86::VR128RegisterClass;
      }

      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));

      if (VA.getLocInfo() != CCValAssign::Full)
        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);

      ArgValues.push_back(ArgValue);
    } else {
      assert(VA.isMemLoc());

      // Create the nodes corresponding to a load from this parameter slot.
      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
                                      VA.getLocMemOffset());
      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
    }
  }

  ArgValues.push_back(Root);

  unsigned StackSize = CCInfo.getNextStackOffset();

  if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
    // Make sure the instruction takes 8n+4 bytes to make sure the start of the
    // arguments and the arguments after the retaddr has been pushed are
    // aligned.
    if ((StackSize & 7) == 0)
      StackSize += 4;
  }

  VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
  RegSaveFrameIndex = 0xAAAAAAA;   // X86-64 only.
  ReturnAddrIndex = 0;             // No return address slot generated yet.
  BytesToPopOnReturn = StackSize;  // Callee pops all stack arguments.
  BytesCallerReserves = 0;

  MF.getInfo<X86MachineFunctionInfo>()
    ->setBytesToPopOnReturn(BytesToPopOnReturn);

  // Return the new list of results.
  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
}

/// LowerFastCCCallTo - Lower an outgoing ISD::CALL node Op using the X86
/// fastcall convention CC.  Same CALL-node operand layout as LowerCCCCallTo.
SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
                                               unsigned CC) {
  SDOperand Chain = Op.getOperand(0);
  bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
  SDOperand Callee = Op.getOperand(4);

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_FastCall);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
    // Make sure the instruction takes 8n+4 bytes to make sure the start of the
    // arguments and the arguments after the retaddr has been pushed are
    // aligned.
    if ((NumBytes & 7) == 0)
      NumBytes += 4;
  }

  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));

  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
  SmallVector<SDOperand, 8> MemOpChains;

  // Lazily materialized stack pointer copy, needed only for memory args.
  SDOperand StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: assert(0 && "Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());
      if (StackPtr.Val == 0)
        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDOperand InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress node (quite common, every direct call is)
  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
                                        getTargetMachine(), true))
      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());

  // ELF / PIC requires GOT in the EBX register before function calls via PLT
  // GOT pointer.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT()) {
    Chain = DAG.getCopyToReg(Chain, X86::EBX,
                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDOperand, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use GOT pointer in EBX.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  if (InFlag.Val)
    Ops.push_back(InFlag);

  // FIXME: Do not generate X86ISD::TAILCALL for now.
  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
                      NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Returns a flag for retval copy to use.
  // fastcall: the callee pops all NumBytes of stack arguments, hence NumBytes
  // appears as both the alloc amount and the callee-pop amount.
  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  Ops.clear();
  Ops.push_back(Chain);
  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
  Ops.push_back(InFlag);
  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
}


//===----------------------------------------------------------------------===//
// X86-64 C Calling Convention implementation
//===----------------------------------------------------------------------===//

/// LowerX86_64CCCArguments - Lower the FORMAL_ARGUMENTS node Op for a
/// function using the X86-64 (AMD64 SysV) C calling convention.
SDOperand
X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  SDOperand Root = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;

  // Integer and XMM argument registers in AMD64 ABI order; also used below
  // for the vararg register-save area.
  static const unsigned GPR64ArgRegs[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  static const unsigned XMMArgRegs[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };


  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
                 getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);

  SmallVector<SDOperand, 8> ArgValues;
  unsigned LastVal = ~0U;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      MVT::ValueType RegVT = VA.getLocVT();
      TargetRegisterClass *RC;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else {
        assert(MVT::isVector(RegVT));
        if (MVT::getSizeInBits(RegVT) == 64) {
          RC = X86::GR64RegisterClass;     // MMX values are passed in GPRs.
          RegVT = MVT::i64;
        } else
          RC = X86::VR128RegisterClass;
      }

      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));

      if (VA.getLocInfo() != CCValAssign::Full)
        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);

      // Handle MMX values passed in GPRs: bitcast the i64 back to the
      // expected 64-bit vector type.
      if (RegVT != VA.getLocVT() && RC == X86::GR64RegisterClass &&
          MVT::getSizeInBits(RegVT) == 64)
        ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);

      ArgValues.push_back(ArgValue);
    } else {
      assert(VA.isMemLoc());

      // Create the nodes corresponding to a load from this parameter slot.
      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
                                      VA.getLocMemOffset());
      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);

    // For X86-64, if there are vararg parameters that are passed via
    // registers, then we must store them to their spots on the stack so they
    // may be loaded by dereferencing the result of va_next.
    // Register-save area layout (AMD64 ABI): 6 GPRs * 8 bytes, then
    // 8 XMM regs * 16 bytes.
    VarArgsGPOffset = NumIntRegs * 8;
    VarArgsFPOffset = 6 * 8 + NumXMMRegs * 16;
    VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
    RegSaveFrameIndex = MFI->CreateStackObject(6 * 8 + 8 * 16, 16);

    // Store the integer parameter registers.
    SmallVector<SDOperand, 8> MemOps;
    SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
    SDOperand FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
                                DAG.getConstant(VarArgsGPOffset, getPointerTy()));
    for (; NumIntRegs != 6; ++NumIntRegs) {
      unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
                                X86::GR64RegisterClass);
      SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
                        DAG.getConstant(8, getPointerTy()));
    }

    // Now store the XMM (fp + vector) parameter registers.
    FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
                      DAG.getConstant(VarArgsFPOffset, getPointerTy()));
    for (; NumXMMRegs != 8; ++NumXMMRegs) {
      unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
                                X86::VR128RegisterClass);
      SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
                        DAG.getConstant(16, getPointerTy()));
    }
    if (!MemOps.empty())
      Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
                         &MemOps[0], MemOps.size());
  }

  ArgValues.push_back(Root);

  ReturnAddrIndex = 0;             // No return address slot generated yet.
  BytesToPopOnReturn = 0;          // Callee pops nothing.
  BytesCallerReserves = StackSize;

  // Return the new list of results.
  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
}

/// LowerX86_64CCCCallTo - Lower an outgoing ISD::CALL node Op using the
/// X86-64 C calling convention CC.  Same CALL-node operand layout as
/// LowerCCCCallTo.
SDOperand
X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
                                        unsigned CC) {
  SDOperand Chain = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
  bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
  SDOperand Callee = Op.getOperand(4);

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));

  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
  SmallVector<SDOperand, 8> MemOpChains;

  // Lazily materialized stack pointer copy, needed only for memory args.
  SDOperand StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: assert(0 && "Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());
      if (StackPtr.Val == 0)
        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDOperand InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isVarArg) {
    // From AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);

    Chain = DAG.getCopyToReg(Chain, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress node (quite common, every direct call is)
  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.  In the large code model a direct 32-bit call target
    // cannot be used, so keep the generic node in that case.
    if (getTargetMachine().getCodeModel() != CodeModel::Large
        && !Subtarget->GVRequiresExtraLoad(G->getGlobal(),
                                           getTargetMachine(), true))
      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
    if (getTargetMachine().getCodeModel() != CodeModel::Large)
      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDOperand, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.Val)
    Ops.push_back(InFlag);

  // FIXME: Do not generate X86ISD::TAILCALL for now.
  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
                      NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Returns a flag for retval copy to use.
  // X86-64 C convention: caller cleans the stack, so the callee-pop amount
  // on CALLSEQ_END is 0.
  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  Ops.clear();
  Ops.push_back(Chain);
  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
  Ops.push_back(DAG.getConstant(0, getPointerTy()));
  Ops.push_back(InFlag);
  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
}


//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//


/// getReturnAddressFrameIndex - Return (creating on first use) the fixed
/// frame index of the return-address slot: 8 bytes at offset -8 on x86-64,
/// 4 bytes at offset -4 on x86-32.
SDOperand X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    MachineFunction &MF = DAG.getMachineFunction();
    if (Subtarget->is64Bit())
      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(8, -8);
    else
      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}



/// translateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code. It returns false if it cannot do a direct
/// translation. X86CC is the translated CondCode. LHS/RHS are modified as
/// needed.
static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                           unsigned &X86CC, SDOperand &LHS, SDOperand &RHS,
                           SelectionDAG &DAG) {
  X86CC = X86::COND_INVALID;
  if (!isFP) {
    // Special-case compares against constants that map to a sign test.
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        X86CC = X86::COND_NS;
        return true;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        X86CC = X86::COND_S;
        return true;
      }
    }

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETEQ:  X86CC = X86::COND_E;  break;
    case ISD::SETGT:  X86CC = X86::COND_G;  break;
    case ISD::SETGE:  X86CC = X86::COND_GE; break;
    case ISD::SETLT:  X86CC = X86::COND_L;  break;
    case ISD::SETLE:  X86CC = X86::COND_LE; break;
    case ISD::SETNE:  X86CC = X86::COND_NE; break;
    case ISD::SETULT: X86CC = X86::COND_B;  break;
    case ISD::SETUGT: X86CC = X86::COND_A;  break;
    case ISD::SETULE: X86CC = X86::COND_BE; break;
    case ISD::SETUGE: X86CC = X86::COND_AE; break;
    }
  } else {
    // On a floating point condition, the flags are set as follows:
    //  ZF  PF  CF   op
    //   0 | 0 | 0 | X > Y
    //   0 | 0 | 1 | X < Y
    //   1 | 0 | 0 | X == Y
    //   1 | 1 | 1 | unordered
    // Some conditions only map onto the unsigned-style flags after swapping
    // the operands; Flip records that the swap is required.
    bool Flip = false;
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETUEQ:
    case ISD::SETEQ: X86CC = X86::COND_E;  break;
    case ISD::SETOLT: Flip = true; // Fallthrough
    case ISD::SETOGT:
    case ISD::SETGT: X86CC = X86::COND_A;  break;
    case ISD::SETOLE: Flip = true; // Fallthrough
    case ISD::SETOGE:
    case ISD::SETGE: X86CC = X86::COND_AE; break;
    case ISD::SETUGT: Flip = true; // Fallthrough
    case ISD::SETULT:
    case ISD::SETLT: X86CC = X86::COND_B;  break;
    case ISD::SETUGE: Flip = true; // Fallthrough
    case ISD::SETULE:
    case ISD::SETLE: X86CC = X86::COND_BE; break;
    case ISD::SETONE:
    case ISD::SETNE: X86CC = X86::COND_NE; break;
    case ISD::SETUO: X86CC = X86::COND_P;  break;
    case ISD::SETO:  X86CC = X86::COND_NP; break;
    }
    if (Flip)
      std::swap(LHS, RHS);
  }

  return X86CC != X86::COND_INVALID;
}

/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isUndefOrInRange - Op is either an undef node or a ConstantSDNode.  Return
/// true if Op is undef or if its value falls within the half-open range
/// [Low, Hi).
static bool isUndefOrInRange(SDOperand Op, unsigned Low, unsigned Hi) {
  if (Op.getOpcode() == ISD::UNDEF)
    return true;

  unsigned Val = cast<ConstantSDNode>(Op)->getValue();
  return (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Op is either an undef node or a ConstantSDNode.  Return
/// true if Op is undef or if its value equal to the specified value.
static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
  if (Op.getOpcode() == ISD::UNDEF)
    return true;
  return cast<ConstantSDNode>(Op)->getValue() == Val;
}

/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFD.
bool X86::isPSHUFDMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 4)
    return false;

  // Check if the value doesn't reference the second vector.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
      return false;
  }

  return true;
}

/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFHW.
bool X86::isPSHUFHWMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 8)
    return false;

  // Lower quadword copied in order.
  for (unsigned i = 0; i != 4; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    if (cast<ConstantSDNode>(Arg)->getValue() != i)
      return false;
  }

  // Upper quadword shuffled: elements 4-7 may only reference elements 4-7.
  for (unsigned i = 4; i != 8; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val < 4 || Val > 7)
      return false;
  }

  return true;
}

/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFLW.
bool X86::isPSHUFLWMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 8)
    return false;

  // Upper quadword copied in order.
  for (unsigned i = 4; i != 8; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i))
      return false;

  // Lower quadword shuffled: elements 0-3 may only reference elements 0-3.
  for (unsigned i = 0; i != 4; ++i)
    if (!isUndefOrInRange(N->getOperand(i), 0, 4))
      return false;

  return true;
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*:
/// lower half from vector 1, upper half from vector 2.
static bool isSHUFPMask(const SDOperand *Elems, unsigned NumElems) {
  if (NumElems != 2 && NumElems != 4) return false;

  unsigned Half = NumElems / 2;
  for (unsigned i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Elems[i], 0, NumElems))
      return false;
  for (unsigned i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Elems[i], NumElems, NumElems*2))
      return false;

  return true;
}

bool X86::isSHUFPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  return ::isSHUFPMask(N->op_begin(), N->getNumOperands());
}

/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
/// half elements to come from vector 1 (which would equal the dest.) and
/// the upper half to come from vector 2.
static bool isCommutedSHUFP(const SDOperand *Ops, unsigned NumOps) {
  if (NumOps != 2 && NumOps != 4) return false;

  unsigned Half = NumOps / 2;
  for (unsigned i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Ops[i], NumOps, NumOps*2))
      return false;
  for (unsigned i = Half; i < NumOps; ++i)
    if (!isUndefOrInRange(Ops[i], 0, NumOps))
      return false;
  return true;
}

static bool isCommutedSHUFP(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  return isCommutedSHUFP(N->op_begin(), N->getNumOperands());
}

/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 4)
    return false;

  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
  // i.e. the mask is <6, 7, 2, 3>.
  return isUndefOrEqual(N->getOperand(0), 6) &&
         isUndefOrEqual(N->getOperand(1), 7) &&
         isUndefOrEqual(N->getOperand(2), 2) &&
         isUndefOrEqual(N->getOperand(3), 3);
}

/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 4)
    return false;

  // Expect bit0 == 2, bit1 == 3, bit2 == 2, bit3 == 3
  return isUndefOrEqual(N->getOperand(0), 2) &&
         isUndefOrEqual(N->getOperand(1), 3) &&
         isUndefOrEqual(N->getOperand(2), 2) &&
         isUndefOrEqual(N->getOperand(3), 3);
}

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
bool X86::isMOVLPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  unsigned NumElems = N->getNumOperands();
  if (NumElems != 2 && NumElems != 4)
    return false;

  // Lower half comes from vector 2, upper half kept from vector 1.
  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i))
      return false;

  return true;
}

/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
/// and MOVLHPS.
bool X86::isMOVHPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  unsigned NumElems = N->getNumOperands();
  if (NumElems != 2 && NumElems != 4)
    return false;

  // Lower half kept from vector 1.
  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i))
      return false;

  // Upper half comes from the lower half of vector 2.
  for (unsigned i = 0; i < NumElems/2; ++i) {
    SDOperand Arg = N->getOperand(i + NumElems/2);
    if (!isUndefOrEqual(Arg, i + NumElems))
      return false;
  }

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
bool static isUNPCKLMask(const SDOperand *Elts, unsigned NumElts,
                         bool V2IsSplat = false) {
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  // UNPCKL interleaves: element pairs (j, j + NumElts) for j in the lower
  // half of each source.
  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
    SDOperand BitI = Elts[i];
    SDOperand BitI1 = Elts[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      // NOTE(review): this rejects when BitI1 is undef or equals NumElts,
      // while the non-splat branch *requires* equality — verify the
      // apparently missing '!' is intentional.
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }

  return true;
}

bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  return ::isUNPCKLMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
bool static isUNPCKHMask(const SDOperand *Elts, unsigned NumElts,
                         bool V2IsSplat = false) {
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  // UNPCKH interleaves the upper halves of the two sources.
  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
    SDOperand BitI = Elts[i];
    SDOperand BitI1 = Elts[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      // NOTE(review): same un-negated check as in isUNPCKLMask — verify.
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }

  return true;
}

bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  return ::isUNPCKHMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
bool X86::isUNPCKL_v_undef_Mask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  unsigned NumElems = N->getNumOperands();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  // Each pair of mask elements must both be j (the same lower-half element).
  for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
    SDOperand BitI = N->getOperand(i);
    SDOperand BitI1 = N->getOperand(i+1);

    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }

  return true;
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
bool X86::isUNPCKH_v_undef_Mask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  unsigned NumElems = N->getNumOperands();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  // Each pair of mask elements must both be j, starting from the upper half.
  for (unsigned i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    SDOperand BitI = N->getOperand(i);
    SDOperand BitI1 = N->getOperand(i + 1);

    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }

  return true;
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SDOperand *Elts, unsigned NumElts) {
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  // Element 0 comes from vector 2; all remaining elements are kept in place.
  if (!isUndefOrEqual(Elts[0], NumElts))
    return false;

  for (unsigned i = 1; i < NumElts; ++i) {
    if (!isUndefOrEqual(Elts[i], i))
      return false;
  }

  return true;
}

bool X86::isMOVLMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  return ::isMOVLMask(N->op_begin(), N->getNumOperands());
}

/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want.
X86 movs requires the lowest element to be lowest +/// element of vector 2 and the other elements to come from vector 1 in order. +static bool isCommutedMOVL(const SDOperand *Ops, unsigned NumOps, + bool V2IsSplat = false, + bool V2IsUndef = false) { + if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) + return false; + + if (!isUndefOrEqual(Ops[0], 0)) + return false; + + for (unsigned i = 1; i < NumOps; ++i) { + SDOperand Arg = Ops[i]; + if (!(isUndefOrEqual(Arg, i+NumOps) || + (V2IsUndef && isUndefOrInRange(Arg, NumOps, NumOps*2)) || + (V2IsSplat && isUndefOrEqual(Arg, NumOps)))) + return false; + } + + return true; +} + +static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false, + bool V2IsUndef = false) { + assert(N->getOpcode() == ISD::BUILD_VECTOR); + return isCommutedMOVL(N->op_begin(), N->getNumOperands(), + V2IsSplat, V2IsUndef); +} + +/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. +bool X86::isMOVSHDUPMask(SDNode *N) { + assert(N->getOpcode() == ISD::BUILD_VECTOR); + + if (N->getNumOperands() != 4) + return false; + + // Expect 1, 1, 3, 3 + for (unsigned i = 0; i < 2; ++i) { + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val != 1) return false; + } + + bool HasHi = false; + for (unsigned i = 2; i < 4; ++i) { + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val != 3) return false; + HasHi = true; + } + + // Don't use movshdup if it can be done with a shufps. 
+ return HasHi; +} + +/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. +bool X86::isMOVSLDUPMask(SDNode *N) { + assert(N->getOpcode() == ISD::BUILD_VECTOR); + + if (N->getNumOperands() != 4) + return false; + + // Expect 0, 0, 2, 2 + for (unsigned i = 0; i < 2; ++i) { + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val != 0) return false; + } + + bool HasHi = false; + for (unsigned i = 2; i < 4; ++i) { + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val != 2) return false; + HasHi = true; + } + + // Don't use movshdup if it can be done with a shufps. + return HasHi; +} + +/// isIdentityMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a identity operation on the LHS or RHS. +static bool isIdentityMask(SDNode *N, bool RHS = false) { + unsigned NumElems = N->getNumOperands(); + for (unsigned i = 0; i < NumElems; ++i) + if (!isUndefOrEqual(N->getOperand(i), i + (RHS ? NumElems : 0))) + return false; + return true; +} + +/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies +/// a splat of a single element. +static bool isSplatMask(SDNode *N) { + assert(N->getOpcode() == ISD::BUILD_VECTOR); + + // This is a splat operation if each element of the permute is the same, and + // if the value doesn't reference the second vector. 
+ unsigned NumElems = N->getNumOperands(); + SDOperand ElementBase; + unsigned i = 0; + for (; i != NumElems; ++i) { + SDOperand Elt = N->getOperand(i); + if (isa<ConstantSDNode>(Elt)) { + ElementBase = Elt; + break; + } + } + + if (!ElementBase.Val) + return false; + + for (; i != NumElems; ++i) { + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + if (Arg != ElementBase) return false; + } + + // Make sure it is a splat of the first vector operand. + return cast<ConstantSDNode>(ElementBase)->getValue() < NumElems; +} + +/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies +/// a splat of a single element and it's a 2 or 4 element mask. +bool X86::isSplatMask(SDNode *N) { + assert(N->getOpcode() == ISD::BUILD_VECTOR); + + // We can only splat 64-bit, and 32-bit quantities with a single instruction. + if (N->getNumOperands() != 4 && N->getNumOperands() != 2) + return false; + return ::isSplatMask(N); +} + +/// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a splat of zero element. +bool X86::isSplatLoMask(SDNode *N) { + assert(N->getOpcode() == ISD::BUILD_VECTOR); + + for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) + if (!isUndefOrEqual(N->getOperand(i), 0)) + return false; + return true; +} + +/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle +/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* +/// instructions. +unsigned X86::getShuffleSHUFImmediate(SDNode *N) { + unsigned NumOperands = N->getNumOperands(); + unsigned Shift = (NumOperands == 4) ? 
2 : 1; + unsigned Mask = 0; + for (unsigned i = 0; i < NumOperands; ++i) { + unsigned Val = 0; + SDOperand Arg = N->getOperand(NumOperands-i-1); + if (Arg.getOpcode() != ISD::UNDEF) + Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val >= NumOperands) Val -= NumOperands; + Mask |= Val; + if (i != NumOperands - 1) + Mask <<= Shift; + } + + return Mask; +} + +/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle +/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW +/// instructions. +unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { + unsigned Mask = 0; + // 8 nodes, but we only care about the last 4. + for (unsigned i = 7; i >= 4; --i) { + unsigned Val = 0; + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() != ISD::UNDEF) + Val = cast<ConstantSDNode>(Arg)->getValue(); + Mask |= (Val - 4); + if (i != 4) + Mask <<= 2; + } + + return Mask; +} + +/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle +/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW +/// instructions. +unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { + unsigned Mask = 0; + // 8 nodes, but we only care about the first 4. + for (int i = 3; i >= 0; --i) { + unsigned Val = 0; + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() != ISD::UNDEF) + Val = cast<ConstantSDNode>(Arg)->getValue(); + Mask |= Val; + if (i != 0) + Mask <<= 2; + } + + return Mask; +} + +/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand +/// specifies a 8 element shuffle that can be broken into a pair of +/// PSHUFHW and PSHUFLW. +static bool isPSHUFHW_PSHUFLWMask(SDNode *N) { + assert(N->getOpcode() == ISD::BUILD_VECTOR); + + if (N->getNumOperands() != 8) + return false; + + // Lower quadword shuffled. 
+ for (unsigned i = 0; i != 4; ++i) { + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val > 4) + return false; + } + + // Upper quadword shuffled. + for (unsigned i = 4; i != 8; ++i) { + SDOperand Arg = N->getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val < 4 || Val > 7) + return false; + } + + return true; +} + +/// CommuteVectorShuffle - Swap vector_shuffle operandsas well as +/// values in ther permute mask. +static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1, + SDOperand &V2, SDOperand &Mask, + SelectionDAG &DAG) { + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType MaskVT = Mask.getValueType(); + MVT::ValueType EltVT = MVT::getVectorElementType(MaskVT); + unsigned NumElems = Mask.getNumOperands(); + SmallVector<SDOperand, 8> MaskVec; + + for (unsigned i = 0; i != NumElems; ++i) { + SDOperand Arg = Mask.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) { + MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT)); + continue; + } + assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val < NumElems) + MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT)); + else + MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT)); + } + + std::swap(V1, V2); + Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask); +} + +/// ShouldXformToMOVHLPS - Return true if the node should be transformed to +/// match movhlps. The lower half elements should come from upper half of +/// V1 (and in order), and the upper half elements should come from the upper +/// half of V2 (and in order). 
+static bool ShouldXformToMOVHLPS(SDNode *Mask) { + unsigned NumElems = Mask->getNumOperands(); + if (NumElems != 4) + return false; + for (unsigned i = 0, e = 2; i != e; ++i) + if (!isUndefOrEqual(Mask->getOperand(i), i+2)) + return false; + for (unsigned i = 2; i != 4; ++i) + if (!isUndefOrEqual(Mask->getOperand(i), i+4)) + return false; + return true; +} + +/// isScalarLoadToVector - Returns true if the node is a scalar load that +/// is promoted to a vector. +static inline bool isScalarLoadToVector(SDNode *N) { + if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) { + N = N->getOperand(0).Val; + return ISD::isNON_EXTLoad(N); + } + return false; +} + +/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to +/// match movlp{s|d}. The lower half elements should come from lower half of +/// V1 (and in order), and the upper half elements should come from the upper +/// half of V2 (and in order). And since V1 will become the source of the +/// MOVLP, it must be either a vector load or a scalar load to vector. +static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, SDNode *Mask) { + if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) + return false; + // Is V2 is a vector load, don't do this transformation. We will try to use + // load folding shufps op. + if (ISD::isNON_EXTLoad(V2)) + return false; + + unsigned NumElems = Mask->getNumOperands(); + if (NumElems != 2 && NumElems != 4) + return false; + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Mask->getOperand(i), i)) + return false; + for (unsigned i = NumElems/2; i != NumElems; ++i) + if (!isUndefOrEqual(Mask->getOperand(i), i+NumElems)) + return false; + return true; +} + +/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are +/// all the same. 
+static bool isSplatVector(SDNode *N) { + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + SDOperand SplatValue = N->getOperand(0); + for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) + if (N->getOperand(i) != SplatValue) + return false; + return true; +} + +/// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved +/// to an undef. +static bool isUndefShuffle(SDNode *N) { + if (N->getOpcode() != ISD::VECTOR_SHUFFLE) + return false; + + SDOperand V1 = N->getOperand(0); + SDOperand V2 = N->getOperand(1); + SDOperand Mask = N->getOperand(2); + unsigned NumElems = Mask.getNumOperands(); + for (unsigned i = 0; i != NumElems; ++i) { + SDOperand Arg = Mask.getOperand(i); + if (Arg.getOpcode() != ISD::UNDEF) { + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val < NumElems && V1.getOpcode() != ISD::UNDEF) + return false; + else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF) + return false; + } + } + return true; +} + +/// isZeroNode - Returns true if Elt is a constant zero or a floating point +/// constant +0.0. +static inline bool isZeroNode(SDOperand Elt) { + return ((isa<ConstantSDNode>(Elt) && + cast<ConstantSDNode>(Elt)->getValue() == 0) || + (isa<ConstantFPSDNode>(Elt) && + cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0))); +} + +/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved +/// to an zero vector. 
+static bool isZeroShuffle(SDNode *N) { + if (N->getOpcode() != ISD::VECTOR_SHUFFLE) + return false; + + SDOperand V1 = N->getOperand(0); + SDOperand V2 = N->getOperand(1); + SDOperand Mask = N->getOperand(2); + unsigned NumElems = Mask.getNumOperands(); + for (unsigned i = 0; i != NumElems; ++i) { + SDOperand Arg = Mask.getOperand(i); + if (Arg.getOpcode() != ISD::UNDEF) { + unsigned Idx = cast<ConstantSDNode>(Arg)->getValue(); + if (Idx < NumElems) { + unsigned Opc = V1.Val->getOpcode(); + if (Opc == ISD::UNDEF) + continue; + if (Opc != ISD::BUILD_VECTOR || + !isZeroNode(V1.Val->getOperand(Idx))) + return false; + } else if (Idx >= NumElems) { + unsigned Opc = V2.Val->getOpcode(); + if (Opc == ISD::UNDEF) + continue; + if (Opc != ISD::BUILD_VECTOR || + !isZeroNode(V2.Val->getOperand(Idx - NumElems))) + return false; + } + } + } + return true; +} + +/// getZeroVector - Returns a vector of specified type with all zero elements. +/// +static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) { + assert(MVT::isVector(VT) && "Expected a vector type"); + unsigned NumElems = MVT::getVectorNumElements(VT); + MVT::ValueType EVT = MVT::getVectorElementType(VT); + bool isFP = MVT::isFloatingPoint(EVT); + SDOperand Zero = isFP ? DAG.getConstantFP(0.0, EVT) : DAG.getConstant(0, EVT); + SmallVector<SDOperand, 8> ZeroVec(NumElems, Zero); + return DAG.getNode(ISD::BUILD_VECTOR, VT, &ZeroVec[0], ZeroVec.size()); +} + +/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements +/// that point to V2 points to its first element. 
+static SDOperand NormalizeMask(SDOperand Mask, SelectionDAG &DAG) { + assert(Mask.getOpcode() == ISD::BUILD_VECTOR); + + bool Changed = false; + SmallVector<SDOperand, 8> MaskVec; + unsigned NumElems = Mask.getNumOperands(); + for (unsigned i = 0; i != NumElems; ++i) { + SDOperand Arg = Mask.getOperand(i); + if (Arg.getOpcode() != ISD::UNDEF) { + unsigned Val = cast<ConstantSDNode>(Arg)->getValue(); + if (Val > NumElems) { + Arg = DAG.getConstant(NumElems, Arg.getValueType()); + Changed = true; + } + } + MaskVec.push_back(Arg); + } + + if (Changed) + Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(), + &MaskVec[0], MaskVec.size()); + return Mask; +} + +/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDOperand getMOVLMask(unsigned NumElems, SelectionDAG &DAG) { + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); + MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT); + + SmallVector<SDOperand, 8> MaskVec; + MaskVec.push_back(DAG.getConstant(NumElems, BaseVT)); + for (unsigned i = 1; i != NumElems; ++i) + MaskVec.push_back(DAG.getConstant(i, BaseVT)); + return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); +} + +/// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation +/// of specified width. +static SDOperand getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) { + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); + MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT); + SmallVector<SDOperand, 8> MaskVec; + for (unsigned i = 0, e = NumElems/2; i != e; ++i) { + MaskVec.push_back(DAG.getConstant(i, BaseVT)); + MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT)); + } + return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); +} + +/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation +/// of specified width. 
+static SDOperand getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) { + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); + MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT); + unsigned Half = NumElems/2; + SmallVector<SDOperand, 8> MaskVec; + for (unsigned i = 0; i != Half; ++i) { + MaskVec.push_back(DAG.getConstant(i + Half, BaseVT)); + MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT)); + } + return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size()); +} + +/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4i32. +/// +static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) { + SDOperand V1 = Op.getOperand(0); + SDOperand Mask = Op.getOperand(2); + MVT::ValueType VT = Op.getValueType(); + unsigned NumElems = Mask.getNumOperands(); + Mask = getUnpacklMask(NumElems, DAG); + while (NumElems != 4) { + V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask); + NumElems >>= 1; + } + V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1); + + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4); + Mask = getZeroVector(MaskVT, DAG); + SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1, + DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask); + return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle); +} + +/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified +/// vector of zero or undef vector. +static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT, + unsigned NumElems, unsigned Idx, + bool isZero, SelectionDAG &DAG) { + SDOperand V1 = isZero ? 
getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT); + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); + MVT::ValueType EVT = MVT::getVectorElementType(MaskVT); + SDOperand Zero = DAG.getConstant(0, EVT); + SmallVector<SDOperand, 8> MaskVec(NumElems, Zero); + MaskVec[Idx] = DAG.getConstant(NumElems, EVT); + SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &MaskVec[0], MaskVec.size()); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask); +} + +/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. +/// +static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, TargetLowering &TLI) { + if (NumNonZero > 8) + return SDOperand(); + + SDOperand V(0, 0); + bool First = true; + for (unsigned i = 0; i < 16; ++i) { + bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; + if (ThisIsNonZero && First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, DAG); + else + V = DAG.getNode(ISD::UNDEF, MVT::v8i16); + First = false; + } + + if ((i & 1) != 0) { + SDOperand ThisElt(0, 0), LastElt(0, 0); + bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + if (LastIsNonZero) { + LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1)); + } + if (ThisIsNonZero) { + ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i)); + ThisElt = DAG.getNode(ISD::SHL, MVT::i16, + ThisElt, DAG.getConstant(8, MVT::i8)); + if (LastIsNonZero) + ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt); + } else + ThisElt = LastElt; + + if (ThisElt.Val) + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt, + DAG.getConstant(i/2, TLI.getPointerTy())); + } + } + + return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V); +} + +/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
+/// +static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, TargetLowering &TLI) { + if (NumNonZero > 4) + return SDOperand(); + + SDOperand V(0, 0); + bool First = true; + for (unsigned i = 0; i < 8; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, DAG); + else + V = DAG.getNode(ISD::UNDEF, MVT::v8i16); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Op.getOperand(i), + DAG.getConstant(i, TLI.getPointerTy())); + } + } + + return V; +} + +SDOperand +X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) { + // All zero's are handled with pxor. + if (ISD::isBuildVectorAllZeros(Op.Val)) + return Op; + + // All one's are handled with pcmpeqd. + if (ISD::isBuildVectorAllOnes(Op.Val)) + return Op; + + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType EVT = MVT::getVectorElementType(VT); + unsigned EVTBits = MVT::getSizeInBits(EVT); + + unsigned NumElems = Op.getNumOperands(); + unsigned NumZero = 0; + unsigned NumNonZero = 0; + unsigned NonZeros = 0; + std::set<SDOperand> Values; + for (unsigned i = 0; i < NumElems; ++i) { + SDOperand Elt = Op.getOperand(i); + if (Elt.getOpcode() != ISD::UNDEF) { + Values.insert(Elt); + if (isZeroNode(Elt)) + NumZero++; + else { + NonZeros |= (1 << i); + NumNonZero++; + } + } + } + + if (NumNonZero == 0) { + if (NumZero == 0) + // All undef vector. Return an UNDEF. + return DAG.getNode(ISD::UNDEF, VT); + else + // A mix of zero and undef. Return a zero vector. + return getZeroVector(VT, DAG); + } + + // Splat is obviously ok. Let legalizer expand it to a shuffle. + if (Values.size() == 1) + return SDOperand(); + + // Special case for single non-zero element. 
+ if (NumNonZero == 1) { + unsigned Idx = CountTrailingZeros_32(NonZeros); + SDOperand Item = Op.getOperand(Idx); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item); + if (Idx == 0) + // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. + return getShuffleVectorZeroOrUndef(Item, VT, NumElems, Idx, + NumZero > 0, DAG); + + if (EVTBits == 32) { + // Turn it into a shuffle of zero and zero-extended scalar to vector. + Item = getShuffleVectorZeroOrUndef(Item, VT, NumElems, 0, NumZero > 0, + DAG); + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); + MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT); + SmallVector<SDOperand, 8> MaskVec; + for (unsigned i = 0; i < NumElems; i++) + MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT)); + SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &MaskVec[0], MaskVec.size()); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item, + DAG.getNode(ISD::UNDEF, VT), Mask); + } + } + + // Let legalizer expand 2-wide build_vectors. + if (EVTBits == 64) + return SDOperand(); + + // If element VT is < 32 bits, convert it to inserts into a zero vector. + if (EVTBits == 8 && NumElems == 16) { + SDOperand V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, + *this); + if (V.Val) return V; + } + + if (EVTBits == 16 && NumElems == 8) { + SDOperand V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, + *this); + if (V.Val) return V; + } + + // If element VT is == 32 bits, turn it into a number of shuffles. + SmallVector<SDOperand, 8> V; + V.resize(NumElems); + if (NumElems == 4 && NumZero > 0) { + for (unsigned i = 0; i < 4; ++i) { + bool isZero = !(NonZeros & (1 << i)); + if (isZero) + V[i] = getZeroVector(VT, DAG); + else + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i)); + } + + for (unsigned i = 0; i < 2; ++i) { + switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { + default: break; + case 0: + V[i] = V[i*2]; // Must be a zero vector. 
+ break; + case 1: + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2], + getMOVLMask(NumElems, DAG)); + break; + case 2: + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1], + getMOVLMask(NumElems, DAG)); + break; + case 3: + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1], + getUnpacklMask(NumElems, DAG)); + break; + } + } + + // Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd) + // clears the upper bits. + // FIXME: we can do the same for v4f32 case when we know both parts of + // the lower half come from scalar_to_vector (loadf32). We should do + // that in post legalizer dag combiner with target specific hooks. + if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0) + return V[0]; + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); + MVT::ValueType EVT = MVT::getVectorElementType(MaskVT); + SmallVector<SDOperand, 8> MaskVec; + bool Reverse = (NonZeros & 0x3) == 2; + for (unsigned i = 0; i < 2; ++i) + if (Reverse) + MaskVec.push_back(DAG.getConstant(1-i, EVT)); + else + MaskVec.push_back(DAG.getConstant(i, EVT)); + Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; + for (unsigned i = 0; i < 2; ++i) + if (Reverse) + MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT)); + else + MaskVec.push_back(DAG.getConstant(i+NumElems, EVT)); + SDOperand ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &MaskVec[0], MaskVec.size()); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask); + } + + if (Values.size() > 2) { + // Expand into a number of unpckl*. + // e.g. 
for v4f32 + // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> + // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> + // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> + SDOperand UnpckMask = getUnpacklMask(NumElems, DAG); + for (unsigned i = 0; i < NumElems; ++i) + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i)); + NumElems >>= 1; + while (NumElems != 0) { + for (unsigned i = 0; i < NumElems; ++i) + V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems], + UnpckMask); + NumElems >>= 1; + } + return V[0]; + } + + return SDOperand(); +} + +SDOperand +X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) { + SDOperand V1 = Op.getOperand(0); + SDOperand V2 = Op.getOperand(1); + SDOperand PermMask = Op.getOperand(2); + MVT::ValueType VT = Op.getValueType(); + unsigned NumElems = PermMask.getNumOperands(); + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsSplat = false; + bool V2IsSplat = false; + + if (isUndefShuffle(Op.Val)) + return DAG.getNode(ISD::UNDEF, VT); + + if (isZeroShuffle(Op.Val)) + return getZeroVector(VT, DAG); + + if (isIdentityMask(PermMask.Val)) + return V1; + else if (isIdentityMask(PermMask.Val, true)) + return V2; + + if (isSplatMask(PermMask.Val)) { + if (NumElems <= 4) return Op; + // Promote it to a v4i32 splat. + return PromoteSplat(Op, DAG); + } + + if (X86::isMOVLMask(PermMask.Val)) + return (V1IsUndef) ? 
V2 : Op; + + if (X86::isMOVSHDUPMask(PermMask.Val) || + X86::isMOVSLDUPMask(PermMask.Val) || + X86::isMOVHLPSMask(PermMask.Val) || + X86::isMOVHPMask(PermMask.Val) || + X86::isMOVLPMask(PermMask.Val)) + return Op; + + if (ShouldXformToMOVHLPS(PermMask.Val) || + ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val)) + return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG); + + bool Commuted = false; + V1IsSplat = isSplatVector(V1.Val); + V2IsSplat = isSplatVector(V2.Val); + if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { + Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG); + std::swap(V1IsSplat, V2IsSplat); + std::swap(V1IsUndef, V2IsUndef); + Commuted = true; + } + + if (isCommutedMOVL(PermMask.Val, V2IsSplat, V2IsUndef)) { + if (V2IsUndef) return V1; + Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG); + if (V2IsSplat) { + // V2 is a splat, so the mask may be malformed. That is, it may point + // to any V2 element. The instruction selectior won't like this. Get + // a corrected mask and commute to form a proper MOVS{S|D}. + SDOperand NewMask = getMOVLMask(NumElems, DAG); + if (NewMask.Val != PermMask.Val) + Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask); + } + return Op; + } + + if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) || + X86::isUNPCKH_v_undef_Mask(PermMask.Val) || + X86::isUNPCKLMask(PermMask.Val) || + X86::isUNPCKHMask(PermMask.Val)) + return Op; + + if (V2IsSplat) { + // Normalize mask so all entries that point to V2 points to its first + // element then try to match unpck{h|l} again. If match, return a + // new vector_shuffle with the corrected mask. 
+ SDOperand NewMask = NormalizeMask(PermMask, DAG); + if (NewMask.Val != PermMask.Val) { + if (X86::isUNPCKLMask(PermMask.Val, true)) { + SDOperand NewMask = getUnpacklMask(NumElems, DAG); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask); + } else if (X86::isUNPCKHMask(PermMask.Val, true)) { + SDOperand NewMask = getUnpackhMask(NumElems, DAG); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask); + } + } + } + + // Normalize the node to match x86 shuffle ops if needed + if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.Val)) + Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG); + + if (Commuted) { + // Commute is back and try unpck* again. + Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG); + if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) || + X86::isUNPCKH_v_undef_Mask(PermMask.Val) || + X86::isUNPCKLMask(PermMask.Val) || + X86::isUNPCKHMask(PermMask.Val)) + return Op; + } + + // If VT is integer, try PSHUF* first, then SHUFP*. + if (MVT::isInteger(VT)) { + if (X86::isPSHUFDMask(PermMask.Val) || + X86::isPSHUFHWMask(PermMask.Val) || + X86::isPSHUFLWMask(PermMask.Val)) { + if (V2.getOpcode() != ISD::UNDEF) + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, + DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask); + return Op; + } + + if (X86::isSHUFPMask(PermMask.Val) && + MVT::getSizeInBits(VT) != 64) // Don't do this for MMX. + return Op; + + // Handle v8i16 shuffle high / low shuffle node pair. 
+ if (VT == MVT::v8i16 && isPSHUFHW_PSHUFLWMask(PermMask.Val)) { + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems); + MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT); + SmallVector<SDOperand, 8> MaskVec; + for (unsigned i = 0; i != 4; ++i) + MaskVec.push_back(PermMask.getOperand(i)); + for (unsigned i = 4; i != 8; ++i) + MaskVec.push_back(DAG.getConstant(i, BaseVT)); + SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &MaskVec[0], MaskVec.size()); + V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask); + MaskVec.clear(); + for (unsigned i = 0; i != 4; ++i) + MaskVec.push_back(DAG.getConstant(i, BaseVT)); + for (unsigned i = 4; i != 8; ++i) + MaskVec.push_back(PermMask.getOperand(i)); + Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0],MaskVec.size()); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask); + } + } else { + // Floating point cases in the other order. + if (X86::isSHUFPMask(PermMask.Val)) + return Op; + if (X86::isPSHUFDMask(PermMask.Val) || + X86::isPSHUFHWMask(PermMask.Val) || + X86::isPSHUFLWMask(PermMask.Val)) { + if (V2.getOpcode() != ISD::UNDEF) + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, + DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask); + return Op; + } + } + + if (NumElems == 4 && + // Don't do this for MMX. + MVT::getSizeInBits(VT) != 64) { + MVT::ValueType MaskVT = PermMask.getValueType(); + MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT); + SmallVector<std::pair<int, int>, 8> Locs; + Locs.reserve(NumElems); + SmallVector<SDOperand, 8> Mask1(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT)); + SmallVector<SDOperand, 8> Mask2(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT)); + unsigned NumHi = 0; + unsigned NumLo = 0; + // If no more than two elements come from either vector. This can be + // implemented with two shuffles. First shuffle gather the elements. 
+ // The second shuffle, which takes the first shuffle as both of its + // vector operands, put the elements into the right order. + for (unsigned i = 0; i != NumElems; ++i) { + SDOperand Elt = PermMask.getOperand(i); + if (Elt.getOpcode() == ISD::UNDEF) { + Locs[i] = std::make_pair(-1, -1); + } else { + unsigned Val = cast<ConstantSDNode>(Elt)->getValue(); + if (Val < NumElems) { + Locs[i] = std::make_pair(0, NumLo); + Mask1[NumLo] = Elt; + NumLo++; + } else { + Locs[i] = std::make_pair(1, NumHi); + if (2+NumHi < NumElems) + Mask1[2+NumHi] = Elt; + NumHi++; + } + } + } + if (NumLo <= 2 && NumHi <= 2) { + V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &Mask1[0], Mask1.size())); + for (unsigned i = 0; i != NumElems; ++i) { + if (Locs[i].first == -1) + continue; + else { + unsigned Idx = (i < NumElems/2) ? 0 : NumElems; + Idx += Locs[i].first * (NumElems/2) + Locs[i].second; + Mask2[i] = DAG.getConstant(Idx, MaskEVT); + } + } + + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &Mask2[0], Mask2.size())); + } + + // Break it into (shuffle shuffle_hi, shuffle_lo). 
+ Locs.clear(); + SmallVector<SDOperand,8> LoMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT)); + SmallVector<SDOperand,8> HiMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT)); + SmallVector<SDOperand,8> *MaskPtr = &LoMask; + unsigned MaskIdx = 0; + unsigned LoIdx = 0; + unsigned HiIdx = NumElems/2; + for (unsigned i = 0; i != NumElems; ++i) { + if (i == NumElems/2) { + MaskPtr = &HiMask; + MaskIdx = 1; + LoIdx = 0; + HiIdx = NumElems/2; + } + SDOperand Elt = PermMask.getOperand(i); + if (Elt.getOpcode() == ISD::UNDEF) { + Locs[i] = std::make_pair(-1, -1); + } else if (cast<ConstantSDNode>(Elt)->getValue() < NumElems) { + Locs[i] = std::make_pair(MaskIdx, LoIdx); + (*MaskPtr)[LoIdx] = Elt; + LoIdx++; + } else { + Locs[i] = std::make_pair(MaskIdx, HiIdx); + (*MaskPtr)[HiIdx] = Elt; + HiIdx++; + } + } + + SDOperand LoShuffle = + DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &LoMask[0], LoMask.size())); + SDOperand HiShuffle = + DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &HiMask[0], HiMask.size())); + SmallVector<SDOperand, 8> MaskOps; + for (unsigned i = 0; i != NumElems; ++i) { + if (Locs[i].first == -1) { + MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT)); + } else { + unsigned Idx = Locs[i].first * NumElems + Locs[i].second; + MaskOps.push_back(DAG.getConstant(Idx, MaskEVT)); + } + } + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &MaskOps[0], MaskOps.size())); + } + + return SDOperand(); +} + +SDOperand +X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { + if (!isa<ConstantSDNode>(Op.getOperand(1))) + return SDOperand(); + + MVT::ValueType VT = Op.getValueType(); + // TODO: handle v16i8. + if (MVT::getSizeInBits(VT) == 16) { + // Transform it so it match pextrw which produces a 32-bit result. 
+ MVT::ValueType EVT = (MVT::ValueType)(VT+1); + SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT, + Op.getOperand(0), Op.getOperand(1)); + SDOperand Assert = DAG.getNode(ISD::AssertZext, EVT, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, VT, Assert); + } else if (MVT::getSizeInBits(VT) == 32) { + SDOperand Vec = Op.getOperand(0); + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue(); + if (Idx == 0) + return Op; + // SHUFPS the element to the lowest double word, then movss. + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4); + SmallVector<SDOperand, 8> IdxVec; + IdxVec.push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT))); + IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT))); + IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT))); + IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT))); + SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &IdxVec[0], IdxVec.size()); + Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(), + Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec, + DAG.getConstant(0, getPointerTy())); + } else if (MVT::getSizeInBits(VT) == 64) { + SDOperand Vec = Op.getOperand(0); + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue(); + if (Idx == 0) + return Op; + + // UNPCKHPD the element to the lowest double word, then movsd. + // Note if the lower 64 bits of the result of the UNPCKHPD is then stored + // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4); + SmallVector<SDOperand, 8> IdxVec; + IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT))); + IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT))); + SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &IdxVec[0], IdxVec.size()); + Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(), + Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec, + DAG.getConstant(0, getPointerTy())); + } + + return SDOperand(); +} + +SDOperand +X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) { + // Transform it so it match pinsrw which expects a 16-bit value in a GR32 + // as its second argument. + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType BaseVT = MVT::getVectorElementType(VT); + SDOperand N0 = Op.getOperand(0); + SDOperand N1 = Op.getOperand(1); + SDOperand N2 = Op.getOperand(2); + if (MVT::getSizeInBits(BaseVT) == 16) { + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getConstant(cast<ConstantSDNode>(N2)->getValue(),getPointerTy()); + return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2); + } else if (MVT::getSizeInBits(BaseVT) == 32) { + unsigned Idx = cast<ConstantSDNode>(N2)->getValue(); + if (Idx == 0) { + // Use a movss. + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, N1); + MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4); + MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT); + SmallVector<SDOperand, 8> MaskVec; + MaskVec.push_back(DAG.getConstant(4, BaseVT)); + for (unsigned i = 1; i <= 3; ++i) + MaskVec.push_back(DAG.getConstant(i, BaseVT)); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, N0, N1, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, + &MaskVec[0], MaskVec.size())); + } else { + // Use two pinsrw instructions to insert a 32 bit value. 
+ Idx <<= 1; + if (MVT::isFloatingPoint(N1.getValueType())) { + if (ISD::isNON_EXTLoad(N1.Val)) { + // Just load directly from f32mem to GR32. + LoadSDNode *LD = cast<LoadSDNode>(N1); + N1 = DAG.getLoad(MVT::i32, LD->getChain(), LD->getBasePtr(), + LD->getSrcValue(), LD->getSrcValueOffset()); + } else { + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4f32, N1); + N1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, N1); + N1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, N1, + DAG.getConstant(0, getPointerTy())); + } + } + N0 = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, N0); + N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1, + DAG.getConstant(Idx, getPointerTy())); + N1 = DAG.getNode(ISD::SRL, MVT::i32, N1, DAG.getConstant(16, MVT::i8)); + N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1, + DAG.getConstant(Idx+1, getPointerTy())); + return DAG.getNode(ISD::BIT_CONVERT, VT, N0); + } + } + + return SDOperand(); +} + +SDOperand +X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) { + SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0)); + return DAG.getNode(X86ISD::S2VEC, Op.getValueType(), AnyExt); +} + +// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as +// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is +// one of the above mentioned nodes. It has to be wrapped because otherwise +// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only +// be used to form addressing mode. These wrapped nodes will be selected +// into MOV32ri. +SDOperand +X86TargetLowering::LowerConstantPool(SDOperand Op, SelectionDAG &DAG) { + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + SDOperand Result = DAG.getTargetConstantPool(CP->getConstVal(), + getPointerTy(), + CP->getAlignment()); + Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + // With PIC, the address is actually $g + Offset. 
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + !Subtarget->isPICStyleRIPRel()) { + Result = DAG.getNode(ISD::ADD, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result); + } + + return Result; +} + +SDOperand +X86TargetLowering::LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) { + GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + SDOperand Result = DAG.getTargetGlobalAddress(GV, getPointerTy()); + Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + // With PIC, the address is actually $g + Offset. + if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + !Subtarget->isPICStyleRIPRel()) { + Result = DAG.getNode(ISD::ADD, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result); + } + + // For Darwin & Mingw32, external and weak symbols are indirect, so we want to + // load the value at address GV, not the value of GV itself. This means that + // the GlobalAddress must be in the base or index register of the address, not + // the GV offset field. 
Platform check is inside GVRequiresExtraLoad() call + // The same applies for external symbols during PIC codegen + if (Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false)) + Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Result, NULL, 0); + + return Result; +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model +static SDOperand +LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const MVT::ValueType PtrVT) { + SDOperand InFlag; + SDOperand Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, + PtrVT), InFlag); + InFlag = Chain.getValue(1); + + // emit leal symbol@TLSGD(,%ebx,1), %eax + SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag); + SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), + GA->getValueType(0), + GA->getOffset()); + SDOperand Ops[] = { Chain, TGA, InFlag }; + SDOperand Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 3); + InFlag = Result.getValue(2); + Chain = Result.getValue(1); + + // call ___tls_get_addr. This function receives its argument in + // the register EAX. + Chain = DAG.getCopyToReg(Chain, X86::EAX, Result, InFlag); + InFlag = Chain.getValue(1); + + NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SDOperand Ops1[] = { Chain, + DAG.getTargetExternalSymbol("___tls_get_addr", + PtrVT), + DAG.getRegister(X86::EAX, PtrVT), + DAG.getRegister(X86::EBX, PtrVT), + InFlag }; + Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 5); + InFlag = Chain.getValue(1); + + return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag); +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or +// "local exec" model. 
+static SDOperand +LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const MVT::ValueType PtrVT) { + // Get the Thread Pointer + SDOperand ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT); + // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial + // exec) + SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), + GA->getValueType(0), + GA->getOffset()); + SDOperand Offset = DAG.getNode(X86ISD::Wrapper, PtrVT, TGA); + + if (GA->getGlobal()->isDeclaration()) // initial exec TLS model + Offset = DAG.getLoad(PtrVT, DAG.getEntryNode(), Offset, NULL, 0); + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, PtrVT, ThreadPointer, Offset); +} + +SDOperand +X86TargetLowering::LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) { + // TODO: implement the "local dynamic" model + // TODO: implement the "initial exec"model for pic executables + assert(!Subtarget->is64Bit() && Subtarget->isTargetELF() && + "TLS not implemented for non-ELF and 64-bit targets"); + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + // If the relocation model is PIC, use the "General Dynamic" TLS Model, + // otherwise use the "Local Exec"TLS Model + if (getTargetMachine().getRelocationModel() == Reloc::PIC_) + return LowerToTLSGeneralDynamicModel(GA, DAG, getPointerTy()); + else + return LowerToTLSExecModel(GA, DAG, getPointerTy()); +} + +SDOperand +X86TargetLowering::LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG) { + const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); + SDOperand Result = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + // With PIC, the address is actually $g + Offset. 
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + !Subtarget->isPICStyleRIPRel()) { + Result = DAG.getNode(ISD::ADD, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result); + } + + return Result; +} + +SDOperand X86TargetLowering::LowerJumpTable(SDOperand Op, SelectionDAG &DAG) { + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + SDOperand Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()); + Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result); + // With PIC, the address is actually $g + Offset. + if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + !Subtarget->isPICStyleRIPRel()) { + Result = DAG.getNode(ISD::ADD, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()), + Result); + } + + return Result; +} + +SDOperand X86TargetLowering::LowerShift(SDOperand Op, SelectionDAG &DAG) { + assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 && + "Not an i64 shift!"); + bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; + SDOperand ShOpLo = Op.getOperand(0); + SDOperand ShOpHi = Op.getOperand(1); + SDOperand ShAmt = Op.getOperand(2); + SDOperand Tmp1 = isSRA ? + DAG.getNode(ISD::SRA, MVT::i32, ShOpHi, DAG.getConstant(31, MVT::i8)) : + DAG.getConstant(0, MVT::i32); + + SDOperand Tmp2, Tmp3; + if (Op.getOpcode() == ISD::SHL_PARTS) { + Tmp2 = DAG.getNode(X86ISD::SHLD, MVT::i32, ShOpHi, ShOpLo, ShAmt); + Tmp3 = DAG.getNode(ISD::SHL, MVT::i32, ShOpLo, ShAmt); + } else { + Tmp2 = DAG.getNode(X86ISD::SHRD, MVT::i32, ShOpLo, ShOpHi, ShAmt); + Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, MVT::i32, ShOpHi, ShAmt); + } + + const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag); + SDOperand AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt, + DAG.getConstant(32, MVT::i8)); + SDOperand COps[]={DAG.getEntryNode(), AndNode, DAG.getConstant(0, MVT::i8)}; + SDOperand InFlag = DAG.getNode(X86ISD::CMP, VTs, 2, COps, 3).getValue(1); + + SDOperand Hi, Lo; + SDOperand CC = DAG.getConstant(X86::COND_NE, MVT::i8); + + VTs = DAG.getNodeValueTypes(MVT::i32, MVT::Flag); + SmallVector<SDOperand, 4> Ops; + if (Op.getOpcode() == ISD::SHL_PARTS) { + Ops.push_back(Tmp2); + Ops.push_back(Tmp3); + Ops.push_back(CC); + Ops.push_back(InFlag); + Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size()); + InFlag = Hi.getValue(1); + + Ops.clear(); + Ops.push_back(Tmp3); + Ops.push_back(Tmp1); + Ops.push_back(CC); + Ops.push_back(InFlag); + Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size()); + } else { + Ops.push_back(Tmp2); + Ops.push_back(Tmp3); + Ops.push_back(CC); + Ops.push_back(InFlag); + Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size()); + InFlag = Lo.getValue(1); + + Ops.clear(); + Ops.push_back(Tmp3); + Ops.push_back(Tmp1); + Ops.push_back(CC); + Ops.push_back(InFlag); + Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size()); + } + + VTs = DAG.getNodeValueTypes(MVT::i32, MVT::i32); + Ops.clear(); + Ops.push_back(Lo); + Ops.push_back(Hi); + return DAG.getNode(ISD::MERGE_VALUES, VTs, 2, &Ops[0], Ops.size()); +} + +SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) { + assert(Op.getOperand(0).getValueType() <= MVT::i64 && + Op.getOperand(0).getValueType() >= MVT::i16 && + "Unknown SINT_TO_FP to lower!"); + + SDOperand Result; + MVT::ValueType SrcVT = Op.getOperand(0).getValueType(); + unsigned Size = MVT::getSizeInBits(SrcVT)/8; + MachineFunction &MF = DAG.getMachineFunction(); + int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); + SDOperand StackSlot = 
DAG.getFrameIndex(SSFI, getPointerTy()); + SDOperand Chain = DAG.getStore(DAG.getEntryNode(), Op.getOperand(0), + StackSlot, NULL, 0); + + // Build the FILD + SDVTList Tys; + if (X86ScalarSSE) + Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); + else + Tys = DAG.getVTList(Op.getValueType(), MVT::Other); + SmallVector<SDOperand, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(StackSlot); + Ops.push_back(DAG.getValueType(SrcVT)); + Result = DAG.getNode(X86ScalarSSE ? X86ISD::FILD_FLAG :X86ISD::FILD, + Tys, &Ops[0], Ops.size()); + + if (X86ScalarSSE) { + Chain = Result.getValue(1); + SDOperand InFlag = Result.getValue(2); + + // FIXME: Currently the FST is flagged to the FILD_FLAG. This + // shouldn't be necessary except that RFP cannot be live across + // multiple blocks. When stackifier is fixed, they can be uncoupled. + MachineFunction &MF = DAG.getMachineFunction(); + int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); + SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + Tys = DAG.getVTList(MVT::Other); + SmallVector<SDOperand, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(Result); + Ops.push_back(StackSlot); + Ops.push_back(DAG.getValueType(Op.getValueType())); + Ops.push_back(InFlag); + Chain = DAG.getNode(X86ISD::FST, Tys, &Ops[0], Ops.size()); + Result = DAG.getLoad(Op.getValueType(), Chain, StackSlot, NULL, 0); + } + + return Result; +} + +SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) { + assert(Op.getValueType() <= MVT::i64 && Op.getValueType() >= MVT::i16 && + "Unknown FP_TO_SINT to lower!"); + // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary + // stack slot. 
+ MachineFunction &MF = DAG.getMachineFunction(); + unsigned MemSize = MVT::getSizeInBits(Op.getValueType())/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); + SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + + unsigned Opc; + switch (Op.getValueType()) { + default: assert(0 && "Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } + + SDOperand Chain = DAG.getEntryNode(); + SDOperand Value = Op.getOperand(0); + if (X86ScalarSSE) { + assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!"); + Chain = DAG.getStore(Chain, Value, StackSlot, NULL, 0); + SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); + SDOperand Ops[] = { + Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) + }; + Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3); + Chain = Value.getValue(1); + SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); + StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + } + + // Build the FP_TO_INT*_IN_MEM + SDOperand Ops[] = { Chain, Value, StackSlot }; + SDOperand FIST = DAG.getNode(Opc, MVT::Other, Ops, 3); + + // Load the result. 
+ return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0); +} + +SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) { + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType EltVT = VT; + if (MVT::isVector(VT)) + EltVT = MVT::getVectorElementType(VT); + const Type *OpNTy = MVT::getTypeForValueType(EltVT); + std::vector<Constant*> CV; + if (EltVT == MVT::f64) { + Constant *C = ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63))); + CV.push_back(C); + CV.push_back(C); + } else { + Constant *C = ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31))); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + } + Constant *CS = ConstantStruct::get(CV); + SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SmallVector<SDOperand, 3> Ops; + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(CPIdx); + Ops.push_back(DAG.getSrcValue(NULL)); + SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask); +} + +SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) { + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType EltVT = VT; + if (MVT::isVector(VT)) + EltVT = MVT::getVectorElementType(VT); + const Type *OpNTy = MVT::getTypeForValueType(EltVT); + std::vector<Constant*> CV; + if (EltVT == MVT::f64) { + Constant *C = ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63)); + CV.push_back(C); + CV.push_back(C); + } else { + Constant *C = ConstantFP::get(OpNTy, BitsToFloat(1U << 31)); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + } + Constant *CS = ConstantStruct::get(CV); + SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SmallVector<SDOperand, 3> Ops; + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(CPIdx); + Ops.push_back(DAG.getSrcValue(NULL)); + SDOperand Mask = 
DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask); +} + +SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) { + SDOperand Op0 = Op.getOperand(0); + SDOperand Op1 = Op.getOperand(1); + MVT::ValueType VT = Op.getValueType(); + MVT::ValueType SrcVT = Op1.getValueType(); + const Type *SrcTy = MVT::getTypeForValueType(SrcVT); + + // If second operand is smaller, extend it first. + if (MVT::getSizeInBits(SrcVT) < MVT::getSizeInBits(VT)) { + Op1 = DAG.getNode(ISD::FP_EXTEND, VT, Op1); + SrcVT = VT; + } + + // First get the sign bit of second operand. + std::vector<Constant*> CV; + if (SrcVT == MVT::f64) { + CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(1ULL << 63))); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + } else { + CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(1U << 31))); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + } + Constant *CS = ConstantStruct::get(CV); + SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); + SDVTList Tys = DAG.getVTList(SrcVT, MVT::Other); + SmallVector<SDOperand, 3> Ops; + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(CPIdx); + Ops.push_back(DAG.getSrcValue(NULL)); + SDOperand Mask1 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); + SDOperand SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1); + + // Shift sign bit right or left if the two operands have different types. + if (MVT::getSizeInBits(SrcVT) > MVT::getSizeInBits(VT)) { + // Op0 is MVT::f32, Op1 is MVT::f64. 
+ SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2f64, SignBit); + SignBit = DAG.getNode(X86ISD::FSRL, MVT::v2f64, SignBit, + DAG.getConstant(32, MVT::i32)); + SignBit = DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, SignBit); + SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f32, SignBit, + DAG.getConstant(0, getPointerTy())); + } + + // Clear first operand sign bit. + CV.clear(); + if (VT == MVT::f64) { + CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(~(1ULL << 63)))); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + } else { + CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(~(1U << 31)))); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + CV.push_back(ConstantFP::get(SrcTy, 0.0)); + } + CS = ConstantStruct::get(CV); + CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); + Tys = DAG.getVTList(VT, MVT::Other); + Ops.clear(); + Ops.push_back(DAG.getEntryNode()); + Ops.push_back(CPIdx); + Ops.push_back(DAG.getSrcValue(NULL)); + SDOperand Mask2 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size()); + SDOperand Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2); + + // Or the value with the sign bit. 
+ return DAG.getNode(X86ISD::FOR, VT, Val, SignBit); +} + +SDOperand X86TargetLowering::LowerSETCC(SDOperand Op, SelectionDAG &DAG, + SDOperand Chain) { + assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); + SDOperand Cond; + SDOperand Op0 = Op.getOperand(0); + SDOperand Op1 = Op.getOperand(1); + SDOperand CC = Op.getOperand(2); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + const MVT::ValueType *VTs1 = DAG.getNodeValueTypes(MVT::Other, MVT::Flag); + const MVT::ValueType *VTs2 = DAG.getNodeValueTypes(MVT::i8, MVT::Flag); + bool isFP = MVT::isFloatingPoint(Op.getOperand(1).getValueType()); + unsigned X86CC; + + if (translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC, + Op0, Op1, DAG)) { + SDOperand Ops1[] = { Chain, Op0, Op1 }; + Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, Ops1, 3).getValue(1); + SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond }; + return DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2); + } + + assert(isFP && "Illegal integer SetCC!"); + + SDOperand COps[] = { Chain, Op0, Op1 }; + Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, COps, 3).getValue(1); + + switch (SetCCOpcode) { + default: assert(false && "Illegal floating point SetCC!"); + case ISD::SETOEQ: { // !PF & ZF + SDOperand Ops1[] = { DAG.getConstant(X86::COND_NP, MVT::i8), Cond }; + SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2); + SDOperand Ops2[] = { DAG.getConstant(X86::COND_E, MVT::i8), + Tmp1.getValue(1) }; + SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2); + return DAG.getNode(ISD::AND, MVT::i8, Tmp1, Tmp2); + } + case ISD::SETUNE: { // PF | !ZF + SDOperand Ops1[] = { DAG.getConstant(X86::COND_P, MVT::i8), Cond }; + SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2); + SDOperand Ops2[] = { DAG.getConstant(X86::COND_NE, MVT::i8), + Tmp1.getValue(1) }; + SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2); + return DAG.getNode(ISD::OR, MVT::i8, Tmp1, Tmp2); + } + } +} + +SDOperand 
X86TargetLowering::LowerSELECT(SDOperand Op, SelectionDAG &DAG) { + bool addTest = true; + SDOperand Chain = DAG.getEntryNode(); + SDOperand Cond = Op.getOperand(0); + SDOperand CC; + const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag); + + if (Cond.getOpcode() == ISD::SETCC) + Cond = LowerSETCC(Cond, DAG, Chain); + + if (Cond.getOpcode() == X86ISD::SETCC) { + CC = Cond.getOperand(0); + + // If condition flag is set by a X86ISD::CMP, then make a copy of it + // (since flag operand cannot be shared). Use it as the condition setting + // operand in place of the X86ISD::SETCC. + // If the X86ISD::SETCC has more than one use, then perhaps it's better + // to use a test instead of duplicating the X86ISD::CMP (for register + // pressure reason)? + SDOperand Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + bool IllegalFPCMov = !X86ScalarSSE && + MVT::isFloatingPoint(Op.getValueType()) && + !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended()); + if ((Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) && + !IllegalFPCMov) { + SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) }; + Cond = DAG.getNode(Opc, VTs, 2, Ops, 3); + addTest = false; + } + } + + if (addTest) { + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) }; + Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3); + } + + VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::Flag); + SmallVector<SDOperand, 4> Ops; + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if + // condition is true. 
+ Ops.push_back(Op.getOperand(2)); + Ops.push_back(Op.getOperand(1)); + Ops.push_back(CC); + Ops.push_back(Cond.getValue(1)); + return DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size()); +} + +SDOperand X86TargetLowering::LowerBRCOND(SDOperand Op, SelectionDAG &DAG) { + bool addTest = true; + SDOperand Chain = Op.getOperand(0); + SDOperand Cond = Op.getOperand(1); + SDOperand Dest = Op.getOperand(2); + SDOperand CC; + const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag); + + if (Cond.getOpcode() == ISD::SETCC) + Cond = LowerSETCC(Cond, DAG, Chain); + + if (Cond.getOpcode() == X86ISD::SETCC) { + CC = Cond.getOperand(0); + + // If condition flag is set by a X86ISD::CMP, then make a copy of it + // (since flag operand cannot be shared). Use it as the condition setting + // operand in place of the X86ISD::SETCC. + // If the X86ISD::SETCC has more than one use, then perhaps it's better + // to use a test instead of duplicating the X86ISD::CMP (for register + // pressure reason)? 
+ SDOperand Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) { + SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) }; + Cond = DAG.getNode(Opc, VTs, 2, Ops, 3); + addTest = false; + } + } + + if (addTest) { + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) }; + Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3); + } + return DAG.getNode(X86ISD::BRCOND, Op.getValueType(), + Cond, Op.getOperand(2), CC, Cond.getValue(1)); +} + +SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) { + unsigned CallingConv= cast<ConstantSDNode>(Op.getOperand(1))->getValue(); + + if (Subtarget->is64Bit()) + return LowerX86_64CCCCallTo(Op, DAG, CallingConv); + else + switch (CallingConv) { + default: + assert(0 && "Unsupported calling convention"); + case CallingConv::Fast: + // TODO: Implement fastcc + // Falls through + case CallingConv::C: + case CallingConv::X86_StdCall: + return LowerCCCCallTo(Op, DAG, CallingConv); + case CallingConv::X86_FastCall: + return LowerFastCCCallTo(Op, DAG, CallingConv); + } +} + + +// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. +// Calls to _alloca is needed to probe the stack when allocating more than 4k +// bytes in one go. Touching the stack at 4K increments is necessary to ensure +// that the guard pages used by the OS virtual memory manager are allocated in +// correct sequence. +SDOperand +X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDOperand Op, + SelectionDAG &DAG) { + assert(Subtarget->isTargetCygMing() && + "This should be used only on Cygwin/Mingw targets"); + + // Get the inputs. + SDOperand Chain = Op.getOperand(0); + SDOperand Size = Op.getOperand(1); + // FIXME: Ensure alignment here + + SDOperand Flag; + + MVT::ValueType IntPtr = getPointerTy(); + MVT::ValueType SPTy = (Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32); + + Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag); + Flag = Chain.getValue(1); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SDOperand Ops[] = { Chain, + DAG.getTargetExternalSymbol("_alloca", IntPtr), + DAG.getRegister(X86::EAX, IntPtr), + Flag }; + Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 4); + Flag = Chain.getValue(1); + + Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1); + + std::vector<MVT::ValueType> Tys; + Tys.push_back(SPTy); + Tys.push_back(MVT::Other); + SDOperand Ops1[2] = { Chain.getValue(0), Chain }; + return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops1, 2); +} + +SDOperand +X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + const Function* Fn = MF.getFunction(); + if (Fn->hasExternalLinkage() && + Subtarget->isTargetCygMing() && + Fn->getName() == "main") + MF.getInfo<X86MachineFunctionInfo>()->setForceFramePointer(true); + + unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue(); + if (Subtarget->is64Bit()) + return LowerX86_64CCCArguments(Op, DAG); + else + switch(CC) { + default: + assert(0 && "Unsupported calling convention"); + case CallingConv::Fast: + // TODO: implement fastcc. 
+ + // Falls through + case CallingConv::C: + return LowerCCCArguments(Op, DAG); + case CallingConv::X86_StdCall: + MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(StdCall); + return LowerCCCArguments(Op, DAG, true); + case CallingConv::X86_FastCall: + MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(FastCall); + return LowerFastCCArguments(Op, DAG); + } +} + +SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) { + SDOperand InFlag(0, 0); + SDOperand Chain = Op.getOperand(0); + unsigned Align = + (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue(); + if (Align == 0) Align = 1; + + ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + // If not DWORD aligned, call memset if size is less than the threshold. + // It knows how to align to the right boundary first. + if ((Align & 3) != 0 || + (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) { + MVT::ValueType IntPtr = getPointerTy(); + const Type *IntPtrTy = getTargetData()->getIntPtrType(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Op.getOperand(1); + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + // Extend the unsigned i8 argument to be an int value for the call. + Entry.Node = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Op.getOperand(2)); + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Op.getOperand(3); + Args.push_back(Entry); + std::pair<SDOperand,SDOperand> CallResult = + LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false, + DAG.getExternalSymbol("memset", IntPtr), Args, DAG); + return CallResult.second; + } + + MVT::ValueType AVT; + SDOperand Count; + ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + unsigned BytesLeft = 0; + bool TwoRepStos = false; + if (ValC) { + unsigned ValReg; + uint64_t Val = ValC->getValue() & 255; + + // If the value is a constant, then we can potentially use larger sets. 
+ switch (Align & 3) { + case 2: // WORD aligned + AVT = MVT::i16; + ValReg = X86::AX; + Val = (Val << 8) | Val; + break; + case 0: // DWORD aligned + AVT = MVT::i32; + ValReg = X86::EAX; + Val = (Val << 8) | Val; + Val = (Val << 16) | Val; + if (Subtarget->is64Bit() && ((Align & 0xF) == 0)) { // QWORD aligned + AVT = MVT::i64; + ValReg = X86::RAX; + Val = (Val << 32) | Val; + } + break; + default: // Byte aligned + AVT = MVT::i8; + ValReg = X86::AL; + Count = Op.getOperand(3); + break; + } + + if (AVT > MVT::i8) { + if (I) { + unsigned UBytes = MVT::getSizeInBits(AVT) / 8; + Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy()); + BytesLeft = I->getValue() % UBytes; + } else { + assert(AVT >= MVT::i32 && + "Do not use rep;stos if not at least DWORD aligned"); + Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(), + Op.getOperand(3), DAG.getConstant(2, MVT::i8)); + TwoRepStos = true; + } + } + + Chain = DAG.getCopyToReg(Chain, ValReg, DAG.getConstant(Val, AVT), + InFlag); + InFlag = Chain.getValue(1); + } else { + AVT = MVT::i8; + Count = Op.getOperand(3); + Chain = DAG.getCopyToReg(Chain, X86::AL, Op.getOperand(2), InFlag); + InFlag = Chain.getValue(1); + } + + Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI, + Op.getOperand(1), InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SmallVector<SDOperand, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(DAG.getValueType(AVT)); + Ops.push_back(InFlag); + Chain = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size()); + + if (TwoRepStos) { + InFlag = Chain.getValue(1); + Count = Op.getOperand(3); + MVT::ValueType CVT = Count.getValueType(); + SDOperand Left = DAG.getNode(ISD::AND, CVT, Count, + DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); + Chain = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? 
X86::RCX : X86::ECX, + Left, InFlag); + InFlag = Chain.getValue(1); + Tys = DAG.getVTList(MVT::Other, MVT::Flag); + Ops.clear(); + Ops.push_back(Chain); + Ops.push_back(DAG.getValueType(MVT::i8)); + Ops.push_back(InFlag); + Chain = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size()); + } else if (BytesLeft) { + // Issue stores for the last 1 - 7 bytes. + SDOperand Value; + unsigned Val = ValC->getValue() & 255; + unsigned Offset = I->getValue() - BytesLeft; + SDOperand DstAddr = Op.getOperand(1); + MVT::ValueType AddrVT = DstAddr.getValueType(); + if (BytesLeft >= 4) { + Val = (Val << 8) | Val; + Val = (Val << 16) | Val; + Value = DAG.getConstant(Val, MVT::i32); + Chain = DAG.getStore(Chain, Value, + DAG.getNode(ISD::ADD, AddrVT, DstAddr, + DAG.getConstant(Offset, AddrVT)), + NULL, 0); + BytesLeft -= 4; + Offset += 4; + } + if (BytesLeft >= 2) { + Value = DAG.getConstant((Val << 8) | Val, MVT::i16); + Chain = DAG.getStore(Chain, Value, + DAG.getNode(ISD::ADD, AddrVT, DstAddr, + DAG.getConstant(Offset, AddrVT)), + NULL, 0); + BytesLeft -= 2; + Offset += 2; + } + if (BytesLeft == 1) { + Value = DAG.getConstant(Val, MVT::i8); + Chain = DAG.getStore(Chain, Value, + DAG.getNode(ISD::ADD, AddrVT, DstAddr, + DAG.getConstant(Offset, AddrVT)), + NULL, 0); + } + } + + return Chain; +} + +SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) { + SDOperand Chain = Op.getOperand(0); + unsigned Align = + (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue(); + if (Align == 0) Align = 1; + + ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + // If not DWORD aligned, call memcpy if size is less than the threshold. + // It knows how to align to the right boundary first. 
+ if ((Align & 3) != 0 || + (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) { + MVT::ValueType IntPtr = getPointerTy(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Ty = getTargetData()->getIntPtrType(); + Entry.Node = Op.getOperand(1); Args.push_back(Entry); + Entry.Node = Op.getOperand(2); Args.push_back(Entry); + Entry.Node = Op.getOperand(3); Args.push_back(Entry); + std::pair<SDOperand,SDOperand> CallResult = + LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false, + DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG); + return CallResult.second; + } + + MVT::ValueType AVT; + SDOperand Count; + unsigned BytesLeft = 0; + bool TwoRepMovs = false; + switch (Align & 3) { + case 2: // WORD aligned + AVT = MVT::i16; + break; + case 0: // DWORD aligned + AVT = MVT::i32; + if (Subtarget->is64Bit() && ((Align & 0xF) == 0)) // QWORD aligned + AVT = MVT::i64; + break; + default: // Byte aligned + AVT = MVT::i8; + Count = Op.getOperand(3); + break; + } + + if (AVT > MVT::i8) { + if (I) { + unsigned UBytes = MVT::getSizeInBits(AVT) / 8; + Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy()); + BytesLeft = I->getValue() % UBytes; + } else { + assert(AVT >= MVT::i32 && + "Do not use rep;movs if not at least DWORD aligned"); + Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(), + Op.getOperand(3), DAG.getConstant(2, MVT::i8)); + TwoRepMovs = true; + } + } + + SDOperand InFlag(0, 0); + Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI, + Op.getOperand(1), InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? 
X86::RSI : X86::ESI, + Op.getOperand(2), InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SmallVector<SDOperand, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(DAG.getValueType(AVT)); + Ops.push_back(InFlag); + Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size()); + + if (TwoRepMovs) { + InFlag = Chain.getValue(1); + Count = Op.getOperand(3); + MVT::ValueType CVT = Count.getValueType(); + SDOperand Left = DAG.getNode(ISD::AND, CVT, Count, + DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); + Chain = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX, + Left, InFlag); + InFlag = Chain.getValue(1); + Tys = DAG.getVTList(MVT::Other, MVT::Flag); + Ops.clear(); + Ops.push_back(Chain); + Ops.push_back(DAG.getValueType(MVT::i8)); + Ops.push_back(InFlag); + Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size()); + } else if (BytesLeft) { + // Issue loads and stores for the last 1 - 7 bytes. + unsigned Offset = I->getValue() - BytesLeft; + SDOperand DstAddr = Op.getOperand(1); + MVT::ValueType DstVT = DstAddr.getValueType(); + SDOperand SrcAddr = Op.getOperand(2); + MVT::ValueType SrcVT = SrcAddr.getValueType(); + SDOperand Value; + if (BytesLeft >= 4) { + Value = DAG.getLoad(MVT::i32, Chain, + DAG.getNode(ISD::ADD, SrcVT, SrcAddr, + DAG.getConstant(Offset, SrcVT)), + NULL, 0); + Chain = Value.getValue(1); + Chain = DAG.getStore(Chain, Value, + DAG.getNode(ISD::ADD, DstVT, DstAddr, + DAG.getConstant(Offset, DstVT)), + NULL, 0); + BytesLeft -= 4; + Offset += 4; + } + if (BytesLeft >= 2) { + Value = DAG.getLoad(MVT::i16, Chain, + DAG.getNode(ISD::ADD, SrcVT, SrcAddr, + DAG.getConstant(Offset, SrcVT)), + NULL, 0); + Chain = Value.getValue(1); + Chain = DAG.getStore(Chain, Value, + DAG.getNode(ISD::ADD, DstVT, DstAddr, + DAG.getConstant(Offset, DstVT)), + NULL, 0); + BytesLeft -= 2; + Offset += 2; + } + + if (BytesLeft == 1) { + Value = DAG.getLoad(MVT::i8, Chain, + DAG.getNode(ISD::ADD, SrcVT, 
SrcAddr, + DAG.getConstant(Offset, SrcVT)), + NULL, 0); + Chain = Value.getValue(1); + Chain = DAG.getStore(Chain, Value, + DAG.getNode(ISD::ADD, DstVT, DstAddr, + DAG.getConstant(Offset, DstVT)), + NULL, 0); + } + } + + return Chain; +} + +SDOperand +X86TargetLowering::LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG) { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); + SDOperand TheOp = Op.getOperand(0); + SDOperand rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheOp, 1); + if (Subtarget->is64Bit()) { + SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1)); + SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::RDX, + MVT::i64, Copy1.getValue(2)); + SDOperand Tmp = DAG.getNode(ISD::SHL, MVT::i64, Copy2, + DAG.getConstant(32, MVT::i8)); + SDOperand Ops[] = { + DAG.getNode(ISD::OR, MVT::i64, Copy1, Tmp), Copy2.getValue(1) + }; + + Tys = DAG.getVTList(MVT::i64, MVT::Other); + return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2); + } + + SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1)); + SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::EDX, + MVT::i32, Copy1.getValue(2)); + SDOperand Ops[] = { Copy1, Copy2, Copy2.getValue(1) }; + Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); + return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 3); +} + +SDOperand X86TargetLowering::LowerVASTART(SDOperand Op, SelectionDAG &DAG) { + SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2)); + + if (!Subtarget->is64Bit()) { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); + return DAG.getStore(Op.getOperand(0), FR,Op.getOperand(1), SV->getValue(), + SV->getOffset()); + } + + // __va_list_tag: + // gp_offset (0 - 6 * 8) + // fp_offset (48 - 48 + 8 * 16) + // overflow_arg_area (point to parameters coming in memory). 
+ // reg_save_area + SmallVector<SDOperand, 8> MemOps; + SDOperand FIN = Op.getOperand(1); + // Store gp_offset + SDOperand Store = DAG.getStore(Op.getOperand(0), + DAG.getConstant(VarArgsGPOffset, MVT::i32), + FIN, SV->getValue(), SV->getOffset()); + MemOps.push_back(Store); + + // Store fp_offset + FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); + Store = DAG.getStore(Op.getOperand(0), + DAG.getConstant(VarArgsFPOffset, MVT::i32), + FIN, SV->getValue(), SV->getOffset()); + MemOps.push_back(Store); + + // Store ptr to overflow_arg_area + FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, + DAG.getConstant(4, getPointerTy())); + SDOperand OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); + Store = DAG.getStore(Op.getOperand(0), OVFIN, FIN, SV->getValue(), + SV->getOffset()); + MemOps.push_back(Store); + + // Store ptr to reg_save_area. + FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN, + DAG.getConstant(8, getPointerTy())); + SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); + Store = DAG.getStore(Op.getOperand(0), RSFIN, FIN, SV->getValue(), + SV->getOffset()); + MemOps.push_back(Store); + return DAG.getNode(ISD::TokenFactor, MVT::Other, &MemOps[0], MemOps.size()); +} + +SDOperand X86TargetLowering::LowerVACOPY(SDOperand Op, SelectionDAG &DAG) { + // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
+ SDOperand Chain = Op.getOperand(0); + SDOperand DstPtr = Op.getOperand(1); + SDOperand SrcPtr = Op.getOperand(2); + SrcValueSDNode *DstSV = cast<SrcValueSDNode>(Op.getOperand(3)); + SrcValueSDNode *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4)); + + SrcPtr = DAG.getLoad(getPointerTy(), Chain, SrcPtr, + SrcSV->getValue(), SrcSV->getOffset()); + Chain = SrcPtr.getValue(1); + for (unsigned i = 0; i < 3; ++i) { + SDOperand Val = DAG.getLoad(MVT::i64, Chain, SrcPtr, + SrcSV->getValue(), SrcSV->getOffset()); + Chain = Val.getValue(1); + Chain = DAG.getStore(Chain, Val, DstPtr, + DstSV->getValue(), DstSV->getOffset()); + if (i == 2) + break; + SrcPtr = DAG.getNode(ISD::ADD, getPointerTy(), SrcPtr, + DAG.getConstant(8, getPointerTy())); + DstPtr = DAG.getNode(ISD::ADD, getPointerTy(), DstPtr, + DAG.getConstant(8, getPointerTy())); + } + return Chain; +} + +SDOperand +X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG) { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getValue(); + switch (IntNo) { + default: return SDOperand(); // Don't custom lower most intrinsics. + // Comparison intrinsics. 
+ case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomineq_sd: { + unsigned Opc = 0; + ISD::CondCode CC = ISD::SETCC_INVALID; + switch (IntNo) { + default: break; + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse2_comieq_sd: + Opc = X86ISD::COMI; + CC = ISD::SETEQ; + break; + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse2_comilt_sd: + Opc = X86ISD::COMI; + CC = ISD::SETLT; + break; + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse2_comile_sd: + Opc = X86ISD::COMI; + CC = ISD::SETLE; + break; + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse2_comigt_sd: + Opc = X86ISD::COMI; + CC = ISD::SETGT; + break; + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse2_comige_sd: + Opc = X86ISD::COMI; + CC = ISD::SETGE; + break; + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse2_comineq_sd: + Opc = X86ISD::COMI; + CC = ISD::SETNE; + break; + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse2_ucomieq_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETEQ; + break; + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse2_ucomilt_sd: + Opc = X86ISD::UCOMI; + 
CC = ISD::SETLT; + break; + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse2_ucomile_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETLE; + break; + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse2_ucomigt_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETGT; + break; + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse2_ucomige_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETGE; + break; + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_ucomineq_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETNE; + break; + } + + unsigned X86CC; + SDOperand LHS = Op.getOperand(1); + SDOperand RHS = Op.getOperand(2); + translateX86CC(CC, true, X86CC, LHS, RHS, DAG); + + const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag); + SDOperand Ops1[] = { DAG.getEntryNode(), LHS, RHS }; + SDOperand Cond = DAG.getNode(Opc, VTs, 2, Ops1, 3); + VTs = DAG.getNodeValueTypes(MVT::i8, MVT::Flag); + SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond }; + SDOperand SetCC = DAG.getNode(X86ISD::SETCC, VTs, 2, Ops2, 2); + return DAG.getNode(ISD::ANY_EXTEND, MVT::i32, SetCC); + } + } +} + +SDOperand X86TargetLowering::LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG) { + // Depths > 0 not supported yet! + if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0) + return SDOperand(); + + // Just load the return address + SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG); + return DAG.getLoad(getPointerTy(), DAG.getEntryNode(), RetAddrFI, NULL, 0); +} + +SDOperand X86TargetLowering::LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG) { + // Depths > 0 not supported yet! 
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0) + return SDOperand(); + + SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG); + return DAG.getNode(ISD::SUB, getPointerTy(), RetAddrFI, + DAG.getConstant(4, getPointerTy())); +} + +SDOperand X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDOperand Op, + SelectionDAG &DAG) { + // Is not yet supported on x86-64 + if (Subtarget->is64Bit()) + return SDOperand(); + + return DAG.getConstant(8, getPointerTy()); +} + +SDOperand X86TargetLowering::LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG) +{ + assert(!Subtarget->is64Bit() && + "Lowering of eh_return builtin is not supported yet on x86-64"); + + MachineFunction &MF = DAG.getMachineFunction(); + SDOperand Chain = Op.getOperand(0); + SDOperand Offset = Op.getOperand(1); + SDOperand Handler = Op.getOperand(2); + + SDOperand Frame = DAG.getRegister(RegInfo->getFrameRegister(MF), + getPointerTy()); + + SDOperand StoreAddr = DAG.getNode(ISD::SUB, getPointerTy(), Frame, + DAG.getConstant(-4UL, getPointerTy())); + StoreAddr = DAG.getNode(ISD::ADD, getPointerTy(), StoreAddr, Offset); + Chain = DAG.getStore(Chain, Handler, StoreAddr, NULL, 0); + Chain = DAG.getCopyToReg(Chain, X86::ECX, StoreAddr); + MF.addLiveOut(X86::ECX); + + return DAG.getNode(X86ISD::EH_RETURN, MVT::Other, + Chain, DAG.getRegister(X86::ECX, getPointerTy())); +} + +/// LowerOperation - Provide custom lowering hooks for some operations. 
+/// +SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) { + switch (Op.getOpcode()) { + default: assert(0 && "Should not custom lower this!"); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); + case ISD::SHL_PARTS: + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerShift(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FABS: return LowerFABS(Op, DAG); + case ISD::FNEG: return LowerFNEG(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::SETCC: return LowerSETCC(Op, DAG, DAG.getEntryNode()); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::CALL: return LowerCALL(Op, DAG); + case ISD::RET: return LowerRET(Op, DAG); + case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG); + case ISD::MEMSET: return LowerMEMSET(Op, DAG); + case ISD::MEMCPY: return LowerMEMCPY(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLCECOUNTER(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::FRAME_TO_ARGS_OFFSET: + return 
LowerFRAME_TO_ARGS_OFFSET(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + } + return SDOperand(); +} + +const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return NULL; + case X86ISD::SHLD: return "X86ISD::SHLD"; + case X86ISD::SHRD: return "X86ISD::SHRD"; + case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FOR: return "X86ISD::FOR"; + case X86ISD::FXOR: return "X86ISD::FXOR"; + case X86ISD::FSRL: return "X86ISD::FSRL"; + case X86ISD::FILD: return "X86ISD::FILD"; + case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; + case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; + case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; + case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FLD: return "X86ISD::FLD"; + case X86ISD::FST: return "X86ISD::FST"; + case X86ISD::FP_GET_RESULT: return "X86ISD::FP_GET_RESULT"; + case X86ISD::FP_SET_RESULT: return "X86ISD::FP_SET_RESULT"; + case X86ISD::CALL: return "X86ISD::CALL"; + case X86ISD::TAILCALL: return "X86ISD::TAILCALL"; + case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::CMP: return "X86ISD::CMP"; + case X86ISD::COMI: return "X86ISD::COMI"; + case X86ISD::UCOMI: return "X86ISD::UCOMI"; + case X86ISD::SETCC: return "X86ISD::SETCC"; + case X86ISD::CMOV: return "X86ISD::CMOV"; + case X86ISD::BRCOND: return "X86ISD::BRCOND"; + case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; + case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; + case X86ISD::LOAD_PACK: return "X86ISD::LOAD_PACK"; + case X86ISD::LOAD_UA: return "X86ISD::LOAD_UA"; + case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; + case X86ISD::Wrapper: return "X86ISD::Wrapper"; + case X86ISD::S2VEC: return "X86ISD::S2VEC"; + case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; + case 
X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; + case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::THREAD_POINTER: return "X86ISD::THREAD_POINTER"; + case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; + } +} + +// isLegalAddressingMode - Return true if the addressing mode represented +// by AM is legal for this target, for a load/store of the specified type. +bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, + const Type *Ty) const { + // X86 supports extremely general addressing modes. + + // X86 allows a sign-extended 32-bit immediate field as a displacement. + if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1) + return false; + + if (AM.BaseGV) { + // X86-64 only supports addr of globals in small code model. + if (Subtarget->is64Bit() && + getTargetMachine().getCodeModel() != CodeModel::Small) + return false; + + // We can only fold this if we don't need a load either. + if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false)) + return false; + } + + switch (AM.Scale) { + case 0: + case 1: + case 2: + case 4: + case 8: + // These scales always work. + break; + case 3: + case 5: + case 9: + // These scales are formed with basereg+scalereg. Only accept if there is + // no basereg yet. + if (AM.HasBaseReg) + return false; + break; + default: // Other stuff never works. + return false; + } + + return true; +} + + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +X86TargetLowering::isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const { + // Only do shuffles on 128-bit vector types for now. 
+ if (MVT::getSizeInBits(VT) == 64) return false; + return (Mask.Val->getNumOperands() <= 4 || + isIdentityMask(Mask.Val) || + isIdentityMask(Mask.Val, true) || + isSplatMask(Mask.Val) || + isPSHUFHW_PSHUFLWMask(Mask.Val) || + X86::isUNPCKLMask(Mask.Val) || + X86::isUNPCKHMask(Mask.Val) || + X86::isUNPCKL_v_undef_Mask(Mask.Val) || + X86::isUNPCKH_v_undef_Mask(Mask.Val)); +} + +bool X86TargetLowering::isVectorClearMaskLegal(std::vector<SDOperand> &BVOps, + MVT::ValueType EVT, + SelectionDAG &DAG) const { + unsigned NumElts = BVOps.size(); + // Only do shuffles on 128-bit vector types for now. + if (MVT::getSizeInBits(EVT) * NumElts == 64) return false; + if (NumElts == 2) return true; + if (NumElts == 4) { + return (isMOVLMask(&BVOps[0], 4) || + isCommutedMOVL(&BVOps[0], 4, true) || + isSHUFPMask(&BVOps[0], 4) || + isCommutedSHUFP(&BVOps[0], 4)); + } + return false; +} + +//===----------------------------------------------------------------------===// +// X86 Scheduler Hooks +//===----------------------------------------------------------------------===// + +MachineBasicBlock * +X86TargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *BB) { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + switch (MI->getOpcode()) { + default: assert(false && "Unexpected instr type to insert"); + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_V4F32: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: { + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + ilist<MachineBasicBlock>::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... 
+ // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB); + unsigned Opc = + X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + BuildMI(BB, TII->get(Opc)).addMBB(sinkMBB); + MachineFunction *F = BB->getParent(); + F->getBasicBlockList().insert(It, copy0MBB); + F->getBasicBlockList().insert(It, sinkMBB); + // Update machine-CFG edges by first adding all successors of the current + // block to the new block which will contain the Phi node for the select. + for(MachineBasicBlock::succ_iterator i = BB->succ_begin(), + e = BB->succ_end(); i != e; ++i) + sinkMBB->addSuccessor(*i); + // Next, remove all successors of the current block, and add the true + // and fallthrough blocks as its successors. + while(!BB->succ_empty()) + BB->removeSuccessor(BB->succ_begin()); + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(BB, TII->get(X86::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + delete MI; // The pseudo instruction is gone now. + return BB; + } + + case X86::FP32_TO_INT16_IN_MEM: + case X86::FP32_TO_INT32_IN_MEM: + case X86::FP32_TO_INT64_IN_MEM: + case X86::FP64_TO_INT16_IN_MEM: + case X86::FP64_TO_INT32_IN_MEM: + case X86::FP64_TO_INT64_IN_MEM: { + // Change the floating point control register to use "round towards zero" + // mode when truncating to an integer value. 
+ MachineFunction *F = BB->getParent(); + int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); + addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx); + + // Load the old value of the high byte of the control word... + unsigned OldCW = + F->getSSARegMap()->createVirtualRegister(X86::GR16RegisterClass); + addFrameReference(BuildMI(BB, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); + + // Set the high part to be round to zero... + addFrameReference(BuildMI(BB, TII->get(X86::MOV16mi)), CWFrameIdx) + .addImm(0xC7F); + + // Reload the modified control word now... + addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx); + + // Restore the memory image of control word to original value + addFrameReference(BuildMI(BB, TII->get(X86::MOV16mr)), CWFrameIdx) + .addReg(OldCW); + + // Get the X86 opcode to use. + unsigned Opc; + switch (MI->getOpcode()) { + default: assert(0 && "illegal opcode!"); + case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; + case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; + case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; + case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; + case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; + case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; + } + + X86AddressMode AM; + MachineOperand &Op = MI->getOperand(0); + if (Op.isRegister()) { + AM.BaseType = X86AddressMode::RegBase; + AM.Base.Reg = Op.getReg(); + } else { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = Op.getFrameIndex(); + } + Op = MI->getOperand(1); + if (Op.isImmediate()) + AM.Scale = Op.getImm(); + Op = MI->getOperand(2); + if (Op.isImmediate()) + AM.IndexReg = Op.getImm(); + Op = MI->getOperand(3); + if (Op.isGlobalAddress()) { + AM.GV = Op.getGlobal(); + } else { + AM.Disp = Op.getImm(); + } + addFullAddress(BuildMI(BB, TII->get(Opc)), AM) + .addReg(MI->getOperand(4).getReg()); + + // Reload the original control word 
now. + addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx); + + delete MI; // The pseudo instruction is gone now. + return BB; + } + } +} + +//===----------------------------------------------------------------------===// +// X86 Optimization Hooks +//===----------------------------------------------------------------------===// + +void X86TargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op, + uint64_t Mask, + uint64_t &KnownZero, + uint64_t &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + unsigned Opc = Op.getOpcode(); + assert((Opc >= ISD::BUILTIN_OP_END || + Opc == ISD::INTRINSIC_WO_CHAIN || + Opc == ISD::INTRINSIC_W_CHAIN || + Opc == ISD::INTRINSIC_VOID) && + "Should use MaskedValueIsZero if you don't know whether Op" + " is a target node!"); + + KnownZero = KnownOne = 0; // Don't know anything. + switch (Opc) { + default: break; + case X86ISD::SETCC: + KnownZero |= (MVT::getIntVTBitMask(Op.getValueType()) ^ 1ULL); + break; + } +} + +/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// element of the result of the vector shuffle. +static SDOperand getShuffleScalarElt(SDNode *N, unsigned i, SelectionDAG &DAG) { + MVT::ValueType VT = N->getValueType(0); + SDOperand PermMask = N->getOperand(2); + unsigned NumElems = PermMask.getNumOperands(); + SDOperand V = (i < NumElems) ? N->getOperand(0) : N->getOperand(1); + i %= NumElems; + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) { + return (i == 0) + ? 
V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT)); + } else if (V.getOpcode() == ISD::VECTOR_SHUFFLE) { + SDOperand Idx = PermMask.getOperand(i); + if (Idx.getOpcode() == ISD::UNDEF) + return DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT)); + return getShuffleScalarElt(V.Val,cast<ConstantSDNode>(Idx)->getValue(),DAG); + } + return SDOperand(); +} + +/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the +/// node is a GlobalAddress + an offset. +static bool isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) { + unsigned Opc = N->getOpcode(); + if (Opc == X86ISD::Wrapper) { + if (dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) { + GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); + return true; + } + } else if (Opc == ISD::ADD) { + SDOperand N1 = N->getOperand(0); + SDOperand N2 = N->getOperand(1); + if (isGAPlusOffset(N1.Val, GA, Offset)) { + ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2); + if (V) { + Offset += V->getSignExtended(); + return true; + } + } else if (isGAPlusOffset(N2.Val, GA, Offset)) { + ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1); + if (V) { + Offset += V->getSignExtended(); + return true; + } + } + } + return false; +} + +/// isConsecutiveLoad - Returns true if N is loading from an address of Base +/// + Dist * Size. 
+static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size, + MachineFrameInfo *MFI) { + if (N->getOperand(0).Val != Base->getOperand(0).Val) + return false; + + SDOperand Loc = N->getOperand(1); + SDOperand BaseLoc = Base->getOperand(1); + if (Loc.getOpcode() == ISD::FrameIndex) { + if (BaseLoc.getOpcode() != ISD::FrameIndex) + return false; + int FI = dyn_cast<FrameIndexSDNode>(Loc)->getIndex(); + int BFI = dyn_cast<FrameIndexSDNode>(BaseLoc)->getIndex(); + int FS = MFI->getObjectSize(FI); + int BFS = MFI->getObjectSize(BFI); + if (FS != BFS || FS != Size) return false; + return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Size); + } else { + GlobalValue *GV1 = NULL; + GlobalValue *GV2 = NULL; + int64_t Offset1 = 0; + int64_t Offset2 = 0; + bool isGA1 = isGAPlusOffset(Loc.Val, GV1, Offset1); + bool isGA2 = isGAPlusOffset(BaseLoc.Val, GV2, Offset2); + if (isGA1 && isGA2 && GV1 == GV2) + return Offset1 == (Offset2 + Dist*Size); + } + + return false; +} + +static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI, + const X86Subtarget *Subtarget) { + GlobalValue *GV; + int64_t Offset; + if (isGAPlusOffset(Base, GV, Offset)) + return (GV->getAlignment() >= 16 && (Offset % 16) == 0); + else { + assert(Base->getOpcode() == ISD::FrameIndex && "Unexpected base node!"); + int BFI = dyn_cast<FrameIndexSDNode>(Base)->getIndex(); + if (BFI < 0) + // Fixed objects do not specify alignment, however the offsets are known. + return ((Subtarget->getStackAlignment() % 16) == 0 && + (MFI->getObjectOffset(BFI) % 16) == 0); + else + return MFI->getObjectAlignment(BFI) >= 16; + } + return false; +} + + +/// PerformShuffleCombine - Combine a vector_shuffle that is equal to +/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load +/// if the load addresses are consecutive, non-overlapping, and in the right +/// order. 
+static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MVT::ValueType VT = N->getValueType(0); + MVT::ValueType EVT = MVT::getVectorElementType(VT); + SDOperand PermMask = N->getOperand(2); + int NumElems = (int)PermMask.getNumOperands(); + SDNode *Base = NULL; + for (int i = 0; i < NumElems; ++i) { + SDOperand Idx = PermMask.getOperand(i); + if (Idx.getOpcode() == ISD::UNDEF) { + if (!Base) return SDOperand(); + } else { + SDOperand Arg = + getShuffleScalarElt(N, cast<ConstantSDNode>(Idx)->getValue(), DAG); + if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val)) + return SDOperand(); + if (!Base) + Base = Arg.Val; + else if (!isConsecutiveLoad(Arg.Val, Base, + i, MVT::getSizeInBits(EVT)/8,MFI)) + return SDOperand(); + } + } + + bool isAlign16 = isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget); + if (isAlign16) { + LoadSDNode *LD = cast<LoadSDNode>(Base); + return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(), + LD->getSrcValueOffset()); + } else { + // Just use movups, it's shorter. + SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); + SmallVector<SDOperand, 3> Ops; + Ops.push_back(Base->getOperand(0)); + Ops.push_back(Base->getOperand(1)); + Ops.push_back(Base->getOperand(2)); + return DAG.getNode(ISD::BIT_CONVERT, VT, + DAG.getNode(X86ISD::LOAD_UA, Tys, &Ops[0], Ops.size())); + } +} + +/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. +static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDOperand Cond = N->getOperand(0); + + // If we have SSE[12] support, try to form min/max nodes. + if (Subtarget->hasSSE2() && + (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) { + if (Cond.getOpcode() == ISD::SETCC) { + // Get the LHS/RHS of the select. 
+ SDOperand LHS = N->getOperand(1); + SDOperand RHS = N->getOperand(2); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + unsigned Opcode = 0; + if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { + switch (CC) { + default: break; + case ISD::SETOLE: // (X <= Y) ? X : Y -> min + case ISD::SETULE: + case ISD::SETLE: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min + case ISD::SETLT: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETOGT: // (X > Y) ? X : Y -> max + case ISD::SETUGT: + case ISD::SETGT: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max + case ISD::SETGE: + Opcode = X86ISD::FMAX; + break; + } + } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { + switch (CC) { + default: break; + case ISD::SETOGT: // (X > Y) ? Y : X -> min + case ISD::SETUGT: + case ISD::SETGT: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min + case ISD::SETGE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETOLE: // (X <= Y) ? Y : X -> max + case ISD::SETULE: + case ISD::SETLE: + if (!UnsafeFPMath) break; + // FALL THROUGH. + case ISD::SETOLT: // (X olt/lt Y) ? 
Y : X -> max + case ISD::SETLT: + Opcode = X86ISD::FMAX; + break; + } + } + + if (Opcode) + return DAG.getNode(Opcode, N->getValueType(0), LHS, RHS); + } + + } + + return SDOperand(); +} + + +SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: break; + case ISD::VECTOR_SHUFFLE: + return PerformShuffleCombine(N, DAG, Subtarget); + case ISD::SELECT: + return PerformSELECTCombine(N, DAG, Subtarget); + } + + return SDOperand(); +} + +//===----------------------------------------------------------------------===// +// X86 Inline Assembly Support +//===----------------------------------------------------------------------===// + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +X86TargetLowering::ConstraintType +X86TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'A': + case 'r': + case 'R': + case 'l': + case 'q': + case 'Q': + case 'x': + case 'Y': + return C_RegisterClass; + default: + break; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// isOperandValidForConstraint - Return the specified operand (possibly +/// modified) if the specified SDOperand is valid for the specified target +/// constraint letter, otherwise return null. 
+SDOperand X86TargetLowering:: +isOperandValidForConstraint(SDOperand Op, char Constraint, SelectionDAG &DAG) { + switch (Constraint) { + default: break; + case 'I': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getValue() <= 31) + return DAG.getTargetConstant(C->getValue(), Op.getValueType()); + } + return SDOperand(0,0); + case 'N': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getValue() <= 255) + return DAG.getTargetConstant(C->getValue(), Op.getValueType()); + } + return SDOperand(0,0); + case 'i': { + // Literal immediates are always ok. + if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) + return DAG.getTargetConstant(CST->getValue(), Op.getValueType()); + + // If we are in non-pic codegen mode, we allow the address of a global (with + // an optional displacement) to be used with 'i'. + GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op); + int64_t Offset = 0; + + // Match either (GA) or (GA+C) + if (GA) { + Offset = GA->getOffset(); + } else if (Op.getOpcode() == ISD::ADD) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0)); + if (C && GA) { + Offset = GA->getOffset()+C->getValue(); + } else { + C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0)); + if (C && GA) + Offset = GA->getOffset()+C->getValue(); + else + C = 0, GA = 0; + } + } + + if (GA) { + // If addressing this global requires a load (e.g. in PIC mode), we can't + // match. + if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(), getTargetMachine(), + false)) + return SDOperand(0, 0); + + Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), + Offset); + return Op; + } + + // Otherwise, not valid for this mode. 
+ return SDOperand(0, 0); + } + } + return TargetLowering::isOperandValidForConstraint(Op, Constraint, DAG); +} + +std::vector<unsigned> X86TargetLowering:: +getRegClassForInlineAsmConstraint(const std::string &Constraint, + MVT::ValueType VT) const { + if (Constraint.size() == 1) { + // FIXME: not handling fp-stack yet! + switch (Constraint[0]) { // GCC X86 Constraint Letters + default: break; // Unknown constraint letter + case 'A': // EAX/EDX + if (VT == MVT::i32 || VT == MVT::i64) + return make_vector<unsigned>(X86::EAX, X86::EDX, 0); + break; + case 'q': // Q_REGS (GENERAL_REGS in 64-bit mode) + case 'Q': // Q_REGS + if (VT == MVT::i32) + return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); + else if (VT == MVT::i16) + return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); + else if (VT == MVT::i8) + return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::DL, 0); + break; + } + } + + return std::vector<unsigned>(); +} + +std::pair<unsigned, const TargetRegisterClass*> +X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + MVT::ValueType VT) const { + // First, see if this is a constraint that directly corresponds to an LLVM + // register class. + if (Constraint.size() == 1) { + // GCC Constraint Letters + switch (Constraint[0]) { + default: break; + case 'r': // GENERAL_REGS + case 'R': // LEGACY_REGS + case 'l': // INDEX_REGS + if (VT == MVT::i64 && Subtarget->is64Bit()) + return std::make_pair(0U, X86::GR64RegisterClass); + if (VT == MVT::i32) + return std::make_pair(0U, X86::GR32RegisterClass); + else if (VT == MVT::i16) + return std::make_pair(0U, X86::GR16RegisterClass); + else if (VT == MVT::i8) + return std::make_pair(0U, X86::GR8RegisterClass); + break; + case 'y': // MMX_REGS if MMX allowed. + if (!Subtarget->hasMMX()) break; + return std::make_pair(0U, X86::VR64RegisterClass); + break; + case 'Y': // SSE_REGS if SSE2 allowed + if (!Subtarget->hasSSE2()) break; + // FALL THROUGH. 
+ case 'x': // SSE_REGS if SSE1 allowed + if (!Subtarget->hasSSE1()) break; + + switch (VT) { + default: break; + // Scalar SSE types. + case MVT::f32: + case MVT::i32: + return std::make_pair(0U, X86::FR32RegisterClass); + case MVT::f64: + case MVT::i64: + return std::make_pair(0U, X86::FR64RegisterClass); + // Vector types. + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v4f32: + case MVT::v2f64: + return std::make_pair(0U, X86::VR128RegisterClass); + } + break; + } + } + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair<unsigned, const TargetRegisterClass*> Res; + Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // Not found as a standard register? + if (Res.second == 0) { + // GCC calls "st(0)" just plain "st". + if (StringsEqualNoCase("{st}", Constraint)) { + Res.first = X86::ST0; + Res.second = X86::RSTRegisterClass; + } + + return Res; + } + + // Otherwise, check to see if this is a register class of the wrong value + // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to + // turn into {ax},{dx}. + if (Res.second->hasType(VT)) + return Res; // Correct type already, nothing to do. + + // All of the single-register GCC register classes map their values onto + // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we + // really want an 8-bit or 32-bit register, map to the appropriate register + // class and return the appropriate register. 
+ if (Res.second != X86::GR16RegisterClass) + return Res; + + if (VT == MVT::i8) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::AL; break; + case X86::DX: DestReg = X86::DL; break; + case X86::CX: DestReg = X86::CL; break; + case X86::BX: DestReg = X86::BL; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = Res.second = X86::GR8RegisterClass; + } + } else if (VT == MVT::i32) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::EAX; break; + case X86::DX: DestReg = X86::EDX; break; + case X86::CX: DestReg = X86::ECX; break; + case X86::BX: DestReg = X86::EBX; break; + case X86::SI: DestReg = X86::ESI; break; + case X86::DI: DestReg = X86::EDI; break; + case X86::BP: DestReg = X86::EBP; break; + case X86::SP: DestReg = X86::ESP; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = Res.second = X86::GR32RegisterClass; + } + } else if (VT == MVT::i64) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::RAX; break; + case X86::DX: DestReg = X86::RDX; break; + case X86::CX: DestReg = X86::RCX; break; + case X86::BX: DestReg = X86::RBX; break; + case X86::SI: DestReg = X86::RSI; break; + case X86::DI: DestReg = X86::RDI; break; + case X86::BP: DestReg = X86::RBP; break; + case X86::SP: DestReg = X86::RSP; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = Res.second = X86::GR64RegisterClass; + } + } + + return Res; +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h new file mode 100644 index 0000000..07a96d3 --- /dev/null +++ b/lib/Target/X86/X86ISelLowering.h @@ -0,0 +1,437 @@ +//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Chris Lattner and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef X86ISELLOWERING_H +#define X86ISELLOWERING_H + +#include "X86Subtarget.h" +#include "X86RegisterInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" + +namespace llvm { + namespace X86ISD { + // X86 Specific DAG Nodes + enum NodeType { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END+X86::INSTRUCTION_LIST_END, + + /// SHLD, SHRD - Double shift instructions. These correspond to + /// X86::SHLDxx and X86::SHRDxx instructions. + SHLD, + SHRD, + + /// FAND - Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// FOR - Bitwise logical OR of floating point values. This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// FXOR - Bitwise logical XOR of floating point values. This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// FSRL - Bitwise logical right shift of floating point values. These + /// corresponds to X86::PSRLDQ. + FSRL, + + /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. This corresponds to the + /// X86::FILD*m instructions. It has three inputs (token chain, address, + /// and source type) and two outputs (FP value and token chain). FILD_FLAG + /// also produces a flag). + FILD, + FILD_FLAG, + + /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). 
+ FP_TO_INT16_IN_MEM, + FP_TO_INT32_IN_MEM, + FP_TO_INT64_IN_MEM, + + /// FLD - This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, ptr to load from, and a ValueType node indicating the type + /// to load to. + FLD, + + /// FST - This instruction implements a truncating store to FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a + /// chain operand, value to store, address, and a ValueType to store it + /// as. + FST, + + /// FP_GET_RESULT - This corresponds to FpGETRESULT pseudo instruction + /// which copies from ST(0) to the destination. It takes a chain and + /// writes a RFP result and a chain. + FP_GET_RESULT, + + /// FP_SET_RESULT - This corresponds to FpSETRESULT pseudo instruction + /// which copies the source operand to ST(0). It takes a chain+value and + /// returns a chain and a flag. + FP_SET_RESULT, + + /// CALL/TAILCALL - These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + /// The CALL vs TAILCALL distinction boils down to whether the callee is + /// known not to modify the caller's stack frame, as is standard with + /// LLVM. + CALL, + TAILCALL, + + /// RDTSC_DAG - This operation implements the lowering for + /// readcyclecounter + RDTSC_DAG, + + /// X86 compare and logical compare instructions. 
+ CMP, TEST, COMI, UCOMI, + + /// X86 SetCC. Operand 1 is condition code, and operand 2 is the flag + /// operand produced by a CMP instruction. + SETCC, + + /// X86 conditional moves. Operand 1 and operand 2 are the two values + /// to select from (operand 1 is a R/W operand). Operand 3 is the + /// condition code, and operand 4 is the flag operand produced by a CMP + /// or TEST instruction. It also writes a flag result. + CMOV, + + /// X86 conditional branches. Operand 1 is the chain operand, operand 2 + /// is the block to branch if condition is true, operand 3 is the + /// condition code, and operand 4 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// Return with a flag operand. Operand 1 is the chain operand, operand + /// 2 is the number of bytes of stack to pop. + RET_FLAG, + + /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// LOAD_PACK Load a 128-bit packed float / double value. It has the same + /// operands as a normal load. + LOAD_PACK, + + /// LOAD_UA Load an unaligned 128-bit value. It has the same operands as + /// a normal load. + LOAD_UA, + + /// GlobalBaseReg - On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// Wrapper - A wrapper node for TargetConstantPool, + /// TargetExternalSymbol, and TargetGlobalAddress. + Wrapper, + + /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// S2VEC - X86 version of SCALAR_TO_VECTOR. The destination base does not + /// have to match the operand type. + S2VEC, + + /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. 
+ PINSRW, + + /// FMAX, FMIN - Floating point max and min. + /// + FMAX, FMIN, + + /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal + /// approximation. Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, FRCP, + + // Thread Local Storage + TLSADDR, THREAD_POINTER, + + // Exception Handling helpers + EH_RETURN + }; + } + + /// Define some predicates that are used for node matching. + namespace X86 { + /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to PSHUFD. + bool isPSHUFDMask(SDNode *N); + + /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to PSHUFD. + bool isPSHUFHWMask(SDNode *N); + + /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to PSHUFD. + bool isPSHUFLWMask(SDNode *N); + + /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to SHUFP*. + bool isSHUFPMask(SDNode *N); + + /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVHLPS. + bool isMOVHLPSMask(SDNode *N); + + /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form + /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, + /// <2, 3, 2, 3> + bool isMOVHLPS_v_undef_Mask(SDNode *N); + + /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. + bool isMOVLPMask(SDNode *N); + + /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVHP{S|D} + /// as well as MOVLHPS. 
+ bool isMOVHPMask(SDNode *N); + + /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to UNPCKL. + bool isUNPCKLMask(SDNode *N, bool V2IsSplat = false); + + /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to UNPCKH. + bool isUNPCKHMask(SDNode *N, bool V2IsSplat = false); + + /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form + /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, + /// <0, 0, 1, 1> + bool isUNPCKL_v_undef_Mask(SDNode *N); + + /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form + /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, + /// <2, 2, 3, 3> + bool isUNPCKH_v_undef_Mask(SDNode *N); + + /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVSS, + /// MOVSD, and MOVD, i.e. setting the lowest element. + bool isMOVLMask(SDNode *N); + + /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. + bool isMOVSHDUPMask(SDNode *N); + + /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. + bool isMOVSLDUPMask(SDNode *N); + + /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a splat of a single element. + bool isSplatMask(SDNode *N); + + /// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a splat of zero element. + bool isSplatLoMask(SDNode *N); + + /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle + /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* + /// instructions. 
+ unsigned getShuffleSHUFImmediate(SDNode *N); + + /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle + /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW + /// instructions. + unsigned getShufflePSHUFHWImmediate(SDNode *N); + + /// getShufflePSHUFKWImmediate - Return the appropriate immediate to shuffle + /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW + /// instructions. + unsigned getShufflePSHUFLWImmediate(SDNode *N); + } + + //===--------------------------------------------------------------------===// + // X86TargetLowering - X86 Implementation of the TargetLowering interface + class X86TargetLowering : public TargetLowering { + int VarArgsFrameIndex; // FrameIndex for start of varargs area. + int RegSaveFrameIndex; // X86-64 vararg func register save area. + unsigned VarArgsGPOffset; // X86-64 vararg func int reg offset. + unsigned VarArgsFPOffset; // X86-64 vararg func fp reg offset. + int ReturnAddrIndex; // FrameIndex for return slot. + int BytesToPopOnReturn; // Number of arg bytes ret should pop. + int BytesCallerReserves; // Number of arg bytes caller makes. + public: + X86TargetLowering(TargetMachine &TM); + + // Return the number of bytes that a function should pop when it returns (in + // addition to the space used by the return address). + // + unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } + + // Return the number of bytes that the caller reserves for arguments passed + // to this function. + unsigned getBytesCallerReserves() const { return BytesCallerReserves; } + + /// getStackPtrReg - Return the stack pointer register we are using: either + /// ESP or RSP. + unsigned getStackPtrReg() const { return X86StackPtr; } + + /// LowerOperation - Provide custom lowering hooks for some operations. 
+ /// + virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG); + + virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI, + MachineBasicBlock *MBB); + + /// getTargetNodeName - This method returns the name of a target specific + /// DAG node. + virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// computeMaskedBitsForTargetNode - Determine which of the bits specified + /// in Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. + virtual void computeMaskedBitsForTargetNode(const SDOperand Op, + uint64_t Mask, + uint64_t &KnownZero, + uint64_t &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + SDOperand getReturnAddressFrameIndex(SelectionDAG &DAG); + + ConstraintType getConstraintType(const std::string &Constraint) const; + + std::vector<unsigned> + getRegClassForInlineAsmConstraint(const std::string &Constraint, + MVT::ValueType VT) const; + /// isOperandValidForConstraint - Return the specified operand (possibly + /// modified) if the specified SDOperand is valid for the specified target + /// constraint letter, otherwise return null. + SDOperand isOperandValidForConstraint(SDOperand Op, char ConstraintLetter, + SelectionDAG &DAG); + + /// getRegForInlineAsmConstraint - Given a physical register constraint + /// (e.g. {edx}), return the register number and the register class for the + /// register. This should only be used for C_Register constraints. On + /// error, this returns a register number of 0. + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT::ValueType VT) const; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. 
+ virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const; + + /// isShuffleMaskLegal - Targets can use this to indicate that they only + /// support *some* VECTOR_SHUFFLE operations, those with specific masks. + /// By default, if a target supports the VECTOR_SHUFFLE node, all mask + /// values are assumed to be legal. + virtual bool isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const; + + /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is + /// used by Targets can use this to indicate if there is a suitable + /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant + /// pool entry. + virtual bool isVectorClearMaskLegal(std::vector<SDOperand> &BVOps, + MVT::ValueType EVT, + SelectionDAG &DAG) const; + private: + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + const MRegisterInfo *RegInfo; + + /// X86StackPtr - X86 physical register used as stack ptr. + unsigned X86StackPtr; + + /// X86ScalarSSE - Select between SSE2 or x87 floating point ops. + bool X86ScalarSSE; + + SDNode *LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode*TheCall, + unsigned CallingConv, SelectionDAG &DAG); + + // C and StdCall Calling Convention implementation. + SDOperand LowerCCCArguments(SDOperand Op, SelectionDAG &DAG, + bool isStdCall = false); + SDOperand LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC); + + // X86-64 C Calling Convention implementation. + SDOperand LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,unsigned CC); + + // Fast and FastCall Calling Convention implementation. 
+ SDOperand LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC); + + SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerShift(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFABS(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFNEG(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerSETCC(SDOperand Op, SelectionDAG &DAG, SDOperand Chain); + SDOperand LowerSELECT(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerBRCOND(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerMEMSET(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerDYNAMIC_STACKALLOC(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerVACOPY(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG); + SDOperand 
LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerFRAME_TO_ARGS_OFFSET(SDOperand Op, SelectionDAG &DAG); + SDOperand LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG); + }; +} + +#endif // X86ISELLOWERING_H diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h new file mode 100644 index 0000000..c0fa58d --- /dev/null +++ b/lib/Target/X86/X86InstrBuilder.h @@ -0,0 +1,125 @@ +//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to handle X86'isms in a clean way. +// +// The BuildMem function may be used with the BuildMI function to add entire +// memory references in a single, typed, function call. X86 memory references +// can be very complex expressions (described in the README), so wrapping them +// up behind an easier to use interface makes sense. Descriptions of the +// functions are included below. +// +// For reference, the order of operands for memory references is: +// (Operand), Base, Scale, Index, Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef X86INSTRBUILDER_H +#define X86INSTRBUILDER_H + +#include "llvm/CodeGen/MachineInstrBuilder.h" + +namespace llvm { + +/// X86AddressMode - This struct holds a generalized full x86 address mode. +/// The base register can be a frame index, which will eventually be replaced +/// with BP or SP and Disp being offsetted accordingly. The displacement may +/// also include the offset of a global value. 
+struct X86AddressMode {
+  // Discriminator for Base: either a (virtual or physical) register, or an
+  // abstract frame index that is later rewritten to BP/SP plus an offset.
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;       // Valid when BaseType == RegBase.
+    int FrameIndex;     // Valid when BaseType == FrameIndexBase.
+  } Base;
+
+  unsigned Scale;       // Index scale factor; x86 permits only 1, 2, 4 or 8.
+  unsigned IndexReg;    // Index register; 0 means "no index".
+  unsigned Disp;        // Constant displacement added to the address.
+  GlobalValue *GV;      // If non-null, Disp is an offset from this global.
+
+  // Default: plain register base (register 0) with no scale/index/disp.
+  X86AddressMode() : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0) {
+    Base.Reg = 0;
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+inline const MachineInstrBuilder &addDirectMem(const MachineInstrBuilder &MIB,
+                                               unsigned Reg) {
+  // Because memory references are always represented with four
+  // values, this adds: Reg, [1, NoReg, 0] to the instruction.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
+}
+
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
+                                               unsigned Reg, int Offset) {
+  // Operand order: Base=Reg, Scale=1, Index=none, Disp=Offset.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg1, unsigned Reg2) {
+  // Operand order: Base=Reg1, Scale=1, Index=Reg2, Disp=0.
+  return MIB.addReg(Reg1).addImm(1).addReg(Reg2).addImm(0);
+}
+
+/// addFullAddress - Append a complete X86AddressMode (base, scale, index, and
+/// displacement-or-global) as the four memory operands of the instruction
+/// being built.
+inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+                                                 const X86AddressMode &AM) {
+  // x86 addressing supports only these four scale factors.
+  assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  else
+    assert (0);
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  // The displacement slot carries either a global address (with Disp folded
+  // in as its offset) or a plain immediate -- never both.
+  if (AM.GV)
+    return MIB.addGlobalAddress(AM.GV, AM.Disp);
+  else
+    return MIB.addImm(AM.Disp);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  // Operand order: Base=frame index, Scale=1, Index=none, Disp=Offset.
+  return MIB.addFrameIndex(FI).addImm(1).addReg(0).addImm(Offset);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference has base register ConstantPoolIndex offset which is retained until
+/// either machine code emission or assembly output. This allows an optional
+/// offset to be added as well.
+///
+inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         int Offset = 0) {
+  // Operand order: Base=constant-pool index, Scale=1, Index=none, Disp=Offset.
+  return MIB.addConstantPoolIndex(CPI).addImm(1).addReg(0).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..11aeb07
--- /dev/null
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,456 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set -------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Type profiles for the FPStack-specific SelectionDAG nodes below.
+def SDTX86FpGet    : SDTypeProfile<1, 0, [SDTCisFP<0>]>;
+def SDTX86FpSet    : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDTX86Fld      : SDTypeProfile<1, 2, [SDTCisFP<0>,
+                                          SDTCisPtrTy<1>,
+                                          SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst      : SDTypeProfile<0, 3, [SDTCisFP<0>,
+                                          SDTCisPtrTy<1>,
+                                          SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild     : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+                                          SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def X86fpget        : SDNode<"X86ISD::FP_GET_RESULT", SDTX86FpGet,
+                             [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86fpset        : SDNode<"X86ISD::FP_SET_RESULT", SDTX86FpSet,
+                             [SDNPHasChain, SDNPOutFlag]>;
+def X86fld          : SDNode<"X86ISD::FLD", SDTX86Fld,
+                             [SDNPHasChain]>;
+def X86fst          : SDNode<"X86ISD::FST", SDTX86Fst,
+                             [SDNPHasChain, SDNPInFlag]>;
+def X86fild         : SDNode<"X86ISD::FILD", SDTX86Fild,
+                             [SDNPHasChain]>;
+def X86fildflag     : SDNode<"X86ISD::FILD_FLAG",SDTX86Fild,
+                             [SDNPHasChain, SDNPOutFlag]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Match only the exact FP constants +0.0 / -0.0 / +1.0 / -1.0, which the
+// hardware can materialize directly (fldz / fld1, negated via fchs).
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomDAGSchedInserter = 1 in {  // Expanded by the scheduler.
+  def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+                               (ops i16mem:$dst, RFP32:$src),
+                               "#FP32_TO_INT16_IN_MEM PSEUDO!",
+                               [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+                               (ops i32mem:$dst, RFP32:$src),
+                               "#FP32_TO_INT32_IN_MEM PSEUDO!",
+                               [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+                               (ops i64mem:$dst, RFP32:$src),
+                               "#FP32_TO_INT64_IN_MEM PSEUDO!",
+                               [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+  def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+                               (ops i16mem:$dst, RFP64:$src),
+                               "#FP64_TO_INT16_IN_MEM PSEUDO!",
+                               [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+                               (ops i32mem:$dst, RFP64:$src),
+                               "#FP64_TO_INT32_IN_MEM PSEUDO!",
+                               [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+                               (ops i64mem:$dst, RFP64:$src),
+                               "#FP64_TO_INT64_IN_MEM PSEUDO!",
+                               [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+}
+
+let isTerminator = 1 in
+  let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+    def FP_REG_KILL  : I<0, Pseudo, (ops), "#FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with three instructions here. The
+// first two instructions, generated by the instruction selector, use "RFP32"
+// or "RFP64" registers: traditional register files to reference 32-bit or
+// 64-bit floating point values. These sizes apply to the values, not the
+// registers, which are always 64 bits; RFP32 and RFP64 can be copied to
+// each other without losing information. These instructions are all pseudo
+// instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of 32-bit and
+// 64-bit registers.
+// The second instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 64-bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// FPI - Floating Point Instruction template (real, encodable instruction).
+class FPI<bits<8> o, Format F, dag ops, string asm> : I<o, F, ops, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag ops, FPFormat fp, list<dag> pattern>
+  : X86Inst<0, Pseudo, NoImm, ops, ""> {
+  let FPForm = fp; let FPFormBits = FPForm.Value;
+  let Pattern = pattern;
+}
+
+// Random Pseudo Instructions.
+def FpGETRESULT32 : FpI_<(ops RFP32:$dst), SpecialFP,
+                         [(set RFP32:$dst, X86fpget)]>;  // FPR = ST(0)
+
+def FpGETRESULT64 : FpI_<(ops RFP64:$dst), SpecialFP,
+                         [(set RFP64:$dst, X86fpget)]>;  // FPR = ST(0)
+
+let noResults = 1 in {
+  def FpSETRESULT32 : FpI_<(ops RFP32:$src), SpecialFP,
+                           [(X86fpset RFP32:$src)]>, Imp<[], [ST0]>; // ST(0) = FPR
+
+  def FpSETRESULT64 : FpI_<(ops RFP64:$src), SpecialFP,
+                           [(X86fpset RFP64:$src)]>, Imp<[], [ST0]>; // ST(0) = FPR
+}
+// FpI - Floating Point Pseudo Instruction template. Predicated on FPStack.
+class FpI<dag ops, FPFormat fp, list<dag> pattern> :
+  FpI_<ops, fp, pattern>, Requires<[FPStack]>;
+
+// Register copies. Just copies, the 64->32 version does not truncate.
+def MOV_Fp3232 : FpI<(ops RFP32:$dst, RFP32:$src), SpecialFP, []>;
+def MOV_Fp3264 : FpI<(ops RFP64:$dst, RFP32:$src), SpecialFP, []>;
+def MOV_Fp6432 : FpI<(ops RFP32:$dst, RFP64:$src), SpecialFP, []>;
+def MOV_Fp6464 : FpI<(ops RFP64:$dst, RFP64:$src), SpecialFP, []>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), TwoArgFP,
+                [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), TwoArgFP,
+                [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m  : FpI<(ops RFP32:$dst, RFP32:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP32:$dst,
+                    (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m  : FpI<(ops RFP64:$dst, RFP64:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst,
+                    (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpI<(ops RFP64:$dst, RFP64:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst,
+                    (OpNode RFP64:$src1, (extloadf32 addr:$src2)))]>;
+def _F32m : FPI<0xD8, fp, (ops f32mem:$src),
+                !strconcat("f", !strconcat(asmstring, "{s} $src"))>;
+def _F64m : FPI<0xDC, fp, (ops f64mem:$src),
+                !strconcat("f", !strconcat(asmstring, "{l} $src"))>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i16mem:$src2), OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i16mem:$src2), OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FI16m : FPI<0xDE, fp, (ops i16mem:$src),
+                 !strconcat("fi", !strconcat(asmstring, "{s} $src"))>;
+def _FI32m : FPI<0xDA, fp, (ops i32mem:$src),
+                 !strconcat("fi", !strconcat(asmstring, "{l} $src"))>;
+}
+
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+// Templates for the register-form arithmetic: opcode group D8 operates
+// ST(0) op= ST(i); DC operates ST(i) op= ST(0); DE additionally pops.
+class FPST0rInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r   : FPST0rInst <0xC0, "fadd $op">;
+def ADD_FrST0   : FPrST0Inst <0xC0, "fadd {%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0  : FPrST0PInst<0xC0, "faddp $op">;
+def SUBR_FST0r  : FPST0rInst <0xE8, "fsubr $op">;
+def SUB_FrST0   : FPrST0Inst <0xE8, "fsub{r} {%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0  : FPrST0PInst<0xE8, "fsub{r}p $op">;
+def SUB_FST0r   : FPST0rInst <0xE0, "fsub $op">;
+def SUBR_FrST0  : FPrST0Inst <0xE0, "fsub{|r} {%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p $op">;
+def MUL_FST0r   : FPST0rInst <0xC8, "fmul $op">;
+def MUL_FrST0   : FPrST0Inst <0xC8, "fmul {%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0  : FPrST0PInst<0xC8, "fmulp $op">;
+def DIVR_FST0r  : FPST0rInst <0xF8, "fdivr $op">;
+def DIV_FrST0   : FPrST0Inst <0xF8, "fdiv{r} {%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0  : FPrST0PInst<0xF8, "fdiv{r}p $op">;
+def DIV_FST0r   : FPST0rInst <0xF0, "fdiv $op">;
+def DIVR_FrST0  : FPrST0Inst <0xF0, "fdiv{|r} {%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p $op">;
+
+// Unary operations.
+// Each unary op gets 32/64-bit pseudo forms plus one real D9-group opcode.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src), OneArgFPRW,
+                [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src), OneArgFPRW,
+                [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _F : FPI<opcode, RawFrm, (ops), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+// ftst: compare ST(0) against 0.0 (pseudo forms carry no pattern).
+def TST_Fp32 : FpI<(ops RFP32:$src), OneArgFP,
+                   []>;
+def TST_Fp64 : FpI<(ops RFP64:$src), OneArgFP,
+                   []>;
+def TST_F : FPI<0xE4, RawFrm, (ops), "ftst">, D9;
+
+// Floating point cmovs.
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), CondMovFP,
+                  [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                     cc))]>;
+  def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), CondMovFP,
+                  [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                     cc))]>;
+}
+let isTwoAddress = 1 in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F  : FPI<0xC0, AddRegFrm, (ops RST:$op),
+                   "fcmovb {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (ops RST:$op),
+                   "fcmovbe {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F  : FPI<0xC8, AddRegFrm, (ops RST:$op),
+                   "fcmove {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVP_F  : FPI<0xD8, AddRegFrm, (ops RST:$op),
+                   "fcmovu {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (ops RST:$op),
+                   "fcmovnb {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (ops RST:$op),
+                   "fcmovnbe {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (ops RST:$op),
+                   "fcmovne {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (ops RST:$op),
+                   "fcmovnu {$op, %st(0)|%ST(0), $op}">, DB;
+
+// Floating point loads & stores.
+def LD_Fp32m : FpI<(ops RFP32:$dst, f32mem:$src), ZeroArgFP,
+                   [(set RFP32:$dst, (loadf32 addr:$src))]>;
+def LD_Fp64m : FpI<(ops RFP64:$dst, f64mem:$src), ZeroArgFP,
+                   [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def ILD_Fp16m32: FpI<(ops RFP32:$dst, i16mem:$src), ZeroArgFP,
+                     [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpI<(ops RFP32:$dst, i32mem:$src), ZeroArgFP,
+                     [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpI<(ops RFP32:$dst, i64mem:$src), ZeroArgFP,
+                     [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpI<(ops RFP64:$dst, i16mem:$src), ZeroArgFP,
+                     [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpI<(ops RFP64:$dst, i32mem:$src), ZeroArgFP,
+                     [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpI<(ops RFP64:$dst, i64mem:$src), ZeroArgFP,
+                     [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m   : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP,
+                     [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP,
+                     [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m   : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP,
+                     [(store RFP64:$src, addr:$op)]>;
+
+// Store-and-pop / integer-store pseudo variants (no selection patterns;
+// created directly by lowering / the stackifier).
+def ST_FpP32m   : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m   : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m32 : FpI<(ops i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpI<(ops i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpI<(ops i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpI<(ops i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpI<(ops i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpI<(ops i64mem:$op, RFP64:$src), OneArgFP, []>;
+
+def LD_F32m  : FPI<0xD9, MRM0m, (ops f32mem:$src), "fld{s} $src">;
+def LD_F64m  : FPI<0xDD, MRM0m, (ops f64mem:$src), "fld{l} $src">;
+def ILD_F16m : FPI<0xDF, MRM0m, (ops i16mem:$src), "fild{s} $src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (ops i32mem:$src), "fild{l} $src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (ops i64mem:$src), "fild{ll} $src">;
+def ST_F32m  : FPI<0xD9, MRM2m, (ops f32mem:$dst), "fst{s} $dst">;
+def ST_F64m  : FPI<0xDD, MRM2m, (ops f64mem:$dst), "fst{l} $dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (ops f32mem:$dst), "fstp{s} $dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (ops f64mem:$dst), "fstp{l} $dst">;
+def IST_F16m  : FPI<0xDF, MRM2m, (ops i16mem:$dst), "fist{s} $dst">;
+def IST_F32m  : FPI<0xDB, MRM2m, (ops i32mem:$dst), "fist{l} $dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (ops i16mem:$dst), "fistp{s} $dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (ops i32mem:$dst), "fistp{l} $dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (ops i64mem:$dst), "fistp{ll} $dst">;
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+def ISTT_Fp16m32 : FpI_<(ops i16mem:$op, RFP32:$src), OneArgFP,
+                        [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+                   Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(ops i32mem:$op, RFP32:$src), OneArgFP,
+                        [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+                   Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(ops i64mem:$op, RFP32:$src), OneArgFP,
+                        [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+                   Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(ops i16mem:$op, RFP64:$src), OneArgFP,
+                        [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+                   Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(ops i32mem:$op, RFP64:$src), OneArgFP,
+                        [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+                   Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(ops i64mem:$op, RFP64:$src), OneArgFP,
+                        [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+                   Requires<[HasSSE3]>;
+
+def ISTT_FP16m : FPI<0xDF, MRM1m, (ops i16mem:$dst), "fisttp{s} $dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (ops i32mem:$dst), "fisttp{l} $dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (ops i64mem:$dst), "fisttp{ll} $dst">;
+
+// FP Stack manipulation instructions.
+def LD_Frr  : FPI<0xC0, AddRegFrm, (ops RST:$op), "fld $op">, D9;
+def ST_Frr  : FPI<0xD0, AddRegFrm, (ops RST:$op), "fst $op">, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (ops RST:$op), "fstp $op">, DD;
+def XCH_F   : FPI<0xC8, AddRegFrm, (ops RST:$op), "fxch $op">, D9;
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpI<(ops RFP32:$dst), ZeroArgFP,
+                   [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpI<(ops RFP32:$dst), ZeroArgFP,
+                   [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpI<(ops RFP64:$dst), ZeroArgFP,
+                   [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpI<(ops RFP64:$dst), ZeroArgFP,
+                   [(set RFP64:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (ops), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (ops), "fld1">, D9;
+
+
+// Floating point compares.
+def UCOM_Fpr32 : FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+                     []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr32: FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+                     [(X86cmp RFP32:$lhs, RFP32:$rhs)]>; // CC = ST(0) cmp ST(i)
+def UCOM_Fpr64 : FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+                     []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr64: FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+                     [(X86cmp RFP64:$lhs, RFP64:$rhs)]>; // CC = ST(0) cmp ST(i)
+
+def UCOM_Fr   : FPI<0xE0, AddRegFrm,    // FPSW = cmp ST(0) with ST(i)
+                    (ops RST:$reg),
+                    "fucom $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPr  : FPI<0xE8, AddRegFrm,    // FPSW = cmp ST(0) with ST(i), pop
+                    (ops RST:$reg),
+                    "fucomp $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPPr : FPI<0xE9, RawFrm,       // cmp ST(0) with ST(1), pop, pop
+                    (ops),
+                    "fucompp">, DA, Imp<[ST0],[]>;
+
+def UCOM_FIr  : FPI<0xE8, AddRegFrm,    // CC = cmp ST(0) with ST(i)
+                    (ops RST:$reg),
+                    "fucomi {$reg, %st(0)|%ST(0), $reg}">, DB, Imp<[ST0],[]>;
+def UCOM_FIPr : FPI<0xE8, AddRegFrm,    // CC = cmp ST(0) with ST(i), pop
+                    (ops RST:$reg),
+                    "fucomip {$reg, %st(0)|%ST(0), $reg}">, DF, Imp<[ST0],[]>;
+
+// Floating point flag ops.
+def FNSTSW8r  : I<0xE0, RawFrm,                  // AX = fp flags
+                  (ops), "fnstsw", []>, DF, Imp<[],[AX]>;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control word
+                  (ops i16mem:$dst), "fnstcw $dst", []>;
+def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control word = [mem16]
+                  (ops i16mem:$dst), "fldcw $dst", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+
+// Required for CALL which return f32 / f64 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+
+// Floating point constant -0.0 and -1.0: load the positive constant and
+// negate it with fchs.
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStack]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStack]>;
+
+// Used to conv. i64 to f64 since there isn't a SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+def : Pat<(extloadf32 addr:$src),
+          (MOV_Fp3264 (LD_Fp32m addr:$src))>, Requires<[FPStack]>;
+def : Pat<(fextend RFP32:$src), (MOV_Fp3264 RFP32:$src)>, Requires<[FPStack]>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..06b14fe
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,567 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+using namespace llvm;
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+  : TargetInstrInfo(X86Insts, sizeof(X86Insts)/sizeof(X86Insts[0])),
+    TM(tm), RI(tm, *this) {
+}
+
+/// isMoveInstr - If MI is one of the known register-to-register move opcodes
+/// (GPR, x87 pseudo, SSE scalar/packed, or MMX), report its source and
+/// destination registers and return true; otherwise return false.
+bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned& sourceReg,
+                               unsigned& destReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  if (oc == X86::MOV8rr || oc == X86::MOV16rr ||
+      oc == X86::MOV32rr || oc == X86::MOV64rr ||
+      oc == X86::MOV16to16_ || oc == X86::MOV32to32_ ||
+      oc == X86::MOV_Fp3232  || oc == X86::MOVSSrr || oc == X86::MOVSDrr ||
+      oc == X86::MOV_Fp3264 || oc == X86::MOV_Fp6432 || oc == X86::MOV_Fp6464 ||
+      oc == X86::FsMOVAPSrr || oc == X86::FsMOVAPDrr ||
+      oc == X86::MOVAPSrr || oc == X86::MOVAPDrr ||
+      oc == X86::MOVSS2PSrr || oc == X86::MOVSD2PDrr ||
+      oc == X86::MOVPS2SSrr || oc == X86::MOVPD2SDrr ||
+      oc == X86::MMX_MOVD64rr || oc == X86::MMX_MOVQ64rr) {
+    // For all of these forms operand 0 is the destination and 1 the source.
+    assert(MI.getNumOperands() >= 2 &&
+           MI.getOperand(0).isRegister() &&
+           MI.getOperand(1).isRegister() &&
+           "invalid register-register move instruction");
+    sourceReg = MI.getOperand(1).getReg();
+    destReg = MI.getOperand(0).getReg();
+    return true;
+  }
+  return false;
+}
+
+/// isLoadFromStackSlot - If MI is a load whose address is exactly a frame
+/// index (scale 1, no index register, displacement 0), set FrameIndex and
+/// return the destination register; return 0 otherwise.
+/// Memory operand layout is: FI/base(1), scale(2), index(3), disp(4).
+unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI,
+                                           int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV16_rm:
+  case X86::MOV32rm:
+  case X86::MOV32_rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    if (MI->getOperand(1).isFrameIndex() && MI->getOperand(2).isImmediate() &&
+        MI->getOperand(3).isRegister() && MI->getOperand(4).isImmediate() &&
+        MI->getOperand(2).getImmedValue() == 1 &&
+        MI->getOperand(3).getReg() == 0 &&
+        MI->getOperand(4).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - Store analogue of isLoadFromStackSlot. For stores the
+/// memory operands come first (FI(0), scale(1), index(2), disp(3)) and the
+/// stored register is operand 4.
+unsigned X86InstrInfo::isStoreToStackSlot(MachineInstr *MI,
+                                          int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV16_mr:
+  case X86::MOV32mr:
+  case X86::MOV32_mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+    if (MI->getOperand(0).isFrameIndex() && MI->getOperand(1).isImmediate() &&
+        MI->getOperand(2).isRegister() && MI->getOperand(3).isImmediate() &&
+        MI->getOperand(1).getImmedValue() == 1 &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(0).getFrameIndex();
+      return MI->getOperand(4).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+
+/// isReallyTriviallyReMaterializable - A load is trivially rematerializable
+/// only when its address is a constant-pool reference with no base/index
+/// register (the pool contents never change, so re-executing the load is
+/// always safe).
+bool X86InstrInfo::isReallyTriviallyReMaterializable(MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV16_rm:
+  case X86::MOV32rm:
+  case X86::MOV32_rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    // Loads from constant pools are trivially rematerializable.
+    return MI->getOperand(1).isRegister() && MI->getOperand(2).isImmediate() &&
+           MI->getOperand(3).isRegister() && MI->getOperand(4).isConstantPoolIndex() &&
+           MI->getOperand(1).getReg() == 0 &&
+           MI->getOperand(2).getImmedValue() == 1 &&
+           MI->getOperand(3).getReg() == 0;
+  }
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    LiveVariables &LV) const {
+  MachineInstr *MI = MBBI;
+  // All instructions input are two-addr instructions. Get the known operands.
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+
+  MachineInstr *NewMI = NULL;
+  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  bool DisableLEA16 = true;
+
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case X86::SHUFPSrri: {
+    // shufps with identical inputs is a pshufd (SSE2) -- a true
+    // three-address shuffle.
+    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+    unsigned A = MI->getOperand(0).getReg();
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    unsigned M = MI->getOperand(3).getImm();
+    if (B != C) return 0;
+    NewMI = BuildMI(get(X86::PSHUFDri), A).addReg(B).addImm(M);
+    break;
+  }
+  case X86::SHL64ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    // LEA scale encodes only shifts of 1..3 (scale 2/4/8).
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    NewMI = BuildMI(get(X86::LEA64r), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  case X86::SHL32ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
+      X86::LEA64_32r : X86::LEA32r;
+    NewMI = BuildMI(get(Opc), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  case X86::SHL16ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    if (DisableLEA16) return 0;
+
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    NewMI = BuildMI(get(X86::LEA16r), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  }
+
+  // FIXME: None of these instructions are promotable to LEAs without
+  // additional information. In particular, LEA doesn't set the flags that
+  // add and inc do. :(
+  // (The entire switch below is intentionally disabled via `if (0)`.)
+  if (0)
+  switch (MI->getOpcode()) {
+  case X86::INC32r:
+  case X86::INC64_32r:
+    assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, 1);
+    break;
+  case X86::INC16r:
+  case X86::INC64_16r:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, 1);
+    break;
+  case X86::DEC32r:
+  case X86::DEC64_32r:
+    assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, -1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, -1);
+    break;
+  case X86::ADD32rr:
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    NewMI = addRegReg(BuildMI(get(X86::LEA32r), Dest), Src,
+                      MI->getOperand(2).getReg());
+    break;
+  case X86::ADD16rr:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    NewMI = addRegReg(BuildMI(get(X86::LEA16r), Dest), Src,
+                      MI->getOperand(2).getReg());
+    break;
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    if (MI->getOperand(2).isImmediate())
+      NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src,
+                           MI->getOperand(2).getImmedValue());
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    if (MI->getOperand(2).isImmediate())
+      NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src,
+                           MI->getOperand(2).getImmedValue());
+    break;
+  case X86::SHL16ri:
+    if (DisableLEA16) return 0;
+  case X86::SHL32ri:
+    assert(MI->getNumOperands() == 3 && MI->getOperand(2).isImmediate() &&
+           "Unknown shl instruction!");
+    unsigned ShAmt = MI->getOperand(2).getImmedValue();
+    if (ShAmt == 1 || ShAmt == 2 || ShAmt == 3) {
+      X86AddressMode AM;
+      AM.Scale = 1 << ShAmt;
+      AM.IndexReg = Src;
+      unsigned Opc = MI->getOpcode() == X86::SHL32ri ? X86::LEA32r :X86::LEA16r;
+      NewMI = addFullAddress(BuildMI(get(Opc), Dest), AM);
+    }
+    break;
+  }
+
+  if (NewMI) {
+    NewMI->copyKillDeadInfo(MI);
+    LV.instructionChanged(MI, NewMI);  // Update live variables
+    MFI->insert(MBBI, NewMI);          // Insert the new inst
+  }
+  return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *X86InstrInfo::commuteInstruction(MachineInstr *MI) const {
+  // FIXME: Can commute cmoves by changing the condition!
+ switch (MI->getOpcode()) { + case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) + case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) + case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) + case X86::SHLD32rri8:{// A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) + unsigned Opc; + unsigned Size; + switch (MI->getOpcode()) { + default: assert(0 && "Unreachable!"); + case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; + case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; + case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break; + case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break; + } + unsigned Amt = MI->getOperand(3).getImmedValue(); + unsigned A = MI->getOperand(0).getReg(); + unsigned B = MI->getOperand(1).getReg(); + unsigned C = MI->getOperand(2).getReg(); + bool BisKill = MI->getOperand(1).isKill(); + bool CisKill = MI->getOperand(2).isKill(); + return BuildMI(get(Opc), A).addReg(C, false, false, CisKill) + .addReg(B, false, false, BisKill).addImm(Size-Amt); + } + default: + return TargetInstrInfo::commuteInstruction(MI); + } +} + +static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) { + switch (BrOpc) { + default: return X86::COND_INVALID; + case X86::JE: return X86::COND_E; + case X86::JNE: return X86::COND_NE; + case X86::JL: return X86::COND_L; + case X86::JLE: return X86::COND_LE; + case X86::JG: return X86::COND_G; + case X86::JGE: return X86::COND_GE; + case X86::JB: return X86::COND_B; + case X86::JBE: return X86::COND_BE; + case X86::JA: return X86::COND_A; + case X86::JAE: return X86::COND_AE; + case X86::JS: return X86::COND_S; + case X86::JNS: return X86::COND_NS; + case X86::JP: return X86::COND_P; + case X86::JNP: return X86::COND_NP; + case X86::JO: return X86::COND_O; + case X86::JNO: return X86::COND_NO; + } +} + +unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { + switch (CC) { + default: assert(0 && 
"Illegal condition code!"); + case X86::COND_E: return X86::JE; + case X86::COND_NE: return X86::JNE; + case X86::COND_L: return X86::JL; + case X86::COND_LE: return X86::JLE; + case X86::COND_G: return X86::JG; + case X86::COND_GE: return X86::JGE; + case X86::COND_B: return X86::JB; + case X86::COND_BE: return X86::JBE; + case X86::COND_A: return X86::JA; + case X86::COND_AE: return X86::JAE; + case X86::COND_S: return X86::JS; + case X86::COND_NS: return X86::JNS; + case X86::COND_P: return X86::JP; + case X86::COND_NP: return X86::JNP; + case X86::COND_O: return X86::JO; + case X86::COND_NO: return X86::JNO; + } +} + +/// GetOppositeBranchCondition - Return the inverse of the specified condition, +/// e.g. turning COND_E to COND_NE. +X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { + switch (CC) { + default: assert(0 && "Illegal condition code!"); + case X86::COND_E: return X86::COND_NE; + case X86::COND_NE: return X86::COND_E; + case X86::COND_L: return X86::COND_GE; + case X86::COND_LE: return X86::COND_G; + case X86::COND_G: return X86::COND_LE; + case X86::COND_GE: return X86::COND_L; + case X86::COND_B: return X86::COND_AE; + case X86::COND_BE: return X86::COND_A; + case X86::COND_A: return X86::COND_BE; + case X86::COND_AE: return X86::COND_B; + case X86::COND_S: return X86::COND_NS; + case X86::COND_NS: return X86::COND_S; + case X86::COND_P: return X86::COND_NP; + case X86::COND_NP: return X86::COND_P; + case X86::COND_O: return X86::COND_NO; + case X86::COND_NO: return X86::COND_O; + } +} + +// For purposes of branch analysis do not count FP_REG_KILL as a terminator. +bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + if (MI->getOpcode() == X86::FP_REG_KILL) + return false; + + const TargetInstrDescriptor *TID = MI->getInstrDescriptor(); + if (TID->Flags & M_TERMINATOR_FLAG) { + // Conditional branch is a special case. 
    // A conditional branch (branch but not barrier) is always a terminator.
    if ((TID->Flags & M_BRANCH_FLAG) != 0 && (TID->Flags & M_BARRIER_FLAG) == 0)
      return true;
    // Non-predicable terminators are unconditionally terminators.
    if ((TID->Flags & M_PREDICABLE) == 0)
      return true;
    // A predicable terminator only counts if it has not been predicated.
    return !isPredicated(MI);
  }
  return false;
}

/// AnalyzeBranch - Examine the terminators of MBB.  On success returns false
/// and fills in TBB (taken block), FBB (fall-through block for a two-way
/// branch), and Cond (one immediate operand holding the X86::CondCode for a
/// conditional branch, empty for an unconditional one).  Returns true when
/// the terminators cannot be understood (e.g. an indirect branch or more
/// than two branch terminators).
bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                 MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 std::vector<MachineOperand> &Cond) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = I;

  // If there is only one terminator instruction, process it.
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
    if (!isBranch(LastInst->getOpcode()))
      return true;

    // If the block ends with a branch there are 3 possibilities:
    // it's an unconditional, conditional, or indirect branch.

    if (LastInst->getOpcode() == X86::JMP) {
      TBB = LastInst->getOperand(0).getMachineBasicBlock();
      return false;
    }
    X86::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
    if (BranchCode == X86::COND_INVALID)
      return true;  // Can't handle indirect branch.

    // Otherwise, block ends with fall-through condbranch.
    TBB = LastInst->getOperand(0).getMachineBasicBlock();
    Cond.push_back(MachineOperand::CreateImm(BranchCode));
    return false;
  }

  // Get the instruction before it if it's a terminator.
  MachineInstr *SecondLastInst = I;

  // If there are three terminators, we don't know what sort of block this is.
  // NOTE(review): SecondLastInst is never null here, so the first clause of
  // this test is redundant — confirm before simplifying.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
    return true;

  // If the block ends with X86::JMP and a conditional branch, handle it.
  X86::CondCode BranchCode = GetCondFromBranchOpc(SecondLastInst->getOpcode());
  if (BranchCode != X86::COND_INVALID && LastInst->getOpcode() == X86::JMP) {
    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
    Cond.push_back(MachineOperand::CreateImm(BranchCode));
    FBB = LastInst->getOperand(0).getMachineBasicBlock();
    return false;
  }

  // If the block ends with two X86::JMPs, handle it.  The second one is not
  // executed, so remove it.
  if (SecondLastInst->getOpcode() == X86::JMP &&
      LastInst->getOpcode() == X86::JMP) {
    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
    I = LastInst;
    I->eraseFromParent();
    return false;
  }

  // Otherwise, can't handle this.
  return true;
}

/// RemoveBranch - Delete up to two branch instructions from the end of MBB
/// (an unconditional JMP and/or a conditional branch) and return the number
/// removed.
unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin()) return 0;
  --I;
  // The last instruction must be JMP or a recognized conditional branch.
  if (I->getOpcode() != X86::JMP &&
      GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) return 1;
  --I;
  // A conditional branch may precede the one just removed; delete it too.
  if (GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
    return 1;

  // Remove the branch.
  I->eraseFromParent();
  return 2;
}

/// InsertBranch - Insert an unconditional (empty Cond) or conditional branch
/// to TBB, plus an optional JMP to FBB for a two-way branch.  Returns the
/// number of instructions inserted.
unsigned
X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                           MachineBasicBlock *FBB,
                           const std::vector<MachineOperand> &Cond) const {
  // Shouldn't be a fall through.
  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
  assert((Cond.size() == 1 || Cond.size() == 0) &&
         "X86 branch conditions have one component!");

  if (FBB == 0) { // One way branch.
    if (Cond.empty()) {
      // Unconditional branch?
      BuildMI(&MBB, get(X86::JMP)).addMBB(TBB);
    } else {
      // Conditional branch.
      unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
      BuildMI(&MBB, get(Opc)).addMBB(TBB);
    }
    return 1;
  }

  // Two-way Conditional branch.
+ unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm()); + BuildMI(&MBB, get(Opc)).addMBB(TBB); + BuildMI(&MBB, get(X86::JMP)).addMBB(FBB); + return 2; +} + +bool X86InstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const { + if (MBB.empty()) return false; + + switch (MBB.back().getOpcode()) { + case X86::RET: // Return. + case X86::RETI: + case X86::TAILJMPd: + case X86::TAILJMPr: + case X86::TAILJMPm: + case X86::JMP: // Uncond branch. + case X86::JMP32r: // Indirect branch. + case X86::JMP32m: // Indirect branch through mem. + return true; + default: return false; + } +} + +bool X86InstrInfo:: +ReverseBranchCondition(std::vector<MachineOperand> &Cond) const { + assert(Cond.size() == 1 && "Invalid X86 branch condition!"); + Cond[0].setImm(GetOppositeBranchCondition((X86::CondCode)Cond[0].getImm())); + return false; +} + +const TargetRegisterClass *X86InstrInfo::getPointerRegClass() const { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + if (Subtarget->is64Bit()) + return &X86::GR64RegClass; + else + return &X86::GR32RegClass; +} diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h new file mode 100644 index 0000000..ec30cc7 --- /dev/null +++ b/lib/Target/X86/X86InstrInfo.h @@ -0,0 +1,287 @@ +//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. 
//
//===----------------------------------------------------------------------===//

#ifndef X86INSTRUCTIONINFO_H
#define X86INSTRUCTIONINFO_H

#include "llvm/Target/TargetInstrInfo.h"
#include "X86RegisterInfo.h"

namespace llvm {
  class X86RegisterInfo;
  class X86TargetMachine;

namespace X86 {
  // X86 specific condition code. These correspond to X86_*_COND in
  // X86InstrInfo.td. They must be kept in synch.
  enum CondCode {
    COND_A = 0,
    COND_AE = 1,
    COND_B = 2,
    COND_BE = 3,
    COND_E = 4,
    COND_G = 5,
    COND_GE = 6,
    COND_L = 7,
    COND_LE = 8,
    COND_NE = 9,
    COND_NO = 10,
    COND_NP = 11,
    COND_NS = 12,
    COND_O = 13,
    COND_P = 14,
    COND_S = 15,
    // Sentinel value: not a real condition code.
    COND_INVALID
  };

  // Turn condition code into conditional branch opcode.
  unsigned GetCondBranchFromCond(CondCode CC);

  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
  /// e.g. turning COND_E to COND_NE.
  CondCode GetOppositeBranchCondition(X86::CondCode CC);

}

/// X86II - This namespace holds all of the target specific flags that
/// instruction info tracks.
///
namespace X86II {
  enum {
    //===------------------------------------------------------------------===//
    // Instruction types.  These are the standard/most common forms for X86
    // instructions.
    //

    // PseudoFrm - This represents an instruction that is a pseudo instruction
    // or one that has not been implemented yet.  It is illegal to code generate
    // it, but tolerated for intermediate implementation stages.
    Pseudo = 0,

    /// Raw - This form is for instructions that don't have any operands, so
    /// they are just a fixed opcode value, like 'leave'.
    RawFrm = 1,

    /// AddRegFrm - This form is used for instructions like 'push r32' that have
    /// their one register operand added to their opcode.
    AddRegFrm = 2,

    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
    /// to specify a destination, which in this case is a register.
    ///
    MRMDestReg = 3,

    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
    /// to specify a destination, which in this case is memory.
    ///
    MRMDestMem = 4,

    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
    /// to specify a source, which in this case is a register.
    ///
    MRMSrcReg = 5,

    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
    /// to specify a source, which in this case is memory.
    ///
    MRMSrcMem = 6,

    /// MRM[0-7][rm] - These forms are used to represent instructions that use
    /// a Mod/RM byte, and use the middle field to hold extended opcode
    /// information.  In the intel manual these are represented as /0, /1, ...
    ///

    // First, instructions that operate on a register r/m operand...
    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7

    // Next, instructions that operate on a memory r/m operand...
    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7

    // MRMInitReg - This form is used for instructions whose source and
    // destinations are the same register.
    MRMInitReg = 32,

    FormMask = 63,

    //===------------------------------------------------------------------===//
    // Actual flags...

    // OpSize - Set if this instruction requires an operand size prefix (0x66),
    // which most often indicates that the instruction operates on 16 bit data
    // instead of 32 bit data.
    OpSize = 1 << 6,

    // AdSize - Set if this instruction requires an address size prefix (0x67),
    // which most often indicates that the instruction uses a 16 bit address
    // instead of a 32 bit address (or a 32 bit address in 64 bit mode).
    AdSize = 1 << 7,

    //===------------------------------------------------------------------===//
    // Op0Mask - There are several prefix bytes that are used to form two byte
    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
    // used to obtain the setting of this field.  If no bits in this field is
    // set, there is no prefix byte for obtaining a multibyte opcode.
    //
    Op0Shift = 8,
    Op0Mask = 0xF << Op0Shift,

    // TB - TwoByte - Set if this instruction has a two byte opcode, which
    // starts with a 0x0F byte before the real opcode.
    TB = 1 << Op0Shift,

    // REP - The 0xF3 prefix byte indicating repetition of the following
    // instruction.
    REP = 2 << Op0Shift,

    // D8-DF - These escape opcodes are used by the floating point unit.  These
    // values must remain sequential.
    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,

    // XS, XD - These prefix codes are for single and double precision scalar
    // floating point operations performed in the SSE registers.
    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,

    // T8, TA - Prefix after the 0x0F prefix.
    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,

    //===------------------------------------------------------------------===//
    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
    // They are used to specify GPRs and SSE registers, 64-bit operand size,
    // etc. We only care about REX.W and REX.R bits and only the former is
    // statically determined.
    //
    REXShift = 12,
    REX_W = 1 << REXShift,

    //===------------------------------------------------------------------===//
    // This three-bit field describes the size of an immediate operand.  Zero is
    // unused so that we can tell if we forgot to set a value.
    ImmShift = 13,
    ImmMask = 7 << ImmShift,
    Imm8 = 1 << ImmShift,
    Imm16 = 2 << ImmShift,
    Imm32 = 3 << ImmShift,
    Imm64 = 4 << ImmShift,

    //===------------------------------------------------------------------===//
    // FP Instruction Classification...  Zero is non-fp instruction.

    // FPTypeMask - Mask for all of the FP types...
    FPTypeShift = 16,
    FPTypeMask = 7 << FPTypeShift,

    // NotFP - The default, set for instructions that do not use FP registers.
    NotFP = 0 << FPTypeShift,

    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
    ZeroArgFP = 1 << FPTypeShift,

    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
    OneArgFP = 2 << FPTypeShift,

    // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
    // result back to ST(0).  For example, fcos, fsqrt, etc.
    //
    OneArgFPRW = 3 << FPTypeShift,

    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
    // explicit argument, storing the result to either ST(0) or the implicit
    // argument.  For example: fadd, fsub, fmul, etc...
    TwoArgFP = 4 << FPTypeShift,

    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
    CompareFP = 5 << FPTypeShift,

    // CondMovFP - "2 operand" floating point conditional move instructions.
    CondMovFP = 6 << FPTypeShift,

    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
    SpecialFP = 7 << FPTypeShift,

    // Bits 19 -> 23 are unused
    OpcodeShift = 24,
    OpcodeMask = 0xFF << OpcodeShift
  };
}

class X86InstrInfo : public TargetInstrInfo {
  X86TargetMachine &TM;
  const X86RegisterInfo RI;
public:
  X86InstrInfo(X86TargetMachine &tm);

  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
  /// such, whenever a client has an instance of instruction info, it should
  /// always be able to get register info as well (through this method).
  ///
  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }

  // Return true if the instruction is a register to register move and
  // leave the source and dest operands in the passed parameters.
  //
  bool isMoveInstr(const MachineInstr& MI, unsigned& sourceReg,
                   unsigned& destReg) const;
  unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
  unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
  bool isReallyTriviallyReMaterializable(MachineInstr *MI) const;

  /// convertToThreeAddress - This method must be implemented by targets that
  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
  /// may be able to convert a two-address instruction into a true
  /// three-address instruction on demand.  This allows the X86 target (for
  /// example) to convert ADD and SHL instructions into LEA instructions if they
  /// would require register copies due to two-addressness.
  ///
  /// This method returns a null pointer if the transformation cannot be
  /// performed, otherwise it returns the new instruction.
  ///
  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
                                              MachineBasicBlock::iterator &MBBI,
                                              LiveVariables &LV) const;

  /// commuteInstruction - We have a few instructions that must be hacked on to
  /// commute them.
  ///
  virtual MachineInstr *commuteInstruction(MachineInstr *MI) const;

  // Branch analysis.
  virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                             MachineBasicBlock *&FBB,
                             std::vector<MachineOperand> &Cond) const;
  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                MachineBasicBlock *FBB,
                                const std::vector<MachineOperand> &Cond) const;
  virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
  virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;

  const TargetRegisterClass *getPointerRegClass() const;

  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
  // specified opcode number.
  //
  unsigned char getBaseOpcodeFor(const TargetInstrDescriptor *TID) const {
    return TID->TSFlags >> X86II::OpcodeShift;
  }
};

} // End llvm namespace

#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..b24f644
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2674 @@
//===- X86InstrInfo.td - Describe the X86 Instruction Set -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file was developed by the LLVM research group and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 instruction set, defining the instructions, and
// properties of the instructions which are needed for code generation, machine
// code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// X86 specific DAG Nodes.
//

// Type profiles: constrain the result/operand types of the custom SDNodes
// declared below.
def SDTIntShiftDOp: SDTypeProfile<1, 3,
                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
                                   SDTCisInt<0>, SDTCisInt<3>]>;

def SDTX86CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;

def SDTX86Cmov    : SDTypeProfile<1, 3,
                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
                                   SDTCisVT<3, i8>]>;

def SDTX86BrCond  : SDTypeProfile<0, 2,
                                  [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;

def SDTX86SetCC   : SDTypeProfile<1, 1,
                                  [SDTCisVT<0, i8>, SDTCisVT<1, i8>]>;

def SDTX86Ret     : SDTypeProfile<0, 1, [SDTCisVT<0, i16>]>;

def SDT_X86CallSeqStart : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
def SDT_X86CallSeqEnd   : SDTypeProfile<0, 2, [ SDTCisVT<0, i32>,
                                                SDTCisVT<1, i32> ]>;

def SDT_X86Call   : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;

def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;

def SDTX86RdTsc   : SDTypeProfile<0, 0, []>;

def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;

def SDT_X86TLSADDR : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisInt<1>]>;

def SDT_X86TLSTP : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;

def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

// Custom X86 SelectionDAG nodes (lowered from X86ISD::* in ISel lowering).
def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;

def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest,
                        [SDNPHasChain, SDNPOutFlag]>;

def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov,
                        [SDNPInFlag, SDNPOutFlag]>;
def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
                        [SDNPHasChain, SDNPInFlag]>;
def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC,
                        [SDNPInFlag, SDNPOutFlag]>;

def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
                        [SDNPHasChain, SDNPOptInFlag]>;

def X86callseq_start :
                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
                        [SDNPHasChain, SDNPOutFlag]>;
def X86callseq_end :
                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;

def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
                        [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;

def X86tailcall: SDNode<"X86ISD::TAILCALL",     SDT_X86Call,
                        [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;

def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;

def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG",SDTX86RdTsc,
                        [SDNPHasChain, SDNPOutFlag]>;

def X86Wrapper    : SDNode<"X86ISD::Wrapper",    SDTX86Wrapper>;
def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;

def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
def X86TLStp : SDNode<"X86ISD::THREAD_POINTER", SDT_X86TLSTP, []>;

def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
                      [SDNPHasChain]>;


//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//

// *mem - Operand definitions for the funky X86 addressing mode operands.
// The four sub-operands are: base reg, scale imm, index reg, displacement imm.
//
class X86MemOperand<string printMethod> : Operand<iPTR> {
  let PrintMethod = printMethod;
  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
}

def i8mem   : X86MemOperand<"printi8mem">;
def i16mem  : X86MemOperand<"printi16mem">;
def i32mem  : X86MemOperand<"printi32mem">;
def i64mem  : X86MemOperand<"printi64mem">;
def i128mem : X86MemOperand<"printi128mem">;
def f32mem  : X86MemOperand<"printf32mem">;
def f64mem  : X86MemOperand<"printf64mem">;
def f128mem : X86MemOperand<"printf128mem">;

def lea32mem : Operand<i32> {
  let PrintMethod = "printi32mem";
  let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
}

def SSECC : Operand<i8> {
  let PrintMethod = "printSSECC";
}

def piclabel: Operand<i32> {
  let PrintMethod = "printPICLabel";
}

// A couple of more descriptive operand definitions.
// 16-bits but only 8 bits are significant.
def i16i8imm  : Operand<i16>;
// 32-bits but only 8 bits are significant.
def i32i8imm  : Operand<i32>;

// Branch targets have OtherVT type.
def brtarget : Operand<OtherVT>;

//===----------------------------------------------------------------------===//
// X86 Complex Pattern Definitions.
//

// Define X86 specific addressing mode.
def addr      : ComplexPattern<iPTR, 4, "SelectAddr", [], []>;
def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
                               [add, mul, shl, or, frameindex], []>;

//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
//

// Format specifies the encoding used by the instruction.  This is part of the
// ad-hoc solution used to emit machine instruction encodings by our machine
// code emitter.
// NOTE: these values must stay in synch with the X86II enum in X86InstrInfo.h.
class Format<bits<6> val> {
  bits<6> Value = val;
}

def Pseudo     : Format<0>; def RawFrm     : Format<1>;
def AddRegFrm  : Format<2>; def MRMDestReg : Format<3>;
def MRMDestMem : Format<4>; def MRMSrcReg  : Format<5>;
def MRMSrcMem  : Format<6>;
def MRM0r  : Format<16>; def MRM1r  : Format<17>; def MRM2r  : Format<18>;
def MRM3r  : Format<19>; def MRM4r  : Format<20>; def MRM5r  : Format<21>;
def MRM6r  : Format<22>; def MRM7r  : Format<23>;
def MRM0m  : Format<24>; def MRM1m  : Format<25>; def MRM2m  : Format<26>;
def MRM3m  : Format<27>; def MRM4m  : Format<28>; def MRM5m  : Format<29>;
def MRM6m  : Format<30>; def MRM7m  : Format<31>;
def MRMInitReg : Format<32>;

//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
// Subtarget/codegen-mode predicates used to gate instruction patterns.
def HasMMX       : Predicate<"Subtarget->hasMMX()">;
def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
def FPStack      : Predicate<"!Subtarget->hasSSE2()">;
def In32BitMode  : Predicate<"!Subtarget->is64Bit()">;
def In64BitMode  : Predicate<"Subtarget->is64Bit()">;
def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;

//===----------------------------------------------------------------------===//
// X86 specific pattern fragments.
//

// ImmType - This specifies the immediate type used by an instruction. This is
// part of the ad-hoc solution used to emit machine instruction encodings by our
// machine code emitter.
// NOTE: these values must stay in synch with the Imm* flags in X86InstrInfo.h.
class ImmType<bits<3> val> {
  bits<3> Value = val;
}
def NoImm  : ImmType<0>;
def Imm8   : ImmType<1>;
def Imm16  : ImmType<2>;
def Imm32  : ImmType<3>;
def Imm64  : ImmType<4>;

// FPFormat - This specifies what form this FP instruction has.  This is used by
// the Floating-Point stackifier pass.
class FPFormat<bits<3> val> {
  bits<3> Value = val;
}
def NotFP      : FPFormat<0>;
def ZeroArgFP  : FPFormat<1>;
def OneArgFP   : FPFormat<2>;
def OneArgFPRW : FPFormat<3>;
def TwoArgFP   : FPFormat<4>;
def CompareFP  : FPFormat<5>;
def CondMovFP  : FPFormat<6>;
def SpecialFP  : FPFormat<7>;


// X86Inst - Common base class for all X86 instruction definitions; carries
// the opcode byte, encoding format, and immediate size.
class X86Inst<bits<8> opcod, Format f, ImmType i, dag ops, string AsmStr>
  : Instruction {
  let Namespace = "X86";

  bits<8> Opcode = opcod;
  Format Form = f;
  bits<6> FormBits = Form.Value;
  ImmType ImmT = i;
  bits<3> ImmTypeBits = ImmT.Value;

  dag OperandList = ops;
  string AsmString = AsmStr;

  //
  // Attributes specific to X86 instructions...
  //
  bit hasOpSizePrefix = 0;  // Does this inst have a 0x66 prefix?
  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?

  bits<4> Prefix = 0;       // Which prefix byte does this inst have?
  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
  FPFormat FPForm;          // What flavor of FP instruction is this?
  bits<3> FPFormBits = 0;
}


// Prefix byte classes which are used to indicate to the ad-hoc machine code
// emitter that various prefix bytes are required.
class OpSize { bit hasOpSizePrefix = 1; }
class AdSize { bit hasAdSizePrefix = 1; }
class REX_W  { bit hasREX_WPrefix = 1; }
class TB     { bits<4> Prefix = 1; }
class REP    { bits<4> Prefix = 2; }
class D8     { bits<4> Prefix = 3; }
class D9     { bits<4> Prefix = 4; }
class DA     { bits<4> Prefix = 5; }
class DB     { bits<4> Prefix = 6; }
class DC     { bits<4> Prefix = 7; }
class DD     { bits<4> Prefix = 8; }
class DE     { bits<4> Prefix = 9; }
class DF     { bits<4> Prefix = 10; }
class XD     { bits<4> Prefix = 11; }
class XS     { bits<4> Prefix = 12; }
class T8     { bits<4> Prefix = 13; }
class TA     { bits<4> Prefix = 14; }


//===----------------------------------------------------------------------===//
// Pattern fragments...
//

// X86 specific condition code. These correspond to CondCode in
// X86InstrInfo.h. They must be kept in synch.
def X86_COND_A   : PatLeaf<(i8 0)>;
def X86_COND_AE  : PatLeaf<(i8 1)>;
def X86_COND_B   : PatLeaf<(i8 2)>;
def X86_COND_BE  : PatLeaf<(i8 3)>;
def X86_COND_E   : PatLeaf<(i8 4)>;
def X86_COND_G   : PatLeaf<(i8 5)>;
def X86_COND_GE  : PatLeaf<(i8 6)>;
def X86_COND_L   : PatLeaf<(i8 7)>;
def X86_COND_LE  : PatLeaf<(i8 8)>;
def X86_COND_NE  : PatLeaf<(i8 9)>;
def X86_COND_NO  : PatLeaf<(i8 10)>;
def X86_COND_NP  : PatLeaf<(i8 11)>;
def X86_COND_NS  : PatLeaf<(i8 12)>;
def X86_COND_O   : PatLeaf<(i8 13)>;
def X86_COND_P   : PatLeaf<(i8 14)>;
def X86_COND_S   : PatLeaf<(i8 15)>;

def i16immSExt8  : PatLeaf<(i16 imm), [{
  // i16immSExt8 predicate - True if the 16-bit immediate fits in a 8-bit
  // sign extended field.
  return (int16_t)N->getValue() == (int8_t)N->getValue();
}]>;

def i32immSExt8  : PatLeaf<(i32 imm), [{
  // i32immSExt8 predicate - True if the 32-bit immediate fits in a 8-bit
  // sign extended field.
  return (int32_t)N->getValue() == (int8_t)N->getValue();
}]>;

// Helper fragments for loads.
def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
def loadi16 : PatFrag<(ops node:$ptr), (i16 (load node:$ptr))>;
def loadi32 : PatFrag<(ops node:$ptr), (i32 (load node:$ptr))>;
def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;

def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;

// Extending-load fragments: <kind>load<result-type><memory-type>.
def sextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (sextloadi1 node:$ptr))>;
def sextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (sextloadi1 node:$ptr))>;
def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;

def zextloadi8i1   : PatFrag<(ops node:$ptr), (i8  (zextloadi1 node:$ptr))>;
def zextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
def zextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
def zextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
def zextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;

def extloadi8i1    : PatFrag<(ops node:$ptr), (i8  (extloadi1 node:$ptr))>;
def extloadi16i1   : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
def extloadi32i1   : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
def extloadi16i8   : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction templates...
+//
+
+// I - X86 instruction with no immediate operand.
+class I<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, NoImm, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+// Ii8/Ii16/Ii32 - X86 instruction with an 8-, 16-, or 32-bit immediate.
+class Ii8 <bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm8 , ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm16, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm32, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+def ADJCALLSTACKDOWN : I<0, Pseudo, (ops i32imm:$amt), "#ADJCALLSTACKDOWN",
+                         [(X86callseq_start imm:$amt)]>, Imp<[ESP],[ESP]>;
+def ADJCALLSTACKUP   : I<0, Pseudo, (ops i32imm:$amt1, i32imm:$amt2),
+                         "#ADJCALLSTACKUP",
+                         [(X86callseq_end imm:$amt1, imm:$amt2)]>,
+                         Imp<[ESP],[ESP]>;
+def IMPLICIT_USE     : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_USE", []>;
+def IMPLICIT_DEF     : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_DEF", []>;
+def IMPLICIT_DEF_GR8  : I<0, Pseudo, (ops GR8:$dst),
+                          "#IMPLICIT_DEF $dst",
+                          [(set GR8:$dst, (undef))]>;
+def IMPLICIT_DEF_GR16 : I<0, Pseudo, (ops GR16:$dst),
+                          "#IMPLICIT_DEF $dst",
+                          [(set GR16:$dst, (undef))]>;
+def IMPLICIT_DEF_GR32 : I<0, Pseudo, (ops GR32:$dst),
+                          "#IMPLICIT_DEF $dst",
+                          [(set GR32:$dst, (undef))]>;
+
+// Nop
+def NOOP : I<0x90, RawFrm, (ops), "nop", []>;
+
+// Truncate.  The whole "src, dst / dst, src" operand list is wrapped in a
+// single {att|intel} variant group, so the group's braces must stay balanced
+// (TRUNC_32_to8 was missing the closing '}' of its variant group).
+def TRUNC_32_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32_:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
+def TRUNC_16_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16_:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
+def TRUNC_32to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR32:$src),
+                     "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
+                     [(set GR16:$dst, (trunc GR32:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions...
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, noResults = 1 in {
+  def RET  : I<0xC3, RawFrm, (ops), "ret", [(X86retflag 0)]>;
+  def RETI : Ii16<0xC2, RawFrm, (ops i16imm:$amt), "ret $amt",
+                  [(X86retflag imm:$amt)]>;
+}
+
+// All branches are RawFrm, Void, Branch, and Terminators
+let isBranch = 1, isTerminator = 1, noResults = 1 in
+  class IBr<bits<8> opcode, dag ops, string asm, list<dag> pattern> :
+    I<opcode, RawFrm, ops, asm, pattern>;
+
+// Unconditional direct branch.
+let isBranch = 1, isBarrier = 1 in
+  def JMP : IBr<0xE9, (ops brtarget:$dst), "jmp $dst", [(br bb:$dst)]>;
+
+// Indirect branches (through a register or a memory operand).
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
+  def JMP32r : I<0xFF, MRM4r, (ops GR32:$dst), "jmp{l} {*}$dst",
+                 [(brind GR32:$dst)]>;
+  def JMP32m : I<0xFF, MRM4m, (ops i32mem:$dst), "jmp{l} {*}$dst",
+                 [(brind (loadi32 addr:$dst))]>;
+}
+
+// Conditional branches (Jcc, 0x0F 0x8x opcode space via the TB prefix).
+def JE  : IBr<0x84, (ops brtarget:$dst), "je $dst",
+              [(X86brcond bb:$dst, X86_COND_E)]>, TB;
+def JNE : IBr<0x85, (ops brtarget:$dst), "jne $dst",
+              [(X86brcond bb:$dst, X86_COND_NE)]>, TB;
+def JL  : IBr<0x8C, (ops brtarget:$dst), "jl $dst",
+              [(X86brcond bb:$dst, X86_COND_L)]>, TB;
+def JLE : IBr<0x8E, (ops brtarget:$dst), "jle $dst",
+              [(X86brcond bb:$dst, X86_COND_LE)]>, TB;
+def JG  : IBr<0x8F, (ops brtarget:$dst), "jg $dst",
+              [(X86brcond bb:$dst, X86_COND_G)]>, TB;
+def JGE : IBr<0x8D, (ops brtarget:$dst), "jge $dst",
+              [(X86brcond bb:$dst, X86_COND_GE)]>, TB;
+
+def JB  : IBr<0x82, (ops brtarget:$dst), "jb $dst",
+              [(X86brcond bb:$dst, X86_COND_B)]>, TB;
+def JBE : IBr<0x86, (ops brtarget:$dst), "jbe $dst",
+              [(X86brcond bb:$dst, X86_COND_BE)]>, TB;
+def JA  : IBr<0x87, (ops brtarget:$dst), "ja $dst",
+              [(X86brcond bb:$dst, X86_COND_A)]>, TB;
+def JAE : IBr<0x83, (ops brtarget:$dst), "jae $dst",
+              [(X86brcond bb:$dst, X86_COND_AE)]>, TB;
+
+def JS  : IBr<0x88, (ops brtarget:$dst), "js $dst",
+              [(X86brcond bb:$dst, X86_COND_S)]>, TB;
+def JNS : IBr<0x89, (ops brtarget:$dst), "jns $dst",
+              [(X86brcond bb:$dst, X86_COND_NS)]>, TB;
+def JP  : IBr<0x8A, (ops brtarget:$dst), "jp $dst",
+              [(X86brcond bb:$dst, X86_COND_P)]>, TB;
+def JNP : IBr<0x8B, (ops brtarget:$dst), "jnp $dst",
+              [(X86brcond bb:$dst, X86_COND_NP)]>, TB;
+def JO  : IBr<0x80, (ops brtarget:$dst), "jo $dst",
+              [(X86brcond bb:$dst, X86_COND_O)]>, TB;
+def JNO : IBr<0x81, (ops brtarget:$dst), "jno $dst",
+              [(X86brcond bb:$dst, X86_COND_NO)]>, TB;
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1, noResults = 1 in
+  // All calls clobber the non-callee saved registers...
+  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7] in {
+    def CALLpcrel32 : I<0xE8, RawFrm, (ops i32imm:$dst, variable_ops),
+                        "call ${dst:call}", []>;
+    def CALL32r     : I<0xFF, MRM2r, (ops GR32:$dst, variable_ops),
+                        "call {*}$dst", [(X86call GR32:$dst)]>;
+    def CALL32m     : I<0xFF, MRM2m, (ops i32mem:$dst, variable_ops),
+                        "call {*}$dst", []>;
+  }
+
+// Tail call stuff.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPd : IBr<0xE9, (ops i32imm:$dst), "jmp ${dst:call} # TAIL CALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPr : I<0xFF, MRM4r, (ops GR32:$dst), "jmp {*}$dst # TAIL CALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (ops i32mem:$dst),
+                   "jmp {*}$dst # TAIL CALL", []>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+def LEAVE  : I<0xC9, RawFrm,
+               (ops), "leave", []>, Imp<[EBP,ESP],[EBP,ESP]>;
+def POP32r : I<0x58, AddRegFrm,
+               (ops GR32:$reg), "pop{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+def PUSH32r : I<0x50, AddRegFrm,
+               (ops GR32:$reg), "push{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+// Pseudo used for PIC base computation; prints as a call to a local label.
+def MovePCtoStack : I<0, Pseudo, (ops piclabel:$label),
+                      "call $label", []>;
+
+let isTwoAddress = 1 in  // GR32 = bswap GR32
+  def BSWAP32r : I<0xC8, AddRegFrm,
+                   (ops GR32:$dst, GR32:$src),
+                   "bswap{l} $dst",
+                   [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+def XCHG8rr  : I<0x86, MRMDestReg,                    // xchg GR8, GR8
+                 (ops GR8:$src1, GR8:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rr : I<0x87, MRMDestReg,                    // xchg GR16, GR16
+                 (ops GR16:$src1, GR16:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rr : I<0x87, MRMDestReg,                    // xchg GR32, GR32
+                 (ops GR32:$src1, GR32:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+def XCHG8mr  : I<0x86, MRMDestMem,
+                 (ops i8mem:$src1, GR8:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16mr : I<0x87, MRMDestMem,
+                 (ops i16mem:$src1, GR16:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32mr : I<0x87, MRMDestMem,
+                 (ops i32mem:$src1, GR32:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG8rm  : I<0x86, MRMSrcMem,
+                 (ops GR8:$src1, i8mem:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rm : I<0x87, MRMSrcMem,
+                 (ops GR16:$src1, i16mem:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rm : I<0x87, MRMSrcMem,
+                 (ops GR32:$src1, i32mem:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+// NOTE(review): LEA16r takes an i32mem operand (32-bit addressing with a
+// 16-bit destination) -- looks intentional, confirm against the emitter.
+def LEA16r : I<0x8D, MRMSrcMem,
+               (ops GR16:$dst, i32mem:$src),
+               "lea{w} {$src|$dst}, {$dst|$src}", []>, OpSize;
+def LEA32r : I<0x8D, MRMSrcMem,
+               (ops GR32:$dst, lea32mem:$src),
+               "lea{l} {$src|$dst}, {$dst|$src}",
+               [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+
+// Repeated string operations; count in ECX, pointers in EDI/ESI.
+def REP_MOVSB : I<0xA4, RawFrm, (ops), "{rep;movsb|rep movsb}",
+                  [(X86rep_movs i8)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (ops), "{rep;movsw|rep movsw}",
+                  [(X86rep_movs i16)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (ops), "{rep;movsl|rep movsd}",
+                  [(X86rep_movs i32)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+
+def REP_STOSB : I<0xAA, RawFrm, (ops), "{rep;stosb|rep stosb}",
+                  [(X86rep_stos i8)]>,
+                Imp<[AL,ECX,EDI], [ECX,EDI]>, REP;
+def REP_STOSW : I<0xAB, RawFrm, (ops), "{rep;stosw|rep stosw}",
+                  [(X86rep_stos i16)]>,
+                Imp<[AX,ECX,EDI], [ECX,EDI]>, REP, OpSize;
+def REP_STOSD : I<0xAB, RawFrm, (ops), "{rep;stosl|rep stosd}",
+                  [(X86rep_stos i32)]>,
+                Imp<[EAX,ECX,EDI], [ECX,EDI]>, REP;
+
+// NOTE(review): rdtsc writes EDX:EAX; the def list below says RAX,RDX --
+// confirm this is intended in this (32-bit) section.
+def RDTSC : I<0x31, RawFrm, (ops), "rdtsc", [(X86rdtsc)]>,
+            TB, Imp<[],[RAX,RDX]>;
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions...
+//
+def IN8rr  : I<0xEC, RawFrm, (ops),
+               "in{b} {%dx, %al|%AL, %DX}",
+               []>, Imp<[DX], [AL]>;
+def IN16rr : I<0xED, RawFrm, (ops),
+               "in{w} {%dx, %ax|%AX, %DX}",
+               []>, Imp<[DX], [AX]>, OpSize;
+def IN32rr : I<0xED, RawFrm, (ops),
+               "in{l} {%dx, %eax|%EAX, %DX}",
+               []>, Imp<[DX],[EAX]>;
+
+def IN8ri  : Ii8<0xE4, RawFrm, (ops i16i8imm:$port),
+                 "in{b} {$port, %al|%AL, $port}",
+                 []>,
+             Imp<[], [AL]>;
+def IN16ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+                 "in{w} {$port, %ax|%AX, $port}",
+                 []>,
+             Imp<[], [AX]>, OpSize;
+def IN32ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+                 "in{l} {$port, %eax|%EAX, $port}",
+                 []>,
+             Imp<[],[EAX]>;
+
+def OUT8rr  : I<0xEE, RawFrm, (ops),
+                "out{b} {%al, %dx|%DX, %AL}",
+                []>, Imp<[DX, AL], []>;
+def OUT16rr : I<0xEF, RawFrm, (ops),
+                "out{w} {%ax, %dx|%DX, %AX}",
+                []>, Imp<[DX, AX], []>, OpSize;
+def OUT32rr : I<0xEF, RawFrm, (ops),
+                "out{l} {%eax, %dx|%DX, %EAX}",
+                []>, Imp<[DX, EAX], []>;
+
+def OUT8ir  : Ii8<0xE6, RawFrm, (ops i16i8imm:$port),
+                  "out{b} {%al, $port|$port, %AL}",
+                  []>,
+              Imp<[AL], []>;
+def OUT16ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+                  "out{w} {%ax, $port|$port, %AX}",
+                  []>,
+              Imp<[AX], []>, OpSize;
+def OUT32ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+                  "out{l} {%eax, $port|$port, %EAX}",
+                  []>,
+              Imp<[EAX], []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+def MOV8rr  : I<0x88, MRMDestReg, (ops GR8 :$dst, GR8 :$src),
+                "mov{b} {$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (ops GR16:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (ops GR32:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+// Immediate moves are rematerializable: cheaper to re-emit than to spill.
+let isReMaterializable = 1 in {
+def MOV8ri  : Ii8 <0xB0, AddRegFrm, (ops GR8 :$dst, i8imm :$src),
+                   "mov{b} {$src, $dst|$dst, $src}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (ops GR16:$dst, i16imm:$src),
+                   "mov{w} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (ops GR32:$dst, i32imm:$src),
+                   "mov{l} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, imm:$src)]>;
+}
+def MOV8mi  : Ii8 <0xC6, MRM0m, (ops i8mem :$dst, i8imm :$src),
+                   "mov{b} {$src, $dst|$dst, $src}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (ops i16mem:$dst, i16imm:$src),
+                   "mov{w} {$src, $dst|$dst, $src}",
+                   [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (ops i32mem:$dst, i32imm:$src),
+                   "mov{l} {$src, $dst|$dst, $src}",
+                   [(store (i32 imm:$src), addr:$dst)]>;
+
+def MOV8rm  : I<0x8A, MRMSrcMem, (ops GR8 :$dst, i8mem :$src),
+                "mov{b} {$src, $dst|$dst, $src}",
+                [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (ops GR16:$dst, i16mem:$src),
+                "mov{w} {$src, $dst|$dst, $src}",
+                [(set GR16:$dst, (load addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (ops GR32:$dst, i32mem:$src),
+                "mov{l} {$src, $dst|$dst, $src}",
+                [(set GR32:$dst, (load addr:$src))]>;
+
+def MOV8mr  : I<0x88, MRMDestMem, (ops i8mem :$dst, GR8 :$src),
+                "mov{b} {$src, $dst|$dst, $src}",
+                [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}",
+                [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}",
+                [(store GR32:$src, addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+def MUL8r  : I<0xF6, MRM4r, (ops GR8:$src), "mul{b} $src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src))]>,
+             Imp<[AL],[AX]>;               // AL,AH = AL*GR8
+def MUL16r : I<0xF7, MRM4r, (ops GR16:$src), "mul{w} $src", []>,
+             Imp<[AX],[AX,DX]>, OpSize;    // AX,DX = AX*GR16
+def MUL32r : I<0xF7, MRM4r, (ops GR32:$src), "mul{l} $src", []>,
+             Imp<[EAX],[EAX,EDX]>;         // EAX,EDX = EAX*GR32
+def MUL8m  : I<0xF6, MRM4m, (ops i8mem :$src),
+               "mul{b} $src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src)))]>,
+             Imp<[AL],[AX]>;               // AL,AH = AL*[mem8]
+def MUL16m : I<0xF7, MRM4m, (ops i16mem:$src),
+               "mul{w} $src", []>, Imp<[AX],[AX,DX]>,
+             OpSize;                       // AX,DX = AX*[mem16]
+def MUL32m : I<0xF7, MRM4m, (ops i32mem:$src),
+               "mul{l} $src", []>, Imp<[EAX],[EAX,EDX]>;// EAX,EDX = EAX*[mem32]
+
+def IMUL8r  : I<0xF6, MRM5r, (ops GR8:$src), "imul{b} $src", []>,
+              Imp<[AL],[AX]>;              // AL,AH = AL*GR8
+def IMUL16r : I<0xF7, MRM5r, (ops GR16:$src), "imul{w} $src", []>,
+              Imp<[AX],[AX,DX]>, OpSize;   // AX,DX = AX*GR16
+def IMUL32r : I<0xF7, MRM5r, (ops GR32:$src), "imul{l} $src", []>,
+              Imp<[EAX],[EAX,EDX]>;        // EAX,EDX = EAX*GR32
+def IMUL8m  : I<0xF6, MRM5m, (ops i8mem :$src),
+                "imul{b} $src", []>, Imp<[AL],[AX]>; // AL,AH = AL*[mem8]
+def IMUL16m : I<0xF7, MRM5m, (ops i16mem:$src),
+                "imul{w} $src", []>, Imp<[AX],[AX,DX]>,
+              OpSize;                      // AX,DX = AX*[mem16]
+def IMUL32m : I<0xF7, MRM5m, (ops i32mem:$src),
+                "imul{l} $src", []>,
+              Imp<[EAX],[EAX,EDX]>;        // EAX,EDX = EAX*[mem32]
+
+// unsigned division/remainder
+def DIV8r  : I<0xF6, MRM6r, (ops GR8:$src),    // AX/r8 = AL,AH
+               "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16r : I<0xF7, MRM6r, (ops GR16:$src),   // DX:AX/r16 = AX,DX
+               "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32r : I<0xF7, MRM6r, (ops GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def DIV8m  : I<0xF6, MRM6m, (ops i8mem:$src),  // AX/[mem8] = AL,AH
+               "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16m : I<0xF7, MRM6m, (ops i16mem:$src), // DX:AX/[mem16] = AX,DX
+               "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32m : I<0xF7, MRM6m, (ops i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+               "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+// Signed division/remainder.
+def IDIV8r  : I<0xF6, MRM7r, (ops GR8:$src),    // AX/r8 = AL,AH
+               "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16r : I<0xF7, MRM7r, (ops GR16:$src),   // DX:AX/r16 = AX,DX
+               "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32r : I<0xF7, MRM7r, (ops GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def IDIV8m  : I<0xF6, MRM7m, (ops i8mem:$src),  // AX/[mem8] = AL,AH
+               "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16m : I<0xF7, MRM7m, (ops i16mem:$src), // DX:AX/[mem16] = AX,DX
+               "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32m : I<0xF7, MRM7m, (ops i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+               "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+
+//===----------------------------------------------------------------------===//
+// Two address Instructions...
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves.  Each condition gets rr/rm forms in 16- and 32-bit
+// widths; the X86_COND_* leaf in the pattern must match the opcode (0x4x).
+def CMOVB16rr : I<0x42, MRMSrcReg,       // if <u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_B))]>,
+                  TB, OpSize;
+def CMOVB16rm : I<0x42, MRMSrcMem,       // if <u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_B))]>,
+                  TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg,       // if <u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_B))]>,
+                  TB;
+def CMOVB32rm : I<0x42, MRMSrcMem,       // if <u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_B))]>,
+                  TB;
+
+def CMOVAE16rr: I<0x43, MRMSrcReg,       // if >=u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_AE))]>,
+                  TB, OpSize;
+def CMOVAE16rm: I<0x43, MRMSrcMem,       // if >=u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_AE))]>,
+                  TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg,       // if >=u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_AE))]>,
+                  TB;
+def CMOVAE32rm: I<0x43, MRMSrcMem,       // if >=u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_AE))]>,
+                  TB;
+
+def CMOVE16rr : I<0x44, MRMSrcReg,       // if ==, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_E))]>,
+                  TB, OpSize;
+def CMOVE16rm : I<0x44, MRMSrcMem,       // if ==, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_E))]>,
+                  TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg,       // if ==, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_E))]>,
+                  TB;
+def CMOVE32rm : I<0x44, MRMSrcMem,       // if ==, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_E))]>,
+                  TB;
+
+def CMOVNE16rr: I<0x45, MRMSrcReg,       // if !=, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NE))]>,
+                  TB, OpSize;
+def CMOVNE16rm: I<0x45, MRMSrcMem,       // if !=, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NE))]>,
+                  TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg,       // if !=, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NE))]>,
+                  TB;
+def CMOVNE32rm: I<0x45, MRMSrcMem,       // if !=, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NE))]>,
+                  TB;
+
+def CMOVBE16rr: I<0x46, MRMSrcReg,       // if <=u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_BE))]>,
+                  TB, OpSize;
+def CMOVBE16rm: I<0x46, MRMSrcMem,       // if <=u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_BE))]>,
+                  TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg,       // if <=u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_BE))]>,
+                  TB;
+def CMOVBE32rm: I<0x46, MRMSrcMem,       // if <=u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_BE))]>,
+                  TB;
+
+def CMOVA16rr : I<0x47, MRMSrcReg,       // if >u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_A))]>,
+                  TB, OpSize;
+def CMOVA16rm : I<0x47, MRMSrcMem,       // if >u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_A))]>,
+                  TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg,       // if >u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_A))]>,
+                  TB;
+def CMOVA32rm : I<0x47, MRMSrcMem,       // if >u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_A))]>,
+                  TB;
+
+def CMOVL16rr : I<0x4C, MRMSrcReg,       // if <s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_L))]>,
+                  TB, OpSize;
+def CMOVL16rm : I<0x4C, MRMSrcMem,       // if <s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_L))]>,
+                  TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg,       // if <s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_L))]>,
+                  TB;
+def CMOVL32rm : I<0x4C, MRMSrcMem,       // if <s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_L))]>,
+                  TB;
+
+def CMOVGE16rr: I<0x4D, MRMSrcReg,       // if >=s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_GE))]>,
+                  TB, OpSize;
+def CMOVGE16rm: I<0x4D, MRMSrcMem,       // if >=s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_GE))]>,
+                  TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg,       // if >=s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_GE))]>,
+                  TB;
+def CMOVGE32rm: I<0x4D, MRMSrcMem,       // if >=s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_GE))]>,
+                  TB;
+
+def CMOVLE16rr: I<0x4E, MRMSrcReg,       // if <=s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_LE))]>,
+                  TB, OpSize;
+def CMOVLE16rm: I<0x4E, MRMSrcMem,       // if <=s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_LE))]>,
+                  TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg,       // if <=s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_LE))]>,
+                  TB;
+def CMOVLE32rm: I<0x4E, MRMSrcMem,       // if <=s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_LE))]>,
+                  TB;
+
+def CMOVG16rr : I<0x4F, MRMSrcReg,       // if >s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_G))]>,
+                  TB, OpSize;
+def CMOVG16rm : I<0x4F, MRMSrcMem,       // if >s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_G))]>,
+                  TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg,       // if >s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_G))]>,
+                  TB;
+def CMOVG32rm : I<0x4F, MRMSrcMem,       // if >s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_G))]>,
+                  TB;
+
+def CMOVS16rr : I<0x48, MRMSrcReg,       // if signed, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_S))]>,
+                  TB, OpSize;
+def CMOVS16rm : I<0x48, MRMSrcMem,       // if signed, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_S))]>,
+                  TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg,       // if signed, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_S))]>,
+                  TB;
+def CMOVS32rm : I<0x48, MRMSrcMem,       // if signed, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_S))]>,
+                  TB;
+
+def CMOVNS16rr: I<0x49, MRMSrcReg,       // if !signed, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NS))]>,
+                  TB, OpSize;
+def CMOVNS16rm: I<0x49, MRMSrcMem,       // if !signed, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NS))]>,
+                  TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg,       // if !signed, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NS))]>,
+                  TB;
+def CMOVNS32rm: I<0x49, MRMSrcMem,       // if !signed, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NS))]>,
+                  TB;
+
+def CMOVP16rr : I<0x4A, MRMSrcReg,       // if parity, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_P))]>,
+                  TB, OpSize;
+def CMOVP16rm : I<0x4A, MRMSrcMem,       // if parity, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_P))]>,
+                  TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg,       // if parity, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_P))]>,
+                  TB;
+def CMOVP32rm : I<0x4A, MRMSrcMem,       // if parity, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_P))]>,
+                  TB;
+
+def CMOVNP16rr : I<0x4B, MRMSrcReg,      // if !parity, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NP))]>,
+                  TB, OpSize;
+def CMOVNP16rm : I<0x4B, MRMSrcMem,      // if !parity, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NP))]>,
+                  TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg,      // if !parity, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NP))]>,
+                  TB;
+def CMOVNP32rm : I<0x4B, MRMSrcMem,      // if !parity, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NP))]>,
+                  TB;
+
+
+// unary instructions
+let CodeSize = 2 in {
+def NEG8r  : I<0xF6, MRM3r, (ops GR8 :$dst, GR8 :$src), "neg{b} $dst",
+               [(set GR8:$dst, (ineg GR8:$src))]>;
+def NEG16r : I<0xF7, MRM3r, (ops GR16:$dst, GR16:$src), "neg{w} $dst",
+               [(set GR16:$dst, (ineg GR16:$src))]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (ops GR32:$dst, GR32:$src), "neg{l} $dst",
+               [(set GR32:$dst, (ineg GR32:$src))]>;
+let isTwoAddress = 0 in {
+  def NEG8m  : I<0xF6, MRM3m, (ops i8mem :$dst), "neg{b} $dst",
+                 [(store (ineg (loadi8 addr:$dst)), addr:$dst)]>;
+  def NEG16m : I<0xF7, MRM3m, (ops i16mem:$dst), "neg{w} $dst",
+                 [(store (ineg (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+  def NEG32m : I<0xF7, MRM3m, (ops i32mem:$dst), "neg{l} $dst",
+                 [(store (ineg (loadi32 addr:$dst)), addr:$dst)]>;
+
+}
+
+def NOT8r  : I<0xF6, MRM2r, (ops GR8 :$dst, GR8 :$src), "not{b} $dst",
+               [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (ops GR16:$dst, GR16:$src), "not{w} $dst",
+               [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (ops GR32:$dst, GR32:$src), "not{l} $dst",
+               [(set GR32:$dst, (not GR32:$src))]>;
+let isTwoAddress = 0 in {
+  def NOT8m  : I<0xF6, MRM2m, (ops i8mem :$dst), "not{b} $dst",
+                 [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+  def NOT16m : I<0xF7, MRM2m, (ops i16mem:$dst), "not{w} $dst",
+                 [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+  def NOT32m : I<0xF7, MRM2m, (ops i32mem:$dst), "not{l} $dst",
+                 [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (ops GR8 :$dst, GR8 :$src), "inc{b} $dst",
+               [(set GR8:$dst, (add GR8:$src, 1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
+               [(set GR16:$dst, (add GR16:$src, 1))]>,
+             OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
+               [(set GR32:$dst, (add GR32:$src, 1))]>, Requires<[In32BitMode]>;
+}
+let isTwoAddress = 0, CodeSize = 2 in {
+  def INC8m  : I<0xFE, MRM0m, (ops i8mem :$dst), "inc{b} $dst",
+                 [(store (add (loadi8 addr:$dst), 1), addr:$dst)]>;
+  def INC16m : I<0xFF, MRM0m, (ops i16mem:$dst), "inc{w} $dst",
+                 [(store (add (loadi16 addr:$dst), 1), addr:$dst)]>, OpSize;
+  def INC32m : I<0xFF, MRM0m, (ops i32mem:$dst), "inc{l} $dst",
+                 [(store (add (loadi32 addr:$dst), 1), addr:$dst)]>;
+}
+
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (ops GR8 :$dst, GR8 :$src), "dec{b} $dst",
+               [(set GR8:$dst, (add GR8:$src, -1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
+               [(set GR16:$dst, (add GR16:$src, -1))]>,
+             OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
+               [(set GR32:$dst, (add GR32:$src, -1))]>, Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 0, CodeSize = 2 in {
+  def DEC8m  : I<0xFE, MRM1m, (ops i8mem :$dst), "dec{b} $dst",
+                 [(store (add (loadi8 addr:$dst), -1), addr:$dst)]>;
+  def DEC16m : I<0xFF, MRM1m, (ops i16mem:$dst), "dec{w} $dst",
+                 [(store (add (loadi16 addr:$dst), -1), addr:$dst)]>, OpSize;
+  def DEC32m : I<0xFF, MRM1m, (ops i32mem:$dst), "dec{l} $dst",
+                 [(store (add (loadi32 addr:$dst), -1), addr:$dst)]>;
+}
+
+// Logical operators...
+// Logical AND. Asm strings use "{AT&T|Intel}" operand-order variants.
+let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y
+def AND8rr : I<0x20, MRMDestReg,
+               (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+               "and{b} {$src2, $dst|$dst, $src2}",
+               [(set GR8:$dst, (and GR8:$src1, GR8:$src2))]>;
+def AND16rr : I<0x21, MRMDestReg,
+                (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                "and{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (and GR16:$src1, GR16:$src2))]>, OpSize;
+def AND32rr : I<0x21, MRMDestReg,
+                (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                "and{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (and GR32:$src1, GR32:$src2))]>;
+}
+
+// AND: register destination, memory source (opcodes 0x22/0x23).
+def AND8rm : I<0x22, MRMSrcMem,
+               (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+               "and{b} {$src2, $dst|$dst, $src2}",
+               [(set GR8:$dst, (and GR8:$src1, (load addr:$src2)))]>;
+def AND16rm : I<0x23, MRMSrcMem,
+                (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                "and{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (and GR16:$src1, (load addr:$src2)))]>, OpSize;
+def AND32rm : I<0x23, MRMSrcMem,
+                (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                "and{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (and GR32:$src1, (load addr:$src2)))]>;
+
+// AND: immediate forms. The 0x83 (ri8) encodings take a sign-extended
+// imm8 (i16immSExt8 / i32immSExt8), saving bytes over the full 0x81 forms.
+def AND8ri : Ii8<0x80, MRM4r,
+                 (ops GR8 :$dst, GR8 :$src1, i8imm :$src2),
+                 "and{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (and GR8:$src1, imm:$src2))]>;
+def AND16ri : Ii16<0x81, MRM4r,
+                   (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                   "and{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (and GR16:$src1, imm:$src2))]>, OpSize;
+def AND32ri : Ii32<0x81, MRM4r,
+                   (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                   "and{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (and GR32:$src1, imm:$src2))]>;
+def AND16ri8 : Ii8<0x83, MRM4r,
+                   (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "and{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r,
+                   (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "and{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2))]>;
+
+// AND: memory-destination read-modify-write forms (isTwoAddress cleared).
+let isTwoAddress = 0 in {
+  def AND8mr   : I<0x20, MRMDestMem,
+                   (ops i8mem :$dst, GR8 :$src),
+                   "and{b} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def AND16mr  : I<0x21, MRMDestMem,
+                   (ops i16mem:$dst, GR16:$src),
+                   "and{w} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR16:$src), addr:$dst)]>,
+                   OpSize;
+  def AND32mr  : I<0x21, MRMDestMem,
+                   (ops i32mem:$dst, GR32:$src),
+                   "and{l} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def AND8mi   : Ii8<0x80, MRM4m,
+                     (ops i8mem :$dst, i8imm :$src),
+                     "and{b} {$src, $dst|$dst, $src}",
+                     [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def AND16mi  : Ii16<0x81, MRM4m,
+                      (ops i16mem:$dst, i16imm:$src),
+                      "and{w} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                      OpSize;
+  def AND32mi  : Ii32<0x81, MRM4m,
+                      (ops i32mem:$dst, i32imm:$src),
+                      "and{l} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def AND16mi8 : Ii8<0x83, MRM4m,
+                     (ops i16mem:$dst, i16i8imm :$src),
+                     "and{w} {$src, $dst|$dst, $src}",
+                     [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def AND32mi8 : Ii8<0x83, MRM4m,
+                     (ops i32mem:$dst, i32i8imm :$src),
+                     "and{l} {$src, $dst|$dst, $src}",
+                     [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+// Logical OR. Same form layout as AND above (rr, rm, ri, ri8, and
+// memory-destination variants), with ModRM extension /1 for the imm forms.
+let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y
+def OR8rr  : I<0x08, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+               "or{b} {$src2, $dst|$dst, $src2}",
+               [(set GR8:$dst, (or GR8:$src1, GR8:$src2))]>;
+def OR16rr : I<0x09, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+               "or{w} {$src2, $dst|$dst, $src2}",
+               [(set GR16:$dst, (or GR16:$src1, GR16:$src2))]>, OpSize;
+def OR32rr : I<0x09, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+               "or{l} {$src2, $dst|$dst, $src2}",
+               [(set GR32:$dst, (or GR32:$src1, GR32:$src2))]>;
+}
+def OR8rm  : I<0x0A, MRMSrcMem , (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+               "or{b} {$src2, $dst|$dst, $src2}",
+               [(set GR8:$dst, (or GR8:$src1, (load addr:$src2)))]>;
+def OR16rm : I<0x0B, MRMSrcMem , (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+               "or{w} {$src2, $dst|$dst, $src2}",
+               [(set GR16:$dst, (or GR16:$src1, (load addr:$src2)))]>, OpSize;
+def OR32rm : I<0x0B, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+               "or{l} {$src2, $dst|$dst, $src2}",
+               [(set GR32:$dst, (or GR32:$src1, (load addr:$src2)))]>;
+
+def OR8ri  : Ii8 <0x80, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                  "or{b} {$src2, $dst|$dst, $src2}",
+                  [(set GR8:$dst, (or GR8:$src1, imm:$src2))]>;
+def OR16ri : Ii16<0x81, MRM1r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                  "or{w} {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (or GR16:$src1, imm:$src2))]>, OpSize;
+def OR32ri : Ii32<0x81, MRM1r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                  "or{l} {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (or GR32:$src1, imm:$src2))]>;
+
+def OR16ri8 : Ii8<0x83, MRM1r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                  "or{w} {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2))]>, OpSize;
+def OR32ri8 : Ii8<0x83, MRM1r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                  "or{l} {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2))]>;
+// OR: memory-destination read-modify-write forms (isTwoAddress cleared).
+let isTwoAddress = 0 in {
+  def OR8mr   : I<0x08, MRMDestMem, (ops i8mem:$dst, GR8:$src),
+                  "or{b} {$src, $dst|$dst, $src}",
+                  [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def OR16mr  : I<0x09, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+                  "or{w} {$src, $dst|$dst, $src}",
+                  [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>, OpSize;
+  def OR32mr  : I<0x09, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                  "or{l} {$src, $dst|$dst, $src}",
+                  [(store (or (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def OR8mi   : Ii8<0x80, MRM1m, (ops i8mem :$dst, i8imm:$src),
+                    "or{b} {$src, $dst|$dst, $src}",
+                    [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def OR16mi  : Ii16<0x81, MRM1m, (ops i16mem:$dst, i16imm:$src),
+                     "or{w} {$src, $dst|$dst, $src}",
+                     [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                     OpSize;
+  def OR32mi  : Ii32<0x81, MRM1m, (ops i32mem:$dst, i32imm:$src),
+                     "or{l} {$src, $dst|$dst, $src}",
+                     [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def OR16mi8 : Ii8<0x83, MRM1m, (ops i16mem:$dst, i16i8imm:$src),
+                    "or{w} {$src, $dst|$dst, $src}",
+                    [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                    OpSize;
+  def OR32mi8 : Ii8<0x83, MRM1m, (ops i32mem:$dst, i32i8imm:$src),
+                    "or{l} {$src, $dst|$dst, $src}",
+                    [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+// Logical XOR. Same form layout; imm forms use ModRM extension /6.
+let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
+def XOR8rr : I<0x30, MRMDestReg,
+               (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+               "xor{b} {$src2, $dst|$dst, $src2}",
+               [(set GR8:$dst, (xor GR8:$src1, GR8:$src2))]>;
+def XOR16rr : I<0x31, MRMDestReg,
+                (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                "xor{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (xor GR16:$src1, GR16:$src2))]>, OpSize;
+def XOR32rr : I<0x31, MRMDestReg,
+                (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                "xor{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
+}
+
+def XOR8rm : I<0x32, MRMSrcMem ,
+               (ops GR8 :$dst, GR8:$src1, i8mem :$src2),
+               "xor{b} {$src2, $dst|$dst, $src2}",
+               [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2)))]>;
+def XOR16rm : I<0x33, MRMSrcMem ,
+                (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                "xor{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2)))]>, OpSize;
+def XOR32rm : I<0x33, MRMSrcMem ,
+                (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                "xor{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2)))]>;
+
+def XOR8ri : Ii8<0x80, MRM6r,
+                 (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+                 "xor{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (xor GR8:$src1, imm:$src2))]>;
+def XOR16ri : Ii16<0x81, MRM6r,
+                   (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                   "xor{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (xor GR16:$src1, imm:$src2))]>, OpSize;
+def XOR32ri : Ii32<0x81, MRM6r,
+                   (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                   "xor{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, imm:$src2))]>;
+def XOR16ri8 : Ii8<0x83, MRM6r,
+                   (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "xor{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r,
+                   (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "xor{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2))]>;
+// XOR: memory-destination read-modify-write forms (isTwoAddress cleared).
+let isTwoAddress = 0 in {
+  def XOR8mr   : I<0x30, MRMDestMem,
+                   (ops i8mem :$dst, GR8 :$src),
+                   "xor{b} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def XOR16mr  : I<0x31, MRMDestMem,
+                   (ops i16mem:$dst, GR16:$src),
+                   "xor{w} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR16:$src), addr:$dst)]>,
+                   OpSize;
+  def XOR32mr  : I<0x31, MRMDestMem,
+                   (ops i32mem:$dst, GR32:$src),
+                   "xor{l} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def XOR8mi   : Ii8<0x80, MRM6m,
+                     (ops i8mem :$dst, i8imm :$src),
+                     "xor{b} {$src, $dst|$dst, $src}",
+                     [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def XOR16mi  : Ii16<0x81, MRM6m,
+                      (ops i16mem:$dst, i16imm:$src),
+                      "xor{w} {$src, $dst|$dst, $src}",
+                      [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                      OpSize;
+  def XOR32mi  : Ii32<0x81, MRM6m,
+                      (ops i32mem:$dst, i32imm:$src),
+                      "xor{l} {$src, $dst|$dst, $src}",
+                      [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def XOR16mi8 : Ii8<0x83, MRM6m,
+                     (ops i16mem:$dst, i16i8imm :$src),
+                     "xor{w} {$src, $dst|$dst, $src}",
+                     [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def XOR32mi8 : Ii8<0x83, MRM6m,
+                     (ops i32mem:$dst, i32i8imm :$src),
+                     "xor{l} {$src, $dst|$dst, $src}",
+                     [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+// Shift instructions
+// The rCL forms shift by the amount in %cl; the implicit CL use is
+// declared via Imp<[CL],[]>.
+def SHL8rCL : I<0xD2, MRM4r, (ops GR8 :$dst, GR8 :$src),
+                "shl{b} {%cl, $dst|$dst, %CL}",
+                [(set GR8:$dst, (shl GR8:$src, CL))]>, Imp<[CL],[]>;
+def SHL16rCL : I<0xD3, MRM4r, (ops GR16:$dst, GR16:$src),
+                 "shl{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (shl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (ops GR32:$dst, GR32:$src),
+                 "shl{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (shl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SHL8ri : Ii8<0xC0, MRM4r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                 "shl{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                  "shl{w} {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri : Ii8<0xC1, MRM4r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                  "shl{l} {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+}
+
+// Shift left by one. Not used because (add x, x) is slightly cheaper.
+def SHL8r1 : I<0xD0, MRM4r, (ops GR8 :$dst, GR8 :$src1), + "shl{b} $dst", []>; +def SHL16r1 : I<0xD1, MRM4r, (ops GR16:$dst, GR16:$src1), + "shl{w} $dst", []>, OpSize; +def SHL32r1 : I<0xD1, MRM4r, (ops GR32:$dst, GR32:$src1), + "shl{l} $dst", []>; + +let isTwoAddress = 0 in { + def SHL8mCL : I<0xD2, MRM4m, (ops i8mem :$dst), + "shl{b} {%cl, $dst|$dst, %CL}", + [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def SHL16mCL : I<0xD3, MRM4m, (ops i16mem:$dst), + "shl{w} {%cl, $dst|$dst, %CL}", + [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>, OpSize; + def SHL32mCL : I<0xD3, MRM4m, (ops i32mem:$dst), + "shl{l} {%cl, $dst|$dst, %CL}", + [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def SHL8mi : Ii8<0xC0, MRM4m, (ops i8mem :$dst, i8imm:$src), + "shl{b} {$src, $dst|$dst, $src}", + [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + def SHL16mi : Ii8<0xC1, MRM4m, (ops i16mem:$dst, i8imm:$src), + "shl{w} {$src, $dst|$dst, $src}", + [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; + def SHL32mi : Ii8<0xC1, MRM4m, (ops i32mem:$dst, i8imm:$src), + "shl{l} {$src, $dst|$dst, $src}", + [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + + // Shift by 1 + def SHL8m1 : I<0xD0, MRM4m, (ops i8mem :$dst), + "shl{b} $dst", + [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + def SHL16m1 : I<0xD1, MRM4m, (ops i16mem:$dst), + "shl{w} $dst", + [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; + def SHL32m1 : I<0xD1, MRM4m, (ops i32mem:$dst), + "shl{l} $dst", + [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +} + +def SHR8rCL : I<0xD2, MRM5r, (ops GR8 :$dst, GR8 :$src), + "shr{b} {%cl, $dst|$dst, %CL}", + [(set GR8:$dst, (srl GR8:$src, CL))]>, Imp<[CL],[]>; +def SHR16rCL : I<0xD3, MRM5r, (ops GR16:$dst, GR16:$src), + "shr{w} {%cl, $dst|$dst, %CL}", + [(set GR16:$dst, (srl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize; +def SHR32rCL : I<0xD3, 
MRM5r, (ops GR32:$dst, GR32:$src), + "shr{l} {%cl, $dst|$dst, %CL}", + [(set GR32:$dst, (srl GR32:$src, CL))]>, Imp<[CL],[]>; + +def SHR8ri : Ii8<0xC0, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2), + "shr{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>; +def SHR16ri : Ii8<0xC1, MRM5r, (ops GR16:$dst, GR16:$src1, i8imm:$src2), + "shr{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize; +def SHR32ri : Ii8<0xC1, MRM5r, (ops GR32:$dst, GR32:$src1, i8imm:$src2), + "shr{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>; + +// Shift by 1 +def SHR8r1 : I<0xD0, MRM5r, (ops GR8:$dst, GR8:$src1), + "shr{b} $dst", + [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>; +def SHR16r1 : I<0xD1, MRM5r, (ops GR16:$dst, GR16:$src1), + "shr{w} $dst", + [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize; +def SHR32r1 : I<0xD1, MRM5r, (ops GR32:$dst, GR32:$src1), + "shr{l} $dst", + [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>; + +let isTwoAddress = 0 in { + def SHR8mCL : I<0xD2, MRM5m, (ops i8mem :$dst), + "shr{b} {%cl, $dst|$dst, %CL}", + [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def SHR16mCL : I<0xD3, MRM5m, (ops i16mem:$dst), + "shr{w} {%cl, $dst|$dst, %CL}", + [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>, OpSize; + def SHR32mCL : I<0xD3, MRM5m, (ops i32mem:$dst), + "shr{l} {%cl, $dst|$dst, %CL}", + [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def SHR8mi : Ii8<0xC0, MRM5m, (ops i8mem :$dst, i8imm:$src), + "shr{b} {$src, $dst|$dst, $src}", + [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + def SHR16mi : Ii8<0xC1, MRM5m, (ops i16mem:$dst, i8imm:$src), + "shr{w} {$src, $dst|$dst, $src}", + [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; + def SHR32mi : Ii8<0xC1, MRM5m, (ops i32mem:$dst, i8imm:$src), + "shr{l} {$src, $dst|$dst, $src}", + [(store (srl (loadi32 
addr:$dst), (i8 imm:$src)), addr:$dst)]>; + + // Shift by 1 + def SHR8m1 : I<0xD0, MRM5m, (ops i8mem :$dst), + "shr{b} $dst", + [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + def SHR16m1 : I<0xD1, MRM5m, (ops i16mem:$dst), + "shr{w} $dst", + [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize; + def SHR32m1 : I<0xD1, MRM5m, (ops i32mem:$dst), + "shr{l} $dst", + [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +} + +def SAR8rCL : I<0xD2, MRM7r, (ops GR8 :$dst, GR8 :$src), + "sar{b} {%cl, $dst|$dst, %CL}", + [(set GR8:$dst, (sra GR8:$src, CL))]>, Imp<[CL],[]>; +def SAR16rCL : I<0xD3, MRM7r, (ops GR16:$dst, GR16:$src), + "sar{w} {%cl, $dst|$dst, %CL}", + [(set GR16:$dst, (sra GR16:$src, CL))]>, Imp<[CL],[]>, OpSize; +def SAR32rCL : I<0xD3, MRM7r, (ops GR32:$dst, GR32:$src), + "sar{l} {%cl, $dst|$dst, %CL}", + [(set GR32:$dst, (sra GR32:$src, CL))]>, Imp<[CL],[]>; + +def SAR8ri : Ii8<0xC0, MRM7r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2), + "sar{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>; +def SAR16ri : Ii8<0xC1, MRM7r, (ops GR16:$dst, GR16:$src1, i8imm:$src2), + "sar{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>, + OpSize; +def SAR32ri : Ii8<0xC1, MRM7r, (ops GR32:$dst, GR32:$src1, i8imm:$src2), + "sar{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>; + +// Shift by 1 +def SAR8r1 : I<0xD0, MRM7r, (ops GR8 :$dst, GR8 :$src1), + "sar{b} $dst", + [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>; +def SAR16r1 : I<0xD1, MRM7r, (ops GR16:$dst, GR16:$src1), + "sar{w} $dst", + [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize; +def SAR32r1 : I<0xD1, MRM7r, (ops GR32:$dst, GR32:$src1), + "sar{l} $dst", + [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>; + +let isTwoAddress = 0 in { + def SAR8mCL : I<0xD2, MRM7m, (ops i8mem :$dst), + "sar{b} {%cl, $dst|$dst, %CL}", + [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def 
SAR16mCL : I<0xD3, MRM7m, (ops i16mem:$dst), + "sar{w} {%cl, $dst|$dst, %CL}", + [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>, OpSize; + def SAR32mCL : I<0xD3, MRM7m, (ops i32mem:$dst), + "sar{l} {%cl, $dst|$dst, %CL}", + [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def SAR8mi : Ii8<0xC0, MRM7m, (ops i8mem :$dst, i8imm:$src), + "sar{b} {$src, $dst|$dst, $src}", + [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + def SAR16mi : Ii8<0xC1, MRM7m, (ops i16mem:$dst, i8imm:$src), + "sar{w} {$src, $dst|$dst, $src}", + [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; + def SAR32mi : Ii8<0xC1, MRM7m, (ops i32mem:$dst, i8imm:$src), + "sar{l} {$src, $dst|$dst, $src}", + [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + + // Shift by 1 + def SAR8m1 : I<0xD0, MRM7m, (ops i8mem :$dst), + "sar{b} $dst", + [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + def SAR16m1 : I<0xD1, MRM7m, (ops i16mem:$dst), + "sar{w} $dst", + [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; + def SAR32m1 : I<0xD1, MRM7m, (ops i32mem:$dst), + "sar{l} $dst", + [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +} + +// Rotate instructions +// FIXME: provide shorter instructions when imm8 == 1 +def ROL8rCL : I<0xD2, MRM0r, (ops GR8 :$dst, GR8 :$src), + "rol{b} {%cl, $dst|$dst, %CL}", + [(set GR8:$dst, (rotl GR8:$src, CL))]>, Imp<[CL],[]>; +def ROL16rCL : I<0xD3, MRM0r, (ops GR16:$dst, GR16:$src), + "rol{w} {%cl, $dst|$dst, %CL}", + [(set GR16:$dst, (rotl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize; +def ROL32rCL : I<0xD3, MRM0r, (ops GR32:$dst, GR32:$src), + "rol{l} {%cl, $dst|$dst, %CL}", + [(set GR32:$dst, (rotl GR32:$src, CL))]>, Imp<[CL],[]>; + +def ROL8ri : Ii8<0xC0, MRM0r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2), + "rol{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>; +def ROL16ri : Ii8<0xC1, MRM0r, (ops GR16:$dst, GR16:$src1, 
i8imm:$src2), + "rol{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize; +def ROL32ri : Ii8<0xC1, MRM0r, (ops GR32:$dst, GR32:$src1, i8imm:$src2), + "rol{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>; + +// Rotate by 1 +def ROL8r1 : I<0xD0, MRM0r, (ops GR8 :$dst, GR8 :$src1), + "rol{b} $dst", + [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>; +def ROL16r1 : I<0xD1, MRM0r, (ops GR16:$dst, GR16:$src1), + "rol{w} $dst", + [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize; +def ROL32r1 : I<0xD1, MRM0r, (ops GR32:$dst, GR32:$src1), + "rol{l} $dst", + [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>; + +let isTwoAddress = 0 in { + def ROL8mCL : I<0xD2, MRM0m, (ops i8mem :$dst), + "rol{b} {%cl, $dst|$dst, %CL}", + [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def ROL16mCL : I<0xD3, MRM0m, (ops i16mem:$dst), + "rol{w} {%cl, $dst|$dst, %CL}", + [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>, OpSize; + def ROL32mCL : I<0xD3, MRM0m, (ops i32mem:$dst), + "rol{l} {%cl, $dst|$dst, %CL}", + [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def ROL8mi : Ii8<0xC0, MRM0m, (ops i8mem :$dst, i8imm:$src), + "rol{b} {$src, $dst|$dst, $src}", + [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + def ROL16mi : Ii8<0xC1, MRM0m, (ops i16mem:$dst, i8imm:$src), + "rol{w} {$src, $dst|$dst, $src}", + [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; + def ROL32mi : Ii8<0xC1, MRM0m, (ops i32mem:$dst, i8imm:$src), + "rol{l} {$src, $dst|$dst, $src}", + [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + + // Rotate by 1 + def ROL8m1 : I<0xD0, MRM0m, (ops i8mem :$dst), + "rol{b} $dst", + [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + def ROL16m1 : I<0xD1, MRM0m, (ops i16mem:$dst), + "rol{w} $dst", + [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; + def ROL32m1 : 
I<0xD1, MRM0m, (ops i32mem:$dst), + "rol{l} $dst", + [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +} + +def ROR8rCL : I<0xD2, MRM1r, (ops GR8 :$dst, GR8 :$src), + "ror{b} {%cl, $dst|$dst, %CL}", + [(set GR8:$dst, (rotr GR8:$src, CL))]>, Imp<[CL],[]>; +def ROR16rCL : I<0xD3, MRM1r, (ops GR16:$dst, GR16:$src), + "ror{w} {%cl, $dst|$dst, %CL}", + [(set GR16:$dst, (rotr GR16:$src, CL))]>, Imp<[CL],[]>, OpSize; +def ROR32rCL : I<0xD3, MRM1r, (ops GR32:$dst, GR32:$src), + "ror{l} {%cl, $dst|$dst, %CL}", + [(set GR32:$dst, (rotr GR32:$src, CL))]>, Imp<[CL],[]>; + +def ROR8ri : Ii8<0xC0, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2), + "ror{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>; +def ROR16ri : Ii8<0xC1, MRM1r, (ops GR16:$dst, GR16:$src1, i8imm:$src2), + "ror{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, OpSize; +def ROR32ri : Ii8<0xC1, MRM1r, (ops GR32:$dst, GR32:$src1, i8imm:$src2), + "ror{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>; + +// Rotate by 1 +def ROR8r1 : I<0xD0, MRM1r, (ops GR8 :$dst, GR8 :$src1), + "ror{b} $dst", + [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>; +def ROR16r1 : I<0xD1, MRM1r, (ops GR16:$dst, GR16:$src1), + "ror{w} $dst", + [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize; +def ROR32r1 : I<0xD1, MRM1r, (ops GR32:$dst, GR32:$src1), + "ror{l} $dst", + [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>; + +let isTwoAddress = 0 in { + def ROR8mCL : I<0xD2, MRM1m, (ops i8mem :$dst), + "ror{b} {%cl, $dst|$dst, %CL}", + [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def ROR16mCL : I<0xD3, MRM1m, (ops i16mem:$dst), + "ror{w} {%cl, $dst|$dst, %CL}", + [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>, OpSize; + def ROR32mCL : I<0xD3, MRM1m, (ops i32mem:$dst), + "ror{l} {%cl, $dst|$dst, %CL}", + [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; + def 
ROR8mi : Ii8<0xC0, MRM1m, (ops i8mem :$dst, i8imm:$src), + "ror{b} {$src, $dst|$dst, $src}", + [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + def ROR16mi : Ii8<0xC1, MRM1m, (ops i16mem:$dst, i8imm:$src), + "ror{w} {$src, $dst|$dst, $src}", + [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; + def ROR32mi : Ii8<0xC1, MRM1m, (ops i32mem:$dst, i8imm:$src), + "ror{l} {$src, $dst|$dst, $src}", + [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + + // Rotate by 1 + def ROR8m1 : I<0xD0, MRM1m, (ops i8mem :$dst), + "ror{b} $dst", + [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; + def ROR16m1 : I<0xD1, MRM1m, (ops i16mem:$dst), + "ror{w} $dst", + [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; + def ROR32m1 : I<0xD1, MRM1m, (ops i32mem:$dst), + "ror{l} $dst", + [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +} + + + +// Double shift instructions (generalizations of rotate) +def SHLD32rrCL : I<0xA5, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2), + "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, + Imp<[CL],[]>, TB; +def SHRD32rrCL : I<0xAD, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2), + "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, + Imp<[CL],[]>, TB; +def SHLD16rrCL : I<0xA5, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2), + "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>, + Imp<[CL],[]>, TB, OpSize; +def SHRD16rrCL : I<0xAD, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2), + "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>, + Imp<[CL],[]>, TB, OpSize; + +let isCommutable = 1 in { // These instructions commute to each other. 
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg, + (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3), + "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, + (i8 imm:$src3)))]>, + TB; +def SHRD32rri8 : Ii8<0xAC, MRMDestReg, + (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3), + "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, + (i8 imm:$src3)))]>, + TB; +def SHLD16rri8 : Ii8<0xA4, MRMDestReg, + (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3), + "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, + (i8 imm:$src3)))]>, + TB, OpSize; +def SHRD16rri8 : Ii8<0xAC, MRMDestReg, + (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3), + "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, + (i8 imm:$src3)))]>, + TB, OpSize; +} + +let isTwoAddress = 0 in { + def SHLD32mrCL : I<0xA5, MRMDestMem, (ops i32mem:$dst, GR32:$src2), + "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)]>, + Imp<[CL],[]>, TB; + def SHRD32mrCL : I<0xAD, MRMDestMem, (ops i32mem:$dst, GR32:$src2), + "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)]>, + Imp<[CL],[]>, TB; + def SHLD32mri8 : Ii8<0xA4, MRMDestMem, + (ops i32mem:$dst, GR32:$src2, i8imm:$src3), + "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; + def SHRD32mri8 : Ii8<0xAC, MRMDestMem, + (ops i32mem:$dst, GR32:$src2, i8imm:$src3), + "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; + + def SHLD16mrCL : I<0xA5, MRMDestMem, (ops i16mem:$dst, GR16:$src2), + "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(store (X86shld (loadi16 
addr:$dst), GR16:$src2, CL), + addr:$dst)]>, + Imp<[CL],[]>, TB, OpSize; + def SHRD16mrCL : I<0xAD, MRMDestMem, (ops i16mem:$dst, GR16:$src2), + "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)]>, + Imp<[CL],[]>, TB, OpSize; + def SHLD16mri8 : Ii8<0xA4, MRMDestMem, + (ops i16mem:$dst, GR16:$src2, i8imm:$src3), + "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize; + def SHRD16mri8 : Ii8<0xAC, MRMDestMem, + (ops i16mem:$dst, GR16:$src2, i8imm:$src3), + "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize; +} + + +// Arithmetic. +let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y +def ADD8rr : I<0x00, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2), + "add{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (add GR8:$src1, GR8:$src2))]>; +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
+def ADD16rr : I<0x01, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2), + "add{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (add GR16:$src1, GR16:$src2))]>, OpSize; +def ADD32rr : I<0x01, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2), + "add{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (add GR32:$src1, GR32:$src2))]>; +} // end isConvertibleToThreeAddress +} // end isCommutable +def ADD8rm : I<0x02, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2), + "add{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (add GR8:$src1, (load addr:$src2)))]>; +def ADD16rm : I<0x03, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2), + "add{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (add GR16:$src1, (load addr:$src2)))]>, OpSize; +def ADD32rm : I<0x03, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2), + "add{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (add GR32:$src1, (load addr:$src2)))]>; + +def ADD8ri : Ii8<0x80, MRM0r, (ops GR8:$dst, GR8:$src1, i8imm:$src2), + "add{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (add GR8:$src1, imm:$src2))]>; + +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
+def ADD16ri : Ii16<0x81, MRM0r, (ops GR16:$dst, GR16:$src1, i16imm:$src2), + "add{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (add GR16:$src1, imm:$src2))]>, OpSize; +def ADD32ri : Ii32<0x81, MRM0r, (ops GR32:$dst, GR32:$src1, i32imm:$src2), + "add{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (add GR32:$src1, imm:$src2))]>; +def ADD16ri8 : Ii8<0x83, MRM0r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2), + "add{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2))]>, + OpSize; +def ADD32ri8 : Ii8<0x83, MRM0r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2), + "add{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2))]>; +} + +let isTwoAddress = 0 in { + def ADD8mr : I<0x00, MRMDestMem, (ops i8mem :$dst, GR8 :$src2), + "add{b} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), GR8:$src2), addr:$dst)]>; + def ADD16mr : I<0x01, MRMDestMem, (ops i16mem:$dst, GR16:$src2), + "add{w} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), GR16:$src2), addr:$dst)]>, + OpSize; + def ADD32mr : I<0x01, MRMDestMem, (ops i32mem:$dst, GR32:$src2), + "add{l} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), GR32:$src2), addr:$dst)]>; + def ADD8mi : Ii8<0x80, MRM0m, (ops i8mem :$dst, i8imm :$src2), + "add{b} {$src2, $dst|$dst, $src2}", + [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst)]>; + def ADD16mi : Ii16<0x81, MRM0m, (ops i16mem:$dst, i16imm:$src2), + "add{w} {$src2, $dst|$dst, $src2}", + [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst)]>, + OpSize; + def ADD32mi : Ii32<0x81, MRM0m, (ops i32mem:$dst, i32imm:$src2), + "add{l} {$src2, $dst|$dst, $src2}", + [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst)]>; + def ADD16mi8 : Ii8<0x83, MRM0m, (ops i16mem:$dst, i16i8imm :$src2), + "add{w} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>, + OpSize; + def ADD32mi8 : Ii8<0x83, MRM0m, (ops i32mem:$dst, i32i8imm :$src2), + 
"add{l} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>; +} + +let isCommutable = 1 in { // X = ADC Y, Z --> X = ADC Z, Y +def ADC32rr : I<0x11, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2), + "adc{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>; +} +def ADC32rm : I<0x13, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2), + "adc{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>; +def ADC32ri : Ii32<0x81, MRM2r, (ops GR32:$dst, GR32:$src1, i32imm:$src2), + "adc{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>; +def ADC32ri8 : Ii8<0x83, MRM2r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2), + "adc{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>; + +let isTwoAddress = 0 in { + def ADC32mr : I<0x11, MRMDestMem, (ops i32mem:$dst, GR32:$src2), + "adc{l} {$src2, $dst|$dst, $src2}", + [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>; + def ADC32mi : Ii32<0x81, MRM2m, (ops i32mem:$dst, i32imm:$src2), + "adc{l} {$src2, $dst|$dst, $src2}", + [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>; + def ADC32mi8 : Ii8<0x83, MRM2m, (ops i32mem:$dst, i32i8imm :$src2), + "adc{l} {$src2, $dst|$dst, $src2}", + [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>; +} + +def SUB8rr : I<0x28, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2), + "sub{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sub GR8:$src1, GR8:$src2))]>; +def SUB16rr : I<0x29, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2), + "sub{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sub GR16:$src1, GR16:$src2))]>, OpSize; +def SUB32rr : I<0x29, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2), + "sub{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sub GR32:$src1, GR32:$src2))]>; +def SUB8rm : I<0x2A, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2), + "sub{b} {$src2, 
$dst|$dst, $src2}", + [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2)))]>; +def SUB16rm : I<0x2B, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2), + "sub{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2)))]>, OpSize; +def SUB32rm : I<0x2B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2), + "sub{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2)))]>; + +def SUB8ri : Ii8 <0x80, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2), + "sub{b} {$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sub GR8:$src1, imm:$src2))]>; +def SUB16ri : Ii16<0x81, MRM5r, (ops GR16:$dst, GR16:$src1, i16imm:$src2), + "sub{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sub GR16:$src1, imm:$src2))]>, OpSize; +def SUB32ri : Ii32<0x81, MRM5r, (ops GR32:$dst, GR32:$src1, i32imm:$src2), + "sub{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sub GR32:$src1, imm:$src2))]>; +def SUB16ri8 : Ii8<0x83, MRM5r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2), + "sub{w} {$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2))]>, + OpSize; +def SUB32ri8 : Ii8<0x83, MRM5r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2), + "sub{l} {$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2))]>; +let isTwoAddress = 0 in { + def SUB8mr : I<0x28, MRMDestMem, (ops i8mem :$dst, GR8 :$src2), + "sub{b} {$src2, $dst|$dst, $src2}", + [(store (sub (load addr:$dst), GR8:$src2), addr:$dst)]>; + def SUB16mr : I<0x29, MRMDestMem, (ops i16mem:$dst, GR16:$src2), + "sub{w} {$src2, $dst|$dst, $src2}", + [(store (sub (load addr:$dst), GR16:$src2), addr:$dst)]>, + OpSize; + def SUB32mr : I<0x29, MRMDestMem, (ops i32mem:$dst, GR32:$src2), + "sub{l} {$src2, $dst|$dst, $src2}", + [(store (sub (load addr:$dst), GR32:$src2), addr:$dst)]>; + def SUB8mi : Ii8<0x80, MRM5m, (ops i8mem :$dst, i8imm:$src2), + "sub{b} {$src2, $dst|$dst, $src2}", + [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst)]>; + def SUB16mi 
: Ii16<0x81, MRM5m, (ops i16mem:$dst, i16imm:$src2),
+                      "sub{w} {$src2, $dst|$dst, $src2}",
+                      [(store (sub (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+                      OpSize;
+  def SUB32mi  : Ii32<0x81, MRM5m, (ops i32mem:$dst, i32imm:$src2),
+                      "sub{l} {$src2, $dst|$dst, $src2}",
+                      [(store (sub (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SUB16mi8 : Ii8<0x83, MRM5m, (ops i16mem:$dst, i16i8imm :$src2),
+                     "sub{w} {$src2, $dst|$dst, $src2}",
+                     [(store (sub (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+                     OpSize;
+  def SUB32mi8 : Ii8<0x83, MRM5m, (ops i32mem:$dst, i32i8imm :$src2),
+                     "sub{l} {$src2, $dst|$dst, $src2}",
+                     [(store (sub (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+// Subtract-with-borrow. Only the 32-bit register forms are pattern-matched
+// via the 'sube' node; memory forms follow below under isTwoAddress = 0.
+def SBB32rr    : I<0x19, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                   "sbb{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+let isTwoAddress = 0 in {
+  def SBB32mr  : I<0x19, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                   "sbb{l} {$src2, $dst|$dst, $src2}",
+                   [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  // FIX: 0x80 /3 takes an 8-bit immediate, so this must use the Ii8
+  // template (was Ii32, which would emit four immediate bytes and
+  // mis-encode the instruction; compare SUB8mi/ADD8mi above, which use
+  // Ii8 with the same 0x80 opcode).
+  def SBB8mi   : Ii8<0x80, MRM3m, (ops i8mem:$dst, i8imm:$src2),
+                     "sbb{b} {$src2, $dst|$dst, $src2}",
+                     [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB32mi  : Ii32<0x81, MRM3m, (ops i32mem:$dst, i32imm:$src2),
+                      "sbb{l} {$src2, $dst|$dst, $src2}",
+                      [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB32mi8 : Ii8<0x83, MRM3m, (ops i32mem:$dst, i32i8imm :$src2),
+                     "sbb{l} {$src2, $dst|$dst, $src2}",
+                     [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+def SBB32rm  : I<0x1B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "sbb{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB32ri  : Ii32<0x81, MRM3r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "sbb{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "sbb{l} 
{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+
+let isCommutable = 1 in {  // X = IMUL Y, Z --> X = IMUL Z, Y
+def IMUL16rr : I<0xAF, MRMSrcReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "imul{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, GR16:$src2))]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "imul{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, GR32:$src2))]>, TB;
+}
+def IMUL16rm : I<0xAF, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "imul{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2)))]>,
+                 TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "imul{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2)))]>, TB;
+
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two address instructions!
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                      "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, (mul GR16:$src1, imm:$src2))]>, OpSize;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                      "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, (mul GR32:$src1, imm:$src2))]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                     "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2))]>,
+                     OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                     "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2))]>;
+
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                     // GR16 = [mem16]*I16
+                      (ops GR16:$dst, i16mem:$src1, i16imm:$src2),
+                      "imul{w} {$src2, $src1, $dst|$dst, $src1, 
$src2}", + [(set GR16:$dst, (mul (load addr:$src1), imm:$src2))]>, + OpSize; +def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 + (ops GR32:$dst, i32mem:$src1, i32imm:$src2), + "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (mul (load addr:$src1), imm:$src2))]>; +def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 + (ops GR16:$dst, i16mem:$src1, i16i8imm :$src2), + "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, (mul (load addr:$src1), i16immSExt8:$src2))]>, + OpSize; +def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 + (ops GR32:$dst, i32mem:$src1, i32i8imm: $src2), + "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (mul (load addr:$src1), i32immSExt8:$src2))]>; + +//===----------------------------------------------------------------------===// +// Test instructions are just like AND, except they don't generate a result. +// +let isCommutable = 1 in { // TEST X, Y --> TEST Y, X +def TEST8rr : I<0x84, MRMDestReg, (ops GR8:$src1, GR8:$src2), + "test{b} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR8:$src1, GR8:$src2), 0)]>; +def TEST16rr : I<0x85, MRMDestReg, (ops GR16:$src1, GR16:$src2), + "test{w} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR16:$src1, GR16:$src2), 0)]>, OpSize; +def TEST32rr : I<0x85, MRMDestReg, (ops GR32:$src1, GR32:$src2), + "test{l} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR32:$src1, GR32:$src2), 0)]>; +} + +def TEST8rm : I<0x84, MRMSrcMem, (ops GR8 :$src1, i8mem :$src2), + "test{b} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0)]>; +def TEST16rm : I<0x85, MRMSrcMem, (ops GR16:$src1, i16mem:$src2), + "test{w} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR16:$src1, (loadi16 addr:$src2)), 0)]>, + OpSize; +def TEST32rm : I<0x85, MRMSrcMem, (ops GR32:$src1, i32mem:$src2), + "test{l} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR32:$src1, (loadi32 addr:$src2)), 0)]>; + +def TEST8ri : Ii8 <0xF6, MRM0r, // flags 
= GR8 & imm8 + (ops GR8:$src1, i8imm:$src2), + "test{b} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR8:$src1, imm:$src2), 0)]>; +def TEST16ri : Ii16<0xF7, MRM0r, // flags = GR16 & imm16 + (ops GR16:$src1, i16imm:$src2), + "test{w} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR16:$src1, imm:$src2), 0)]>, OpSize; +def TEST32ri : Ii32<0xF7, MRM0r, // flags = GR32 & imm32 + (ops GR32:$src1, i32imm:$src2), + "test{l} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR32:$src1, imm:$src2), 0)]>; + +def TEST8mi : Ii8 <0xF6, MRM0m, // flags = [mem8] & imm8 + (ops i8mem:$src1, i8imm:$src2), + "test{b} {$src2, $src1|$src1, $src2}", + [(X86cmp (and (loadi8 addr:$src1), imm:$src2), 0)]>; +def TEST16mi : Ii16<0xF7, MRM0m, // flags = [mem16] & imm16 + (ops i16mem:$src1, i16imm:$src2), + "test{w} {$src2, $src1|$src1, $src2}", + [(X86cmp (and (loadi16 addr:$src1), imm:$src2), 0)]>, + OpSize; +def TEST32mi : Ii32<0xF7, MRM0m, // flags = [mem32] & imm32 + (ops i32mem:$src1, i32imm:$src2), + "test{l} {$src2, $src1|$src1, $src2}", + [(X86cmp (and (loadi32 addr:$src1), imm:$src2), 0)]>; + + +// Condition code ops, incl. set if equal/not equal/... 
+def SAHF : I<0x9E, RawFrm, (ops), "sahf", []>, Imp<[AH],[]>; // flags = AH +def LAHF : I<0x9F, RawFrm, (ops), "lahf", []>, Imp<[],[AH]>; // AH = flags + +def SETEr : I<0x94, MRM0r, + (ops GR8 :$dst), + "sete $dst", + [(set GR8:$dst, (X86setcc X86_COND_E))]>, + TB; // GR8 = == +def SETEm : I<0x94, MRM0m, + (ops i8mem:$dst), + "sete $dst", + [(store (X86setcc X86_COND_E), addr:$dst)]>, + TB; // [mem8] = == +def SETNEr : I<0x95, MRM0r, + (ops GR8 :$dst), + "setne $dst", + [(set GR8:$dst, (X86setcc X86_COND_NE))]>, + TB; // GR8 = != +def SETNEm : I<0x95, MRM0m, + (ops i8mem:$dst), + "setne $dst", + [(store (X86setcc X86_COND_NE), addr:$dst)]>, + TB; // [mem8] = != +def SETLr : I<0x9C, MRM0r, + (ops GR8 :$dst), + "setl $dst", + [(set GR8:$dst, (X86setcc X86_COND_L))]>, + TB; // GR8 = < signed +def SETLm : I<0x9C, MRM0m, + (ops i8mem:$dst), + "setl $dst", + [(store (X86setcc X86_COND_L), addr:$dst)]>, + TB; // [mem8] = < signed +def SETGEr : I<0x9D, MRM0r, + (ops GR8 :$dst), + "setge $dst", + [(set GR8:$dst, (X86setcc X86_COND_GE))]>, + TB; // GR8 = >= signed +def SETGEm : I<0x9D, MRM0m, + (ops i8mem:$dst), + "setge $dst", + [(store (X86setcc X86_COND_GE), addr:$dst)]>, + TB; // [mem8] = >= signed +def SETLEr : I<0x9E, MRM0r, + (ops GR8 :$dst), + "setle $dst", + [(set GR8:$dst, (X86setcc X86_COND_LE))]>, + TB; // GR8 = <= signed +def SETLEm : I<0x9E, MRM0m, + (ops i8mem:$dst), + "setle $dst", + [(store (X86setcc X86_COND_LE), addr:$dst)]>, + TB; // [mem8] = <= signed +def SETGr : I<0x9F, MRM0r, + (ops GR8 :$dst), + "setg $dst", + [(set GR8:$dst, (X86setcc X86_COND_G))]>, + TB; // GR8 = > signed +def SETGm : I<0x9F, MRM0m, + (ops i8mem:$dst), + "setg $dst", + [(store (X86setcc X86_COND_G), addr:$dst)]>, + TB; // [mem8] = > signed + +def SETBr : I<0x92, MRM0r, + (ops GR8 :$dst), + "setb $dst", + [(set GR8:$dst, (X86setcc X86_COND_B))]>, + TB; // GR8 = < unsign +def SETBm : I<0x92, MRM0m, + (ops i8mem:$dst), + "setb $dst", + [(store (X86setcc X86_COND_B), addr:$dst)]>, + 
TB; // [mem8] = < unsign
+def SETAEr  : I<0x93, MRM0r,
+               (ops GR8   :$dst),
+               "setae $dst",
+               [(set GR8:$dst, (X86setcc X86_COND_AE))]>,
+               TB; // GR8 = >= unsign
+def SETAEm  : I<0x93, MRM0m,
+               (ops i8mem:$dst),
+               "setae $dst",
+               [(store (X86setcc X86_COND_AE), addr:$dst)]>,
+               TB; // [mem8] = >= unsign
+def SETBEr  : I<0x96, MRM0r,
+               (ops GR8   :$dst),
+               "setbe $dst",
+               [(set GR8:$dst, (X86setcc X86_COND_BE))]>,
+               TB; // GR8 = <= unsign
+def SETBEm  : I<0x96, MRM0m,
+               (ops i8mem:$dst),
+               "setbe $dst",
+               [(store (X86setcc X86_COND_BE), addr:$dst)]>,
+               TB; // [mem8] = <= unsign
+def SETAr   : I<0x97, MRM0r,
+               (ops GR8   :$dst),
+               "seta $dst",
+               [(set GR8:$dst, (X86setcc X86_COND_A))]>,
+               TB; // GR8 = > unsign (seta = "above": the unsigned >)
+def SETAm   : I<0x97, MRM0m,
+               (ops i8mem:$dst),
+               "seta $dst",
+               [(store (X86setcc X86_COND_A), addr:$dst)]>,
+               TB; // [mem8] = > unsign
+
+def SETSr   : I<0x98, MRM0r,
+               (ops GR8   :$dst),
+               "sets $dst",
+               [(set GR8:$dst, (X86setcc X86_COND_S))]>,
+               TB; // GR8 = <sign bit>
+def SETSm   : I<0x98, MRM0m,
+               (ops i8mem:$dst),
+               "sets $dst",
+               [(store (X86setcc X86_COND_S), addr:$dst)]>,
+               TB; // [mem8] = <sign bit>
+def SETNSr  : I<0x99, MRM0r,
+               (ops GR8   :$dst),
+               "setns $dst",
+               [(set GR8:$dst, (X86setcc X86_COND_NS))]>,
+               TB; // GR8 = !<sign bit>
+def SETNSm  : I<0x99, MRM0m,
+               (ops i8mem:$dst),
+               "setns $dst",
+               [(store (X86setcc X86_COND_NS), addr:$dst)]>,
+               TB; // [mem8] = !<sign bit>
+def SETPr   : I<0x9A, MRM0r,
+               (ops GR8   :$dst),
+               "setp $dst",
+               [(set GR8:$dst, (X86setcc X86_COND_P))]>,
+               TB; // GR8 = parity
+def SETPm   : I<0x9A, MRM0m,
+               (ops i8mem:$dst),
+               "setp $dst",
+               [(store (X86setcc X86_COND_P), addr:$dst)]>,
+               TB; // [mem8] = parity
+def SETNPr  : I<0x9B, MRM0r,
+               (ops GR8   :$dst),
+               "setnp $dst",
+               [(set GR8:$dst, (X86setcc X86_COND_NP))]>,
+               TB; // GR8 = not parity
+def SETNPm  : I<0x9B, MRM0m,
+               (ops i8mem:$dst),
+               "setnp $dst",
+               [(store (X86setcc X86_COND_NP), addr:$dst)]>,
+               TB; // [mem8] = not parity
+
+// Integer comparisons
+def CMP8rr  : I<0x38, MRMDestReg,
+
(ops GR8 :$src1, GR8 :$src2), + "cmp{b} {$src2, $src1|$src1, $src2}", + [(X86cmp GR8:$src1, GR8:$src2)]>; +def CMP16rr : I<0x39, MRMDestReg, + (ops GR16:$src1, GR16:$src2), + "cmp{w} {$src2, $src1|$src1, $src2}", + [(X86cmp GR16:$src1, GR16:$src2)]>, OpSize; +def CMP32rr : I<0x39, MRMDestReg, + (ops GR32:$src1, GR32:$src2), + "cmp{l} {$src2, $src1|$src1, $src2}", + [(X86cmp GR32:$src1, GR32:$src2)]>; +def CMP8mr : I<0x38, MRMDestMem, + (ops i8mem :$src1, GR8 :$src2), + "cmp{b} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi8 addr:$src1), GR8:$src2)]>; +def CMP16mr : I<0x39, MRMDestMem, + (ops i16mem:$src1, GR16:$src2), + "cmp{w} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi16 addr:$src1), GR16:$src2)]>, OpSize; +def CMP32mr : I<0x39, MRMDestMem, + (ops i32mem:$src1, GR32:$src2), + "cmp{l} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi32 addr:$src1), GR32:$src2)]>; +def CMP8rm : I<0x3A, MRMSrcMem, + (ops GR8 :$src1, i8mem :$src2), + "cmp{b} {$src2, $src1|$src1, $src2}", + [(X86cmp GR8:$src1, (loadi8 addr:$src2))]>; +def CMP16rm : I<0x3B, MRMSrcMem, + (ops GR16:$src1, i16mem:$src2), + "cmp{w} {$src2, $src1|$src1, $src2}", + [(X86cmp GR16:$src1, (loadi16 addr:$src2))]>, OpSize; +def CMP32rm : I<0x3B, MRMSrcMem, + (ops GR32:$src1, i32mem:$src2), + "cmp{l} {$src2, $src1|$src1, $src2}", + [(X86cmp GR32:$src1, (loadi32 addr:$src2))]>; +def CMP8ri : Ii8<0x80, MRM7r, + (ops GR8:$src1, i8imm:$src2), + "cmp{b} {$src2, $src1|$src1, $src2}", + [(X86cmp GR8:$src1, imm:$src2)]>; +def CMP16ri : Ii16<0x81, MRM7r, + (ops GR16:$src1, i16imm:$src2), + "cmp{w} {$src2, $src1|$src1, $src2}", + [(X86cmp GR16:$src1, imm:$src2)]>, OpSize; +def CMP32ri : Ii32<0x81, MRM7r, + (ops GR32:$src1, i32imm:$src2), + "cmp{l} {$src2, $src1|$src1, $src2}", + [(X86cmp GR32:$src1, imm:$src2)]>; +def CMP8mi : Ii8 <0x80, MRM7m, + (ops i8mem :$src1, i8imm :$src2), + "cmp{b} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi8 addr:$src1), imm:$src2)]>; +def CMP16mi : Ii16<0x81, MRM7m, + (ops i16mem:$src1, 
i16imm:$src2), + "cmp{w} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi16 addr:$src1), imm:$src2)]>, OpSize; +def CMP32mi : Ii32<0x81, MRM7m, + (ops i32mem:$src1, i32imm:$src2), + "cmp{l} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi32 addr:$src1), imm:$src2)]>; +def CMP16ri8 : Ii8<0x83, MRM7r, + (ops GR16:$src1, i16i8imm:$src2), + "cmp{w} {$src2, $src1|$src1, $src2}", + [(X86cmp GR16:$src1, i16immSExt8:$src2)]>, OpSize; +def CMP16mi8 : Ii8<0x83, MRM7m, + (ops i16mem:$src1, i16i8imm:$src2), + "cmp{w} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi16 addr:$src1), i16immSExt8:$src2)]>, OpSize; +def CMP32mi8 : Ii8<0x83, MRM7m, + (ops i32mem:$src1, i32i8imm:$src2), + "cmp{l} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi32 addr:$src1), i32immSExt8:$src2)]>; +def CMP32ri8 : Ii8<0x83, MRM7r, + (ops GR32:$src1, i32i8imm:$src2), + "cmp{l} {$src2, $src1|$src1, $src2}", + [(X86cmp GR32:$src1, i32immSExt8:$src2)]>; + +// Sign/Zero extenders +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (ops GR16:$dst, GR8 :$src), + "movs{bw|x} {$src, $dst|$dst, $src}", + [(set GR16:$dst, (sext GR8:$src))]>, TB, OpSize; +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (ops GR16:$dst, i8mem :$src), + "movs{bw|x} {$src, $dst|$dst, $src}", + [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB, OpSize; +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (ops GR32:$dst, GR8 :$src), + "movs{bl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR8:$src))]>, TB; +def MOVSX32rm8 : I<0xBE, MRMSrcMem, (ops GR32:$dst, i8mem :$src), + "movs{bl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB; +def MOVSX32rr16: I<0xBF, MRMSrcReg, (ops GR32:$dst, GR16:$src), + "movs{wl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR16:$src))]>, TB; +def MOVSX32rm16: I<0xBF, MRMSrcMem, (ops GR32:$dst, i16mem:$src), + "movs{wl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB; + +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (ops GR16:$dst, GR8 :$src), + "movz{bw|x} {$src, $dst|$dst, $src}", + 
[(set GR16:$dst, (zext GR8:$src))]>, TB, OpSize; +def MOVZX16rm8 : I<0xB6, MRMSrcMem, (ops GR16:$dst, i8mem :$src), + "movz{bw|x} {$src, $dst|$dst, $src}", + [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB, OpSize; +def MOVZX32rr8 : I<0xB6, MRMSrcReg, (ops GR32:$dst, GR8 :$src), + "movz{bl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR8:$src))]>, TB; +def MOVZX32rm8 : I<0xB6, MRMSrcMem, (ops GR32:$dst, i8mem :$src), + "movz{bl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB; +def MOVZX32rr16: I<0xB7, MRMSrcReg, (ops GR32:$dst, GR16:$src), + "movz{wl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR16:$src))]>, TB; +def MOVZX32rm16: I<0xB7, MRMSrcMem, (ops GR32:$dst, i16mem:$src), + "movz{wl|x} {$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB; + +def CBW : I<0x98, RawFrm, (ops), + "{cbtw|cbw}", []>, Imp<[AL],[AX]>, OpSize; // AX = signext(AL) +def CWDE : I<0x98, RawFrm, (ops), + "{cwtl|cwde}", []>, Imp<[AX],[EAX]>; // EAX = signext(AX) + +def CWD : I<0x99, RawFrm, (ops), + "{cwtd|cwd}", []>, Imp<[AX],[AX,DX]>, OpSize; // DX:AX = signext(AX) +def CDQ : I<0x99, RawFrm, (ops), + "{cltd|cdq}", []>, Imp<[EAX],[EAX,EDX]>; // EDX:EAX = signext(EAX) + + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instructions that map movr0 to xor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +def MOV8r0 : I<0x30, MRMInitReg, (ops GR8 :$dst), + "xor{b} $dst, $dst", + [(set GR8:$dst, 0)]>; +def MOV16r0 : I<0x31, MRMInitReg, (ops GR16:$dst), + "xor{w} $dst, $dst", + [(set GR16:$dst, 0)]>, OpSize; +def MOV32r0 : I<0x31, MRMInitReg, (ops GR32:$dst), + "xor{l} $dst, $dst", + [(set GR32:$dst, 0)]>; + +// Basic operations on GR16 / GR32 subclasses GR16_ and GR32_ which contains only +// those registers that have GR8 sub-registers (i.e. 
AX - DX, EAX - EDX). +def MOV16to16_ : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16:$src), + "mov{w} {$src, $dst|$dst, $src}", []>, OpSize; +def MOV32to32_ : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32:$src), + "mov{l} {$src, $dst|$dst, $src}", []>; + +def MOV16_rr : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16_:$src), + "mov{w} {$src, $dst|$dst, $src}", []>, OpSize; +def MOV32_rr : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32_:$src), + "mov{l} {$src, $dst|$dst, $src}", []>; +def MOV16_rm : I<0x8B, MRMSrcMem, (ops GR16_:$dst, i16mem:$src), + "mov{w} {$src, $dst|$dst, $src}", []>, OpSize; +def MOV32_rm : I<0x8B, MRMSrcMem, (ops GR32_:$dst, i32mem:$src), + "mov{l} {$src, $dst|$dst, $src}", []>; +def MOV16_mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16_:$src), + "mov{w} {$src, $dst|$dst, $src}", []>, OpSize; +def MOV32_mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32_:$src), + "mov{l} {$src, $dst|$dst, $src}", []>; + +//===----------------------------------------------------------------------===// +// Thread Local Storage Instructions +// + +def TLS_addr : I<0, Pseudo, (ops GR32:$dst, i32imm:$sym), + "leal ${sym:mem}(,%ebx,1), $dst", + [(set GR32:$dst, (X86tlsaddr tglobaltlsaddr:$sym))]>, + Imp<[EBX],[]>; + +let AddedComplexity = 10 in +def TLS_gs_rr : I<0, Pseudo, (ops GR32:$dst, GR32:$src), + "movl %gs:($src), $dst", + [(set GR32:$dst, (load (add X86TLStp, GR32:$src)))]>; + +let AddedComplexity = 15 in +def TLS_gs_ri : I<0, Pseudo, (ops GR32:$dst, i32imm:$src), + "movl %gs:${src:mem}, $dst", + [(set GR32:$dst, + (load (add X86TLStp, (X86Wrapper tglobaltlsaddr:$src))))]>; + +def TLS_tp : I<0, Pseudo, (ops GR32:$dst), + "movl %gs:0, $dst", + [(set GR32:$dst, X86TLStp)]>; + +//===----------------------------------------------------------------------===// +// DWARF Pseudo Instructions +// + +def DWARF_LOC : I<0, Pseudo, (ops i32imm:$line, i32imm:$col, i32imm:$file), + "; .loc $file, $line, $col", + [(dwarf_loc (i32 imm:$line), (i32 imm:$col), + (i32 imm:$file))]>; + 
+//===----------------------------------------------------------------------===// +// EH Pseudo Instructions +// +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, noResults = 1 in { +def EH_RETURN : I<0xC3, RawFrm, (ops GR32:$addr), + "ret #eh_return, addr: $addr", + [(X86ehret GR32:$addr)]>; + +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable +def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>; +def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; +def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)), (MOV32ri tglobaltlsaddr:$dst)>; +def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; +def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; + +def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), + (ADD32ri GR32:$src1, tconstpool:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)), + (ADD32ri GR32:$src1, tjumptable:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), + (ADD32ri GR32:$src1, tglobaladdr:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), + (ADD32ri GR32:$src1, texternalsym:$src2)>; + +def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV32mi addr:$dst, tglobaladdr:$src)>; +def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV32mi addr:$dst, texternalsym:$src)>; + +// Calls +def : Pat<(X86tailcall GR32:$dst), + (CALL32r GR32:$dst)>; + +def : Pat<(X86tailcall (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86tailcall (i32 texternalsym:$dst)), + (CALLpcrel32 texternalsym:$dst)>; + +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i32 texternalsym:$dst)), + 
(CALLpcrel32 texternalsym:$dst)>; + +// X86 specific add which produces a flag. +def : Pat<(addc GR32:$src1, GR32:$src2), + (ADD32rr GR32:$src1, GR32:$src2)>; +def : Pat<(addc GR32:$src1, (load addr:$src2)), + (ADD32rm GR32:$src1, addr:$src2)>; +def : Pat<(addc GR32:$src1, imm:$src2), + (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(addc GR32:$src1, i32immSExt8:$src2), + (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; + +def : Pat<(subc GR32:$src1, GR32:$src2), + (SUB32rr GR32:$src1, GR32:$src2)>; +def : Pat<(subc GR32:$src1, (load addr:$src2)), + (SUB32rm GR32:$src1, addr:$src2)>; +def : Pat<(subc GR32:$src1, imm:$src2), + (SUB32ri GR32:$src1, imm:$src2)>; +def : Pat<(subc GR32:$src1, i32immSExt8:$src2), + (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; + +def : Pat<(truncstorei1 (i8 imm:$src), addr:$dst), + (MOV8mi addr:$dst, imm:$src)>; +def : Pat<(truncstorei1 GR8:$src, addr:$dst), + (MOV8mr addr:$dst, GR8:$src)>; + +// Comparisons. + +// TEST R,R is smaller than CMP R,0 +def : Pat<(X86cmp GR8:$src1, 0), + (TEST8rr GR8:$src1, GR8:$src1)>; +def : Pat<(X86cmp GR16:$src1, 0), + (TEST16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86cmp GR32:$src1, 0), + (TEST32rr GR32:$src1, GR32:$src1)>; + +// {s|z}extload bool -> {s|z}extload byte +def : Pat<(sextloadi16i1 addr:$src), (MOVSX16rm8 addr:$src)>; +def : Pat<(sextloadi32i1 addr:$src), (MOVSX32rm8 addr:$src)>; +def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; + +// extload bool -> extload byte +def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; + +// anyext -> zext +def : Pat<(i16 (anyext 
GR8 :$src)), (MOVZX16rr8 GR8 :$src)>; +def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; +def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>; +def : Pat<(i16 (anyext (loadi8 addr:$src))), (MOVZX16rm8 addr:$src)>; +def : Pat<(i32 (anyext (loadi8 addr:$src))), (MOVZX32rm8 addr:$src)>; +def : Pat<(i32 (anyext (loadi16 addr:$src))), (MOVZX32rm16 addr:$src)>; + +//===----------------------------------------------------------------------===// +// Some peepholes +//===----------------------------------------------------------------------===// + +// (shl x, 1) ==> (add x, x) +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; + +// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c) +def : Pat<(or (srl GR32:$src1, CL:$amt), + (shl GR32:$src2, (sub 32, CL:$amt))), + (SHRD32rrCL GR32:$src1, GR32:$src2)>; + +def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt), + (shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst), + (SHRD32mrCL addr:$dst, GR32:$src2)>; + +// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c) +def : Pat<(or (shl GR32:$src1, CL:$amt), + (srl GR32:$src2, (sub 32, CL:$amt))), + (SHLD32rrCL GR32:$src1, GR32:$src2)>; + +def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt), + (srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst), + (SHLD32mrCL addr:$dst, GR32:$src2)>; + +// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c) +def : Pat<(or (srl GR16:$src1, CL:$amt), + (shl GR16:$src2, (sub 16, CL:$amt))), + (SHRD16rrCL GR16:$src1, GR16:$src2)>; + +def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt), + (shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst), + (SHRD16mrCL addr:$dst, GR16:$src2)>; + +// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c) +def : Pat<(or (shl GR16:$src1, CL:$amt), + (srl GR16:$src2, (sub 16, CL:$amt))), + (SHLD16rrCL GR16:$src1, GR16:$src2)>; + +def : 
Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt), + (srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst), + (SHLD16mrCL addr:$dst, GR16:$src2)>; + + +//===----------------------------------------------------------------------===// +// Floating Point Stack Support +//===----------------------------------------------------------------------===// + +include "X86InstrFPStack.td" + +//===----------------------------------------------------------------------===// +// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2) +//===----------------------------------------------------------------------===// + +include "X86InstrMMX.td" + +//===----------------------------------------------------------------------===// +// XMM Floating point support (requires SSE / SSE2) +//===----------------------------------------------------------------------===// + +include "X86InstrSSE.td" + +//===----------------------------------------------------------------------===// +// X86-64 Support +//===----------------------------------------------------------------------===// + +include "X86InstrX86-64.td" diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td new file mode 100644 index 0000000..c774460 --- /dev/null +++ b/lib/Target/X86/X86InstrMMX.td @@ -0,0 +1,645 @@ +//====- X86InstrMMX.td - Describe the X86 Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Evan Cheng and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 MMX instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. 
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction templates
+//===----------------------------------------------------------------------===//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXRI  - MMX instructions with TB prefix and REX.W.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+  : I<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXRI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+  : I<o, F, ops, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+  : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+  : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+  : Ii8<o, F, ops, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+  : Ii8<o, F, ops, asm, pattern>, XS, Requires<[HasMMX]>;
+
+// Some 'special' instructions
+def IMPLICIT_DEF_VR64 : I<0, Pseudo, (ops VR64:$dst),
+                          "#IMPLICIT_DEF $dst",
+                          [(set VR64:$dst, (v8i8 (undef)))]>,
+                        Requires<[HasMMX]>;
+
+// 64-bit vector undef's.
+def : Pat<(v8i8 (undef)), (IMPLICIT_DEF_VR64)>; +def : Pat<(v4i16 (undef)), (IMPLICIT_DEF_VR64)>; +def : Pat<(v2i32 (undef)), (IMPLICIT_DEF_VR64)>; +def : Pat<(v1i64 (undef)), (IMPLICIT_DEF_VR64)>; + +//===----------------------------------------------------------------------===// +// MMX Pattern Fragments +//===----------------------------------------------------------------------===// + +def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>; + +def bc_v8i8 : PatFrag<(ops node:$in), (v8i8 (bitconvert node:$in))>; +def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>; +def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>; +def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>; + +//===----------------------------------------------------------------------===// +// MMX Masks +//===----------------------------------------------------------------------===// + +// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to +// PSHUFW imm. +def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{ + return getI8Imm(X86::getShuffleSHUFImmediate(N)); +}]>; + +// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...> +def MMX_UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKHMask(N); +}]>; + +// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...> +def MMX_UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKLMask(N); +}]>; + +// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...> +def MMX_UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKH_v_undef_Mask(N); +}]>; + +// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...> +def MMX_UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKL_v_undef_Mask(N); +}]>; + +// Patterns for shuffling. 
+def MMX_PSHUFW_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isPSHUFDMask(N); +}], MMX_SHUFFLE_get_shuf_imm>; + +// Patterns for: vector_shuffle v1, v2, <4, 5, 2, 3>; etc. +def MMX_MOVL_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVLMask(N); +}]>; + +//===----------------------------------------------------------------------===// +// MMX Multiclasses +//===----------------------------------------------------------------------===// + +let isTwoAddress = 1 in { + // MMXI_binop_rm - Simple MMX binary operator. + multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit Commutable = 0> { + def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> { + let isCommutable = Commutable; + } + def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (OpVT (OpNode VR64:$src1, + (bitconvert + (load_mmx addr:$src2)))))]>; + } + + multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + bit Commutable = 0> { + def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> { + let isCommutable = Commutable; + } + def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; + } + + // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64. + // + // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew + // to collapse (bitconvert VT to VT) into its operand. 
+ // + multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit Commutable = 0> { + def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> { + let isCommutable = Commutable; + } + def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, + (OpNode VR64:$src1,(load_mmx addr:$src2)))]>; + } + + multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId> { + def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>; + def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; + def ri : MMXIi8<opc2, ImmForm, (ops VR64:$dst, VR64:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (scalar_to_vector (i32 imm:$src2))))]>; + } +} + +//===----------------------------------------------------------------------===// +// MMX EMMS & FEMMS Instructions +//===----------------------------------------------------------------------===// + +def MMX_EMMS : MMXI<0x77, RawFrm, (ops), "emms", [(int_x86_mmx_emms)]>; +def MMX_FEMMS : MMXI<0x0E, RawFrm, (ops), "femms", [(int_x86_mmx_femms)]>; + +//===----------------------------------------------------------------------===// +// MMX Scalar Instructions +//===----------------------------------------------------------------------===// + +// Data Transfer Instructions +def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src), + "movd {$src, $dst|$dst, $src}", []>; +def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (ops 
VR64:$dst, i32mem:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (ops i32mem:$dst, VR64:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (ops VR64:$dst, GR64:$src),
+                             "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (ops VR64:$dst, VR64:$src),
+                        "movq {$src, $dst|$dst, $src}", []>;
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (ops VR64:$dst, i64mem:$src),
+                        "movq {$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+                        "movq {$src, $dst|$dst, $src}",
+                        [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+// movdq2q / movq2dq are register-to-register moves (F2/F3 0F D6 /r): both
+// operands live in ModRM, so the format is MRMSrcReg.  MRMDestMem would make
+// the encoder treat one operand as a memory reference.
+def MMX_MOVDQ2Qrr : MMXID<0xD6, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                          "movdq2q {$src, $dst|$dst, $src}",
+                          [(set VR64:$dst,
+                            (v1i64 (vector_extract (v2i64 VR128:$src),
+                                   (iPTR 0))))]>;
+
+def MMX_MOVQ2DQrr : MMXIS<0xD6, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+                          "movq2dq {$src, $dst|$dst, $src}",
+                          [(set VR128:$dst,
+                            (bitconvert (v1i64 VR64:$src)))]>;
+
+// Non-temporal (write-combining) store of an MMX register.
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+                        "movntq {$src, $dst|$dst, $src}",
+                        [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+// NOTE(review): MMX2I adds an OpSize (0x66) prefix, which turns 0F 6E into
+// the SSE2 movd-to-XMM encoding even though the destination here is VR64 —
+// confirm this shouldn't be plain MMXI.
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMX2I<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
+                             "movd {$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle immAllZerosV,
+                                       (v2i32 (scalar_to_vector GR32:$src)),
+                                       MMX_MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMX2I<0x6E, MRMSrcMem, (ops VR64:$dst, i32mem:$src),
+                             "movd {$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle immAllZerosV,
+                                       (v2i32 (scalar_to_vector
+                                               (loadi32 addr:$src))),
+                                       MMX_MOVL_shuffle_mask)))]>;
+
+// Arithmetic Instructions
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8,  1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm
MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+// Saturating adds.
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW  : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,  1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb",   int_x86_mmx_pavg_b,   1>;
+defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw",   int_x86_mmx_pavg_w,   1>;
+
+defm MMX_PMINUB  : MMXI_binop_rm_int<0xDA, "pminub",  int_x86_mmx_pminu_b,  1>;
+defm MMX_PMINSW  : MMXI_binop_rm_int<0xEA, "pminsw",  int_x86_mmx_pmins_w,  1>;
+
+defm MMX_PMAXUB  : MMXI_binop_rm_int<0xDE, "pmaxub",  int_x86_mmx_pmaxu_b,  1>;
+defm MMX_PMAXSW  : MMXI_binop_rm_int<0xEE, "pmaxsw",  int_x86_mmx_pmaxs_w,  1>;
+
+// PSADBW encodes as 0F F6 /r.  (0xE0 is PAVGB's opcode, already used above;
+// emitting it here would assemble psadbw as pavgb.)
+defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw",  int_x86_mmx_psad_bw,  1>;
+
+// Logical Instructions +defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>; +defm MMX_POR : MMXI_binop_rm_v1i64<0xEB, "por" , or, 1>; +defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>; + +let isTwoAddress = 1 in { + def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, VR64:$src2), + "pandn {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1), + VR64:$src2)))]>; + def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem, + (ops VR64:$dst, VR64:$src1, i64mem:$src2), + "pandn {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1), + (load addr:$src2))))]>; +} + +// Shift Instructions +defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", + int_x86_mmx_psrl_w>; +defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", + int_x86_mmx_psrl_d>; +defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", + int_x86_mmx_psrl_q>; + +defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", + int_x86_mmx_psll_w>; +defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", + int_x86_mmx_psll_d>; +defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", + int_x86_mmx_psll_q>; + +defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", + int_x86_mmx_psra_w>; +defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", + int_x86_mmx_psra_d>; + +// Comparison Instructions +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>; +defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>; + +// Conversion Instructions + +// -- Unpack Instructions +let isTwoAddress = 1 in { + // Unpack High Packed Data Instructions + def 
MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, VR64:$src2), + "punpckhbw {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v8i8 (vector_shuffle VR64:$src1, VR64:$src2, + MMX_UNPCKH_shuffle_mask)))]>; + def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem, + (ops VR64:$dst, VR64:$src1, i64mem:$src2), + "punpckhbw {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v8i8 (vector_shuffle VR64:$src1, + (bc_v8i8 (load_mmx addr:$src2)), + MMX_UNPCKH_shuffle_mask)))]>; + + def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, VR64:$src2), + "punpckhwd {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v4i16 (vector_shuffle VR64:$src1, VR64:$src2, + MMX_UNPCKH_shuffle_mask)))]>; + def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem, + (ops VR64:$dst, VR64:$src1, i64mem:$src2), + "punpckhwd {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v4i16 (vector_shuffle VR64:$src1, + (bc_v4i16 (load_mmx addr:$src2)), + MMX_UNPCKH_shuffle_mask)))]>; + + def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, VR64:$src2), + "punpckhdq {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v2i32 (vector_shuffle VR64:$src1, VR64:$src2, + MMX_UNPCKH_shuffle_mask)))]>; + def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem, + (ops VR64:$dst, VR64:$src1, i64mem:$src2), + "punpckhdq {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v2i32 (vector_shuffle VR64:$src1, + (bc_v2i32 (load_mmx addr:$src2)), + MMX_UNPCKH_shuffle_mask)))]>; + + // Unpack Low Packed Data Instructions + def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, VR64:$src2), + "punpcklbw {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v8i8 (vector_shuffle VR64:$src1, VR64:$src2, + MMX_UNPCKL_shuffle_mask)))]>; + def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem, + (ops VR64:$dst, VR64:$src1, i64mem:$src2), + "punpcklbw {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v8i8 (vector_shuffle VR64:$src1, + (bc_v8i8 (load_mmx addr:$src2)), + MMX_UNPCKL_shuffle_mask)))]>; + + def 
MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, VR64:$src2), + "punpcklwd {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v4i16 (vector_shuffle VR64:$src1, VR64:$src2, + MMX_UNPCKL_shuffle_mask)))]>; + def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem, + (ops VR64:$dst, VR64:$src1, i64mem:$src2), + "punpcklwd {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v4i16 (vector_shuffle VR64:$src1, + (bc_v4i16 (load_mmx addr:$src2)), + MMX_UNPCKL_shuffle_mask)))]>; + + def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, VR64:$src2), + "punpckldq {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v2i32 (vector_shuffle VR64:$src1, VR64:$src2, + MMX_UNPCKL_shuffle_mask)))]>; + def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem, + (ops VR64:$dst, VR64:$src1, i64mem:$src2), + "punpckldq {$src2, $dst|$dst, $src2}", + [(set VR64:$dst, + (v2i32 (vector_shuffle VR64:$src1, + (bc_v2i32 (load_mmx addr:$src2)), + MMX_UNPCKL_shuffle_mask)))]>; +} + +// -- Pack Instructions +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>; + +// -- Shuffle Instructions +def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, i8imm:$src2), + "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (v4i16 (vector_shuffle + VR64:$src1, (undef), + MMX_PSHUFW_shuffle_mask:$src2)))]>; +def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, + (ops VR64:$dst, i64mem:$src1, i8imm:$src2), + "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (v4i16 (vector_shuffle + (bc_v4i16 (load_mmx addr:$src1)), + (undef), + MMX_PSHUFW_shuffle_mask:$src2)))]>; + +// -- Conversion Instructions +def MMX_CVTPD2PIrr : MMX2I<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src), + "cvtpd2pi {$src, $dst|$dst, $src}", []>; +def MMX_CVTPD2PIrm : MMX2I<0x2D, MRMSrcMem, (ops VR64:$dst, 
f128mem:$src), + "cvtpd2pi {$src, $dst|$dst, $src}", []>; + +def MMX_CVTPI2PDrr : MMX2I<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src), + "cvtpi2pd {$src, $dst|$dst, $src}", []>; +def MMX_CVTPI2PDrm : MMX2I<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "cvtpi2pd {$src, $dst|$dst, $src}", []>; + +def MMX_CVTPI2PSrr : MMXI<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src), + "cvtpi2ps {$src, $dst|$dst, $src}", []>; +def MMX_CVTPI2PSrm : MMXI<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "cvtpi2ps {$src, $dst|$dst, $src}", []>; + +def MMX_CVTPS2PIrr : MMXI<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src), + "cvtps2pi {$src, $dst|$dst, $src}", []>; +def MMX_CVTPS2PIrm : MMXI<0x2D, MRMSrcMem, (ops VR64:$dst, f64mem:$src), + "cvtps2pi {$src, $dst|$dst, $src}", []>; + +def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src), + "cvttpd2pi {$src, $dst|$dst, $src}", []>; +def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (ops VR64:$dst, f128mem:$src), + "cvttpd2pi {$src, $dst|$dst, $src}", []>; + +def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src), + "cvttps2pi {$src, $dst|$dst, $src}", []>; +def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (ops VR64:$dst, f64mem:$src), + "cvttps2pi {$src, $dst|$dst, $src}", []>; + +// Extract / Insert +def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>; +def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>; + +def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg, + (ops GR32:$dst, VR64:$src1, i16i8imm:$src2), + "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1), + (iPTR imm:$src2)))]>; +let isTwoAddress = 1 in { + def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg, + (ops VR64:$dst, VR64:$src1, GR32:$src2, i16i8imm:$src3), + "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1), + GR32:$src2, (iPTR imm:$src3))))]>; + def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem, + (ops VR64:$dst, 
VR64:$src1, i16mem:$src2, i16i8imm:$src3), + "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, + (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1), + (i32 (anyext (loadi16 addr:$src2))), + (iPTR imm:$src3))))]>; +} + +// Mask creation +def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (ops GR32:$dst, VR64:$src), + "pmovmskb {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>; + +// Misc. +def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (ops VR64:$src, VR64:$mask), + "maskmovq {$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>, + Imp<[EDI],[]>; + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instructions that map zero vector to pxor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let isReMaterializable = 1 in { + def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (ops VR64:$dst), + "pxor $dst, $dst", + [(set VR64:$dst, (v1i64 immAllZerosV))]>; + def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (ops VR64:$dst), + "pcmpeqd $dst, $dst", + [(set VR64:$dst, (v1i64 immAllOnesV))]>; +} + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// Store 64-bit integer vector values. +def : Pat<(store (v8i8 VR64:$src), addr:$dst), + (MMX_MOVQ64mr addr:$dst, VR64:$src)>; +def : Pat<(store (v4i16 VR64:$src), addr:$dst), + (MMX_MOVQ64mr addr:$dst, VR64:$src)>; +def : Pat<(store (v2i32 VR64:$src), addr:$dst), + (MMX_MOVQ64mr addr:$dst, VR64:$src)>; +def : Pat<(store (v1i64 VR64:$src), addr:$dst), + (MMX_MOVQ64mr addr:$dst, VR64:$src)>; + +// 64-bit vector all zero's. 
+def : Pat<(v8i8 immAllZerosV), (MMX_V_SET0)>; +def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>; +def : Pat<(v2i32 immAllZerosV), (MMX_V_SET0)>; +def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>; + +// 64-bit vector all one's. +def : Pat<(v8i8 immAllOnesV), (MMX_V_SETALLONES)>; +def : Pat<(v4i16 immAllOnesV), (MMX_V_SETALLONES)>; +def : Pat<(v2i32 immAllOnesV), (MMX_V_SETALLONES)>; +def : Pat<(v1i64 immAllOnesV), (MMX_V_SETALLONES)>; + +// Bit convert. +def : Pat<(v8i8 (bitconvert (v1i64 VR64:$src))), (v8i8 VR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 VR64:$src))), (v8i8 VR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 VR64:$src))), (v8i8 VR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 VR64:$src))), (v4i16 VR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 VR64:$src))), (v2i32 VR64:$src)>; +def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 VR64:$src))), (v1i64 VR64:$src)>; + +// 64-bit bit convert. 
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))), + (MMX_MOVD64to64rr GR64:$src)>; +def : Pat<(v2i32 (bitconvert (i64 GR64:$src))), + (MMX_MOVD64to64rr GR64:$src)>; +def : Pat<(v4i16 (bitconvert (i64 GR64:$src))), + (MMX_MOVD64to64rr GR64:$src)>; +def : Pat<(v8i8 (bitconvert (i64 GR64:$src))), + (MMX_MOVD64to64rr GR64:$src)>; + +def MMX_X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>; + +// Move scalar to XMM zero-extended +// movd to XMM register zero-extends +let AddedComplexity = 15 in { + def : Pat<(v8i8 (vector_shuffle immAllZerosV, + (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)), + (MMX_MOVZDI2PDIrr GR32:$src)>; + def : Pat<(v4i16 (vector_shuffle immAllZerosV, + (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)), + (MMX_MOVZDI2PDIrr GR32:$src)>; + def : Pat<(v2i32 (vector_shuffle immAllZerosV, + (v2i32 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)), + (MMX_MOVZDI2PDIrr GR32:$src)>; +} + +// Scalar to v2i32 / v4i16 / v8i8. The source may be a GR32, but only the lower +// 8 or 16-bits matter. +def : Pat<(v8i8 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>; +def : Pat<(v4i16 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>; +def : Pat<(v2i32 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>; + +// Patterns to perform canonical versions of vector shuffling. 
+// Unpacking a register with itself selects the low/high duplicated-element
+// shuffles (vector_shuffle v, undef).
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8 (vector_shuffle VR64:$src, (undef),
+                  MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+                  MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+                  MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8 (vector_shuffle VR64:$src, (undef),
+                  MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+                  MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+                  MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Patterns to perform vector shuffling with a zeroed out vector.
+// FIXME(review): the result (MMX_PUNPCKLDQrm VR64:$src, VR64:$src) passes the
+// pattern's $src — which is bound through load_mmx addr:$src, i.e. a memory
+// address — as two register operands, while PUNPCKLDQrm's operand list is
+// (VR64:$src1, i64mem:$src2).  This looks inconsistent; confirm before
+// relying on this pattern.
+let AddedComplexity = 20 in {
+  def : Pat<(bc_v2i32 (vector_shuffle immAllZerosV,
+                       (v2i32 (scalar_to_vector (load_mmx addr:$src))),
+                       MMX_UNPCKL_shuffle_mask)),
+            (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), + VR64:$src2)), + (MMX_PANDNrr VR64:$src1, VR64:$src2)>; +def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))), + VR64:$src2)), + (MMX_PANDNrr VR64:$src1, VR64:$src2)>; +def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))), + VR64:$src2)), + (MMX_PANDNrr VR64:$src1, VR64:$src2)>; + +def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))), + (load addr:$src2))), + (MMX_PANDNrm VR64:$src1, addr:$src2)>; +def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))), + (load addr:$src2))), + (MMX_PANDNrm VR64:$src1, addr:$src2)>; +def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))), + (load addr:$src2))), + (MMX_PANDNrm VR64:$src1, addr:$src2)>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td new file mode 100644 index 0000000..5fc7a65 --- /dev/null +++ b/lib/Target/X86/X86InstrSSE.td @@ -0,0 +1,2572 @@ +//====- X86InstrSSE.td - Describe the X86 Instruction Set -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Evan Cheng and is distributed under the University +// of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 SSE instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// SSE specific DAG Nodes. 
+//===----------------------------------------------------------------------===// + +def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, + SDTCisFP<0>, SDTCisInt<2> ]>; + +def X86loadp : SDNode<"X86ISD::LOAD_PACK", SDTLoad, [SDNPHasChain]>; +def X86loadu : SDNode<"X86ISD::LOAD_UA", SDTLoad, [SDNPHasChain]>; +def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; +def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; +def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest, + [SDNPHasChain, SDNPOutFlag]>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest, + [SDNPHasChain, SDNPOutFlag]>; +def X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>; + +//===----------------------------------------------------------------------===// +// SSE 'Special' Instructions +//===----------------------------------------------------------------------===// + +def IMPLICIT_DEF_VR128 : I<0, Pseudo, (ops VR128:$dst), + "#IMPLICIT_DEF $dst", + [(set VR128:$dst, (v4f32 (undef)))]>, + Requires<[HasSSE1]>; +def IMPLICIT_DEF_FR32 : I<0, Pseudo, (ops FR32:$dst), + "#IMPLICIT_DEF $dst", + [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>; +def IMPLICIT_DEF_FR64 : I<0, Pseudo, (ops FR64:$dst), + "#IMPLICIT_DEF $dst", + [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>; + +//===----------------------------------------------------------------------===// +// SSE Complex Patterns 
+//===----------------------------------------------------------------------===// + +// These are 'extloads' from a scalar to the low element of a vector, zeroing +// the top elements. These are used for the SSE 'ss' and 'sd' instruction +// forms. +def sse_load_f32 : ComplexPattern<v4f32, 4, "SelectScalarSSELoad", [], + [SDNPHasChain]>; +def sse_load_f64 : ComplexPattern<v2f64, 4, "SelectScalarSSELoad", [], + [SDNPHasChain]>; + +def ssmem : Operand<v4f32> { + let PrintMethod = "printf32mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm); +} +def sdmem : Operand<v2f64> { + let PrintMethod = "printf64mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm); +} + +//===----------------------------------------------------------------------===// +// SSE pattern fragments +//===----------------------------------------------------------------------===// + +def X86loadpf32 : PatFrag<(ops node:$ptr), (f32 (X86loadp node:$ptr))>; +def X86loadpf64 : PatFrag<(ops node:$ptr), (f64 (X86loadp node:$ptr))>; + +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; + +def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; +def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; +def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; +def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; +def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; +def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; + +def fp32imm0 : PatLeaf<(f32 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def PSxLDQ_imm : SDNodeXForm<imm, [{ + // Transformation function: imm >> 3 + return getI32Imm(N->getValue() >> 3); +}]>; + +// SHUFFLE_get_shuf_imm xform function: convert 
vector_shuffle mask to PSHUF*, +// SHUFP* etc. imm. +def SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{ + return getI8Imm(X86::getShuffleSHUFImmediate(N)); +}]>; + +// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to +// PSHUFHW imm. +def SHUFFLE_get_pshufhw_imm : SDNodeXForm<build_vector, [{ + return getI8Imm(X86::getShufflePSHUFHWImmediate(N)); +}]>; + +// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to +// PSHUFLW imm. +def SHUFFLE_get_pshuflw_imm : SDNodeXForm<build_vector, [{ + return getI8Imm(X86::getShufflePSHUFLWImmediate(N)); +}]>; + +def SSE_splat_mask : PatLeaf<(build_vector), [{ + return X86::isSplatMask(N); +}], SHUFFLE_get_shuf_imm>; + +def SSE_splat_lo_mask : PatLeaf<(build_vector), [{ + return X86::isSplatLoMask(N); +}]>; + +def MOVHLPS_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVHLPSMask(N); +}]>; + +def MOVHLPS_v_undef_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVHLPS_v_undef_Mask(N); +}]>; + +def MOVHP_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVHPMask(N); +}]>; + +def MOVLP_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVLPMask(N); +}]>; + +def MOVL_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVLMask(N); +}]>; + +def MOVSHDUP_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVSHDUPMask(N); +}]>; + +def MOVSLDUP_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isMOVSLDUPMask(N); +}]>; + +def UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKLMask(N); +}]>; + +def UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKHMask(N); +}]>; + +def UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKL_v_undef_Mask(N); +}]>; + +def UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isUNPCKH_v_undef_Mask(N); +}]>; + +def PSHUFD_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isPSHUFDMask(N); +}], SHUFFLE_get_shuf_imm>; + +def 
PSHUFHW_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isPSHUFHWMask(N); +}], SHUFFLE_get_pshufhw_imm>; + +def PSHUFLW_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isPSHUFLWMask(N); +}], SHUFFLE_get_pshuflw_imm>; + +def SHUFP_unary_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isPSHUFDMask(N); +}], SHUFFLE_get_shuf_imm>; + +def SHUFP_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isSHUFPMask(N); +}], SHUFFLE_get_shuf_imm>; + +def PSHUFD_binary_shuffle_mask : PatLeaf<(build_vector), [{ + return X86::isSHUFPMask(N); +}], SHUFFLE_get_shuf_imm>; + +//===----------------------------------------------------------------------===// +// SSE scalar FP Instructions +//===----------------------------------------------------------------------===// + +// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded by the +// scheduler into a branch sequence. +let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler. + def CMOV_FR32 : I<0, Pseudo, + (ops FR32:$dst, FR32:$t, FR32:$f, i8imm:$cond), + "#CMOV_FR32 PSEUDO!", + [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond))]>; + def CMOV_FR64 : I<0, Pseudo, + (ops FR64:$dst, FR64:$t, FR64:$f, i8imm:$cond), + "#CMOV_FR64 PSEUDO!", + [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond))]>; + def CMOV_V4F32 : I<0, Pseudo, + (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V4F32 PSEUDO!", + [(set VR128:$dst, + (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>; + def CMOV_V2F64 : I<0, Pseudo, + (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2F64 PSEUDO!", + [(set VR128:$dst, + (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>; + def CMOV_V2I64 : I<0, Pseudo, + (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2I64 PSEUDO!", + [(set VR128:$dst, + (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>; +} + +//===----------------------------------------------------------------------===// +// SSE1 Instructions 
+//===----------------------------------------------------------------------===// + +// SSE1 Instruction Templates: +// +// SSI - SSE1 instructions with XS prefix. +// PSI - SSE1 instructions with TB prefix. +// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. + +class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>; +class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>; +class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>; + +// Move Instructions +def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src), + "movss {$src, $dst|$dst, $src}", []>; +def MOVSSrm : SSI<0x10, MRMSrcMem, (ops FR32:$dst, f32mem:$src), + "movss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (loadf32 addr:$src))]>; +def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, FR32:$src), + "movss {$src, $dst|$dst, $src}", + [(store FR32:$src, addr:$dst)]>; + +// Conversion instructions +def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (fp_to_sint FR32:$src))]>; +def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>; +def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR32:$src), + "cvtsi2ss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (sint_to_fp GR32:$src))]>; +def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (ops FR32:$dst, i32mem:$src), + "cvtsi2ss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>; + +// Match intrinsics which expect XMM operand(s). 
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "cvtss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>; +def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (ops GR32:$dst, f32mem:$src), + "cvtss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse_cvtss2si + (load addr:$src)))]>; + +// Aliases for intrinsics +def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, + (int_x86_sse_cvttss2si VR128:$src))]>; +def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src), + "cvttss2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, + (int_x86_sse_cvttss2si(load addr:$src)))]>; + +let isTwoAddress = 1 in { + def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, GR32:$src2), + "cvtsi2ss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1, + GR32:$src2))]>; + def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i32mem:$src2), + "cvtsi2ss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1, + (loadi32 addr:$src2)))]>; +} + +// Comparison instructions +let isTwoAddress = 1 in { + def CMPSSrr : SSI<0xC2, MRMSrcReg, + (ops FR32:$dst, FR32:$src1, FR32:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", + []>; + def CMPSSrm : SSI<0xC2, MRMSrcMem, + (ops FR32:$dst, FR32:$src1, f32mem:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", []>; +} + +def UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops FR32:$src1, FR32:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86cmp FR32:$src1, FR32:$src2)]>; +def UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops FR32:$src1, f32mem:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86cmp FR32:$src1, (loadf32 addr:$src2))]>; + +// Aliases to match intrinsics which expect XMM operand(s). 
+let isTwoAddress = 1 in { + def Int_CMPSSrr : SSI<0xC2, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1, + VR128:$src, imm:$cc))]>; + def Int_CMPSSrm : SSI<0xC2, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f32mem:$src, SSECC:$cc), + "cmp${cc}ss {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1, + (load addr:$src), imm:$cc))]>; +} + +def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86ucomi (v4f32 VR128:$src1), VR128:$src2)]>; +def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), + "ucomiss {$src2, $src1|$src1, $src2}", + [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2))]>; + +def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2), + "comiss {$src2, $src1|$src1, $src2}", + [(X86comi (v4f32 VR128:$src1), VR128:$src2)]>; +def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), + "comiss {$src2, $src1|$src1, $src2}", + [(X86comi (v4f32 VR128:$src1), (load addr:$src2))]>; + +// Aliases of packed SSE1 instructions for scalar use. These all have names that +// start with 'Fs'. + +// Alias instructions that map fld0 to pxor for sse. +def FsFLD0SS : I<0xEF, MRMInitReg, (ops FR32:$dst), + "pxor $dst, $dst", [(set FR32:$dst, fp32imm0)]>, + Requires<[HasSSE1]>, TB, OpSize; + +// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are +// disregarded. +def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (ops FR32:$dst, FR32:$src), + "movaps {$src, $dst|$dst, $src}", []>; + +// Alias instruction to load FR32 from f128mem using movaps. Upper bits are +// disregarded. +def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (ops FR32:$dst, f128mem:$src), + "movaps {$src, $dst|$dst, $src}", + [(set FR32:$dst, (X86loadpf32 addr:$src))]>; + +// Alias bitwise logical operations using SSE logical ops on packed FP values. 
+let isTwoAddress = 1 in { +let isCommutable = 1 in { + def FsANDPSrr : PSI<0x54, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>; + def FsORPSrr : PSI<0x56, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>; + def FsXORPSrr : PSI<0x57, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>; +} + +def FsANDPSrm : PSI<0x54, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fand FR32:$src1, + (X86loadpf32 addr:$src2)))]>; +def FsORPSrm : PSI<0x56, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86for FR32:$src1, + (X86loadpf32 addr:$src2)))]>; +def FsXORPSrm : PSI<0x57, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set FR32:$dst, (X86fxor FR32:$src1, + (X86loadpf32 addr:$src2)))]>; + +def FsANDNPSrr : PSI<0x55, MRMSrcReg, + (ops FR32:$dst, FR32:$src1, FR32:$src2), + "andnps {$src2, $dst|$dst, $src2}", []>; +def FsANDNPSrm : PSI<0x55, MRMSrcMem, + (ops FR32:$dst, FR32:$src1, f128mem:$src2), + "andnps {$src2, $dst|$dst, $src2}", []>; +} + +/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements undefined. +/// +/// These three forms can each be reg+reg or reg+mem, so there are a total of +/// six "instructions". 
+/// +let isTwoAddress = 1 in { +multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F32Int, + bit Commutable = 0> { + // Scalar operation, reg+reg. + def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> { + let isCommutable = Commutable; + } + + // Scalar operation, reg+mem. + def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>; + + // Vector operation, reg+reg. + def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. + def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>; + + // Intrinsic operation, reg+reg. + def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + + // Intrinsic operation, reg+mem. 
+ def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F32Int VR128:$src1, + sse_load_f32:$src2))]>; +} +} + +// Arithmetic instructions +defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>; +defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>; +defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>; +defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>; + +/// sse1_fp_binop_rm - Other SSE1 binops +/// +/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of +/// instructions for a full-vector intrinsic form. Operations that map +/// onto C operators don't use this form since they just use the plain +/// vector form instead of having a separate vector intrinsic form. +/// +/// This provides a total of eight "instructions". +/// +let isTwoAddress = 1 in { +multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F32Int, + Intrinsic V4F32Int, + bit Commutable = 0> { + + // Scalar operation, reg+reg. + def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> { + let isCommutable = Commutable; + } + + // Scalar operation, reg+mem. + def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>; + + // Vector operation, reg+reg. + def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. 
+ def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>; + + // Intrinsic operation, reg+reg. + def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + + // Intrinsic operation, reg+mem. + def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2), + !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F32Int VR128:$src1, + sse_load_f32:$src2))]>; + + // Vector intrinsic operation, reg+reg. + def PSrr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + + // Vector intrinsic operation, reg+mem. 
+ def PSrm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2), + !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (V4F32Int VR128:$src1, (load addr:$src2)))]>; +} +} + +defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax, + int_x86_sse_max_ss, int_x86_sse_max_ps>; +defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin, + int_x86_sse_min_ss, int_x86_sse_min_ps>; + +//===----------------------------------------------------------------------===// +// SSE packed FP Instructions + +// Move Instructions +def MOVAPSrr : PSI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movaps {$src, $dst|$dst, $src}", []>; +def MOVAPSrm : PSI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movaps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (loadv4f32 addr:$src))]>; + +def MOVAPSmr : PSI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src), + "movaps {$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)]>; + +def MOVUPSrr : PSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movups {$src, $dst|$dst, $src}", []>; +def MOVUPSrm : PSI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movups {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>; +def MOVUPSmr : PSI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src), + "movups {$src, $dst|$dst, $src}", + [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>; + +let isTwoAddress = 1 in { + let AddedComplexity = 20 in { + def MOVLPSrm : PSI<0x12, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f64mem:$src2), + "movlps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))), + MOVLP_shuffle_mask)))]>; + def MOVHPSrm : PSI<0x16, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f64mem:$src2), + "movhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))), + MOVHP_shuffle_mask)))]>; + } // 
AddedComplexity +} // isTwoAddress + +def MOVLPSmr : PSI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src), + "movlps {$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)]>; + +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. +def MOVHPSmr : PSI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src), + "movhps {$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (vector_shuffle + (bc_v2f64 (v4f32 VR128:$src)), (undef), + UNPCKH_shuffle_mask)), (iPTR 0))), + addr:$dst)]>; + +let isTwoAddress = 1 in { +let AddedComplexity = 15 in { +def MOVLHPSrr : PSI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "movlhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVHP_shuffle_mask)))]>; + +def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + "movhlps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVHLPS_shuffle_mask)))]>; +} // AddedComplexity +} // isTwoAddress + + + +// Arithmetic + +/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. +/// +/// These four forms can each have a reg or a mem operand, so there are a +/// total of eight "instructions". +/// +multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F32Int, + Intrinsic V4F32Int, + bit Commutable = 0> { + // Scalar operation, reg. 
+ def SSr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src), + !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode FR32:$src))]> { + let isCommutable = Commutable; + } + + // Scalar operation, mem. + def SSm : SSI<opc, MRMSrcMem, (ops FR32:$dst, f32mem:$src), + !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode (load addr:$src)))]>; + + // Vector operation, reg. + def PSr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> { + let isCommutable = Commutable; + } + + // Vector operation, mem. + def PSm : PSI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>; + + // Intrinsic operation, reg. + def SSr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int VR128:$src))]> { + let isCommutable = Commutable; + } + + // Intrinsic operation, mem. + def SSm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, ssmem:$src), + !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int sse_load_f32:$src))]>; + + // Vector intrinsic operation, reg + def PSr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int VR128:$src))]> { + let isCommutable = Commutable; + } + + // Vector intrinsic operation, mem + def PSm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, f32mem:$src), + !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int (load addr:$src)))]>; +} + +// Square root. +defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt, + int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. 
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt, + int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>; +defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp, + int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>; + +// Logical +let isTwoAddress = 1 in { + let isCommutable = 1 in { + def ANDPSrr : PSI<0x54, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 + (and VR128:$src1, VR128:$src2)))]>; + def ORPSrr : PSI<0x56, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 + (or VR128:$src1, VR128:$src2)))]>; + def XORPSrr : PSI<0x57, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 + (xor VR128:$src1, VR128:$src2)))]>; + } + + def ANDPSrm : PSI<0x54, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "andps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (and VR128:$src1, + (bc_v2i64 (loadv4f32 addr:$src2))))]>; + def ORPSrm : PSI<0x56, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "orps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (or VR128:$src1, + (bc_v2i64 (loadv4f32 addr:$src2))))]>; + def XORPSrm : PSI<0x57, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "xorps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (xor VR128:$src1, + (bc_v2i64 (loadv4f32 addr:$src2))))]>; + def ANDNPSrr : PSI<0x55, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "andnps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (and (xor VR128:$src1, + (bc_v2i64 (v4i32 immAllOnesV))), + VR128:$src2)))]>; + def ANDNPSrm : PSI<0x55, MRMSrcMem, + (ops VR128:$dst, VR128:$src1,f128mem:$src2), + "andnps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (and (xor VR128:$src1, + (bc_v2i64 (v4i32 immAllOnesV))), + (bc_v2i64 (loadv4f32 addr:$src2)))))]>; +} + +let isTwoAddress = 1 in { + def CMPPSrri : PSIi8<0xC2, MRMSrcReg, + (ops VR128:$dst, 
VR128:$src1, VR128:$src, SSECC:$cc), + "cmp${cc}ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1, + VR128:$src, imm:$cc))]>; + def CMPPSrmi : PSIi8<0xC2, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc), + "cmp${cc}ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1, + (load addr:$src), imm:$cc))]>; +} + +// Shuffle and unpack instructions +let isTwoAddress = 1 in { + let isConvertibleToThreeAddress = 1 in // Convert to pshufd + def SHUFPSrri : PSIi8<0xC6, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, + VR128:$src2, i32i8imm:$src3), + "shufps {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, VR128:$src2, + SHUFP_shuffle_mask:$src3)))]>; + def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, + f128mem:$src2, i32i8imm:$src3), + "shufps {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, (load addr:$src2), + SHUFP_shuffle_mask:$src3)))]>; + + let AddedComplexity = 10 in { + def UNPCKHPSrr : PSI<0x15, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "unpckhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, VR128:$src2, + UNPCKH_shuffle_mask)))]>; + def UNPCKHPSrm : PSI<0x15, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "unpckhps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, (load addr:$src2), + UNPCKH_shuffle_mask)))]>; + + def UNPCKLPSrr : PSI<0x14, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "unpcklps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, VR128:$src2, + UNPCKL_shuffle_mask)))]>; + def UNPCKLPSrm : PSI<0x14, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "unpcklps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle + VR128:$src1, (load addr:$src2), + 
UNPCKL_shuffle_mask)))]>; + } // AddedComplexity +} // isTwoAddress + +// Mask creation +def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "movmskps {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>; +def MOVMSKPDrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "movmskpd {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>; + +// Prefetching loads. +// TODO: no intrinsics for these? +def PREFETCHT0 : PSI<0x18, MRM1m, (ops i8mem:$src), "prefetcht0 $src", []>; +def PREFETCHT1 : PSI<0x18, MRM2m, (ops i8mem:$src), "prefetcht1 $src", []>; +def PREFETCHT2 : PSI<0x18, MRM3m, (ops i8mem:$src), "prefetcht2 $src", []>; +def PREFETCHNTA : PSI<0x18, MRM0m, (ops i8mem:$src), "prefetchnta $src", []>; + +// Non-temporal stores +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (ops i128mem:$dst, VR128:$src), + "movntps {$src, $dst|$dst, $src}", + [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>; + +// Load, store, and memory fence +def SFENCE : PSI<0xAE, MRM7m, (ops), "sfence", [(int_x86_sse_sfence)]>; + +// MXCSR register +def LDMXCSR : PSI<0xAE, MRM2m, (ops i32mem:$src), + "ldmxcsr $src", [(int_x86_sse_ldmxcsr addr:$src)]>; +def STMXCSR : PSI<0xAE, MRM3m, (ops i32mem:$dst), + "stmxcsr $dst", [(int_x86_sse_stmxcsr addr:$dst)]>; + +// Alias instructions that map zero vector to pxor / xorp* for sse. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let isReMaterializable = 1 in +def V_SET0 : PSI<0x57, MRMInitReg, (ops VR128:$dst), + "xorps $dst, $dst", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; + +// FR32 to 128-bit vector conversion. 
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src), + "movss {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4f32 (scalar_to_vector FR32:$src)))]>; +def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src), + "movss {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>; + +// FIXME: may not be able to eliminate this movss with coalescing the src and +// dest register classes are different. We really want to write this pattern +// like this: +// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), +// (f32 FR32:$src)>; +def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, VR128:$src), + "movss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (vector_extract (v4f32 VR128:$src), + (iPTR 0)))]>; +def MOVPS2SSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, VR128:$src), + "movss {$src, $dst|$dst, $src}", + [(store (f32 (vector_extract (v4f32 VR128:$src), + (iPTR 0))), addr:$dst)]>; + + +// Move to lower bits of a VR128, leaving upper bits alone. +// Three operand (but two address) aliases. +let isTwoAddress = 1 in { + def MOVLSS2PSrr : SSI<0x10, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, FR32:$src2), + "movss {$src2, $dst|$dst, $src2}", []>; + + let AddedComplexity = 15 in + def MOVLPSrr : SSI<0x10, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "movss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVL_shuffle_mask)))]>; +} + +// Move to lower bits of a VR128 and zeroing upper bits. +// Loading from memory automatically zeroing upper bits. 
+let AddedComplexity = 20 in +def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src), + "movss {$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), + MOVL_shuffle_mask)))]>; + + +//===----------------------------------------------------------------------===// +// SSE2 Instructions +//===----------------------------------------------------------------------===// + +// SSE2 Instruction Templates: +// +// SDI - SSE2 instructions with XD prefix. +// PDI - SSE2 instructions with TB and OpSize prefixes. +// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. + +class SDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE2]>; +class PDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; +class PDIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : Ii8<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>; + +// Move Instructions +def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src), + "movsd {$src, $dst|$dst, $src}", []>; +def MOVSDrm : SDI<0x10, MRMSrcMem, (ops FR64:$dst, f64mem:$src), + "movsd {$src, $dst|$dst, $src}", + [(set FR64:$dst, (loadf64 addr:$src))]>; +def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src), + "movsd {$src, $dst|$dst, $src}", + [(store FR64:$src, addr:$dst)]>; + +// Conversion instructions +def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, FR64:$src), + "cvttsd2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (fp_to_sint FR64:$src))]>; +def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f64mem:$src), + "cvttsd2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>; +def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (ops FR32:$dst, FR64:$src), + "cvtsd2ss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround FR64:$src))]>; +def 
CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (ops FR32:$dst, f64mem:$src), + "cvtsd2ss {$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround (loadf64 addr:$src)))]>; +def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR32:$src), + "cvtsi2sd {$src, $dst|$dst, $src}", + [(set FR64:$dst, (sint_to_fp GR32:$src))]>; +def CVTSI2SDrm : SDI<0x2A, MRMSrcMem, (ops FR64:$dst, i32mem:$src), + "cvtsi2sd {$src, $dst|$dst, $src}", + [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>; + +// SSE2 instructions with XS prefix +def CVTSS2SDrr : I<0x5A, MRMSrcReg, (ops FR64:$dst, FR32:$src), + "cvtss2sd {$src, $dst|$dst, $src}", + [(set FR64:$dst, (fextend FR32:$src))]>, XS, + Requires<[HasSSE2]>; +def CVTSS2SDrm : I<0x5A, MRMSrcMem, (ops FR64:$dst, f32mem:$src), + "cvtss2sd {$src, $dst|$dst, $src}", + [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, + Requires<[HasSSE2]>; + +// Match intrinsics which expect XMM operand(s). +def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "cvtsd2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>; +def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (ops GR32:$dst, f128mem:$src), + "cvtsd2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_cvtsd2si + (load addr:$src)))]>; + +// Aliases for intrinsics +def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "cvttsd2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, + (int_x86_sse2_cvttsd2si VR128:$src))]>; +def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f128mem:$src), + "cvttsd2si {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_cvttsd2si + (load addr:$src)))]>; + +// Comparison instructions +let isTwoAddress = 1 in { + def CMPSDrr : SDI<0xC2, MRMSrcReg, + (ops FR64:$dst, FR64:$src1, FR64:$src, SSECC:$cc), + "cmp${cc}sd {$src, $dst|$dst, $src}", []>; + def CMPSDrm : SDI<0xC2, MRMSrcMem, + (ops FR64:$dst, FR64:$src1, f64mem:$src, SSECC:$cc), + "cmp${cc}sd {$src, $dst|$dst, $src}", []>; +} + +def UCOMISDrr: PDI<0x2E, 
MRMSrcReg, (ops FR64:$src1, FR64:$src2), + "ucomisd {$src2, $src1|$src1, $src2}", + [(X86cmp FR64:$src1, FR64:$src2)]>; +def UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops FR64:$src1, f64mem:$src2), + "ucomisd {$src2, $src1|$src1, $src2}", + [(X86cmp FR64:$src1, (loadf64 addr:$src2))]>; + +// Aliases to match intrinsics which expect XMM operand(s). +let isTwoAddress = 1 in { + def Int_CMPSDrr : SDI<0xC2, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc), + "cmp${cc}sd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1, + VR128:$src, imm:$cc))]>; + def Int_CMPSDrm : SDI<0xC2, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f64mem:$src, SSECC:$cc), + "cmp${cc}sd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1, + (load addr:$src), imm:$cc))]>; +} + +def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2), + "ucomisd {$src2, $src1|$src1, $src2}", + [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>; +def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), + "ucomisd {$src2, $src1|$src1, $src2}", + [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2))]>; + +def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2), + "comisd {$src2, $src1|$src1, $src2}", + [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>; +def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2), + "comisd {$src2, $src1|$src1, $src2}", + [(X86comi (v2f64 VR128:$src1), (load addr:$src2))]>; + +// Aliases of packed SSE2 instructions for scalar use. These all have names that +// start with 'Fs'. + +// Alias instructions that map fld0 to pxor for sse. +def FsFLD0SD : I<0xEF, MRMInitReg, (ops FR64:$dst), + "pxor $dst, $dst", [(set FR64:$dst, fpimm0)]>, + Requires<[HasSSE2]>, TB, OpSize; + +// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are +// disregarded. 
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (ops FR64:$dst, FR64:$src), + "movapd {$src, $dst|$dst, $src}", []>; + +// Alias instruction to load FR64 from f128mem using movapd. Upper bits are +// disregarded. +def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (ops FR64:$dst, f128mem:$src), + "movapd {$src, $dst|$dst, $src}", + [(set FR64:$dst, (X86loadpf64 addr:$src))]>; + +// Alias bitwise logical operations using SSE logical ops on packed FP values. +let isTwoAddress = 1 in { +let isCommutable = 1 in { + def FsANDPDrr : PDI<0x54, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2), + "andpd {$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>; + def FsORPDrr : PDI<0x56, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2), + "orpd {$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>; + def FsXORPDrr : PDI<0x57, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2), + "xorpd {$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>; +} + +def FsANDPDrm : PDI<0x54, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2), + "andpd {$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86fand FR64:$src1, + (X86loadpf64 addr:$src2)))]>; +def FsORPDrm : PDI<0x56, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2), + "orpd {$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86for FR64:$src1, + (X86loadpf64 addr:$src2)))]>; +def FsXORPDrm : PDI<0x57, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2), + "xorpd {$src2, $dst|$dst, $src2}", + [(set FR64:$dst, (X86fxor FR64:$src1, + (X86loadpf64 addr:$src2)))]>; + +def FsANDNPDrr : PDI<0x55, MRMSrcReg, + (ops FR64:$dst, FR64:$src1, FR64:$src2), + "andnpd {$src2, $dst|$dst, $src2}", []>; +def FsANDNPDrm : PDI<0x55, MRMSrcMem, + (ops FR64:$dst, FR64:$src1, f128mem:$src2), + "andnpd {$src2, $dst|$dst, $src2}", []>; +} + +/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms. 
+/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements undefined. +/// +/// These three forms can each be reg+reg or reg+mem, so there are a total of +/// six "instructions". +/// +let isTwoAddress = 1 in { +multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F64Int, + bit Commutable = 0> { + // Scalar operation, reg+reg. + def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> { + let isCommutable = Commutable; + } + + // Scalar operation, reg+mem. + def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>; + + // Vector operation, reg+reg. + def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. + def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>; + + // Intrinsic operation, reg+reg. + def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + + // Intrinsic operation, reg+mem. 
+ def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F64Int VR128:$src1, + sse_load_f64:$src2))]>; +} +} + +// Arithmetic instructions +defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>; +defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>; +defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>; +defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>; + +/// sse2_fp_binop_rm - Other SSE2 binops +/// +/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of +/// instructions for a full-vector intrinsic form. Operations that map +/// onto C operators don't use this form since they just use the plain +/// vector form instead of having a separate vector intrinsic form. +/// +/// This provides a total of eight "instructions". +/// +let isTwoAddress = 1 in { +multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F64Int, + Intrinsic V2F64Int, + bit Commutable = 0> { + + // Scalar operation, reg+reg. + def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> { + let isCommutable = Commutable; + } + + // Scalar operation, reg+mem. + def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>; + + // Vector operation, reg+reg. + def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. 
+ def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>; + + // Intrinsic operation, reg+reg. + def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + + // Intrinsic operation, reg+mem. + def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2), + !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (F64Int VR128:$src1, + sse_load_f64:$src2))]>; + + // Vector intrinsic operation, reg+reg. + def PDrr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + + // Vector intrinsic operation, reg+mem. 
+ def PDrm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2), + !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (V2F64Int VR128:$src1, (load addr:$src2)))]>; +} +} + +defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax, + int_x86_sse2_max_sd, int_x86_sse2_max_pd>; +defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin, + int_x86_sse2_min_sd, int_x86_sse2_min_pd>; + +//===----------------------------------------------------------------------===// +// SSE packed FP Instructions + +// Move Instructions +def MOVAPDrr : PDI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movapd {$src, $dst|$dst, $src}", []>; +def MOVAPDrm : PDI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movapd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (loadv2f64 addr:$src))]>; + +def MOVAPDmr : PDI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src), + "movapd {$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)]>; + +def MOVUPDrr : PDI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movupd {$src, $dst|$dst, $src}", []>; +def MOVUPDrm : PDI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movupd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>; +def MOVUPDmr : PDI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src), + "movupd {$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>; + +let isTwoAddress = 1 in { + let AddedComplexity = 20 in { + def MOVLPDrm : PDI<0x12, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f64mem:$src2), + "movlpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2f64 (vector_shuffle VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)), + MOVLP_shuffle_mask)))]>; + def MOVHPDrm : PDI<0x16, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f64mem:$src2), + "movhpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2f64 (vector_shuffle VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)), + MOVHP_shuffle_mask)))]>; + } // AddedComplexity +} // isTwoAddress + +def 
MOVLPDmr : PDI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src), + "movlpd {$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. +def MOVHPDmr : PDI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src), + "movhpd {$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (vector_shuffle VR128:$src, (undef), + UNPCKH_shuffle_mask)), (iPTR 0))), + addr:$dst)]>; + +// SSE2 instructions without OpSize prefix +def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvtdq2ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, + TB, Requires<[HasSSE2]>; +def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, i128mem:$src), + "cvtdq2ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps + (bitconvert (loadv2i64 addr:$src))))]>, + TB, Requires<[HasSSE2]>; + +// SSE2 instructions with XS prefix +def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvtdq2pd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, + XS, Requires<[HasSSE2]>; +def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "cvtdq2pd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd + (bitconvert (loadv2i64 addr:$src))))]>, + XS, Requires<[HasSSE2]>; + +def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvtps2dq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>; +def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "cvtps2dq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq + (load addr:$src)))]>; +// SSE2 packed instructions with XS prefix +def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvttps2dq {$src, $dst|$dst, $src}", + [(set 
VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>, + XS, Requires<[HasSSE2]>; +def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "cvttps2dq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (load addr:$src)))]>, + XS, Requires<[HasSSE2]>; + +// SSE2 packed instructions with XD prefix +def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvtpd2dq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + XD, Requires<[HasSSE2]>; +def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "cvtpd2dq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq + (load addr:$src)))]>, + XD, Requires<[HasSSE2]>; + +def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvttpd2dq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>; +def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "cvttpd2dq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (load addr:$src)))]>; + +// SSE2 instructions without OpSize prefix +def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvtps2pd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, + TB, Requires<[HasSSE2]>; +def Int_CVTPS2PDrm : I<0x5A, MRMSrcReg, (ops VR128:$dst, f64mem:$src), + "cvtps2pd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd + (load addr:$src)))]>, + TB, Requires<[HasSSE2]>; + +def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "cvtpd2ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; +def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, f128mem:$src), + "cvtpd2ps {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps + (load addr:$src)))]>; + +// Match intrinsics which expect XMM operand(s). 
+// Aliases for intrinsics +let isTwoAddress = 1 in { +def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, GR32:$src2), + "cvtsi2sd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1, + GR32:$src2))]>; +def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i32mem:$src2), + "cvtsi2sd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1, + (loadi32 addr:$src2)))]>; +def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "cvtsd2ss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, + VR128:$src2))]>; +def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f64mem:$src2), + "cvtsd2ss {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, + (load addr:$src2)))]>; +def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "cvtss2sd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + VR128:$src2))]>, XS, + Requires<[HasSSE2]>; +def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f32mem:$src2), + "cvtss2sd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + (load addr:$src2)))]>, XS, + Requires<[HasSSE2]>; +} + +// Arithmetic + +/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. +/// +/// These four forms can each have a reg or a mem operand, so there are a +/// total of eight "instructions". 
+/// +multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F64Int, + Intrinsic V2F64Int, + bit Commutable = 0> { + // Scalar operation, reg. + def SDr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src), + !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"), + [(set FR64:$dst, (OpNode FR64:$src))]> { + let isCommutable = Commutable; + } + + // Scalar operation, mem. + def SDm : SDI<opc, MRMSrcMem, (ops FR64:$dst, f64mem:$src), + !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"), + [(set FR64:$dst, (OpNode (load addr:$src)))]>; + + // Vector operation, reg. + def PDr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> { + let isCommutable = Commutable; + } + + // Vector operation, mem. + def PDm : PDI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>; + + // Intrinsic operation, reg. + def SDr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int VR128:$src))]> { + let isCommutable = Commutable; + } + + // Intrinsic operation, mem. + def SDm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, sdmem:$src), + !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int sse_load_f64:$src))]>; + + // Vector intrinsic operation, reg + def PDr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src), + !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V2F64Int VR128:$src))]> { + let isCommutable = Commutable; + } + + // Vector intrinsic operation, mem + def PDm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, f64mem:$src), + !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V2F64Int (load addr:$src)))]>; +} + +// Square root. 
+defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt, + int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>; + +// There is no f64 version of the reciprocal approximation instructions. + +// Logical +let isTwoAddress = 1 in { + let isCommutable = 1 in { + def ANDPDrr : PDI<0x54, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "andpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (and (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))]>; + def ORPDrr : PDI<0x56, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "orpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (or (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))]>; + def XORPDrr : PDI<0x57, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "xorpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (xor (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))]>; + } + + def ANDPDrm : PDI<0x54, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "andpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (and (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (loadv2f64 addr:$src2))))]>; + def ORPDrm : PDI<0x56, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "orpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (or (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (loadv2f64 addr:$src2))))]>; + def XORPDrm : PDI<0x57, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "xorpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (xor (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (loadv2f64 addr:$src2))))]>; + def ANDNPDrr : PDI<0x55, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "andnpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), + (bc_v2i64 (v2f64 VR128:$src2))))]>; + def ANDNPDrm : PDI<0x55, MRMSrcMem, + (ops VR128:$dst, VR128:$src1,f128mem:$src2), + "andnpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (and (vnot (bc_v2i64 (v2f64 VR128:$src1))), + (bc_v2i64 (loadv2f64 
addr:$src2))))]>; +} + +let isTwoAddress = 1 in { + def CMPPDrri : PDIi8<0xC2, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc), + "cmp${cc}pd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1, + VR128:$src, imm:$cc))]>; + def CMPPDrmi : PDIi8<0xC2, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc), + "cmp${cc}pd {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1, + (load addr:$src), imm:$cc))]>; +} + +// Shuffle and unpack instructions +let isTwoAddress = 1 in { + def SHUFPDrri : PDIi8<0xC6, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2, i8imm:$src3), + "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, (v2f64 (vector_shuffle + VR128:$src1, VR128:$src2, + SHUFP_shuffle_mask:$src3)))]>; + def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, + f128mem:$src2, i8imm:$src3), + "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (v2f64 (vector_shuffle + VR128:$src1, (load addr:$src2), + SHUFP_shuffle_mask:$src3)))]>; + + let AddedComplexity = 10 in { + def UNPCKHPDrr : PDI<0x15, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "unpckhpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2f64 (vector_shuffle + VR128:$src1, VR128:$src2, + UNPCKH_shuffle_mask)))]>; + def UNPCKHPDrm : PDI<0x15, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "unpckhpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2f64 (vector_shuffle + VR128:$src1, (load addr:$src2), + UNPCKH_shuffle_mask)))]>; + + def UNPCKLPDrr : PDI<0x14, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "unpcklpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2f64 (vector_shuffle + VR128:$src1, VR128:$src2, + UNPCKL_shuffle_mask)))]>; + def UNPCKLPDrm : PDI<0x14, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "unpcklpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2f64 (vector_shuffle + VR128:$src1, 
(load addr:$src2), + UNPCKL_shuffle_mask)))]>; + } // AddedComplexity +} // isTwoAddress + + +//===----------------------------------------------------------------------===// +// SSE integer instructions + +// Move Instructions +def MOVDQArr : PDI<0x6F, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movdqa {$src, $dst|$dst, $src}", []>; +def MOVDQArm : PDI<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src), + "movdqa {$src, $dst|$dst, $src}", + [(set VR128:$dst, (loadv2i64 addr:$src))]>; +def MOVDQAmr : PDI<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src), + "movdqa {$src, $dst|$dst, $src}", + [(store (v2i64 VR128:$src), addr:$dst)]>; +def MOVDQUrm : I<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src), + "movdqu {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, + XS, Requires<[HasSSE2]>; +def MOVDQUmr : I<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src), + "movdqu {$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, + XS, Requires<[HasSSE2]>; + + +let isTwoAddress = 1 in { + +multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + bit Commutable = 0> { + def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; +} + +multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId> { + def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; + def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, " {$src2, 
$dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; + def ri : PDIi8<opc2, ImmForm, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, + (scalar_to_vector (i32 imm:$src2))))]>; +} + + +/// PDI_binop_rm - Simple SSE2 binary operator. +multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit Commutable = 0> { + def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> { + let isCommutable = Commutable; + } + def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, + (bitconvert (loadv2i64 addr:$src2)))))]>; +} + +/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64. +/// +/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew +/// to collapse (bitconvert VT to VT) into its operand. 
+/// +multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit Commutable = 0> { + def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> { + let isCommutable = Commutable; + } + def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (OpNode VR128:$src1,(loadv2i64 addr:$src2)))]>; +} + +} // isTwoAddress + +// 128-bit Integer Arithmetic + +defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>; +defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>; +defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>; +defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>; + +defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>; +defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>; +defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>; +defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>; + +defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>; +defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>; +defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>; +defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>; + +defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>; +defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>; +defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>; +defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>; + +defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>; + +defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>; +defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>; +defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>; + +defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", 
int_x86_sse2_pmadd_wd, 1>; + +defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>; +defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>; + + +defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>; +defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>; +defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>; +defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>; +defm PSADBW : PDI_binop_rm_int<0xE0, "psadbw", int_x86_sse2_psad_bw, 1>; + + +defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_sse2_psll_w>; +defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_sse2_psll_d>; +defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_sse2_psll_q>; + +defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_sse2_psrl_w>; +defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_sse2_psrl_d>; +defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_sse2_psrl_q>; + +defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_sse2_psra_w>; +defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d>; +// PSRAQ doesn't exist in SSE[1-3]. + +// 128-bit logical shifts. +let isTwoAddress = 1 in { + def PSLLDQri : PDIi8<0x73, MRM7r, + (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), + "pslldq {$src2, $dst|$dst, $src2}", []>; + def PSRLDQri : PDIi8<0x73, MRM3r, + (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), + "psrldq {$src2, $dst|$dst, $src2}", []>; + // PSRADQri doesn't exist in SSE[1-3]. 
+} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), + (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>; + def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), + (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>; + def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), + (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>; +} + +// Logical +defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; +defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>; +defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; + +let isTwoAddress = 1 in { + def PANDNrr : PDI<0xDF, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "pandn {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), + VR128:$src2)))]>; + + def PANDNrm : PDI<0xDF, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "pandn {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1), + (load addr:$src2))))]>; +} + +// SSE2 Integer comparison +defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>; +defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>; +defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>; +defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>; +defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>; +defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>; + +// Pack instructions +defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>; +defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>; +defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; + +// Shuffle and unpack instructions +def PSHUFDri : PDIi8<0x70, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, i8imm:$src2), + "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v4i32 (vector_shuffle + VR128:$src1, (undef), + 
PSHUFD_shuffle_mask:$src2)))]>; +def PSHUFDmi : PDIi8<0x70, MRMSrcMem, + (ops VR128:$dst, i128mem:$src1, i8imm:$src2), + "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v4i32 (vector_shuffle + (bc_v4i32(loadv2i64 addr:$src1)), + (undef), + PSHUFD_shuffle_mask:$src2)))]>; + +// SSE2 with ImmT == Imm8 and XS prefix. +def PSHUFHWri : Ii8<0x70, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, i8imm:$src2), + "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v8i16 (vector_shuffle + VR128:$src1, (undef), + PSHUFHW_shuffle_mask:$src2)))]>, + XS, Requires<[HasSSE2]>; +def PSHUFHWmi : Ii8<0x70, MRMSrcMem, + (ops VR128:$dst, i128mem:$src1, i8imm:$src2), + "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v8i16 (vector_shuffle + (bc_v8i16 (loadv2i64 addr:$src1)), + (undef), + PSHUFHW_shuffle_mask:$src2)))]>, + XS, Requires<[HasSSE2]>; + +// SSE2 with ImmT == Imm8 and XD prefix. +def PSHUFLWri : Ii8<0x70, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, i32i8imm:$src2), + "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v8i16 (vector_shuffle + VR128:$src1, (undef), + PSHUFLW_shuffle_mask:$src2)))]>, + XD, Requires<[HasSSE2]>; +def PSHUFLWmi : Ii8<0x70, MRMSrcMem, + (ops VR128:$dst, i128mem:$src1, i32i8imm:$src2), + "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v8i16 (vector_shuffle + (bc_v8i16 (loadv2i64 addr:$src1)), + (undef), + PSHUFLW_shuffle_mask:$src2)))]>, + XD, Requires<[HasSSE2]>; + + +let isTwoAddress = 1 in { + def PUNPCKLBWrr : PDI<0x60, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "punpcklbw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v16i8 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKL_shuffle_mask)))]>; + def PUNPCKLBWrm : PDI<0x60, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpcklbw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v16i8 (vector_shuffle VR128:$src1, + (bc_v16i8 (loadv2i64 addr:$src2)), + 
UNPCKL_shuffle_mask)))]>; + def PUNPCKLWDrr : PDI<0x61, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "punpcklwd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v8i16 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKL_shuffle_mask)))]>; + def PUNPCKLWDrm : PDI<0x61, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpcklwd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v8i16 (vector_shuffle VR128:$src1, + (bc_v8i16 (loadv2i64 addr:$src2)), + UNPCKL_shuffle_mask)))]>; + def PUNPCKLDQrr : PDI<0x62, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "punpckldq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4i32 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKL_shuffle_mask)))]>; + def PUNPCKLDQrm : PDI<0x62, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpckldq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4i32 (vector_shuffle VR128:$src1, + (bc_v4i32 (loadv2i64 addr:$src2)), + UNPCKL_shuffle_mask)))]>; + def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "punpcklqdq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKL_shuffle_mask)))]>; + def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpcklqdq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (vector_shuffle VR128:$src1, + (loadv2i64 addr:$src2), + UNPCKL_shuffle_mask)))]>; + + def PUNPCKHBWrr : PDI<0x68, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "punpckhbw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v16i8 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKH_shuffle_mask)))]>; + def PUNPCKHBWrm : PDI<0x68, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpckhbw {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v16i8 (vector_shuffle VR128:$src1, + (bc_v16i8 (loadv2i64 addr:$src2)), + UNPCKH_shuffle_mask)))]>; + def PUNPCKHWDrr : PDI<0x69, MRMSrcReg, + (ops 
VR128:$dst, VR128:$src1, VR128:$src2), + "punpckhwd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v8i16 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKH_shuffle_mask)))]>; + def PUNPCKHWDrm : PDI<0x69, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpckhwd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v8i16 (vector_shuffle VR128:$src1, + (bc_v8i16 (loadv2i64 addr:$src2)), + UNPCKH_shuffle_mask)))]>; + def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "punpckhdq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4i32 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKH_shuffle_mask)))]>; + def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpckhdq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4i32 (vector_shuffle VR128:$src1, + (bc_v4i32 (loadv2i64 addr:$src2)), + UNPCKH_shuffle_mask)))]>; + def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "punpckhqdq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (vector_shuffle VR128:$src1, VR128:$src2, + UNPCKH_shuffle_mask)))]>; + def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i128mem:$src2), + "punpckhqdq {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (vector_shuffle VR128:$src1, + (loadv2i64 addr:$src2), + UNPCKH_shuffle_mask)))]>; +} + +// Extract / Insert +def PEXTRWri : PDIi8<0xC5, MRMSrcReg, + (ops GR32:$dst, VR128:$src1, i32i8imm:$src2), + "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), + (iPTR imm:$src2)))]>; +let isTwoAddress = 1 in { + def PINSRWrri : PDIi8<0xC4, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, + GR32:$src2, i32i8imm:$src3), + "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (v8i16 (X86pinsrw (v8i16 VR128:$src1), + GR32:$src2, (iPTR imm:$src3))))]>; + def PINSRWrmi : PDIi8<0xC4, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, + i16mem:$src2, 
i32i8imm:$src3), + "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (v8i16 (X86pinsrw (v8i16 VR128:$src1), + (i32 (anyext (loadi16 addr:$src2))), + (iPTR imm:$src3))))]>; +} + +// Mask creation +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (ops GR32:$dst, VR128:$src), + "pmovmskb {$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; + +// Conditional store +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (ops VR128:$src, VR128:$mask), + "maskmovdqu {$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, + Imp<[EDI],[]>; + +// Non-temporal stores +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (ops i128mem:$dst, VR128:$src), + "movntpd {$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>; +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (ops f128mem:$dst, VR128:$src), + "movntdq {$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>; +def MOVNTImr : I<0xC3, MRMDestMem, (ops i32mem:$dst, GR32:$src), + "movnti {$src, $dst|$dst, $src}", + [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, + TB, Requires<[HasSSE2]>; + +// Flush cache +def CLFLUSH : I<0xAE, MRM7m, (ops i8mem:$src), + "clflush $src", [(int_x86_sse2_clflush addr:$src)]>, + TB, Requires<[HasSSE2]>; + +// Load, store, and memory fence +def LFENCE : I<0xAE, MRM5m, (ops), + "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>; +def MFENCE : I<0xAE, MRM6m, (ops), + "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>; + + +// Alias instructions that map zero vector to pxor / xorp* for sse. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let isReMaterializable = 1 in + def V_SETALLONES : PDI<0x76, MRMInitReg, (ops VR128:$dst), + "pcmpeqd $dst, $dst", + [(set VR128:$dst, (v2f64 immAllOnesV))]>; + +// FR64 to 128-bit vector conversion. 
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, FR64:$src), + "movsd {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (scalar_to_vector FR64:$src)))]>; +def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src), + "movsd {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>; + +def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src), + "movd {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))]>; +def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src), + "movd {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>; + +def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (ops FR32:$dst, GR32:$src), + "movd {$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))]>; + +def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (ops FR32:$dst, i32mem:$src), + "movd {$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; + +// SSE2 instructions with XS prefix +def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "movq {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + Requires<[HasSSE2]>; +def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src), + "movq {$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +// FIXME: may not be able to eliminate this movss with coalescing the src and +// dest register classes are different. 
We really want to write this pattern +// like this: +// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), +// (f32 FR32:$src)>; +def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, VR128:$src), + "movsd {$src, $dst|$dst, $src}", + [(set FR64:$dst, (vector_extract (v2f64 VR128:$src), + (iPTR 0)))]>; +def MOVPD2SDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, VR128:$src), + "movsd {$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>; +def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (ops GR32:$dst, VR128:$src), + "movd {$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + (iPTR 0)))]>; +def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, VR128:$src), + "movd {$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (ops GR32:$dst, FR32:$src), + "movd {$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))]>; +def MOVSS2DImr : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, FR32:$src), + "movd {$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; + + +// Move to lower bits of a VR128, leaving upper bits alone. +// Three operand (but two address) aliases. +let isTwoAddress = 1 in { + def MOVLSD2PDrr : SDI<0x10, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, FR64:$src2), + "movsd {$src2, $dst|$dst, $src2}", []>; + + let AddedComplexity = 15 in + def MOVLPDrr : SDI<0x10, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "movsd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2f64 (vector_shuffle VR128:$src1, VR128:$src2, + MOVL_shuffle_mask)))]>; +} + +// Store / copy lower 64-bits of a XMM register. +def MOVLQ128mr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src), + "movq {$src, $dst|$dst, $src}", + [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; + +// Move to lower bits of a VR128 and zeroing upper bits. 
+// Loading from memory automatically zeroing upper bits. +let AddedComplexity = 20 in + def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src), + "movsd {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (vector_shuffle immAllZerosV, + (v2f64 (scalar_to_vector + (loadf64 addr:$src))), + MOVL_shuffle_mask)))]>; + +let AddedComplexity = 15 in +// movd / movq to XMM register zero-extends +def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src), + "movd {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (vector_shuffle immAllZerosV, + (v4i32 (scalar_to_vector GR32:$src)), + MOVL_shuffle_mask)))]>; +let AddedComplexity = 20 in +def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src), + "movd {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (vector_shuffle immAllZerosV, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), + MOVL_shuffle_mask)))]>; + +// Moving from XMM to XMM but still clear upper 64 bits. +let AddedComplexity = 15 in +def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>, + XS, Requires<[HasSSE2]>; +let AddedComplexity = 20 in +def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "movq {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_movl_dq + (bitconvert (loadv2i64 addr:$src))))]>, + XS, Requires<[HasSSE2]>; + + +//===----------------------------------------------------------------------===// +// SSE3 Instructions +//===----------------------------------------------------------------------===// + +// SSE3 Instruction Templates: +// +// S3I - SSE3 instructions with TB and OpSize prefixes. +// S3SI - SSE3 instructions with XS prefix. +// S3DI - SSE3 instructions with XD prefix. 
+ +class S3SI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE3]>; +class S3DI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE3]>; +class S3I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>; + +// Move Instructions +def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movshdup {$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4f32 (vector_shuffle + VR128:$src, (undef), + MOVSHDUP_shuffle_mask)))]>; +def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movshdup {$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4f32 (vector_shuffle + (loadv4f32 addr:$src), (undef), + MOVSHDUP_shuffle_mask)))]>; + +def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movsldup {$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4f32 (vector_shuffle + VR128:$src, (undef), + MOVSLDUP_shuffle_mask)))]>; +def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (ops VR128:$dst, f128mem:$src), + "movsldup {$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4f32 (vector_shuffle + (loadv4f32 addr:$src), (undef), + MOVSLDUP_shuffle_mask)))]>; + +def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src), + "movddup {$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (vector_shuffle + VR128:$src, (undef), + SSE_splat_lo_mask)))]>; +def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (ops VR128:$dst, f64mem:$src), + "movddup {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (vector_shuffle + (scalar_to_vector (loadf64 addr:$src)), + (undef), + SSE_splat_lo_mask)))]>; + +// Arithmetic +let isTwoAddress = 1 in { + def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "addsubps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1, + VR128:$src2))]>; + def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem, + (ops 
VR128:$dst, VR128:$src1, f128mem:$src2), + "addsubps {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1, + (load addr:$src2)))]>; + def ADDSUBPDrr : S3I<0xD0, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, VR128:$src2), + "addsubpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1, + VR128:$src2))]>; + def ADDSUBPDrm : S3I<0xD0, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, f128mem:$src2), + "addsubpd {$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1, + (load addr:$src2)))]>; +} + +def LDDQUrm : S3DI<0xF0, MRMSrcMem, (ops VR128:$dst, i128mem:$src), + "lddqu {$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>; + +// Horizontal ops +class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId> + : S3DI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>; +class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId> + : S3DI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (load addr:$src2))))]>; +class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId> + : S3I<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>; +class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId> + : S3I<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (load addr:$src2))))]>; + +let isTwoAddress = 1 in { + def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>; + def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>; + def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>; 
+ def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>; + def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>; + def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>; + def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; + def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; +} + +// Thread synchronization +def MONITOR : I<0xC8, RawFrm, (ops), "monitor", + [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>; +def MWAIT : I<0xC9, RawFrm, (ops), "mwait", + [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; + +// vector_shuffle v1, <undef> <1, 1, 3, 3> +let AddedComplexity = 15 in +def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), + MOVSHDUP_shuffle_mask)), + (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>; +let AddedComplexity = 20 in +def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef), + MOVSHDUP_shuffle_mask)), + (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>; + +// vector_shuffle v1, <undef> <0, 0, 2, 2> +let AddedComplexity = 15 in + def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), + MOVSLDUP_shuffle_mask)), + (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>; +let AddedComplexity = 20 in + def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef), + MOVSLDUP_shuffle_mask)), + (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>; + +//===----------------------------------------------------------------------===// +// SSSE3 Instructions +//===----------------------------------------------------------------------===// + +// SSE3 Instruction Templates: +// +// SS38I - SSSE3 instructions with T8 and OpSize prefixes. +// SS3AI - SSSE3 instructions with TA and OpSize prefixes. 
+ +class SS38I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, T8, OpSize, Requires<[HasSSSE3]>; +class SS3AI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, TA, OpSize, Requires<[HasSSSE3]>; + +/// SS3I_binop_rm_int - Simple SSSE3 binary operatr whose type is v2i64. +let isTwoAddress = 1 in { + multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + bit Commutable = 0> { + def rr : SS38I<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> { + let isCommutable = Commutable; + } + def rm : SS38I<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, + (IntId VR128:$src1, + (bitconvert (loadv2i64 addr:$src2))))]>; + } +} + +defm PMULHRSW128 : SS3I_binop_rm_int<0x0B, "pmulhrsw", + int_x86_ssse3_pmulhrsw_128, 1>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// 128-bit vector undef's. +def : Pat<(v2f64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; +def : Pat<(v16i8 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; +def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>; + +// 128-bit vector all zero's. 
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; +def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>; + +// 128-bit vector all one's. +def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>; +def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>; + +// Store 128-bit integer vector values. +def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + +// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or +// 16-bits matter. 
+def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>, + Requires<[HasSSE2]>; +def : Pat<(v16i8 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>, + Requires<[HasSSE2]>; + +// bit_convert +let Predicates = [HasSSE2] in { + def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 
VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +} + +// Move scalar to XMM zero-extended +// movd to XMM register zero-extends +let AddedComplexity = 15 in { +def : Pat<(v8i16 (vector_shuffle immAllZerosV, + (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)), + (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>; +def : Pat<(v16i8 (vector_shuffle immAllZerosV, + (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)), + (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>; +// Zeroing a VR128 then do a MOVS{S|D} to the lower bits. +def : Pat<(v2f64 (vector_shuffle immAllZerosV, + (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)), + (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>; +def : Pat<(v4f32 (vector_shuffle immAllZerosV, + (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)), + (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>; +} + +// Splat v2f64 / v2i64 +let AddedComplexity = 10 in { +def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), SSE_splat_lo_mask:$sm), + (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm), + (UNPCKHPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), SSE_splat_lo_mask:$sm), + (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm), + (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +} + +// Splat v4f32 +def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm), + 
(SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>, + Requires<[HasSSE1]>; + +// Special unary SHUFPSrri case. +// FIXME: when we want non two-address code, then we should use PSHUFD? +def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef), + SHUFP_unary_shuffle_mask:$sm), + (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>, + Requires<[HasSSE1]>; +// Unary v4f32 shuffle with PSHUF* in order to fold a load. +def : Pat<(vector_shuffle (loadv4f32 addr:$src1), (undef), + SHUFP_unary_shuffle_mask:$sm), + (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>, + Requires<[HasSSE2]>; +// Special binary v4i32 shuffle cases with SHUFPS. +def : Pat<(vector_shuffle (v4i32 VR128:$src1), (v4i32 VR128:$src2), + PSHUFD_binary_shuffle_mask:$sm), + (SHUFPSrri VR128:$src1, VR128:$src2, PSHUFD_binary_shuffle_mask:$sm)>, + Requires<[HasSSE2]>; +def : Pat<(vector_shuffle (v4i32 VR128:$src1), + (bc_v4i32 (loadv2i64 addr:$src2)), PSHUFD_binary_shuffle_mask:$sm), + (SHUFPSrmi VR128:$src1, addr:$src2, PSHUFD_binary_shuffle_mask:$sm)>, + Requires<[HasSSE2]>; + +// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...> +let AddedComplexity = 10 in { +def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), + UNPCKL_v_undef_shuffle_mask)), + (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef), + UNPCKL_v_undef_shuffle_mask)), + (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef), + UNPCKL_v_undef_shuffle_mask)), + (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), + UNPCKL_v_undef_shuffle_mask)), + (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; +} + +// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...> +let AddedComplexity = 10 in { +def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), + UNPCKH_v_undef_shuffle_mask)), + (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : 
Pat<(v16i8 (vector_shuffle VR128:$src, (undef), + UNPCKH_v_undef_shuffle_mask)), + (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef), + UNPCKH_v_undef_shuffle_mask)), + (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef), + UNPCKH_v_undef_shuffle_mask)), + (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>; +} + +let AddedComplexity = 15 in { +// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS +def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVHP_shuffle_mask)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; + +// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS +def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVHLPS_shuffle_mask)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; + +// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS +def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef), + MOVHLPS_v_undef_shuffle_mask)), + (MOVHLPSrr VR128:$src1, VR128:$src1)>; +def : Pat<(v4i32 (vector_shuffle VR128:$src1, (undef), + MOVHLPS_v_undef_shuffle_mask)), + (MOVHLPSrr VR128:$src1, VR128:$src1)>; +} + +let AddedComplexity = 20 in { +// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS +// vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS +def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2), + MOVLP_shuffle_mask)), + (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; +def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2), + MOVLP_shuffle_mask)), + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2), + MOVHP_shuffle_mask)), + (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; +def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2), + MOVHP_shuffle_mask)), + (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + +def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 
(loadv2i64 addr:$src2)), + MOVLP_shuffle_mask)), + (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2), + MOVLP_shuffle_mask)), + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), + MOVHP_shuffle_mask)), + (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>; +def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2), + MOVLP_shuffle_mask)), + (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +} + +let AddedComplexity = 15 in { +// Setting the lowest element in the vector. +def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVL_shuffle_mask)), + (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2, + MOVL_shuffle_mask)), + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + +// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd) +def : Pat<(v4f32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVLP_shuffle_mask)), + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2, + MOVLP_shuffle_mask)), + (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +} + +// Set lowest element and zero upper elements. +let AddedComplexity = 20 in +def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), + MOVL_shuffle_mask)), + (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>; + +// FIXME: Temporary workaround since 2-wide shuffle is broken. 
+def : Pat<(int_x86_sse2_movs_d VR128:$src1, VR128:$src2), + (v2f64 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2), + (v2f64 (MOVHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2), + (v2f64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3), + (v2f64 (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3))>, + Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3), + (v2f64 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3))>, + Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2), + (v2f64 (UNPCKHPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)), + (v2f64 (UNPCKHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2), + (v2f64 (UNPCKLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)), + (v2f64 (UNPCKLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2), + (v2i64 (PUNPCKHQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)), + (v2i64 (PUNPCKHQDQrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2), + (v2i64 (PUNPCKLQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>; +def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, (load addr:$src2)), + (PUNPCKLQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + +// Some special case pandn patterns. 
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))), + VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))), + VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), + VR128:$src2)), + (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>; + +def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))), + (load addr:$src2))), + (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))), + (load addr:$src2))), + (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; +def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))), + (load addr:$src2))), + (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>; + +// Unaligned load +def : Pat<(v4f32 (X86loadu addr:$src)), (MOVUPSrm addr:$src)>, + Requires<[HasSSE1]>; diff --git a/lib/Target/X86/X86InstrX86-64.td b/lib/Target/X86/X86InstrX86-64.td new file mode 100644 index 0000000..ac43846 --- /dev/null +++ b/lib/Target/X86/X86InstrX86-64.td @@ -0,0 +1,1165 @@ +//====- X86InstrX86-64.td - Describe the X86 Instruction Set ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the Evan Cheng and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86-64 instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Operand Definitions... 
+// + +// 64-bits but only 32 bits are significant. +def i64i32imm : Operand<i64>; +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64>; + +def lea64mem : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops GR64, i8imm, GR64, i32imm); +} + +def lea64_32mem : Operand<i32> { + let PrintMethod = "printlea64_32mem"; + let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm); +} + +//===----------------------------------------------------------------------===// +// Complex Pattern Definitions... +// +def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr", + [add, mul, shl, or, frameindex, X86Wrapper], + []>; + +//===----------------------------------------------------------------------===// +// Instruction templates... +// + +class RI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : I<o, F, ops, asm, pattern>, REX_W; +class RIi8 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : Ii8<o, F, ops, asm, pattern>, REX_W; +class RIi32 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : Ii32<o, F, ops, asm, pattern>, REX_W; + +class RIi64<bits<8> o, Format f, dag ops, string asm, list<dag> pattern> + : X86Inst<o, f, Imm64, ops, asm>, REX_W { + let Pattern = pattern; + let CodeSize = 3; +} + +class RSSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : SSI<o, F, ops, asm, pattern>, REX_W; +class RSDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : SDI<o, F, ops, asm, pattern>, REX_W; +class RPDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern> + : PDI<o, F, ops, asm, pattern>, REX_W; + +//===----------------------------------------------------------------------===// +// Pattern fragments... +// + +def i64immSExt32 : PatLeaf<(i64 imm), [{ + // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit + // sign extended field. 
+ return (int64_t)N->getValue() == (int32_t)N->getValue(); +}]>; + +def i64immZExt32 : PatLeaf<(i64 imm), [{ + // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit + // unsignedsign extended field. + return (uint64_t)N->getValue() == (uint32_t)N->getValue(); +}]>; + +def i64immSExt8 : PatLeaf<(i64 imm), [{ + // i64immSExt8 predicate - True if the 64-bit immediate fits in a 8-bit + // sign extended field. + return (int64_t)N->getValue() == (int8_t)N->getValue(); +}]>; + +def sextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (sextloadi1 node:$ptr))>; +def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; +def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; +def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; + +def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; +def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; +def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; +def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; + +def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; +def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; +def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + +//===----------------------------------------------------------------------===// +// Instruction list... +// + +def IMPLICIT_DEF_GR64 : I<0, Pseudo, (ops GR64:$dst), + "#IMPLICIT_DEF $dst", + [(set GR64:$dst, (undef))]>; + +//===----------------------------------------------------------------------===// +// Call Instructions... +// +let isCall = 1, noResults = 1 in + // All calls clobber the non-callee saved registers... 
+ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15] in { + def CALL64pcrel32 : I<0xE8, RawFrm, (ops i64imm:$dst, variable_ops), + "call ${dst:call}", []>; + def CALL64r : I<0xFF, MRM2r, (ops GR64:$dst, variable_ops), + "call {*}$dst", [(X86call GR64:$dst)]>; + def CALL64m : I<0xFF, MRM2m, (ops i64mem:$dst, variable_ops), + "call {*}$dst", []>; + } + +// Branches +let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in { + def JMP64r : I<0xFF, MRM4r, (ops GR64:$dst), "jmp{q} {*}$dst", + [(brind GR64:$dst)]>; + def JMP64m : I<0xFF, MRM4m, (ops i64mem:$dst), "jmp{q} {*}$dst", + [(brind (loadi64 addr:$dst))]>; +} + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions... +// +def LEAVE64 : I<0xC9, RawFrm, + (ops), "leave", []>, Imp<[RBP,RSP],[RBP,RSP]>; +def POP64r : I<0x58, AddRegFrm, + (ops GR64:$reg), "pop{q} $reg", []>, Imp<[RSP],[RSP]>; +def PUSH64r : I<0x50, AddRegFrm, + (ops GR64:$reg), "push{q} $reg", []>, Imp<[RSP],[RSP]>; + +def LEA64_32r : I<0x8D, MRMSrcMem, + (ops GR32:$dst, lea64_32mem:$src), + "lea{l} {$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>; + +def LEA64r : RI<0x8D, MRMSrcMem, (ops GR64:$dst, lea64mem:$src), + "lea{q} {$src|$dst}, {$dst|$src}", + [(set GR64:$dst, lea64addr:$src)]>; + +let isTwoAddress = 1 in +def BSWAP64r : RI<0xC8, AddRegFrm, (ops GR64:$dst, GR64:$src), + "bswap{q} $dst", + [(set GR64:$dst, (bswap GR64:$src))]>, TB; +// Exchange +def XCHG64rr : RI<0x87, MRMDestReg, (ops GR64:$src1, GR64:$src2), + "xchg{q} {$src2|$src1}, {$src1|$src2}", []>; +def XCHG64mr : RI<0x87, MRMDestMem, (ops i64mem:$src1, GR64:$src2), + "xchg{q} {$src2|$src1}, {$src1|$src2}", []>; +def XCHG64rm : RI<0x87, MRMSrcMem, (ops GR64:$src1, i64mem:$src2), + 
"xchg{q} {$src2|$src1}, {$src1|$src2}", []>; + +// Repeat string ops +def REP_MOVSQ : RI<0xA5, RawFrm, (ops), "{rep;movsq|rep movsq}", + [(X86rep_movs i64)]>, + Imp<[RCX,RDI,RSI], [RCX,RDI,RSI]>, REP; +def REP_STOSQ : RI<0xAB, RawFrm, (ops), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)]>, + Imp<[RAX,RCX,RDI], [RCX,RDI]>, REP; + +//===----------------------------------------------------------------------===// +// Move Instructions... +// + +def MOV64rr : RI<0x89, MRMDestReg, (ops GR64:$dst, GR64:$src), + "mov{q} {$src, $dst|$dst, $src}", []>; + +def MOV64ri : RIi64<0xB8, AddRegFrm, (ops GR64:$dst, i64imm:$src), + "movabs{q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, imm:$src)]>; +def MOV64ri32 : RIi32<0xC7, MRM0r, (ops GR64:$dst, i64i32imm:$src), + "mov{q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, i64immSExt32:$src)]>; + +def MOV64rm : RI<0x8B, MRMSrcMem, (ops GR64:$dst, i64mem:$src), + "mov{q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (load addr:$src))]>; + +def MOV64mr : RI<0x89, MRMDestMem, (ops i64mem:$dst, GR64:$src), + "mov{q} {$src, $dst|$dst, $src}", + [(store GR64:$src, addr:$dst)]>; +def MOV64mi32 : RIi32<0xC7, MRM0m, (ops i64mem:$dst, i64i32imm:$src), + "mov{q} {$src, $dst|$dst, $src}", + [(store i64immSExt32:$src, addr:$dst)]>; + +// Sign/Zero extenders + +def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (ops GR64:$dst, GR8 :$src), + "movs{bq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR8:$src))]>, TB; +def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (ops GR64:$dst, i8mem :$src), + "movs{bq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB; +def MOVSX64rr16: RI<0xBF, MRMSrcReg, (ops GR64:$dst, GR16:$src), + "movs{wq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR16:$src))]>, TB; +def MOVSX64rm16: RI<0xBF, MRMSrcMem, (ops GR64:$dst, i16mem:$src), + "movs{wq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB; +def MOVSX64rr32: RI<0x63, MRMSrcReg, (ops GR64:$dst, GR32:$src), + "movs{lq|xd} 
{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR32:$src))]>; +def MOVSX64rm32: RI<0x63, MRMSrcMem, (ops GR64:$dst, i32mem:$src), + "movs{lq|xd} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i32 addr:$src))]>; + +def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (ops GR64:$dst, GR8 :$src), + "movz{bq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (zext GR8:$src))]>, TB; +def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (ops GR64:$dst, i8mem :$src), + "movz{bq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB; +def MOVZX64rr16: RI<0xB7, MRMSrcReg, (ops GR64:$dst, GR16:$src), + "movz{wq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (zext GR16:$src))]>, TB; +def MOVZX64rm16: RI<0xB7, MRMSrcMem, (ops GR64:$dst, i16mem:$src), + "movz{wq|x} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB; + +def CDQE : RI<0x98, RawFrm, (ops), + "{cltq|cdqe}", []>, Imp<[EAX],[RAX]>; // RAX = signext(EAX) + +def CQO : RI<0x99, RawFrm, (ops), + "{cqto|cqo}", []>, Imp<[RAX],[RAX,RDX]>; // RDX:RAX = signext(RAX) + +//===----------------------------------------------------------------------===// +// Arithmetic Instructions... 
+// + +let isTwoAddress = 1 in { +let isConvertibleToThreeAddress = 1 in { +let isCommutable = 1 in +def ADD64rr : RI<0x01, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "add{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (add GR64:$src1, GR64:$src2))]>; + +def ADD64ri32 : RIi32<0x81, MRM0r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), + "add{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2))]>; +def ADD64ri8 : RIi8<0x83, MRM0r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2), + "add{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2))]>; +} // isConvertibleToThreeAddress + +def ADD64rm : RI<0x03, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "add{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (add GR64:$src1, (load addr:$src2)))]>; +} // isTwoAddress + +def ADD64mr : RI<0x01, MRMDestMem, (ops i64mem:$dst, GR64:$src2), + "add{q} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), GR64:$src2), addr:$dst)]>; +def ADD64mi32 : RIi32<0x81, MRM0m, (ops i64mem:$dst, i64i32imm :$src2), + "add{q} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>; +def ADD64mi8 : RIi8<0x83, MRM0m, (ops i64mem:$dst, i64i8imm :$src2), + "add{q} {$src2, $dst|$dst, $src2}", + [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>; + +let isTwoAddress = 1 in { +let isCommutable = 1 in +def ADC64rr : RI<0x11, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "adc{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>; + +def ADC64rm : RI<0x13, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "adc{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>; + +def ADC64ri32 : RIi32<0x81, MRM2r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), + "adc{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>; +def ADC64ri8 : RIi8<0x83, MRM2r, (ops 
GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "adc{q} {$src2, $dst|$dst, $src2}",
+                 [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def ADC64mr  : RI<0x11, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                  "adc{q} {$src2, $dst|$dst, $src2}",
+                  [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+// NOTE(review): this is the RIi32 / 0x81 imm32 form, so the selection pattern
+// must use the 32-bit sign-extended predicate (was i64immSExt8, which would
+// mis-select 8-bit immediates into the 4-byte encoding and never match the
+// intended imm32 case; compare ADD64mi32/SUB64mi32).
+def ADC64mi32 : RIi32<0x81, MRM2m, (ops i64mem:$dst, i64i32imm:$src2),
+                      "adc{q} {$src2, $dst|$dst, $src2}",
+              [(store (adde (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (ops i64mem:$dst, i64i8imm :$src2),
+                    "adc{q} {$src2, $dst|$dst, $src2}",
+               [(store (adde (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SUB64rr  : RI<0x29, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, GR64:$src2))]>;
+
+def SUB64rm  : RI<0x2B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2)))]>;
+
+def SUB64ri32 : RIi32<0x81, MRM5r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "sub{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2))]>;
+def SUB64ri8  : RIi8<0x83, MRM5r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                     "sub{q} {$src2, $dst|$dst, $src2}",
+                     [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def SUB64mr  : RI<0x29, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(store (sub (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (ops i64mem:$dst, i64i32imm:$src2),
+                      "sub{q} {$src2, $dst|$dst, $src2}",
+               [(store (sub (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def SUB64mi8 : RIi8<0x83, MRM5m, (ops i64mem:$dst, i64i8imm :$src2),
+                    "sub{q} {$src2, $dst|$dst, $src2}",
+                [(store (sub (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SBB64rr :
RI<0x19, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "sbb{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>; + +def SBB64rm : RI<0x1B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "sbb{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>; + +def SBB64ri32 : RIi32<0x81, MRM3r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), + "sbb{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>; +def SBB64ri8 : RIi8<0x83, MRM3r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2), + "sbb{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>; +} // isTwoAddress + +def SBB64mr : RI<0x19, MRMDestMem, (ops i64mem:$dst, GR64:$src2), + "sbb{q} {$src2, $dst|$dst, $src2}", + [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>; +def SBB64mi32 : RIi32<0x81, MRM3m, (ops i64mem:$dst, i64i32imm:$src2), + "sbb{q} {$src2, $dst|$dst, $src2}", + [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>; +def SBB64mi8 : RIi8<0x83, MRM3m, (ops i64mem:$dst, i64i8imm :$src2), + "sbb{q} {$src2, $dst|$dst, $src2}", + [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>; + +// Unsigned multiplication +def MUL64r : RI<0xF7, MRM4r, (ops GR64:$src), + "mul{q} $src", []>, + Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*GR64 +def MUL64m : RI<0xF7, MRM4m, (ops i64mem:$src), + "mul{q} $src", []>, + Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*[mem64] + +// Signed multiplication +def IMUL64r : RI<0xF7, MRM5r, (ops GR64:$src), + "imul{q} $src", []>, + Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*GR64 +def IMUL64m : RI<0xF7, MRM5m, (ops i64mem:$src), + "imul{q} $src", []>, + Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*[mem64] + +let isTwoAddress = 1 in { +let isCommutable = 1 in +def IMUL64rr : RI<0xAF, MRMSrcReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "imul{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (mul GR64:$src1, GR64:$src2))]>, TB; + +def 
IMUL64rm : RI<0xAF, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "imul{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2)))]>, TB; +} // isTwoAddress + +// Surprisingly enough, these are not two address instructions! +def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32 + (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), + "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2))]>; +def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 + (ops GR64:$dst, GR64:$src1, i64i8imm:$src2), + "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2))]>; +def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32 + (ops GR64:$dst, i64mem:$src1, i64i32imm:$src2), + "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, (mul (load addr:$src1), i64immSExt32:$src2))]>; +def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 + (ops GR64:$dst, i64mem:$src1, i64i8imm: $src2), + "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, (mul (load addr:$src1), i64immSExt8:$src2))]>; + +// Unsigned division / remainder +def DIV64r : RI<0xF7, MRM6r, (ops GR64:$src), // RDX:RAX/r64 = RAX,RDX + "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>; +def DIV64m : RI<0xF7, MRM6m, (ops i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX + "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>; + +// Signed division / remainder +def IDIV64r: RI<0xF7, MRM7r, (ops GR64:$src), // RDX:RAX/r64 = RAX,RDX + "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>; +def IDIV64m: RI<0xF7, MRM7m, (ops i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX + "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>; + +// Unary instructions +let CodeSize = 2 in { +let isTwoAddress = 1 in +def NEG64r : RI<0xF7, MRM3r, (ops GR64:$dst, GR64:$src), "neg{q} $dst", + [(set GR64:$dst, (ineg GR64:$src))]>; +def NEG64m : RI<0xF7, MRM3m, (ops i64mem:$dst), "neg{q} $dst", + [(store (ineg
(loadi64 addr:$dst)), addr:$dst)]>; + +let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in +def INC64r : RI<0xFF, MRM0r, (ops GR64:$dst, GR64:$src), "inc{q} $dst", + [(set GR64:$dst, (add GR64:$src, 1))]>; +def INC64m : RI<0xFF, MRM0m, (ops i64mem:$dst), "inc{q} $dst", + [(store (add (loadi64 addr:$dst), 1), addr:$dst)]>; + +let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in +def DEC64r : RI<0xFF, MRM1r, (ops GR64:$dst, GR64:$src), "dec{q} $dst", + [(set GR64:$dst, (add GR64:$src, -1))]>; +def DEC64m : RI<0xFF, MRM1m, (ops i64mem:$dst), "dec{q} $dst", + [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>; + +// In 64-bit mode, single byte INC and DEC cannot be encoded. +let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in { +// Can transform into LEA. +def INC64_16r : I<0xFF, MRM0r, (ops GR16:$dst, GR16:$src), "inc{w} $dst", + [(set GR16:$dst, (add GR16:$src, 1))]>, + OpSize, Requires<[In64BitMode]>; +def INC64_32r : I<0xFF, MRM0r, (ops GR32:$dst, GR32:$src), "inc{l} $dst", + [(set GR32:$dst, (add GR32:$src, 1))]>, + Requires<[In64BitMode]>; +def DEC64_16r : I<0xFF, MRM1r, (ops GR16:$dst, GR16:$src), "dec{w} $dst", + [(set GR16:$dst, (add GR16:$src, -1))]>, + OpSize, Requires<[In64BitMode]>; +def DEC64_32r : I<0xFF, MRM1r, (ops GR32:$dst, GR32:$src), "dec{l} $dst", + [(set GR32:$dst, (add GR32:$src, -1))]>, + Requires<[In64BitMode]>; +} // isConvertibleToThreeAddress +} // CodeSize + + +// Shift instructions +let isTwoAddress = 1 in { +def SHL64rCL : RI<0xD3, MRM4r, (ops GR64:$dst, GR64:$src), + "shl{q} {%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (shl GR64:$src, CL))]>, + Imp<[CL],[]>; +def SHL64ri : RIi8<0xC1, MRM4r, (ops GR64:$dst, GR64:$src1, i8imm:$src2), + "shl{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>; +def SHL64r1 : RI<0xD1, MRM4r, (ops GR64:$dst, GR64:$src1), + "shl{q} $dst", []>; +} // isTwoAddress + +def SHL64mCL : RI<0xD3, MRM4m, (ops i64mem:$dst), + "shl{q} {%cl, $dst|$dst, %CL}", + [(store (shl 
(loadi64 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; +def SHL64mi : RIi8<0xC1, MRM4m, (ops i64mem:$dst, i8imm:$src), + "shl{q} {$src, $dst|$dst, $src}", + [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHL64m1 : RI<0xD1, MRM4m, (ops i64mem:$dst), + "shl{q} $dst", + [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + +let isTwoAddress = 1 in { +def SHR64rCL : RI<0xD3, MRM5r, (ops GR64:$dst, GR64:$src), + "shr{q} {%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (srl GR64:$src, CL))]>, + Imp<[CL],[]>; +def SHR64ri : RIi8<0xC1, MRM5r, (ops GR64:$dst, GR64:$src1, i8imm:$src2), + "shr{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>; +def SHR64r1 : RI<0xD1, MRM5r, (ops GR64:$dst, GR64:$src1), + "shr{q} $dst", + [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>; +} // isTwoAddress + +def SHR64mCL : RI<0xD3, MRM5m, (ops i64mem:$dst), + "shr{q} {%cl, $dst|$dst, %CL}", + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; +def SHR64mi : RIi8<0xC1, MRM5m, (ops i64mem:$dst, i8imm:$src), + "shr{q} {$src, $dst|$dst, $src}", + [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHR64m1 : RI<0xD1, MRM5m, (ops i64mem:$dst), + "shr{q} $dst", + [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + +let isTwoAddress = 1 in { +def SAR64rCL : RI<0xD3, MRM7r, (ops GR64:$dst, GR64:$src), + "sar{q} {%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (sra GR64:$src, CL))]>, Imp<[CL],[]>; +def SAR64ri : RIi8<0xC1, MRM7r, (ops GR64:$dst, GR64:$src1, i8imm:$src2), + "sar{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>; +def SAR64r1 : RI<0xD1, MRM7r, (ops GR64:$dst, GR64:$src1), + "sar{q} $dst", + [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>; +} // isTwoAddress + +def SAR64mCL : RI<0xD3, MRM7m, (ops i64mem:$dst), + "sar{q} {%cl, $dst|$dst, %CL}", + [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>, + Imp<[CL],[]>; +def SAR64mi : RIi8<0xC1, MRM7m, (ops i64mem:$dst, 
i8imm:$src),
+                   "sar{q} {$src, $dst|$dst, $src}",
+                [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR64m1 : RI<0xD1, MRM7m, (ops i64mem:$dst),
+                 "sar{q} $dst",
+                [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Rotate instructions
+let isTwoAddress = 1 in {
+def ROL64rCL : RI<0xD3, MRM0r, (ops GR64:$dst, GR64:$src),
+                  "rol{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (rotl GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROL64ri  : RIi8<0xC1, MRM0r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "rol{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+def ROL64r1  : RI<0xD1, MRM0r, (ops GR64:$dst, GR64:$src1),
+                  "rol{q} $dst",
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+// NOTE(review): was declared with base class `I` — without the REX.W prefix
+// the encoding is a 32-bit rotate, contradicting the i64 pattern. Use `RI`
+// like ROL64mi/ROL64m1 and the other 64-bit shift/rotate memory forms.
+def ROL64mCL : RI<0xD3, MRM0m, (ops i64mem:$dst),
+                  "rol{q} {%cl, $dst|$dst, %CL}",
+                  [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>,
+                  Imp<[CL],[]>;
+def ROL64mi  : RIi8<0xC1, MRM0m, (ops i64mem:$dst, i8imm:$src),
+                    "rol{q} {$src, $dst|$dst, $src}",
+               [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROL64m1  : RI<0xD1, MRM0m, (ops i64mem:$dst),
+                  "rol{q} $dst",
+                 [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def ROR64rCL : RI<0xD3, MRM1r, (ops GR64:$dst, GR64:$src),
+                  "ror{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (rotr GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROR64ri  : RIi8<0xC1, MRM1r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "ror{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+def ROR64r1  : RI<0xD1, MRM1r, (ops GR64:$dst, GR64:$src1),
+                  "ror{q} $dst",
+                  [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def ROR64mCL : RI<0xD3, MRM1m, (ops i64mem:$dst),
+                  "ror{q} {%cl, $dst|$dst, %CL}",
+                  [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>,
+                  Imp<[CL],[]>;
+def ROR64mi  : RIi8<0xC1, MRM1m, (ops i64mem:$dst, i8imm:$src),
+                    "ror{q} {$src, $dst|$dst, $src}",
+                    [(store (rotr (loadi64
addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def ROR64m1 : RI<0xD1, MRM1m, (ops i64mem:$dst), + "ror{q} $dst", + [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + +// Double shift instructions (generalizations of rotate) +let isTwoAddress = 1 in { +def SHLD64rrCL : RI<0xA5, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>, + Imp<[CL],[]>, TB; +def SHRD64rrCL : RI<0xAD, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>, + Imp<[CL],[]>, TB; + +let isCommutable = 1 in { // FIXME: Update X86InstrInfo::commuteInstruction +def SHLD64rri8 : RIi8<0xA4, MRMDestReg, + (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3), + "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>, + TB; +def SHRD64rri8 : RIi8<0xAC, MRMDestReg, + (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3), + "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>, + TB; +} // isCommutable +} // isTwoAddress + +// Temporary hack: there is no patterns associated with these instructions +// so we have to tell tblgen that these do not produce results. +let noResults = 1 in { +def SHLD64mrCL : RI<0xA5, MRMDestMem, (ops i64mem:$dst, GR64:$src2), + "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>, + Imp<[CL],[]>, TB; +def SHRD64mrCL : RI<0xAD, MRMDestMem, (ops i64mem:$dst, GR64:$src2), + "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>, + Imp<[CL],[]>, TB; +def SHLD64mri8 : RIi8<0xA4, MRMDestMem, + (ops i64mem:$dst, GR64:$src2, i8imm:$src3), + "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>, + TB; +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, + (ops i64mem:$dst, GR64:$src2, i8imm:$src3), + "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>, + TB; +} // noResults + +//===----------------------------------------------------------------------===// +// Logical Instructions... 
+// + +let isTwoAddress = 1 in +def NOT64r : RI<0xF7, MRM2r, (ops GR64:$dst, GR64:$src), "not{q} $dst", + [(set GR64:$dst, (not GR64:$src))]>; +def NOT64m : RI<0xF7, MRM2m, (ops i64mem:$dst), "not{q} $dst", + [(store (not (loadi64 addr:$dst)), addr:$dst)]>; + +let isTwoAddress = 1 in { +let isCommutable = 1 in +def AND64rr : RI<0x21, MRMDestReg, + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "and{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (and GR64:$src1, GR64:$src2))]>; +def AND64rm : RI<0x23, MRMSrcMem, + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "and{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (and GR64:$src1, (load addr:$src2)))]>; +def AND64ri32 : RIi32<0x81, MRM4r, + (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), + "and{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2))]>; +def AND64ri8 : RIi8<0x83, MRM4r, + (ops GR64:$dst, GR64:$src1, i64i8imm:$src2), + "and{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2))]>; +} // isTwoAddress + +def AND64mr : RI<0x21, MRMDestMem, + (ops i64mem:$dst, GR64:$src), + "and{q} {$src, $dst|$dst, $src}", + [(store (and (load addr:$dst), GR64:$src), addr:$dst)]>; +def AND64mi32 : RIi32<0x81, MRM4m, + (ops i64mem:$dst, i64i32imm:$src), + "and{q} {$src, $dst|$dst, $src}", + [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>; +def AND64mi8 : RIi8<0x83, MRM4m, + (ops i64mem:$dst, i64i8imm :$src), + "and{q} {$src, $dst|$dst, $src}", + [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst)]>; + +let isTwoAddress = 1 in { +let isCommutable = 1 in +def OR64rr : RI<0x09, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "or{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (or GR64:$src1, GR64:$src2))]>; +def OR64rm : RI<0x0B, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "or{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (or GR64:$src1, (load addr:$src2)))]>; +def OR64ri32 : RIi32<0x81, MRM1r, (ops GR64:$dst, 
GR64:$src1, i64i32imm:$src2), + "or{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2))]>; +def OR64ri8 : RIi8<0x83, MRM1r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2), + "or{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2))]>; +} // isTwoAddress + +def OR64mr : RI<0x09, MRMDestMem, (ops i64mem:$dst, GR64:$src), + "or{q} {$src, $dst|$dst, $src}", + [(store (or (load addr:$dst), GR64:$src), addr:$dst)]>; +def OR64mi32 : RIi32<0x81, MRM1m, (ops i64mem:$dst, i64i32imm:$src), + "or{q} {$src, $dst|$dst, $src}", + [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>; +def OR64mi8 : RIi8<0x83, MRM1m, (ops i64mem:$dst, i64i8imm:$src), + "or{q} {$src, $dst|$dst, $src}", + [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst)]>; + +let isTwoAddress = 1 in { +let isCommutable = 1 in +def XOR64rr : RI<0x31, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2), + "xor{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (xor GR64:$src1, GR64:$src2))]>; +def XOR64rm : RI<0x33, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "xor{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2)))]>; +def XOR64ri32 : RIi32<0x81, MRM6r, + (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), + "xor{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2))]>; +def XOR64ri8 : RIi8<0x83, MRM6r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2), + "xor{q} {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2))]>; +} // isTwoAddress + +def XOR64mr : RI<0x31, MRMDestMem, (ops i64mem:$dst, GR64:$src), + "xor{q} {$src, $dst|$dst, $src}", + [(store (xor (load addr:$dst), GR64:$src), addr:$dst)]>; +def XOR64mi32 : RIi32<0x81, MRM6m, (ops i64mem:$dst, i64i32imm:$src), + "xor{q} {$src, $dst|$dst, $src}", + [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>; +def XOR64mi8 : RIi8<0x83, MRM6m, (ops i64mem:$dst, i64i8imm 
:$src), + "xor{q} {$src, $dst|$dst, $src}", + [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst)]>; + +//===----------------------------------------------------------------------===// +// Comparison Instructions... +// + +// Integer comparison +let isCommutable = 1 in +def TEST64rr : RI<0x85, MRMDestReg, (ops GR64:$src1, GR64:$src2), + "test{q} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR64:$src1, GR64:$src2), 0)]>; +def TEST64rm : RI<0x85, MRMSrcMem, (ops GR64:$src1, i64mem:$src2), + "test{q} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR64:$src1, (loadi64 addr:$src2)), 0)]>; +def TEST64ri32 : RIi32<0xF7, MRM0r, (ops GR64:$src1, i64i32imm:$src2), + "test{q} {$src2, $src1|$src1, $src2}", + [(X86cmp (and GR64:$src1, i64immSExt32:$src2), 0)]>; +def TEST64mi32 : RIi32<0xF7, MRM0m, (ops i64mem:$src1, i64i32imm:$src2), + "test{q} {$src2, $src1|$src1, $src2}", + [(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0)]>; + +def CMP64rr : RI<0x39, MRMDestReg, (ops GR64:$src1, GR64:$src2), + "cmp{q} {$src2, $src1|$src1, $src2}", + [(X86cmp GR64:$src1, GR64:$src2)]>; +def CMP64mr : RI<0x39, MRMDestMem, (ops i64mem:$src1, GR64:$src2), + "cmp{q} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi64 addr:$src1), GR64:$src2)]>; +def CMP64rm : RI<0x3B, MRMSrcMem, (ops GR64:$src1, i64mem:$src2), + "cmp{q} {$src2, $src1|$src1, $src2}", + [(X86cmp GR64:$src1, (loadi64 addr:$src2))]>; +def CMP64ri32 : RIi32<0x81, MRM7r, (ops GR64:$src1, i64i32imm:$src2), + "cmp{q} {$src2, $src1|$src1, $src2}", + [(X86cmp GR64:$src1, i64immSExt32:$src2)]>; +def CMP64mi32 : RIi32<0x81, MRM7m, (ops i64mem:$src1, i64i32imm:$src2), + "cmp{q} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi64 addr:$src1), i64immSExt32:$src2)]>; +def CMP64mi8 : RIi8<0x83, MRM7m, (ops i64mem:$src1, i64i8imm:$src2), + "cmp{q} {$src2, $src1|$src1, $src2}", + [(X86cmp (loadi64 addr:$src1), i64immSExt8:$src2)]>; +def CMP64ri8 : RIi8<0x83, MRM7r, (ops GR64:$src1, i64i8imm:$src2), + "cmp{q} {$src2, $src1|$src1, 
$src2}", + [(X86cmp GR64:$src1, i64immSExt8:$src2)]>; + +// Conditional moves +let isTwoAddress = 1 in { +def CMOVB64rr : RI<0x42, MRMSrcReg, // if <u, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovb {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_B))]>, TB; +def CMOVB64rm : RI<0x42, MRMSrcMem, // if <u, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovb {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_B))]>, TB; +def CMOVAE64rr: RI<0x43, MRMSrcReg, // if >=u, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovae {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_AE))]>, TB; +def CMOVAE64rm: RI<0x43, MRMSrcMem, // if >=u, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovae {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_AE))]>, TB; +def CMOVE64rr : RI<0x44, MRMSrcReg, // if ==, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmove {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_E))]>, TB; +def CMOVE64rm : RI<0x44, MRMSrcMem, // if ==, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmove {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_E))]>, TB; +def CMOVNE64rr: RI<0x45, MRMSrcReg, // if !=, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovne {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_NE))]>, TB; +def CMOVNE64rm: RI<0x45, MRMSrcMem, // if !=, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovne {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_NE))]>, TB; +def CMOVBE64rr: RI<0x46, MRMSrcReg, // if <=u, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovbe {$src2, 
$dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_BE))]>, TB; +def CMOVBE64rm: RI<0x46, MRMSrcMem, // if <=u, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovbe {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_BE))]>, TB; +def CMOVA64rr : RI<0x47, MRMSrcReg, // if >u, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmova {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_A))]>, TB; +def CMOVA64rm : RI<0x47, MRMSrcMem, // if >u, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmova {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_A))]>, TB; +def CMOVL64rr : RI<0x4C, MRMSrcReg, // if <s, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovl {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_L))]>, TB; +def CMOVL64rm : RI<0x4C, MRMSrcMem, // if <s, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovl {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_L))]>, TB; +def CMOVGE64rr: RI<0x4D, MRMSrcReg, // if >=s, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovge {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_GE))]>, TB; +def CMOVGE64rm: RI<0x4D, MRMSrcMem, // if >=s, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovge {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_GE))]>, TB; +def CMOVLE64rr: RI<0x4E, MRMSrcReg, // if <=s, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovle {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_LE))]>, TB; +def CMOVLE64rm: RI<0x4E, MRMSrcMem, // if <=s, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovle {$src2, 
$dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_LE))]>, TB; +def CMOVG64rr : RI<0x4F, MRMSrcReg, // if >s, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovg {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_G))]>, TB; +def CMOVG64rm : RI<0x4F, MRMSrcMem, // if >s, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovg {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_G))]>, TB; +def CMOVS64rr : RI<0x48, MRMSrcReg, // if signed, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovs {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_S))]>, TB; +def CMOVS64rm : RI<0x48, MRMSrcMem, // if signed, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovs {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_S))]>, TB; +def CMOVNS64rr: RI<0x49, MRMSrcReg, // if !signed, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovns {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_NS))]>, TB; +def CMOVNS64rm: RI<0x49, MRMSrcMem, // if !signed, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovns {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_NS))]>, TB; +def CMOVP64rr : RI<0x4A, MRMSrcReg, // if parity, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, GR64:$src2), + "cmovp {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_P))]>, TB; +def CMOVP64rm : RI<0x4A, MRMSrcMem, // if parity, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovp {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_P))]>, TB; +def CMOVNP64rr : RI<0x4B, MRMSrcReg, // if !parity, GR64 = GR64 + (ops GR64:$dst, GR64:$src1, 
GR64:$src2), + "cmovnp {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2, + X86_COND_NP))]>, TB; +def CMOVNP64rm : RI<0x4B, MRMSrcMem, // if !parity, GR64 = [mem64] + (ops GR64:$dst, GR64:$src1, i64mem:$src2), + "cmovnp {$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + X86_COND_NP))]>, TB; +} // isTwoAddress + +//===----------------------------------------------------------------------===// +// Conversion Instructions... +// + +// f64 -> signed i64 +def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src), + "cvtsd2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic +def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (ops GR64:$dst, f128mem:$src), + "cvtsd2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic +def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, FR64:$src), + "cvttsd2si{q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (fp_to_sint FR64:$src))]>; +def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f64mem:$src), + "cvttsd2si{q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>; +def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src), + "cvttsd2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic +def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f128mem:$src), + "cvttsd2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic + +// Signed i64 -> f64 +def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR64:$src), + "cvtsi2sd{q} {$src, $dst|$dst, $src}", + [(set FR64:$dst, (sint_to_fp GR64:$src))]>; +def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (ops FR64:$dst, i64mem:$src), + "cvtsi2sd{q} {$src, $dst|$dst, $src}", + [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>; +let isTwoAddress = 1 in { +def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, GR64:$src2), + "cvtsi2sd{q} {$src2, $dst|$dst, $src2}", + []>; // TODO: add intrinsic +def 
Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i64mem:$src2), + "cvtsi2sd{q} {$src2, $dst|$dst, $src2}", + []>; // TODO: add intrinsic +} // isTwoAddress + +// Signed i64 -> f32 +def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR64:$src), + "cvtsi2ss{q} {$src, $dst|$dst, $src}", + [(set FR32:$dst, (sint_to_fp GR64:$src))]>; +def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (ops FR32:$dst, i64mem:$src), + "cvtsi2ss{q} {$src, $dst|$dst, $src}", + [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>; +let isTwoAddress = 1 in { +def Int_CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, + (ops VR128:$dst, VR128:$src1, GR64:$src2), + "cvtsi2ss{q} {$src2, $dst|$dst, $src2}", + []>; // TODO: add intrinsic +def Int_CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, + (ops VR128:$dst, VR128:$src1, i64mem:$src2), + "cvtsi2ss{q} {$src2, $dst|$dst, $src2}", + []>; // TODO: add intrinsic +} // isTwoAddress + +// f32 -> signed i64 +def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src), + "cvtss2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic +def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (ops GR64:$dst, f32mem:$src), + "cvtss2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic +def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, FR32:$src), + "cvttss2si{q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (fp_to_sint FR32:$src))]>; +def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src), + "cvttss2si{q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>; +def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src), + "cvttss2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic +def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src), + "cvttss2si{q} {$src, $dst|$dst, $src}", + []>; // TODO: add intrinsic + +//===----------------------------------------------------------------------===// +// Alias Instructions 
//===----------------------------------------------------------------------===//

// Truncate
// In 64-mode, each 64-bit and 32-bit registers has a low 8-bit sub-register.
// NOTE: the {att-syntax|intel-syntax} variant braces must balance; the 64to8
// and 32to8 forms were missing the closing '}' after the second operand.
def TRUNC_64to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR64:$src),
                    "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
                    [(set GR8:$dst, (trunc GR64:$src))]>;
def TRUNC_32to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32:$src),
                    "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
                    [(set GR8:$dst, (trunc GR32:$src))]>,
                  Requires<[In64BitMode]>;
def TRUNC_16to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16:$src),
                    "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
                    [(set GR8:$dst, (trunc GR16:$src))]>,
                  Requires<[In64BitMode]>;

def TRUNC_64to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR64:$src),
                     "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
                     [(set GR16:$dst, (trunc GR64:$src))]>;

def TRUNC_64to32 : I<0x89, MRMDestReg, (ops GR32:$dst, GR64:$src),
                     "mov{l} {${src:subreg32}, $dst|$dst, ${src:subreg32}}",
                     [(set GR32:$dst, (trunc GR64:$src))]>;

// Zero-extension
// TODO: Remove this after proper i32 -> i64 zext support.
def PsMOVZX64rr32: I<0x89, MRMDestReg, (ops GR64:$dst, GR32:$src),
                     "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
                     [(set GR64:$dst, (zext GR32:$src))]>;
def PsMOVZX64rm32: I<0x8B, MRMSrcMem, (ops GR64:$dst, i32mem:$src),
                     "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
                     [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;


// Alias instructions that map movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
// FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
// when we have a better way to specify isel priority.
let AddedComplexity = 1 in
def MOV64r0 : RI<0x31, MRMInitReg, (ops GR64:$dst),
                 "xor{q} $dst, $dst",
                 [(set GR64:$dst, 0)]>;

// Materialize i64 constant where top 32-bits are zero.
+let AddedComplexity = 1 in +def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (ops GR64:$dst, i64i32imm:$src), + "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}", + [(set GR64:$dst, i64immZExt32:$src)]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri tconstpool :$dst)>, Requires<[NotSmallCode]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri tjumptable :$dst)>, Requires<[NotSmallCode]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri tglobaladdr :$dst)>, Requires<[NotSmallCode]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri texternalsym:$dst)>, Requires<[NotSmallCode]>; + +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[SmallCode, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[SmallCode, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[SmallCode, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[SmallCode, IsStatic]>; + +// Calls +// Direct PC relative function call for small code model. 32-bit displacement +// sign extended to 64-bit. 
def : Pat<(X86call (i64 tglobaladdr:$dst)),
          (CALL64pcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call (i64 texternalsym:$dst)),
          (CALL64pcrel32 texternalsym:$dst)>;

def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
          (CALL64pcrel32 tglobaladdr:$dst)>;
def : Pat<(X86tailcall (i64 texternalsym:$dst)),
          (CALL64pcrel32 texternalsym:$dst)>;

def : Pat<(X86tailcall GR64:$dst),
          (CALL64r GR64:$dst)>;

// {s|z}extload bool -> {s|z}extload byte
def : Pat<(sextloadi64i1 addr:$src), (MOVSX64rm8 addr:$src)>;
def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;

// extload
def : Pat<(extloadi64i1 addr:$src),  (MOVZX64rm8  addr:$src)>;
def : Pat<(extloadi64i8 addr:$src),  (MOVZX64rm8  addr:$src)>;
def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
def : Pat<(extloadi64i32 addr:$src), (PsMOVZX64rm32 addr:$src)>;

// anyext -> zext
def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8  GR8 :$src)>;
def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16:$src)>;
def : Pat<(i64 (anyext GR32:$src)), (PsMOVZX64rr32 GR32:$src)>;
def : Pat<(i64 (anyext (loadi8  addr:$src))), (MOVZX64rm8  addr:$src)>;
def : Pat<(i64 (anyext (loadi16 addr:$src))), (MOVZX64rm16 addr:$src)>;
def : Pat<(i64 (anyext (loadi32 addr:$src))), (PsMOVZX64rm32 addr:$src)>;

//===----------------------------------------------------------------------===//
// Some peepholes
//===----------------------------------------------------------------------===//

// (shl x, 1) ==> (add x, x)
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;

// (or (x >> c) | (y << (64 - c))) ==> (shrd64 x, y, c)
def : Pat<(or (srl GR64:$src1, CL:$amt),
              (shl GR64:$src2, (sub 64, CL:$amt))),
          (SHRD64rrCL GR64:$src1, GR64:$src2)>;

def : Pat<(store (or (srl (loadi64 addr:$dst), CL:$amt),
                     (shl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
          (SHRD64mrCL addr:$dst, GR64:$src2)>;

// (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
def : Pat<(or (shl GR64:$src1, CL:$amt),
              (srl GR64:$src2, (sub 64, CL:$amt))),
          (SHLD64rrCL GR64:$src1, GR64:$src2)>;

def : Pat<(store (or (shl (loadi64 addr:$dst), CL:$amt),
                     (srl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
          (SHLD64mrCL addr:$dst, GR64:$src2)>;

// X86 specific add which produces a flag.
def : Pat<(addc GR64:$src1, GR64:$src2),
          (ADD64rr GR64:$src1, GR64:$src2)>;
def : Pat<(addc GR64:$src1, (load addr:$src2)),
          (ADD64rm GR64:$src1, addr:$src2)>;
def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
          (ADD64ri32 GR64:$src1, imm:$src2)>;
def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;

def : Pat<(subc GR64:$src1, GR64:$src2),
          (SUB64rr GR64:$src1, GR64:$src2)>;
def : Pat<(subc GR64:$src1, (load addr:$src2)),
          (SUB64rm GR64:$src1, addr:$src2)>;
// The match side must be restricted to immediates that fit a sign-extended
// 32-bit field (SUB64ri32's imm operand); matching a bare 'imm' here would
// accept 64-bit immediates and silently truncate them. Mirrors the addc
// pattern above.
def : Pat<(subc GR64:$src1, i64immSExt32:$src2),
          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;


//===----------------------------------------------------------------------===//
// X86-64 SSE Instructions
//===----------------------------------------------------------------------===//

// Move instructions...
+ +def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR64:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))]>; +def MOV64toPQIrm : RPDI<0x6E, MRMSrcMem, (ops VR128:$dst, i64mem:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>; + +def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, VR128:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + (iPTR 0)))]>; +def MOVPQIto64mr : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, VR128:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (ops FR64:$dst, GR64:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))]>; +def MOV64toSDrm : RPDI<0x6E, MRMSrcMem, (ops FR64:$dst, i64mem:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>; + +def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, FR64:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))]>; +def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, FR64:$src), + "mov{d|q} {$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; diff --git a/lib/Target/X86/X86IntelAsmPrinter.cpp b/lib/Target/X86/X86IntelAsmPrinter.cpp new file mode 100755 index 0000000..39b65ee --- /dev/null +++ b/lib/Target/X86/X86IntelAsmPrinter.cpp @@ -0,0 +1,533 @@ +//===-- X86IntelAsmPrinter.cpp - Convert X86 LLVM code to Intel assembly --===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to Intel format assembly language. +// This printer is the output mechanism used by `llc'. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "X86IntelAsmPrinter.h" +#include "X86TargetAsmInfo.h" +#include "X86.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/Module.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Support/Mangler.h" +#include "llvm/Target/TargetAsmInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(EmittedInsts, "Number of machine instrs printed"); + +std::string X86IntelAsmPrinter::getSectionForFunction(const Function &F) const { + // Intel asm always emits functions to _text. + return "_text"; +} + +/// runOnMachineFunction - This uses the printMachineInstruction() +/// method to print assembly for each instruction. +/// +bool X86IntelAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SetupMachineFunction(MF); + O << "\n\n"; + + // Print out constants referenced by the function + EmitConstantPool(MF.getConstantPool()); + + // Print out labels for the function. + const Function *F = MF.getFunction(); + unsigned CC = F->getCallingConv(); + + // Populate function information map. Actually, We don't want to populate + // non-stdcall or non-fastcall functions' information right now. 
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall) + FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>(); + + X86SharedAsmPrinter::decorateName(CurrentFnName, F); + + SwitchToTextSection(getSectionForFunction(*F).c_str(), F); + + switch (F->getLinkage()) { + default: assert(0 && "Unsupported linkage type!"); + case Function::InternalLinkage: + EmitAlignment(4); + break; + case Function::DLLExportLinkage: + DLLExportedFns.insert(CurrentFnName); + //FALLS THROUGH + case Function::ExternalLinkage: + O << "\tpublic " << CurrentFnName << "\n"; + EmitAlignment(4); + break; + } + + O << CurrentFnName << "\tproc near\n"; + + // Print out code for the function. + for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + // Print a label for the basic block if there are any predecessors. + if (I->pred_begin() != I->pred_end()) { + printBasicBlockLabel(I, true); + O << '\n'; + } + for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); + II != E; ++II) { + // Print the assembly for the instruction. + O << "\t"; + printMachineInstruction(II); + } + } + + // Print out jump tables referenced by the function. + EmitJumpTableInfo(MF.getJumpTableInfo(), MF); + + O << CurrentFnName << "\tendp\n"; + + // We didn't modify anything. 
+ return false; +} + +void X86IntelAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) { + unsigned char value = MI->getOperand(Op).getImmedValue(); + assert(value <= 7 && "Invalid ssecc argument!"); + switch (value) { + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + } +} + +void X86IntelAsmPrinter::printOp(const MachineOperand &MO, + const char *Modifier) { + const MRegisterInfo &RI = *TM.getRegisterInfo(); + switch (MO.getType()) { + case MachineOperand::MO_Register: { + if (MRegisterInfo::isPhysicalRegister(MO.getReg())) { + unsigned Reg = MO.getReg(); + if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + MVT::ValueType VT = (strcmp(Modifier,"subreg64") == 0) ? + MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 : + ((strcmp(Modifier,"subreg16") == 0) ? MVT::i16 :MVT::i8)); + Reg = getX86SubSuperRegister(Reg, VT); + } + O << RI.get(Reg).Name; + } else + O << "reg" << MO.getReg(); + return; + } + case MachineOperand::MO_Immediate: + O << MO.getImmedValue(); + return; + case MachineOperand::MO_MachineBasicBlock: + printBasicBlockLabel(MO.getMachineBasicBlock()); + return; + case MachineOperand::MO_JumpTableIndex: { + bool isMemOp = Modifier && !strcmp(Modifier, "mem"); + if (!isMemOp) O << "OFFSET "; + O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() + << "_" << MO.getJumpTableIndex(); + return; + } + case MachineOperand::MO_ConstantPoolIndex: { + bool isMemOp = Modifier && !strcmp(Modifier, "mem"); + if (!isMemOp) O << "OFFSET "; + O << "[" << TAI->getPrivateGlobalPrefix() << "CPI" + << getFunctionNumber() << "_" << MO.getConstantPoolIndex(); + int Offset = MO.getOffset(); + if (Offset > 0) + O << " + " << Offset; + else if (Offset < 0) + O << Offset; + O << "]"; + return; + } + case MachineOperand::MO_GlobalAddress: { + bool 
isCallOp = Modifier && !strcmp(Modifier, "call"); + bool isMemOp = Modifier && !strcmp(Modifier, "mem"); + GlobalValue *GV = MO.getGlobal(); + std::string Name = Mang->getValueName(GV); + + X86SharedAsmPrinter::decorateName(Name, GV); + + if (!isMemOp && !isCallOp) O << "OFFSET "; + if (GV->hasDLLImportLinkage()) { + // FIXME: This should be fixed with full support of stdcall & fastcall + // CC's + O << "__imp_"; + } + O << Name; + int Offset = MO.getOffset(); + if (Offset > 0) + O << " + " << Offset; + else if (Offset < 0) + O << Offset; + return; + } + case MachineOperand::MO_ExternalSymbol: { + bool isCallOp = Modifier && !strcmp(Modifier, "call"); + if (!isCallOp) O << "OFFSET "; + O << TAI->getGlobalPrefix() << MO.getSymbolName(); + return; + } + default: + O << "<unknown operand type>"; return; + } +} + +void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op, + const char *Modifier) { + assert(isMem(MI, Op) && "Invalid memory reference!"); + + const MachineOperand &BaseReg = MI->getOperand(Op); + int ScaleVal = MI->getOperand(Op+1).getImmedValue(); + const MachineOperand &IndexReg = MI->getOperand(Op+2); + const MachineOperand &DispSpec = MI->getOperand(Op+3); + + O << "["; + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOp(BaseReg, Modifier); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << "*"; + printOp(IndexReg, Modifier); + NeedPlus = true; + } + + if (DispSpec.isGlobalAddress() || DispSpec.isConstantPoolIndex() || + DispSpec.isJumpTableIndex()) { + if (NeedPlus) + O << " + "; + printOp(DispSpec, "mem"); + } else { + int DispVal = DispSpec.getImmedValue(); + if (DispVal || (!BaseReg.getReg() && !IndexReg.getReg())) { + if (NeedPlus) + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + O << DispVal; + } + } + O << "]"; +} + +void X86IntelAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) { + O << "\"L" << 
getFunctionNumber() << "$pb\"\n"; + O << "\"L" << getFunctionNumber() << "$pb\":"; +} + +bool X86IntelAsmPrinter::printAsmMRegister(const MachineOperand &MO, + const char Mode) { + const MRegisterInfo &RI = *TM.getRegisterInfo(); + unsigned Reg = MO.getReg(); + switch (Mode) { + default: return true; // Unknown mode. + case 'b': // Print QImode register + Reg = getX86SubSuperRegister(Reg, MVT::i8); + break; + case 'h': // Print QImode high register + Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + break; + case 'w': // Print HImode register + Reg = getX86SubSuperRegister(Reg, MVT::i16); + break; + case 'k': // Print SImode register + Reg = getX86SubSuperRegister(Reg, MVT::i32); + break; + } + + O << '%' << RI.get(Reg).Name; + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool X86IntelAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]); + } + } + + printOperand(MI, OpNo); + return false; +} + +bool X86IntelAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + printMemReference(MI, OpNo); + return false; +} + +/// printMachineInstruction -- Print out a single X86 LLVM instruction +/// MI in Intel syntax to the current output stream. 
+/// +void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) { + ++EmittedInsts; + + // See if a truncate instruction can be turned into a nop. + switch (MI->getOpcode()) { + default: break; + case X86::TRUNC_64to32: + case X86::TRUNC_64to16: + case X86::TRUNC_32to16: + case X86::TRUNC_32to8: + case X86::TRUNC_16to8: + case X86::TRUNC_32_to8: + case X86::TRUNC_16_to8: { + const MachineOperand &MO0 = MI->getOperand(0); + const MachineOperand &MO1 = MI->getOperand(1); + unsigned Reg0 = MO0.getReg(); + unsigned Reg1 = MO1.getReg(); + unsigned Opc = MI->getOpcode(); + if (Opc == X86::TRUNC_64to32) + Reg1 = getX86SubSuperRegister(Reg1, MVT::i32); + else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16) + Reg1 = getX86SubSuperRegister(Reg1, MVT::i16); + else + Reg1 = getX86SubSuperRegister(Reg1, MVT::i8); + O << TAI->getCommentString() << " TRUNCATE "; + if (Reg0 != Reg1) + O << "\n\t"; + break; + } + case X86::PsMOVZX64rr32: + O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t"; + break; + } + + // Call the autogenerated instruction printer routines. + printInstruction(MI); +} + +bool X86IntelAsmPrinter::doInitialization(Module &M) { + X86SharedAsmPrinter::doInitialization(M); + + Mang->markCharUnacceptable('.'); + + O << "\t.686\n\t.model flat\n\n"; + + // Emit declarations for external functions. + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) + if (I->isDeclaration()) { + std::string Name = Mang->getValueName(I); + X86SharedAsmPrinter::decorateName(Name, I); + + O << "\textern " ; + if (I->hasDLLImportLinkage()) { + O << "__imp_"; + } + O << Name << ":near\n"; + } + + // Emit declarations for external globals. Note that VC++ always declares + // external globals to have type byte, and if that's good enough for VC++... 
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (I->isDeclaration()) { + std::string Name = Mang->getValueName(I); + + O << "\textern " ; + if (I->hasDLLImportLinkage()) { + O << "__imp_"; + } + O << Name << ":byte\n"; + } + } + + return false; +} + +bool X86IntelAsmPrinter::doFinalization(Module &M) { + const TargetData *TD = TM.getTargetData(); + + // Print out module-level global variables here. + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (I->isDeclaration()) continue; // External global require no code + + // Check to see if this is a special global used by LLVM, if so, emit it. + if (EmitSpecialLLVMGlobal(I)) + continue; + + std::string name = Mang->getValueName(I); + Constant *C = I->getInitializer(); + unsigned Align = TD->getPreferredAlignmentLog(I); + bool bCustomSegment = false; + + switch (I->getLinkage()) { + case GlobalValue::LinkOnceLinkage: + case GlobalValue::WeakLinkage: + SwitchToDataSection(""); + O << name << "?\tsegment common 'COMMON'\n"; + bCustomSegment = true; + // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256 + // are also available. + break; + case GlobalValue::AppendingLinkage: + SwitchToDataSection(""); + O << name << "?\tsegment public 'DATA'\n"; + bCustomSegment = true; + // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256 + // are also available. 
+ break; + case GlobalValue::DLLExportLinkage: + DLLExportedGVs.insert(name); + // FALL THROUGH + case GlobalValue::ExternalLinkage: + O << "\tpublic " << name << "\n"; + // FALL THROUGH + case GlobalValue::InternalLinkage: + SwitchToDataSection(TAI->getDataSection(), I); + break; + default: + assert(0 && "Unknown linkage type!"); + } + + if (!bCustomSegment) + EmitAlignment(Align, I); + + O << name << ":\t\t\t\t" << TAI->getCommentString() + << " " << I->getName() << '\n'; + + EmitGlobalConstant(C); + + if (bCustomSegment) + O << name << "?\tends\n"; + } + + // Output linker support code for dllexported globals + if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) || + (DLLExportedFns.begin() != DLLExportedFns.end())) { + SwitchToDataSection(""); + O << "; WARNING: The following code is valid only with MASM v8.x and (possible) higher\n" + << "; This version of MASM is usually shipped with Microsoft Visual Studio 2005\n" + << "; or (possible) further versions. Unfortunately, there is no way to support\n" + << "; dllexported symbols in the earlier versions of MASM in fully automatic way\n\n"; + O << "_drectve\t segment info alias('.drectve')\n"; + } + + for (std::set<std::string>::iterator i = DLLExportedGVs.begin(), + e = DLLExportedGVs.end(); + i != e; ++i) { + O << "\t db ' /EXPORT:" << *i << ",data'\n"; + } + + for (std::set<std::string>::iterator i = DLLExportedFns.begin(), + e = DLLExportedFns.end(); + i != e; ++i) { + O << "\t db ' /EXPORT:" << *i << "'\n"; + } + + if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) || + (DLLExportedFns.begin() != DLLExportedFns.end())) { + O << "_drectve\t ends\n"; + } + + // Bypass X86SharedAsmPrinter::doFinalization(). + AsmPrinter::doFinalization(M); + SwitchToDataSection(""); + O << "\tend\n"; + return false; // success +} + +void X86IntelAsmPrinter::EmitString(const ConstantArray *CVA) const { + unsigned NumElts = CVA->getNumOperands(); + if (NumElts) { + // ML does not have escape sequences except '' for '. 
It also has a maximum + // string length of 255. + unsigned len = 0; + bool inString = false; + for (unsigned i = 0; i < NumElts; i++) { + int n = cast<ConstantInt>(CVA->getOperand(i))->getZExtValue() & 255; + if (len == 0) + O << "\tdb "; + + if (n >= 32 && n <= 127) { + if (!inString) { + if (len > 0) { + O << ",'"; + len += 2; + } else { + O << "'"; + len++; + } + inString = true; + } + if (n == '\'') { + O << "'"; + len++; + } + O << char(n); + } else { + if (inString) { + O << "'"; + len++; + inString = false; + } + if (len > 0) { + O << ","; + len++; + } + O << n; + len += 1 + (n > 9) + (n > 99); + } + + if (len > 60) { + if (inString) { + O << "'"; + inString = false; + } + O << "\n"; + len = 0; + } + } + + if (len > 0) { + if (inString) + O << "'"; + O << "\n"; + } + } +} + +// Include the auto-generated portion of the assembly writer. +#include "X86GenAsmWriter1.inc" diff --git a/lib/Target/X86/X86IntelAsmPrinter.h b/lib/Target/X86/X86IntelAsmPrinter.h new file mode 100755 index 0000000..9ad11ff --- /dev/null +++ b/lib/Target/X86/X86IntelAsmPrinter.h @@ -0,0 +1,112 @@ +//===-- X86IntelAsmPrinter.h - Convert X86 LLVM code to Intel assembly ----===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Intel assembly code printer class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef X86INTELASMPRINTER_H +#define X86INTELASMPRINTER_H + +#include "X86AsmPrinter.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Target/MRegisterInfo.h" + +namespace llvm { + +struct X86IntelAsmPrinter : public X86SharedAsmPrinter { + X86IntelAsmPrinter(std::ostream &O, X86TargetMachine &TM, + const TargetAsmInfo *T) + : X86SharedAsmPrinter(O, TM, T) { + } + + virtual const char *getPassName() const { + return "X86 Intel-Style Assembly Printer"; + } + + /// printInstruction - This method is automatically generated by tablegen + /// from the instruction set description. This method returns true if the + /// machine instruction was sufficiently described to print it, otherwise it + /// returns false. + bool printInstruction(const MachineInstr *MI); + + // This method is used by the tablegen'erated instruction printer. + void printOperand(const MachineInstr *MI, unsigned OpNo, + const char *Modifier = 0) { + const MachineOperand &MO = MI->getOperand(OpNo); + if (MO.isRegister()) { + assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) && "Not physreg??"); + O << TM.getRegisterInfo()->get(MO.getReg()).Name; + } else { + printOp(MO, Modifier); + } + } + + void printi8mem(const MachineInstr *MI, unsigned OpNo) { + O << "BYTE PTR "; + printMemReference(MI, OpNo); + } + void printi16mem(const MachineInstr *MI, unsigned OpNo) { + O << "WORD PTR "; + printMemReference(MI, OpNo); + } + void printi32mem(const MachineInstr *MI, unsigned OpNo) { + O << "DWORD PTR "; + printMemReference(MI, OpNo); + } + void printi64mem(const MachineInstr *MI, unsigned OpNo) { + O << "QWORD PTR "; + printMemReference(MI, OpNo); + } + void printi128mem(const MachineInstr *MI, unsigned OpNo) { + O << "XMMWORD PTR "; + printMemReference(MI, OpNo); + } + void printf32mem(const MachineInstr *MI, unsigned OpNo) { + O << "DWORD PTR "; + printMemReference(MI, OpNo); + } + void printf64mem(const 
MachineInstr *MI, unsigned OpNo) { + O << "QWORD PTR "; + printMemReference(MI, OpNo); + } + void printf128mem(const MachineInstr *MI, unsigned OpNo) { + O << "XMMWORD PTR "; + printMemReference(MI, OpNo); + } + void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) { + O << "QWORD PTR "; + printMemReference(MI, OpNo, "subreg64"); + } + + bool printAsmMRegister(const MachineOperand &MO, const char Mode); + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode); + void printMachineInstruction(const MachineInstr *MI); + void printOp(const MachineOperand &MO, const char *Modifier = 0); + void printSSECC(const MachineInstr *MI, unsigned Op); + void printMemReference(const MachineInstr *MI, unsigned Op, + const char *Modifier=NULL); + void printPICLabel(const MachineInstr *MI, unsigned Op); + bool runOnMachineFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + + /// getSectionForFunction - Return the section that we should emit the + /// specified function body into. + virtual std::string getSectionForFunction(const Function &F) const; + + virtual void EmitString(const ConstantArray *CVA) const; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp new file mode 100644 index 0000000..b9e5d5b --- /dev/null +++ b/lib/Target/X86/X86JITInfo.cpp @@ -0,0 +1,372 @@ +//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the X86 target. 
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "jit"
#include "X86JITInfo.h"
#include "X86Relocations.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineCodeEmitter.h"
#include "llvm/Config/alloca.h"
#include <cstdlib>
using namespace llvm;

#ifdef _MSC_VER
  // MSVC intrinsic used by X86CompilationCallback2 below to find the return
  // address slot on the stack.
  extern "C" void *_AddressOfReturnAddress(void);
  #pragma intrinsic(_AddressOfReturnAddress)
#endif

// replaceMachineCodeForFunction - Overwrite the first 5 bytes at Old with a
// 32-bit pc-relative JMP (opcode 0xE9) to New, so existing callers of the old
// code are redirected to the new code.
void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
  unsigned char *OldByte = (unsigned char *)Old;
  *OldByte++ = 0xE9;                // Emit JMP opcode.
  unsigned *OldWord = (unsigned *)OldByte;
  // NOTE(review): the displacement is computed in 32 bits; this assumes Old
  // and New are within +/-2GB of each other — confirm for 64-bit hosts.
  unsigned NewAddr = (intptr_t)New;
  unsigned OldAddr = (intptr_t)OldWord;
  *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
}


/// JITCompilerFunction - This contains the address of the JIT function used to
/// compile a function lazily.
static TargetJITInfo::JITCompilerFn JITCompilerFunction;

// Get the ASMPREFIX for the current host.  This is often '_'.
#ifndef __USER_LABEL_PREFIX__
#define __USER_LABEL_PREFIX__
#endif
// Two-level expansion so __USER_LABEL_PREFIX__ is macro-expanded before being
// stringized.
#define GETASMPREFIX2(X) #X
#define GETASMPREFIX(X) GETASMPREFIX2(X)
#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)

// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
// callee saved registers, for the fastcc calling convention.
extern "C" {
#if defined(__x86_64__)
  // No need to save EAX/EDX for X86-64.
  void X86CompilationCallback(void);
  asm(
    ".text\n"
    ".align 8\n"
    ".globl " ASMPREFIX "X86CompilationCallback\n"
  ASMPREFIX "X86CompilationCallback:\n"
    // Save RBP
    "pushq %rbp\n"
    // Save RSP
    "movq %rsp, %rbp\n"
    // Save all int arg registers
    "pushq %rdi\n"
    "pushq %rsi\n"
    "pushq %rdx\n"
    "pushq %rcx\n"
    "pushq %r8\n"
    "pushq %r9\n"
    // Align stack on 16-byte boundary. ESP might not be properly aligned
    // (8 byte) if this is called from an indirect stub.
    "andq $-16, %rsp\n"
    // Save all XMM arg registers
    "subq $128, %rsp\n"
    "movaps %xmm0, (%rsp)\n"
    "movaps %xmm1, 16(%rsp)\n"
    "movaps %xmm2, 32(%rsp)\n"
    "movaps %xmm3, 48(%rsp)\n"
    "movaps %xmm4, 64(%rsp)\n"
    "movaps %xmm5, 80(%rsp)\n"
    "movaps %xmm6, 96(%rsp)\n"
    "movaps %xmm7, 112(%rsp)\n"
    // JIT callee
    "movq %rbp, %rdi\n"    // Pass prev frame and return address
    "movq 8(%rbp), %rsi\n"
    "call " ASMPREFIX "X86CompilationCallback2\n"
    // Restore all XMM arg registers
    "movaps 112(%rsp), %xmm7\n"
    "movaps 96(%rsp), %xmm6\n"
    "movaps 80(%rsp), %xmm5\n"
    "movaps 64(%rsp), %xmm4\n"
    "movaps 48(%rsp), %xmm3\n"
    "movaps 32(%rsp), %xmm2\n"
    "movaps 16(%rsp), %xmm1\n"
    "movaps (%rsp), %xmm0\n"
    // Restore RSP
    "movq %rbp, %rsp\n"
    // Restore all int arg registers.  RSP is set back down to where the six
    // integer registers were pushed (48 bytes below RBP) so the pops below
    // reload them in reverse push order.
    "subq $48, %rsp\n"
    "popq %r9\n"
    "popq %r8\n"
    "popq %rcx\n"
    "popq %rdx\n"
    "popq %rsi\n"
    "popq %rdi\n"
    // Restore RBP
    "popq %rbp\n"
    "ret\n");
#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
#ifndef _MSC_VER
  void X86CompilationCallback(void);
  asm(
    ".text\n"
    ".align 8\n"
    ".globl " ASMPREFIX "X86CompilationCallback\n"
  ASMPREFIX "X86CompilationCallback:\n"
    "pushl %ebp\n"
    "movl %esp, %ebp\n"    // Standard prologue
    "pushl %eax\n"
    "pushl %edx\n"         // Save EAX/EDX/ECX
    "pushl %ecx\n"
#if defined(__APPLE__)
    "andl $-16, %esp\n"    // Align ESP on 16-byte boundary
#endif
    "subl $16, %esp\n"
    "movl 4(%ebp), %eax\n" // Pass prev frame and return address
    "movl %eax, 4(%esp)\n"
    "movl %ebp, (%esp)\n"
    "call " ASMPREFIX "X86CompilationCallback2\n"
    "movl %ebp, %esp\n"    // Restore ESP
    // Point ESP back at the three saved registers, then pop them in reverse
    // push order before restoring EBP.
    "subl $12, %esp\n"
    "popl %ecx\n"
    "popl %edx\n"
    "popl %eax\n"
    "popl %ebp\n"
    "ret\n");

  // Same as X86CompilationCallback but also saves XMM argument registers.
  void X86CompilationCallback_SSE(void);
  asm(
    ".text\n"
    ".align 8\n"
    ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
  ASMPREFIX "X86CompilationCallback_SSE:\n"
    "pushl %ebp\n"
    "movl %esp, %ebp\n"    // Standard prologue
    "pushl %eax\n"
    "pushl %edx\n"         // Save EAX/EDX/ECX
    "pushl %ecx\n"
    "andl $-16, %esp\n"    // Align ESP on 16-byte boundary
    // Save all XMM arg registers
    "subl $64, %esp\n"
    "movaps %xmm0, (%esp)\n"
    "movaps %xmm1, 16(%esp)\n"
    "movaps %xmm2, 32(%esp)\n"
    "movaps %xmm3, 48(%esp)\n"
    "subl $16, %esp\n"
    "movl 4(%ebp), %eax\n" // Pass prev frame and return address
    "movl %eax, 4(%esp)\n"
    "movl %ebp, (%esp)\n"
    "call " ASMPREFIX "X86CompilationCallback2\n"
    "addl $16, %esp\n"
    "movaps 48(%esp), %xmm3\n"
    "movaps 32(%esp), %xmm2\n"
    "movaps 16(%esp), %xmm1\n"
    "movaps (%esp), %xmm0\n"
    "movl %ebp, %esp\n"    // Restore ESP
    "subl $12, %esp\n"
    "popl %ecx\n"
    "popl %edx\n"
    "popl %eax\n"
    "popl %ebp\n"
    "ret\n");
#else
  void X86CompilationCallback2(void);

  // MSVC inline-asm version: saves only EAX/EDX/ECX around the callback.
  _declspec(naked) void X86CompilationCallback(void) {
    __asm {
      push  eax
      push  edx
      push  ecx
      call  X86CompilationCallback2
      pop   ecx
      pop   edx
      pop   eax
      ret
    }
  }
#endif // _MSC_VER

#else // Not an i386 host
  void X86CompilationCallback() {
    assert(0 && "Cannot call X86CompilationCallback() on a non-x86 arch!\n");
    abort();
  }
#endif
}

/// X86CompilationCallback - This is the target-specific function invoked by the
/// function stub when we did not know the real target of a call.  This function
/// must locate the start of the stub or call site and pass it into the JIT
/// compiler function.
#ifdef _MSC_VER
extern "C" void X86CompilationCallback2() {
  assert(sizeof(size_t) == 4); // FIXME: handle Win64
  intptr_t *RetAddrLoc = (intptr_t *)_AddressOfReturnAddress();
  RetAddrLoc += 4;  // skip over ret addr, edx, eax, ecx
  intptr_t RetAddr = *RetAddrLoc;
#else
// StackPtr is the caller's saved frame pointer (passed by the asm wrappers
// above), so StackPtr[1] is the slot holding the caller's return address.
extern "C" void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
  intptr_t *RetAddrLoc = &StackPtr[1];
#endif
  assert(*RetAddrLoc == RetAddr &&
         "Could not find return address on the stack!");

  // It's a stub if there is an interrupt marker after the call.
  bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;

  // The call instruction should have pushed the return value onto the stack...
#ifdef __x86_64__
  RetAddr--;     // Backtrack to the reference itself...
#else
  RetAddr -= 4;  // Backtrack to the reference itself...
#endif

#if 0
  DOUT << "In callback! Addr=" << (void*)RetAddr
       << " ESP=" << (void*)StackPtr
       << ": Resolving call to function: "
       << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
#endif

  // Sanity check to make sure this really is a call instruction.
#ifdef __x86_64__
  // 64-bit stubs call through a register: expect the REX.B + 0xFF bytes of
  // "callq *r10" (emitted by emitFunctionStub below).
  assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
  assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
#else
  // 32-bit stubs use a direct pc-relative call (opcode 0xE8).
  assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
#endif

  // Compile the target function and get its entry point.
  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);

  // Rewrite the call target... so that we don't end up here every time we
  // execute the call.
#ifdef __x86_64__
  // Patch the 64-bit immediate of the "movabsq" in the stub (10 bytes back).
  *(intptr_t *)(RetAddr - 0xa) = NewVal;
#else
  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
#endif

  if (isStub) {
    // If this is a stub, rewrite the call into an unconditional branch
    // instruction so that two return addresses are not pushed onto the stack
    // when the requested function finally gets called.  This also makes the
    // 0xCD byte (interrupt) dead, so the marker doesn't effect anything.
#ifdef __x86_64__
    // Rewrite the ModRM byte from "callq *r10" to "jmpq *r10".
    ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
#else
    ((unsigned char*)RetAddr)[-1] = 0xE9;
#endif
  }

  // Change the return address to reexecute the call instruction...
#ifdef __x86_64__
  *RetAddrLoc -= 0xd;
#else
  *RetAddrLoc -= 5;
#endif
}

/// getLazyResolverFunction - Record the JIT compiler function and return the
/// compilation callback best suited to this host: on a non-MSVC 32-bit x86
/// build, probe cpuid for SSE and prefer the XMM-saving callback.
TargetJITInfo::LazyResolverFn
X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
  JITCompilerFunction = F;

#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
    !defined(_MSC_VER) && !defined(__x86_64__)
  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
  union {
    unsigned u[3];
    char     c[12];
  } text;

  // cpuid(0) returns the 12-char vendor string in EBX,EDX,ECX order.
  if (!X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) {
    // FIXME: support for AMD family of processors.
    if (memcmp(text.c, "GenuineIntel", 12) == 0) {
      X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
      if ((EDX >> 25) & 0x1)   // EDX bit 25 = SSE support.
        return X86CompilationCallback_SSE;
    }
  }
#endif

  return X86CompilationCallback;
}

/// emitFunctionStub - Emit a small native stub for Fn.  If Fn is not the
/// compilation callback, the stub is a plain jump to Fn; otherwise it is a
/// call to the callback followed by a 0xCD marker byte identifying the stub.
void *X86JITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
  // Note, we cast to intptr_t here to silence a -pedantic warning that
  // complains about casting a function pointer to a normal pointer.
#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
    !defined(_MSC_VER) && !defined(__x86_64__)
  bool NotCC = (Fn != (void*)(intptr_t)X86CompilationCallback &&
                Fn != (void*)(intptr_t)X86CompilationCallback_SSE);
#else
  bool NotCC = Fn != (void*)(intptr_t)X86CompilationCallback;
#endif
  if (NotCC) {
#ifdef __x86_64__
    MCE.startFunctionStub(13, 4);
    MCE.emitByte(0x49);          // REX prefix
    MCE.emitByte(0xB8+2);        // movabsq r10
    MCE.emitWordLE(((unsigned *)&Fn)[0]);
    MCE.emitWordLE(((unsigned *)&Fn)[1]);
    MCE.emitByte(0x41);          // REX prefix
    MCE.emitByte(0xFF);          // jmpq *r10
    MCE.emitByte(2 | (4 << 3) | (3 << 6));
#else
    MCE.startFunctionStub(5, 4);
    MCE.emitByte(0xE9);          // jmp with 32 bit pc-rel destination
    MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
#endif
    return MCE.finishFunctionStub(0);
  }

#ifdef __x86_64__
  MCE.startFunctionStub(14, 4);
  MCE.emitByte(0x49);          // REX prefix
  MCE.emitByte(0xB8+2);        // movabsq r10
  MCE.emitWordLE(((unsigned *)&Fn)[0]);
  MCE.emitWordLE(((unsigned *)&Fn)[1]);
  MCE.emitByte(0x41);          // REX prefix
  MCE.emitByte(0xFF);          // callq *r10
  MCE.emitByte(2 | (2 << 3) | (3 << 6));
#else
  MCE.startFunctionStub(6, 4);
  MCE.emitByte(0xE8);   // Call with 32 bit pc-rel destination...

  MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
#endif

  MCE.emitByte(0xCD);   // Interrupt - Just a marker identifying the stub!
  return MCE.finishFunctionStub(0);
}

/// relocate - Before the JIT can run a block of code that has been emitted,
/// it must rewrite the code to contain the actual addresses of any
/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + void *RelocPos = (char*)Function + MR->getMachineCodeOffset(); + intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); + switch ((X86::RelocationType)MR->getRelocationType()) { + case X86::reloc_pcrel_word: { + // PC relative relocation, add the relocated value to the value already in + // memory, after we adjust it for where the PC is. + ResultPtr = ResultPtr-(intptr_t)RelocPos-4-MR->getConstantVal(); + *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + } + case X86::reloc_absolute_word: + // Absolute relocation, just add the relocated value to the value already + // in memory. + *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + case X86::reloc_absolute_dword: + *((intptr_t*)RelocPos) += ResultPtr; + break; + } + } +} diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h new file mode 100644 index 0000000..a4c731a --- /dev/null +++ b/lib/Target/X86/X86JITInfo.h @@ -0,0 +1,50 @@ +//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetJITInfo class. 
//
//===----------------------------------------------------------------------===//

#ifndef X86JITINFO_H
#define X86JITINFO_H

#include "llvm/Target/TargetJITInfo.h"

namespace llvm {
  class X86TargetMachine;

  /// X86JITInfo - X86 implementation of the TargetJITInfo callbacks the JIT
  /// uses to emit stubs, install the lazy-compilation resolver, and apply
  /// relocations to emitted code.
  class X86JITInfo : public TargetJITInfo {
    X86TargetMachine &TM;
  public:
    // useGOT is cleared here: this JIT does not emit GOT-based references.
    X86JITInfo(X86TargetMachine &tm) : TM(tm) {useGOT = 0;}

    /// replaceMachineCodeForFunction - Make it so that calling the function
    /// whose machine code is at OLD turns into a call to NEW, perhaps by
    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
    /// code.
    ///
    virtual void replaceMachineCodeForFunction(void *Old, void *New);

    /// emitFunctionStub - Use the specified MachineCodeEmitter object to emit a
    /// small native function that simply calls the function at the specified
    /// address.
    virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);

    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);

    /// relocate - Before the JIT can run a block of code that has been emitted,
    /// it must rewrite the code to contain the actual addresses of any
    /// referenced global symbols.
    virtual void relocate(void *Function, MachineRelocation *MR,
                          unsigned NumRelocs, unsigned char* GOTBase);
  };
}

#endif
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..7a21fb2
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,74 @@
//====- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by Evan Cheng and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file declares X86-specific per-machine-function information.
//
//===----------------------------------------------------------------------===//

#ifndef X86MACHINEFUNCTIONINFO_H
#define X86MACHINEFUNCTIONINFO_H

#include "llvm/CodeGen/MachineFunction.h"

namespace llvm {

// NameDecorationStyle - Which Windows name-decoration scheme, if any, a
// function's symbol requires.
enum NameDecorationStyle {
  None,
  StdCall,
  FastCall
};

/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
/// holds private X86 target-specific information for each MachineFunction.
class X86MachineFunctionInfo : public MachineFunctionInfo {
  /// ForceFramePointer - True if the function is required to use a frame
  /// pointer for reasons other than it containing dynamic allocation or
  /// that FP elimination is turned off. For example, Cygwin main function
  /// contains stack pointer re-alignment code which requires FP.
  bool ForceFramePointer;

  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
  /// stack frame in bytes.
  unsigned CalleeSavedFrameSize;

  /// BytesToPopOnReturn - amount of bytes function pops on return.
  /// Used on windows platform for stdcall & fastcall name decoration
  unsigned BytesToPopOnReturn;

  /// If the function requires additional name decoration, DecorationStyle
  /// holds the right way to do so.
  NameDecorationStyle DecorationStyle;

public:
  X86MachineFunctionInfo() : ForceFramePointer(false),
                             CalleeSavedFrameSize(0),
                             BytesToPopOnReturn(0),
                             DecorationStyle(None) {}

  // The MachineFunction argument is currently unused; this overload exists so
  // MachineFunction::getInfo<T>() can construct the info object.
  X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
                                                CalleeSavedFrameSize(0),
                                                BytesToPopOnReturn(0),
                                                DecorationStyle(None) {}

  bool getForceFramePointer() const { return ForceFramePointer;}
  void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }

  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }

  unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
  void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}

  NameDecorationStyle getDecorationStyle() const { return DecorationStyle; }
  void setDecorationStyle(NameDecorationStyle style) { DecorationStyle = style;}

};
} // End llvm namespace

#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..da65db0
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1613 @@
//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by the LLVM research group and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the MRegisterInfo class.  This
// file is responsible for the frame pointer elimination optimization on X86.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86RegisterInfo.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Type.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Target/TargetFrameInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

namespace {
  // Command-line knobs controlling the spill-code fusing optimization below.
  cl::opt<bool>
  NoFusing("disable-spill-fusing",
           cl::desc("Disable fusing of spill code into instructions"));
  cl::opt<bool>
  PrintFailedFusing("print-failed-fuse-candidates",
                    cl::desc("Print instructions that the allocator wants to"
                             " fuse, but the X86 backend currently can't"),
                    cl::Hidden);
}

// Constructor: cache subtarget facts (64-bit-ness, slot size, stack and frame
// pointer registers) that the methods below consult on every call.
X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
                                 const TargetInstrInfo &tii)
  : X86GenRegisterInfo(X86::ADJCALLSTACKDOWN, X86::ADJCALLSTACKUP),
    TM(tm), TII(tii) {
  // Cache some information.
  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
  Is64Bit = Subtarget->is64Bit();
  if (Is64Bit) {
    SlotSize = 8;
    StackPtr = X86::RSP;
    FramePtr = X86::RBP;
  } else {
    SlotSize = 4;
    StackPtr = X86::ESP;
    FramePtr = X86::EBP;
  }
}

// spillCalleeSavedRegisters - Emit one PUSH per callee-saved register (in
// reverse CSI order) and record the total spill area size on the function
// info.  Returns true to indicate the spill was handled here.
bool X86RegisterInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                                MachineBasicBlock::iterator MI,
                                const std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return false;

  MachineFunction &MF = *MBB.getParent();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  X86FI->setCalleeSavedFrameSize(CSI.size() * SlotSize);
  unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
  for (unsigned i = CSI.size(); i != 0; --i) {
    unsigned Reg = CSI[i-1].getReg();
    // Add the callee-saved register as live-in. It's killed at the spill.
    MBB.addLiveIn(Reg);
    BuildMI(MBB, MI, TII.get(Opc)).addReg(Reg);
  }
  return true;
}

// restoreCalleeSavedRegisters - Emit one POP per callee-saved register, in
// forward CSI order (mirroring the reverse-order PUSHes above).
bool X86RegisterInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                                  MachineBasicBlock::iterator MI,
                                const std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return false;

  unsigned Opc = Is64Bit ? X86::POP64r : X86::POP32r;
  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
    unsigned Reg = CSI[i].getReg();
    BuildMI(MBB, MI, TII.get(Opc), Reg);
  }
  return true;
}

// storeRegToStackSlot - Emit the store opcode appropriate for SrcReg's
// register class to spill it to the stack slot FrameIdx.  The source register
// operand is marked as killed by the store.
void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MI,
                                          unsigned SrcReg, int FrameIdx,
                                          const TargetRegisterClass *RC) const {
  unsigned Opc;
  if (RC == &X86::GR64RegClass) {
    Opc = X86::MOV64mr;
  } else if (RC == &X86::GR32RegClass) {
    Opc = X86::MOV32mr;
  } else if (RC == &X86::GR16RegClass) {
    Opc = X86::MOV16mr;
  } else if (RC == &X86::GR8RegClass) {
    Opc = X86::MOV8mr;
  } else if (RC == &X86::GR32_RegClass) {
    Opc = X86::MOV32_mr;
  } else if (RC == &X86::GR16_RegClass) {
    Opc = X86::MOV16_mr;
  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
    Opc = X86::ST_Fp64m;
  } else if (RC == &X86::RFP32RegClass) {
    Opc = X86::ST_Fp32m;
  } else if (RC == &X86::FR32RegClass) {
    Opc = X86::MOVSSmr;
  } else if (RC == &X86::FR64RegClass) {
    Opc = X86::MOVSDmr;
  } else if (RC == &X86::VR128RegClass) {
    Opc = X86::MOVAPSmr;
  } else if (RC == &X86::VR64RegClass) {
    Opc = X86::MMX_MOVQ64mr;
  } else {
    assert(0 && "Unknown regclass");
    abort();
  }
  addFrameReference(BuildMI(MBB, MI, TII.get(Opc)), FrameIdx)
    .addReg(SrcReg, false, false, true);
}

// loadRegFromStackSlot - Emit the load opcode appropriate for DestReg's
// register class to reload it from the stack slot FrameIdx.  Mirrors the
// opcode table in storeRegToStackSlot above.
void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           unsigned DestReg, int FrameIdx,
                                           const TargetRegisterClass *RC) const{
  unsigned Opc;
  if (RC == &X86::GR64RegClass) {
    Opc = X86::MOV64rm;
  } else if (RC == &X86::GR32RegClass) {
    Opc = X86::MOV32rm;
  } else if (RC == &X86::GR16RegClass) {
    Opc = X86::MOV16rm;
  } else if (RC == &X86::GR8RegClass) {
    Opc = X86::MOV8rm;
  } else if (RC == &X86::GR32_RegClass) {
    Opc = X86::MOV32_rm;
  } else if (RC == &X86::GR16_RegClass) {
    Opc = X86::MOV16_rm;
  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
    Opc = X86::LD_Fp64m;
  } else if (RC == &X86::RFP32RegClass) {
    Opc = X86::LD_Fp32m;
  } else if (RC == &X86::FR32RegClass) {
    Opc = X86::MOVSSrm;
  } else if (RC == &X86::FR64RegClass) {
    Opc = X86::MOVSDrm;
  } else if (RC == &X86::VR128RegClass) {
    Opc = X86::MOVAPSrm;
  } else if (RC == &X86::VR64RegClass) {
    Opc = X86::MMX_MOVQ64rm;
  } else {
    assert(0 && "Unknown regclass");
    abort();
  }
  addFrameReference(BuildMI(MBB, MI, TII.get(Opc), DestReg), FrameIdx);
}

// copyRegToReg - Emit a register-to-register copy using the move opcode
// appropriate for the register class.
void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   unsigned DestReg, unsigned SrcReg,
                                   const TargetRegisterClass *RC) const {
  unsigned Opc;
  if (RC == &X86::GR64RegClass) {
    Opc = X86::MOV64rr;
  } else if (RC == &X86::GR32RegClass) {
    Opc = X86::MOV32rr;
  } else if (RC == &X86::GR16RegClass) {
    Opc = X86::MOV16rr;
  } else if (RC == &X86::GR8RegClass) {
    Opc = X86::MOV8rr;
  } else if (RC == &X86::GR32_RegClass) {
    Opc = X86::MOV32_rr;
  } else if (RC == &X86::GR16_RegClass) {
    Opc = X86::MOV16_rr;
  } else if (RC == &X86::RFP32RegClass) {
    Opc = X86::MOV_Fp3232;
  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
    Opc = X86::MOV_Fp6464;
  } else if (RC == &X86::FR32RegClass) {
    Opc = X86::FsMOVAPSrr;
  } else if (RC == &X86::FR64RegClass) {
    Opc = X86::FsMOVAPDrr;
  } else if (RC == &X86::VR128RegClass) {
    Opc = X86::MOVAPSrr;
  } else if (RC == &X86::VR64RegClass) {
    Opc = X86::MMX_MOVQ64rr;
  } else {
    assert(0 && "Unknown regclass");
    abort();
  }
  BuildMI(MBB, MI, TII.get(Opc), DestReg).addReg(SrcReg);
}


// reMaterialize - Re-emit a clone of Orig before I, retargeting its first
// (def) operand to DestReg.
void X86RegisterInfo::reMaterialize(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    unsigned DestReg,
                                    const MachineInstr *Orig) const {
  MachineInstr *MI = Orig->clone();
  MI->getOperand(0).setReg(DestReg);
  MBB.insert(I, MI);
}

// FuseTwoAddrInst - Build a new instruction with the given fused Opcode whose
// tied def/use register pair (operands 0 and 1 of MI) is replaced by a frame
// reference; remaining operands of MI are copied over.
static MachineInstr *FuseTwoAddrInst(unsigned Opcode, unsigned FrameIndex,
                                     MachineInstr *MI,
                                     const TargetInstrInfo &TII) {
  unsigned NumOps = TII.getNumOperands(MI->getOpcode())-2;
  // Create the base instruction with the memory operand as the first part.
  MachineInstrBuilder MIB = addFrameReference(BuildMI(TII.get(Opcode)),
                                              FrameIndex);

  // Loop over the rest of the ri operands, converting them over.
  for (unsigned i = 0; i != NumOps; ++i) {
    MachineOperand &MO = MI->getOperand(i+2);
    if (MO.isReg())
      MIB = MIB.addReg(MO.getReg(), false, MO.isImplicit());
    else if (MO.isImm())
      MIB = MIB.addImm(MO.getImm());
    else if (MO.isGlobalAddress())
      MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
    else if (MO.isJumpTableIndex())
      MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
    else if (MO.isExternalSymbol())
      MIB = MIB.addExternalSymbol(MO.getSymbolName());
    else
      assert(0 && "Unknown operand type!");
  }
  return MIB;
}

// FuseInst - Build a new instruction with the given fused Opcode, copying
// MI's operands but substituting a frame reference for operand OpNo.
static MachineInstr *FuseInst(unsigned Opcode, unsigned OpNo,
                              unsigned FrameIndex, MachineInstr *MI,
                              const TargetInstrInfo &TII) {
  MachineInstrBuilder MIB = BuildMI(TII.get(Opcode));

  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (i == OpNo) {
      assert(MO.isReg() && "Expected to fold into reg operand!");
      MIB = addFrameReference(MIB, FrameIndex);
    } else if (MO.isReg())
      MIB = MIB.addReg(MO.getReg(), MO.isDef(), MO.isImplicit());
    else if (MO.isImm())
      MIB = MIB.addImm(MO.getImm());
    else if (MO.isGlobalAddress())
      MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
    else if (MO.isJumpTableIndex())
      MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
    else if (MO.isExternalSymbol())
      MIB = MIB.addExternalSymbol(MO.getSymbolName());
    else
      assert(0 && "Unknown operand for FuseInst!");
  }
  return MIB;
}

// MakeM0Inst - Build "Opcode [FrameIndex], 0": a store of immediate zero to
// the given frame slot.  NOTE(review): the MI parameter is unused here —
// presumably kept for signature symmetry with the Fuse* helpers; confirm.
static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII,
                                unsigned Opcode, unsigned FrameIndex,
                                MachineInstr *MI) {
  return addFrameReference(BuildMI(TII.get(Opcode)), FrameIndex).addImm(0);
}


//===----------------------------------------------------------------------===//
// Efficient Lookup Table Support
//===----------------------------------------------------------------------===//

namespace {
  /// TableEntry - Maps the 'from' opcode to a fused form of the 'to' opcode.
  ///
  struct TableEntry {
    unsigned from;                      // Original opcode.
    unsigned to;                        // New opcode.

    // less operators used by STL search.
    bool operator<(const TableEntry &TE) const { return from < TE.from; }
    friend bool operator<(const TableEntry &TE, unsigned V) {
      return TE.from < V;
    }
    friend bool operator<(unsigned V, const TableEntry &TE) {
      return V < TE.from;
    }
  };
}

/// TableIsSorted - Return true if the table is in 'from' opcode order.
///
static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
  for (unsigned i = 1; i != NumEntries; ++i)
    if (!(Table[i-1] < Table[i])) {
      cerr << "Entries out of order " << Table[i-1].from
           << " " << Table[i].from << "\n";
      return false;
    }
  return true;
}

/// TableLookup - Return the table entry matching the specified opcode.
/// Otherwise return NULL.
+static const TableEntry *TableLookup(const TableEntry *Table, unsigned N, + unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table, Table+N, Opcode); + if (I != Table+N && I->from == Opcode) + return I; + return NULL; +} + +#define ARRAY_SIZE(TABLE) \ + (sizeof(TABLE)/sizeof(TABLE[0])) + +#ifdef NDEBUG +#define ASSERT_SORTED(TABLE) +#else +#define ASSERT_SORTED(TABLE) \ + { static bool TABLE##Checked = false; \ + if (!TABLE##Checked) { \ + assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) && \ + "All lookup tables must be sorted for efficient access!"); \ + TABLE##Checked = true; \ + } \ + } +#endif + + +MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI, + unsigned i, + int FrameIndex) const { + // Check switch flag + if (NoFusing) return NULL; + + // Table (and size) to search + const TableEntry *OpcodeTablePtr = NULL; + unsigned OpcodeTableSize = 0; + bool isTwoAddrFold = false; + unsigned NumOps = TII.getNumOperands(MI->getOpcode()); + bool isTwoAddr = NumOps > 1 && + MI->getInstrDescriptor()->getOperandConstraint(1, TOI::TIED_TO) != -1; + + MachineInstr *NewMI = NULL; + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. It requires + // replacing the *two* registers with the memory location. 
+ if (isTwoAddr && NumOps >= 2 && i < 2 && + MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && + MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { + static const TableEntry OpcodeTable[] = { + { X86::ADC32ri, X86::ADC32mi }, + { X86::ADC32ri8, X86::ADC32mi8 }, + { X86::ADC32rr, X86::ADC32mr }, + { X86::ADC64ri32, X86::ADC64mi32 }, + { X86::ADC64ri8, X86::ADC64mi8 }, + { X86::ADC64rr, X86::ADC64mr }, + { X86::ADD16ri, X86::ADD16mi }, + { X86::ADD16ri8, X86::ADD16mi8 }, + { X86::ADD16rr, X86::ADD16mr }, + { X86::ADD32ri, X86::ADD32mi }, + { X86::ADD32ri8, X86::ADD32mi8 }, + { X86::ADD32rr, X86::ADD32mr }, + { X86::ADD64ri32, X86::ADD64mi32 }, + { X86::ADD64ri8, X86::ADD64mi8 }, + { X86::ADD64rr, X86::ADD64mr }, + { X86::ADD8ri, X86::ADD8mi }, + { X86::ADD8rr, X86::ADD8mr }, + { X86::AND16ri, X86::AND16mi }, + { X86::AND16ri8, X86::AND16mi8 }, + { X86::AND16rr, X86::AND16mr }, + { X86::AND32ri, X86::AND32mi }, + { X86::AND32ri8, X86::AND32mi8 }, + { X86::AND32rr, X86::AND32mr }, + { X86::AND64ri32, X86::AND64mi32 }, + { X86::AND64ri8, X86::AND64mi8 }, + { X86::AND64rr, X86::AND64mr }, + { X86::AND8ri, X86::AND8mi }, + { X86::AND8rr, X86::AND8mr }, + { X86::DEC16r, X86::DEC16m }, + { X86::DEC32r, X86::DEC32m }, + { X86::DEC64_16r, X86::DEC16m }, + { X86::DEC64_32r, X86::DEC32m }, + { X86::DEC64r, X86::DEC64m }, + { X86::DEC8r, X86::DEC8m }, + { X86::INC16r, X86::INC16m }, + { X86::INC32r, X86::INC32m }, + { X86::INC64_16r, X86::INC16m }, + { X86::INC64_32r, X86::INC32m }, + { X86::INC64r, X86::INC64m }, + { X86::INC8r, X86::INC8m }, + { X86::NEG16r, X86::NEG16m }, + { X86::NEG32r, X86::NEG32m }, + { X86::NEG64r, X86::NEG64m }, + { X86::NEG8r, X86::NEG8m }, + { X86::NOT16r, X86::NOT16m }, + { X86::NOT32r, X86::NOT32m }, + { X86::NOT64r, X86::NOT64m }, + { X86::NOT8r, X86::NOT8m }, + { X86::OR16ri, X86::OR16mi }, + { X86::OR16ri8, X86::OR16mi8 }, + { X86::OR16rr, X86::OR16mr }, + { X86::OR32ri, X86::OR32mi }, + { X86::OR32ri8, X86::OR32mi8 }, + { 
X86::OR32rr, X86::OR32mr }, + { X86::OR64ri32, X86::OR64mi32 }, + { X86::OR64ri8, X86::OR64mi8 }, + { X86::OR64rr, X86::OR64mr }, + { X86::OR8ri, X86::OR8mi }, + { X86::OR8rr, X86::OR8mr }, + { X86::ROL16r1, X86::ROL16m1 }, + { X86::ROL16rCL, X86::ROL16mCL }, + { X86::ROL16ri, X86::ROL16mi }, + { X86::ROL32r1, X86::ROL32m1 }, + { X86::ROL32rCL, X86::ROL32mCL }, + { X86::ROL32ri, X86::ROL32mi }, + { X86::ROL64r1, X86::ROL64m1 }, + { X86::ROL64rCL, X86::ROL64mCL }, + { X86::ROL64ri, X86::ROL64mi }, + { X86::ROL8r1, X86::ROL8m1 }, + { X86::ROL8rCL, X86::ROL8mCL }, + { X86::ROL8ri, X86::ROL8mi }, + { X86::ROR16r1, X86::ROR16m1 }, + { X86::ROR16rCL, X86::ROR16mCL }, + { X86::ROR16ri, X86::ROR16mi }, + { X86::ROR32r1, X86::ROR32m1 }, + { X86::ROR32rCL, X86::ROR32mCL }, + { X86::ROR32ri, X86::ROR32mi }, + { X86::ROR64r1, X86::ROR64m1 }, + { X86::ROR64rCL, X86::ROR64mCL }, + { X86::ROR64ri, X86::ROR64mi }, + { X86::ROR8r1, X86::ROR8m1 }, + { X86::ROR8rCL, X86::ROR8mCL }, + { X86::ROR8ri, X86::ROR8mi }, + { X86::SAR16r1, X86::SAR16m1 }, + { X86::SAR16rCL, X86::SAR16mCL }, + { X86::SAR16ri, X86::SAR16mi }, + { X86::SAR32r1, X86::SAR32m1 }, + { X86::SAR32rCL, X86::SAR32mCL }, + { X86::SAR32ri, X86::SAR32mi }, + { X86::SAR64r1, X86::SAR64m1 }, + { X86::SAR64rCL, X86::SAR64mCL }, + { X86::SAR64ri, X86::SAR64mi }, + { X86::SAR8r1, X86::SAR8m1 }, + { X86::SAR8rCL, X86::SAR8mCL }, + { X86::SAR8ri, X86::SAR8mi }, + { X86::SBB32ri, X86::SBB32mi }, + { X86::SBB32ri8, X86::SBB32mi8 }, + { X86::SBB32rr, X86::SBB32mr }, + { X86::SBB64ri32, X86::SBB64mi32 }, + { X86::SBB64ri8, X86::SBB64mi8 }, + { X86::SBB64rr, X86::SBB64mr }, + { X86::SHL16r1, X86::SHL16m1 }, + { X86::SHL16rCL, X86::SHL16mCL }, + { X86::SHL16ri, X86::SHL16mi }, + { X86::SHL32r1, X86::SHL32m1 }, + { X86::SHL32rCL, X86::SHL32mCL }, + { X86::SHL32ri, X86::SHL32mi }, + { X86::SHL64r1, X86::SHL64m1 }, + { X86::SHL64rCL, X86::SHL64mCL }, + { X86::SHL64ri, X86::SHL64mi }, + { X86::SHL8r1, X86::SHL8m1 }, + { X86::SHL8rCL, 
X86::SHL8mCL }, + { X86::SHL8ri, X86::SHL8mi }, + { X86::SHLD16rrCL, X86::SHLD16mrCL }, + { X86::SHLD16rri8, X86::SHLD16mri8 }, + { X86::SHLD32rrCL, X86::SHLD32mrCL }, + { X86::SHLD32rri8, X86::SHLD32mri8 }, + { X86::SHLD64rrCL, X86::SHLD64mrCL }, + { X86::SHLD64rri8, X86::SHLD64mri8 }, + { X86::SHR16r1, X86::SHR16m1 }, + { X86::SHR16rCL, X86::SHR16mCL }, + { X86::SHR16ri, X86::SHR16mi }, + { X86::SHR32r1, X86::SHR32m1 }, + { X86::SHR32rCL, X86::SHR32mCL }, + { X86::SHR32ri, X86::SHR32mi }, + { X86::SHR64r1, X86::SHR64m1 }, + { X86::SHR64rCL, X86::SHR64mCL }, + { X86::SHR64ri, X86::SHR64mi }, + { X86::SHR8r1, X86::SHR8m1 }, + { X86::SHR8rCL, X86::SHR8mCL }, + { X86::SHR8ri, X86::SHR8mi }, + { X86::SHRD16rrCL, X86::SHRD16mrCL }, + { X86::SHRD16rri8, X86::SHRD16mri8 }, + { X86::SHRD32rrCL, X86::SHRD32mrCL }, + { X86::SHRD32rri8, X86::SHRD32mri8 }, + { X86::SHRD64rrCL, X86::SHRD64mrCL }, + { X86::SHRD64rri8, X86::SHRD64mri8 }, + { X86::SUB16ri, X86::SUB16mi }, + { X86::SUB16ri8, X86::SUB16mi8 }, + { X86::SUB16rr, X86::SUB16mr }, + { X86::SUB32ri, X86::SUB32mi }, + { X86::SUB32ri8, X86::SUB32mi8 }, + { X86::SUB32rr, X86::SUB32mr }, + { X86::SUB64ri32, X86::SUB64mi32 }, + { X86::SUB64ri8, X86::SUB64mi8 }, + { X86::SUB64rr, X86::SUB64mr }, + { X86::SUB8ri, X86::SUB8mi }, + { X86::SUB8rr, X86::SUB8mr }, + { X86::XOR16ri, X86::XOR16mi }, + { X86::XOR16ri8, X86::XOR16mi8 }, + { X86::XOR16rr, X86::XOR16mr }, + { X86::XOR32ri, X86::XOR32mi }, + { X86::XOR32ri8, X86::XOR32mi8 }, + { X86::XOR32rr, X86::XOR32mr }, + { X86::XOR64ri32, X86::XOR64mi32 }, + { X86::XOR64ri8, X86::XOR64mi8 }, + { X86::XOR64rr, X86::XOR64mr }, + { X86::XOR8ri, X86::XOR8mi }, + { X86::XOR8rr, X86::XOR8mr } + }; + ASSERT_SORTED(OpcodeTable); + OpcodeTablePtr = OpcodeTable; + OpcodeTableSize = ARRAY_SIZE(OpcodeTable); + isTwoAddrFold = true; + } else if (i == 0) { // If operand 0 + if (MI->getOpcode() == X86::MOV16r0) + NewMI = MakeM0Inst(TII, X86::MOV16mi, FrameIndex, MI); + else if (MI->getOpcode() == 
X86::MOV32r0) + NewMI = MakeM0Inst(TII, X86::MOV32mi, FrameIndex, MI); + else if (MI->getOpcode() == X86::MOV64r0) + NewMI = MakeM0Inst(TII, X86::MOV64mi32, FrameIndex, MI); + else if (MI->getOpcode() == X86::MOV8r0) + NewMI = MakeM0Inst(TII, X86::MOV8mi, FrameIndex, MI); + if (NewMI) { + NewMI->copyKillDeadInfo(MI); + return NewMI; + } + + static const TableEntry OpcodeTable[] = { + { X86::CMP16ri, X86::CMP16mi }, + { X86::CMP16ri8, X86::CMP16mi8 }, + { X86::CMP32ri, X86::CMP32mi }, + { X86::CMP32ri8, X86::CMP32mi8 }, + { X86::CMP8ri, X86::CMP8mi }, + { X86::DIV16r, X86::DIV16m }, + { X86::DIV32r, X86::DIV32m }, + { X86::DIV64r, X86::DIV64m }, + { X86::DIV8r, X86::DIV8m }, + { X86::FsMOVAPDrr, X86::MOVSDmr }, + { X86::FsMOVAPSrr, X86::MOVSSmr }, + { X86::IDIV16r, X86::IDIV16m }, + { X86::IDIV32r, X86::IDIV32m }, + { X86::IDIV64r, X86::IDIV64m }, + { X86::IDIV8r, X86::IDIV8m }, + { X86::IMUL16r, X86::IMUL16m }, + { X86::IMUL32r, X86::IMUL32m }, + { X86::IMUL64r, X86::IMUL64m }, + { X86::IMUL8r, X86::IMUL8m }, + { X86::MOV16ri, X86::MOV16mi }, + { X86::MOV16rr, X86::MOV16mr }, + { X86::MOV32ri, X86::MOV32mi }, + { X86::MOV32rr, X86::MOV32mr }, + { X86::MOV64ri32, X86::MOV64mi32 }, + { X86::MOV64rr, X86::MOV64mr }, + { X86::MOV8ri, X86::MOV8mi }, + { X86::MOV8rr, X86::MOV8mr }, + { X86::MOVAPDrr, X86::MOVAPDmr }, + { X86::MOVAPSrr, X86::MOVAPSmr }, + { X86::MOVPDI2DIrr, X86::MOVPDI2DImr }, + { X86::MOVPQIto64rr,X86::MOVPQIto64mr }, + { X86::MOVPS2SSrr, X86::MOVPS2SSmr }, + { X86::MOVSDrr, X86::MOVSDmr }, + { X86::MOVSDto64rr, X86::MOVSDto64mr }, + { X86::MOVSS2DIrr, X86::MOVSS2DImr }, + { X86::MOVSSrr, X86::MOVSSmr }, + { X86::MOVUPDrr, X86::MOVUPDmr }, + { X86::MOVUPSrr, X86::MOVUPSmr }, + { X86::MUL16r, X86::MUL16m }, + { X86::MUL32r, X86::MUL32m }, + { X86::MUL64r, X86::MUL64m }, + { X86::MUL8r, X86::MUL8m }, + { X86::SETAEr, X86::SETAEm }, + { X86::SETAr, X86::SETAm }, + { X86::SETBEr, X86::SETBEm }, + { X86::SETBr, X86::SETBm }, + { X86::SETEr, X86::SETEm }, + { 
X86::SETGEr, X86::SETGEm }, + { X86::SETGr, X86::SETGm }, + { X86::SETLEr, X86::SETLEm }, + { X86::SETLr, X86::SETLm }, + { X86::SETNEr, X86::SETNEm }, + { X86::SETNPr, X86::SETNPm }, + { X86::SETNSr, X86::SETNSm }, + { X86::SETPr, X86::SETPm }, + { X86::SETSr, X86::SETSm }, + { X86::TEST16ri, X86::TEST16mi }, + { X86::TEST32ri, X86::TEST32mi }, + { X86::TEST64ri32, X86::TEST64mi32 }, + { X86::TEST8ri, X86::TEST8mi }, + { X86::XCHG16rr, X86::XCHG16mr }, + { X86::XCHG32rr, X86::XCHG32mr }, + { X86::XCHG64rr, X86::XCHG64mr }, + { X86::XCHG8rr, X86::XCHG8mr } + }; + ASSERT_SORTED(OpcodeTable); + OpcodeTablePtr = OpcodeTable; + OpcodeTableSize = ARRAY_SIZE(OpcodeTable); + } else if (i == 1) { + static const TableEntry OpcodeTable[] = { + { X86::CMP16rr, X86::CMP16rm }, + { X86::CMP32rr, X86::CMP32rm }, + { X86::CMP64ri32, X86::CMP64mi32 }, + { X86::CMP64ri8, X86::CMP64mi8 }, + { X86::CMP64rr, X86::CMP64rm }, + { X86::CMP8rr, X86::CMP8rm }, + { X86::CMPPDrri, X86::CMPPDrmi }, + { X86::CMPPSrri, X86::CMPPSrmi }, + { X86::CMPSDrr, X86::CMPSDrm }, + { X86::CMPSSrr, X86::CMPSSrm }, + { X86::CVTSD2SSrr, X86::CVTSD2SSrm }, + { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm }, + { X86::CVTSI2SDrr, X86::CVTSI2SDrm }, + { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm }, + { X86::CVTSI2SSrr, X86::CVTSI2SSrm }, + { X86::CVTSS2SDrr, X86::CVTSS2SDrm }, + { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm }, + { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm }, + { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm }, + { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm }, + { X86::FsMOVAPDrr, X86::MOVSDrm }, + { X86::FsMOVAPSrr, X86::MOVSSrm }, + { X86::IMUL16rri, X86::IMUL16rmi }, + { X86::IMUL16rri8, X86::IMUL16rmi8 }, + { X86::IMUL32rri, X86::IMUL32rmi }, + { X86::IMUL32rri8, X86::IMUL32rmi8 }, + { X86::IMUL64rr, X86::IMUL64rm }, + { X86::IMUL64rri32, X86::IMUL64rmi32 }, + { X86::IMUL64rri8, X86::IMUL64rmi8 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm }, + { X86::Int_COMISDrr, X86::Int_COMISDrm }, + { 
X86::Int_COMISSrr, X86::Int_COMISSrm }, + { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm }, + { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm }, + { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm }, + { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm }, + { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm }, + { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm }, + { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm }, + { X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm }, + { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm }, + { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm }, + { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm }, + { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm }, + { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm }, + { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm }, + { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm }, + { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm }, + { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm }, + { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm }, + { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm }, + { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm }, + { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm }, + { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm }, + { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm }, + { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm }, + { X86::MOV16rr, X86::MOV16rm }, + { X86::MOV32rr, X86::MOV32rm }, + { X86::MOV64rr, X86::MOV64rm }, + { X86::MOV64toPQIrr, X86::MOV64toPQIrm }, + { X86::MOV64toSDrr, X86::MOV64toSDrm }, + { X86::MOV8rr, X86::MOV8rm }, + { X86::MOVAPDrr, X86::MOVAPDrm }, + { X86::MOVAPSrr, X86::MOVAPSrm }, + { X86::MOVDDUPrr, X86::MOVDDUPrm }, + { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm }, + { X86::MOVDI2SSrr, X86::MOVDI2SSrm }, + { X86::MOVSD2PDrr, X86::MOVSD2PDrm }, + { X86::MOVSDrr, X86::MOVSDrm }, + { X86::MOVSHDUPrr, X86::MOVSHDUPrm }, + { X86::MOVSLDUPrr, X86::MOVSLDUPrm }, + { X86::MOVSS2PSrr, X86::MOVSS2PSrm }, + { X86::MOVSSrr, X86::MOVSSrm }, + { X86::MOVSX16rr8, X86::MOVSX16rm8 }, + { X86::MOVSX32rr16, X86::MOVSX32rm16 }, + { X86::MOVSX32rr8, X86::MOVSX32rm8 }, + { 
X86::MOVSX64rr16, X86::MOVSX64rm16 }, + { X86::MOVSX64rr32, X86::MOVSX64rm32 }, + { X86::MOVSX64rr8, X86::MOVSX64rm8 }, + { X86::MOVUPDrr, X86::MOVUPDrm }, + { X86::MOVUPSrr, X86::MOVUPSrm }, + { X86::MOVZX16rr8, X86::MOVZX16rm8 }, + { X86::MOVZX32rr16, X86::MOVZX32rm16 }, + { X86::MOVZX32rr8, X86::MOVZX32rm8 }, + { X86::MOVZX64rr16, X86::MOVZX64rm16 }, + { X86::MOVZX64rr8, X86::MOVZX64rm8 }, + { X86::PSHUFDri, X86::PSHUFDmi }, + { X86::PSHUFHWri, X86::PSHUFHWmi }, + { X86::PSHUFLWri, X86::PSHUFLWmi }, + { X86::PsMOVZX64rr32, X86::PsMOVZX64rm32 }, + { X86::TEST16rr, X86::TEST16rm }, + { X86::TEST32rr, X86::TEST32rm }, + { X86::TEST64rr, X86::TEST64rm }, + { X86::TEST8rr, X86::TEST8rm }, + // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 + { X86::UCOMISDrr, X86::UCOMISDrm }, + { X86::UCOMISSrr, X86::UCOMISSrm }, + { X86::XCHG16rr, X86::XCHG16rm }, + { X86::XCHG32rr, X86::XCHG32rm }, + { X86::XCHG64rr, X86::XCHG64rm }, + { X86::XCHG8rr, X86::XCHG8rm } + }; + ASSERT_SORTED(OpcodeTable); + OpcodeTablePtr = OpcodeTable; + OpcodeTableSize = ARRAY_SIZE(OpcodeTable); + } else if (i == 2) { + static const TableEntry OpcodeTable[] = { + { X86::ADC32rr, X86::ADC32rm }, + { X86::ADC64rr, X86::ADC64rm }, + { X86::ADD16rr, X86::ADD16rm }, + { X86::ADD32rr, X86::ADD32rm }, + { X86::ADD64rr, X86::ADD64rm }, + { X86::ADD8rr, X86::ADD8rm }, + { X86::ADDPDrr, X86::ADDPDrm }, + { X86::ADDPSrr, X86::ADDPSrm }, + { X86::ADDSDrr, X86::ADDSDrm }, + { X86::ADDSSrr, X86::ADDSSrm }, + { X86::ADDSUBPDrr, X86::ADDSUBPDrm }, + { X86::ADDSUBPSrr, X86::ADDSUBPSrm }, + { X86::AND16rr, X86::AND16rm }, + { X86::AND32rr, X86::AND32rm }, + { X86::AND64rr, X86::AND64rm }, + { X86::AND8rr, X86::AND8rm }, + { X86::ANDNPDrr, X86::ANDNPDrm }, + { X86::ANDNPSrr, X86::ANDNPSrm }, + { X86::ANDPDrr, X86::ANDPDrm }, + { X86::ANDPSrr, X86::ANDPSrm }, + { X86::CMOVA16rr, X86::CMOVA16rm }, + { X86::CMOVA32rr, X86::CMOVA32rm }, + { X86::CMOVA64rr, X86::CMOVA64rm }, + { X86::CMOVAE16rr, X86::CMOVAE16rm }, + { 
X86::CMOVAE32rr, X86::CMOVAE32rm }, + { X86::CMOVAE64rr, X86::CMOVAE64rm }, + { X86::CMOVB16rr, X86::CMOVB16rm }, + { X86::CMOVB32rr, X86::CMOVB32rm }, + { X86::CMOVB64rr, X86::CMOVB64rm }, + { X86::CMOVBE16rr, X86::CMOVBE16rm }, + { X86::CMOVBE32rr, X86::CMOVBE32rm }, + { X86::CMOVBE64rr, X86::CMOVBE64rm }, + { X86::CMOVE16rr, X86::CMOVE16rm }, + { X86::CMOVE32rr, X86::CMOVE32rm }, + { X86::CMOVE64rr, X86::CMOVE64rm }, + { X86::CMOVG16rr, X86::CMOVG16rm }, + { X86::CMOVG32rr, X86::CMOVG32rm }, + { X86::CMOVG64rr, X86::CMOVG64rm }, + { X86::CMOVGE16rr, X86::CMOVGE16rm }, + { X86::CMOVGE32rr, X86::CMOVGE32rm }, + { X86::CMOVGE64rr, X86::CMOVGE64rm }, + { X86::CMOVL16rr, X86::CMOVL16rm }, + { X86::CMOVL32rr, X86::CMOVL32rm }, + { X86::CMOVL64rr, X86::CMOVL64rm }, + { X86::CMOVLE16rr, X86::CMOVLE16rm }, + { X86::CMOVLE32rr, X86::CMOVLE32rm }, + { X86::CMOVLE64rr, X86::CMOVLE64rm }, + { X86::CMOVNE16rr, X86::CMOVNE16rm }, + { X86::CMOVNE32rr, X86::CMOVNE32rm }, + { X86::CMOVNE64rr, X86::CMOVNE64rm }, + { X86::CMOVNP16rr, X86::CMOVNP16rm }, + { X86::CMOVNP32rr, X86::CMOVNP32rm }, + { X86::CMOVNP64rr, X86::CMOVNP64rm }, + { X86::CMOVNS16rr, X86::CMOVNS16rm }, + { X86::CMOVNS32rr, X86::CMOVNS32rm }, + { X86::CMOVNS64rr, X86::CMOVNS64rm }, + { X86::CMOVP16rr, X86::CMOVP16rm }, + { X86::CMOVP32rr, X86::CMOVP32rm }, + { X86::CMOVP64rr, X86::CMOVP64rm }, + { X86::CMOVS16rr, X86::CMOVS16rm }, + { X86::CMOVS32rr, X86::CMOVS32rm }, + { X86::CMOVS64rr, X86::CMOVS64rm }, + { X86::DIVPDrr, X86::DIVPDrm }, + { X86::DIVPSrr, X86::DIVPSrm }, + { X86::DIVSDrr, X86::DIVSDrm }, + { X86::DIVSSrr, X86::DIVSSrm }, + { X86::HADDPDrr, X86::HADDPDrm }, + { X86::HADDPSrr, X86::HADDPSrm }, + { X86::HSUBPDrr, X86::HSUBPDrm }, + { X86::HSUBPSrr, X86::HSUBPSrm }, + { X86::IMUL16rr, X86::IMUL16rm }, + { X86::IMUL32rr, X86::IMUL32rm }, + { X86::MAXPDrr, X86::MAXPDrm }, + { X86::MAXPDrr_Int, X86::MAXPDrm_Int }, + { X86::MAXPSrr, X86::MAXPSrm }, + { X86::MAXPSrr_Int, X86::MAXPSrm_Int }, + { 
X86::MAXSDrr, X86::MAXSDrm }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int }, + { X86::MAXSSrr, X86::MAXSSrm }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int }, + { X86::MINPDrr, X86::MINPDrm }, + { X86::MINPDrr_Int, X86::MINPDrm_Int }, + { X86::MINPSrr, X86::MINPSrm }, + { X86::MINPSrr_Int, X86::MINPSrm_Int }, + { X86::MINSDrr, X86::MINSDrm }, + { X86::MINSDrr_Int, X86::MINSDrm_Int }, + { X86::MINSSrr, X86::MINSSrm }, + { X86::MINSSrr_Int, X86::MINSSrm_Int }, + { X86::MULPDrr, X86::MULPDrm }, + { X86::MULPSrr, X86::MULPSrm }, + { X86::MULSDrr, X86::MULSDrm }, + { X86::MULSSrr, X86::MULSSrm }, + { X86::OR16rr, X86::OR16rm }, + { X86::OR32rr, X86::OR32rm }, + { X86::OR64rr, X86::OR64rm }, + { X86::OR8rr, X86::OR8rm }, + { X86::ORPDrr, X86::ORPDrm }, + { X86::ORPSrr, X86::ORPSrm }, + { X86::PACKSSDWrr, X86::PACKSSDWrm }, + { X86::PACKSSWBrr, X86::PACKSSWBrm }, + { X86::PACKUSWBrr, X86::PACKUSWBrm }, + { X86::PADDBrr, X86::PADDBrm }, + { X86::PADDDrr, X86::PADDDrm }, + { X86::PADDQrr, X86::PADDQrm }, + { X86::PADDSBrr, X86::PADDSBrm }, + { X86::PADDSWrr, X86::PADDSWrm }, + { X86::PADDWrr, X86::PADDWrm }, + { X86::PANDNrr, X86::PANDNrm }, + { X86::PANDrr, X86::PANDrm }, + { X86::PAVGBrr, X86::PAVGBrm }, + { X86::PAVGWrr, X86::PAVGWrm }, + { X86::PCMPEQBrr, X86::PCMPEQBrm }, + { X86::PCMPEQDrr, X86::PCMPEQDrm }, + { X86::PCMPEQWrr, X86::PCMPEQWrm }, + { X86::PCMPGTBrr, X86::PCMPGTBrm }, + { X86::PCMPGTDrr, X86::PCMPGTDrm }, + { X86::PCMPGTWrr, X86::PCMPGTWrm }, + { X86::PINSRWrri, X86::PINSRWrmi }, + { X86::PMADDWDrr, X86::PMADDWDrm }, + { X86::PMAXSWrr, X86::PMAXSWrm }, + { X86::PMAXUBrr, X86::PMAXUBrm }, + { X86::PMINSWrr, X86::PMINSWrm }, + { X86::PMINUBrr, X86::PMINUBrm }, + { X86::PMULHUWrr, X86::PMULHUWrm }, + { X86::PMULHWrr, X86::PMULHWrm }, + { X86::PMULLWrr, X86::PMULLWrm }, + { X86::PMULUDQrr, X86::PMULUDQrm }, + { X86::PORrr, X86::PORrm }, + { X86::PSADBWrr, X86::PSADBWrm }, + { X86::PSLLDrr, X86::PSLLDrm }, + { X86::PSLLQrr, X86::PSLLQrm }, + { X86::PSLLWrr, 
X86::PSLLWrm }, + { X86::PSRADrr, X86::PSRADrm }, + { X86::PSRAWrr, X86::PSRAWrm }, + { X86::PSRLDrr, X86::PSRLDrm }, + { X86::PSRLQrr, X86::PSRLQrm }, + { X86::PSRLWrr, X86::PSRLWrm }, + { X86::PSUBBrr, X86::PSUBBrm }, + { X86::PSUBDrr, X86::PSUBDrm }, + { X86::PSUBSBrr, X86::PSUBSBrm }, + { X86::PSUBSWrr, X86::PSUBSWrm }, + { X86::PSUBWrr, X86::PSUBWrm }, + { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm }, + { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm }, + { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm }, + { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm }, + { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm }, + { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm }, + { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm }, + { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm }, + { X86::PXORrr, X86::PXORrm }, + { X86::RCPPSr, X86::RCPPSm }, + { X86::RCPPSr_Int, X86::RCPPSm_Int }, + { X86::RSQRTPSr, X86::RSQRTPSm }, + { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int }, + { X86::RSQRTSSr, X86::RSQRTSSm }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int }, + { X86::SBB32rr, X86::SBB32rm }, + { X86::SBB64rr, X86::SBB64rm }, + { X86::SHUFPDrri, X86::SHUFPDrmi }, + { X86::SHUFPSrri, X86::SHUFPSrmi }, + { X86::SQRTPDr, X86::SQRTPDm }, + { X86::SQRTPDr_Int, X86::SQRTPDm_Int }, + { X86::SQRTPSr, X86::SQRTPSm }, + { X86::SQRTPSr_Int, X86::SQRTPSm_Int }, + { X86::SQRTSDr, X86::SQRTSDm }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int }, + { X86::SQRTSSr, X86::SQRTSSm }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int }, + { X86::SUB16rr, X86::SUB16rm }, + { X86::SUB32rr, X86::SUB32rm }, + { X86::SUB64rr, X86::SUB64rm }, + { X86::SUB8rr, X86::SUB8rm }, + { X86::SUBPDrr, X86::SUBPDrm }, + { X86::SUBPSrr, X86::SUBPSrm }, + { X86::SUBSDrr, X86::SUBSDrm }, + { X86::SUBSSrr, X86::SUBSSrm }, + // FIXME: TEST*rr -> swapped operand of TEST*mr. 
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm }, + { X86::UNPCKHPSrr, X86::UNPCKHPSrm }, + { X86::UNPCKLPDrr, X86::UNPCKLPDrm }, + { X86::UNPCKLPSrr, X86::UNPCKLPSrm }, + { X86::XOR16rr, X86::XOR16rm }, + { X86::XOR32rr, X86::XOR32rm }, + { X86::XOR64rr, X86::XOR64rm }, + { X86::XOR8rr, X86::XOR8rm }, + { X86::XORPDrr, X86::XORPDrm }, + { X86::XORPSrr, X86::XORPSrm } + }; + ASSERT_SORTED(OpcodeTable); + OpcodeTablePtr = OpcodeTable; + OpcodeTableSize = ARRAY_SIZE(OpcodeTable); + } + + // If table selected... + if (OpcodeTablePtr) { + // Find the Opcode to fuse + unsigned fromOpcode = MI->getOpcode(); + // Lookup fromOpcode in table + if (const TableEntry *Entry = TableLookup(OpcodeTablePtr, OpcodeTableSize, + fromOpcode)) { + if (isTwoAddrFold) + NewMI = FuseTwoAddrInst(Entry->to, FrameIndex, MI, TII); + else + NewMI = FuseInst(Entry->to, i, FrameIndex, MI, TII); + NewMI->copyKillDeadInfo(MI); + return NewMI; + } + } + + // No fusion + if (PrintFailedFusing) + cerr << "We failed to fuse (" + << ((i == 1) ? 
"r" : "s") << "): " << *MI; + return NULL; +} + + +const unsigned *X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) + const { + static const unsigned CalleeSavedRegs32Bit[] = { + X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0 + }; + + static const unsigned CalleeSavedRegs32EHRet[] = { + X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0 + }; + + static const unsigned CalleeSavedRegs64Bit[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + + if (Is64Bit) + return CalleeSavedRegs64Bit; + else { + if (MF) { + MachineFrameInfo *MFI = MF->getFrameInfo(); + MachineModuleInfo *MMI = MFI->getMachineModuleInfo(); + if (MMI && MMI->callsEHReturn()) + return CalleeSavedRegs32EHRet; + } + return CalleeSavedRegs32Bit; + } +} + +const TargetRegisterClass* const* +X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { + static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = { + &X86::GR32RegClass, &X86::GR32RegClass, + &X86::GR32RegClass, &X86::GR32RegClass, 0 + }; + static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = { + &X86::GR32RegClass, &X86::GR32RegClass, + &X86::GR32RegClass, &X86::GR32RegClass, + &X86::GR32RegClass, &X86::GR32RegClass, 0 + }; + static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = { + &X86::GR64RegClass, &X86::GR64RegClass, + &X86::GR64RegClass, &X86::GR64RegClass, + &X86::GR64RegClass, &X86::GR64RegClass, 0 + }; + + if (Is64Bit) + return CalleeSavedRegClasses64Bit; + else { + if (MF) { + MachineFrameInfo *MFI = MF->getFrameInfo(); + MachineModuleInfo *MMI = MFI->getMachineModuleInfo(); + if (MMI && MMI->callsEHReturn()) + return CalleeSavedRegClasses32EHRet; + } + return CalleeSavedRegClasses32Bit; + } + +} + +BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(X86::RSP); + Reserved.set(X86::ESP); + Reserved.set(X86::SP); + Reserved.set(X86::SPL); + if 
(hasFP(MF)) { + Reserved.set(X86::RBP); + Reserved.set(X86::EBP); + Reserved.set(X86::BP); + Reserved.set(X86::BPL); + } + return Reserved; +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +// hasFP - Return true if the specified function should have a dedicated frame +// pointer register. This is true if the function has variable sized allocas or +// if frame pointer elimination is disabled. +// +bool X86RegisterInfo::hasFP(const MachineFunction &MF) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo *MMI = MFI->getMachineModuleInfo(); + + return (NoFramePointerElim || + MF.getFrameInfo()->hasVarSizedObjects() || + MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + (MMI && MMI->callsUnwindInit())); +} + +void X86RegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + if (hasFP(MF)) { + // If we have a frame pointer, turn the adjcallstackup instruction into a + // 'sub ESP, <amt>' and the adjcallstackdown instruction into 'add ESP, + // <amt>' + MachineInstr *Old = I; + uint64_t Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + MachineInstr *New = 0; + if (Old->getOpcode() == X86::ADJCALLSTACKDOWN) { + New=BuildMI(TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), StackPtr) + .addReg(StackPtr).addImm(Amount); + } else { + assert(Old->getOpcode() == X86::ADJCALLSTACKUP); + // factor out the amount the callee already popped. 
+ uint64_t CalleeAmt = Old->getOperand(1).getImm(); + Amount -= CalleeAmt; + if (Amount) { + unsigned Opc = (Amount < 128) ? + (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) : + (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri); + New = BuildMI(TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(Amount); + } + } + + // Replace the pseudo instruction with a new instruction... + if (New) MBB.insert(I, New); + } + } else if (I->getOpcode() == X86::ADJCALLSTACKUP) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. + if (uint64_t CalleeAmt = I->getOperand(1).getImm()) { + unsigned Opc = (CalleeAmt < 128) ? + (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) : + (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri); + MachineInstr *New = + BuildMI(TII.get(Opc), StackPtr).addReg(StackPtr).addImm(CalleeAmt); + MBB.insert(I, New); + } + } + + MBB.erase(I); +} + +void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const{ + assert(SPAdj == 0 && "Unexpected"); + + unsigned i = 0; + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + while (!MI.getOperand(i).isFrameIndex()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + int FrameIndex = MI.getOperand(i).getFrameIndex(); + // This must be part of a four operand memory reference. Replace the + // FrameIndex with base register with EBP. Add an offset to the offset. + MI.getOperand(i).ChangeToRegister(hasFP(MF) ? FramePtr : StackPtr, false); + + // Now add the frame object offset to the offset from EBP. 
+ int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + + MI.getOperand(i+3).getImm()+SlotSize; + + if (!hasFP(MF)) + Offset += MF.getFrameInfo()->getStackSize(); + else + Offset += SlotSize; // Skip the saved EBP + + MI.getOperand(i+3).ChangeToImmediate(Offset); +} + +void +X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{ + if (hasFP(MF)) { + // Create a frame entry for the EBP register that must be saved. + int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, + (int)SlotSize * -2); + assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() && + "Slot for EBP register must be last in order to be found!"); + } +} + +/// emitSPUpdate - Emit a series of instructions to increment / decrement the +/// stack pointer by a constant value. +static +void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, int64_t NumBytes, bool Is64Bit, + const TargetInstrInfo &TII) { + bool isSub = NumBytes < 0; + uint64_t Offset = isSub ? -NumBytes : NumBytes; + unsigned Opc = isSub + ? ((Offset < 128) ? + (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) : + (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri)) + : ((Offset < 128) ? + (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) : + (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri)); + uint64_t Chunk = (1LL << 31) - 1; + + while (Offset) { + uint64_t ThisVal = (Offset > Chunk) ? 
Chunk : Offset; + BuildMI(MBB, MBBI, TII.get(Opc), StackPtr).addReg(StackPtr).addImm(ThisVal); + Offset -= ThisVal; + } +} + +void X86RegisterInfo::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment(); + const Function* Fn = MF.getFunction(); + const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>(); + MachineModuleInfo *MMI = MFI->getMachineModuleInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + + // Prepare for frame info. + unsigned FrameLabelId = 0, StartLabelId = 0; + + // Get the number of bytes to allocate from the FrameInfo + uint64_t StackSize = MFI->getStackSize(); + uint64_t NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); + + if (MMI && MMI->needsFrameInfo()) { + // Mark function start + StartLabelId = MMI->NextLabelID(); + BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(StartLabelId); + } + + if (hasFP(MF)) { + // Get the offset of the stack slot for the EBP register... which is + // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. + // Update the frame offset adjustment. + MFI->setOffsetAdjustment(SlotSize-NumBytes); + + // Save EBP into the appropriate stack slot... + BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + .addReg(FramePtr); + NumBytes -= SlotSize; + + if (MMI && MMI->needsFrameInfo()) { + // Mark effective beginning of when frame pointer becomes valid. + FrameLabelId = MMI->NextLabelID(); + BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(FrameLabelId); + } + + // Update EBP with the new base value... + BuildMI(MBB, MBBI, TII.get(Is64Bit ? 
X86::MOV64rr : X86::MOV32rr), FramePtr) + .addReg(StackPtr); + } + + unsigned ReadyLabelId = 0; + if (MMI && MMI->needsFrameInfo()) { + // Mark effective beginning of when frame pointer is ready. + ReadyLabelId = MMI->NextLabelID(); + BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(ReadyLabelId); + } + + // Skip the callee-saved push instructions. + while (MBBI != MBB.end() && + (MBBI->getOpcode() == X86::PUSH32r || + MBBI->getOpcode() == X86::PUSH64r)) + ++MBBI; + + if (NumBytes) { // adjust stack pointer: ESP -= numbytes + if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) { + // Check, whether EAX is livein for this function + bool isEAXAlive = false; + for (MachineFunction::livein_iterator II = MF.livein_begin(), + EE = MF.livein_end(); (II != EE) && !isEAXAlive; ++II) { + unsigned Reg = II->first; + isEAXAlive = (Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL); + } + + // Function prologue calls _alloca to probe the stack when allocating + // more than 4k bytes in one go. Touching the stack at 4K increments is + // necessary to ensure that the guard pages used by the OS virtual memory + // manager are allocated in correct sequence. + if (!isEAXAlive) { + BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes); + BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("_alloca"); + } else { + // Save EAX + BuildMI(MBB, MBBI, TII.get(X86::PUSH32r), X86::EAX); + // Allocate NumBytes-4 bytes on stack. We'll also use 4 already + // allocated bytes for EAX. + BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4); + BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("_alloca"); + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(TII.get(X86::MOV32rm),X86::EAX), + StackPtr, NumBytes-4); + MBB.insert(MBBI, MI); + } + } else { + // If there is an ADD32ri or SUB32ri of ESP immediately after this + // instruction, merge the two instructions. 
+ if (MBBI != MBB.end()) { + MachineBasicBlock::iterator NI = next(MBBI); + unsigned Opc = MBBI->getOpcode(); + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + MBBI->getOperand(0).getReg() == StackPtr) { + NumBytes -= MBBI->getOperand(2).getImm(); + MBB.erase(MBBI); + MBBI = NI; + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + MBBI->getOperand(0).getReg() == StackPtr) { + NumBytes += MBBI->getOperand(2).getImm(); + MBB.erase(MBBI); + MBBI = NI; + } + } + + if (NumBytes) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII); + } + } + + if (MMI && MMI->needsFrameInfo()) { + std::vector<MachineMove> &Moves = MMI->getFrameMoves(); + const TargetAsmInfo *TAI = MF.getTarget().getTargetAsmInfo(); + + // Calculate amount of bytes used for return address storing + int stackGrowth = + (MF.getTarget().getFrameInfo()->getStackGrowthDirection() == + TargetFrameInfo::StackGrowsUp ? + TAI->getAddressSize() : -TAI->getAddressSize()); + + if (StackSize) { + // Show update of SP. + if (hasFP(MF)) { + // Adjust SP + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth); + Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc)); + } else { + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, -StackSize+stackGrowth); + Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc)); + } + } else { + //FIXME: Verify & implement for FP + MachineLocation SPDst(StackPtr); + MachineLocation SPSrc(StackPtr, stackGrowth); + Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc)); + } + + // Add callee saved registers to move list. 
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); + unsigned Reg = CSI[I].getReg(); + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); + MachineLocation CSSrc(Reg); + Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc)); + } + + if (hasFP(MF)) { + // Save FP + MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth); + MachineLocation FPSrc(FramePtr); + Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc)); + } + + MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr); + MachineLocation FPSrc(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc)); + } + + // If it's main() on Cygwin\Mingw32 we should align stack as well + if (Fn->hasExternalLinkage() && Fn->getName() == "main" && + Subtarget->isTargetCygMing()) { + BuildMI(MBB, MBBI, TII.get(X86::AND32ri), X86::ESP) + .addReg(X86::ESP).addImm(-Align); + + // Probe the stack + BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(Align); + BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32)).addExternalSymbol("_alloca"); + } +} + +void X86RegisterInfo::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + unsigned RetOpcode = MBBI->getOpcode(); + + switch (RetOpcode) { + case X86::RET: + case X86::RETI: + case X86::EH_RETURN: + case X86::TAILJMPd: + case X86::TAILJMPr: + case X86::TAILJMPm: break; // These are ok + default: + assert(0 && "Can only insert epilog into returning blocks"); + } + + // Get the number of bytes to allocate from the FrameInfo + uint64_t StackSize = MFI->getStackSize(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t NumBytes = StackSize - CSSize; + + if (hasFP(MF)) { + // pop EBP. 
+ BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr); + NumBytes -= SlotSize; + } + + // Skip the callee-saved pop instructions. + while (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PI = prior(MBBI); + if (PI->getOpcode() != X86::POP32r && PI->getOpcode() != X86::POP64r) + break; + --MBBI; + } + + // If dynamic alloca is used, then reset esp to point to the last + // callee-saved slot before popping them off! + if (MFI->hasVarSizedObjects()) { + unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r; + MachineInstr *MI = addRegOffset(BuildMI(TII.get(Opc), StackPtr), + FramePtr, -CSSize); + MBB.insert(MBBI, MI); + NumBytes = 0; + } + + if (NumBytes) { // adjust stack pointer back: ESP += numbytes + // If there is an ADD32ri or SUB32ri of ESP immediately before this + // instruction, merge the two instructions. + if (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PI = prior(MBBI); + unsigned Opc = PI->getOpcode(); + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + NumBytes += PI->getOperand(2).getImm(); + MBB.erase(PI); + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + NumBytes -= PI->getOperand(2).getImm(); + MBB.erase(PI); + } + } + + if (NumBytes) + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII); + } + + // We're returning from function via eh_return. + if (RetOpcode == X86::EH_RETURN) { + MBBI = prior(MBB.end()); + MachineOperand &DestAddr = MBBI->getOperand(0); + assert(DestAddr.isReg() && "Offset should be in register!"); + BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr). 
+ addReg(DestAddr.getReg()); + } +} + +unsigned X86RegisterInfo::getRARegister() const { + if (Is64Bit) + return X86::RIP; // Should have dwarf #16 + else + return X86::EIP; // Should have dwarf #8 +} + +unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const { + return hasFP(MF) ? FramePtr : StackPtr; +} + +void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) + const { + // Calculate amount of bytes used for return address storing + int stackGrowth = (Is64Bit ? -8 : -4); + + // Initial state of the frame pointer is esp+4. + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(StackPtr, stackGrowth); + Moves.push_back(MachineMove(0, Dst, Src)); + + // Add return address to move list + MachineLocation CSDst(StackPtr, stackGrowth); + MachineLocation CSSrc(getRARegister()); + Moves.push_back(MachineMove(0, CSDst, CSSrc)); +} + +unsigned X86RegisterInfo::getEHExceptionRegister() const { + assert(0 && "What is the exception register"); + return 0; +} + +unsigned X86RegisterInfo::getEHHandlerRegister() const { + assert(0 && "What is the exception handler register"); + return 0; +} + +namespace llvm { +unsigned getX86SubSuperRegister(unsigned Reg, MVT::ValueType VT, bool High) { + switch (VT) { + default: return Reg; + case MVT::i8: + if (High) { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case 
X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case MVT::i16: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: 
case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case MVT::i32: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case 
MVT::i64: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } + + return Reg; +} +} + +#include "X86GenRegisterInfo.inc" + diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h new file mode 100644 index 0000000..ab9e33f --- /dev/null +++ b/lib/Target/X86/X86RegisterInfo.h @@ -0,0 +1,130 @@ +//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86REGISTERINFO_H +#define X86REGISTERINFO_H + +#include "llvm/Target/MRegisterInfo.h" +#include "X86GenRegisterInfo.h.inc" + +namespace llvm { + class Type; + class TargetInstrInfo; + class X86TargetMachine; + +class X86RegisterInfo : public X86GenRegisterInfo { +public: + X86TargetMachine &TM; + const TargetInstrInfo &TII; + +private: + /// Is64Bit - Is the target 64-bits. + bool Is64Bit; + + /// SlotSize - Stack slot size in bytes. + unsigned SlotSize; + + /// StackPtr - X86 physical register used as stack ptr. + unsigned StackPtr; + + /// FramePtr - X86 physical register used as frame ptr. + unsigned FramePtr; + +public: + X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); + + /// Code Generation virtual methods... 
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI) const; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC) const; + + void copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *RC) const; + + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, const MachineInstr *Orig) const; + + /// foldMemoryOperand - If this target supports it, fold a load or store of + /// the specified stack slot into the specified machine instruction for the + /// specified operand. If this is possible, the target should perform the + /// folding and return true, otherwise it should return false. If it folds + /// the instruction, it is likely that the MachineInstruction the iterator + /// references has been changed. + MachineInstr* foldMemoryOperand(MachineInstr* MI, + unsigned OpNum, + int FrameIndex) const; + + /// getCalleeSavedRegs - Return a null-terminated list of all of the + /// callee-save registers on this target. + const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + + /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred + /// register classes to spill each callee-saved register with. The order and + /// length of this list match the getCalleeSavedRegs() list. 
+ const TargetRegisterClass* const* getCalleeSavedRegClasses( + const MachineFunction *MF = 0) const; + + /// getReservedRegs - Returns a bitset indexed by physical register number + /// indicating if a register is a special register that has particular uses and + /// should be considered unavailable at all times, e.g. SP, RA. This is used by + /// register scavenger to determine what registers are free. + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, RegScavenger *RS = NULL) const; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; + + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + // Debug information queries. + unsigned getRARegister() const; + unsigned getFrameRegister(MachineFunction &MF) const; + void getInitialFrameState(std::vector<MachineMove> &Moves) const; + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +// getX86SubSuperRegister - X86 utility function. It returns the sub or super +// register of a specific X86 register. +// e.g. 
getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX +unsigned getX86SubSuperRegister(unsigned, MVT::ValueType, bool High=false); + +} // End llvm namespace + +#endif diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td new file mode 100644 index 0000000..a1e7bb9 --- /dev/null +++ b/lib/Target/X86/X86RegisterInfo.td @@ -0,0 +1,468 @@ +//===- X86RegisterInfo.td - Describe the X86 Register File ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 Register file, defining the registers themselves, +// aliases between the registers, and the register classes built out of the +// registers. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Register definitions... +// +let Namespace = "X86" in { + + // In the register alias definitions below, we define which registers alias + // which others. We only specify which registers the small registers alias, + // because the register file generator is smart enough to figure out that + // AL aliases AX if we tell it that AX aliased AL (for example). + + // FIXME: X86-64 have different Dwarf numbers. 
+ // 8-bit registers + // Low registers + def AL : Register<"AL">, DwarfRegNum<0>; + def CL : Register<"CL">, DwarfRegNum<1>; + def DL : Register<"DL">, DwarfRegNum<2>; + def BL : Register<"BL">, DwarfRegNum<3>; + + // X86-64 only + def SIL : Register<"SIL">, DwarfRegNum<4>; + def DIL : Register<"DIL">, DwarfRegNum<5>; + def BPL : Register<"BPL">, DwarfRegNum<6>; + def SPL : Register<"SPL">, DwarfRegNum<7>; + def R8B : Register<"R8B">, DwarfRegNum<8>; + def R9B : Register<"R9B">, DwarfRegNum<9>; + def R10B : Register<"R10B">, DwarfRegNum<10>; + def R11B : Register<"R11B">, DwarfRegNum<11>; + def R12B : Register<"R12B">, DwarfRegNum<12>; + def R13B : Register<"R13B">, DwarfRegNum<13>; + def R14B : Register<"R14B">, DwarfRegNum<14>; + def R15B : Register<"R15B">, DwarfRegNum<15>; + + // High registers X86-32 only + def AH : Register<"AH">, DwarfRegNum<0>; + def CH : Register<"CH">, DwarfRegNum<1>; + def DH : Register<"DH">, DwarfRegNum<2>; + def BH : Register<"BH">, DwarfRegNum<3>; + + // 16-bit registers + def AX : RegisterWithSubRegs<"AX", [AH,AL]>, DwarfRegNum<0>; + def CX : RegisterWithSubRegs<"CX", [CH,CL]>, DwarfRegNum<1>; + def DX : RegisterWithSubRegs<"DX", [DH,DL]>, DwarfRegNum<2>; + def BX : RegisterWithSubRegs<"BX", [BH,BL]>, DwarfRegNum<3>; + def SP : RegisterWithSubRegs<"SP", [SPL]>, DwarfRegNum<4>; + def BP : RegisterWithSubRegs<"BP", [BPL]>, DwarfRegNum<5>; + def SI : RegisterWithSubRegs<"SI", [SIL]>, DwarfRegNum<6>; + def DI : RegisterWithSubRegs<"DI", [DIL]>, DwarfRegNum<7>; + def IP : Register<"IP">, DwarfRegNum<8>; + + // X86-64 only + def R8W : RegisterWithSubRegs<"R8W", [R8B]>, DwarfRegNum<8>; + def R9W : RegisterWithSubRegs<"R9W", [R9B]>, DwarfRegNum<9>; + def R10W : RegisterWithSubRegs<"R10W", [R10B]>, DwarfRegNum<10>; + def R11W : RegisterWithSubRegs<"R11W", [R11B]>, DwarfRegNum<11>; + def R12W : RegisterWithSubRegs<"R12W", [R12B]>, DwarfRegNum<12>; + def R13W : RegisterWithSubRegs<"R13W", [R13B]>, DwarfRegNum<13>; + def R14W : 
RegisterWithSubRegs<"R14W", [R14B]>, DwarfRegNum<14>; + def R15W : RegisterWithSubRegs<"R15W", [R15B]>, DwarfRegNum<15>; + + // 32-bit registers + def EAX : RegisterWithSubRegs<"EAX", [AX]>, DwarfRegNum<0>; + def ECX : RegisterWithSubRegs<"ECX", [CX]>, DwarfRegNum<1>; + def EDX : RegisterWithSubRegs<"EDX", [DX]>, DwarfRegNum<2>; + def EBX : RegisterWithSubRegs<"EBX", [BX]>, DwarfRegNum<3>; + def ESP : RegisterWithSubRegs<"ESP", [SP]>, DwarfRegNum<4>; + def EBP : RegisterWithSubRegs<"EBP", [BP]>, DwarfRegNum<5>; + def ESI : RegisterWithSubRegs<"ESI", [SI]>, DwarfRegNum<6>; + def EDI : RegisterWithSubRegs<"EDI", [DI]>, DwarfRegNum<7>; + def EIP : RegisterWithSubRegs<"EIP", [IP]>, DwarfRegNum<8>; + + // X86-64 only + def R8D : RegisterWithSubRegs<"R8D", [R8W]>, DwarfRegNum<8>; + def R9D : RegisterWithSubRegs<"R9D", [R9W]>, DwarfRegNum<9>; + def R10D : RegisterWithSubRegs<"R10D", [R10W]>, DwarfRegNum<10>; + def R11D : RegisterWithSubRegs<"R11D", [R11W]>, DwarfRegNum<11>; + def R12D : RegisterWithSubRegs<"R12D", [R12W]>, DwarfRegNum<12>; + def R13D : RegisterWithSubRegs<"R13D", [R13W]>, DwarfRegNum<13>; + def R14D : RegisterWithSubRegs<"R14D", [R14W]>, DwarfRegNum<14>; + def R15D : RegisterWithSubRegs<"R15D", [R15W]>, DwarfRegNum<15>; + + // 64-bit registers, X86-64 only + def RAX : RegisterWithSubRegs<"RAX", [EAX]>, DwarfRegNum<0>; + def RDX : RegisterWithSubRegs<"RDX", [EDX]>, DwarfRegNum<1>; + def RCX : RegisterWithSubRegs<"RCX", [ECX]>, DwarfRegNum<2>; + def RBX : RegisterWithSubRegs<"RBX", [EBX]>, DwarfRegNum<3>; + def RSI : RegisterWithSubRegs<"RSI", [ESI]>, DwarfRegNum<4>; + def RDI : RegisterWithSubRegs<"RDI", [EDI]>, DwarfRegNum<5>; + def RBP : RegisterWithSubRegs<"RBP", [EBP]>, DwarfRegNum<6>; + def RSP : RegisterWithSubRegs<"RSP", [ESP]>, DwarfRegNum<7>; + + def R8 : RegisterWithSubRegs<"R8", [R8D]>, DwarfRegNum<8>; + def R9 : RegisterWithSubRegs<"R9", [R9D]>, DwarfRegNum<9>; + def R10 : RegisterWithSubRegs<"R10", [R10D]>, DwarfRegNum<10>; + def R11 : 
RegisterWithSubRegs<"R11", [R11D]>, DwarfRegNum<11>; + def R12 : RegisterWithSubRegs<"R12", [R12D]>, DwarfRegNum<12>; + def R13 : RegisterWithSubRegs<"R13", [R13D]>, DwarfRegNum<13>; + def R14 : RegisterWithSubRegs<"R14", [R14D]>, DwarfRegNum<14>; + def R15 : RegisterWithSubRegs<"R15", [R15D]>, DwarfRegNum<15>; + def RIP : RegisterWithSubRegs<"RIP", [EIP]>, DwarfRegNum<16>; + + // MMX Registers. These are actually aliased to ST0 .. ST7 + def MM0 : Register<"MM0">, DwarfRegNum<29>; + def MM1 : Register<"MM1">, DwarfRegNum<30>; + def MM2 : Register<"MM2">, DwarfRegNum<31>; + def MM3 : Register<"MM3">, DwarfRegNum<32>; + def MM4 : Register<"MM4">, DwarfRegNum<33>; + def MM5 : Register<"MM5">, DwarfRegNum<34>; + def MM6 : Register<"MM6">, DwarfRegNum<35>; + def MM7 : Register<"MM7">, DwarfRegNum<36>; + + // Pseudo Floating Point registers + def FP0 : Register<"FP0">, DwarfRegNum<-1>; + def FP1 : Register<"FP1">, DwarfRegNum<-1>; + def FP2 : Register<"FP2">, DwarfRegNum<-1>; + def FP3 : Register<"FP3">, DwarfRegNum<-1>; + def FP4 : Register<"FP4">, DwarfRegNum<-1>; + def FP5 : Register<"FP5">, DwarfRegNum<-1>; + def FP6 : Register<"FP6">, DwarfRegNum<-1>; + + // XMM Registers, used by the various SSE instruction set extensions + def XMM0: Register<"XMM0">, DwarfRegNum<17>; + def XMM1: Register<"XMM1">, DwarfRegNum<18>; + def XMM2: Register<"XMM2">, DwarfRegNum<19>; + def XMM3: Register<"XMM3">, DwarfRegNum<20>; + def XMM4: Register<"XMM4">, DwarfRegNum<21>; + def XMM5: Register<"XMM5">, DwarfRegNum<22>; + def XMM6: Register<"XMM6">, DwarfRegNum<23>; + def XMM7: Register<"XMM7">, DwarfRegNum<24>; + + // X86-64 only + def XMM8: Register<"XMM8">, DwarfRegNum<25>; + def XMM9: Register<"XMM9">, DwarfRegNum<26>; + def XMM10: Register<"XMM10">, DwarfRegNum<27>; + def XMM11: Register<"XMM11">, DwarfRegNum<28>; + def XMM12: Register<"XMM12">, DwarfRegNum<29>; + def XMM13: Register<"XMM13">, DwarfRegNum<30>; + def XMM14: Register<"XMM14">, DwarfRegNum<31>; + def XMM15: 
Register<"XMM15">, DwarfRegNum<32>; + + // Floating point stack registers + def ST0 : Register<"ST(0)">, DwarfRegNum<11>; + def ST1 : Register<"ST(1)">, DwarfRegNum<12>; + def ST2 : Register<"ST(2)">, DwarfRegNum<13>; + def ST3 : Register<"ST(3)">, DwarfRegNum<14>; + def ST4 : Register<"ST(4)">, DwarfRegNum<15>; + def ST5 : Register<"ST(5)">, DwarfRegNum<16>; + def ST6 : Register<"ST(6)">, DwarfRegNum<17>; + def ST7 : Register<"ST(7)">, DwarfRegNum<18>; +} + +//===----------------------------------------------------------------------===// +// Register Class Definitions... now that we have all of the pieces, define the +// top-level register classes. The order specified in the register list is +// implicitly defined to be the register allocation order. +// + +// List call-clobbered registers before callee-save registers. RBX, RBP, (and +// R12, R13, R14, and R15 for X86-64) are callee-save registers. +// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and +// R8B, ... R15B. +// FIXME: Allow AH, CH, DH, BH in 64-mode for non-REX instructions, +def GR8 : RegisterClass<"X86", [i8], 8, + [AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL, + R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // Does the function dedicate RBP / EBP to being a frame ptr? + // If so, don't allocate SPL or BPL. + static const unsigned X86_GR8_AO_64_fp[] = + {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, + X86::R8B, X86::R9B, X86::R10B, X86::R11B, + X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B}; + // If not, just don't allocate SPL. 
+ static const unsigned X86_GR8_AO_64[] = + {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL, + X86::R8B, X86::R9B, X86::R10B, X86::R11B, + X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL}; + // In 32-mode, none of the 8-bit registers aliases EBP or ESP. + static const unsigned X86_GR8_AO_32[] = + {X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH}; + + GR8Class::iterator + GR8Class::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (!Subtarget.is64Bit()) + return X86_GR8_AO_32; + else if (RI->hasFP(MF)) + return X86_GR8_AO_64_fp; + else + return X86_GR8_AO_64; + } + + GR8Class::iterator + GR8Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (!Subtarget.is64Bit()) + return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned)); + else if (RI->hasFP(MF)) + return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned)); + else + return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned)); + } + }]; +} + + +def GR16 : RegisterClass<"X86", [i16], 16, + [AX, CX, DX, SI, DI, BX, BP, SP, + R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // Does the function dedicate RBP / EBP to being a frame ptr? + // If so, don't allocate SP or BP. 
+ static const unsigned X86_GR16_AO_64_fp[] = + {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, + X86::R8W, X86::R9W, X86::R10W, X86::R11W, + X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W}; + static const unsigned X86_GR16_AO_32_fp[] = + {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX}; + // If not, just don't allocate SP. + static const unsigned X86_GR16_AO_64[] = + {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, + X86::R8W, X86::R9W, X86::R10W, X86::R11W, + X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP}; + static const unsigned X86_GR16_AO_32[] = + {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP}; + + GR16Class::iterator + GR16Class::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (Subtarget.is64Bit()) { + if (RI->hasFP(MF)) + return X86_GR16_AO_64_fp; + else + return X86_GR16_AO_64; + } else { + if (RI->hasFP(MF)) + return X86_GR16_AO_32_fp; + else + return X86_GR16_AO_32; + } + } + + GR16Class::iterator + GR16Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (Subtarget.is64Bit()) { + if (RI->hasFP(MF)) + return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned)); + else + return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned)); + } else { + if (RI->hasFP(MF)) + return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned)); + else + return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned)); + } + } + }]; +} + + +def GR32 : RegisterClass<"X86", [i32], 32, + [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]> { + let MethodProtos = [{ + iterator allocation_order_begin(const MachineFunction &MF) 
const; + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + // Does the function dedicate RBP / EBP to being a frame ptr? + // If so, don't allocate ESP or EBP. + static const unsigned X86_GR32_AO_64_fp[] = + {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, + X86::R8D, X86::R9D, X86::R10D, X86::R11D, + X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D}; + static const unsigned X86_GR32_AO_32_fp[] = + {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX}; + // If not, just don't allocate ESP. + static const unsigned X86_GR32_AO_64[] = + {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, + X86::R8D, X86::R9D, X86::R10D, X86::R11D, + X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP}; + static const unsigned X86_GR32_AO_32[] = + {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP}; + + GR32Class::iterator + GR32Class::allocation_order_begin(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (Subtarget.is64Bit()) { + if (RI->hasFP(MF)) + return X86_GR32_AO_64_fp; + else + return X86_GR32_AO_64; + } else { + if (RI->hasFP(MF)) + return X86_GR32_AO_32_fp; + else + return X86_GR32_AO_32; + } + } + + GR32Class::iterator + GR32Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (Subtarget.is64Bit()) { + if (RI->hasFP(MF)) + return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned)); + else + return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned)); + } else { + if (RI->hasFP(MF)) + return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned)); + else + return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned)); + } + } + }]; +} 
+ + +def GR64 : RegisterClass<"X86", [i64], 64, + [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP]> { + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + GR64Class::iterator + GR64Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const MRegisterInfo *RI = TM.getRegisterInfo(); + if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr? + return end()-2; // If so, don't allocate RSP or RBP + else + return end()-1; // If not, just don't allocate RSP + } + }]; +} + + +// GR16, GR32 subclasses which contain registers that have R8 sub-registers. +// These should only be used for 32-bit mode. +def GR16_ : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]>; +def GR32_ : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]>; + +// Scalar SSE2 floating point registers. +def FR32 : RegisterClass<"X86", [f32], 32, + [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, + XMM12, XMM13, XMM14, XMM15]> { + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + FR32Class::iterator + FR32Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (!Subtarget.is64Bit()) + return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. 
+ else + return end(); + } + }]; +} + +def FR64 : RegisterClass<"X86", [f64], 64, + [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, + XMM12, XMM13, XMM14, XMM15]> { + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + FR64Class::iterator + FR64Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (!Subtarget.is64Bit()) + return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. + else + return end(); + } + }]; +} + + +// FIXME: This sets up the floating point register files as though they are f64 +// values, though they really are f80 values. This will cause us to spill +// values as 64-bit quantities instead of 80-bit quantities, which is much much +// faster on common hardware. In reality, this should be controlled by a +// command line option or something. + +def RFP32 : RegisterClass<"X86", [f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; +def RFP64 : RegisterClass<"X86", [f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>; + +// Floating point stack registers (these are not allocatable by the +// register allocator - the floating point stackifier is responsible +// for transforming FPn allocations to STn registers) +def RST : RegisterClass<"X86", [f64], 32, + [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> { + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + RSTClass::iterator + RSTClass::allocation_order_end(const MachineFunction &MF) const { + return begin(); + } + }]; +} + +// Generic vector registers: VR64 and VR128. 
+def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64], 64, + [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>; +def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128, + [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, + XMM12, XMM13, XMM14, XMM15]> { + let MethodProtos = [{ + iterator allocation_order_end(const MachineFunction &MF) const; + }]; + let MethodBodies = [{ + VR128Class::iterator + VR128Class::allocation_order_end(const MachineFunction &MF) const { + const TargetMachine &TM = MF.getTarget(); + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); + if (!Subtarget.is64Bit()) + return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode. + else + return end(); + } + }]; +} diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h new file mode 100644 index 0000000..3dd2b24 --- /dev/null +++ b/lib/Target/X86/X86Relocations.h @@ -0,0 +1,34 @@ +//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 target-specific relocation types. +// +//===----------------------------------------------------------------------===// + +#ifndef X86RELOCATIONS_H +#define X86RELOCATIONS_H + +#include "llvm/CodeGen/MachineRelocation.h" + +namespace llvm { + namespace X86 { + enum RelocationType { + // reloc_pcrel_word - PC relative relocation, add the relocated value to + // the value already in memory, after we adjust it for where the PC is. + reloc_pcrel_word = 0, + + // reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just + // add the relocated value to the value already in memory. 
+ reloc_absolute_word = 1, + reloc_absolute_dword = 2 + }; + } +} + +#endif diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp new file mode 100644 index 0000000..1a75e04 --- /dev/null +++ b/lib/Target/X86/X86Subtarget.cpp @@ -0,0 +1,293 @@ +//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Nate Begeman and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86 specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "X86Subtarget.h" +#include "X86GenSubtarget.inc" +#include "llvm/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +cl::opt<X86Subtarget::AsmWriterFlavorTy> +AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset), + cl::desc("Choose style of code to emit from X86 backend:"), + cl::values( + clEnumValN(X86Subtarget::ATT, "att", " Emit AT&T-style assembly"), + clEnumValN(X86Subtarget::Intel, "intel", " Emit Intel-style assembly"), + clEnumValEnd)); + + +/// True if accessing the GV requires an extra load. For Windows, dllimported +/// symbols are indirect, loading the value at address GV rather than the +/// value of GV itself. This means that the GlobalAddress must be in the base +/// or index register of the address, not the GV offset field. 
+bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV, + const TargetMachine& TM, + bool isDirectCall) const +{ + // FIXME: PIC + if (TM.getRelocationModel() != Reloc::Static) + if (isTargetDarwin()) { + return (!isDirectCall && + (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() || + (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode()))); + } else if (TM.getRelocationModel() == Reloc::PIC_ && isPICStyleGOT()) { + // Extra load is needed for all non-statics. + return (!isDirectCall && + (GV->isDeclaration() || !GV->hasInternalLinkage())); + } else if (isTargetCygMing() || isTargetWindows()) { + return (GV->hasDLLImportLinkage()); + } + + return false; +} + +/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the +/// specified arguments. If we can't run cpuid on the host, return true. +bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, + unsigned *rECX, unsigned *rEDX) { +#if defined(__x86_64__) + // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually. 
+ asm ("movq\t%%rbx, %%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx, %%rsi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value)); + return false; +#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) +#if defined(__GNUC__) + asm ("movl\t%%ebx, %%esi\n\t" + "cpuid\n\t" + "xchgl\t%%ebx, %%esi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value)); + return false; +#elif defined(_MSC_VER) + __asm { + mov eax,value + cpuid + mov esi,rEAX + mov dword ptr [esi],eax + mov esi,rEBX + mov dword ptr [esi],ebx + mov esi,rECX + mov dword ptr [esi],ecx + mov esi,rEDX + mov dword ptr [esi],edx + } + return false; +#endif +#endif + return true; +} + +void X86Subtarget::AutoDetectSubtargetFeatures() { + unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; + union { + unsigned u[3]; + char c[12]; + } text; + + if (X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) + return; + + X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX); + + if ((EDX >> 23) & 0x1) X86SSELevel = MMX; + if ((EDX >> 25) & 0x1) X86SSELevel = SSE1; + if ((EDX >> 26) & 0x1) X86SSELevel = SSE2; + if (ECX & 0x1) X86SSELevel = SSE3; + if ((ECX >> 9) & 0x1) X86SSELevel = SSSE3; + + if (memcmp(text.c, "GenuineIntel", 12) == 0 || + memcmp(text.c, "AuthenticAMD", 12) == 0) { + X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); + HasX86_64 = (EDX >> 29) & 0x1; + } +} + +static const char *GetCurrentX86CPU() { + unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; + if (X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX)) + return "generic"; + unsigned Family = (EAX >> 8) & 0xf; // Bits 8 - 11 + unsigned Model = (EAX >> 4) & 0xf; // Bits 4 - 7 + X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); + bool Em64T = (EDX >> 29) & 0x1; + + union { + unsigned u[3]; + char c[12]; + } text; + + X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1); + if (memcmp(text.c, "GenuineIntel", 12) == 0) { + switch (Family) { + case 3: + return 
"i386"; + case 4: + return "i486"; + case 5: + switch (Model) { + case 4: return "pentium-mmx"; + default: return "pentium"; + } + case 6: + switch (Model) { + case 1: return "pentiumpro"; + case 3: + case 5: + case 6: return "pentium2"; + case 7: + case 8: + case 10: + case 11: return "pentium3"; + case 9: + case 13: return "pentium-m"; + case 14: return "yonah"; + case 15: return "core2"; + default: return "i686"; + } + case 15: { + switch (Model) { + case 3: + case 4: + return (Em64T) ? "nocona" : "prescott"; + default: + return (Em64T) ? "x86-64" : "pentium4"; + } + } + + default: + return "generic"; + } + } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) { + // FIXME: this poorly matches the generated SubtargetFeatureKV table. There + // appears to be no way to generate the wide variety of AMD-specific targets + // from the information returned from CPUID. + switch (Family) { + case 4: + return "i486"; + case 5: + switch (Model) { + case 6: + case 7: return "k6"; + case 8: return "k6-2"; + case 9: + case 13: return "k6-3"; + default: return "pentium"; + } + case 6: + switch (Model) { + case 4: return "athlon-tbird"; + case 6: + case 7: + case 8: return "athlon-mp"; + case 10: return "athlon-xp"; + default: return "athlon"; + } + case 15: + switch (Model) { + case 1: return "opteron"; + case 5: return "athlon-fx"; // also opteron + default: return "athlon64"; + } + default: + return "generic"; + } + } else { + return "generic"; + } +} + +X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit) + : AsmFlavor(AsmWriterFlavor) + , PICStyle(PICStyle::None) + , X86SSELevel(NoMMXSSE) + , HasX86_64(false) + , stackAlignment(8) + // FIXME: this is a known good value for Yonah. How about others? + , MinRepStrSizeThreshold(128) + , Is64Bit(is64Bit) + , TargetType(isELF) { // Default to ELF unless otherwise specified. 
+ + // Determine default and user specified characteristics + if (!FS.empty()) { + // If feature string is not empty, parse features string. + std::string CPU = GetCurrentX86CPU(); + ParseSubtargetFeatures(FS, CPU); + + if (Is64Bit && !HasX86_64) + cerr << "Warning: Generation of 64-bit code for a 32-bit processor " + << "requested.\n"; + if (Is64Bit && X86SSELevel < SSE2) + cerr << "Warning: 64-bit processors all have at least SSE2.\n"; + } else { + // Otherwise, use CPUID to auto-detect feature set. + AutoDetectSubtargetFeatures(); + } + + // If requesting codegen for X86-64, make sure that 64-bit and SSE2 features + // are enabled. These are available on all x86-64 CPUs. + if (Is64Bit) { + HasX86_64 = true; + if (X86SSELevel < SSE2) + X86SSELevel = SSE2; + } + + // Set the boolean corresponding to the current target triple, or the default + // if one cannot be determined, to true. + const std::string& TT = M.getTargetTriple(); + if (TT.length() > 5) { + if (TT.find("cygwin") != std::string::npos) + TargetType = isCygwin; + else if (TT.find("mingw") != std::string::npos) + TargetType = isMingw; + else if (TT.find("darwin") != std::string::npos) + TargetType = isDarwin; + else if (TT.find("win32") != std::string::npos) + TargetType = isWindows; + } else if (TT.empty()) { +#if defined(__CYGWIN__) + TargetType = isCygwin; +#elif defined(__MINGW32__) + TargetType = isMingw; +#elif defined(__APPLE__) + TargetType = isDarwin; +#elif defined(_WIN32) + TargetType = isWindows; +#endif + } + + // If the asm syntax hasn't been overridden on the command line, use whatever + // the target wants. 
+ if (AsmFlavor == X86Subtarget::Unset) { + if (TargetType == isWindows) { + AsmFlavor = X86Subtarget::Intel; + } else { + AsmFlavor = X86Subtarget::ATT; + } + } + + if (TargetType == isDarwin || + TargetType == isCygwin || + TargetType == isMingw || + (TargetType == isELF && Is64Bit)) + stackAlignment = 16; +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h new file mode 100644 index 0000000..2cda970 --- /dev/null +++ b/lib/Target/X86/X86Subtarget.h @@ -0,0 +1,156 @@ +//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by Nate Begeman and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef X86SUBTARGET_H +#define X86SUBTARGET_H + +#include "llvm/Target/TargetSubtarget.h" + +#include <string> + +namespace llvm { +class Module; +class GlobalValue; +class TargetMachine; + +namespace PICStyle { +enum Style { + Stub, GOT, RIPRel, WinPIC, None +}; +} + +class X86Subtarget : public TargetSubtarget { +public: + enum AsmWriterFlavorTy { + // Note: This numbering has to match the GCC assembler dialects for inline + // asm alternatives to work right. + ATT = 0, Intel = 1, Unset + }; +protected: + enum X86SSEEnum { + NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3 + }; + + enum X863DNowEnum { + NoThreeDNow, ThreeDNow, ThreeDNowA + }; + + /// AsmFlavor - Which x86 asm dialect to use. + AsmWriterFlavorTy AsmFlavor; + + /// PICStyle - Which PIC style to use + PICStyle::Style PICStyle; + + /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, or none supported. + X86SSEEnum X86SSELevel; + + /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported. 
+ X863DNowEnum X863DNowLevel; + + /// HasX86_64 - True if the processor supports X86-64 instructions. + bool HasX86_64; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + /// Min. memset / memcpy size that is turned into rep/movs, rep/stos ops. + unsigned MinRepStrSizeThreshold; + +private: + /// Is64Bit - True if the processor supports 64-bit instructions and module + /// pointer size is 64 bit. + bool Is64Bit; + +public: + enum { + isELF, isCygwin, isDarwin, isWindows, isMingw + } TargetType; + + /// This constructor initializes the data members to match that + /// of the specified module. + /// + X86Subtarget(const Module &M, const std::string &FS, bool is64Bit); + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// getMinRepStrSizeThreshold - Returns the minimum memset / memcpy size + /// required to turn the operation into a X86 rep/movs or rep/stos + /// instruction. This is only used if the src / dst alignment is not DWORD + /// aligned. + unsigned getMinRepStrSizeThreshold() const { return MinRepStrSizeThreshold; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU); + + /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID + /// instruction. 
+ void AutoDetectSubtargetFeatures(); + + bool is64Bit() const { return Is64Bit; } + + PICStyle::Style getPICStyle() const { return PICStyle; } + void setPICStyle(PICStyle::Style Style) { PICStyle = Style; } + + bool hasMMX() const { return X86SSELevel >= MMX; } + bool hasSSE1() const { return X86SSELevel >= SSE1; } + bool hasSSE2() const { return X86SSELevel >= SSE2; } + bool hasSSE3() const { return X86SSELevel >= SSE3; } + bool hasSSSE3() const { return X86SSELevel >= SSSE3; } + bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } + bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } + + unsigned getAsmFlavor() const { + return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0; + } + + bool isFlavorAtt() const { return AsmFlavor == ATT; } + bool isFlavorIntel() const { return AsmFlavor == Intel; } + + bool isTargetDarwin() const { return TargetType == isDarwin; } + bool isTargetELF() const { return TargetType == isELF; } + bool isTargetWindows() const { return TargetType == isWindows; } + bool isTargetMingw() const { return TargetType == isMingw; } + bool isTargetCygMing() const { return (TargetType == isMingw || + TargetType == isCygwin); } + bool isTargetCygwin() const { return TargetType == isCygwin; } + + bool isPICStyleSet() const { return PICStyle != PICStyle::None; } + bool isPICStyleGOT() const { return PICStyle == PICStyle::GOT; } + bool isPICStyleStub() const { return PICStyle == PICStyle::Stub; } + bool isPICStyleRIPRel() const { return PICStyle == PICStyle::RIPRel; } + bool isPICStyleWinPIC() const { return PICStyle == PICStyle:: WinPIC; } + + /// True if accessing the GV requires an extra load. For Windows, dllimported + /// symbols are indirect, loading the value at address GV rather then the + /// value of GV itself. This means that the GlobalAddress must be in the base + /// or index register of the address, not the GV offset field. 
+ bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM, + bool isDirectCall) const; + +}; + +namespace X86 { + /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in + /// the specified arguments. If we can't run cpuid on the host, return true. + bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX, + unsigned *rECX, unsigned *rEDX); +} + +} // End llvm namespace + +#endif diff --git a/lib/Target/X86/X86TargetAsmInfo.cpp b/lib/Target/X86/X86TargetAsmInfo.cpp new file mode 100644 index 0000000..4bb854e --- /dev/null +++ b/lib/Target/X86/X86TargetAsmInfo.cpp @@ -0,0 +1,280 @@ +//===-- X86TargetAsmInfo.cpp - X86 asm properties ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the X86TargetAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetAsmInfo.h" +#include "X86TargetMachine.h" +#include "X86Subtarget.h" +#include "llvm/DerivedTypes.h" +#include "llvm/InlineAsm.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/ADT/StringExtras.h" +using namespace llvm; + +static const char* x86_asm_table[] = {"{si}", "S", + "{di}", "D", + "{ax}", "a", + "{cx}", "c", + "{memory}", "memory", + "{flags}", "", + "{dirflag}", "", + "{fpsr}", "", + "{cc}", "cc", + 0,0}; + +X86TargetAsmInfo::X86TargetAsmInfo(const X86TargetMachine &TM) { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + + // FIXME - Should be simplified. 
+ + AsmTransCBE = x86_asm_table; + + switch (Subtarget->TargetType) { + case X86Subtarget::isDarwin: + AlignmentIsInBytes = false; + GlobalPrefix = "_"; + if (!Subtarget->is64Bit()) + Data64bitsDirective = 0; // we can't emit a 64-bit unit + ZeroDirective = "\t.space\t"; // ".space N" emits N zeros. + PrivateGlobalPrefix = "L"; // Marker for constant pool idxs + BSSSection = 0; // no BSS section. + ZeroFillDirective = "\t.zerofill\t"; // Uses .zerofill + ConstantPoolSection = "\t.const\n"; + JumpTableDataSection = "\t.const\n"; + CStringSection = "\t.cstring"; + FourByteConstantSection = "\t.literal4\n"; + EightByteConstantSection = "\t.literal8\n"; + if (Subtarget->is64Bit()) + SixteenByteConstantSection = "\t.literal16\n"; + ReadOnlySection = "\t.const\n"; + LCOMMDirective = "\t.lcomm\t"; + COMMDirectiveTakesAlignment = false; + HasDotTypeDotSizeDirective = false; + if (TM.getRelocationModel() == Reloc::Static) { + StaticCtorsSection = ".constructor"; + StaticDtorsSection = ".destructor"; + } else { + StaticCtorsSection = ".mod_init_func"; + StaticDtorsSection = ".mod_term_func"; + } + InlineAsmStart = "# InlineAsm Start"; + InlineAsmEnd = "# InlineAsm End"; + SetDirective = "\t.set"; + UsedDirective = "\t.no_dead_strip\t"; + WeakRefDirective = "\t.weak_reference\t"; + HiddenDirective = "\t.private_extern\t"; + + // In non-PIC modes, emit a special label before jump tables so that the + // linker can perform more accurate dead code stripping. + if (TM.getRelocationModel() != Reloc::PIC_) { + // Emit a local label that is preserved until the linker runs. 
+ JumpTableSpecialLabelPrefix = "l"; + } + + SupportsDebugInformation = true; + NeedsSet = true; + DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug"; + DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug"; + DwarfLineSection = ".section __DWARF,__debug_line,regular,debug"; + DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug"; + DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug"; + DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug"; + DwarfStrSection = ".section __DWARF,__debug_str,regular,debug"; + DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug"; + DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug"; + DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug"; + DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug"; + break; + + case X86Subtarget::isELF: + ReadOnlySection = "\t.section\t.rodata"; + FourByteConstantSection = "\t.section\t.rodata.cst4,\"aM\",@progbits,4"; + EightByteConstantSection = "\t.section\t.rodata.cst8,\"aM\",@progbits,8"; + SixteenByteConstantSection = "\t.section\t.rodata.cst16,\"aM\",@progbits,16"; + CStringSection = "\t.section\t.rodata.str1.1,\"aMS\",@progbits,1"; + PrivateGlobalPrefix = ".L"; + WeakRefDirective = "\t.weak\t"; + SetDirective = "\t.set\t"; + PCSymbol = "."; + + // Set up DWARF directives + HasLEB128 = true; // Target asm supports leb128 directives (little-endian) + AbsoluteDebugSectionOffsets = true; + AbsoluteEHSectionOffsets = false; + SupportsDebugInformation = true; + DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",@progbits"; + DwarfInfoSection = "\t.section\t.debug_info,\"\",@progbits"; + DwarfLineSection = "\t.section\t.debug_line,\"\",@progbits"; + DwarfFrameSection = "\t.section\t.debug_frame,\"\",@progbits"; + DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",@progbits"; + DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",@progbits"; + 
DwarfStrSection = "\t.section\t.debug_str,\"\",@progbits"; + DwarfLocSection = "\t.section\t.debug_loc,\"\",@progbits"; + DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits"; + DwarfRangesSection = "\t.section\t.debug_ranges,\"\",@progbits"; + DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits"; + + if (!Subtarget->is64Bit()) + SupportsExceptionHandling = true; + DwarfEHFrameSection = "\t.section\t.eh_frame,\"aw\",@progbits"; + DwarfExceptionSection = "\t.section\t.gcc_except_table,\"a\",@progbits"; + break; + + case X86Subtarget::isCygwin: + case X86Subtarget::isMingw: + GlobalPrefix = "_"; + LCOMMDirective = "\t.lcomm\t"; + COMMDirectiveTakesAlignment = false; + HasDotTypeDotSizeDirective = false; + StaticCtorsSection = "\t.section .ctors,\"aw\""; + StaticDtorsSection = "\t.section .dtors,\"aw\""; + HiddenDirective = NULL; + PrivateGlobalPrefix = "L"; // Prefix for private global symbols + WeakRefDirective = "\t.weak\t"; + SetDirective = "\t.set\t"; + + // Set up DWARF directives + HasLEB128 = true; // Target asm supports leb128 directives (little-endian) + AbsoluteDebugSectionOffsets = true; + AbsoluteEHSectionOffsets = false; + SupportsDebugInformation = true; + DwarfSectionOffsetDirective = "\t.secrel32\t"; + DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"dr\""; + DwarfInfoSection = "\t.section\t.debug_info,\"dr\""; + DwarfLineSection = "\t.section\t.debug_line,\"dr\""; + DwarfFrameSection = "\t.section\t.debug_frame,\"dr\""; + DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"dr\""; + DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"dr\""; + DwarfStrSection = "\t.section\t.debug_str,\"dr\""; + DwarfLocSection = "\t.section\t.debug_loc,\"dr\""; + DwarfARangesSection = "\t.section\t.debug_aranges,\"dr\""; + DwarfRangesSection = "\t.section\t.debug_ranges,\"dr\""; + DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"dr\""; + break; + + case X86Subtarget::isWindows: + GlobalPrefix = "_"; + HasDotTypeDotSizeDirective = false; 
+ break; + + default: break; + } + + if (Subtarget->isFlavorIntel()) { + GlobalPrefix = "_"; + CommentString = ";"; + + PrivateGlobalPrefix = "$"; + AlignDirective = "\talign\t"; + ZeroDirective = "\tdb\t"; + ZeroDirectiveSuffix = " dup(0)"; + AsciiDirective = "\tdb\t"; + AscizDirective = 0; + Data8bitsDirective = "\tdb\t"; + Data16bitsDirective = "\tdw\t"; + Data32bitsDirective = "\tdd\t"; + Data64bitsDirective = "\tdq\t"; + HasDotTypeDotSizeDirective = false; + + TextSection = "_text"; + DataSection = "_data"; + JumpTableDataSection = NULL; + SwitchToSectionDirective = ""; + TextSectionStartSuffix = "\tsegment 'CODE'"; + DataSectionStartSuffix = "\tsegment 'DATA'"; + SectionEndDirectiveSuffix = "\tends\n"; + } + + AssemblerDialect = Subtarget->getAsmFlavor(); +} + +bool X86TargetAsmInfo::LowerToBSwap(CallInst *CI) const { + // FIXME: this should verify that we are targetting a 486 or better. If not, + // we will turn this bswap into something that will be lowered to logical ops + // instead of emitting the bswap asm. For now, we don't support 486 or lower + // so don't worry about this. + + // Verify this is a simple bswap. + if (CI->getNumOperands() != 2 || + CI->getType() != CI->getOperand(1)->getType() || + !CI->getType()->isInteger()) + return false; + + const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + + // Okay, we can do this xform, do so now. 
+ const Type *Tys[] = { Ty, Ty }; + Module *M = CI->getParent()->getParent()->getParent(); + Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 2); + + Value *Op = CI->getOperand(1); + Op = new CallInst(Int, Op, CI->getName(), CI); + + CI->replaceAllUsesWith(Op); + CI->eraseFromParent(); + return true; +} + + +bool X86TargetAsmInfo::ExpandInlineAsm(CallInst *CI) const { + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); + + std::string AsmStr = IA->getAsmString(); + + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" + std::vector<std::string> AsmPieces; + SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? + + switch (AsmPieces.size()) { + default: return false; + case 1: + AsmStr = AsmPieces[0]; + AsmPieces.clear(); + SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. + + // bswap $0 + if (AsmPieces.size() == 2 && + AsmPieces[0] == "bswap" && AsmPieces[1] == "$0") { + // No need to check constraints, nothing other than the equivalent of + // "=r,0" would be valid here. 
+ return LowerToBSwap(CI); + } + break; + case 3: + if (CI->getType() == Type::Int64Ty && Constraints.size() >= 2 && + Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && + Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { + // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 + std::vector<std::string> Words; + SplitString(AsmPieces[0], Words, " \t"); + if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { + Words.clear(); + SplitString(AsmPieces[1], Words, " \t"); + if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { + Words.clear(); + SplitString(AsmPieces[2], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && + Words[2] == "%edx") { + return LowerToBSwap(CI); + } + } + } + } + break; + } + return false; +} diff --git a/lib/Target/X86/X86TargetAsmInfo.h b/lib/Target/X86/X86TargetAsmInfo.h new file mode 100644 index 0000000..cc509d1 --- /dev/null +++ b/lib/Target/X86/X86TargetAsmInfo.h @@ -0,0 +1,33 @@ +//=====-- X86TargetAsmInfo.h - X86 asm properties -------------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by James M. Laskey and is distributed under the +// University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the X86TargetAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86TARGETASMINFO_H +#define X86TARGETASMINFO_H + +#include "llvm/Target/TargetAsmInfo.h" + +namespace llvm { + + // Forward declaration. 
+ class X86TargetMachine; + + struct X86TargetAsmInfo : public TargetAsmInfo { + X86TargetAsmInfo(const X86TargetMachine &TM); + + virtual bool ExpandInlineAsm(CallInst *CI) const; + private: + bool LowerToBSwap(CallInst *CI) const; + }; +} // namespace llvm + +#endif diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp new file mode 100644 index 0000000..4d4bd3f --- /dev/null +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -0,0 +1,190 @@ +//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetAsmInfo.h" +#include "X86TargetMachine.h" +#include "X86.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetMachineRegistry.h" +#include "llvm/Transforms/Scalar.h" +using namespace llvm; + +/// X86TargetMachineModule - Note that this is used on hosts that cannot link +/// in a library unless there are references into the library. In particular, +/// it seems that it is not possible to get things to work on Win32 without +/// this. Though it is unused, do not remove it. +extern "C" int X86TargetMachineModule; +int X86TargetMachineModule = 0; + +namespace { + // Register the target. 
+ RegisterTarget<X86_32TargetMachine> + X("x86", " 32-bit X86: Pentium-Pro and above"); + RegisterTarget<X86_64TargetMachine> + Y("x86-64", " 64-bit X86: EM64T and AMD64"); +} + +const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const { + return new X86TargetAsmInfo(*this); +} + +unsigned X86_32TargetMachine::getJITMatchQuality() { +#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) + return 10; +#endif + return 0; +} + +unsigned X86_64TargetMachine::getJITMatchQuality() { +#if defined(__x86_64__) + return 10; +#endif + return 0; +} + +unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) { + // We strongly match "i[3-9]86-*". + std::string TT = M.getTargetTriple(); + if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' && + TT[4] == '-' && TT[1] - '3' < 6) + return 20; + // If the target triple is something non-X86, we don't match. + if (!TT.empty()) return 0; + + if (M.getEndianness() == Module::LittleEndian && + M.getPointerSize() == Module::Pointer32) + return 10; // Weak match + else if (M.getEndianness() != Module::AnyEndianness || + M.getPointerSize() != Module::AnyPointerSize) + return 0; // Match for some other target + + return getJITMatchQuality()/2; +} + +unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) { + // We strongly match "x86_64-*". + std::string TT = M.getTargetTriple(); + if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' && + TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-') + return 20; + + // We strongly match "amd64-*". + if (TT.size() >= 6 && TT[0] == 'a' && TT[1] == 'm' && TT[2] == 'd' && + TT[3] == '6' && TT[4] == '4' && TT[5] == '-') + return 20; + + // If the target triple is something non-X86-64, we don't match. 
+ if (!TT.empty()) return 0; + + if (M.getEndianness() == Module::LittleEndian && + M.getPointerSize() == Module::Pointer64) + return 10; // Weak match + else if (M.getEndianness() != Module::AnyEndianness || + M.getPointerSize() != Module::AnyPointerSize) + return 0; // Match for some other target + + return getJITMatchQuality()/2; +} + +X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS) + : X86TargetMachine(M, FS, false) { +} + + +X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS) + : X86TargetMachine(M, FS, true) { +} + +/// X86TargetMachine ctor - Create an ILP32 architecture model +/// +X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS, + bool is64Bit) + : Subtarget(M, FS, is64Bit), + DataLayout(Subtarget.is64Bit() ? + std::string("e-p:64:64-f64:32:64-i64:32:64") : + std::string("e-p:32:32-f64:32:64-i64:32:64")), + FrameInfo(TargetFrameInfo::StackGrowsDown, + Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4), + InstrInfo(*this), JITInfo(*this), TLInfo(*this) { + if (getRelocationModel() == Reloc::Default) + if (Subtarget.isTargetDarwin() || Subtarget.isTargetCygMing()) + setRelocationModel(Reloc::DynamicNoPIC); + else + setRelocationModel(Reloc::Static); + if (Subtarget.is64Bit()) { + // No DynamicNoPIC support under X86-64. + if (getRelocationModel() == Reloc::DynamicNoPIC) + setRelocationModel(Reloc::PIC_); + // Default X86-64 code model is small. 
+ if (getCodeModel() == CodeModel::Default) + setCodeModel(CodeModel::Small); + } + + if (Subtarget.isTargetCygMing()) + Subtarget.setPICStyle(PICStyle::WinPIC); + else if (Subtarget.isTargetDarwin()) + if (Subtarget.is64Bit()) + Subtarget.setPICStyle(PICStyle::RIPRel); + else + Subtarget.setPICStyle(PICStyle::Stub); + else if (Subtarget.isTargetELF()) + if (Subtarget.is64Bit()) + Subtarget.setPICStyle(PICStyle::RIPRel); + else + Subtarget.setPICStyle(PICStyle::GOT); +} + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +bool X86TargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) { + // Install an instruction selector. + PM.add(createX86ISelDag(*this, Fast)); + return false; +} + +bool X86TargetMachine::addPostRegAlloc(FunctionPassManager &PM, bool Fast) { + PM.add(createX86FloatingPointStackifierPass()); + return true; // -print-machineinstr should print after this. +} + +bool X86TargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out) { + PM.add(createX86CodePrinterPass(Out, *this)); + return false; +} + +bool X86TargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE) { + // FIXME: Move this to TargetJITInfo! + setRelocationModel(Reloc::Static); + Subtarget.setPICStyle(PICStyle::None); + + // JIT cannot ensure globals are placed in the lower 4G of address. 
+ if (Subtarget.is64Bit()) + setCodeModel(CodeModel::Large); + + PM.add(createX86CodeEmitterPass(*this, MCE)); + return false; +} + +bool X86TargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE) { + PM.add(createX86CodeEmitterPass(*this, MCE)); + return false; +} diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h new file mode 100644 index 0000000..0a4f1b5 --- /dev/null +++ b/lib/Target/X86/X86TargetMachine.h @@ -0,0 +1,95 @@ +//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file was developed by the LLVM research group and is distributed under +// the University of Illinois Open Source License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef X86TARGETMACHINE_H +#define X86TARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameInfo.h" +#include "X86.h" +#include "X86ELFWriterInfo.h" +#include "X86InstrInfo.h" +#include "X86JITInfo.h" +#include "X86Subtarget.h" +#include "X86ISelLowering.h" + +namespace llvm { + +class X86TargetMachine : public LLVMTargetMachine { + X86Subtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + TargetFrameInfo FrameInfo; + X86InstrInfo InstrInfo; + X86JITInfo JITInfo; + X86TargetLowering TLInfo; + X86ELFWriterInfo ELFWriterInfo; + +protected: + virtual const TargetAsmInfo *createTargetAsmInfo() const; + +public: + X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit); + + virtual const X86InstrInfo *getInstrInfo() const { return &InstrInfo; } + virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; } + 
virtual TargetJITInfo *getJITInfo() { return &JITInfo; } + virtual const TargetSubtarget *getSubtargetImpl() const{ return &Subtarget; } + virtual X86TargetLowering *getTargetLowering() const { + return const_cast<X86TargetLowering*>(&TLInfo); + } + virtual const MRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const X86ELFWriterInfo *getELFWriterInfo() const { + return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; + } + + static unsigned getModuleMatchQuality(const Module &M); + static unsigned getJITMatchQuality(); + + // Set up the pass pipeline. + virtual bool addInstSelector(FunctionPassManager &PM, bool Fast); + virtual bool addPostRegAlloc(FunctionPassManager &PM, bool Fast); + virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, + std::ostream &Out); + virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); + virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast, + MachineCodeEmitter &MCE); +}; + +/// X86_32TargetMachine - X86 32-bit target machine. +/// +class X86_32TargetMachine : public X86TargetMachine { +public: + X86_32TargetMachine(const Module &M, const std::string &FS); + + static unsigned getJITMatchQuality(); + static unsigned getModuleMatchQuality(const Module &M); +}; + +/// X86_64TargetMachine - X86 64-bit target machine. +/// +class X86_64TargetMachine : public X86TargetMachine { +public: + X86_64TargetMachine(const Module &M, const std::string &FS); + + static unsigned getJITMatchQuality(); + static unsigned getModuleMatchQuality(const Module &M); +}; + +} // End llvm namespace + +#endif |