Diffstat (limited to 'lib/Target/CellSPU')
20 files changed, 927 insertions, 901 deletions
diff --git a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt deleted file mode 100644 index 8a2b59a..0000000 --- a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -include_directories( - ${CMAKE_CURRENT_BINARY_DIR}/.. - ${CMAKE_CURRENT_SOURCE_DIR}/.. - ) - -add_llvm_library(LLVMCellSPUAsmPrinter - SPUAsmPrinter.cpp - ) -add_dependencies(LLVMCellSPUAsmPrinter CellSPUCodeGenTable_gen) diff --git a/lib/Target/CellSPU/AsmPrinter/Makefile b/lib/Target/CellSPU/AsmPrinter/Makefile deleted file mode 100644 index 4ec9d04..0000000 --- a/lib/Target/CellSPU/AsmPrinter/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -##===- lib/Target/CellSPU/AsmPrinter/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMCellSPUAsmPrinter - -# Hack: we need to include 'main' CellSPU target directory to grab -# private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt index ddfca37..9c39d38 100644 --- a/lib/Target/CellSPU/CMakeLists.txt +++ b/lib/Target/CellSPU/CMakeLists.txt @@ -12,16 +12,16 @@ tablegen(SPUGenSubtarget.inc -gen-subtarget) tablegen(SPUGenCallingConv.inc -gen-callingconv) add_llvm_target(CellSPUCodeGen + SPUAsmPrinter.cpp SPUFrameInfo.cpp SPUHazardRecognizers.cpp SPUInstrInfo.cpp SPUISelDAGToDAG.cpp SPUISelLowering.cpp + SPUFrameInfo.cpp SPUMCAsmInfo.cpp SPURegisterInfo.cpp SPUSubtarget.cpp SPUTargetMachine.cpp SPUSelectionDAGInfo.cpp ) - -target_link_libraries (LLVMCellSPUCodeGen LLVMSelectionDAG) diff --git a/lib/Target/CellSPU/Makefile b/lib/Target/CellSPU/Makefile index cbdbd3c..77c66be 100644 --- a/lib/Target/CellSPU/Makefile +++ b/lib/Target/CellSPU/Makefile @@ -16,6 +16,6 @@ BUILT_SOURCES = SPUGenInstrNames.inc SPUGenRegisterNames.inc \ SPUGenInstrInfo.inc SPUGenDAGISel.inc \ SPUGenSubtarget.inc SPUGenCallingConv.inc -DIRS = AsmPrinter TargetInfo +DIRS = TargetInfo include $(LEVEL)/Makefile.common diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td index 069a182..5ef5716 100644 --- a/lib/Target/CellSPU/SPU64InstrInfo.td +++ b/lib/Target/CellSPU/SPU64InstrInfo.td @@ -54,8 +54,8 @@ class I64SETCCNegCond<PatFrag cond, CodeFrag compare>: // The i64 seteq fragment that does the scalar->vector conversion and // comparison: def CEQr64compare: - CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB))), 0xb)>; + CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))), 0xb)>; // The i64 seteq fragment that does the vector comparison def CEQv2i64compare: @@ -67,12 +67,14 @@ def CEQv2i64compare: // v2i64 seteq (equality): the setcc result is v4i32 multiclass CompareEqual64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CEQr64compare.Fragment)>; - def v2i64: CodeFrag<(ORi32_v4i32 CEQv2i64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CEQr64compare.Fragment, R32C))>; + def v2i64: CodeFrag<(i32 (COPY_TO_REGCLASS CEQv2i64compare.Fragment, R32C))>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQr64compare.Fragment))>; - def v2i64mask: 
CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CEQr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CEQv2i64compare.Fragment), R32C))>; } defm I64EQ: CompareEqual64; @@ -89,10 +91,12 @@ def : I64SELECTNegCond<setne, I64EQr64>; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ def CLGTr64ugt: - CodeFrag<(CLGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CLGTv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CLGTr64eq: - CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CLGTr64compare: CodeFrag<(SELBv2i64 CLGTr64ugt.Fragment, @@ -112,12 +116,14 @@ def CLGTv2i64compare: multiclass CompareLogicalGreaterThan64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CLGTr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CLGTr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CLGTv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGTr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGTv2i64compare.Fragment), R32C))>; } defm I64LGT: CompareLogicalGreaterThan64; @@ -144,12 +150,14 @@ def CLGEv2i64compare: multiclass CompareLogicalGreaterEqual64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CLGEr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CLGEr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CLGEv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGEr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CLGEv2i64compare.Fragment),R32C))>; } defm I64LGE: CompareLogicalGreaterEqual64; @@ -168,10 +176,12 @@ def : I64SELECTNegCond<setult, I64LGEr64>; //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ def CGTr64sgt: - CodeFrag<(CGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CGTv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CGTr64eq: - CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>; + CodeFrag<(CEQv4i32 (COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG))>; def CGTr64compare: CodeFrag<(SELBv2i64 CGTr64sgt.Fragment, @@ -191,12 +201,14 @@ def CGTv2i64compare: multiclass CompareGreaterThan64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CGTr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CGTr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CGTv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 CGTr64compare.Fragment), R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS + (FSMv4i32 
CGTv2i64compare.Fragment), R32C))>; } defm I64GT: CompareLogicalGreaterThan64; @@ -223,12 +235,12 @@ def CGEv2i64compare: multiclass CompareGreaterEqual64 { // Plain old comparison, converts back to i32 scalar - def r64: CodeFrag<(ORi32_v4i32 CGEr64compare.Fragment)>; + def r64: CodeFrag<(i32 (COPY_TO_REGCLASS CGEr64compare.Fragment, R32C))>; def v2i64: CodeFrag<CGEv2i64compare.Fragment>; // SELB mask from FSM: - def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEr64compare.Fragment))>; - def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEv2i64compare.Fragment))>; + def r64mask: CodeFrag<(i32 (COPY_TO_REGCLASS (FSMv4i32 CGEr64compare.Fragment),R32C))>; + def v2i64mask: CodeFrag<(i32 (COPY_TO_REGCLASS (FSMv4i32 CGEv2i64compare.Fragment),R32C))>; } defm I64GE: CompareGreaterEqual64; @@ -255,9 +267,9 @@ class v2i64_add<dag lhs, dag rhs, dag cg_mask>: v2i64_add_1<lhs, rhs, v2i64_add_cg<lhs, rhs>.Fragment, cg_mask>; def : Pat<(SPUadd64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), - (ORi64_v2i64 v2i64_add<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB), - (v4i32 VECREG:$rCGmask)>.Fragment)>; + (COPY_TO_REGCLASS v2i64_add<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; def : Pat<(SPUadd64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), (v4i32 VECREG:$rCGmask)), @@ -275,11 +287,12 @@ class v2i64_sub<dag lhs, dag rhs, dag bg, dag bg_mask>: CodeFrag<(SFXv4i32 lhs, rhs, (SHUFBv4i32 bg, bg, bg_mask))>; def : Pat<(SPUsub64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), - (ORi64_v2i64 v2i64_sub<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB), - v2i64_sub_bg<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB)>.Fragment, - (v4i32 VECREG:$rCGmask)>.Fragment)>; + (COPY_TO_REGCLASS + v2i64_sub<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + v2i64_sub_bg<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG)>.Fragment, + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; def : Pat<(SPUsub64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), (v4i32 VECREG:$rCGmask)), @@ -374,9 +387,9 @@ class v2i64_mul<dag rA, dag rB, dag rCGmask>: rCGmask>; def : Pat<(SPUmul64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)), - (ORi64_v2i64 v2i64_mul<(ORv2i64_i64 R64C:$rA), - (ORv2i64_i64 R64C:$rB), - (v4i32 VECREG:$rCGmask)>.Fragment)>; + (COPY_TO_REGCLASS v2i64_mul<(COPY_TO_REGCLASS R64C:$rA, VECREG), + (COPY_TO_REGCLASS R64C:$rB, VECREG), + (v4i32 VECREG:$rCGmask)>.Fragment, R64C)>; def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB), (v4i32 VECREG:$rCGmask)), diff --git a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp index 3e95531..4040461 100644 --- a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp +++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp @@ -46,10 +46,6 @@ namespace { return "STI CBEA SPU Assembly Printer"; } - SPUTargetMachine &getTM() { - return static_cast<SPUTargetMachine&>(TM); - } - /// printInstruction - This method is automatically generated by tablegen /// from the instruction set description. void printInstruction(const MachineInstr *MI, raw_ostream &OS); @@ -64,15 +60,6 @@ namespace { } void printOp(const MachineOperand &MO, raw_ostream &OS); - /// printRegister - Print register according to target requirements. 
- /// - void printRegister(const MachineOperand &MO, bool R0AsZero, raw_ostream &O){ - unsigned RegNo = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && - "Not physreg??"); - O << getRegisterName(RegNo); - } - void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNo); if (MO.isReg()) { @@ -93,17 +80,6 @@ namespace { void - printS7ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) - { - int value = MI->getOperand(OpNo).getImm(); - value = (value << (32 - 7)) >> (32 - 7); - - assert((value >= -(1 << 8) && value <= (1 << 7) - 1) - && "Invalid s7 argument"); - O << value; - } - - void printU7ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { unsigned int value = MI->getOperand(OpNo).getImm(); @@ -134,12 +110,6 @@ namespace { } void - printU32ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) - { - O << (unsigned)MI->getOperand(OpNo).getImm(); - } - - void printMemRegReg(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { // When used as the base register, r0 reads constant zero rather than // the value contained in the register. For this reason, the darwin @@ -221,13 +191,6 @@ namespace { printOp(MI->getOperand(OpNo), O); } - void printHBROperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - // HBR operands are generated in front of branches, hence, the - // program counter plus the target. - O << ".+"; - printOp(MI->getOperand(OpNo), O); - } - void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).isImm()) { printS16ImmOperand(MI, OpNo, O); diff --git a/lib/Target/CellSPU/SPUFrameInfo.cpp b/lib/Target/CellSPU/SPUFrameInfo.cpp index 60d7ba7..70ee8b8 100644 --- a/lib/Target/CellSPU/SPUFrameInfo.cpp +++ b/lib/Target/CellSPU/SPUFrameInfo.cpp @@ -14,16 +14,225 @@ #include "SPU.h" #include "SPUFrameInfo.h" #include "SPURegisterNames.h" +#include "SPUInstrBuilder.h" +#include "SPUInstrInfo.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; //===----------------------------------------------------------------------===// // SPUFrameInfo: //===----------------------------------------------------------------------===// -SPUFrameInfo::SPUFrameInfo(const TargetMachine &tm): - TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), - TM(tm) -{ +SPUFrameInfo::SPUFrameInfo(const SPUSubtarget &sti) + : TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), + Subtarget(sti) { LR[0].first = SPU::R0; LR[0].second = 16; } + + +/// determineFrameLayout - Determine the size of the frame and maximum call +/// frame size. +void SPUFrameInfo::determineFrameLayout(MachineFunction &MF) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo + unsigned FrameSize = MFI->getStackSize(); + + // Get the alignments provided by the target, and the maximum alignment + // (if any) of the fixed frame objects. 
+ unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); + unsigned Align = std::max(TargetAlign, MFI->getMaxAlignment()); + assert(isPowerOf2_32(Align) && "Alignment is not power of 2"); + unsigned AlignMask = Align - 1; + + // Get the maximum call frame size of all the calls. + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + + // If we have dynamic alloca then maxCallFrameSize needs to be aligned so + // that allocations will be aligned. + if (MFI->hasVarSizedObjects()) + maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; + + // Update maximum call frame size. + MFI->setMaxCallFrameSize(maxCallFrameSize); + + // Include call frame size in total. + FrameSize += maxCallFrameSize; + + // Make sure the frame is aligned. + FrameSize = (FrameSize + AlignMask) & ~AlignMask; + + // Update frame info. + MFI->setStackSize(FrameSize); +} + +void SPUFrameInfo::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const SPUInstrInfo &TII = + *static_cast<const SPUInstrInfo*>(MF.getTarget().getInstrInfo()); + MachineModuleInfo &MMI = MF.getMMI(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Prepare for debug frame info. + bool hasDebugInfo = MMI.hasDebugInfo(); + MCSymbol *FrameLabel = 0; + + // Move MBBI back to the beginning of the function. + MBBI = MBB.begin(); + + // Work out frame sizes. + determineFrameLayout(MF); + int FrameSize = MFI->getStackSize(); + + assert((FrameSize & 0xf) == 0 + && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->adjustsStack()) { + FrameSize = -(FrameSize + SPUFrameInfo::minStackSize()); + if (hasDebugInfo) { + // Mark effective beginning of when frame pointer becomes valid. + FrameLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(FrameLabel); + } + + // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp) + // for the ABI + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) + .addReg(SPU::R1); + if (isInt<10>(FrameSize)) { + // Spill $sp to adjusted $sp + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) + .addReg(SPU::R1); + // Adjust $sp by required amout + BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) + .addImm(FrameSize); + } else if (isInt<16>(FrameSize)) { + // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use + // $r2 to adjust $sp: + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) + .addImm(-16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) + .addImm(FrameSize); + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) + .addReg(SPU::R2) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) + .addReg(SPU::R1) + .addReg(SPU::R2); + BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2) + .addReg(SPU::R2) + .addImm(16); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) + .addReg(SPU::R2) + .addReg(SPU::R1); + } else { + report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); + } + + if (hasDebugInfo) { + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + + // Show update of SP. 
+ MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize); + Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); + unsigned Reg = CSI[I].getReg(); + if (Reg == SPU::R0) continue; + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); + MachineLocation CSSrc(Reg); + Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc)); + } + + // Mark effective beginning of when frame pointer is ready. + MCSymbol *ReadyLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(ReadyLabel); + + MachineLocation FPDst(SPU::R1); + MachineLocation FPSrc(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc)); + } + } else { + // This is a leaf function -- insert a branch hint iff there are + // sufficient number instructions in the basic block. Note that + // this is just a best guess based on the basic block's size. + if (MBB.size() >= (unsigned) SPUFrameInfo::branchHintPenalty()) { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + dl = MBBI->getDebugLoc(); + + // Insert terminator label + BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)) + .addSym(MMI.getContext().CreateTempSymbol()); + } + } +} + +void SPUFrameInfo::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = prior(MBB.end()); + const SPUInstrInfo &TII = + *static_cast<const SPUInstrInfo*>(MF.getTarget().getInstrInfo()); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int FrameSize = MFI->getStackSize(); + int LinkSlotOffset = SPUFrameInfo::stackSlotSize(); + DebugLoc dl = MBBI->getDebugLoc(); + + assert(MBBI->getOpcode() == SPU::RET && + "Can only insert epilog into returning blocks"); + assert((FrameSize & 0xf) == 0 && "FrameSize not aligned"); + + // the "empty" frame size is 16 - just the register scavenger spill slot + if (FrameSize > 16 || MFI->adjustsStack()) { + FrameSize = FrameSize + SPUFrameInfo::minStackSize(); + if (isInt<10>(FrameSize + LinkSlotOffset)) { + // Reload $lr, adjust $sp by required amount + // Note: We do this to slightly improve dual issue -- not by much, but it + // is an opportunity for dual issue. + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) + .addImm(FrameSize + LinkSlotOffset) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1) + .addReg(SPU::R1) + .addImm(FrameSize); + } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { + // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use + // $r2 to adjust $sp: + BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) + .addImm(16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) + .addImm(FrameSize); + BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) + .addReg(SPU::R1) + .addReg(SPU::R2); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) + .addImm(16) + .addReg(SPU::R1); + BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). 
+ addReg(SPU::R2) + .addImm(16); + BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) + .addReg(SPU::R2) + .addReg(SPU::R1); + } else { + report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); + } + } +} diff --git a/lib/Target/CellSPU/SPUFrameInfo.h b/lib/Target/CellSPU/SPUFrameInfo.h index f511acd..89ce1e9 100644 --- a/lib/Target/CellSPU/SPUFrameInfo.h +++ b/lib/Target/CellSPU/SPUFrameInfo.h @@ -12,19 +12,30 @@ // //===----------------------------------------------------------------------===// -#if !defined(SPUFRAMEINFO_H) +#ifndef SPU_FRAMEINFO_H +#define SPU_FRAMEINFO_H +#include "SPURegisterInfo.h" #include "llvm/Target/TargetFrameInfo.h" #include "llvm/Target/TargetMachine.h" -#include "SPURegisterInfo.h" namespace llvm { + class SPUSubtarget; + class SPUFrameInfo: public TargetFrameInfo { - const TargetMachine &TM; + const SPUSubtarget &Subtarget; std::pair<unsigned, int> LR[1]; public: - SPUFrameInfo(const TargetMachine &tm); + SPUFrameInfo(const SPUSubtarget &sti); + + //! Determine the frame's layour + void determineFrameLayout(MachineFunction &MF) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; //! Return a function's saved spill slots /*! @@ -71,5 +82,4 @@ namespace llvm { }; } -#define SPUFRAMEINFO_H 1 #endif diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 457bbc7..38a13d1 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -41,13 +41,6 @@ using namespace llvm; namespace { //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates bool - isI64IntS10Immediate(ConstantSDNode *CN) - { - return isInt<10>(CN->getSExtValue()); - } - - //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates - bool isI32IntS10Immediate(ConstantSDNode *CN) { return isInt<10>(CN->getSExtValue()); @@ -118,55 +111,6 @@ namespace { return false; } - //===------------------------------------------------------------------===// - //! EVT to "useful stuff" mapping structure: - - struct valtype_map_s { - EVT VT; - unsigned ldresult_ins; /// LDRESULT instruction (0 = undefined) - bool ldresult_imm; /// LDRESULT instruction requires immediate? - unsigned lrinst; /// LR instruction - }; - - const valtype_map_s valtype_map[] = { - { MVT::i8, SPU::ORBIr8, true, SPU::LRr8 }, - { MVT::i16, SPU::ORHIr16, true, SPU::LRr16 }, - { MVT::i32, SPU::ORIr32, true, SPU::LRr32 }, - { MVT::i64, SPU::ORr64, false, SPU::LRr64 }, - { MVT::f32, SPU::ORf32, false, SPU::LRf32 }, - { MVT::f64, SPU::ORf64, false, SPU::LRf64 }, - // vector types... (sigh!) - { MVT::v16i8, 0, false, SPU::LRv16i8 }, - { MVT::v8i16, 0, false, SPU::LRv8i16 }, - { MVT::v4i32, 0, false, SPU::LRv4i32 }, - { MVT::v2i64, 0, false, SPU::LRv2i64 }, - { MVT::v4f32, 0, false, SPU::LRv4f32 }, - { MVT::v2f64, 0, false, SPU::LRv2f64 } - }; - - const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); - - const valtype_map_s *getValueTypeMapEntry(EVT VT) - { - const valtype_map_s *retval = 0; - for (size_t i = 0; i < n_valtype_map; ++i) { - if (valtype_map[i].VT == VT) { - retval = valtype_map + i; - break; - } - } - - -#ifndef NDEBUG - if (retval == 0) { - report_fatal_error("SPUISelDAGToDAG.cpp: getValueTypeMapEntry returns" - "NULL for " + Twine(VT.getEVTString())); - } -#endif - - return retval; - } - //! 
Generate the carry-generate shuffle mask. SDValue getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) { SmallVector<SDValue, 16 > ShufBytes; @@ -228,16 +172,10 @@ namespace { return CurDAG->getTargetConstant(Imm, MVT::i32); } - /// getI64Imm - Return a target constant with the specified value, of type - /// i64. - inline SDValue getI64Imm(uint64_t Imm) { - return CurDAG->getTargetConstant(Imm, MVT::i64); - } - /// getSmallIPtrImm - Return a target constant of pointer type. inline SDValue getSmallIPtrImm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy()); - } + } SDNode *emitBuildVector(SDNode *bvNode) { EVT vecVT = bvNode->getValueType(0); @@ -278,7 +216,7 @@ namespace { HandleSDNode Dummy(CurDAG->getLoad(vecVT, dl, CurDAG->getEntryNode(), CGPoolOffset, - PseudoSourceValue::getConstantPool(),0, + MachinePointerInfo::getConstantPool(), false, false, Alignment)); CurDAG->ReplaceAllUsesWith(SDValue(bvNode, 0), Dummy.getValue()); if (SDNode *N = SelectCode(Dummy.getValue().getNode())) @@ -370,6 +308,9 @@ namespace { assert(II && "No InstrInfo?"); return new SPUHazardRecognizer(*II); } + + private: + SDValue getRC( MVT ); // Include the pieces autogenerated from the target description. #include "SPUGenDAGISel.inc" @@ -632,6 +573,43 @@ SPUDAGToDAGISel::SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base, return false; } +/*! + Utility function to use with COPY_TO_REGCLASS instructions. Returns a SDValue + to be used as the last parameter of a +CurDAG->getMachineNode(COPY_TO_REGCLASS,..., ) function call + \arg VT the value type for which we want a register class +*/ +SDValue SPUDAGToDAGISel::getRC( MVT VT ) { + switch( VT.SimpleTy ) { + case MVT::i8: + return CurDAG->getTargetConstant(SPU::R8CRegClass.getID(), MVT::i32); + break; + case MVT::i16: + return CurDAG->getTargetConstant(SPU::R16CRegClass.getID(), MVT::i32); + break; + case MVT::i32: + return CurDAG->getTargetConstant(SPU::R32CRegClass.getID(), MVT::i32); + break; + case MVT::f32: + return CurDAG->getTargetConstant(SPU::R32FPRegClass.getID(), MVT::i32); + break; + case MVT::i64: + return CurDAG->getTargetConstant(SPU::R64CRegClass.getID(), MVT::i32); + break; + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v4f32: + case MVT::v2i64: + case MVT::v2f64: + return CurDAG->getTargetConstant(SPU::VECREGRegClass.getID(), MVT::i32); + break; + default: + assert( false && "add a new case here" ); + } + return SDValue(); +} + //! Convert the operand from a target-independent to a target-specific node /*! 
*/ @@ -786,8 +764,8 @@ SPUDAGToDAGISel::Select(SDNode *N) { if (shift_amt >= 32) { SDNode *hi32 = - CurDAG->getMachineNode(SPU::ORr32_r64, dl, OpVT, - Op0.getOperand(0)); + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + Op0.getOperand(0), getRC(MVT::i32)); shift_amt -= 32; if (shift_amt > 0) { @@ -869,23 +847,12 @@ SPUDAGToDAGISel::Select(SDNode *N) { SDValue Arg = N->getOperand(0); SDValue Chain = N->getOperand(1); SDNode *Result; - const valtype_map_s *vtm = getValueTypeMapEntry(VT); - - if (vtm->ldresult_ins == 0) { - report_fatal_error("LDRESULT for unsupported type: " + - Twine(VT.getEVTString())); - } - - Opc = vtm->ldresult_ins; - if (vtm->ldresult_imm) { - SDValue Zero = CurDAG->getTargetConstant(0, VT); - - Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Zero, Chain); - } else { - Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Arg, Chain); - } - + + Result = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VT, + MVT::Other, Arg, + getRC( VT.getSimpleVT()), Chain); return Result; + } else if (Opc == SPUISD::IndirectAddr) { // Look at the operands: SelectCode() will catch the cases that aren't // specifically handled here. @@ -954,7 +921,8 @@ SPUDAGToDAGISel::SelectSHLi64(SDNode *N, EVT OpVT) { SDValue SelMaskVal; DebugLoc dl = N->getDebugLoc(); - VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0); + VecOp0 = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VecVT, + Op0, getRC(MVT::v2i64) ); SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16); SelMask = CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT, SelMaskVal); ZeroFill = CurDAG->getMachineNode(SPU::ILv2i64, dl, VecVT, @@ -998,7 +966,8 @@ SPUDAGToDAGISel::SelectSHLi64(SDNode *N, EVT OpVT) { SDValue(Shift, 0), SDValue(Bits, 0)); } - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); } /*! @@ -1019,7 +988,8 @@ SPUDAGToDAGISel::SelectSRLi64(SDNode *N, EVT OpVT) { SDNode *VecOp0, *Shift = 0; DebugLoc dl = N->getDebugLoc(); - VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0); + VecOp0 = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, VecVT, + Op0, getRC(MVT::v2i64) ); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) { unsigned bytes = unsigned(CN->getZExtValue()) >> 3; @@ -1065,7 +1035,8 @@ SPUDAGToDAGISel::SelectSRLi64(SDNode *N, EVT OpVT) { SDValue(Shift, 0), SDValue(Bits, 0)); } - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); } /*! 
@@ -1086,14 +1057,16 @@ SPUDAGToDAGISel::SelectSRAi64(SDNode *N, EVT OpVT) { DebugLoc dl = N->getDebugLoc(); SDNode *VecOp0 = - CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, N->getOperand(0)); + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + VecVT, N->getOperand(0), getRC(MVT::v2i64)); SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT); SDNode *SignRot = CurDAG->getMachineNode(SPU::ROTMAIv2i64_i32, dl, MVT::v2i64, SDValue(VecOp0, 0), SignRotAmt); SDNode *UpperHalfSign = - CurDAG->getMachineNode(SPU::ORi32_v4i32, dl, MVT::i32, SDValue(SignRot, 0)); + CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + MVT::i32, SDValue(SignRot, 0), getRC(MVT::i32)); SDNode *UpperHalfSignMask = CurDAG->getMachineNode(SPU::FSM64r32, dl, VecVT, SDValue(UpperHalfSign, 0)); @@ -1140,7 +1113,8 @@ SPUDAGToDAGISel::SelectSRAi64(SDNode *N, EVT OpVT) { SDValue(Shift, 0), SDValue(NegShift, 0)); } - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(Shift, 0), getRC(MVT::i64)); } /*! @@ -1167,8 +1141,9 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, SDValue Op0 = i64vec.getOperand(0); ReplaceUses(i64vec, Op0); - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, - SDValue(emitBuildVector(Op0.getNode()), 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + SDValue(emitBuildVector(Op0.getNode()), 0), + getRC(MVT::i64)); } else if (i64vec.getOpcode() == SPUISD::SHUFB) { SDValue lhs = i64vec.getOperand(0); SDValue rhs = i64vec.getOperand(1); @@ -1209,10 +1184,12 @@ SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT, SDNode *SN = SelectCode(Dummy.getValue().getNode()); if (SN == 0) SN = Dummy.getValue().getNode(); - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(SN, 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, + OpVT, SDValue(SN, 0), getRC(MVT::i64)); } else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) { - return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, - SDValue(emitBuildVector(i64vec.getNode()), 0)); + return CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, OpVT, + SDValue(emitBuildVector(i64vec.getNode()), 0), + getRC(MVT::i64)); } else { report_fatal_error("SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec" "condition"); diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 53ec0ee..256f1f0 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -20,6 +20,7 @@ #include "llvm/Function.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" +#include "llvm/Type.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -41,41 +42,12 @@ using namespace llvm; namespace { std::map<unsigned, const char *> node_names; - //! 
EVT mapping to useful data for Cell SPU - struct valtype_map_s { - EVT valtype; - int prefslot_byte; - }; - - const valtype_map_s valtype_map[] = { - { MVT::i1, 3 }, - { MVT::i8, 3 }, - { MVT::i16, 2 }, - { MVT::i32, 0 }, - { MVT::f32, 0 }, - { MVT::i64, 0 }, - { MVT::f64, 0 }, - { MVT::i128, 0 } - }; - - const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]); - - const valtype_map_s *getValueTypeMapEntry(EVT VT) { - const valtype_map_s *retval = 0; - - for (size_t i = 0; i < n_valtype_map; ++i) { - if (valtype_map[i].valtype == VT) { - retval = valtype_map + i; - break; - } - } - -#ifndef NDEBUG - if (retval == 0) { - report_fatal_error("getValueTypeMapEntry returns NULL for " + - Twine(VT.getEVTString())); - } -#endif + // Byte offset of the preferred slot (counted from the MSB) + int prefslotOffset(EVT VT) { + int retval=0; + if (VT==MVT::i1) retval=3; + if (VT==MVT::i8) retval=3; + if (VT==MVT::i16) retval=2; return retval; } @@ -439,9 +411,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::SELECT, VT, Legal); - setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::STORE, VT, Custom); // These operations need to be expanded: setOperationAction(ISD::SDIV, VT, Expand); @@ -502,8 +474,8 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS"; - node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES"; + node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS"; + node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES"; node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; @@ -572,105 +544,121 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT OutVT = Op.getValueType(); ISD::LoadExtType ExtType = LN->getExtensionType(); unsigned alignment = LN->getAlignment(); - const valtype_map_s *vtm = getValueTypeMapEntry(InVT); + int pso = prefslotOffset(InVT); DebugLoc dl = Op.getDebugLoc(); - - switch (LN->getAddressingMode()) { - case ISD::UNINDEXED: { - SDValue result; - SDValue basePtr = LN->getBasePtr(); - SDValue rotate; - - if (alignment == 16) { - ConstantSDNode *CN; - - // Special cases for a known aligned load to simplify the base pointer - // and the rotation amount: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte); - - if (rotamt < 0) - rotamt += 16; - - rotate = DAG.getConstant(rotamt, MVT::i16); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) - || (basePtr.getOpcode() == SPUISD::IndirectAddr - && basePtr.getOperand(0).getOpcode() 
== SPUISD::Hi - && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { - // Plain aligned a-form address: rotate into preferred slot - // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getConstant(rotamt, MVT::i16); - } else { - // Offset the rotate amount by the basePtr and the preferred slot - // byte offset - int64_t rotamt = -vtm->prefslot_byte; - if (rotamt < 0) - rotamt += 16; - rotate = DAG.getNode(ISD::ADD, dl, PtrVT, - basePtr, - DAG.getConstant(rotamt, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa<ConstantSDNode>(Op1)) { - // Convert the (add <ptr>, <const>) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add <arg1>, <arg2>) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT, + (128 / InVT.getSizeInBits())); + + // two sanity checks + assert( LN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (InVT.getSizeInBits() == 128 && alignment == 16) + return SDValue(); + + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = LN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16); + + SDValue result; + SDValue basePtr = LN->getBasePtr(); + SDValue rotate; + + if (alignment == 16) { + ConstantSDNode *CN; + + // Special cases for a known aligned load to simplify the base pointer + // and the rotation amount: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + int64_t rotamt = int64_t((offset & 0xf) - pso); + + if (rotamt < 0) + rotamt += 16; + + rotate = DAG.getConstant(rotamt, MVT::i16); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - + } else if ((basePtr.getOpcode() == SPUISD::AFormAddr) + || (basePtr.getOpcode() == SPUISD::IndirectAddr + && basePtr.getOperand(0).getOpcode() == SPUISD::Hi + && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) { + // Plain aligned a-form address: rotate into preferred slot + // Same for (SPUindirect (SPUhi ...), (SPUlo ...)) + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; + rotate = DAG.getConstant(rotamt, MVT::i16); + } else { // Offset the rotate amount by the 
basePtr and the preferred slot // byte offset + int64_t rotamt = -pso; + if (rotamt < 0) + rotamt += 16; rotate = DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, - DAG.getConstant(-vtm->prefslot_byte, PtrVT)); + DAG.getConstant(rotamt, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add <arg1>, <arg2>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - // Re-emit as a v16i8 vector load - result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), - LN->isVolatile(), LN->isNonTemporal(), 16); - + // Offset the rotate amount by the basePtr and the preferred slot + // byte offset + rotate = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(-pso, PtrVT)); + } + + // Do the load as a i128 to allow possible shifting + SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr, + lowMemPtr, + LN->isVolatile(), LN->isNonTemporal(), 16); + + // When the size is not greater than alignment we get all data with just + // one load + if (alignment >= InVT.getSizeInBits()/8) { // Update the chain - the_chain = result.getValue(1); + the_chain = low.getValue(1); // Rotate into the preferred slot: - result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8, - result.getValue(0), rotate); + result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128, + low.getValue(0), rotate); // Convert the loaded v16i8 vector to the appropriate vector type // specified by the operand: @@ -678,7 +666,56 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { InVT, (128 / InVT.getSizeInBits())); result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result)); + } + // When alignment is less than the size, we might need (known only at + // run-time) two loads + // TODO: if the memory address is composed only from constants, we have + // extra kowledge, and might avoid the second load + else { + // storage position offset from lower 16 byte aligned memory chunk + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + basePtr, DAG.getConstant( 0xf, MVT::i32 ) ); + // 16 - offset + SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset ); + // get a registerfull of ones. 
(this implementation is a workaround: LLVM + // cannot handle 128 bit signed int constants) + SDValue ones = DAG.getConstant(-1, MVT::v4i32 ); + ones = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, ones); + + SDValue high = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(16, PtrVT)), + highMemPtr, + LN->isVolatile(), LN->isNonTemporal(), 16); + + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + high.getValue(1)); + + // Shift the (possible) high part right to compensate the misalignemnt. + // if there is no highpart (i.e. value is i64 and offset is 4), this + // will zero out the high value. + high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high, + DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset + )); + + // Shift the low similarily + // TODO: add SPUISD::SHL_BYTES + low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset ); + + // Merge the two parts + result = DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, + DAG.getNode(ISD::OR, dl, MVT::i128, low, high)); + + if (!InVT.isVector()) { + result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result ); + } + } // Handle extending loads by extending the scalar result: if (ExtType == ISD::SEXTLOAD) { result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result); @@ -702,21 +739,6 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { result = DAG.getNode(SPUISD::LDRESULT, dl, retvts, retops, sizeof(retops) / sizeof(retops[0])); return result; - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other " - "than UNINDEXED\n" + - Twine((unsigned)LN->getAddressingMode())); - /*NOTREACHED*/ - } - } - - return SDValue(); } /// Custom lower stores for CellSPU @@ -734,93 +756,103 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); DebugLoc dl = Op.getDebugLoc(); unsigned alignment = SN->getAlignment(); + SDValue result; + EVT vecVT = StVT.isVector()? 
StVT: EVT::getVectorVT(*DAG.getContext(), StVT, + (128 / StVT.getSizeInBits())); + // Get pointerinfos to the memory chunk(s) that contain the data to load + uint64_t mpi_offset = SN->getPointerInfo().Offset; + mpi_offset -= mpi_offset%16; + MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset); + MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16); + + + // two sanity checks + assert( SN->getAddressingMode() == ISD::UNINDEXED + && "we should get only UNINDEXED adresses"); + // clean aligned loads can be selected as-is + if (StVT.getSizeInBits() == 128 && alignment == 16) + return SDValue(); + + SDValue alignLoadVec; + SDValue basePtr = SN->getBasePtr(); + SDValue the_chain = SN->getChain(); + SDValue insertEltOffs; + + if (alignment == 16) { + ConstantSDNode *CN; + // Special cases for a known aligned load to simplify the base pointer + // and insertion byte: + if (basePtr.getOpcode() == ISD::ADD + && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) { + // Known offset into basePtr + int64_t offset = CN->getSExtValue(); + + // Simplify the base pointer for this case: + basePtr = basePtr.getOperand(0); + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant((offset & 0xf), PtrVT)); - switch (SN->getAddressingMode()) { - case ISD::UNINDEXED: { - // The vector type we really want to load from the 16-byte chunk. - EVT vecVT = EVT::getVectorVT(*DAG.getContext(), - VT, (128 / VT.getSizeInBits())); - - SDValue alignLoadVec; - SDValue basePtr = SN->getBasePtr(); - SDValue the_chain = SN->getChain(); - SDValue insertEltOffs; - - if (alignment == 16) { - ConstantSDNode *CN; - // Special cases for a known aligned load to simplify the base pointer - // and insertion byte: - if (basePtr.getOpcode() == ISD::ADD - && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) { - // Known offset into basePtr - int64_t offset = CN->getSExtValue(); - - // Simplify the base pointer for this case: - basePtr = basePtr.getOperand(0); - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & 0xf), PtrVT)); - - if ((offset & ~0xf) > 0) { - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant((offset & ~0xf), PtrVT)); - } - } else { - // Otherwise, assume it's at byte 0 of basePtr - insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant(0, PtrVT)); - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, - basePtr, - DAG.getConstant(0, PtrVT)); - } - } else { - // Unaligned load: must be more pessimistic about addressing modes: - if (basePtr.getOpcode() == ISD::ADD) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &RegInfo = MF.getRegInfo(); - unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); - SDValue Flag; - - SDValue Op0 = basePtr.getOperand(0); - SDValue Op1 = basePtr.getOperand(1); - - if (isa<ConstantSDNode>(Op1)) { - // Convert the (add <ptr>, <const>) to an indirect address contained - // in a register. Note that this is done because we need to avoid - // creating a 0(reg) d-form address due to the SPU's block loads. - basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); - basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); - } else { - // Convert the (add <arg1>, <arg2>) to an indirect address, which - // will likely be lowered as a reg(reg) x-form address. 
- basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); - } - } else { + if ((offset & ~0xf) > 0) { basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, - DAG.getConstant(0, PtrVT)); + DAG.getConstant((offset & ~0xf), PtrVT)); } - - // Insertion point is solely determined by basePtr's contents - insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + } else { + // Otherwise, assume it's at byte 0 of basePtr + insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, basePtr, DAG.getConstant(0, PtrVT)); } + } else { + // Unaligned load: must be more pessimistic about addressing modes: + if (basePtr.getOpcode() == ISD::ADD) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass); + SDValue Flag; + + SDValue Op0 = basePtr.getOperand(0); + SDValue Op1 = basePtr.getOperand(1); + + if (isa<ConstantSDNode>(Op1)) { + // Convert the (add <ptr>, <const>) to an indirect address contained + // in a register. Note that this is done because we need to avoid + // creating a 0(reg) d-form address due to the SPU's block loads. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag); + basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT); + } else { + // Convert the (add <arg1>, <arg2>) to an indirect address, which + // will likely be lowered as a reg(reg) x-form address. + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1); + } + } else { + basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } - // Load the memory to which to store. - alignLoadVec = DAG.getLoad(vecVT, dl, the_chain, basePtr, - SN->getSrcValue(), SN->getSrcValueOffset(), - SN->isVolatile(), SN->isNonTemporal(), 16); + // Insertion point is solely determined by basePtr's contents + insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT, + basePtr, + DAG.getConstant(0, PtrVT)); + } + + // Load the lower part of the memory to which to store. 
+ SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr, + lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16); + // if we don't need to store over the 16 byte boundary, one store suffices + if (alignment >= StVT.getSizeInBits()/8) { // Update the chain - the_chain = alignLoadVec.getValue(1); + the_chain = low.getValue(1); - LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec); + LoadSDNode *LN = cast<LoadSDNode>(low); SDValue theValue = SN->getValue(); - SDValue result; if (StVT != VT && (theValue.getOpcode() == ISD::AssertZext @@ -848,44 +880,114 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) { theValue); result = DAG.getNode(SPUISD::SHUFB, dl, vecVT, - vectorizeOp, alignLoadVec, + vectorizeOp, low, DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, insertEltOp)); result = DAG.getStore(the_chain, dl, result, basePtr, - LN->getSrcValue(), LN->getSrcValueOffset(), + lowMemPtr, LN->isVolatile(), LN->isNonTemporal(), - LN->getAlignment()); + 16); -#if 0 && !defined(NDEBUG) - if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) { - const SDValue ¤tRoot = DAG.getRoot(); - - DAG.setRoot(result); - errs() << "------- CellSPU:LowerStore result:\n"; - DAG.dump(); - errs() << "-------\n"; - DAG.setRoot(currentRoot); - } -#endif - - return result; - /*UNREACHED*/ - } - case ISD::PRE_INC: - case ISD::PRE_DEC: - case ISD::POST_INC: - case ISD::POST_DEC: - case ISD::LAST_INDEXED_MODE: - { - report_fatal_error("LowerLOAD: Got a LoadSDNode with an addr mode other " - "than UNINDEXED\n" + - Twine((unsigned)SN->getAddressingMode())); - /*NOTREACHED*/ - } } + // do the store when it might cross the 16 byte memory access boundary. + else { + // TODO issue a warning if SN->isVolatile()== true? This is likely not + // what the user wanted. + + // address offset from nearest lower 16byte alinged address + SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32, + SN->getBasePtr(), + DAG.getConstant(0xf, MVT::i32)); + // 16 - offset + SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + offset); + SDValue hi_shift = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( VT.getSizeInBits()/8, + MVT::i32), + offset_compl); + // 16 - sizeof(Value) + SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant( 16, MVT::i32), + DAG.getConstant( VT.getSizeInBits()/8, + MVT::i32)); + // get a registerfull of ones + SDValue ones = DAG.getConstant(-1, MVT::v4i32); + ones = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, ones); + + // Create the 128 bit masks that have ones where the data to store is + // located. + SDValue lowmask, himask; + // if the value to store don't fill up the an entire 128 bits, zero + // out the last bits of the mask so that only the value we want to store + // is masked. + // this is e.g. 
in the case of store i32, align 2 + if (!VT.isVector()){ + Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus); + lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + surplus); + Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value); + Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask); + + } + else { + lowmask = ones; + Value = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, Value); + } + // this will zero, if there are no data that goes to the high quad + himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask, + offset_compl); + lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask, + offset); + + // Load in the old data and zero out the parts that will be overwritten with + // the new data to store. + SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1), + hi.getValue(1)); + + low = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, low), + DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones)); + hi = DAG.getNode(ISD::AND, dl, MVT::i128, + DAG.getNode( ISD::BIT_CONVERT, dl, MVT::i128, hi), + DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones)); + + // Shift the Value to store into place. rlow contains the parts that go to + // the lower memory chunk, rhi has the parts that go to the upper one. + SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset); + rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask); + SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value, + offset_compl); + + // Merge the old data and the new data and store the results + // Need to convert vectors here to integer as 'OR'ing floats assert + rlow = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, low), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rlow)); + rhi = DAG.getNode(ISD::OR, dl, MVT::i128, + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, hi), + DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, rhi)); + + low = DAG.getStore(the_chain, dl, rlow, basePtr, + lowMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + hi = DAG.getStore(the_chain, dl, rhi, + DAG.getNode(ISD::ADD, dl, PtrVT, basePtr, + DAG.getConstant( 16, PtrVT)), + highMemPtr, + SN->isVolatile(), SN->isNonTemporal(), 16); + result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0), + hi.getValue(0)); + } - return SDValue(); + return result; } //! Generate the address of a constant pool entry. 
@@ -1080,7 +1182,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, // or we're forced to do vararg int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0, false, false, 0); + ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), + false, false, 0); ArgOffset += StackSlotSize; } @@ -1119,7 +1222,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass); SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8); - SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0, + SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(), false, false, 0); Chain = Store.getOperand(0); MemOps.push_back(Store); @@ -1219,7 +1322,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee, if (ArgRegIdx != NumArgRegs) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0, + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0)); ArgOffset += StackSlotSize; } @@ -1735,9 +1839,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned CurrElt = 0; unsigned MaxElts = VecVT.getVectorNumElements(); unsigned PrevElt = 0; - unsigned V0Elt = 0; bool monotonic = true; bool rotate = true; + int rotamt=0; EVT maskVT; // which of the c?d instructions to use if (EltVT == MVT::i8) { @@ -1781,14 +1885,13 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (PrevElt > 0 && SrcElt < MaxElts) { if ((PrevElt == SrcElt - 1) || (PrevElt == MaxElts - 1 && SrcElt == 0)) { + rotamt = SrcElt-i; PrevElt = SrcElt; - if (SrcElt == 0) - V0Elt = i; } else { rotate = false; } - } else if (i == 0) { - // First time through, need to keep track of previous element + } else if (i == 0 || (PrevElt==0 && SrcElt==1)) { + // First time or after a "wrap around" PrevElt = SrcElt; } else { // This isn't a rotation, takes elements from vector 2 @@ -1813,8 +1916,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1, ShufMaskOp); } else if (rotate) { - int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8; - + if (rotamt < 0) + rotamt +=MaxElts; + rotamt *= EltVT.getSizeInBits()/8; return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(), V1, DAG.getConstant(rotamt, MVT::i16)); } else { @@ -1999,7 +2103,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(scaleShift, MVT::i32)); } - vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt); + vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt); // Replicate the bytes starting at byte 0 across the entire vector (for // consistency with the notion of a unified register set) @@ -2053,20 +2157,21 @@ static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { SDValue IdxOp = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); + EVT eltVT = ValOp.getValueType(); // use 0 when the lane to insert to is 'undef' - int64_t Idx=0; + int64_t Offset=0; if (IdxOp.getOpcode() != ISD::UNDEF) { ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp); assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!"); - Idx = (CN->getSExtValue()); + Offset = (CN->getSExtValue()) 
* eltVT.getSizeInBits()/8; } EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); // Use $sp ($1) because it's always 16-byte aligned and it's available: SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, DAG.getRegister(SPU::R1, PtrVT), - DAG.getConstant(Idx, PtrVT)); + DAG.getConstant(Offset, PtrVT)); // widen the mask when dealing with half vectors EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(), 128/ VT.getVectorElementType().getSizeInBits()); @@ -2574,7 +2679,7 @@ static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) SDValue Op0 = Op.getOperand(0); EVT Op0VT = Op0.getValueType(); - if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) { + if (Op0VT == MVT::i128 && simpleVT == MVT::i64) { // Create shuffle mask, least significant doubleword of quadword unsigned maskHigh = 0x08090a0b; unsigned maskLow = 0x0c0d0e0f; @@ -2639,11 +2744,16 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0), DAG.getConstant(31, MVT::i32)); + // reinterpret as a i128 (SHUFB requires it). This gets lowered away. + SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op0VT, Op0, + DAG.getTargetConstant( + SPU::GPRCRegClass.getID(), + MVT::i32)), 0); // Shuffle bytes - Copy the sign bits into the upper 64 bits // and the input value into the lower 64 bits. SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt, - DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i128, Op0), sraVal, shufMask); - + extended, sraVal, shufMask); return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle); } @@ -2902,8 +3012,8 @@ SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const } break; } - case SPUISD::SHLQUAD_L_BITS: - case SPUISD::SHLQUAD_L_BYTES: + case SPUISD::SHL_BITS: + case SPUISD::SHL_BYTES: case SPUISD::ROTBYTES_LEFT: { SDValue Op1 = N->getOperand(1); @@ -2981,6 +3091,38 @@ SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const return TargetLowering::getConstraintType(ConstraintLetter); } +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +SPUTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break;
+ //FIXME: Seems like the supported constraint letters were just copied + // from PPC, as the following doesn't correspond to the GCC docs. + // I'm leaving it so until someone adds the corresponding lowering support. + case 'b': + case 'r': + case 'f': + case 'd': + case 'v': + case 'y': + weight = CW_Register; + break; + } + return weight; +} + std::pair<unsigned, const TargetRegisterClass*> SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const @@ -3085,3 +3227,29 @@ SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The SPU target isn't yet aware of offsets. return false; } + +// can we compare to Imm without writing it into a register? +bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + //ceqi, cgti, etc. all take s10 operand + return isInt<10>(Imm); +} + +bool +SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM, + const Type * ) const{ + + // A-form: 18bit absolute address. + if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0) + return true; + + // D-form: reg + 14bit offset + if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs)) + return true; + + // X-form: reg+reg + if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0) + return true; + + return false; +} + diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index 6d3c90b..82f1027 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -41,8 +41,9 @@ namespace llvm { CNTB, ///< Count leading ones in bytes PREFSLOT2VEC, ///< Promote scalar->vector VEC2PREFSLOT, ///< Extract element 0 - SHLQUAD_L_BITS, ///< Rotate quad left, by bits - SHLQUAD_L_BYTES, ///< Rotate quad left, by bytes + SHL_BITS, ///< Shift quad left, by bits + SHL_BYTES, ///< Shift quad left, by bytes + SRL_BYTES, ///< Shift quad right, by bytes. Insert zeros. VEC_ROTL, ///< Vector rotate left VEC_ROTR, ///< Vector rotate right ROTBYTES_LEFT, ///< Rotate bytes (loads -> ROTQBYI) @@ -129,6 +130,11 @@ namespace llvm { ConstraintType getConstraintType(const std::string &ConstraintLetter) const; + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const; + std::pair<unsigned, const TargetRegisterClass*> getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; @@ -170,6 +176,11 @@ namespace llvm { const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; + + virtual bool isLegalICmpImmediate(int64_t Imm) const; + + virtual bool isLegalAddressingMode(const AddrMode &AM, + const Type *Ty) const; }; } diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td index ca0fe00..50f688a 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.td +++ b/lib/Target/CellSPU/SPUInstrInfo.td @@ -1385,59 +1385,6 @@ class ORRegInst<RegisterClass rclass>: ORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB), [(set rclass:$rT, (or rclass:$rA, rclass:$rB))]>; -// ORCvtForm: OR conversion form -// -// This is used to "convert" the preferred slot to its vector equivalent, as -// well as convert a vector back to its preferred slot. -// -// These are effectively no-ops, but need to exist for proper type conversion -// and type coercion. 
- -class ORCvtForm<dag OOL, dag IOL, list<dag> pattern = [/* no pattern */]> - : SPUInstr<OOL, IOL, "or\t$rT, $rA, $rA", IntegerOp> { - bits<7> RA; - bits<7> RT; - - let Pattern = pattern; - - let Inst{0-10} = 0b10000010000; - let Inst{11-17} = RA; - let Inst{18-24} = RA; - let Inst{25-31} = RT; -} - -class ORPromoteScalar<RegisterClass rclass>: - ORCvtForm<(outs VECREG:$rT), (ins rclass:$rA)>; - -class ORExtractElt<RegisterClass rclass>: - ORCvtForm<(outs rclass:$rT), (ins VECREG:$rA)>; - -/* class ORCvtRegGPRC<RegisterClass rclass>: - ORCvtForm<(outs GPRC:$rT), (ins rclass:$rA)>; */ - -/* class ORCvtGPRCReg<RegisterClass rclass>: - ORCvtForm<(outs rclass:$rT), (ins GPRC:$rA)>; */ - -class ORCvtFormR32Reg<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs rclass:$rT), (ins R32C:$rA), pattern>; - -class ORCvtFormRegR32<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs R32C:$rT), (ins rclass:$rA), pattern>; - -class ORCvtFormR64Reg<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs rclass:$rT), (ins R64C:$rA), pattern>; - -class ORCvtFormRegR64<RegisterClass rclass, list<dag> pattern = [ ]>: - ORCvtForm<(outs R64C:$rT), (ins rclass:$rA), pattern>; - -class ORCvtGPRCVec: - ORCvtForm<(outs VECREG:$rT), (ins GPRC:$rA)>; - -class ORCvtVecGPRC: - ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>; - -class ORCvtVecVec: - ORCvtForm<(outs VECREG:$rT), (ins VECREG:$rA)>; multiclass BitwiseOr { @@ -1468,119 +1415,48 @@ multiclass BitwiseOr def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB), [/* no pattern */]>; - - // scalar->vector promotion, prefslot2vec: - def v16i8_i8: ORPromoteScalar<R8C>; - def v8i16_i16: ORPromoteScalar<R16C>; - def v4i32_i32: ORPromoteScalar<R32C>; - def v2i64_i64: ORPromoteScalar<R64C>; - def v4f32_f32: ORPromoteScalar<R32FP>; - def v2f64_f64: ORPromoteScalar<R64FP>; - - // vector->scalar demotion, vec2prefslot: - def i8_v16i8: ORExtractElt<R8C>; - def i16_v8i16: ORExtractElt<R16C>; - def i32_v4i32: ORExtractElt<R32C>; - def i64_v2i64: ORExtractElt<R64C>; - def f32_v4f32: ORExtractElt<R32FP>; - def f64_v2f64: ORExtractElt<R64FP>; - - // Conversion from vector to GPRC - def i128_vec: ORCvtVecGPRC; - - // Conversion from GPRC to vector - def vec_i128: ORCvtGPRCVec; - -/* - // Conversion from register to GPRC - def i128_r64: ORCvtRegGPRC<R64C>; - def i128_f64: ORCvtRegGPRC<R64FP>; - def i128_r32: ORCvtRegGPRC<R32C>; - def i128_f32: ORCvtRegGPRC<R32FP>; - def i128_r16: ORCvtRegGPRC<R16C>; - def i128_r8: ORCvtRegGPRC<R8C>; - - // Conversion from GPRC to register - def r64_i128: ORCvtGPRCReg<R64C>; - def f64_i128: ORCvtGPRCReg<R64FP>; - def r32_i128: ORCvtGPRCReg<R32C>; - def f32_i128: ORCvtGPRCReg<R32FP>; - def r16_i128: ORCvtGPRCReg<R16C>; - def r8_i128: ORCvtGPRCReg<R8C>; -*/ -/* - // Conversion from register to R32C: - def r32_r16: ORCvtFormRegR32<R16C>; - def r32_r8: ORCvtFormRegR32<R8C>; - - // Conversion from R32C to register - def r32_r16: ORCvtFormR32Reg<R16C>; - def r32_r8: ORCvtFormR32Reg<R8C>; -*/ - - // Conversion from R64C to register: - def r32_r64: ORCvtFormR64Reg<R32C>; - // def r16_r64: ORCvtFormR64Reg<R16C>; - // def r8_r64: ORCvtFormR64Reg<R8C>; - - // Conversion to R64C from register: - def r64_r32: ORCvtFormRegR64<R32C>; - // def r64_r16: ORCvtFormRegR64<R16C>; - // def r64_r8: ORCvtFormRegR64<R8C>; - - // bitconvert patterns: - def r32_f32: ORCvtFormR32Reg<R32FP, - [(set R32FP:$rT, (bitconvert R32C:$rA))]>; - def f32_r32: ORCvtFormRegR32<R32FP, - [(set R32C:$rT, (bitconvert R32FP:$rA))]>; - - def r64_f64: 
ORCvtFormR64Reg<R64FP, - [(set R64FP:$rT, (bitconvert R64C:$rA))]>; - def f64_r64: ORCvtFormRegR64<R64FP, - [(set R64C:$rT, (bitconvert R64FP:$rA))]>; } defm OR : BitwiseOr; -// scalar->vector promotion patterns (preferred slot to vector): +//===----------------------------------------------------------------------===// +// SPU::PREFSLOT2VEC and VEC2PREFSLOT re-interpretations of registers +//===----------------------------------------------------------------------===// def : Pat<(v16i8 (SPUprefslot2vec R8C:$rA)), - (ORv16i8_i8 R8C:$rA)>; + (COPY_TO_REGCLASS R8C:$rA, VECREG)>; def : Pat<(v8i16 (SPUprefslot2vec R16C:$rA)), - (ORv8i16_i16 R16C:$rA)>; + (COPY_TO_REGCLASS R16C:$rA, VECREG)>; def : Pat<(v4i32 (SPUprefslot2vec R32C:$rA)), - (ORv4i32_i32 R32C:$rA)>; + (COPY_TO_REGCLASS R32C:$rA, VECREG)>; def : Pat<(v2i64 (SPUprefslot2vec R64C:$rA)), - (ORv2i64_i64 R64C:$rA)>; + (COPY_TO_REGCLASS R64C:$rA, VECREG)>; def : Pat<(v4f32 (SPUprefslot2vec R32FP:$rA)), - (ORv4f32_f32 R32FP:$rA)>; + (COPY_TO_REGCLASS R32FP:$rA, VECREG)>; def : Pat<(v2f64 (SPUprefslot2vec R64FP:$rA)), - (ORv2f64_f64 R64FP:$rA)>; + (COPY_TO_REGCLASS R64FP:$rA, VECREG)>; + +def : Pat<(i8 (SPUvec2prefslot (v16i8 VECREG:$rA))), + (COPY_TO_REGCLASS (v16i8 VECREG:$rA), R8C)>; -// ORi*_v*: Used to extract vector element 0 (the preferred slot), otherwise -// known as converting the vector back to its preferred slot +def : Pat<(i16 (SPUvec2prefslot (v8i16 VECREG:$rA))), + (COPY_TO_REGCLASS (v8i16 VECREG:$rA), R16C)>; -def : Pat<(SPUvec2prefslot (v16i8 VECREG:$rA)), - (ORi8_v16i8 VECREG:$rA)>; +def : Pat<(i32 (SPUvec2prefslot (v4i32 VECREG:$rA))), + (COPY_TO_REGCLASS (v4i32 VECREG:$rA), R32C)>; -def : Pat<(SPUvec2prefslot (v8i16 VECREG:$rA)), - (ORi16_v8i16 VECREG:$rA)>; +def : Pat<(i64 (SPUvec2prefslot (v2i64 VECREG:$rA))), + (COPY_TO_REGCLASS (v2i64 VECREG:$rA), R64C)>; -def : Pat<(SPUvec2prefslot (v4i32 VECREG:$rA)), - (ORi32_v4i32 VECREG:$rA)>; +def : Pat<(f32 (SPUvec2prefslot (v4f32 VECREG:$rA))), + (COPY_TO_REGCLASS (v4f32 VECREG:$rA), R32FP)>; -def : Pat<(SPUvec2prefslot (v2i64 VECREG:$rA)), - (ORi64_v2i64 VECREG:$rA)>; - -def : Pat<(SPUvec2prefslot (v4f32 VECREG:$rA)), - (ORf32_v4f32 VECREG:$rA)>; - -def : Pat<(SPUvec2prefslot (v2f64 VECREG:$rA)), - (ORf64_v2f64 VECREG:$rA)>; +def : Pat<(f64 (SPUvec2prefslot (v2f64 VECREG:$rA))), + (COPY_TO_REGCLASS (v2f64 VECREG:$rA), R64FP)>; // Load Register: This is an assembler alias for a bitwise OR of a register // against itself. 
It's here because it brings some clarity to assembly @@ -2493,10 +2369,13 @@ class ROTQBYInst<dag OOL, dag IOL, list<dag> pattern>: RRForm<0b00111011100, OOL, IOL, "rotqby\t$rT, $rA, $rB", RotateShift, pattern>; -class ROTQBYVecInst<ValueType vectype>: - ROTQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB), - [(set (vectype VECREG:$rT), - (SPUrotbytes_left (vectype VECREG:$rA), R32C:$rB))]>; +class ROTQBYGenInst<ValueType type, RegisterClass rc>: + ROTQBYInst<(outs rc:$rT), (ins rc:$rA, R32C:$rB), + [(set (type rc:$rT), + (SPUrotbytes_left (type rc:$rA), R32C:$rB))]>; + +class ROTQBYVecInst<ValueType type>: + ROTQBYGenInst<type, VECREG>; multiclass RotateQuadLeftByBytes { @@ -2506,6 +2385,7 @@ multiclass RotateQuadLeftByBytes def v4f32: ROTQBYVecInst<v4f32>; def v2i64: ROTQBYVecInst<v2i64>; def v2f64: ROTQBYVecInst<v2f64>; + def i128: ROTQBYGenInst<i128, GPRC>; } defm ROTQBY: RotateQuadLeftByBytes; @@ -2518,10 +2398,13 @@ class ROTQBYIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b00111111100, OOL, IOL, "rotqbyi\t$rT, $rA, $val", RotateShift, pattern>; +class ROTQBYIGenInst<ValueType type, RegisterClass rclass>: + ROTQBYIInst<(outs rclass:$rT), (ins rclass:$rA, u7imm:$val), + [(set (type rclass:$rT), + (SPUrotbytes_left (type rclass:$rA), (i16 uimm7:$val)))]>; + class ROTQBYIVecInst<ValueType vectype>: - ROTQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val), - [(set (vectype VECREG:$rT), - (SPUrotbytes_left (vectype VECREG:$rA), (i16 uimm7:$val)))]>; + ROTQBYIGenInst<vectype, VECREG>; multiclass RotateQuadByBytesImm { @@ -2531,6 +2414,7 @@ multiclass RotateQuadByBytesImm def v4f32: ROTQBYIVecInst<v4f32>; def v2i64: ROTQBYIVecInst<v2i64>; def vfi64: ROTQBYIVecInst<v2f64>; + def i128: ROTQBYIGenInst<i128, GPRC>; } defm ROTQBYI: RotateQuadByBytesImm; @@ -2785,6 +2669,10 @@ multiclass RotateQuadBytes defm ROTQMBY : RotateQuadBytes; +def : Pat<(SPUsrl_bytes GPRC:$rA, R32C:$rB), + (ROTQMBYr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0))>; + class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>: RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val", RotateShift, pattern>; @@ -2873,6 +2761,11 @@ multiclass RotateMaskQuadByBits defm ROTQMBI: RotateMaskQuadByBits; +def : Pat<(srl GPRC:$rA, R32C:$rB), + (ROTQMBIr128 GPRC:$rA, + (SFIr32 R32C:$rB, 0))>; + + //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ // Rotate quad and mask by bits, immediate //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~ @@ -4379,30 +4272,43 @@ def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VECREG:$src))), (v2f64 VECREG:$src)>; def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v8i16 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v4i32 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v2i64 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v4f32 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(i128 (bitconvert (v2f64 VECREG:$src))), - (ORi128_vec VECREG:$src)>; + (COPY_TO_REGCLASS VECREG:$src, GPRC)>; def : Pat<(v16i8 (bitconvert (i128 GPRC:$src))), - (v16i8 (ORvec_i128 GPRC:$src))>; + (v16i8 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v8i16 (bitconvert (i128 
GPRC:$src))), - (v8i16 (ORvec_i128 GPRC:$src))>; + (v8i16 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v4i32 (bitconvert (i128 GPRC:$src))), - (v4i32 (ORvec_i128 GPRC:$src))>; + (v4i32 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v2i64 (bitconvert (i128 GPRC:$src))), - (v2i64 (ORvec_i128 GPRC:$src))>; + (v2i64 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v4f32 (bitconvert (i128 GPRC:$src))), - (v4f32 (ORvec_i128 GPRC:$src))>; + (v4f32 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; def : Pat<(v2f64 (bitconvert (i128 GPRC:$src))), - (v2f64 (ORvec_i128 GPRC:$src))>; + (v2f64 (COPY_TO_REGCLASS GPRC:$src, VECREG))>; + +def : Pat<(i32 (bitconvert R32FP:$rA)), + (COPY_TO_REGCLASS R32FP:$rA, R32C)>; + +def : Pat<(f32 (bitconvert R32C:$rA)), + (COPY_TO_REGCLASS R32C:$rA, R32FP)>; + +def : Pat<(i64 (bitconvert R64FP:$rA)), + (COPY_TO_REGCLASS R64FP:$rA, R64C)>; + +def : Pat<(f64 (bitconvert R64C:$rA)), + (COPY_TO_REGCLASS R64C:$rA, R64FP)>; + //===----------------------------------------------------------------------===// // Instruction patterns: @@ -4453,11 +4359,12 @@ def : Pat<(i32 (zext R8C:$rSrc)), // zext 8->64: Zero extend bytes to double words def : Pat<(i64 (zext R8C:$rSrc)), - (ORi64_v2i64 (SELBv4i32 (ROTQMBYv4i32 - (ORv4i32_i32 (ANDIi8i32 R8C:$rSrc, 0xff)), + (COPY_TO_REGCLASS (SELBv4i32 (ROTQMBYv4i32 + (COPY_TO_REGCLASS + (ANDIi8i32 R8C:$rSrc,0xff), VECREG), 0x4), (ILv4i32 0x0), - (FSMBIv4i32 0x0f0f)))>; + (FSMBIv4i32 0x0f0f)), R64C)>; // anyext 8->16: Extend 8->16 bits, irrespective of sign, preserves high bits def : Pat<(i16 (anyext R8C:$rSrc)), @@ -4465,7 +4372,7 @@ def : Pat<(i16 (anyext R8C:$rSrc)), // anyext 8->32: Extend 8->32 bits, irrespective of sign, preserves high bits def : Pat<(i32 (anyext R8C:$rSrc)), - (ORIi8i32 R8C:$rSrc, 0)>; + (COPY_TO_REGCLASS R8C:$rSrc, R32C)>; // sext 16->64: Sign extend halfword to double word def : Pat<(sext_inreg R64C:$rSrc, i16), @@ -4489,7 +4396,7 @@ def : Pat<(i32 (zext (and R16C:$rSrc, 0xfff))), // anyext 16->32: Extend 16->32 bits, irrespective of sign def : Pat<(i32 (anyext R16C:$rSrc)), - (ORIi16i32 R16C:$rSrc, 0)>; + (COPY_TO_REGCLASS R16C:$rSrc, R32C)>; //===----------------------------------------------------------------------===// // Truncates: @@ -4498,61 +4405,61 @@ def : Pat<(i32 (anyext R16C:$rSrc)), //===----------------------------------------------------------------------===// def : Pat<(i8 (trunc GPRC:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBgprc GPRC:$src, GPRC:$src, - (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)))>; + (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)), R8C)>; def : Pat<(i8 (trunc R64C:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBv2i64_m32 - (ORv2i64_i64 R64C:$src), - (ORv2i64_i64 R64C:$src), - (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)))>; + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)), R8C)>; def : Pat<(i8 (trunc R32C:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBv4i32_m32 - (ORv4i32_i32 R32C:$src), - (ORv4i32_i32 R32C:$src), - (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>; + (COPY_TO_REGCLASS R32C:$src, VECREG), + (COPY_TO_REGCLASS R32C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)), R8C)>; def : Pat<(i8 (trunc R16C:$src)), - (ORi8_v16i8 + (COPY_TO_REGCLASS (SHUFBv4i32_m32 - (ORv8i16_i16 R16C:$src), - (ORv8i16_i16 R16C:$src), - (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>; + (COPY_TO_REGCLASS R16C:$src, VECREG), + (COPY_TO_REGCLASS R16C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)), R8C)>; def : Pat<(i16 (trunc 
GPRC:$src)), - (ORi16_v8i16 + (COPY_TO_REGCLASS (SHUFBgprc GPRC:$src, GPRC:$src, - (IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)))>; + (IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)), R16C)>; def : Pat<(i16 (trunc R64C:$src)), - (ORi16_v8i16 + (COPY_TO_REGCLASS (SHUFBv2i64_m32 - (ORv2i64_i64 R64C:$src), - (ORv2i64_i64 R64C:$src), - (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)))>; + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)), R16C)>; def : Pat<(i16 (trunc R32C:$src)), - (ORi16_v8i16 + (COPY_TO_REGCLASS (SHUFBv4i32_m32 - (ORv4i32_i32 R32C:$src), - (ORv4i32_i32 R32C:$src), - (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)))>; + (COPY_TO_REGCLASS R32C:$src, VECREG), + (COPY_TO_REGCLASS R32C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)), R16C)>; def : Pat<(i32 (trunc GPRC:$src)), - (ORi32_v4i32 + (COPY_TO_REGCLASS (SHUFBgprc GPRC:$src, GPRC:$src, - (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)))>; + (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)), R32C)>; def : Pat<(i32 (trunc R64C:$src)), - (ORi32_v4i32 + (COPY_TO_REGCLASS (SHUFBv2i64_m32 - (ORv2i64_i64 R64C:$src), - (ORv2i64_i64 R64C:$src), - (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)))>; + (COPY_TO_REGCLASS R64C:$src, VECREG), + (COPY_TO_REGCLASS R64C:$src, VECREG), + (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)), R32C)>; //===----------------------------------------------------------------------===// // Address generation: SPU, like PPC, has to split addresses into high and diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td index 647da30..dc3dc8c 100644 --- a/lib/Target/CellSPU/SPUNodes.td +++ b/lib/Target/CellSPU/SPUNodes.td @@ -83,10 +83,6 @@ def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>; // SPUISelLowering.h): def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>; -// Shift left quadword by bits and bytes -def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>; -def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>; - // Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only): def SPUvec_shl: SDNode<"ISD::SHL", SPUvecshift_type, []>; def SPUvec_srl: SDNode<"ISD::SRL", SPUvecshift_type, []>; @@ -105,6 +101,12 @@ def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT", def SPUrotbytes_left_bits : SDNode<"SPUISD::ROTBYTES_LEFT_BITS", SPUvecshift_type>; +// Shift entire quad left by bytes/bits. Zeros are shifted in on the right +// SHL_BITS the same as SHL for i128, but ISD::SHL is not implemented for i128 +def SPUshlquad_l_bytes: SDNode<"SPUISD::SHL_BYTES", SPUvecshift_type, []>; +def SPUshlquad_l_bits: SDNode<"SPUISD::SHL_BITS", SPUvecshift_type, []>; +def SPUsrl_bytes: SDNode<"SPUISD::SRL_BYTES", SPUvecshift_type, []>; + // SPU form select mask for bytes, immediate def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>; diff --git a/lib/Target/CellSPU/SPUOperands.td b/lib/Target/CellSPU/SPUOperands.td index 6216651..3c8b3c3 100644 --- a/lib/Target/CellSPU/SPUOperands.td +++ b/lib/Target/CellSPU/SPUOperands.td @@ -98,12 +98,6 @@ def immU8 : PatLeaf<(imm), [{ return (N->getZExtValue() <= 0xff); }]>; -// i64ImmSExt10 predicate - True if the i64 immediate fits in a 10-bit sign -// extended field. Used by RI10Form instructions like 'ldq'. -def i64ImmSExt10 : PatLeaf<(imm), [{ - return isI64IntS10Immediate(N); -}]>; - // i32ImmSExt10 predicate - True if the i32 immediate fits in a 10-bit sign // extended field. Used by RI10Form instructions like 'ldq'. 
def i32ImmSExt10 : PatLeaf<(imm), [{ @@ -660,7 +654,11 @@ def memrr : Operand<iPTR> { // A-form : abs (256K LSA offset) // D-form(2): [r+I7] (7-bit signed offset + reg) -def dform_addr : ComplexPattern<iPTR, 2, "SelectDFormAddr", [], []>; -def xform_addr : ComplexPattern<iPTR, 2, "SelectXFormAddr", [], []>; -def aform_addr : ComplexPattern<iPTR, 2, "SelectAFormAddr", [], []>; -def dform2_addr : ComplexPattern<iPTR, 2, "SelectDForm2Addr", [], []>; +def dform_addr : ComplexPattern<iPTR, 2, "SelectDFormAddr", + [], [SDNPWantRoot]>; +def xform_addr : ComplexPattern<iPTR, 2, "SelectXFormAddr", + [], [SDNPWantRoot]>; +def aform_addr : ComplexPattern<iPTR, 2, "SelectAFormAddr", + [], [SDNPWantRoot]>; +def dform2_addr : ComplexPattern<iPTR, 2, "SelectDForm2Addr", + [], [SDNPWantRoot]>; diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index cf71891..0928a9c 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -329,44 +329,6 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, } } -/// determineFrameLayout - Determine the size of the frame and maximum call -/// frame size. -void -SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const -{ - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Get the number of bytes to allocate from the FrameInfo - unsigned FrameSize = MFI->getStackSize(); - - // Get the alignments provided by the target, and the maximum alignment - // (if any) of the fixed frame objects. - unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment(); - unsigned Align = std::max(TargetAlign, MFI->getMaxAlignment()); - assert(isPowerOf2_32(Align) && "Alignment is not power of 2"); - unsigned AlignMask = Align - 1; - - // Get the maximum call frame size of all the calls. - unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); - - // If we have dynamic alloca then maxCallFrameSize needs to be aligned so - // that allocations will be aligned. - if (MFI->hasVarSizedObjects()) - maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; - - // Update maximum call frame size. - MFI->setMaxCallFrameSize(maxCallFrameSize); - - // Include call frame size in total. - FrameSize += maxCallFrameSize; - - // Make sure the frame is aligned. - FrameSize = (FrameSize + AlignMask) & ~AlignMask; - - // Update frame info. - MFI->setStackSize(FrameSize); -} - void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { @@ -378,174 +340,11 @@ void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, MF.getRegInfo().setPhysRegUnused(SPU::R1); MF.getRegInfo().setPhysRegUnused(SPU::R2); - MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterClass *RC = &SPU::R32CRegClass; RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); - - -} - -void SPURegisterInfo::emitPrologue(MachineFunction &MF) const -{ - MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB - MachineBasicBlock::iterator MBBI = MBB.begin(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); - - // Prepare for debug frame info. - bool hasDebugInfo = MMI.hasDebugInfo(); - MCSymbol *FrameLabel = 0; - - // Move MBBI back to the beginning of the function. - MBBI = MBB.begin(); - - // Work out frame sizes. 
- determineFrameLayout(MF); - int FrameSize = MFI->getStackSize(); - - assert((FrameSize & 0xf) == 0 - && "SPURegisterInfo::emitPrologue: FrameSize not aligned"); - - // the "empty" frame size is 16 - just the register scavenger spill slot - if (FrameSize > 16 || MFI->adjustsStack()) { - FrameSize = -(FrameSize + SPUFrameInfo::minStackSize()); - if (hasDebugInfo) { - // Mark effective beginning of when frame pointer becomes valid. - FrameLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(FrameLabel); - } - - // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp) - // for the ABI - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16) - .addReg(SPU::R1); - if (isInt<10>(FrameSize)) { - // Spill $sp to adjusted $sp - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize) - .addReg(SPU::R1); - // Adjust $sp by required amout - BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1) - .addImm(FrameSize); - } else if (isInt<16>(FrameSize)) { - // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use - // $r2 to adjust $sp: - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) - .addImm(-16) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) - .addImm(FrameSize); - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQXr32), SPU::R1) - .addReg(SPU::R2) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) - .addReg(SPU::R1) - .addReg(SPU::R2); - BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2) - .addReg(SPU::R2) - .addImm(16); - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) - .addReg(SPU::R2) - .addReg(SPU::R1); - } else { - report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); - } - - if (hasDebugInfo) { - std::vector<MachineMove> &Moves = MMI.getFrameMoves(); - - // Show update of SP. - MachineLocation SPDst(MachineLocation::VirtualFP); - MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize); - Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); - - // Add callee saved registers to move list. - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - for (unsigned I = 0, E = CSI.size(); I != E; ++I) { - int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); - unsigned Reg = CSI[I].getReg(); - if (Reg == SPU::R0) continue; - MachineLocation CSDst(MachineLocation::VirtualFP, Offset); - MachineLocation CSSrc(Reg); - Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc)); - } - - // Mark effective beginning of when frame pointer is ready. - MCSymbol *ReadyLabel = MMI.getContext().CreateTempSymbol(); - BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)).addSym(ReadyLabel); - - MachineLocation FPDst(SPU::R1); - MachineLocation FPSrc(MachineLocation::VirtualFP); - Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc)); - } - } else { - // This is a leaf function -- insert a branch hint iff there are - // sufficient number instructions in the basic block. Note that - // this is just a best guess based on the basic block's size. 
- if (MBB.size() >= (unsigned) SPUFrameInfo::branchHintPenalty()) { - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - dl = MBBI->getDebugLoc(); - - // Insert terminator label - BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL)) - .addSym(MMI.getContext().CreateTempSymbol()); - } - } -} - -void -SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const -{ - MachineBasicBlock::iterator MBBI = prior(MBB.end()); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - int FrameSize = MFI->getStackSize(); - int LinkSlotOffset = SPUFrameInfo::stackSlotSize(); - DebugLoc dl = MBBI->getDebugLoc(); - - assert(MBBI->getOpcode() == SPU::RET && - "Can only insert epilog into returning blocks"); - assert((FrameSize & 0xf) == 0 - && "SPURegisterInfo::emitEpilogue: FrameSize not aligned"); - - // the "empty" frame size is 16 - just the register scavenger spill slot - if (FrameSize > 16 || MFI->adjustsStack()) { - FrameSize = FrameSize + SPUFrameInfo::minStackSize(); - if (isInt<10>(FrameSize + LinkSlotOffset)) { - // Reload $lr, adjust $sp by required amount - // Note: We do this to slightly improve dual issue -- not by much, but it - // is an opportunity for dual issue. - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) - .addImm(FrameSize + LinkSlotOffset) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1) - .addReg(SPU::R1) - .addImm(FrameSize); - } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) { - // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use - // $r2 to adjust $sp: - BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2) - .addImm(16) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2) - .addImm(FrameSize); - BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1) - .addReg(SPU::R1) - .addReg(SPU::R2); - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0) - .addImm(16) - .addReg(SPU::R1); - BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2). - addReg(SPU::R2) - .addImm(16); - BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2) - .addReg(SPU::R2) - .addReg(SPU::R1); - } else { - report_fatal_error("Unhandled frame size: " + Twine(FrameSize)); - } - } } unsigned @@ -576,10 +375,10 @@ SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const { return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0); } -int +int SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const { - switch(dFormOpcode) + switch(dFormOpcode) { case SPU::AIr32: return SPU::Ar32; case SPU::LQDr32: return SPU::LQXr32; @@ -602,10 +401,10 @@ SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const // TODO this is already copied from PPC. Could this convenience function // be moved to the RegScavenger class? -unsigned -SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, +unsigned +SPURegisterInfo::findScratchRegister(MachineBasicBlock::iterator II, RegScavenger *RS, - const TargetRegisterClass *RC, + const TargetRegisterClass *RC, int SPAdj) const { assert(RS && "Register scavenging must be on"); diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h index aedb769..088157e 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.h +++ b/lib/Target/CellSPU/SPURegisterInfo.h @@ -33,7 +33,7 @@ namespace llvm { public: SPURegisterInfo(const SPUSubtarget &subtarget, const TargetInstrInfo &tii); - + //! Translate a register's enum value to a register number /*! 
This method translates a register's enum value to it's regiser number, @@ -65,15 +65,10 @@ namespace llvm { //! Convert frame indicies into machine operands void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS = NULL) const; - //! Determine the frame's layour - void determineFrameLayout(MachineFunction &MF) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = NULL) const; - //! Emit the function prologue - void emitPrologue(MachineFunction &MF) const; - //! Emit the function epilogue - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + //! Get return address register (LR, aka R0) unsigned getRARegister() const; //! Get the stack frame register (SP, aka R1) diff --git a/lib/Target/CellSPU/SPUSchedule.td b/lib/Target/CellSPU/SPUSchedule.td index a0b581f..f4d082c 100644 --- a/lib/Target/CellSPU/SPUSchedule.td +++ b/lib/Target/CellSPU/SPUSchedule.td @@ -36,7 +36,7 @@ def RotateShift : InstrItinClass; // EVEN_UNIT def ImmLoad : InstrItinClass; // EVEN_UNIT /* Note: The itinerary for the Cell SPU is somewhat contrived... */ -def SPUItineraries : ProcessorItineraries<[ODD_UNIT, EVEN_UNIT], [ +def SPUItineraries : ProcessorItineraries<[ODD_UNIT, EVEN_UNIT], [], [ InstrItinData<LoadStore , [InstrStage<6, [ODD_UNIT]>]>, InstrItinData<BranchHints , [InstrStage<6, [ODD_UNIT]>]>, InstrItinData<BranchResolv, [InstrStage<4, [ODD_UNIT]>]>, diff --git a/lib/Target/CellSPU/SPUSubtarget.h b/lib/Target/CellSPU/SPUSubtarget.h index 88201c6..147163d 100644 --- a/lib/Target/CellSPU/SPUSubtarget.h +++ b/lib/Target/CellSPU/SPUSubtarget.h @@ -81,7 +81,7 @@ namespace llvm { /// properties of this subtarget. const char *getTargetDataString() const { return "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128" - "-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:128:128-v128:128:128" + "-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:64:128-v128:128:128" "-s:128:128-n32:64"; } }; diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp index 480ec3f..7e16ab5 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.cpp +++ b/lib/Target/CellSPU/SPUTargetMachine.cpp @@ -40,7 +40,7 @@ SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT, Subtarget(TT, FS), DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this), - FrameInfo(*this), + FrameInfo(Subtarget), TLInfo(*this), TSInfo(*this), InstrItins(Subtarget.getInstrItineraryData()) { diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h index 7e02701..e306883 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.h +++ b/lib/Target/CellSPU/SPUTargetMachine.h @@ -75,8 +75,8 @@ public: return &DataLayout; } - virtual const InstrItineraryData getInstrItineraryData() const { - return InstrItins; + virtual const InstrItineraryData *getInstrItineraryData() const { + return &InstrItins; } // Pass Pipeline Configuration |
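Note on the LowerVECTOR_SHUFFLE hunk above: the old V0Elt bookkeeping is replaced by a running rotamt, which can momentarily go negative when the shuffle mask wraps around the end of the vector; the new code then normalises it and converts elements to bytes before emitting SPUISD::ROTBYTES_LEFT. The following is a minimal standalone sketch of that normalisation (not code from the patch); MaxElts and eltBits stand in for the values the lowering derives from the vector type.

```cpp
// Sketch: turn the element-wise rotate amount accumulated while walking the
// shuffle mask into the byte count that ROTBYTES_LEFT / rotqby expects.
#include <cassert>

unsigned rotateBytes(int rotamt, unsigned MaxElts, unsigned eltBits) {
  if (rotamt < 0)              // mask wrapped around, e.g. {1,2,3,0}
    rotamt += MaxElts;         // normalise to a left rotate in elements
  assert(rotamt >= 0 && unsigned(rotamt) < MaxElts);
  return unsigned(rotamt) * (eltBits / 8);   // elements -> bytes
}
// e.g. a v4i32 mask {1,2,3,0}: the last step gives rotamt = 0 - 3 = -3,
// normalised to 1 element, i.e. a 4-byte left rotate of the quadword.
```

With a v4i32 mask of {1,2,3,0}, every element satisfies SrcElt == PrevElt + 1 modulo MaxElts, so the shuffle is still recognised as a rotation and lowers to a single byte rotate instead of a full SHUFB.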
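The patch also adds two TargetLowering hooks for the SPU: isLegalICmpImmediate accepts any value that fits the signed 10-bit field used by ceqi, cgti and friends, and isLegalAddressingMode admits the three native forms (A-form absolute, D-form register plus signed 14-bit offset, X-form register plus register). The sketch below restates the addressing-mode check as a standalone predicate, using a simplified stand-in for llvm::TargetLowering::AddrMode; it is illustrative only, not part of the patch.

```cpp
// Sketch of the addressing-mode legality test added in SPUISelLowering.cpp.
// The AddrMode fields mirror llvm::TargetLowering::AddrMode.
#include <cstdint>

struct AddrMode {
  const void *BaseGV;     // GlobalValue* in LLVM proper
  int64_t     BaseOffs;
  bool        HasBaseReg;
  int64_t     Scale;
};

static bool isSignedN(unsigned N, int64_t x) {
  return x >= -(int64_t(1) << (N - 1)) && x < (int64_t(1) << (N - 1));
}

// A-form: absolute address (GV only, no base register, no offset).
// D-form: base register + signed 14-bit offset.
// X-form: base register + index register (Scale == 1), no offset.
bool spuLegalAddressingMode(const AddrMode &AM) {
  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
    return true;                                            // A-form
  if (!AM.BaseGV && AM.HasBaseReg && AM.Scale == 0 && isSignedN(14, AM.BaseOffs))
    return true;                                            // D-form
  if (!AM.BaseGV && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs == 0)
    return true;                                            // X-form
  return false;
}
```

Passes such as loop strength reduction consult these hooks, so after this change addressing computations and compare immediates that the SPU can encode directly are no longer materialised into separate registers.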