author    | Evan Cheng <evan.cheng@apple.com> | 2009-12-18 07:40:29 +0000
committer | Evan Cheng <evan.cheng@apple.com> | 2009-12-18 07:40:29 +0000
commit    | 400073d5467b79534d8c63b0d996a55e4252ff4b (patch)
tree      | f7204e84da8877e7b062f05bcb1878a05108b44e
parent    | 3a5d409f3c2eccf1d1f0a4616023760829a4db67 (diff)
On recent Intel microarchitectures, folding loads into some unary SSE
instructions can be suboptimal. Specifically, we should avoid folding a load
when the instruction only updates part of the destination register and the
non-updated part is not needed, e.g. cvtss2sd and sqrtss. Unfolding the load
from such an instruction breaks the partial register dependency and can
improve performance, e.g.
movss (%rdi), %xmm0
cvtss2sd %xmm0, %xmm0
instead of
cvtss2sd (%rdi), %xmm0
An alternative way to break the dependency is to clear the destination
register first, e.g.
xorps %xmm0, %xmm0
cvtss2sd (%rdi), %xmm0
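
To make the tradeoff concrete, here is a hypothetical C++ fragment
(illustrative only, not taken from this patch) that hits the pattern: each
iteration's float-to-double extension lowers to cvtss2sd, so with the load
folded every conversion carries a false dependence on the previous contents
of %xmm0's upper bits.

// Illustrative example. Each fpext lowers to cvtss2sd. In the folded form
// (cvtss2sd (%mem), %xmm0) the conversion must wait on the stale upper bits
// of %xmm0; in the unfolded form, movss from memory zeroes the upper bits,
// so the dependence chain is cut.
double sumFloats(const float *x, int n) {
  double sum = 0.0;
  for (int i = 0; i != n; ++i)
    sum += x[i];              // float -> double on the loop-carried path
  return sum;
}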
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@91672 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/X86/X86.td             | 32
-rw-r--r-- | lib/Target/X86/X86InstrInfo.cpp   | 34
-rw-r--r-- | lib/Target/X86/X86InstrInfo.td    |  2
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td     | 20
-rw-r--r-- | lib/Target/X86/X86Subtarget.cpp   |  2
-rw-r--r-- | lib/Target/X86/X86Subtarget.h     |  9
-rw-r--r-- | test/CodeGen/X86/break-sse-dep.ll | 28
7 files changed, 110 insertions, 17 deletions
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index a6e1ca3..313177e 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -57,6 +57,8 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
                                     "Support 64-bit instructions">;
 def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
                                         "Bit testing of memory is slow">;
+def FeatureBreakSSEDep : SubtargetFeature<"break-sse-dep", "BreakSSEDep", "true",
+                       "Should break SSE partial update dep with load / xorps">;
 def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
                                     "Support SSE 4a instructions">;
@@ -86,17 +88,27 @@ def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>;
 def : Proc<"pentium3", [FeatureSSE1]>;
 def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
 def : Proc<"pentium4", [FeatureSSE2]>;
-def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
+                      FeatureBreakSSEDep]>;
+def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem,
+                     FeatureBreakSSEDep]>;
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem,
+                        FeatureBreakSSEDep]>;
+def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem,
+                      FeatureBreakSSEDep]>;
+def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem,
+                     FeatureBreakSSEDep]>;
+def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem,
+                      FeatureBreakSSEDep]>;
+def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem,
+                    FeatureBreakSSEDep]>;
+def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
+                      FeatureBreakSSEDep]>;
+def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
+                       FeatureBreakSSEDep]>;
 // Sandy Bridge does not have FMA
-def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>;
+def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit,
+                           FeatureBreakSSEDep]>;
 def : Proc<"k6", [FeatureMMX]>;
 def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index bc72f63..e1e6ff3 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2370,6 +2370,23 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // Check switch flag
   if (NoFusing) return NULL;
 
+  if (TM.getSubtarget<X86Subtarget>().shouldBreakSSEDep())
+    switch (MI->getOpcode()) {
+    case X86::CVTSD2SSrr:
+    case X86::Int_CVTSD2SSrr:
+    case X86::CVTSS2SDrr:
+    case X86::Int_CVTSS2SDrr:
+    case X86::RCPSSr:
+    case X86::RCPSSr_Int:
+    case X86::ROUNDSDr_Int:
+    case X86::ROUNDSSr_Int:
+    case X86::RSQRTSSr:
+    case X86::RSQRTSSr_Int:
+    case X86::SQRTSSr:
+    case X86::SQRTSSr_Int:
+      return 0;
+    }
+
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   unsigned Size = MFI->getObjectSize(FrameIndex);
   unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
@@ -2405,6 +2422,23 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // Check switch flag
   if (NoFusing) return NULL;
 
+  if (TM.getSubtarget<X86Subtarget>().shouldBreakSSEDep())
+    switch (MI->getOpcode()) {
+    case X86::CVTSD2SSrr:
+    case X86::Int_CVTSD2SSrr:
+    case X86::CVTSS2SDrr:
+    case X86::Int_CVTSS2SDrr:
+    case X86::RCPSSr:
+    case X86::RCPSSr_Int:
+    case X86::ROUNDSDr_Int:
+    case X86::ROUNDSSr_Int:
+    case X86::RSQRTSSr:
+    case X86::RSQRTSSr_Int:
+    case X86::SQRTSSr:
+    case X86::SQRTSSr_Int:
+      return 0;
+    }
+
   // Determine the alignment of the load.
   unsigned Alignment = 0;
   if (LoadMI->hasOneMemOperand())
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 7411dde..0b6efaa 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -301,6 +301,8 @@ def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
 def OptForSpeed : Predicate<"!OptForSize">;
 def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
 def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def SSEBreakDep  : Predicate<"Subtarget->shouldBreakSSEDep() && !OptForSize">;
+def NoSSEBreakDep: Predicate<"!Subtarget->shouldBreakSSEDep() || OptForSize">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ae1a68a..694b91e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -824,9 +824,10 @@ multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
   }
 
   // Scalar operation, mem.
-  def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
-                [(set FR32:$dst, (OpNode (load addr:$src)))]>;
+                [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS,
+            Requires<[HasSSE1, NoSSEBreakDep]>;
 
   // Vector operation, reg.
   def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -1116,9 +1117,10 @@ def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))]>;
-def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
-                   [(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
+                   [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
+                Requires<[HasSSE2, NoSSEBreakDep]>;
 def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
                      "cvtsi2sd\t{$src, $dst|$dst, $src}",
                      [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
@@ -1155,7 +1157,10 @@ def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
 def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
                    [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
-                 Requires<[HasSSE2]>;
+                 Requires<[HasSSE2, NoSSEBreakDep]>;
+
+def : Pat<(extloadf32 addr:$src),
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[SSEBreakDep]>;
 
 // Match intrinsics which expect XMM operand(s).
 def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
@@ -3220,13 +3225,14 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
                     OpSize;
 
   // Vector intrinsic operation, mem
-  def PSm_Int : SS4AIi8<opcps, MRMSrcMem,
+  def PSm_Int : Ii8<opcps, MRMSrcMem,
                     (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                           (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
-                    OpSize;
+                    TA, OpSize,
+                Requires<[HasSSE41, NoSSEBreakDep]>;
 
   // Vector intrinsic operation, reg
   def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 75cdbad..4db3fdb 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -266,6 +266,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
   unsigned Model = 0;
   DetectFamilyModel(EAX, Family, Model);
   IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
+  BreakSSEDep = IsIntel;
 
   GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
   HasX86_64 = (EDX >> 29) & 0x1;
@@ -286,6 +287,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
   , HasFMA3(false)
   , HasFMA4(false)
   , IsBTMemSlow(false)
+  , BreakSSEDep(false)
   , DarwinVers(0)
   , stackAlignment(8)
   // FIXME: this is a known good value for Yonah. How about others?
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index fb457dd..b2b48ed 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -77,6 +77,14 @@ protected:
   /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
+
+  /// BreakSSEDep - True if codegen should unfold load or insert xorps / pxor
+  /// to break register dependency for a partial register update SSE
+  /// instruction. This is needed for instructions such as CVTSS2SD which
+  /// only update the lower part of the register, and the result of the updated
+  /// part does not depend on the contents of the destination before the
+  /// instruction, and the non-updated portion of the register is not used.
+  bool BreakSSEDep;
 
   /// DarwinVers - Nonzero if this is a darwin platform: the numeric
   /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
@@ -142,6 +150,7 @@ public:
   bool hasFMA3() const { return HasFMA3; }
   bool hasFMA4() const { return HasFMA4; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool shouldBreakSSEDep() const { return BreakSSEDep; }
 
   bool isTargetDarwin() const { return TargetType == isDarwin; }
   bool isTargetELF() const { return TargetType == isELF; }
diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll
new file mode 100644
index 0000000..00c943f
--- /dev/null
+++ b/test/CodeGen/X86/break-sse-dep.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+break-sse-dep | FileCheck %s --check-prefix=YES
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-break-sse-dep | FileCheck %s --check-prefix=NO
+
+define double @t1(float* nocapture %x) nounwind readonly ssp {
+entry:
+; YES: t1:
+; YES: movss (%rdi), %xmm0
+; YES: cvtss2sd %xmm0, %xmm0
+
+; NO: t1:
+; NO: cvtss2sd (%rdi), %xmm0
+  %0 = load float* %x, align 4
+  %1 = fpext float %0 to double
+  ret double %1
+}
+
+define float @t2(double* nocapture %x) nounwind readonly ssp {
+entry:
+; YES: t2:
+; YES: movsd (%rdi), %xmm0
+; YES: cvtsd2ss %xmm0, %xmm0
+
+; NO: t2:
+; NO: cvtsd2ss (%rdi), %xmm0
+  %0 = load double* %x, align 8
+  %1 = fptrunc double %0 to float
+  ret float %1
+}
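
As a reading aid, the guard added to both foldMemoryOperandImpl overloads
boils down to the standalone C++ sketch below. This is a simplified
restatement, not the actual LLVM API; the opcode names mirror the switch in
the X86InstrInfo.cpp hunks above, and mayFoldLoad is a name invented here for
illustration.

#include <cstdint>

// Unary SSE instructions from the patch's switch: each updates only part of
// its destination register (names mirror X86::* in the diff).
enum X86Opcode : std::uint16_t {
  CVTSD2SSrr, Int_CVTSD2SSrr, CVTSS2SDrr, Int_CVTSS2SDrr,
  RCPSSr, RCPSSr_Int, ROUNDSDr_Int, ROUNDSSr_Int,
  RSQRTSSr, RSQRTSSr_Int, SQRTSSr, SQRTSSr_Int,
  OtherOpcode
};

// Returns true if folding a load into the instruction is acceptable. When
// the subtarget asks to break SSE partial-register dependencies, folding is
// refused for the listed opcodes, so the load stays a separate (zeroing)
// move and the instruction does not inherit the destination's stale bits.
bool mayFoldLoad(X86Opcode Op, bool ShouldBreakSSEDep) {
  if (!ShouldBreakSSEDep)
    return true;
  switch (Op) {
  case CVTSD2SSrr:   case Int_CVTSD2SSrr:
  case CVTSS2SDrr:   case Int_CVTSS2SDrr:
  case RCPSSr:       case RCPSSr_Int:
  case ROUNDSDr_Int: case ROUNDSSr_Int:
  case RSQRTSSr:     case RSQRTSSr_Int:
  case SQRTSSr:      case SQRTSSr_Int:
    return false;  // keep the load unfolded for these
  default:
    return true;
  }
}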