diff options
author | Evan Cheng <evan.cheng@apple.com> | 2009-12-18 07:40:29 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2009-12-18 07:40:29 +0000 |
commit | 400073d5467b79534d8c63b0d996a55e4252ff4b (patch) | |
tree | f7204e84da8877e7b062f05bcb1878a05108b44e /lib/Target/X86/X86InstrSSE.td | |
parent | 3a5d409f3c2eccf1d1f0a4616023760829a4db67 (diff) | |
download | external_llvm-400073d5467b79534d8c63b0d996a55e4252ff4b.zip external_llvm-400073d5467b79534d8c63b0d996a55e4252ff4b.tar.gz external_llvm-400073d5467b79534d8c63b0d996a55e4252ff4b.tar.bz2 |
On recent Intel microarchitectures, folding loads into some unary SSE instructions can
be suboptimal. To be precise, we should avoid folding loads if the instruction
only updates part of the destination register and the non-updated part is not
needed, e.g. cvtss2sd, sqrtss. Such an instruction carries a false dependency on the previous value of the destination register. Unfolding the load from these instructions breaks
the partial register dependency (a movss load from memory writes the whole register) and can improve performance, e.g.
movss (%rdi), %xmm0
cvtss2sd %xmm0, %xmm0
instead of
cvtss2sd (%rdi), %xmm0
An alternative method to break the dependency is to clear the register first, e.g.
xorps %xmm0, %xmm0
cvtss2sd (%rdi), %xmm0
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@91672 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86/X86InstrSSE.td')
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 20 |
1 files changed, 13 insertions, 7 deletions
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index ae1a68a..694b91e 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -824,9 +824,10 @@ multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr, } // Scalar operation, mem. - def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), + def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode (load addr:$src)))]>; + [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS, + Requires<[HasSSE1, NoSSEBreakDep]>; // Vector operation, reg. def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), @@ -1116,9 +1117,10 @@ def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src), def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (fround FR64:$src))]>; -def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), +def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", - [(set FR32:$dst, (fround (loadf64 addr:$src)))]>; + [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD, + Requires<[HasSSE2, NoSSEBreakDep]>; def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src), "cvtsi2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (sint_to_fp GR32:$src))]>; @@ -1155,7 +1157,10 @@ def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, - Requires<[HasSSE2]>; + Requires<[HasSSE2, NoSSEBreakDep]>; + +def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[SSEBreakDep]>; // Match intrinsics which expect XMM operand(s). 
def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), @@ -3220,13 +3225,14 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, OpSize; // Vector intrinsic operation, mem - def PSm_Int : SS4AIi8<opcps, MRMSrcMem, + def PSm_Int : Ii8<opcps, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>, - OpSize; + TA, OpSize, + Requires<[HasSSE41, NoSSEBreakDep]>; // Vector intrinsic operation, reg def PDr_Int : SS4AIi8<opcpd, MRMSrcReg, |