- Two-address pass should not assume unfolding is always successful.

- X86 unfolding should check if the instructions being unfolded has memoperands. If there is no memoperands, then it must assume conservative alignment. If this would introduce an expensive sse unaligned load / store, then unfoldMemoryOperand etc. should not unfold the instruction. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@107509 91177308-0d34-0410-b5e6-96231b3b80d8
author: Evan Cheng <evan.cheng@apple.com> 2010-07-02 20:36:18 +0000
committer: Evan Cheng <evan.cheng@apple.com> 2010-07-02 20:36:18 +0000
commit: 4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f (patch)
tree: 8c339eb3454cb4b52ad3eb87c73a4c76d0df1bb4
parent: af85b0cc0711711c384e1216c757c144d1d7fcab (diff)
download: external_llvm-4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f.zip
external_llvm-4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f.tar.gz
external_llvm-4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f.tar.bz2
3 files changed, 126 insertions, 12 deletions
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 0c97dad..62fa0fd 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -926,14 +926,12 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
           UnfoldTID.OpInfo[LoadRegIndex].getRegClass(TRI);
         unsigned Reg = MRI->createVirtualRegister(RC);
         SmallVector<MachineInstr *, 2> NewMIs;
-        bool Success =
-          TII->unfoldMemoryOperand(MF, mi, Reg,
-                                   /*UnfoldLoad=*/true, /*UnfoldStore=*/false,
-                                   NewMIs);
-        (void)Success;
-        assert(Success &&
-               "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
-               "succeeded!");
+        if (!TII->unfoldMemoryOperand(MF, mi, Reg,
+                                      /*UnfoldLoad=*/true,/*UnfoldStore=*/false,
+                                      NewMIs)) {
+          DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+          return false;
+        }
         assert(NewMIs.size() == 2 &&
                "Unfolded a load into multiple instructions!");
         // The load was previously folded, so this is the only use.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index caa6233..c1d66cb 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2159,7 +2159,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
                                   MachineInstr::mmo_iterator MMOBegin,
                                   MachineInstr::mmo_iterator MMOEnd,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+  bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
@@ -2189,7 +2189,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                  MachineInstr::mmo_iterator MMOBegin,
                                  MachineInstr::mmo_iterator MMOEnd,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+  bool isAligned = *MMOBegin && (*MMOBegin)->getAlignment() >= 16;
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
@@ -2693,6 +2693,13 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
   const TargetInstrDesc &TID = get(Opc);
   const TargetOperandInfo &TOI = TID.OpInfo[Index];
   const TargetRegisterClass *RC = TOI.getRegClass(&RI);
+  if (!MI->hasOneMemOperand() &&
+      RC == &X86::VR128RegClass &&
+      !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+    // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+    // conservatively assume the address is unaligned. That's bad for
+    // performance.
+    return false;
   SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
   SmallVector<MachineOperand,2> BeforeOps;
   SmallVector<MachineOperand,2> AfterOps;
@@ -2834,7 +2841,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                             cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned load.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
     Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
                               VT, MVT::Other, &AddrOps[0], AddrOps.size());
     NewNodes.push_back(Load);
@@ -2871,7 +2883,12 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                              cast<MachineSDNode>(N)->memoperands_end());
-    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned store.
+      return false;
+    bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
     SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
                                                          isAligned, TM),
                                        dl, MVT::Other,
diff --git a/test/CodeGen/X86/2010-07-02-UnfoldBug.ll b/test/CodeGen/X86/2010-07-02-UnfoldBug.ll
new file mode 100644
index 0000000..79219dc
--- /dev/null
+++ b/test/CodeGen/X86/2010-07-02-UnfoldBug.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin
+; rdar://8154265
+
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define void @_ZN2CA3OGL20fill_surface_mesh_3dERNS0_7ContextEPKNS_6Render13MeshTransformEPKNS0_5LayerEPNS0_7SurfaceEfNS0_13TextureFilterESC_f() nounwind optsize ssp {
+entry:
+  br i1 undef, label %bb2.thread, label %bb2
+
+bb2.thread:                                       ; preds = %entry
+  br i1 undef, label %bb41, label %bb10.preheader
+
+bb2:                                              ; preds = %entry
+  unreachable
+
+bb10.preheader:                                   ; preds = %bb2.thread
+  br i1 undef, label %bb9, label %bb12
+
+bb9:                                              ; preds = %bb9, %bb10.preheader
+  br i1 undef, label %bb9, label %bb12
+
+bb12:                                             ; preds = %bb9, %bb10.preheader
+  br i1 undef, label %bb4.i.i, label %bb3.i.i
+
+bb3.i.i:                                          ; preds = %bb12
+  unreachable
+
+bb4.i.i:                                          ; preds = %bb12
+  br i1 undef, label %bb8.i.i, label %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
+
+bb8.i.i:                                          ; preds = %bb4.i.i
+  br i1 undef, label %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit, label %bb9.i.i
+
+bb9.i.i:                                          ; preds = %bb8.i.i
+  br i1 undef, label %bb11.i.i, label %bb10.i.i
+
+bb10.i.i:                                         ; preds = %bb9.i.i
+  unreachable
+
+bb11.i.i:                                         ; preds = %bb9.i.i
+  unreachable
+
+_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit: ; preds = %bb8.i.i, %bb4.i.i
+  br i1 undef, label %bb19, label %bb14
+
+bb14:                                             ; preds = %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
+  unreachable
+
+bb19:                                             ; preds = %_ZN2CA3OGL12_GLOBAL__N_16LightsC1ERNS0_7ContextEPKNS0_5LayerEPKNS_6Render13MeshTransformERKNS_4Vec3IfEESF_.exit
+  br i1 undef, label %bb.i50, label %bb6.i
+
+bb.i50:                                           ; preds = %bb19
+  unreachable
+
+bb6.i:                                            ; preds = %bb19
+  br i1 undef, label %bb28, label %bb.nph106
+
+bb22:                                             ; preds = %bb24.preheader
+  br i1 undef, label %bb2.i.i, label %bb.i.i49
+
+bb.i.i49:                                         ; preds = %bb22
+  %0 = load float* undef, align 4                 ; <float> [#uses=1]
+  %1 = insertelement <4 x float> undef, float %0, i32 0 ; <<4 x float>> [#uses=1]
+  %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> %1) nounwind readnone ; <<4 x float>> [#uses=1]
+  %3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %2, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) nounwind readnone ; <<4 x float>> [#uses=1]
+  %4 = extractelement <4 x float> %3, i32 0       ; <float> [#uses=1]
+  store float %4, float* undef, align 4
+  %5 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> <float 1.000000e+00, float undef, float undef, float undef>, <4 x float> undef) nounwind readnone ; <<4 x float>> [#uses=1]
+  %6 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %5, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>) nounwind readnone ; <<4 x float>> [#uses=1]
+  %7 = extractelement <4 x float> %6, i32 0       ; <float> [#uses=1]
+  store float %7, float* undef, align 4
+  unreachable
+
+bb2.i.i:                                          ; preds = %bb22
+  unreachable
+
+bb26.loopexit:                                    ; preds = %bb24.preheader
+  br i1 undef, label %bb28, label %bb24.preheader
+
+bb.nph106:                                        ; preds = %bb6.i
+  br label %bb24.preheader
+
+bb24.preheader:                                   ; preds = %bb.nph106, %bb26.loopexit
+  br i1 undef, label %bb22, label %bb26.loopexit
+
+bb28:                                             ; preds = %bb26.loopexit, %bb6.i
+  unreachable
+
+bb41:                                             ; preds = %bb2.thread
+  br i1 undef, label %return, label %bb46
+
+bb46:                                             ; preds = %bb41
+  ret void
+
+return:                                           ; preds = %bb41
+  ret void
+}
author	Evan Cheng <evan.cheng@apple.com>	2010-07-02 20:36:18 +0000
committer	Evan Cheng <evan.cheng@apple.com>	2010-07-02 20:36:18 +0000
commit	4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f (patch)
tree	8c339eb3454cb4b52ad3eb87c73a4c76d0df1bb4
parent	af85b0cc0711711c384e1216c757c144d1d7fcab (diff)
download	external_llvm-4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f.zip external_llvm-4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f.tar.gz external_llvm-4cb1de56a6ed9c5d12360773ea32cd8e8b635e7f.tar.bz2