 lib/Target/X86/X86ISelLowering.cpp | 32 ++++++++++++++++++++++++++++++++
 test/CodeGen/X86/mmx-copy-gprs.ll  | 14 ++++++++++++++
 2 files changed, 46 insertions(+), 0 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b5e91ce..96a58c1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -704,6 +704,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::STORE);
 
   computeRegisterProperties();
@@ -5872,6 +5873,35 @@ static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   return SDOperand();
 }
 
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDOperand PerformSTORECombine(StoreSDNode *St, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
+  // the FP state in cases where an emms may be missing.
+  if (MVT::isVector(St->getValue().getValueType()) &&
+      MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
+      // Must be a store of a load.
+      isa<LoadSDNode>(St->getChain()) &&
+      St->getChain().Val == St->getValue().Val &&
+      St->getValue().hasOneUse() && St->getChain().hasOneUse() &&
+      !St->isVolatile() && !cast<LoadSDNode>(St->getChain())->isVolatile()) {
+    LoadSDNode *Ld = cast<LoadSDNode>(St->getChain());
+
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    if (Subtarget->is64Bit()) {
+      SDOperand NewLd = DAG.getLoad(MVT::i64, Ld->getChain(), Ld->getBasePtr(),
+                                    Ld->getSrcValue(), Ld->getSrcValueOffset(),
+                                    Ld->isVolatile(), Ld->getAlignment());
+      return DAG.getStore(NewLd.getValue(1), NewLd, St->getBasePtr(),
+                          St->getSrcValue(), St->getSrcValueOffset(),
+                          St->isVolatile(), St->getAlignment());
+    }
+
+    // TODO: 2 32-bit copies.
+  }
+  return SDOperand();
+}
+
 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
 /// X86ISD::FXOR nodes.
 static SDOperand PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -5908,6 +5938,8 @@ SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, Subtarget);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
+  case ISD::STORE:
+    return PerformSTORECombine(cast<StoreSDNode>(N), DAG, Subtarget);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
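
The "TODO: 2 32-bit copies" above marks the lowering that is still missing for
32-bit targets, where the 64-bit copy would be split into two i32 load/store
pairs whose chains are merged with a TokenFactor. The following is a
hypothetical sketch of that path, not code from this commit; it assumes the
same SDOperand-era SelectionDAG API used in the patch and would replace the
TODO inside the !is64Bit() case:

    // Hypothetical 32-bit lowering (not part of this commit): copy the
    // 64-bit value as two i32 load/store pairs so no MMX register is used.
    SDOperand LoAddr = Ld->getBasePtr();
    SDOperand HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr,
                                   DAG.getConstant(4, MVT::i32));

    // Load the low and high halves, threading the chain through both loads.
    SDOperand LoLd = DAG.getLoad(MVT::i32, Ld->getChain(), LoAddr,
                                 Ld->getSrcValue(), Ld->getSrcValueOffset(),
                                 Ld->isVolatile(), Ld->getAlignment());
    SDOperand HiLd = DAG.getLoad(MVT::i32, LoLd.getValue(1), HiAddr,
                                 Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                                 Ld->isVolatile(), Ld->getAlignment());

    // Store both halves to the destination and merge the store chains.
    SDOperand StLoAddr = St->getBasePtr();
    SDOperand StHiAddr = DAG.getNode(ISD::ADD, MVT::i32, StLoAddr,
                                     DAG.getConstant(4, MVT::i32));
    SDOperand LoSt = DAG.getStore(HiLd.getValue(1), LoLd, StLoAddr,
                                  St->getSrcValue(), St->getSrcValueOffset(),
                                  St->isVolatile(), St->getAlignment());
    SDOperand HiSt = DAG.getStore(LoSt, HiLd, StHiAddr,
                                  St->getSrcValue(), St->getSrcValueOffset()+4,
                                  St->isVolatile(), St->getAlignment());
    return DAG.getNode(ISD::TokenFactor, MVT::Other, LoSt, HiSt);

Chaining HiSt off LoSt serializes the two stores, which is the simple,
conservative choice; both stores touch distinct addresses, so they could
alternatively share a chain and be joined only by the TokenFactor.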
diff --git a/test/CodeGen/X86/mmx-copy-gprs.ll b/test/CodeGen/X86/mmx-copy-gprs.ll
new file mode 100644
index 0000000..8cf36e0
--- /dev/null
+++ b/test/CodeGen/X86/mmx-copy-gprs.ll
@@ -0,0 +1,14 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {movq.*(%rsi), %rax}
+
+; This test should use GPRs to copy the mmx value, not MMX regs. Using mmx regs
+; increases the number of places that need to use emms.
+
+; rdar://5741668
+target triple = "x86_64-apple-darwin8"
+
+define i32 @foo(<1 x i64>* %x, <1 x i64>* %y) nounwind {
+entry:
+ %tmp1 = load <1 x i64>* %y, align 8 ; <<1 x i64>> [#uses=1]
+ store <1 x i64> %tmp1, <1 x i64>* %x, align 8
+ ret i32 undef
+}
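
A companion regression test for the eventual 32-bit path might look like the
sketch below. This is an assumption, not part of this commit: the i686 triple,
the function name, and the "not grep movq" check (asserting that the copy
avoids the MMX movq instruction) are all hypothetical.

    ; Hypothetical 32-bit companion test (not part of this commit).
    ; RUN: llvm-as < %s | llc -march=x86 | not grep movq
    target triple = "i686-apple-darwin8"

    define void @bar(<1 x i64>* %x, <1 x i64>* %y) nounwind {
    entry:
            %tmp1 = load <1 x i64>* %y, align 8
            store <1 x i64> %tmp1, <1 x i64>* %x, align 8
            ret void
    }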