31 files changed, 687 insertions, 750 deletions
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 2b8d678..8ea14a5 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -323,17 +323,20 @@ public:
   SDOperand getNode(unsigned Opcode, SDVTList VTs,
                     const SDOperand *Ops, unsigned NumOps);
 
-  SDOperand getMemcpy(SDOperand Chain, SDOperand Dest, SDOperand Src,
-                      SDOperand Size, SDOperand Align,
-                      SDOperand AlwaysInline);
-
-  SDOperand getMemmove(SDOperand Chain, SDOperand Dest, SDOperand Src,
-                      SDOperand Size, SDOperand Align,
-                      SDOperand AlwaysInline);
-
-  SDOperand getMemset(SDOperand Chain, SDOperand Dest, SDOperand Src,
-                      SDOperand Size, SDOperand Align,
-                      SDOperand AlwaysInline);
+  SDOperand getMemcpy(SDOperand Chain, SDOperand Dst, SDOperand Src,
+                      SDOperand Size, unsigned Align,
+                      bool AlwaysInline,
+                      Value *DstSV, uint64_t DstOff,
+                      Value *SrcSV, uint64_t SrcOff);
+
+  SDOperand getMemmove(SDOperand Chain, SDOperand Dst, SDOperand Src,
+                      SDOperand Size, unsigned Align,
+                      Value *DstSV, uint64_t DstOff,
+                      Value *SrcSV, uint64_t SrcOff);
+
+  SDOperand getMemset(SDOperand Chain, SDOperand Dst, SDOperand Src,
+                      SDOperand Size, unsigned Align,
+                      Value *DstSV, uint64_t DstOff);
 
   /// getSetCC - Helper function to make it easier to build SetCC's if you just
   /// have an ISD::CondCode instead of an SDOperand.
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 6b2b857..deded1a 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -497,14 +497,6 @@ namespace ISD {
     // it returns an output chain.
     STACKRESTORE,
     
-    // MEMSET/MEMCPY/MEMMOVE - The first operand is the chain. The following
-    // correspond to the operands of the LLVM intrinsic functions and the last
-    // one is AlwaysInline.  The only result is a token chain.  The alignment
-    // argument is guaranteed to be a Constant node.
-    MEMSET,
-    MEMMOVE,
-    MEMCPY,
-
     // CALLSEQ_START/CALLSEQ_END - These operators mark the beginning and end of
     // a call sequence, and carry arbitrary information that target might want
     // to know.  The first operand is a chain, the rest are specified by the
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 719f719..16f9ed6 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -948,18 +948,61 @@ public:
               SDOperand Callee, ArgListTy &Args, SelectionDAG &DAG);
 
 
-  virtual SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
-  virtual SDOperand LowerMEMCPYCall(SDOperand Chain, SDOperand Dest,
-                                    SDOperand Source, SDOperand Count,
-                                    SelectionDAG &DAG);
-  virtual SDOperand LowerMEMCPYInline(SDOperand Chain, SDOperand Dest,
-                                      SDOperand Source, unsigned Size,
-                                      unsigned Align, SelectionDAG &DAG) {
-    assert(0 && "Not Implemented");
-    return SDOperand();   // this is here to silence compiler errors
+  /// EmitTargetCodeForMemcpy - Emit target-specific code that performs a
+  /// memcpy. This can be used by targets to provide code sequences for cases
+  /// that don't fit the target's parameters for simple loads/stores and can be
+  /// more efficient than using a library call. This function can return a null
+  /// SDOperand if the target declines to use inline code and a different
+  /// lowering strategy should be used.
+  /// 
+  /// If AlwaysInline is true, the size is constant and the target should not
+  /// emit any calls and is strongly encouraged to attempt to emit inline code
+  /// even if it is beyond the usual threshold because this intrinsic is being
+  /// expanded in a place where calls are not feasible (e.g. within the prologue
+  /// for another call). If the target chooses to decline an AlwaysInline
+  /// request here, legalize will resort to using simple loads and stores.
+  virtual SDOperand
+  EmitTargetCodeForMemcpy(SelectionDAG &DAG,
+                          SDOperand Chain,
+                          SDOperand Op1, SDOperand Op2,
+                          SDOperand Op3, unsigned Align,
+                          bool AlwaysInline,
+                          Value *DstSV, uint64_t DstOff,
+                          Value *SrcSV, uint64_t SrcOff) {
+    return SDOperand();
+  }
+
+  /// EmitTargetCodeForMemmove - Emit target-specific code that performs a
+  /// memmove. This can be used by targets to provide code sequences for cases
+  /// that don't fit the target's parameters for simple loads/stores and can be
+  /// more efficient than using a library call. This function can return a null
+  /// SDOperand if the target declines to use custom code and a different
+  /// lowering strategy should be used.
+  virtual SDOperand
+  EmitTargetCodeForMemmove(SelectionDAG &DAG,
+                           SDOperand Chain,
+                           SDOperand Op1, SDOperand Op2,
+                           SDOperand Op3, unsigned Align,
+                           Value *DstSV, uint64_t DstOff,
+                           Value *SrcSV, uint64_t SrcOff) {
+    return SDOperand();
+  }
+
+  /// EmitTargetCodeForMemset - Emit target-specific code that performs a
+  /// memset. This can be used by targets to provide code sequences for cases
+  /// that don't fit the target's parameters for simple stores and can be more
+  /// efficient than using a library call. This function can return a null
+  /// SDOperand if the target declines to use custom code and a different
+  /// lowering strategy should be used.
+  virtual SDOperand
+  EmitTargetCodeForMemset(SelectionDAG &DAG,
+                          SDOperand Chain,
+                          SDOperand Op1, SDOperand Op2,
+                          SDOperand Op3, unsigned Align,
+                          Value *DstSV, uint64_t DstOff) {
+    return SDOperand();
   }
 
-
   /// LowerOperation - This callback is invoked for operations that are 
   /// unsupported by the target, which are registered to use 'custom' lowering,
   /// and whose defined values are all legal.
diff --git a/include/llvm/Target/TargetSubtarget.h b/include/llvm/Target/TargetSubtarget.h
index 1096b16..fde8f44 100644
--- a/include/llvm/Target/TargetSubtarget.h
+++ b/include/llvm/Target/TargetSubtarget.h
@@ -28,9 +28,6 @@ class TargetSubtarget {
 protected: // Can only create subclasses...
   TargetSubtarget();
 public:
-  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
-  /// that still makes it profitable to inline the call.
-  virtual unsigned getMaxInlineSizeThreshold() const {return 0; }
   virtual ~TargetSubtarget();
 };
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5cb13e3..2df363e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtarget.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
@@ -2842,123 +2843,6 @@ SDOperand SelectionDAGLegalize::LegalizeOp(SDOperand Op) {
       break;
     }
     break;
-  case ISD::MEMSET:
-  case ISD::MEMCPY:
-  case ISD::MEMMOVE: {
-    Tmp1 = LegalizeOp(Node->getOperand(0));      // Chain
-    Tmp2 = LegalizeOp(Node->getOperand(1));      // Pointer
-
-    if (Node->getOpcode() == ISD::MEMSET) {      // memset = ubyte
-      switch (getTypeAction(Node->getOperand(2).getValueType())) {
-      case Expand: assert(0 && "Cannot expand a byte!");
-      case Legal:
-        Tmp3 = LegalizeOp(Node->getOperand(2));
-        break;
-      case Promote:
-        Tmp3 = PromoteOp(Node->getOperand(2));
-        break;
-      }
-    } else {
-      Tmp3 = LegalizeOp(Node->getOperand(2));    // memcpy/move = pointer,
-    }
-
-    SDOperand Tmp4;
-    switch (getTypeAction(Node->getOperand(3).getValueType())) {
-    case Expand: {
-      // Length is too big, just take the lo-part of the length.
-      SDOperand HiPart;
-      ExpandOp(Node->getOperand(3), Tmp4, HiPart);
-      break;
-    }
-    case Legal:
-      Tmp4 = LegalizeOp(Node->getOperand(3));
-      break;
-    case Promote:
-      Tmp4 = PromoteOp(Node->getOperand(3));
-      break;
-    }
-
-    SDOperand Tmp5;
-    switch (getTypeAction(Node->getOperand(4).getValueType())) {  // uint
-    case Expand: assert(0 && "Cannot expand this yet!");
-    case Legal:
-      Tmp5 = LegalizeOp(Node->getOperand(4));
-      break;
-    case Promote:
-      Tmp5 = PromoteOp(Node->getOperand(4));
-      break;
-    }
-
-    SDOperand Tmp6;
-    switch (getTypeAction(Node->getOperand(5).getValueType())) {  // bool
-    case Expand: assert(0 && "Cannot expand this yet!");
-    case Legal:
-      Tmp6 = LegalizeOp(Node->getOperand(5));
-      break;
-    case Promote:
-      Tmp6 = PromoteOp(Node->getOperand(5));
-      break;
-    }
-
-    switch (TLI.getOperationAction(Node->getOpcode(), MVT::Other)) {
-    default: assert(0 && "This action not implemented for this operation!");
-    case TargetLowering::Custom:
-      isCustom = true;
-      // FALLTHROUGH
-    case TargetLowering::Legal: {
-      SDOperand Ops[] = { Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6 };
-      Result = DAG.UpdateNodeOperands(Result, Ops, 6);
-      if (isCustom) {
-        Tmp1 = TLI.LowerOperation(Result, DAG);
-        if (Tmp1.Val) Result = Tmp1;
-      }
-      break;
-    }
-    case TargetLowering::Expand: {
-      // Otherwise, the target does not support this operation.  Lower the
-      // operation to an explicit libcall as appropriate.
-      MVT::ValueType IntPtr = TLI.getPointerTy();
-      const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType();
-      TargetLowering::ArgListTy Args;
-      TargetLowering::ArgListEntry Entry;
-
-      const char *FnName = 0;
-      if (Node->getOpcode() == ISD::MEMSET) {
-        Entry.Node = Tmp2; Entry.Ty = IntPtrTy;
-        Args.push_back(Entry);
-        // Extend the (previously legalized) ubyte argument to be an int value
-        // for the call.
-        if (Tmp3.getValueType() > MVT::i32)
-          Tmp3 = DAG.getNode(ISD::TRUNCATE, MVT::i32, Tmp3);
-        else
-          Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Tmp3);
-        Entry.Node = Tmp3; Entry.Ty = Type::Int32Ty; Entry.isSExt = true;
-        Args.push_back(Entry);
-        Entry.Node = Tmp4; Entry.Ty = IntPtrTy; Entry.isSExt = false;
-        Args.push_back(Entry);
-
-        FnName = "memset";
-      } else if (Node->getOpcode() == ISD::MEMCPY ||
-                 Node->getOpcode() == ISD::MEMMOVE) {
-        Entry.Ty = IntPtrTy;
-        Entry.Node = Tmp2; Args.push_back(Entry);
-        Entry.Node = Tmp3; Args.push_back(Entry);
-        Entry.Node = Tmp4; Args.push_back(Entry);
-        FnName = Node->getOpcode() == ISD::MEMMOVE ? "memmove" : "memcpy";
-      } else {
-        assert(0 && "Unknown op!");
-      }
-
-      std::pair<SDOperand,SDOperand> CallResult =
-        TLI.LowerCallTo(Tmp1, Type::VoidTy,
-                        false, false, false, CallingConv::C, false,
-                        DAG.getExternalSymbol(FnName, IntPtr), Args, DAG);
-      Result = CallResult.second;
-      break;
-    }
-    }
-    break;
-  }
 
   case ISD::SHL_PARTS:
   case ISD::SRA_PARTS:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 6511cff..380c422 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -439,51 +439,6 @@ SDOperand DAGTypeLegalizer::CreateStackStoreLoad(SDOperand Op,
   return DAG.getLoad(DestVT, Store, FIPtr, NULL, 0);
 }
 
-/// HandleMemIntrinsic - This handles memcpy/memset/memmove with invalid
-/// operands.  This promotes or expands the operands as required.
-SDOperand DAGTypeLegalizer::HandleMemIntrinsic(SDNode *N) {
-  // The chain and pointer [operands #0 and #1] are always valid types.
-  SDOperand Chain = N->getOperand(0);
-  SDOperand Ptr   = N->getOperand(1);
-  SDOperand Op2   = N->getOperand(2);
-  
-  // Op #2 is either a value (memset) or a pointer.  Promote it if required.
-  switch (getTypeAction(Op2.getValueType())) {
-  default: assert(0 && "Unknown action for pointer/value operand");
-  case Legal: break;
-  case Promote: Op2 = GetPromotedOp(Op2); break;
-  }
-
-  // The length could have any action required.
-  SDOperand Length = N->getOperand(3);
-  switch (getTypeAction(Length.getValueType())) {
-  default: assert(0 && "Unknown action for memop operand");
-  case Legal: break;
-  case Promote: Length = GetPromotedZExtOp(Length); break;
-  case Expand:
-    SDOperand Dummy;  // discard the high part.
-    GetExpandedOp(Length, Length, Dummy);
-    break;
-  }
-
-  SDOperand Align = N->getOperand(4);
-  switch (getTypeAction(Align.getValueType())) {
-  default: assert(0 && "Unknown action for memop operand");
-  case Legal: break;
-  case Promote: Align = GetPromotedZExtOp(Align); break;
-  }
-
-  SDOperand AlwaysInline = N->getOperand(5);
-  switch (getTypeAction(AlwaysInline.getValueType())) {
-  default: assert(0 && "Unknown action for memop operand");
-  case Legal: break;
-  case Promote: AlwaysInline = GetPromotedZExtOp(AlwaysInline); break;
-  }
-
-  SDOperand Ops[] = { Chain, Ptr, Op2, Length, Align, AlwaysInline };
-  return DAG.UpdateNodeOperands(SDOperand(N, 0), Ops, 6);
-}
-
 /// JoinIntegers - Build an integer with low bits Lo and high bits Hi.
 SDOperand DAGTypeLegalizer::JoinIntegers(SDOperand Lo, SDOperand Hi) {
   MVT::ValueType LVT = Lo.getValueType();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 7d245ab..5b98793 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -165,7 +165,6 @@ private:
   // Common routines.
   SDOperand BitConvertToInteger(SDOperand Op);
   SDOperand CreateStackStoreLoad(SDOperand Op, MVT::ValueType DestVT);
-  SDOperand HandleMemIntrinsic(SDNode *N);
   SDOperand JoinIntegers(SDOperand Lo, SDOperand Hi);
   void SplitInteger(SDOperand Op, SDOperand &Lo, SDOperand &Hi);
   void SplitInteger(SDOperand Op, MVT::ValueType LoVT, MVT::ValueType HiVT,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesExpand.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesExpand.cpp
index b872a44..fcde8f3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesExpand.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesExpand.cpp
@@ -946,9 +946,6 @@ bool DAGTypeLegalizer::ExpandOperand(SDNode *N, unsigned OpNo) {
     case ISD::STORE:
       Res = ExpandOperand_STORE(cast<StoreSDNode>(N), OpNo);
       break;
-    case ISD::MEMSET:
-    case ISD::MEMCPY:
-    case ISD::MEMMOVE:     Res = HandleMemIntrinsic(N); break;
 
     case ISD::BUILD_VECTOR: Res = ExpandOperand_BUILD_VECTOR(N); break;
     }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesPromote.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesPromote.cpp
index b8118eb..93c8c60 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesPromote.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesPromote.cpp
@@ -447,9 +447,6 @@ bool DAGTypeLegalizer::PromoteOperand(SDNode *N, unsigned OpNo) {
 
   case ISD::STORE:       Res = PromoteOperand_STORE(cast<StoreSDNode>(N),
                                                     OpNo); break;
-  case ISD::MEMSET:
-  case ISD::MEMCPY:
-  case ISD::MEMMOVE:     Res = HandleMemIntrinsic(N); break;
 
   case ISD::BUILD_VECTOR: Res = PromoteOperand_BUILD_VECTOR(N); break;
   case ISD::INSERT_VECTOR_ELT:
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f096c70..327a8fe 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Intrinsics.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Assembly/Writer.h"
+#include "llvm/CallingConv.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -2385,28 +2386,357 @@ SDOperand SelectionDAG::getNode(unsigned Opcode, MVT::ValueType VT,
   return getNode(Opcode, VT, Ops, 5);
 }
 
-SDOperand SelectionDAG::getMemcpy(SDOperand Chain, SDOperand Dest,
-                                  SDOperand Src, SDOperand Size,
-                                  SDOperand Align,
-                                  SDOperand AlwaysInline) {
-  SDOperand Ops[] = { Chain, Dest, Src, Size, Align, AlwaysInline };
-  return getNode(ISD::MEMCPY, MVT::Other, Ops, 6);
+/// getMemsetValue - Return the memset value operand with its low byte
+/// replicated into every byte of the integer type VT.
+static SDOperand getMemsetValue(SDOperand Value, MVT::ValueType VT,
+                                SelectionDAG &DAG) {
+  MVT::ValueType CurVT = VT;
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
+    uint64_t Val   = C->getValue() & 255;
+    unsigned Shift = 8;
+    while (CurVT != MVT::i8) {
+      Val = (Val << Shift) | Val;
+      Shift <<= 1;
+      CurVT = (MVT::ValueType)((unsigned)CurVT - 1);
+    }
+    return DAG.getConstant(Val, VT);
+  } else {
+    Value = DAG.getNode(ISD::ZERO_EXTEND, VT, Value);
+    unsigned Shift = 8;
+    while (CurVT != MVT::i8) {
+      Value =
+        DAG.getNode(ISD::OR, VT,
+                    DAG.getNode(ISD::SHL, VT, Value,
+                                DAG.getConstant(Shift, MVT::i8)), Value);
+      Shift <<= 1;
+      CurVT = (MVT::ValueType)((unsigned)CurVT - 1);
+    }
+
+    return Value;
+  }
 }
 
-SDOperand SelectionDAG::getMemmove(SDOperand Chain, SDOperand Dest,
-                                  SDOperand Src, SDOperand Size,
-                                  SDOperand Align,
-                                  SDOperand AlwaysInline) {
-  SDOperand Ops[] = { Chain, Dest, Src, Size, Align, AlwaysInline };
-  return getNode(ISD::MEMMOVE, MVT::Other, Ops, 6);
+/// getMemsetStringVal - Similar to getMemsetValue, except that this is only
+/// used when a memcpy from a constant string is turned into stores of
+/// constants: it packs the bytes of Str starting at Offset into a VT constant.
+static SDOperand getMemsetStringVal(MVT::ValueType VT,
+                                    SelectionDAG &DAG,
+                                    const TargetLowering &TLI,
+                                    std::string &Str, unsigned Offset) {
+  uint64_t Val = 0;
+  unsigned MSB = MVT::getSizeInBits(VT) / 8;
+  if (TLI.isLittleEndian())
+    Offset = Offset + MSB - 1;
+  for (unsigned i = 0; i != MSB; ++i) {
+    Val = (Val << 8) | (unsigned char)Str[Offset];
+    Offset += TLI.isLittleEndian() ? -1 : 1;
+  }
+  return DAG.getConstant(Val, VT);
+}
+
+/// getMemBasePlusOffset - Returns an ADD of Base and the constant Offset.
+static SDOperand getMemBasePlusOffset(SDOperand Base, unsigned Offset,
+                                      SelectionDAG &DAG) {
+  MVT::ValueType VT = Base.getValueType();
+  return DAG.getNode(ISD::ADD, VT, Base, DAG.getConstant(Offset, VT));
 }
 
-SDOperand SelectionDAG::getMemset(SDOperand Chain, SDOperand Dest,
+/// MeetsMaxMemopRequirement - Determines if the number of memory ops required
+/// to replace the memset / memcpy is below the threshold. It also returns the
+/// types of the sequence of memory ops to perform memset / memcpy.
+static bool MeetsMaxMemopRequirement(std::vector<MVT::ValueType> &MemOps,
+                                     unsigned Limit, uint64_t Size,
+                                     unsigned Align,
+                                     const TargetLowering &TLI) {
+  MVT::ValueType VT;
+
+  if (TLI.allowsUnalignedMemoryAccesses()) {
+    VT = MVT::i64;
+  } else {
+    switch (Align & 7) {
+    case 0:
+      VT = MVT::i64;
+      break;
+    case 4:
+      VT = MVT::i32;
+      break;
+    case 2:
+      VT = MVT::i16;
+      break;
+    default:
+      VT = MVT::i8;
+      break;
+    }
+  }
+
+  MVT::ValueType LVT = MVT::i64;
+  while (!TLI.isTypeLegal(LVT))
+    LVT = (MVT::ValueType)((unsigned)LVT - 1);
+  assert(MVT::isInteger(LVT));
+
+  if (VT > LVT)
+    VT = LVT;
+
+  unsigned NumMemOps = 0;
+  while (Size != 0) {
+    unsigned VTSize = MVT::getSizeInBits(VT) / 8;
+    while (VTSize > Size) {
+      VT = (MVT::ValueType)((unsigned)VT - 1);
+      VTSize >>= 1;
+    }
+    assert(MVT::isInteger(VT));
+
+    if (++NumMemOps > Limit)
+      return false;
+    MemOps.push_back(VT);
+    Size -= VTSize;
+  }
+
+  return true;
+}
+
+static SDOperand getMemcpyLoadsAndStores(SelectionDAG &DAG,
+                                         SDOperand Chain, SDOperand Dst,
+                                         SDOperand Src, uint64_t Size,
+                                         unsigned Align,
+                                         bool AlwaysInline,
+                                         Value *DstSV, uint64_t DstOff,
+                                         Value *SrcSV, uint64_t SrcOff) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Expand memcpy to a series of load/store ops if the size operand falls
+  // below a certain threshold.
+  std::vector<MVT::ValueType> MemOps;
+  uint64_t Limit = -1;
+  if (!AlwaysInline)
+    Limit = TLI.getMaxStoresPerMemcpy();
+  if (!MeetsMaxMemopRequirement(MemOps, Limit, Size, Align, TLI))
+    return SDOperand();
+
+  SmallVector<SDOperand, 8> OutChains;
+
+  unsigned NumMemOps = MemOps.size();
+  unsigned SrcDelta = 0;
+  GlobalAddressSDNode *G = NULL;
+  std::string Str;
+  bool CopyFromStr = false;
+
+  if (Src.getOpcode() == ISD::GlobalAddress)
+    G = cast<GlobalAddressSDNode>(Src);
+  else if (Src.getOpcode() == ISD::ADD &&
+           Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+           Src.getOperand(1).getOpcode() == ISD::Constant) {
+    G = cast<GlobalAddressSDNode>(Src.getOperand(0));
+    SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getValue();
+  }
+  if (G) {
+    GlobalVariable *GV = dyn_cast<GlobalVariable>(G->getGlobal());
+    if (GV && GV->isConstant()) {
+      Str = GV->getStringValue(false);
+      if (!Str.empty()) {
+        CopyFromStr = true;
+        SrcOff += SrcDelta;
+      }
+    }
+  }
+
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    MVT::ValueType VT = MemOps[i];
+    unsigned VTSize = MVT::getSizeInBits(VT) / 8;
+    SDOperand Value, Store;
+
+    if (CopyFromStr) {
+      Value = getMemsetStringVal(VT, DAG, TLI, Str, SrcOff);
+      Store =
+        DAG.getStore(Chain, Value,
+                     getMemBasePlusOffset(Dst, DstOff, DAG),
+                     DstSV, DstOff);
+    } else {
+      Value = DAG.getLoad(VT, Chain,
+                          getMemBasePlusOffset(Src, SrcOff, DAG),
+                          SrcSV, SrcOff, false, Align);
+      Store =
+        DAG.getStore(Chain, Value,
+                     getMemBasePlusOffset(Dst, DstOff, DAG),
+                     DstSV, DstOff, false, Align);
+    }
+    OutChains.push_back(Store);
+    SrcOff += VTSize;
+    DstOff += VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
+static SDOperand getMemsetStores(SelectionDAG &DAG,
+                                 SDOperand Chain, SDOperand Dst,
+                                 SDOperand Src, uint64_t Size,
+                                 unsigned Align,
+                                 Value *DstSV, uint64_t DstOff) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Expand memset to a series of store ops if the size operand
+  // falls below a certain threshold.
+  std::vector<MVT::ValueType> MemOps;
+  if (!MeetsMaxMemopRequirement(MemOps, TLI.getMaxStoresPerMemset(),
+                                Size, Align, TLI))
+    return SDOperand();
+
+  SmallVector<SDOperand, 8> OutChains;
+
+  unsigned NumMemOps = MemOps.size();
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    MVT::ValueType VT = MemOps[i];
+    unsigned VTSize = MVT::getSizeInBits(VT) / 8;
+    SDOperand Value = getMemsetValue(Src, VT, DAG);
+    SDOperand Store = DAG.getStore(Chain, Value,
+                                   getMemBasePlusOffset(Dst, DstOff, DAG),
+                                   DstSV, DstOff);
+    OutChains.push_back(Store);
+    DstOff += VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
+SDOperand SelectionDAG::getMemcpy(SDOperand Chain, SDOperand Dst,
+                                  SDOperand Src, SDOperand Size,
+                                  unsigned Align, bool AlwaysInline,
+                                  Value *DstSV, uint64_t DstOff,
+                                  Value *SrcSV, uint64_t SrcOff) {
+
+  // Check to see if we should lower the memcpy to loads and stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memcpy with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDOperand Result =
+      getMemcpyLoadsAndStores(*this, Chain, Dst, Src, ConstantSize->getValue(),
+                              Align, false, DstSV, DstOff, SrcSV, SrcOff);
+    if (Result.Val)
+      return Result;
+  }
+
+  // Then check to see if we should lower the memcpy with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDOperand Result =
+    TLI.EmitTargetCodeForMemcpy(*this, Chain, Dst, Src, Size, Align,
+                                AlwaysInline,
+                                DstSV, DstOff, SrcSV, SrcOff);
+  if (Result.Val)
+    return Result;
+
+  // If we really need inline code and the target declined to provide it,
+  // use a (potentially long) sequence of loads and stores.
+  if (AlwaysInline) {
+    assert(ConstantSize && "AlwaysInline requires a constant size!");
+    return getMemcpyLoadsAndStores(*this, Chain, Dst, Src,
+                                   ConstantSize->getValue(), Align, true,
+                                   DstSV, DstOff, SrcSV, SrcOff);
+  }
+
+  // Emit a library call.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = TLI.getTargetData()->getIntPtrType();
+  Entry.Node = Dst; Args.push_back(Entry);
+  Entry.Node = Src; Args.push_back(Entry);
+  Entry.Node = Size; Args.push_back(Entry);
+  std::pair<SDOperand,SDOperand> CallResult =
+    TLI.LowerCallTo(Chain, Type::VoidTy,
+                    false, false, false, CallingConv::C, false,
+                    getExternalSymbol("memcpy", TLI.getPointerTy()),
+                    Args, *this);
+  return CallResult.second;
+}
+
+SDOperand SelectionDAG::getMemmove(SDOperand Chain, SDOperand Dst,
+                                   SDOperand Src, SDOperand Size,
+                                   unsigned Align,
+                                   Value *DstSV, uint64_t DstOff,
+                                   Value *SrcSV, uint64_t SrcOff) {
+
+  // TODO: Optimize small memmove cases with simple loads and stores,
+  // ensuring that all loads precede all stores. This can cause severe
+  // register pressure, so targets should be careful with the size limit.
+
+  // Check to see if we should lower the memmove with target-specific
+  // code. If the target chooses to do this, use its result.
+  SDOperand Result =
+    TLI.EmitTargetCodeForMemmove(*this, Chain, Dst, Src, Size, Align,
+                                 DstSV, DstOff, SrcSV, SrcOff);
+  if (Result.Val)
+    return Result;
+
+  // Emit a library call.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = TLI.getTargetData()->getIntPtrType();
+  Entry.Node = Dst; Args.push_back(Entry);
+  Entry.Node = Src; Args.push_back(Entry);
+  Entry.Node = Size; Args.push_back(Entry);
+  std::pair<SDOperand,SDOperand> CallResult =
+    TLI.LowerCallTo(Chain, Type::VoidTy,
+                    false, false, false, CallingConv::C, false,
+                    getExternalSymbol("memmove", TLI.getPointerTy()),
+                    Args, *this);
+  return CallResult.second;
+}
+
+SDOperand SelectionDAG::getMemset(SDOperand Chain, SDOperand Dst,
                                   SDOperand Src, SDOperand Size,
-                                  SDOperand Align,
-                                  SDOperand AlwaysInline) {
-  SDOperand Ops[] = { Chain, Dest, Src, Size, Align, AlwaysInline };
-  return getNode(ISD::MEMSET, MVT::Other, Ops, 6);
+                                  unsigned Align,
+                                  Value *DstSV, uint64_t DstOff) {
+
+  // Check to see if we should lower the memset to stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memset with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDOperand Result =
+      getMemsetStores(*this, Chain, Dst, Src, ConstantSize->getValue(), Align,
+                      DstSV, DstOff);
+    if (Result.Val)
+      return Result;
+  }
+
+  // Then check to see if we should lower the memset with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDOperand Result =
+    TLI.EmitTargetCodeForMemset(*this, Chain, Dst, Src, Size, Align,
+                                DstSV, DstOff);
+  if (Result.Val)
+    return Result;
+
+  // Emit a library call.
+  const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType();
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Node = Dst; Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+  // Extend or truncate the argument to be an i32 value for the call.
+  if (Src.getValueType() > MVT::i32)
+    Src = getNode(ISD::TRUNCATE, MVT::i32, Src);
+  else
+    Src = getNode(ISD::ZERO_EXTEND, MVT::i32, Src);
+  Entry.Node = Src; Entry.Ty = Type::Int32Ty; Entry.isSExt = true;
+  Args.push_back(Entry);
+  Entry.Node = Size; Entry.Ty = IntPtrTy; Entry.isSExt = false;
+  Args.push_back(Entry);
+  std::pair<SDOperand,SDOperand> CallResult =
+    TLI.LowerCallTo(Chain, Type::VoidTy,
+                    false, false, false, CallingConv::C, false,
+                    getExternalSymbol("memset", TLI.getPointerTy()),
+                    Args, *this);
+  return CallResult.second;
 }
 
 SDOperand SelectionDAG::getAtomic(unsigned Opcode, SDOperand Chain, 
@@ -4009,11 +4339,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::STACKRESTORE:       return "stackrestore";
   case ISD::TRAP:               return "trap";
 
-  // Block memory operations.
-  case ISD::MEMSET:  return "memset";
-  case ISD::MEMCPY:  return "memcpy";
-  case ISD::MEMMOVE: return "memmove";
-
   // Bit manipulation
   case ISD::BSWAP:   return "bswap";
   case ISD::CTPOP:   return "ctpop";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index cfef9ac..ac5cfd2 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -647,8 +647,6 @@ public:
   void visitVAEnd(CallInst &I);
   void visitVACopy(CallInst &I);
 
-  void visitMemIntrinsic(CallInst &I, unsigned Op);
-
   void visitGetResult(GetResultInst &I);
 
   void visitUserOp1(Instruction &I) {
@@ -2737,18 +2735,48 @@ SelectionDAGLowering::visitIntrinsicCall(CallInst &I, unsigned Intrinsic) {
     return "_longjmp"+!TLI.usesUnderscoreLongJmp();
     break;
   case Intrinsic::memcpy_i32:
-  case Intrinsic::memcpy_i64:
-    visitMemIntrinsic(I, ISD::MEMCPY);
+  case Intrinsic::memcpy_i64: {
+    SDOperand Op1 = getValue(I.getOperand(1));
+    SDOperand Op2 = getValue(I.getOperand(2));
+    SDOperand Op3 = getValue(I.getOperand(3));
+    unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue();
+    DAG.setRoot(DAG.getMemcpy(getRoot(), Op1, Op2, Op3, Align, false,
+                              I.getOperand(1), 0, I.getOperand(2), 0));
     return 0;
+  }
   case Intrinsic::memset_i32:
-  case Intrinsic::memset_i64:
-    visitMemIntrinsic(I, ISD::MEMSET);
+  case Intrinsic::memset_i64: {
+    SDOperand Op1 = getValue(I.getOperand(1));
+    SDOperand Op2 = getValue(I.getOperand(2));
+    SDOperand Op3 = getValue(I.getOperand(3));
+    unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue();
+    DAG.setRoot(DAG.getMemset(getRoot(), Op1, Op2, Op3, Align,
+                              I.getOperand(1), 0));
     return 0;
+  }
   case Intrinsic::memmove_i32:
-  case Intrinsic::memmove_i64:
-    visitMemIntrinsic(I, ISD::MEMMOVE);
+  case Intrinsic::memmove_i64: {
+    SDOperand Op1 = getValue(I.getOperand(1));
+    SDOperand Op2 = getValue(I.getOperand(2));
+    SDOperand Op3 = getValue(I.getOperand(3));
+    unsigned Align = cast<ConstantInt>(I.getOperand(4))->getZExtValue();
+
+    // If the source and destination are known to not be aliases, we can
+    // lower memmove as memcpy.
+    uint64_t Size = -1ULL;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op3))
+      Size = C->getValue();
+    if (AA.alias(I.getOperand(1), Size, I.getOperand(2), Size) ==
+        AliasAnalysis::NoAlias) {
+      DAG.setRoot(DAG.getMemcpy(getRoot(), Op1, Op2, Op3, Align, false,
+                                I.getOperand(1), 0, I.getOperand(2), 0));
+      return 0;
+    }
+
+    DAG.setRoot(DAG.getMemmove(getRoot(), Op1, Op2, Op3, Align,
+                               I.getOperand(1), 0, I.getOperand(2), 0));
     return 0;
-    
+  }
   case Intrinsic::dbg_stoppoint: {
     MachineModuleInfo *MMI = DAG.getMachineModuleInfo();
     DbgStopPointInst &SPI = cast<DbgStopPointInst>(I);
@@ -4342,242 +4370,6 @@ SDOperand TargetLowering::CustomPromoteOperation(SDOperand Op,
   return SDOperand();
 }
 
-/// getMemsetValue - Vectorized representation of the memset value
-/// operand.
-static SDOperand getMemsetValue(SDOperand Value, MVT::ValueType VT,
-                                SelectionDAG &DAG) {
-  MVT::ValueType CurVT = VT;
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
-    uint64_t Val   = C->getValue() & 255;
-    unsigned Shift = 8;
-    while (CurVT != MVT::i8) {
-      Val = (Val << Shift) | Val;
-      Shift <<= 1;
-      CurVT = (MVT::ValueType)((unsigned)CurVT - 1);
-    }
-    return DAG.getConstant(Val, VT);
-  } else {
-    Value = DAG.getNode(ISD::ZERO_EXTEND, VT, Value);
-    unsigned Shift = 8;
-    while (CurVT != MVT::i8) {
-      Value =
-        DAG.getNode(ISD::OR, VT,
-                    DAG.getNode(ISD::SHL, VT, Value,
-                                DAG.getConstant(Shift, MVT::i8)), Value);
-      Shift <<= 1;
-      CurVT = (MVT::ValueType)((unsigned)CurVT - 1);
-    }
-
-    return Value;
-  }
-}
-
-/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
-/// used when a memcpy is turned into a memset when the source is a constant
-/// string ptr.
-static SDOperand getMemsetStringVal(MVT::ValueType VT,
-                                    SelectionDAG &DAG, TargetLowering &TLI,
-                                    std::string &Str, unsigned Offset) {
-  uint64_t Val = 0;
-  unsigned MSB = MVT::getSizeInBits(VT) / 8;
-  if (TLI.isLittleEndian())
-    Offset = Offset + MSB - 1;
-  for (unsigned i = 0; i != MSB; ++i) {
-    Val = (Val << 8) | (unsigned char)Str[Offset];
-    Offset += TLI.isLittleEndian() ? -1 : 1;
-  }
-  return DAG.getConstant(Val, VT);
-}
-
-/// getMemBasePlusOffset - Returns base and offset node for the 
-static SDOperand getMemBasePlusOffset(SDOperand Base, unsigned Offset,
-                                      SelectionDAG &DAG, TargetLowering &TLI) {
-  MVT::ValueType VT = Base.getValueType();
-  return DAG.getNode(ISD::ADD, VT, Base, DAG.getConstant(Offset, VT));
-}
-
-/// MeetsMaxMemopRequirement - Determines if the number of memory ops required
-/// to replace the memset / memcpy is below the threshold. It also returns the
-/// types of the sequence of  memory ops to perform memset / memcpy.
-static bool MeetsMaxMemopRequirement(std::vector<MVT::ValueType> &MemOps,
-                                     unsigned Limit, uint64_t Size,
-                                     unsigned Align, TargetLowering &TLI) {
-  MVT::ValueType VT;
-
-  if (TLI.allowsUnalignedMemoryAccesses()) {
-    VT = MVT::i64;
-  } else {
-    switch (Align & 7) {
-    case 0:
-      VT = MVT::i64;
-      break;
-    case 4:
-      VT = MVT::i32;
-      break;
-    case 2:
-      VT = MVT::i16;
-      break;
-    default:
-      VT = MVT::i8;
-      break;
-    }
-  }
-
-  MVT::ValueType LVT = MVT::i64;
-  while (!TLI.isTypeLegal(LVT))
-    LVT = (MVT::ValueType)((unsigned)LVT - 1);
-  assert(MVT::isInteger(LVT));
-
-  if (VT > LVT)
-    VT = LVT;
-
-  unsigned NumMemOps = 0;
-  while (Size != 0) {
-    unsigned VTSize = MVT::getSizeInBits(VT) / 8;
-    while (VTSize > Size) {
-      VT = (MVT::ValueType)((unsigned)VT - 1);
-      VTSize >>= 1;
-    }
-    assert(MVT::isInteger(VT));
-
-    if (++NumMemOps > Limit)
-      return false;
-    MemOps.push_back(VT);
-    Size -= VTSize;
-  }
-
-  return true;
-}
-
-void SelectionDAGLowering::visitMemIntrinsic(CallInst &I, unsigned Op) {
-  SDOperand Op1 = getValue(I.getOperand(1));
-  SDOperand Op2 = getValue(I.getOperand(2));
-  SDOperand Op3 = getValue(I.getOperand(3));
-  SDOperand Op4 = getValue(I.getOperand(4));
-  unsigned Align = (unsigned)cast<ConstantSDNode>(Op4)->getValue();
-  if (Align == 0) Align = 1;
-
-  // If the source and destination are known to not be aliases, we can
-  // lower memmove as memcpy.
-  if (Op == ISD::MEMMOVE) {
-    uint64_t Size = -1ULL;
-    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op3))
-      Size = C->getValue();
-    if (AA.alias(I.getOperand(1), Size, I.getOperand(2), Size) ==
-        AliasAnalysis::NoAlias)
-      Op = ISD::MEMCPY;
-  }
-
-  if (ConstantSDNode *Size = dyn_cast<ConstantSDNode>(Op3)) {
-    std::vector<MVT::ValueType> MemOps;
-
-    // Expand memset / memcpy to a series of load / store ops
-    // if the size operand falls below a certain threshold.
-    SmallVector<SDOperand, 8> OutChains;
-    switch (Op) {
-    default: break;  // Do nothing for now.
-    case ISD::MEMSET: {
-      if (MeetsMaxMemopRequirement(MemOps, TLI.getMaxStoresPerMemset(),
-                                   Size->getValue(), Align, TLI)) {
-        unsigned NumMemOps = MemOps.size();
-        unsigned Offset = 0;
-        for (unsigned i = 0; i < NumMemOps; i++) {
-          MVT::ValueType VT = MemOps[i];
-          unsigned VTSize = MVT::getSizeInBits(VT) / 8;
-          SDOperand Value = getMemsetValue(Op2, VT, DAG);
-          SDOperand Store = DAG.getStore(getRoot(), Value,
-                                    getMemBasePlusOffset(Op1, Offset, DAG, TLI),
-                                         I.getOperand(1), Offset);
-          OutChains.push_back(Store);
-          Offset += VTSize;
-        }
-      }
-      break;
-    }
-    case ISD::MEMCPY: {
-      if (MeetsMaxMemopRequirement(MemOps, TLI.getMaxStoresPerMemcpy(),
-                                   Size->getValue(), Align, TLI)) {
-        unsigned NumMemOps = MemOps.size();
-        unsigned SrcOff = 0, DstOff = 0, SrcDelta = 0;
-        GlobalAddressSDNode *G = NULL;
-        std::string Str;
-        bool CopyFromStr = false;
-
-        if (Op2.getOpcode() == ISD::GlobalAddress)
-          G = cast<GlobalAddressSDNode>(Op2);
-        else if (Op2.getOpcode() == ISD::ADD &&
-                 Op2.getOperand(0).getOpcode() == ISD::GlobalAddress &&
-                 Op2.getOperand(1).getOpcode() == ISD::Constant) {
-          G = cast<GlobalAddressSDNode>(Op2.getOperand(0));
-          SrcDelta = cast<ConstantSDNode>(Op2.getOperand(1))->getValue();
-        }
-        if (G) {
-          GlobalVariable *GV = dyn_cast<GlobalVariable>(G->getGlobal());
-          if (GV && GV->isConstant()) {
-            Str = GV->getStringValue(false);
-            if (!Str.empty()) {
-              CopyFromStr = true;
-              SrcOff += SrcDelta;
-            }
-          }
-        }
-
-        for (unsigned i = 0; i < NumMemOps; i++) {
-          MVT::ValueType VT = MemOps[i];
-          unsigned VTSize = MVT::getSizeInBits(VT) / 8;
-          SDOperand Value, Chain, Store;
-
-          if (CopyFromStr) {
-            Value = getMemsetStringVal(VT, DAG, TLI, Str, SrcOff);
-            Chain = getRoot();
-            Store =
-              DAG.getStore(Chain, Value,
-                           getMemBasePlusOffset(Op1, DstOff, DAG, TLI),
-                           I.getOperand(1), DstOff);
-          } else {
-            Value = DAG.getLoad(VT, getRoot(),
-                                getMemBasePlusOffset(Op2, SrcOff, DAG, TLI),
-                                I.getOperand(2), SrcOff, false, Align);
-            Chain = Value.getValue(1);
-            Store =
-              DAG.getStore(Chain, Value,
-                           getMemBasePlusOffset(Op1, DstOff, DAG, TLI),
-                           I.getOperand(1), DstOff, false, Align);
-          }
-          OutChains.push_back(Store);
-          SrcOff += VTSize;
-          DstOff += VTSize;
-        }
-      }
-      break;
-    }
-    }
-
-    if (!OutChains.empty()) {
-      DAG.setRoot(DAG.getNode(ISD::TokenFactor, MVT::Other,
-                  &OutChains[0], OutChains.size()));
-      return;
-    }
-  }
-
-  SDOperand AlwaysInline = DAG.getConstant(0, MVT::i1);
-  SDOperand Node;
-  switch(Op) {
-    default:
-      assert(0 && "Unknown Op");
-    case ISD::MEMCPY:
-      Node = DAG.getMemcpy(getRoot(), Op1, Op2, Op3, Op4, AlwaysInline);
-      break;
-    case ISD::MEMMOVE:
-      Node = DAG.getMemmove(getRoot(), Op1, Op2, Op3, Op4, AlwaysInline);
-      break;
-    case ISD::MEMSET:
-      Node = DAG.getMemset(getRoot(), Op1, Op2, Op3, Op4, AlwaysInline);
-      break;
-  }
-  DAG.setRoot(Node);
-}
-
 //===----------------------------------------------------------------------===//
 // SelectionDAGISel code
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a0894dd..f69f046 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -17,7 +17,7 @@
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/CallingConv.h"
+#include "llvm/GlobalVariable.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/ADT/StringExtras.h"
@@ -234,59 +234,6 @@ TargetLowering::TargetLowering(TargetMachine &tm)
 
 TargetLowering::~TargetLowering() {}
 
-
-SDOperand TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
-  assert(getSubtarget() && "Subtarget not defined");
-  SDOperand ChainOp = Op.getOperand(0);
-  SDOperand DestOp = Op.getOperand(1);
-  SDOperand SourceOp = Op.getOperand(2);
-  SDOperand CountOp = Op.getOperand(3);
-  SDOperand AlignOp = Op.getOperand(4);
-  SDOperand AlwaysInlineOp = Op.getOperand(5);
-
-  bool AlwaysInline = (bool)cast<ConstantSDNode>(AlwaysInlineOp)->getValue();
-  unsigned Align = (unsigned)cast<ConstantSDNode>(AlignOp)->getValue();
-  if (Align == 0) Align = 1;
-
-  // If size is unknown, call memcpy.
-  ConstantSDNode *I = dyn_cast<ConstantSDNode>(CountOp);
-  if (!I) {
-    assert(!AlwaysInline && "Cannot inline copy of unknown size");
-    return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
-  }
-
-  // If not DWORD aligned or if size is more than threshold, then call memcpy.
-  // The libc version is likely to be faster for the following cases. It can
-  // use the address value and run time information about the CPU.
-  // With glibc 2.6.1 on a core 2, coping an array of 100M longs was 30% faster
-  unsigned Size = I->getValue();
-  if (AlwaysInline ||
-      (Size <= getSubtarget()->getMaxInlineSizeThreshold() &&
-       (Align & 3) == 0))
-    return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
-  return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
-}
-
-
-SDOperand TargetLowering::LowerMEMCPYCall(SDOperand Chain,
-                                          SDOperand Dest,
-                                          SDOperand Source,
-                                          SDOperand Count,
-                                          SelectionDAG &DAG) {
-  MVT::ValueType IntPtr = getPointerTy();
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
-  Entry.Ty = getTargetData()->getIntPtrType();
-  Entry.Node = Dest; Args.push_back(Entry);
-  Entry.Node = Source; Args.push_back(Entry);
-  Entry.Node = Count; Args.push_back(Entry);
-  std::pair<SDOperand,SDOperand> CallResult =
-      LowerCallTo(Chain, Type::VoidTy, false, false, false, CallingConv::C,
-                  false, DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
-  return CallResult.second;
-}
-
-
 /// computeRegisterProperties - Once all of the register classes are added,
 /// this allows us to compute derived properties we expose.
 void TargetLowering::computeRegisterProperties() {
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 7218560..0095352 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -197,11 +197,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
 
-  // Expand mem operations genericly.
-  setOperationAction(ISD::MEMSET          , MVT::Other, Expand);
-  setOperationAction(ISD::MEMCPY          , MVT::Other, Custom);
-  setOperationAction(ISD::MEMMOVE         , MVT::Other, Expand);
-
   // Use the default implementation.
   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
   setOperationAction(ISD::VAARG             , MVT::Other, Expand);
@@ -1246,18 +1241,30 @@ static SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
   return DAG.getNode(ARMISD::CNEG, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
 }
 
-SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain,
-                                               SDOperand Dest,
-                                               SDOperand Source,
-                                               unsigned Size,
-                                               unsigned Align,
-                                               SelectionDAG &DAG) {
+SDOperand
+ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
+                                           SDOperand Chain,
+                                           SDOperand Dst, SDOperand Src,
+                                           SDOperand Size, unsigned Align,
+                                           bool AlwaysInline,
+                                           Value *DstSV, uint64_t DstOff,
+                                           Value *SrcSV, uint64_t SrcOff){
   // Do repeated 4-byte loads and stores. To be improved.
-  assert((Align & 3) == 0 && "Expected 4-byte aligned addresses!");
-  unsigned BytesLeft = Size & 3;
-  unsigned NumMemOps = Size >> 2;
+  // This requires 4-byte alignment.
+  if ((Align & 3) != 0)
+    return SDOperand();
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDOperand();
+  uint64_t SizeVal = ConstantSize->getValue();
+  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
+    return SDOperand();
+
+  unsigned BytesLeft = SizeVal & 3;
+  unsigned NumMemOps = SizeVal >> 2;
   unsigned EmittedNumMemOps = 0;
-  unsigned SrcOff = 0, DstOff = 0;
   MVT::ValueType VT = MVT::i32;
   unsigned VTSize = 4;
   unsigned i = 0;
@@ -1272,9 +1279,9 @@ SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain,
     for (i = 0;
          i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
       Loads[i] = DAG.getLoad(VT, Chain,
-                             DAG.getNode(ISD::ADD, MVT::i32, Source,
+                             DAG.getNode(ISD::ADD, MVT::i32, Src,
                                          DAG.getConstant(SrcOff, MVT::i32)),
-                             NULL, 0);
+                             SrcSV, SrcOff);
       TFOps[i] = Loads[i].getValue(1);
       SrcOff += VTSize;
     }
@@ -1283,9 +1290,9 @@ SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain,
     for (i = 0;
          i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
       TFOps[i] = DAG.getStore(Chain, Loads[i],
-                           DAG.getNode(ISD::ADD, MVT::i32, Dest, 
+                           DAG.getNode(ISD::ADD, MVT::i32, Dst, 
                                        DAG.getConstant(DstOff, MVT::i32)),
-                           NULL, 0);
+                           DstSV, DstOff);
       DstOff += VTSize;
     }
     Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i);
@@ -1309,9 +1316,9 @@ SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain,
     }
 
     Loads[i] = DAG.getLoad(VT, Chain,
-                           DAG.getNode(ISD::ADD, MVT::i32, Source,
+                           DAG.getNode(ISD::ADD, MVT::i32, Src,
                                        DAG.getConstant(SrcOff, MVT::i32)),
-                           NULL, 0);
+                           SrcSV, SrcOff);
     TFOps[i] = Loads[i].getValue(1);
     ++i;
     SrcOff += VTSize;
@@ -1331,9 +1338,9 @@ SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain,
     }
 
     TFOps[i] = DAG.getStore(Chain, Loads[i],
-                            DAG.getNode(ISD::ADD, MVT::i32, Dest, 
+                            DAG.getNode(ISD::ADD, MVT::i32, Dst, 
                                         DAG.getConstant(DstOff, MVT::i32)),
-                            NULL, 0);
+                            DstSV, DstOff);
     ++i;
     DstOff += VTSize;
     BytesLeft -= VTSize;
@@ -1409,7 +1416,6 @@ SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
   case ISD::RETURNADDR:    break;
   case ISD::FRAMEADDR:     break;
   case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
-  case ISD::MEMCPY:        return LowerMEMCPY(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
       
       
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 285a20d..58d8d8c 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -119,8 +119,8 @@ namespace llvm {
     getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                       MVT::ValueType VT) const;
 
-    virtual const TargetSubtarget* getSubtarget() {
-      return static_cast<const TargetSubtarget*>(Subtarget);
+    virtual const ARMSubtarget* getSubtarget() {
+      return Subtarget;
     }
 
   private:
@@ -143,11 +143,14 @@ namespace llvm {
     SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG);
-    SDOperand LowerMEMCPYInline(SDOperand Chain, SDOperand Dest,
-                                SDOperand Source, unsigned Size,
-                                unsigned Align, SelectionDAG &DAG);
-
 
+    SDOperand EmitTargetCodeForMemcpy(SelectionDAG &DAG,
+                                      SDOperand Chain,
+                                      SDOperand Dst, SDOperand Src,
+                                      SDOperand Size, unsigned Align,
+                                      bool AlwaysInline,
+                                      Value *DstSV, uint64_t DstOff,
+                                      Value *SrcSV, uint64_t SrcOff);
   };
 }
 
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index c43924b..fbc9e57 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -62,6 +62,8 @@ protected:
   ///
   ARMSubtarget(const Module &M, const std::string &FS, bool thumb);
 
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
   unsigned getMaxInlineSizeThreshold() const {
     // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb.
     // Change this once Thumb ldmia / stmia support is added.
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
index d208f59..91b1180 100644
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -87,10 +87,6 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
   setOperationAction(ISD::SDIV     , MVT::i64, Custom);
   setOperationAction(ISD::UDIV     , MVT::i64, Custom);
 
-  setOperationAction(ISD::MEMMOVE  , MVT::Other, Expand);
-  setOperationAction(ISD::MEMSET   , MVT::Other, Expand);
-  setOperationAction(ISD::MEMCPY   , MVT::Other, Expand);
-  
   // We don't support sin/cos/sqrt/pow
   setOperationAction(ISD::FSIN , MVT::f64, Expand);
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index 2922609..1cb6918 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -175,9 +175,6 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);  
 
   // SPU has no intrinsics for these particular operations:
-  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
-  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
-  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 
   // PowerPC has no SREM/UREM instructions
diff --git a/lib/Target/IA64/IA64ISelLowering.cpp b/lib/Target/IA64/IA64ISelLowering.cpp
index 2ec08b6..c53f3b4 100644
--- a/lib/Target/IA64/IA64ISelLowering.cpp
+++ b/lib/Target/IA64/IA64ISelLowering.cpp
@@ -65,9 +65,6 @@ IA64TargetLowering::IA64TargetLowering(TargetMachine &TM)
       setOperationAction(ISD::UREM             , MVT::f32  , Expand);
       setOperationAction(ISD::UREM             , MVT::f64  , Expand);
 
-      setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
-      setOperationAction(ISD::MEMSET           , MVT::Other, Expand);
-      setOperationAction(ISD::MEMCPY           , MVT::Other, Expand);
       setOperationAction(ISD::MEMBARRIER       , MVT::Other, Expand);
 
       setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 5c2e1c0..5ea9cdd 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -80,9 +80,6 @@ MipsTargetLowering(MipsTargetMachine &TM): TargetLowering(TM)
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
   // Mips not supported intrinsics.
-  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
-  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
-  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 
   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index ddc8e1a..e42e9dc 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -78,9 +78,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
 
   // PowerPC has no intrinsics for these particular operations
-  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
-  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
-  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 
   // PowerPC has no SREM/UREM instructions
@@ -1735,10 +1732,9 @@ static SDOperand
 CreateCopyOfByValArgument(SDOperand Src, SDOperand Dst, SDOperand Chain,
                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                           unsigned Size) {
-  SDOperand AlignNode    = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
-  SDOperand SizeNode     = DAG.getConstant(Size, MVT::i32);
-  SDOperand AlwaysInline = DAG.getConstant(0, MVT::i32);
-  return DAG.getMemcpy(Chain, Dst, Src, SizeNode, AlignNode, AlwaysInline);
+  SDOperand SizeNode = DAG.getConstant(Size, MVT::i32);
+  return DAG.getMemcpy(Chain, Dst, Src, SizeNode, Flags.getByValAlign(), false,
+                       NULL, 0, NULL, 0);
 }
 
 SDOperand PPCTargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG,
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 1d4fe0b..3d5ad0b 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -570,9 +570,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
   
   // SPARC has no intrinsics for these particular operations.
-  setOperationAction(ISD::MEMMOVE, MVT::Other, Expand);
-  setOperationAction(ISD::MEMSET, MVT::Other, Expand);
-  setOperationAction(ISD::MEMCPY, MVT::Other, Expand);
   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 
   setOperationAction(ISD::FSIN , MVT::f64, Expand);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 66384f9..9db0288 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -206,7 +206,6 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
   setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
-  setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
   if (Subtarget->is64Bit())
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
@@ -281,9 +280,6 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
   }
-  // X86 wants to expand memset / memcpy itself.
-  setOperationAction(ISD::MEMSET          , MVT::Other, Custom);
-  setOperationAction(ISD::MEMCPY          , MVT::Other, Custom);
 
   if (Subtarget->hasSSE1())
     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
@@ -1113,10 +1109,10 @@ CopyTailCallClobberedArgumentsToVRegs(SDOperand Chain,
 static SDOperand 
 CreateCopyOfByValArgument(SDOperand Src, SDOperand Dst, SDOperand Chain,
                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG) {
-  SDOperand AlignNode    = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
   SDOperand SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
-  SDOperand AlwaysInline = DAG.getConstant(1, MVT::i32);
-  return DAG.getMemcpy(Chain, Dst, Src, SizeNode, AlignNode, AlwaysInline);
+  return DAG.getMemcpy(Chain, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*AlwaysInline=*/true,
+                       NULL, 0, NULL, 0);
 }
 
 SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
@@ -4557,52 +4553,51 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDOperand Op,
   return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops1, 2);
 }
 
-SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
-  SDOperand InFlag(0, 0);
-  SDOperand Chain = Op.getOperand(0);
-  unsigned Align =
-    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
-  if (Align == 0) Align = 1;
-
-  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
-  // If not DWORD aligned or size is more than the threshold, call memset.
-  // The libc version is likely to be faster for these cases. It can use the
-  // address value and run time information about the CPU.
-  if ((Align & 3) != 0 ||
-      (I && I->getValue() > Subtarget->getMaxInlineSizeThreshold())) {
+SDOperand
+X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
+                                           SDOperand Chain,
+                                           SDOperand Dst, SDOperand Src,
+                                           SDOperand Size, unsigned Align,
+                                           Value *DstSV, uint64_t DstOff) {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  // If not DWORD aligned, or the size is unknown or above the threshold, call
+  // the library. The libc version is likely to be faster for these cases. It
+  // can use the address value and run time information about the CPU.
+  if ((Align & 3) != 0 ||
+      !ConstantSize ||
+      ConstantSize->getValue() > getSubtarget()->getMaxInlineSizeThreshold()) {
+    SDOperand InFlag(0, 0);
 
     // Check to see if there is a specialized entry-point for memory zeroing.
-    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Op.getOperand(2));
-    const char *bzeroEntry = 
-      V && V->isNullValue() ? Subtarget->getBZeroEntry() : 0;
-
-    MVT::ValueType IntPtr = getPointerTy();
-    const Type *IntPtrTy = getTargetData()->getIntPtrType();
-    TargetLowering::ArgListTy Args; 
-    TargetLowering::ArgListEntry Entry;
-    Entry.Node = Op.getOperand(1);
-    Entry.Ty = IntPtrTy;
-    Args.push_back(Entry);
-
-    if (!bzeroEntry) {
-      // Extend the unsigned i8 argument to be an int value for the call.
-      Entry.Node = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Op.getOperand(2));
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+    if (const char *bzeroEntry = 
+          V && V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+      MVT::ValueType IntPtr = getPointerTy();
+      const Type *IntPtrTy = getTargetData()->getIntPtrType();
+      TargetLowering::ArgListTy Args; 
+      TargetLowering::ArgListEntry Entry;
+      Entry.Node = Dst;
       Entry.Ty = IntPtrTy;
       Args.push_back(Entry);
+      Entry.Node = Size;
+      Args.push_back(Entry);
+      std::pair<SDOperand,SDOperand> CallResult =
+        LowerCallTo(Chain, Type::VoidTy, false, false, false, CallingConv::C,
+                    false, DAG.getExternalSymbol(bzeroEntry, IntPtr),
+                    Args, DAG);
+      return CallResult.second;
     }
 
-    Entry.Node = Op.getOperand(3);
-    Args.push_back(Entry);
-    const char *Name = bzeroEntry ? bzeroEntry : "memset";
-    std::pair<SDOperand,SDOperand> CallResult =
-      LowerCallTo(Chain, Type::VoidTy, false, false, false, CallingConv::C,
-                  false, DAG.getExternalSymbol(Name, IntPtr), Args, DAG);
-    return CallResult.second;
+    // Otherwise have the target-independent code call memset.
+    return SDOperand();
   }
 
+  uint64_t SizeVal = ConstantSize->getValue();
+  SDOperand InFlag(0, 0);
   MVT::ValueType AVT;
   SDOperand Count;
-  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
   unsigned BytesLeft = 0;
   bool TwoRepStos = false;
   if (ValC) {
@@ -4630,22 +4625,14 @@ SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
       default:  // Byte aligned
         AVT = MVT::i8;
         ValReg = X86::AL;
-        Count = Op.getOperand(3);
+        Count = Size;
         break;
     }
 
     if (AVT > MVT::i8) {
-      if (I) {
-        unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
-        Count = DAG.getIntPtrConstant(I->getValue() / UBytes);
-        BytesLeft = I->getValue() % UBytes;
-      } else {
-        assert(AVT >= MVT::i32 &&
-               "Do not use rep;stos if not at least DWORD aligned");
-        Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(),
-                            Op.getOperand(3), DAG.getConstant(2, MVT::i8));
-        TwoRepStos = true;
-      }
+      unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+      BytesLeft = SizeVal % UBytes;
     }
 
     Chain  = DAG.getCopyToReg(Chain, ValReg, DAG.getConstant(Val, AVT),
@@ -4653,8 +4640,8 @@ SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
     InFlag = Chain.getValue(1);
   } else {
     AVT = MVT::i8;
-    Count  = Op.getOperand(3);
-    Chain  = DAG.getCopyToReg(Chain, X86::AL, Op.getOperand(2), InFlag);
+    Count  = Size;
+    Chain  = DAG.getCopyToReg(Chain, X86::AL, Src, InFlag);
     InFlag = Chain.getValue(1);
   }
 
@@ -4662,7 +4649,7 @@ SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
                             Count, InFlag);
   InFlag = Chain.getValue(1);
   Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
-                            Op.getOperand(1), InFlag);
+                            Dst, InFlag);
   InFlag = Chain.getValue(1);
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
@@ -4674,7 +4661,7 @@ SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
 
   if (TwoRepStos) {
     InFlag = Chain.getValue(1);
-    Count = Op.getOperand(3);
+    Count  = Size;
     MVT::ValueType CVT = Count.getValueType();
     SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
                                DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
@@ -4688,79 +4675,68 @@ SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
     Ops.push_back(InFlag);
     Chain  = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
   } else if (BytesLeft) {
-    // Issue stores for the last 1 - 7 bytes.
-    SDOperand Value;
-    unsigned Val = ValC->getValue() & 255;
-    unsigned Offset = I->getValue() - BytesLeft;
-    SDOperand DstAddr = Op.getOperand(1);
-    MVT::ValueType AddrVT = DstAddr.getValueType();
-    if (BytesLeft >= 4) {
-      Val = (Val << 8)  | Val;
-      Val = (Val << 16) | Val;
-      Value = DAG.getConstant(Val, MVT::i32);
-      Chain = DAG.getStore(Chain, Value,
-                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
-                                       DAG.getConstant(Offset, AddrVT)),
-                           NULL, 0);
-      BytesLeft -= 4;
-      Offset += 4;
-    }
-    if (BytesLeft >= 2) {
-      Value = DAG.getConstant((Val << 8) | Val, MVT::i16);
-      Chain = DAG.getStore(Chain, Value,
-                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
-                                       DAG.getConstant(Offset, AddrVT)),
-                           NULL, 0);
-      BytesLeft -= 2;
-      Offset += 2;
-    }
-    if (BytesLeft == 1) {
-      Value = DAG.getConstant(Val, MVT::i8);
-      Chain = DAG.getStore(Chain, Value,
-                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
-                                       DAG.getConstant(Offset, AddrVT)),
-                           NULL, 0);
-    }
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    MVT::ValueType AddrVT = Dst.getValueType();
+    MVT::ValueType SizeVT = Size.getValueType();
+
+    Chain = DAG.getMemset(Chain,
+                          DAG.getNode(ISD::ADD, AddrVT, Dst,
+                                      DAG.getConstant(Offset, AddrVT)),
+                          Src,
+                          DAG.getConstant(BytesLeft, SizeVT),
+                          Align, DstSV, Offset);
   }
 
+  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
   return Chain;
 }
 
-SDOperand X86TargetLowering::LowerMEMCPYInline(SDOperand Chain,
-                                               SDOperand Dest,
-                                               SDOperand Source,
-                                               unsigned Size,
-                                               unsigned Align,
-                                               SelectionDAG &DAG) {
+SDOperand
+X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
+                                           SDOperand Chain,
+                                           SDOperand Dst, SDOperand Src,
+                                           SDOperand Size, unsigned Align,
+                                           bool AlwaysInline,
+                                           Value *DstSV, uint64_t DstOff,
+                                           Value *SrcSV, uint64_t SrcOff){
+  
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDOperand();
+  uint64_t SizeVal = ConstantSize->getValue();
+  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
+    return SDOperand();
+
+  SmallVector<SDOperand, 4> Results;
+
   MVT::ValueType AVT;
   unsigned BytesLeft = 0;
-  switch (Align & 3) {
-    case 2:   // WORD aligned
-      AVT = MVT::i16;
-      break;
-    case 0:  // DWORD aligned
-      AVT = MVT::i32;
-      if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
-        AVT = MVT::i64;
-      break;
-    default:  // Byte aligned
-      AVT = MVT::i8;
-      break;
-  }
+  if (Align >= 8 && Subtarget->is64Bit())
+    AVT = MVT::i64;
+  else if (Align >= 4)
+    AVT = MVT::i32;
+  else if (Align >= 2)
+    AVT = MVT::i16;
+  else
+    AVT = MVT::i8;
 
   unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
-  SDOperand Count = DAG.getIntPtrConstant(Size / UBytes);
-  BytesLeft = Size % UBytes;
+  unsigned CountVal = SizeVal / UBytes;
+  SDOperand Count = DAG.getIntPtrConstant(CountVal);
+  BytesLeft = SizeVal % UBytes;
 
   SDOperand InFlag(0, 0);
   Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
                             Count, InFlag);
   InFlag = Chain.getValue(1);
   Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
-                            Dest, InFlag);
+                            Dst, InFlag);
   InFlag = Chain.getValue(1);
   Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RSI : X86::ESI,
-                            Source, InFlag);
+                            Src, InFlag);
   InFlag = Chain.getValue(1);
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
@@ -4768,57 +4744,28 @@ SDOperand X86TargetLowering::LowerMEMCPYInline(SDOperand Chain,
   Ops.push_back(Chain);
   Ops.push_back(DAG.getValueType(AVT));
   Ops.push_back(InFlag);
-  Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
+  Results.push_back(DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size()));
 
   if (BytesLeft) {
-    // Issue loads and stores for the last 1 - 7 bytes.
-    unsigned Offset = Size - BytesLeft;
-    SDOperand DstAddr = Dest;
-    MVT::ValueType DstVT = DstAddr.getValueType();
-    SDOperand SrcAddr = Source;
-    MVT::ValueType SrcVT = SrcAddr.getValueType();
-    SDOperand Value;
-    if (BytesLeft >= 4) {
-      Value = DAG.getLoad(MVT::i32, Chain,
-                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
-                                      DAG.getConstant(Offset, SrcVT)),
-                          NULL, 0);
-      Chain = Value.getValue(1);
-      Chain = DAG.getStore(Chain, Value,
-                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
-                                       DAG.getConstant(Offset, DstVT)),
-                           NULL, 0);
-      BytesLeft -= 4;
-      Offset += 4;
-    }
-    if (BytesLeft >= 2) {
-      Value = DAG.getLoad(MVT::i16, Chain,
-                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
-                                      DAG.getConstant(Offset, SrcVT)),
-                          NULL, 0);
-      Chain = Value.getValue(1);
-      Chain = DAG.getStore(Chain, Value,
-                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
-                                       DAG.getConstant(Offset, DstVT)),
-                           NULL, 0);
-      BytesLeft -= 2;
-      Offset += 2;
-    }
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    MVT::ValueType DstVT = Dst.getValueType();
+    MVT::ValueType SrcVT = Src.getValueType();
+    MVT::ValueType SizeVT = Size.getValueType();
 
-    if (BytesLeft == 1) {
-      Value = DAG.getLoad(MVT::i8, Chain,
-                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
-                                      DAG.getConstant(Offset, SrcVT)),
-                          NULL, 0);
-      Chain = Value.getValue(1);
-      Chain = DAG.getStore(Chain, Value,
-                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
-                                       DAG.getConstant(Offset, DstVT)),
-                           NULL, 0);
-    }
+    // Advance the source-value offsets along with the addresses for the tail.
+    SDOperand DstTail = DAG.getNode(ISD::ADD, DstVT, Dst,
+                                    DAG.getConstant(Offset, DstVT));
+    SDOperand SrcTail = DAG.getNode(ISD::ADD, SrcVT, Src,
+                                    DAG.getConstant(Offset, SrcVT));
+    Results.push_back(DAG.getMemcpy(Chain, DstTail, SrcTail,
+                                    DAG.getConstant(BytesLeft, SizeVT),
+                                    Align, AlwaysInline,
+                                    DstSV, DstOff + Offset,
+                                    SrcSV, SrcOff + Offset));
   }
 
-  return Chain;
+  return DAG.getNode(ISD::TokenFactor, MVT::Other, &Results[0], Results.size());
 }
 
 /// Expand the result of: i64,outchain = READCYCLECOUNTER inchain
@@ -5430,8 +5377,6 @@ SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
   case ISD::CALL:               return LowerCALL(Op, DAG);
   case ISD::RET:                return LowerRET(Op, DAG);
   case ISD::FORMAL_ARGUMENTS:   return LowerFORMAL_ARGUMENTS(Op, DAG);
-  case ISD::MEMSET:             return LowerMEMSET(Op, DAG);
-  case ISD::MEMCPY:             return LowerMEMCPY(Op, DAG);
   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index d809950..2abe237 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -441,8 +441,8 @@ namespace llvm {
                                                    SDOperand Ret, 
                                                    SelectionDAG &DAG) const;
 
-    virtual const TargetSubtarget* getSubtarget() {
-      return static_cast<const TargetSubtarget*>(Subtarget);
+    virtual const X86Subtarget* getSubtarget() {
+      return Subtarget;
     }
 
     /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
@@ -512,9 +512,6 @@ namespace llvm {
     SDOperand LowerSELECT(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerBRCOND(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerMEMSET(SDOperand Op, SelectionDAG &DAG);
-    SDOperand LowerMEMCPYInline(SDOperand Dest, SDOperand Source,
-                                SDOperand Chain, unsigned Size, unsigned Align,
-                                SelectionDAG &DAG);
     SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
     SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG);
@@ -535,6 +532,19 @@ namespace llvm {
     SDNode *ExpandFP_TO_SINT(SDNode *N, SelectionDAG &DAG);
     SDNode *ExpandREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG);
     SDNode *ExpandATOMIC_LCS(SDNode *N, SelectionDAG &DAG);
+
+    SDOperand EmitTargetCodeForMemset(SelectionDAG &DAG,
+                                      SDOperand Chain,
+                                      SDOperand Dst, SDOperand Src,
+                                      SDOperand Size, unsigned Align,
+                                      Value *DstSV, uint64_t DstOff);
+    SDOperand EmitTargetCodeForMemcpy(SelectionDAG &DAG,
+                                      SDOperand Chain,
+                                      SDOperand Dst, SDOperand Src,
+                                      SDOperand Size, unsigned Align,
+                                      bool AlwaysInline,
+                                      Value *DstSV, uint64_t DstOff,
+                                      Value *SrcSV, uint64_t SrcOff);
   };
 }
 
diff --git a/test/CodeGen/X86/2004-02-12-Memcpy.llx b/test/CodeGen/X86/2004-02-12-Memcpy.llx
index 151c5a5..59364c1 100644
--- a/test/CodeGen/X86/2004-02-12-Memcpy.llx
+++ b/test/CodeGen/X86/2004-02-12-Memcpy.llx
@@ -1,5 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep movs | count 1
-; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep memcpy | count 2
+; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-pc-linux-gnu | grep movs | count 3
 
 @A = global [32 x i32] zeroinitializer
 @B = global [32 x i32] zeroinitializer
diff --git a/test/CodeGen/X86/byval2.ll b/test/CodeGen/X86/byval2.ll
index f438160..f85c8ff 100644
--- a/test/CodeGen/X86/byval2.ll
+++ b/test/CodeGen/X86/byval2.ll
@@ -1,7 +1,9 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsq | count 2
 ; RUN: llvm-as < %s | llc -march=x86    | grep rep.movsl | count 2
 
-%struct.s = type { i64, i64, i64 }
+%struct.s = type { i64, i64, i64, i64, i64, i64, i64, i64,
+                   i64, i64, i64, i64, i64, i64, i64, i64,
+                   i64 }
 
 define void @g(i64 %a, i64 %b, i64 %c) {
 entry:
diff --git a/test/CodeGen/X86/byval3.ll b/test/CodeGen/X86/byval3.ll
index b3794ec..074bab4 100644
--- a/test/CodeGen/X86/byval3.ll
+++ b/test/CodeGen/X86/byval3.ll
@@ -1,7 +1,11 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsl | count 2
 ; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl | count 2
 
-%struct.s = type { i32, i32, i32, i32, i32, i32 }
+%struct.s = type { i32, i32, i32, i32, i32, i32, i32, i32,
+                   i32, i32, i32, i32, i32, i32, i32, i32,
+                   i32, i32, i32, i32, i32, i32, i32, i32,
+                   i32, i32, i32, i32, i32, i32, i32, i32,
+                   i32 }
 
 define void @g(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6) {
 entry:
diff --git a/test/CodeGen/X86/byval4.ll b/test/CodeGen/X86/byval4.ll
index 591749f..d2fa9e2 100644
--- a/test/CodeGen/X86/byval4.ll
+++ b/test/CodeGen/X86/byval4.ll
@@ -1,7 +1,15 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsw | count 2
 ; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl	 | count 2
 
-%struct.s = type { i16, i16, i16, i16, i16, i16 }
+%struct.s = type { i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16, i16, i16, i16, i16, i16, i16, i16,
+                   i16 }
 
 
 define void @g(i16 signext  %a1, i16 signext  %a2, i16 signext  %a3,
diff --git a/test/CodeGen/X86/byval5.ll b/test/CodeGen/X86/byval5.ll
index 4965d16..fd9c197 100644
--- a/test/CodeGen/X86/byval5.ll
+++ b/test/CodeGen/X86/byval5.ll
@@ -1,7 +1,23 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 | grep rep.movsb | count 2
 ; RUN: llvm-as < %s | llc -march=x86 | grep rep.movsl	 | count 2
 
-%struct.s = type { i8, i8, i8, i8, i8, i8 }
+%struct.s = type { i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8, i8, i8, i8, i8, i8, i8, i8,
+                   i8 }
 
 
 define void @g(i8 signext  %a1, i8 signext  %a2, i8 signext  %a3,
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 4199bf0..fcbc59b 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -1,6 +1,7 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | grep add | grep 16
 
-	%struct.S = type { <2 x i64> }
+	%struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
+                           <2 x i64> }
 
 define i32 @main() nounwind  {
 entry:
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
new file mode 100644
index 0000000..dedd948
--- /dev/null
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as < %s | llc | not grep movs
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+
+define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %z) nounwind  {
+entry:
+	%iz = alloca { x86_fp80, x86_fp80 }		; <{ x86_fp80, x86_fp80 }*> [#uses=3]
+	%tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1		; <x86_fp80*> [#uses=1]
+	%tmp2 = load x86_fp80* %tmp1, align 16		; <x86_fp80> [#uses=1]
+	%tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2		; <x86_fp80> [#uses=1]
+	%tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1		; <x86_fp80*> [#uses=1]
+	%real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0		; <x86_fp80*> [#uses=1]
+	%tmp6 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0		; <x86_fp80*> [#uses=1]
+	%tmp7 = load x86_fp80* %tmp6, align 16		; <x86_fp80> [#uses=1]
+	store x86_fp80 %tmp3, x86_fp80* %real, align 16
+	store x86_fp80 %tmp7, x86_fp80* %tmp4, align 16
+	call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval align 4  %iz ) nounwind 
+	ret void
+}
+
+declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval align 4 ) nounwind 
diff --git a/test/CodeGen/X86/variable-sized-darwin-bzero.ll b/test/CodeGen/X86/variable-sized-darwin-bzero.ll
new file mode 100644
index 0000000..b0cdf49
--- /dev/null
+++ b/test/CodeGen/X86/variable-sized-darwin-bzero.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-as < %s | llc -march=x86 -mtriple=i686-apple-darwin10 | grep __bzero
+
+declare void @llvm.memset.i64(i8*, i8, i64, i32)
+
+define void @foo(i8* %p, i64 %n) {
+  call void @llvm.memset.i64(i8* %p, i8 0, i64 %n, i32 4)
+  ret void
+}