diff options
author | Evan Cheng <evan.cheng@apple.com> | 2007-10-22 22:11:27 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2007-10-22 22:11:27 +0000 |
commit | 4102eb57bbeecbbf5b5b5122ed1ecd4cd5487878 (patch) | |
tree | 6e5fa21a413cd28b0c0076a1d5b083d800be6365 | |
parent | ac72058dd6afcb41ddf8b19c4ec2093999b34282 (diff) | |
download | external_llvm-4102eb57bbeecbbf5b5b5122ed1ecd4cd5487878.zip external_llvm-4102eb57bbeecbbf5b5b5122ed1ecd4cd5487878.tar.gz external_llvm-4102eb57bbeecbbf5b5b5122ed1ecd4cd5487878.tar.bz2 |
Fix memcpy lowering when addresses are 4-byte aligned but size is not a multiple of 4.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@43234 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 117 | ||||
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.h | 5 | ||||
-rw-r--r-- | lib/Target/ARM/README-Thumb.txt | 4 | ||||
-rw-r--r-- | test/CodeGen/ARM/memcpy-inline.ll | 15 |
4 files changed, 101 insertions, 40 deletions
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 99d1d24..154832b 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1287,7 +1287,8 @@ static SDOperand LowerSRx(SDOperand Op, SelectionDAG &DAG, return DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Lo, Hi); } -SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) { +SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { SDOperand ChainOp = Op.getOperand(0); SDOperand DestOp = Op.getOperand(1); SDOperand SourceOp = Op.getOperand(2); @@ -1305,25 +1306,18 @@ SDOperand ARMTargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) { assert(!AlwaysInline && "Cannot inline copy of unknown size"); return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); } - unsigned Size = I->getValue(); - - if (AlwaysInline) - return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG); - // The libc version is likely to be faster for the following cases. It can + // If not DWORD aligned or if size is more than threshold, then call memcpy. + // The libc version is likely to be faster for the these cases. It can // use the address value and run time information about the CPU. // With glibc 2.6.1 on a core 2, coping an array of 100M longs was 30% faster - - // If not DWORD aligned, call memcpy. - if ((Align & 3) != 0) - return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); - - // If size is more than the threshold, call memcpy. - // if (Size > Subtarget->getMinRepStrSizeThreshold()) - if (Size >= 64) - return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); - - return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG); + // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb. Change + // this once Thumb ldmia / stmia support is added. 
+ unsigned Size = I->getValue(); + if (AlwaysInline || + (!ST->isThumb() && Size < 64 && (Align & 3) == 0)) + return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG); + return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG); } SDOperand ARMTargetLowering::LowerMEMCPYCall(SDOperand Chain, @@ -1350,46 +1344,93 @@ SDOperand ARMTargetLowering::LowerMEMCPYInline(SDOperand Chain, unsigned Size, unsigned Align, SelectionDAG &DAG) { - - // Do repeated 4-byte loads and stores. To be improved. - assert((Size& 3) == 0); - assert((Align & 3) == 0); + // Do repeated 4-byte loads and stores. To be improved. + assert((Align & 3) == 0 && "Expected 4-byte aligned addresses!"); + unsigned BytesLeft = Size & 3; unsigned NumMemOps = Size >> 2; unsigned EmittedNumMemOps = 0; unsigned SrcOff = 0, DstOff = 0; MVT::ValueType VT = MVT::i32; unsigned VTSize = 4; + unsigned i = 0; const unsigned MAX_LOADS_IN_LDM = 6; - SDOperand LoadChains[MAX_LOADS_IN_LDM]; + SDOperand TFOps[MAX_LOADS_IN_LDM]; SDOperand Loads[MAX_LOADS_IN_LDM]; - // Emit up to 4 loads, then a TokenFactor barrier, then the same - // number of stores. The loads and stores will get combined into + // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the + // same number of stores. The loads and stores will get combined into // ldm/stm later on. 
- while(EmittedNumMemOps < NumMemOps) { - unsigned i; - for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) { + while (EmittedNumMemOps < NumMemOps) { + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { Loads[i] = DAG.getLoad(VT, Chain, - DAG.getNode(ISD::ADD, VT, Source, - DAG.getConstant(SrcOff, VT)), + DAG.getNode(ISD::ADD, MVT::i32, Source, + DAG.getConstant(SrcOff, MVT::i32)), NULL, 0); - LoadChains[i] = Loads[i].getValue(1); + TFOps[i] = Loads[i].getValue(1); SrcOff += VTSize; } + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i); - Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &LoadChains[0], i); - - for (i=0; i<MAX_LOADS_IN_LDM && EmittedNumMemOps+i < NumMemOps; i++) { - Chain = DAG.getStore(Chain, Loads[i], - DAG.getNode(ISD::ADD, VT, Dest, - DAG.getConstant(DstOff, VT)), + for (i = 0; + i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { + TFOps[i] = DAG.getStore(Chain, Loads[i], + DAG.getNode(ISD::ADD, MVT::i32, Dest, + DAG.getConstant(DstOff, MVT::i32)), NULL, 0); DstOff += VTSize; } + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i); + EmittedNumMemOps += i; } - return Chain; + if (BytesLeft == 0) + return Chain; + + // Issue loads / stores for the trailing (1 - 3) bytes. 
+ unsigned BytesLeftSave = BytesLeft; + i = 0; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + Loads[i] = DAG.getLoad(VT, Chain, + DAG.getNode(ISD::ADD, MVT::i32, Source, + DAG.getConstant(SrcOff, MVT::i32)), + NULL, 0); + TFOps[i] = Loads[i].getValue(1); + ++i; + SrcOff += VTSize; + BytesLeft -= VTSize; + } + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i); + + i = 0; + BytesLeft = BytesLeftSave; + while (BytesLeft) { + if (BytesLeft >= 2) { + VT = MVT::i16; + VTSize = 2; + } else { + VT = MVT::i8; + VTSize = 1; + } + + TFOps[i] = DAG.getStore(Chain, Loads[i], + DAG.getNode(ISD::ADD, MVT::i32, Dest, + DAG.getConstant(DstOff, MVT::i32)), + NULL, 0); + ++i; + DstOff += VTSize; + BytesLeft -= VTSize; + } + return DAG.getNode(ISD::TokenFactor, MVT::Other, &TFOps[0], i); } SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) { @@ -1419,7 +1460,7 @@ SDOperand ARMTargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) { case ISD::RETURNADDR: break; case ISD::FRAMEADDR: break; case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); - case ISD::MEMCPY: return LowerMEMCPY(Op, DAG); + case ISD::MEMCPY: return LowerMEMCPY(Op, DAG, Subtarget); } return SDOperand(); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 46bcb34..47cb2a1 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -130,11 +130,12 @@ namespace llvm { SDOperand LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG); SDOperand LowerToTLSExecModels(GlobalAddressSDNode *GA, - SelectionDAG &DAG); + SelectionDAG &DAG); SDOperand LowerGLOBAL_OFFSET_TABLE(SDOperand Op, SelectionDAG &DAG); SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG); SDOperand LowerBR_JT(SDOperand Op, SelectionDAG &DAG); - SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG); + SDOperand 
LowerMEMCPY(SDOperand Op, SelectionDAG &DAG, + const ARMSubtarget *ST); SDOperand LowerMEMCPYCall(SDOperand Chain, SDOperand Dest, SDOperand Source, SDOperand Count, SelectionDAG &DAG); diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt index 380097d..5bd16a3 100644 --- a/lib/Target/ARM/README-Thumb.txt +++ b/lib/Target/ARM/README-Thumb.txt @@ -221,3 +221,7 @@ LPC0: Make register allocator / spiller smarter so we can re-materialize "mov r, imm", etc. Almost all Thumb instructions clobber condition code. + +//===---------------------------------------------------------------------===// + +Add ldmia, stmia support. diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll new file mode 100644 index 0000000..665d3ac --- /dev/null +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -0,0 +1,15 @@ +; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldmia +; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrb +; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrh + + %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } +@src = external global %struct.x +@dst = external global %struct.x + +define i32 @t() { +entry: + call void @llvm.memcpy.i32( i8* getelementptr (%struct.x* @dst, i32 0, i32 0), i8* getelementptr (%struct.x* @src, i32 0, i32 0), i32 11, i32 8 ) + ret i32 0 +} + +declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) |