diff options
author | Lang Hames <lhames@gmail.com> | 2012-05-01 00:20:38 +0000 |
---|---|---|
committer | Lang Hames <lhames@gmail.com> | 2012-05-01 00:20:38 +0000 |
commit | 973f72a29aeafb1fdc4f8dafc3f6c6651cbb0c99 (patch) | |
tree | 6df5027f9488aa2a915f2760e8a412f298d55f4e | |
parent | 39379c5df3a0bdcd768fc6421381ca805cd4ee21 (diff) | |
download | external_llvm-973f72a29aeafb1fdc4f8dafc3f6c6651cbb0c99.zip external_llvm-973f72a29aeafb1fdc4f8dafc3f6c6651cbb0c99.tar.gz external_llvm-973f72a29aeafb1fdc4f8dafc3f6c6651cbb0c99.tar.bz2 |
Add support for llvm.arm.neon.vmull* intrinsics to InstCombine. Fixes
<rdar://problem/11291436>.
This is a second attempt at a fix for this, the first was r155468. Thanks
to Chandler, Bob and others for the feedback that helped me improve this.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@155866 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Transforms/InstCombine/InstCombineCalls.cpp | 51 | ||||
-rw-r--r-- | test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll | 68 |
2 files changed, 119 insertions, 0 deletions
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 77e4727..a3dc77d 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -694,6 +694,57 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + + // Handle mul by zero first: + if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { + return ReplaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); + } + + // Check for constant LHS & RHS - in this case we just simplify. + bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu); + VectorType *NewVT = cast<VectorType>(II->getType()); + unsigned NewWidth = NewVT->getElementType()->getIntegerBitWidth(); + if (ConstantDataVector *CV0 = dyn_cast<ConstantDataVector>(Arg0)) { + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + VectorType* VT = cast<VectorType>(CV0->getType()); + SmallVector<Constant*, 4> NewElems; + for (unsigned i = 0; i < VT->getNumElements(); ++i) { + APInt CV0E = + (cast<ConstantInt>(CV0->getAggregateElement(i)))->getValue(); + CV0E = Zext ? CV0E.zext(NewWidth) : CV0E.sext(NewWidth); + APInt CV1E = + (cast<ConstantInt>(CV1->getAggregateElement(i)))->getValue(); + CV1E = Zext ? CV1E.zext(NewWidth) : CV1E.sext(NewWidth); + NewElems.push_back( + ConstantInt::get(NewVT->getElementType(), CV0E * CV1E)); + } + return ReplaceInstUsesWith(CI, ConstantVector::get(NewElems)); + } + + // Couldn't simplify - cannonicalize constant to the RHS. + std::swap(Arg0, Arg1); + } + + // Handle mul by one: + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + if (ConstantInt *Splat = + dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) { + if (Splat->isOne()) { + if (Zext) + return CastInst::CreateZExtOrBitCast(Arg0, II->getType()); + // else + return CastInst::CreateSExtOrBitCast(Arg0, II->getType()); + } + } + } + + break; + } + case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. diff --git a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll new file mode 100644 index 0000000..0907c49 --- /dev/null +++ b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll @@ -0,0 +1,68 @@ +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios0" + +; RUN: opt -S -instcombine < %s | FileCheck %s + +define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind + ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> zeroinitializer +} + +define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind + ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32> +; CHECK-NEXT: ret <4 x i32> %a +} + +define <4 x i32> @constantMul() nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind + ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6> +} + +define <4 x i32> @constantMulS() nounwind readnone ssp { +entry: + %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> +} + +define <4 x i32> @constantMulU() nounwind readnone ssp { +entry: + %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535> +} + +define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind + %b = add <4 x i32> zeroinitializer, %a + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind +; CHECK-NEXT: ret <4 x i32> %a +} + +define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp { +entry: + %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind + %b = add <4 x i32> %x, %a + ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6> +; CHECK-NEXT: ret <4 x i32> %b +} + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone |