diff options
| author | Lang Hames <lhames@gmail.com> | 2012-04-24 18:58:36 +0000 | 
|---|---|---|
| committer | Lang Hames <lhames@gmail.com> | 2012-04-24 18:58:36 +0000 | 
| commit | 1d9e68dab1fc29e3a5e05a3b0b8d7c70de5e10b2 (patch) | |
| tree | 56eb583cdfc55544e7c01a939eee01be24873b75 | |
| parent | 7362ac7f8cc803708187fd6029de9f8f62ff2ed2 (diff) | |
| download | external_llvm-1d9e68dab1fc29e3a5e05a3b0b8d7c70de5e10b2.zip external_llvm-1d9e68dab1fc29e3a5e05a3b0b8d7c70de5e10b2.tar.gz external_llvm-1d9e68dab1fc29e3a5e05a3b0b8d7c70de5e10b2.tar.bz2 | |
Add support for llvm.arm.neon.vmull* intrinsics to InstCombine. This fixes
<rdar://problem/11291436>.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@155468 91177308-0d34-0410-b5e6-96231b3b80d8
| -rw-r--r-- | lib/Transforms/InstCombine/InstCombineCalls.cpp | 35 |
| -rw-r--r-- | test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll | 68 | 
2 files changed, 103 insertions, 0 deletions
| diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 77e4727..5ad9382 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -14,6 +14,7 @@  #include "InstCombine.h"  #include "llvm/Support/CallSite.h"  #include "llvm/Target/TargetData.h" +#include "llvm/Analysis/InstructionSimplify.h"  #include "llvm/Analysis/MemoryBuiltins.h"  #include "llvm/Transforms/Utils/BuildLibCalls.h"  #include "llvm/Transforms/Utils/Local.h" @@ -694,6 +695,40 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {      break;    } +  case Intrinsic::arm_neon_vmulls: +  case Intrinsic::arm_neon_vmullu: { +    // Zext/sext intrinsic operands according to the intrinsic type, then try to +    // simplify them. This lets us try a SimplifyMulInst on the extended +    // operands. If the zext/sext instructions are unused when we're done then +    // delete them from the block.  +    Value* Arg0 = II->getArgOperand(0); +    Value* Arg1 = II->getArgOperand(1); +    bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu); +    Instruction *Arg0W = +      Zext ? CastInst::CreateZExtOrBitCast(Arg0, II->getType(), "", II) : +             CastInst::CreateSExtOrBitCast(Arg0, II->getType(), "", II); +    Value* Arg0WS = SimplifyInstruction(Arg0W); +    if (Arg0WS == 0) // If simplification fails just pass through the ext'd val. +      Arg0WS = Arg0W; +    Instruction *Arg1W = +      Zext ? 
CastInst::CreateZExtOrBitCast(Arg1, II->getType(), "", II) : +             CastInst::CreateSExtOrBitCast(Arg1, II->getType(), "", II); +    Value* Arg1WS = SimplifyInstruction(Arg1W); +    if (Arg1WS == 0) +      Arg1WS = Arg1W; +    Instruction *SimplifiedInst = 0; +    if (Value* V = SimplifyMulInst(Arg0WS, Arg1WS, TD)) { +      SimplifiedInst = ReplaceInstUsesWith(CI, V); +    } +    if (Arg0W->use_empty()) +      Arg0W->eraseFromParent(); +    if (Arg1W->use_empty()) +      Arg1W->eraseFromParent(); +    if (SimplifiedInst != 0) +      return SimplifiedInst; +    break; +  } +    case Intrinsic::stackrestore: {      // If the save is right next to the restore, remove the restore.  This can      // happen when variable allocas are DCE'd. diff --git a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll new file mode 100644 index 0000000..9bb988c --- /dev/null +++ b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll @@ -0,0 +1,68 @@ +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios0" + +; RUN: opt -S -instcombine < %s | FileCheck %s + +define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp { +entry: +  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind +  ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> zeroinitializer +} + +define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp { +entry: +  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind +  ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: %0 = sext <4 x i16> %x to <4 x i32> +; CHECK-NEXT: ret <4 x i32> %0 +} + +define <4 x i32> @constantMul() nounwind readnone ssp { +entry: +  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, 
<4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind +  ret <4 x i32> %a +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6> +} + +define <4 x i32> @constantMulS() nounwind readnone ssp { +entry: +  %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind +  ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> +} + +define <4 x i32> @constantMulU() nounwind readnone ssp { +entry: +  %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind +  ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535> +} + +define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp { +entry: +  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind +  %b = add <4 x i32> zeroinitializer, %a +  ret <4 x i32> %b +; CHECK: entry: +; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind +; CHECK-NEXT: ret <4 x i32> %a +} + +define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp { +entry: +  %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind +  %b = add <4 x i32> %x, %a +  ret <4 x i32> %b   +; CHECK: entry: +; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6> +; CHECK-NEXT: ret <4 x i32> %b +} + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone | 
