-rw-r--r-- | include/llvm/Target/TargetLowering.h             | 15
-rw-r--r-- | lib/CodeGen/SelectionDAG/DAGCombiner.cpp         |  8
-rw-r--r-- | lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp |  2
-rw-r--r-- | lib/Target/AArch64/AArch64ISelLowering.cpp       | 21
-rw-r--r-- | lib/Target/AArch64/AArch64ISelLowering.h         | 10
-rw-r--r-- | lib/Target/PowerPC/PPCISelLowering.cpp           |  9
-rw-r--r-- | lib/Target/PowerPC/PPCISelLowering.h             | 10
-rw-r--r-- | lib/Target/SystemZ/SystemZISelLowering.cpp       | 20
-rw-r--r-- | lib/Target/SystemZ/SystemZISelLowering.h         |  4
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp               | 21
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h                 | 10
-rw-r--r-- | test/CodeGen/AArch64/fp-dp3.ll                   | 34
-rw-r--r-- | test/CodeGen/AArch64/illegal-float-ops.ll        | 26
-rw-r--r-- | test/CodeGen/PowerPC/vec_fmuladd.ll              | 56
-rw-r--r-- | test/CodeGen/X86/extended-fma-contraction.ll     | 22
-rw-r--r-- | test/CodeGen/X86/fma_patterns_wide.ll            | 84
-rw-r--r-- | test/CodeGen/X86/wide-fma-contraction.ll         | 14
17 files changed, 329 insertions, 37 deletions
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 70b285e..d1c98f6 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1213,11 +1213,16 @@ public:
     return false;
   }
 
-  /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-  /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-  /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-  /// is expanded to mul + add.
-  virtual bool isFMAFasterThanMulAndAdd(EVT) const {
+  /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+  /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+  /// expanded to FMAs when this method returns true, otherwise fmuladd is
+  /// expanded to fmul + fadd.
+  ///
+  /// NOTE: This may be called before legalization on types for which FMAs are
+  /// not legal, but should return true if those types will eventually legalize
+  /// to types that support FMAs. After legalization, it will only be called on
+  /// types that support FMAs (via Legal or Custom actions)
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
     return false;
   }
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9eb63e2..9880655 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6084,8 +6084,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
   // FADD -> FMA combines:
   if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
        DAG.getTarget().Options.UnsafeFPMath) &&
-      DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
-      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+      DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fadd (fmul x, y), z) -> (fma x, y, z)
     if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
@@ -6161,8 +6161,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
   // FSUB -> FMA combines:
   if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
        DAG.getTarget().Options.UnsafeFPMath) &&
-      DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
-      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+      DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
     if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index dcde5ac..2a1ded0 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4922,7 +4922,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::fmuladd: {
     EVT VT = TLI->getValueType(I.getType());
     if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
-        TLI->isFMAFasterThanMulAndAdd(VT)) {
+        TLI->isFMAFasterThanFMulAndFAdd(VT)) {
       setValue(&I, DAG.getNode(ISD::FMA, sdl,
                                getValue(I.getArgOperand(0)).getValueType(),
                                getValue(I.getArgOperand(0)),
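The functional change in the two DAGCombiner hunks is the relaxed guard: with (!LegalOperations || ...), the FADD/FSUB -> FMA combines may now fire before operation legalization, producing an ISD::FMA on a type that is not yet legal and letting legalization split it afterwards. That is also why the new NOTE in TargetLowering.h asks targets to answer for the type a VT will eventually legalize to. A minimal sketch of the pattern this unlocks (hypothetical IR and RUN line in the style of the wide-contraction tests below, not taken from the patch):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma -fp-contract=fast
; <16 x float> is not a legal x86 type; the combine can now fire on it anyway,
; and legalization then splits the resulting ISD::FMA into legal-width FMAs.
define <16 x float> @contract_wide(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
  %mul = fmul <16 x float> %a, %b
  %sum = fadd <16 x float> %mul, %c
  ret <16 x float> %sum
}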
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 84051d4..1fa1edb 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2798,6 +2798,27 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
+bool
+AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f16:
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  case MVT::f128:
+    return false;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 AArch64TargetLowering::ConstraintType
 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
   if (Constraint.size() == 1) {
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 901a9be..320346e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,11 +229,11 @@ public:
   virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-  /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-  /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-  /// is expanded to mul + add.
-  virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+  /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+  /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+  /// expanded to FMAs when this method returns true, otherwise fmuladd is
+  /// expanded to fmul + fadd.
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
 
   ConstraintType getConstraintType(const std::string &Constraint) const;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index cf41c02..812f096 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7809,18 +7809,15 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
   return true;
 }
 
-/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-/// is expanded to mul + add.
-bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
   if (!VT.isSimple())
     return false;
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::f32:
   case MVT::f64:
-  case MVT::v4f32:
     return true;
   default:
     break;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 4801a41..776ad2a 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -459,11 +459,11 @@ namespace llvm {
     /// relative to software emulation.
     virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast = 0) const;
 
-    /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-    /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-    /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-    /// is expanded to mul + add.
-    virtual bool isFMAFasterThanMulAndAdd(EVT VT) const;
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
 
   private:
 
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
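Both hooks above (like the SystemZ and X86 ones below) reduce the query with VT.getScalarType() first, so a vector type now qualifies through its element type. That is what lets PowerPC drop the explicit MVT::v4f32 case without losing fused vector lowering. A small sketch of the point (hypothetical function name; the patch's own coverage is test/CodeGen/PowerPC/vec_fmuladd.ll below):

; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s
target triple = "powerpc64-unknown-linux-gnu"

; v4f32 is no longer listed in the PPC switch, but getScalarType() reduces it
; to f32, which is, so this should still select vmaddfp.
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <4 x float> @still_fused(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
  %r = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  ret <4 x float> %r
}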
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index b1abc2c..d344134 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -255,6 +255,26 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   MaxStoresPerMemsetOptSize = 0;
 }
 
+bool
+SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  case MVT::f128:
+    return false;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
   return Imm.isZero() || Imm.isNegZero();
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 4ddfcbb..88e1fa7 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -129,9 +129,7 @@ public:
   virtual EVT getSetCCResultType(LLVMContext &, EVT) const {
     return MVT::i32;
   }
-  virtual bool isFMAFasterThanMulAndAdd(EVT) const LLVM_OVERRIDE {
-    return true;
-  }
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const LLVM_OVERRIDE;
   virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
   virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
   virtual const char *getTargetNodeName(unsigned Opcode) const LLVM_OVERRIDE;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a680ac0..f00df35 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12966,6 +12966,27 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return false;
 }
 
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
+    return false;
+
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   // i16 instructions are longer (0x66 prefix) and potentially slower.
   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 0e5e822..8317824 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -646,11 +646,11 @@ namespace llvm {
     virtual bool isZExtFree(EVT VT1, EVT VT2) const;
     virtual bool isZExtFree(SDValue Val, EVT VT2) const;
 
-    /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-    /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-    /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-    /// is expanded to mul + add.
-    virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
 
     /// isNarrowingProfitable - Return true if it's profitable to narrow
     /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
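X86 is the only target here whose answer is subtarget-dependent: without either FMA3 or FMA4 the hook now returns false, so llvm.fmuladd is expanded to fmul + fadd rather than to an ISD::FMA the backend cannot select. A hypothetical illustration (function name and RUN lines are mine, not from the patch):

; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+fma < %s        (expect vfmadd213ss)
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=-fma,-fma4 < %s  (expect mulss + addss)
declare float @llvm.fmuladd.f32(float, float, float)
define float @maybe_fused(float %a, float %b, float %c) {
  %r = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  ret float %r
}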
diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll
index 39db9be..f372c43 100644
--- a/test/CodeGen/AArch64/fp-dp3.ll
+++ b/test/CodeGen/AArch64/fp-dp3.ll
@@ -1,102 +1,136 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST
 
 declare float @llvm.fma.f32(float, float, float)
 declare double @llvm.fma.f64(double, double, double)
 
 define float @test_fmadd(float %a, float %b, float %c) {
 ; CHECK: test_fmadd:
+; CHECK-NOFAST: test_fmadd:
   %val = call float @llvm.fma.f32(float %a, float %b, float %c)
 ; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %val
 }
 
 define float @test_fmsub(float %a, float %b, float %c) {
 ; CHECK: test_fmsub:
+; CHECK-NOFAST: test_fmsub:
   %nega = fsub float -0.0, %a
   %val = call float @llvm.fma.f32(float %nega, float %b, float %c)
 ; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %val
 }
 
 define float @test_fnmadd(float %a, float %b, float %c) {
 ; CHECK: test_fnmadd:
+; CHECK-NOFAST: test_fnmadd:
   %negc = fsub float -0.0, %c
   %val = call float @llvm.fma.f32(float %a, float %b, float %negc)
 ; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %val
 }
 
 define float @test_fnmsub(float %a, float %b, float %c) {
 ; CHECK: test_fnmsub:
+; CHECK-NOFAST: test_fnmsub:
   %nega = fsub float -0.0, %a
   %negc = fsub float -0.0, %c
   %val = call float @llvm.fma.f32(float %nega, float %b, float %negc)
 ; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %val
 }
 
 define double @testd_fmadd(double %a, double %b, double %c) {
 ; CHECK: testd_fmadd:
+; CHECK-NOFAST: testd_fmadd:
   %val = call double @llvm.fma.f64(double %a, double %b, double %c)
 ; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   ret double %val
 }
 
 define double @testd_fmsub(double %a, double %b, double %c) {
 ; CHECK: testd_fmsub:
+; CHECK-NOFAST: testd_fmsub:
   %nega = fsub double -0.0, %a
   %val = call double @llvm.fma.f64(double %nega, double %b, double %c)
 ; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   ret double %val
 }
 
 define double @testd_fnmadd(double %a, double %b, double %c) {
 ; CHECK: testd_fnmadd:
+; CHECK-NOFAST: testd_fnmadd:
   %negc = fsub double -0.0, %c
   %val = call double @llvm.fma.f64(double %a, double %b, double %negc)
 ; CHECK: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   ret double %val
 }
 
 define double @testd_fnmsub(double %a, double %b, double %c) {
 ; CHECK: testd_fnmsub:
+; CHECK-NOFAST: testd_fnmsub:
   %nega = fsub double -0.0, %a
   %negc = fsub double -0.0, %c
   %val = call double @llvm.fma.f64(double %nega, double %b, double %negc)
 ; CHECK: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFAST: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   ret double %val
 }
 
 define float @test_fmadd_unfused(float %a, float %b, float %c) {
 ; CHECK: test_fmadd_unfused:
+; CHECK-NOFAST: test_fmadd_unfused:
   %prod = fmul float %b, %c
   %sum = fadd float %a, %prod
 ; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %sum
 }
 
 define float @test_fmsub_unfused(float %a, float %b, float %c) {
 ; CHECK: test_fmsub_unfused:
+; CHECK-NOFAST: test_fmsub_unfused:
   %prod = fmul float %b, %c
   %diff = fsub float %a, %prod
 ; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %diff
 }
 
 define float @test_fnmadd_unfused(float %a, float %b, float %c) {
 ; CHECK: test_fnmadd_unfused:
+; CHECK-NOFAST: test_fnmadd_unfused:
   %nega = fsub float -0.0, %a
   %prod = fmul float %b, %c
   %sum = fadd float %nega, %prod
 ; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %sum
 }
 
 define float @test_fnmsub_unfused(float %a, float %b, float %c) {
 ; CHECK: test_fnmsub_unfused:
+; CHECK-NOFAST: test_fnmsub_unfused:
   %nega = fsub float -0.0, %a
   %prod = fmul float %b, %c
   %diff = fsub float %nega, %prod
 ; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-NOT: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fneg {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   ret float %diff
 }
diff --git a/test/CodeGen/AArch64/illegal-float-ops.ll b/test/CodeGen/AArch64/illegal-float-ops.ll
index 446151b..a398f7b 100644
--- a/test/CodeGen/AArch64/illegal-float-ops.ll
+++ b/test/CodeGen/AArch64/illegal-float-ops.ll
@@ -219,3 +219,29 @@ define void @test_frem(float %float, double %double, fp128 %fp128) {
 
   ret void
 }
+
+declare fp128 @llvm.fma.f128(fp128, fp128, fp128)
+
+define void @test_fma(fp128 %fp128) {
+; CHECK: test_fma:
+
+  %fmafp128 = call fp128 @llvm.fma.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+  store fp128 %fmafp128, fp128* @varfp128
+; CHECK: bl fmal
+
+  ret void
+}
+
+declare fp128 @llvm.fmuladd.f128(fp128, fp128, fp128)
+
+define void @test_fmuladd(fp128 %fp128) {
+; CHECK: test_fmuladd:
+
+  %fmuladdfp128 = call fp128 @llvm.fmuladd.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+  store fp128 %fmuladdfp128, fp128* @varfp128
+; CHECK-NOT: bl fmal
+; CHECK: bl __multf3
+; CHECK: bl __addtf3
+
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/vec_fmuladd.ll b/test/CodeGen/PowerPC/vec_fmuladd.ll
new file mode 100644
index 0000000..b1bc377
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_fmuladd.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float> %val, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float> %val, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.fmuladd.v8f32(<8 x float> %val, <8 x float>, <8 x float>)
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double> %val, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double> %val, <4 x double>, <4 x double>)
+
+define <2 x float> @v2f32_fmuladd(<2 x float> %x) nounwind readnone {
+entry:
+  %fmuladd = call <2 x float> @llvm.fmuladd.v2f32 (<2 x float> %x, <2 x float> %x, <2 x float> %x)
+  ret <2 x float> %fmuladd
+}
+; fmuladd (<2 x float>) is promoted to fmuladd (<4 x float>)
+; CHECK: v2f32_fmuladd:
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x float> @v4f32_fmuladd(<4 x float> %x) nounwind readnone {
+entry:
+  %fmuladd = call <4 x float> @llvm.fmuladd.v4f32 (<4 x float> %x, <4 x float> %x, <4 x float> %x)
+  ret <4 x float> %fmuladd
+}
+; CHECK: v4f32_fmuladd:
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <8 x float> @v8f32_fmuladd(<8 x float> %x) nounwind readnone {
+entry:
+  %fmuladd = call <8 x float> @llvm.fmuladd.v8f32 (<8 x float> %x, <8 x float> %x, <8 x float> %x)
+  ret <8 x float> %fmuladd
+}
+; CHECK: v8f32_fmuladd:
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vmaddfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <2 x double> @v2f64_fmuladd(<2 x double> %x) nounwind readnone {
+entry:
+  %fmuladd = call <2 x double> @llvm.fmuladd.v2f64 (<2 x double> %x, <2 x double> %x, <2 x double> %x)
+  ret <2 x double> %fmuladd
+}
+; CHECK: v2f64_fmuladd:
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x double> @v4f64_fmuladd(<4 x double> %x) nounwind readnone {
+entry:
+  %fmuladd = call <4 x double> @llvm.fmuladd.v4f64 (<4 x double> %x, <4 x double> %x, <4 x double> %x)
+  ret <4 x double> %fmuladd
+}
+; CHECK: v4f64_fmuladd:
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fmadd {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/X86/extended-fma-contraction.ll b/test/CodeGen/X86/extended-fma-contraction.ll
new file mode 100644
index 0000000..ef2c22b
--- /dev/null
+++ b/test/CodeGen/X86/extended-fma-contraction.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=x86 -mattr=+fma4 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+; RUN: llc -march=x86 -mtriple=x86_64-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
+
+; CHECK: fmafunc
+define <3 x float> @fmafunc(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+
+; CHECK-NOT: vmulps
+; CHECK-NOT: vaddps
+; CHECK: vfmaddps
+; CHECK-NOT: vmulps
+; CHECK-NOT: vaddps
+
+; CHECK-NOFMA-NOT: calll
+; CHECK-NOFMA: vmulps
+; CHECK-NOFMA: vaddps
+; CHECK-NOFMA-NOT: calll
+
+  %ret = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
+  ret <3 x float> %ret
+}
+
+declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll
new file mode 100644
index 0000000..d84e5a0
--- /dev/null
+++ b/test/CodeGen/X86/fma_patterns_wide.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
+
+; CHECK: test_x86_fmadd_ps_y_wide
+; CHECK: vfmadd213ps
+; CHECK: vfmadd213ps
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y_wide
+; CHECK_FMA4: vfmaddps
+; CHECK_FMA4: vfmaddps
+; CHECK_FMA4: ret
+define <16 x float> @test_x86_fmadd_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %res = fadd <16 x float> %x, %a2
+  ret <16 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_y_wide
+; CHECK: vfmsub213ps
+; CHECK: vfmsub213ps
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y_wide
+; CHECK_FMA4: vfmsubps
+; CHECK_FMA4: vfmsubps
+; CHECK_FMA4: ret
+define <16 x float> @test_x86_fmsub_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %res = fsub <16 x float> %x, %a2
+  ret <16 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps_y_wide
+; CHECK: vfnmadd213ps
+; CHECK: vfnmadd213ps
+; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y_wide
+; CHECK_FMA4: vfnmaddps
+; CHECK_FMA4: vfnmaddps
+; CHECK_FMA4: ret
+define <16 x float> @test_x86_fnmadd_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %res = fsub <16 x float> %a2, %x
+  ret <16 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps_y_wide
+; CHECK: vfnmsub213ps
+; CHECK: vfnmsub213ps
+; CHECK: ret
+define <16 x float> @test_x86_fnmsub_ps_y_wide(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+  %res = fsub <16 x float> %y, %a2
+  ret <16 x float> %res
+}
+
+; CHECK: test_x86_fmadd_pd_y_wide
+; CHECK: vfmadd213pd
+; CHECK: vfmadd213pd
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y_wide
+; CHECK_FMA4: vfmaddpd
+; CHECK_FMA4: vfmaddpd
+; CHECK_FMA4: ret
+define <8 x double> @test_x86_fmadd_pd_y_wide(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  %x = fmul <8 x double> %a0, %a1
+  %res = fadd <8 x double> %x, %a2
+  ret <8 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd_y_wide
+; CHECK: vfmsub213pd
+; CHECK: vfmsub213pd
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y_wide
+; CHECK_FMA4: vfmsubpd
+; CHECK_FMA4: vfmsubpd
+; CHECK_FMA4: ret
+define <8 x double> @test_x86_fmsub_pd_y_wide(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  %x = fmul <8 x double> %a0, %a1
+  %res = fsub <8 x double> %x, %a2
+  ret <8 x double> %res
+}
diff --git a/test/CodeGen/X86/wide-fma-contraction.ll b/test/CodeGen/X86/wide-fma-contraction.ll
index d93f33b..7ee0fba 100644
--- a/test/CodeGen/X86/wide-fma-contraction.ll
+++ b/test/CodeGen/X86/wide-fma-contraction.ll
@@ -1,7 +1,9 @@
 ; RUN: llc -march=x86 -mattr=+fma4 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+; RUN: llc -march=x86 -mtriple=x86_64-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
 
 ; CHECK: fmafunc
 define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
+
 ; CHECK-NOT: vmulps
 ; CHECK-NOT: vaddps
 ; CHECK: vfmaddps
@@ -10,11 +12,17 @@ define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c)
 ; CHECK: vfmaddps
 ; CHECK-NOT: vmulps
 ; CHECK-NOT: vaddps
+
+; CHECK-NOFMA-NOT: calll
+; CHECK-NOFMA: vmulps
+; CHECK-NOFMA: vaddps
+; CHECK-NOFMA-NOT: calll
+; CHECK-NOFMA: vmulps
+; CHECK-NOFMA: vaddps
+; CHECK-NOFMA-NOT: calll
+
   %ret = tail call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
   ret <16 x float> %ret
 }
 
 declare <16 x float> @llvm.fmuladd.v16f32(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
-
-
-
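Condensed into one hypothetical file (illustrative, not part of the commit), the contract the tests above pin down is: llvm.fmuladd may fuse at the default -fp-contract setting, since SelectionDAGBuilder only requires that fusion not be Strict, while an explicit fmul + fadd pair contracts only under -fp-contract=fast or unsafe-fp-math:

; RUN: llc -mtriple=aarch64-none-linux-gnu < %s                   | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -fp-contract=fast < %s | FileCheck %s --check-prefix=CHECK-FAST
declare float @llvm.fmuladd.f32(float, float, float)

define float @intrinsic(float %a, float %b, float %c) {
; Expected to fuse even at the default setting.
; CHECK: fmadd
  %r = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  ret float %r
}

define float @explicit(float %a, float %b, float %c) {
; Expected to contract only when fusion is explicitly allowed.
; CHECK-FAST: fmadd
  %m = fmul float %a, %b
  %s = fadd float %m, %c
  ret float %s
}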