diff options
author | Evan Cheng <evan.cheng@apple.com> | 2012-04-11 00:13:00 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2012-04-11 00:13:00 +0000 |
commit | 82509e5c62a99912c636b22e227b810eaf6eda78 (patch) | |
tree | 4c7f36ce001590abdda1adea175a40a34ae0eb7c /lib/Target/ARM | |
parent | 71fbed45d9f4e2e886afc7f22c058087e7872dc6 (diff) | |
download | external_llvm-82509e5c62a99912c636b22e227b810eaf6eda78.zip external_llvm-82509e5c62a99912c636b22e227b810eaf6eda78.tar.gz external_llvm-82509e5c62a99912c636b22e227b810eaf6eda78.tar.bz2 |
Fix a number of problems with ARM fused multiply add/subtract instructions.
1. The new instruction itinerary entries are not properly described.
2. The asm parser can't handle vfms and vfnms.
3. There were no assembler, disassembler test cases.
4. HasNEON2 has the wrong assembler predicate.
rdar://10139676
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154456 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/ARM')
-rw-r--r-- | lib/Target/ARM/ARM.td | 2 | ||||
-rw-r--r-- | lib/Target/ARM/ARMInstrInfo.td | 7 | ||||
-rw-r--r-- | lib/Target/ARM/ARMInstrNEON.td | 9 | ||||
-rw-r--r-- | lib/Target/ARM/ARMScheduleA8.td | 19 | ||||
-rw-r--r-- | lib/Target/ARM/ARMScheduleA9.td | 36 | ||||
-rw-r--r-- | lib/Target/ARM/ARMScheduleV6.td | 6 | ||||
-rw-r--r-- | lib/Target/ARM/ARMSubtarget.h | 2 | ||||
-rw-r--r-- | lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2 |
8 files changed, 73 insertions, 10 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index b05fe62..85c41fc 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -76,8 +76,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", "true", "Use NEON for single precision FP">; -// Allow more precision in FP computation -def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 6b8f4cc..37284f9 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -181,11 +181,11 @@ def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, AssemblerPredicate<"FeatureVFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, AssemblerPredicate<"FeatureVFP4">; -def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">; +def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON">; def HasNEON2 : Predicate<"Subtarget->hasNEON2()">, - AssemblerPredicate<"FeatureNEON2">; + AssemblerPredicate<"FeatureNEON,FeatureVFP4">; def NoNEON2 : Predicate<"!Subtarget->hasNEON2()">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16">; @@ -221,6 +221,9 @@ def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; +// Allow more precision in FP computation +def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 99dbb95..501cc8f 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -4115,7 +4115,6 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; - // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations. def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", v2f32, fmul_su, fadd_mlx>, @@ -4136,10 +4135,10 @@ def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", // Match @llvm.fma.* intrinsics def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)), (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, - Requires<[HasNEON, HasVFP4]>; + Requires<[HasNEON2]>; def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)), (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, - Requires<[HasNEON, HasVFP4]>; + Requires<[HasNEON2]>; // Vector Subtract Operations. @@ -5497,9 +5496,9 @@ def : N3VSMulOpPat<fmul, fadd, VMLAfd>, def : N3VSMulOpPat<fmul, fsub, VMLSfd>, Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>; def : N3VSMulOpPat<fmul, fadd, VFMAfd>, - Requires<[HasNEON2, UseNEONForFP,FPContractions]>; + Requires<[HasNEON2, UseNEONForFP, FPContractions]>; def : N3VSMulOpPat<fmul, fsub, VFMSfd>, - Requires<[HasNEON2, UseNEONForFP,FPContractions]>; + Requires<[HasNEON2, UseNEONForFP, FPContractions]>; def : N2VSPat<fabs, VABSfd>; def : N2VSPat<fneg, VNEGfd>; def : N3VSPat<NEONfmax, VMAXfd>; diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index 8d86c01..8b1fb93 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<19, [A8_NPipe], 0>, InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, + // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<20, [A8_NPipe], 0>, @@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, // + // Double-register Fused FP Multiple-Accumulate + InstrItinData<IIC_VFMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>, + // + // Quad-register Fused FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData<IIC_VFMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, + // // Double-register Reciprical Step InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [9, 2, 2]>, diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 49fedf6..0d710cc 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_NPipe]>], [9, 1, 1, 1]>, // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, + // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, @@ -1697,6 +1713,26 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, // + // Double-register Fused FP Multiple-Accumulate + InstrItinData<IIC_VFMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Quad-register Fused FP Multiple-Accumulate + // Result written in N9, but that is relative to the last cycle of multicycle, + // so we use 10 for those cases + InstrItinData<IIC_VFMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, [A9_NPipe]>], + [8, 4, 2, 1]>, + // // Double-register Reciprical Step InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index 4d959f5..0ace9bc 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries< // Double-precision FP MAC InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, + // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, // diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 3d9c03d..5cf54b94 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -45,7 +45,7 @@ protected: bool HasV6T2Ops; bool HasV7Ops; - /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what + /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what /// floating point ISAs are supported. bool HasVFPv2; bool HasVFPv3; diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 34dadf8..8fa7378 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -4659,6 +4659,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" || Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" || Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" || + Mnemonic == "vfms" || Mnemonic == "vfnms" || (Mnemonic == "movs" && isThumb()))) { Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); CarrySetting = true; @@ -4702,6 +4703,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, Mnemonic == "orr" || Mnemonic == "mvn" || Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" || Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" || + Mnemonic == "vfm" || Mnemonic == "vfnm" || (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" || Mnemonic == "mla" || Mnemonic == "smlal" || Mnemonic == "umlal" || Mnemonic == "umull"))) { |