aboutsummaryrefslogtreecommitdiffstats
path: root/lib/Target/ARM
diff options
context:
space:
mode:
author Evan Cheng <evan.cheng@apple.com> 2012-04-11 00:13:00 +0000
committer Evan Cheng <evan.cheng@apple.com> 2012-04-11 00:13:00 +0000
commit 82509e5c62a99912c636b22e227b810eaf6eda78 (patch)
tree 4c7f36ce001590abdda1adea175a40a34ae0eb7c /lib/Target/ARM
parent 71fbed45d9f4e2e886afc7f22c058087e7872dc6 (diff)
download external_llvm-82509e5c62a99912c636b22e227b810eaf6eda78.zip
external_llvm-82509e5c62a99912c636b22e227b810eaf6eda78.tar.gz
external_llvm-82509e5c62a99912c636b22e227b810eaf6eda78.tar.bz2
Fix a number of problems with ARM fused multiply add/subtract instructions.
1. The new instruction itinerary entries are not properly described.
2. The asm parser can't handle vfms and vfnms.
3. There were no assembler or disassembler test cases.
4. HasNEON2 has the wrong assembler predicate.

rdar://10139676

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154456 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/ARM')
-rw-r--r-- lib/Target/ARM/ARM.td 2
-rw-r--r-- lib/Target/ARM/ARMInstrInfo.td 7
-rw-r--r-- lib/Target/ARM/ARMInstrNEON.td 9
-rw-r--r-- lib/Target/ARM/ARMScheduleA8.td 19
-rw-r--r-- lib/Target/ARM/ARMScheduleA9.td 36
-rw-r--r-- lib/Target/ARM/ARMScheduleV6.td 6
-rw-r--r-- lib/Target/ARM/ARMSubtarget.h 2
-rw-r--r-- lib/Target/ARM/AsmParser/ARMAsmParser.cpp 2
8 files changed, 73 insertions, 10 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index b05fe62..85c41fc 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -76,8 +76,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
"true",
"Use NEON for single precision FP">;
-// Allow more precision in FP computation
-def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
// Disable 32-bit to 16-bit narrowing for experimentation.
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 6b8f4cc..37284f9 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -181,11 +181,11 @@ def HasVFP3 : Predicate<"Subtarget->hasVFP3()">,
AssemblerPredicate<"FeatureVFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
AssemblerPredicate<"FeatureVFP4">;
-def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">;
+def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON">;
def HasNEON2 : Predicate<"Subtarget->hasNEON2()">,
- AssemblerPredicate<"FeatureNEON2">;
+ AssemblerPredicate<"FeatureNEON,FeatureVFP4">;
def NoNEON2 : Predicate<"!Subtarget->hasNEON2()">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16">;
@@ -221,6 +221,9 @@ def UseMovt : Predicate<"Subtarget->useMovt()">;
def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
+// Allow more precision in FP computation
+def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
+
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 99dbb95..501cc8f 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -4115,7 +4115,6 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
"vqdmlsl", "s", int_arm_neon_vqdmlsl>;
defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
-
// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
v2f32, fmul_su, fadd_mlx>,
@@ -4136,10 +4135,10 @@ def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
// Match @llvm.fma.* intrinsics
def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)),
(VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
- Requires<[HasNEON, HasVFP4]>;
+ Requires<[HasNEON2]>;
def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)),
(VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
- Requires<[HasNEON, HasVFP4]>;
+ Requires<[HasNEON2]>;
// Vector Subtract Operations.
@@ -5497,9 +5496,9 @@ def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>;
def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
- Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
+ Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
- Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
+ Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
def : N2VSPat<fabs, VABSfd>;
def : N2VSPat<fneg, VNEGfd>;
def : N3VSPat<NEONfmax, VMAXfd>;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 8d86c01..8b1fb93 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrStage<19, [A8_NPipe], 0>,
InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
//
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<19, [A8_NPipe], 0>,
+ InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+ //
// Single-precision FP DIV
InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<20, [A8_NPipe], 0>,
@@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries<
InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
//
+ // Double-register Fused FP Multiple-Accumulate
+ InstrItinData<IIC_VFMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+ //
+ // Quad-register Fused FP Multiple-Accumulate
+ // Result written in N9, but that is relative to the last cycle of multicycle,
+ // so we use 10 for those cases
+ InstrItinData<IIC_VFMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+ InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+ //
// Double-register Reciprical Step
InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 49fedf6..0d710cc 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<2, [A9_NPipe]>],
[9, 1, 1, 1]>,
//
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<9, [A9_DRegsN], 0, Reserved>,
+ InstrStage<1, [A9_NPipe]>],
+ [8, 1, 1, 1]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsVFP], 0, Required>,
+ InstrStage<10, [A9_DRegsN], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [9, 1, 1, 1]>,
+ //
// Single-precision FP DIV
InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
@@ -1697,6 +1713,26 @@ def CortexA9Itineraries : ProcessorItineraries<
InstrStage<4, [A9_NPipe]>],
[8, 4, 2, 1]>,
//
+ // Double-register Fused FP Multiple-Accumulate
+ InstrItinData<IIC_VFMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 7 cycles
+ InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<2, [A9_NPipe]>],
+ [6, 3, 2, 1]>,
+ //
+ // Quad-register Fused FP Multiple-Accumulate
+ // Result written in N9, but that is relative to the last cycle of multicycle,
+ // so we use 10 for those cases
+ InstrItinData<IIC_VFMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+ InstrStage<1, [A9_MUX0], 0>,
+ InstrStage<1, [A9_DRegsN], 0, Required>,
+ // Extra latency cycles since wbck is 9 cycles
+ InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+ InstrStage<4, [A9_NPipe]>],
+ [8, 4, 2, 1]>,
+ //
// Double-register Reciprical Step
InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
InstrStage<1, [A9_MUX0], 0>,
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index 4d959f5..0ace9bc 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries<
// Double-precision FP MAC
InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
//
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+ //
// Single-precision FP DIV
InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
//
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 3d9c03d..5cf54b94 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -45,7 +45,7 @@ protected:
bool HasV6T2Ops;
bool HasV7Ops;
- /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what
+ /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what
/// floating point ISAs are supported.
bool HasVFPv2;
bool HasVFPv3;
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 34dadf8..8fa7378 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -4659,6 +4659,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+ Mnemonic == "vfms" || Mnemonic == "vfnms" ||
(Mnemonic == "movs" && isThumb()))) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
CarrySetting = true;
@@ -4702,6 +4703,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
Mnemonic == "orr" || Mnemonic == "mvn" ||
Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
+ Mnemonic == "vfm" || Mnemonic == "vfnm" ||
(!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
Mnemonic == "mla" || Mnemonic == "smlal" ||
Mnemonic == "umlal" || Mnemonic == "umull"))) {