10 files changed, 160 insertions, 10 deletions
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index b05fe62..85c41fc 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -76,8 +76,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
 def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
                                         "true",
                                         "Use NEON for single precision FP">;
-// Allow more precision in FP computation
-def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
 
 // Disable 32-bit to 16-bit narrowing for experimentation.
 def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 6b8f4cc..37284f9 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -181,11 +181,11 @@ def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
                                  AssemblerPredicate<"FeatureVFP3">;
 def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
                                  AssemblerPredicate<"FeatureVFP4">;
-def NoVFP4            : Predicate<"!Subtarget->hasVFP4()">;
+def NoVFP4           : Predicate<"!Subtarget->hasVFP4()">;
 def HasNEON          : Predicate<"Subtarget->hasNEON()">,
                                  AssemblerPredicate<"FeatureNEON">;
 def HasNEON2         : Predicate<"Subtarget->hasNEON2()">,
-                                 AssemblerPredicate<"FeatureNEON2">;
+                                 AssemblerPredicate<"FeatureNEON,FeatureVFP4">;
 def NoNEON2          : Predicate<"!Subtarget->hasNEON2()">;
 def HasFP16          : Predicate<"Subtarget->hasFP16()">,
                                  AssemblerPredicate<"FeatureFP16">;
@@ -221,6 +221,9 @@ def UseMovt          : Predicate<"Subtarget->useMovt()">;
 def DontUseMovt      : Predicate<"!Subtarget->useMovt()">;
 def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx()">;
 
+// Allow more precision in FP computation
+def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
+
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 99dbb95..501cc8f 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -4115,7 +4115,6 @@ defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
                             "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
 defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
 
-
 // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
 def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
                           v2f32, fmul_su, fadd_mlx>,
@@ -4136,10 +4135,10 @@ def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
 // Match @llvm.fma.* intrinsics
 def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)),
           (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
-          Requires<[HasNEON, HasVFP4]>;
+          Requires<[HasNEON2]>;
 def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)),
           (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
-          Requires<[HasNEON, HasVFP4]>;
+          Requires<[HasNEON2]>;
 
 // Vector Subtract Operations.
 
@@ -5497,9 +5496,9 @@ def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
 def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
       Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>;
 def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
-      Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
+      Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
 def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
-      Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
+      Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
 def : N2VSPat<fabs, VABSfd>;
 def : N2VSPat<fneg, VNEGfd>;
 def : N3VSPat<NEONfmax, VMAXfd>;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td
index 8d86c01..8b1fb93 100644
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries<
                                InstrStage<19, [A8_NPipe], 0>,
                                InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
   //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
   // Single-precision FP DIV
   InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<20, [A8_NPipe], 0>,
@@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries<
   InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
   //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
   // Double-register Reciprical Step
   InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 49fedf6..0d710cc 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries<
                                InstrStage<2,  [A9_NPipe]>],
                               [9, 1, 1, 1]>,
   //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
   // Single-precision FP DIV
   InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1,  [A9_MUX0], 0>,
@@ -1697,6 +1713,26 @@ def CortexA9Itineraries : ProcessorItineraries<
                                InstrStage<4, [A9_NPipe]>],
                               [8, 4, 2, 1]>,
   //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
   // Double-register Reciprical Step
   InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                InstrStage<1, [A9_MUX0], 0>,
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index 4d959f5..0ace9bc 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries<
   // Double-precision FP MAC
   InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
   //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
   // Single-precision FP DIV
   InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
   //
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 3d9c03d..5cf54b94 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -45,7 +45,7 @@ protected:
   bool HasV6T2Ops;
   bool HasV7Ops;
 
-  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what
   /// floating point ISAs are supported.
   bool HasVFPv2;
   bool HasVFPv3;
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 34dadf8..8fa7378 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -4659,6 +4659,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
         Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
         Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
         Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+        Mnemonic == "vfms" || Mnemonic == "vfnms" ||
         (Mnemonic == "movs" && isThumb()))) {
     Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
     CarrySetting = true;
@@ -4702,6 +4703,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
       Mnemonic == "orr" || Mnemonic == "mvn" ||
       Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
       Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
+      Mnemonic == "vfm" || Mnemonic == "vfnm" ||
       (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
                       Mnemonic == "mla" || Mnemonic == "smlal" ||
                       Mnemonic == "umlal" || Mnemonic == "umull"))) {
diff --git a/test/MC/ARM/vfp4.s b/test/MC/ARM/vfp4.s
new file mode 100644
index 0000000..009d31d
--- /dev/null
+++ b/test/MC/ARM/vfp4.s
@@ -0,0 +1,50 @@
+@ RUN: llvm-mc < %s -triple armv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4   | FileCheck %s --check-prefix=ARM
+@ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=THUMB
+
+        @ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee]
+@ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b]
+vfma.f64 d16, d18, d17
+
+@ ARM: vfma.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0xa2,0xee]
+@ THUMB: vfma.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x00,0x1a]
+vfma.f32 s2, s4, s0
+
+@ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2]
+@ THUMB: vfma.f32 d16, d18, d17 @ encoding: [0x42,0xef,0xb1,0x0c]
+vfma.f32 d16, d18, d17
+
+@ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2]
+@ THUMB: vfma.f32	q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c]
+vfma.f32 q2, q4, q0
+
+@ ARM: vfnma.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xd2,0xee]
+@ THUMB: vfnma.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xe1,0x0b]
+vfnma.f64 d16, d18, d17
+
+@ ARM: vfnma.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0x92,0xee]
+@ THUMB: vfnma.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x40,0x1a]
+vfnma.f32 s2, s4, s0
+
+@ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee]
+@ THUMB: vfms.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xe1,0x0b]
+vfms.f64 d16, d18, d17
+
+@ ARM: vfms.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0xa2,0xee]
+@ THUMB: vfms.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x40,0x1a]
+vfms.f32 s2, s4, s0
+
+@ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2]
+@ THUMB: vfms.f32 d16, d18, d17 @ encoding: [0x62,0xef,0xb1,0x0c]
+vfms.f32 d16, d18, d17
+
+@ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2]
+@ THUMB: vfms.f32	q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c]
+vfms.f32 q2, q4, q0
+
+@ ARM: vfnms.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xd2,0xee]
+@ THUMB: vfnms.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xa1,0x0b]
+vfnms.f64 d16, d18, d17
+
+@ ARM: vfnms.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0x92,0xee]
+@ THUMB: vfnms.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x00,0x1a]
+vfnms.f32 s2, s4, s0
diff --git a/test/MC/Disassembler/ARM/vfp4.txt b/test/MC/Disassembler/ARM/vfp4.txt
new file mode 100644
index 0000000..4f2c732
--- /dev/null
+++ b/test/MC/Disassembler/ARM/vfp4.txt
@@ -0,0 +1,37 @@
+# RUN: llvm-mc < %s -triple thumbv7-unknown-unknown --disassemble -mattr=+neon,+vfp4 | FileCheck %s
+
+# CHECK: vfma.f64 d16, d18, d17
+0xe2 0xee 0xa1 0x0b
+
+# CHECK: vfma.f32 s2, s4, s0
+0xa2 0xee 0x00 0x1a
+
+# CHECK: vfma.f32 d16, d18, d17
+0x42 0xef 0xb1 0x0c
+
+# CHECK: vfma.f32 q2, q4, q0
+0x08 0xef 0x50 0x4c
+
+# CHECK: vfnms.f64 d16, d18, d17
+0xd2 0xee 0xa1 0x0b
+
+# CHECK: vfnms.f32 s2, s4, s0
+0x92 0xee 0x00 0x1a
+
+# CHECK: vfms.f64 d16, d18, d17
+0xe2 0xee 0xe1 0x0b
+
+# CHECK: vfms.f32 s2, s4, s0
+0xa2 0xee 0x40 0x1a
+
+# CHECK: vfms.f32 d16, d18, d17
+0x62 0xef 0xb1 0x0c
+
+# CHECK: vfms.f32 q2, q4, q0
+0x28 0xef 0x50 0x4c
+
+# CHECK: vfnma.f64 d16, d18, d17
+0xd2 0xee 0xe1 0x0b
+
+# CHECK: vfnma.f32 s2, s4, s0
+0x92 0xee 0x40 0x1a