1 files changed, 33 insertions, 0 deletions
diff --git a/test/CodeGen/NVPTX/fp-contract.ll b/test/CodeGen/NVPTX/fp-contract.ll
new file mode 100644
index 0000000..3f68b18
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp-contract.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+
+target triple = "nvptx64-unknown-cuda"
+
+;; Make sure we are generating proper instruction sequences for fused ops
+;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
+;; add.f32 otherwise.  Without an explicit rounding mode on add.f32, ptxas
+;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
+;; we do not form fma.rn at the PTX level and explicitly generate add.rn
+;; for all adds to prevent ptxas from fusion the ops.
+
+;; FAST-LABEL: @t0
+;; DEFAULT-LABEL: @t0
+define float @t0(float %a, float %b, float %c) {
+;; FAST: fma.rn.f32
+;; DEFAULT: mul.rn.f32
+;; DEFAULT: add.rn.f32
+  %v0 = fmul float %a, %b
+  %v1 = fadd float %v0, %c
+  ret float %v1
+}
+
+;; FAST-LABEL: @t1
+;; DEFAULT-LABEL: @t1
+define float @t1(float %a, float %b) {
+;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
+;; to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+  %v1 = fadd float %a, %b
+  ret float %v1
+}