author    Stephen Hines <srhines@google.com>  2014-12-01 14:51:49 -0800
committer Stephen Hines <srhines@google.com>  2014-12-02 16:08:10 -0800
commit    37ed9c199ca639565f6ce88105f9e39e898d82d0 (patch)
tree      8fb36d3910e3ee4c4e1b7422f4f017108efc52f5 /test/CodeGen/NVPTX
parent    d2327b22152ced7bc46dc629fc908959e8a52d03 (diff)
Update aosp/master LLVM for rebase to r222494.
Change-Id: Ic787f5e0124df789bd26f3f24680f45e678eef2d
Diffstat (limited to 'test/CodeGen/NVPTX')
-rw-r--r--  test/CodeGen/NVPTX/arithmetic-fp-sm20.ll      12
-rw-r--r--  test/CodeGen/NVPTX/arithmetic-int.ll          26
-rw-r--r--  test/CodeGen/NVPTX/atomics.ll                 77
-rw-r--r--  test/CodeGen/NVPTX/bug21465.ll                24
-rw-r--r--  test/CodeGen/NVPTX/call-with-alloca-buffer.ll 14
-rw-r--r--  test/CodeGen/NVPTX/compare-int.ll             40
-rw-r--r--  test/CodeGen/NVPTX/convert-fp.ll              28
-rw-r--r--  test/CodeGen/NVPTX/convert-int-sm20.ll        8
-rw-r--r--  test/CodeGen/NVPTX/fma.ll                     4
-rw-r--r--  test/CodeGen/NVPTX/fp-contract.ll             33
-rw-r--r--  test/CodeGen/NVPTX/fp-literals.ll             7
-rw-r--r--  test/CodeGen/NVPTX/fp16.ll                    45
-rw-r--r--  test/CodeGen/NVPTX/half.ll                    70
-rw-r--r--  test/CodeGen/NVPTX/implicit-def.ll            2
-rw-r--r--  test/CodeGen/NVPTX/intrinsic-old.ll           2
-rw-r--r--  test/CodeGen/NVPTX/intrinsics.ll              2
-rw-r--r--  test/CodeGen/NVPTX/ld-addrspace.ll            48
-rw-r--r--  test/CodeGen/NVPTX/ld-generic.ll              16
-rw-r--r--  test/CodeGen/NVPTX/ldu-i8.ll                  6
-rw-r--r--  test/CodeGen/NVPTX/ldu-ldg.ll                 20
-rw-r--r--  test/CodeGen/NVPTX/ldu-reg-plus-offset.ll     8
-rw-r--r--  test/CodeGen/NVPTX/local-stack-frame.ll       4
-rw-r--r--  test/CodeGen/NVPTX/machine-sink.ll            40
-rw-r--r--  test/CodeGen/NVPTX/misaligned-vector-ldst.ll  77
-rw-r--r--  test/CodeGen/NVPTX/mulwide.ll                 71
-rw-r--r--  test/CodeGen/NVPTX/pr13291-i1-store.ll        4
-rw-r--r--  test/CodeGen/NVPTX/st-addrspace.ll            48
-rw-r--r--  test/CodeGen/NVPTX/st-generic.ll              16
-rw-r--r--  test/CodeGen/NVPTX/surf-read-cuda.ll          53
-rw-r--r--  test/CodeGen/NVPTX/surf-write-cuda.ll         42
-rw-r--r--  test/CodeGen/NVPTX/tex-read-cuda.ll           46
-rw-r--r--  test/CodeGen/NVPTX/tex-read.ll                4
-rw-r--r--  test/CodeGen/NVPTX/texsurf-queries.ll         103
-rw-r--r--  test/CodeGen/NVPTX/vector-call.ll             12
-rw-r--r--  test/CodeGen/NVPTX/vector-return.ll           14
35 files changed, 837 insertions, 189 deletions
diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
index e474fa4..c167db4 100644
--- a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
+++ b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
;; These tests should run for all targets
@@ -9,28 +9,28 @@
;;; f64
define double @fadd_f64(double %a, double %b) {
-; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
; CHECK: ret
%ret = fadd double %a, %b
ret double %ret
}
define double @fsub_f64(double %a, double %b) {
-; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: sub.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
; CHECK: ret
%ret = fsub double %a, %b
ret double %ret
}
define double @fmul_f64(double %a, double %b) {
-; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: mul.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
; CHECK: ret
%ret = fmul double %a, %b
ret double %ret
}
define double @fdiv_f64(double %a, double %b) {
-; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: div.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
; CHECK: ret
%ret = fdiv double %a, %b
ret double %ret
diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll
index 8d73b7e..b5a2872 100644
--- a/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/test/CodeGen/NVPTX/arithmetic-int.ll
@@ -9,70 +9,70 @@
;;; i64
define i64 @add_i64(i64 %a, i64 %b) {
-; CHECK: add.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = add i64 %a, %b
ret i64 %ret
}
define i64 @sub_i64(i64 %a, i64 %b) {
-; CHECK: sub.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: sub.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = sub i64 %a, %b
ret i64 %ret
}
define i64 @mul_i64(i64 %a, i64 %b) {
-; CHECK: mul.lo.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: mul.lo.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = mul i64 %a, %b
ret i64 %ret
}
define i64 @sdiv_i64(i64 %a, i64 %b) {
-; CHECK: div.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: div.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = sdiv i64 %a, %b
ret i64 %ret
}
define i64 @udiv_i64(i64 %a, i64 %b) {
-; CHECK: div.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: div.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = udiv i64 %a, %b
ret i64 %ret
}
define i64 @srem_i64(i64 %a, i64 %b) {
-; CHECK: rem.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: rem.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = srem i64 %a, %b
ret i64 %ret
}
define i64 @urem_i64(i64 %a, i64 %b) {
-; CHECK: rem.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: rem.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = urem i64 %a, %b
ret i64 %ret
}
define i64 @and_i64(i64 %a, i64 %b) {
-; CHECK: and.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: and.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = and i64 %a, %b
ret i64 %ret
}
define i64 @or_i64(i64 %a, i64 %b) {
-; CHECK: or.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: or.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = or i64 %a, %b
ret i64 %ret
}
define i64 @xor_i64(i64 %a, i64 %b) {
-; CHECK: xor.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: xor.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%ret = xor i64 %a, %b
ret i64 %ret
@@ -80,7 +80,7 @@ define i64 @xor_i64(i64 %a, i64 %b) {
define i64 @shl_i64(i64 %a, i64 %b) {
; PTX requires 32-bit shift amount
-; CHECK: shl.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shl.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
; CHECK: ret
%ret = shl i64 %a, %b
ret i64 %ret
@@ -88,7 +88,7 @@ define i64 @shl_i64(i64 %a, i64 %b) {
define i64 @ashr_i64(i64 %a, i64 %b) {
; PTX requires 32-bit shift amount
-; CHECK: shr.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shr.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
; CHECK: ret
%ret = ashr i64 %a, %b
ret i64 %ret
@@ -96,7 +96,7 @@ define i64 @ashr_i64(i64 %a, i64 %b) {
define i64 @lshr_i64(i64 %a, i64 %b) {
; PTX requires 32-bit shift amount
-; CHECK: shr.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shr.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
; CHECK: ret
%ret = lshr i64 %a, %b
ret i64 %ret
diff --git a/test/CodeGen/NVPTX/atomics.ll b/test/CodeGen/NVPTX/atomics.ll
index 10ab73d..daadb6e 100644
--- a/test/CodeGen/NVPTX/atomics.ll
+++ b/test/CodeGen/NVPTX/atomics.ll
@@ -1,21 +1,21 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; CHECK: atom0
+; CHECK-LABEL: atom0
define i32 @atom0(i32* %addr, i32 %val) {
; CHECK: atom.add.u32
%ret = atomicrmw add i32* %addr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom1
+; CHECK-LABEL: atom1
define i64 @atom1(i64* %addr, i64 %val) {
; CHECK: atom.add.u64
%ret = atomicrmw add i64* %addr, i64 %val seq_cst
ret i64 %ret
}
-; CHECK: atom2
+; CHECK-LABEL: atom2
define i32 @atom2(i32* %subr, i32 %val) {
; CHECK: neg.s32
; CHECK: atom.add.u32
@@ -23,7 +23,7 @@ define i32 @atom2(i32* %subr, i32 %val) {
ret i32 %ret
}
-; CHECK: atom3
+; CHECK-LABEL: atom3
define i64 @atom3(i64* %subr, i64 %val) {
; CHECK: neg.s64
; CHECK: atom.add.u64
@@ -31,14 +31,14 @@ define i64 @atom3(i64* %subr, i64 %val) {
ret i64 %ret
}
-; CHECK: atom4
+; CHECK-LABEL: atom4
define i32 @atom4(i32* %subr, i32 %val) {
; CHECK: atom.and.b32
%ret = atomicrmw and i32* %subr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom5
+; CHECK-LABEL: atom5
define i64 @atom5(i64* %subr, i64 %val) {
; CHECK: atom.and.b64
%ret = atomicrmw and i64* %subr, i64 %val seq_cst
@@ -56,86 +56,127 @@ define i64 @atom5(i64* %subr, i64 %val) {
; ret i64 %ret
;}
-; CHECK: atom8
+; CHECK-LABEL: atom8
define i32 @atom8(i32* %subr, i32 %val) {
; CHECK: atom.or.b32
%ret = atomicrmw or i32* %subr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom9
+; CHECK-LABEL: atom9
define i64 @atom9(i64* %subr, i64 %val) {
; CHECK: atom.or.b64
%ret = atomicrmw or i64* %subr, i64 %val seq_cst
ret i64 %ret
}
-; CHECK: atom10
+; CHECK-LABEL: atom10
define i32 @atom10(i32* %subr, i32 %val) {
; CHECK: atom.xor.b32
%ret = atomicrmw xor i32* %subr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom11
+; CHECK-LABEL: atom11
define i64 @atom11(i64* %subr, i64 %val) {
; CHECK: atom.xor.b64
%ret = atomicrmw xor i64* %subr, i64 %val seq_cst
ret i64 %ret
}
-; CHECK: atom12
+; CHECK-LABEL: atom12
define i32 @atom12(i32* %subr, i32 %val) {
; CHECK: atom.max.s32
%ret = atomicrmw max i32* %subr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom13
+; CHECK-LABEL: atom13
define i64 @atom13(i64* %subr, i64 %val) {
; CHECK: atom.max.s64
%ret = atomicrmw max i64* %subr, i64 %val seq_cst
ret i64 %ret
}
-; CHECK: atom14
+; CHECK-LABEL: atom14
define i32 @atom14(i32* %subr, i32 %val) {
; CHECK: atom.min.s32
%ret = atomicrmw min i32* %subr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom15
+; CHECK-LABEL: atom15
define i64 @atom15(i64* %subr, i64 %val) {
; CHECK: atom.min.s64
%ret = atomicrmw min i64* %subr, i64 %val seq_cst
ret i64 %ret
}
-; CHECK: atom16
+; CHECK-LABEL: atom16
define i32 @atom16(i32* %subr, i32 %val) {
; CHECK: atom.max.u32
%ret = atomicrmw umax i32* %subr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom17
+; CHECK-LABEL: atom17
define i64 @atom17(i64* %subr, i64 %val) {
; CHECK: atom.max.u64
%ret = atomicrmw umax i64* %subr, i64 %val seq_cst
ret i64 %ret
}
-; CHECK: atom18
+; CHECK-LABEL: atom18
define i32 @atom18(i32* %subr, i32 %val) {
; CHECK: atom.min.u32
%ret = atomicrmw umin i32* %subr, i32 %val seq_cst
ret i32 %ret
}
-; CHECK: atom19
+; CHECK-LABEL: atom19
define i64 @atom19(i64* %subr, i64 %val) {
; CHECK: atom.min.u64
%ret = atomicrmw umin i64* %subr, i64 %val seq_cst
ret i64 %ret
}
+
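+; Floating-point atomic add is exposed through target intrinsics rather
+; than atomicrmw; the pointer's address space selects the generic,
+; .global, or .shared form of atom.add.f32.
+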
+declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_generic
+define float @atomic_add_f32_generic(float* %addr, float %val) {
+; CHECK: atom.add.f32
+ %ret = call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val)
+ ret float %ret
+}
+
+declare float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_addrspace1
+define float @atomic_add_f32_addrspace1(float addrspace(1)* %addr, float %val) {
+; CHECK: atom.global.add.f32
+ %ret = call float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val)
+ ret float %ret
+}
+
+declare float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_addrspace3
+define float @atomic_add_f32_addrspace3(float addrspace(3)* %addr, float %val) {
+; CHECK: atom.shared.add.f32
+ %ret = call float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val)
+ ret float %ret
+}
+
+; CHECK-LABEL: atomic_cmpxchg_i32
+define i32 @atomic_cmpxchg_i32(i32* %addr, i32 %cmp, i32 %new) {
+; CHECK: atom.cas.b32
+ %pairold = cmpxchg i32* %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+; CHECK-LABEL: atomic_cmpxchg_i64
+define i64 @atomic_cmpxchg_i64(i64* %addr, i64 %cmp, i64 %new) {
+; CHECK: atom.cas.b64
+ %pairold = cmpxchg i64* %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
new file mode 100644
index 0000000..157b28c
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -nvptx-lower-struct-args -S | FileCheck %s
+
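+; The nvptx-lower-struct-args pass rewrites reads of byval struct
+; arguments to go through the param address space (addrspace 101), as the
+; CHECK lines below expect.
+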
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: nounwind
+define void @_Z11TakesStruct1SPi(%struct.S* byval nocapture readonly %input, i32* nocapture %output) #0 {
+entry:
+; CHECK-LABEL: @_Z11TakesStruct1SPi
+; CHECK: bitcast %struct.S* %input to i8*
+; CHECK: call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
+ %b = getelementptr inbounds %struct.S* %input, i64 0, i32 1
+ %0 = load i32* %b, align 4
+ store i32 %0, i32* %output, align 4
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!nvvm.annotations = !{!0}
+
+!0 = metadata !{void (%struct.S*, i32*)* @_Z11TakesStruct1SPi, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 28dfa46..83d4916 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -20,11 +20,11 @@ entry:
%buf = alloca [16 x i8], align 4
; CHECK: .local .align 4 .b8 __local_depot0[16]
-; CHECK: mov.u64 %rl[[BUF_REG:[0-9]+]]
-; CHECK: cvta.local.u64 %SP, %rl[[BUF_REG]]
+; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]]
+; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
-; CHECK: ld.param.u64 %rl[[A_REG:[0-9]+]], [kernel_func_param_0]
-; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rl[[A_REG]]]
+; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
+; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]]
; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
%0 = load float* %a, align 4
@@ -46,11 +46,11 @@ entry:
%7 = bitcast i8* %arrayidx7 to float*
store float %6, float* %7, align 4
-; CHECK: add.u64 %rl[[SP_REG:[0-9]+]], %SP, 0
+; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
; CHECK: .param .b64 param0;
-; CHECK-NEXT: st.param.b64 [param0+0], %rl[[A_REG]]
+; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A_REG]]
; CHECK-NEXT: .param .b64 param1;
-; CHECK-NEXT: st.param.b64 [param1+0], %rl[[SP_REG]]
+; CHECK-NEXT: st.param.b64 [param1+0], %rd[[SP_REG]]
; CHECK-NEXT: call.uni
; CHECK-NEXT: callee,
diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll
index c595f21..e4e0601 100644
--- a/test/CodeGen/NVPTX/compare-int.ll
+++ b/test/CodeGen/NVPTX/compare-int.ll
@@ -9,8 +9,8 @@
;;; i64
define i64 @icmp_eq_i64(i64 %a, i64 %b) {
-; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp eq i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -18,8 +18,8 @@ define i64 @icmp_eq_i64(i64 %a, i64 %b) {
}
define i64 @icmp_ne_i64(i64 %a, i64 %b) {
-; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ne i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -27,8 +27,8 @@ define i64 @icmp_ne_i64(i64 %a, i64 %b) {
}
define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ugt i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -36,8 +36,8 @@ define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
}
define i64 @icmp_uge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp uge i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -45,8 +45,8 @@ define i64 @icmp_uge_i64(i64 %a, i64 %b) {
}
define i64 @icmp_ult_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ult i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -54,8 +54,8 @@ define i64 @icmp_ult_i64(i64 %a, i64 %b) {
}
define i64 @icmp_ule_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp ule i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -63,8 +63,8 @@ define i64 @icmp_ule_i64(i64 %a, i64 %b) {
}
define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sgt i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -72,8 +72,8 @@ define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
}
define i64 @icmp_sge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sge i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -81,8 +81,8 @@ define i64 @icmp_sge_i64(i64 %a, i64 %b) {
}
define i64 @icmp_slt_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp slt i64 %a, %b
%ret = zext i1 %cmp to i64
@@ -90,8 +90,8 @@ define i64 @icmp_slt_i64(i64 %a, i64 %b) {
}
define i64 @icmp_sle_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
; CHECK: ret
%cmp = icmp sle i64 %a, %b
%ret = zext i1 %cmp to i64
diff --git a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll
index 1882121..4b5446e 100644
--- a/test/CodeGen/NVPTX/convert-fp.ll
+++ b/test/CodeGen/NVPTX/convert-fp.ll
@@ -10,7 +10,7 @@ define i16 @cvt_i16_f32(float %x) {
}
define i16 @cvt_i16_f64(double %x) {
-; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}};
; CHECK: ret;
%a = fptoui double %x to i16
ret i16 %a
@@ -24,7 +24,7 @@ define i32 @cvt_i32_f32(float %x) {
}
define i32 @cvt_i32_f64(double %x) {
-; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}};
; CHECK: ret;
%a = fptoui double %x to i32
ret i32 %a
@@ -32,14 +32,14 @@ define i32 @cvt_i32_f64(double %x) {
define i64 @cvt_i64_f32(float %x) {
-; CHECK: cvt.rzi.u64.f32 %rl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}};
; CHECK: ret;
%a = fptoui float %x to i64
ret i64 %a
}
define i64 @cvt_i64_f64(double %x) {
-; CHECK: cvt.rzi.u64.f64 %rl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}};
; CHECK: ret;
%a = fptoui double %x to i64
ret i64 %a
@@ -60,14 +60,14 @@ define float @cvt_f32_i32(i32 %x) {
}
define float @cvt_f32_i64(i64 %x) {
-; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rd{{[0-9]+}};
; CHECK: ret;
%a = uitofp i64 %x to float
ret float %a
}
define float @cvt_f32_f64(double %x) {
-; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}};
; CHECK: ret;
%a = fptrunc double %x to float
ret float %a
@@ -88,56 +88,56 @@ define float @cvt_f32_s32(i32 %x) {
}
define float @cvt_f32_s64(i64 %x) {
-; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%a = sitofp i64 %x to float
ret float %a
}
define double @cvt_f64_i16(i16 %x) {
-; CHECK: cvt.rn.f64.u16 %fl{{[0-9]+}}, %rs{{[0-9]+}};
+; CHECK: cvt.rn.f64.u16 %fd{{[0-9]+}}, %rs{{[0-9]+}};
; CHECK: ret;
%a = uitofp i16 %x to double
ret double %a
}
define double @cvt_f64_i32(i32 %x) {
-; CHECK: cvt.rn.f64.u32 %fl{{[0-9]+}}, %r{{[0-9]+}};
+; CHECK: cvt.rn.f64.u32 %fd{{[0-9]+}}, %r{{[0-9]+}};
; CHECK: ret;
%a = uitofp i32 %x to double
ret double %a
}
define double @cvt_f64_i64(i64 %x) {
-; CHECK: cvt.rn.f64.u64 %fl{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: cvt.rn.f64.u64 %fd{{[0-9]+}}, %rd{{[0-9]+}};
; CHECK: ret;
%a = uitofp i64 %x to double
ret double %a
}
define double @cvt_f64_f32(float %x) {
-; CHECK: cvt.f64.f32 %fl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.f64.f32 %fd{{[0-9]+}}, %f{{[0-9]+}};
; CHECK: ret;
%a = fpext float %x to double
ret double %a
}
define double @cvt_f64_s16(i16 %x) {
-; CHECK: cvt.rn.f64.s16 %fl{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: cvt.rn.f64.s16 %fd{{[0-9]+}}, %rs{{[0-9]+}}
; CHECK: ret
%a = sitofp i16 %x to double
ret double %a
}
define double @cvt_f64_s32(i32 %x) {
-; CHECK: cvt.rn.f64.s32 %fl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}}
; CHECK: ret
%a = sitofp i32 %x to double
ret double %a
}
define double @cvt_f64_s64(i64 %x) {
-; CHECK: cvt.rn.f64.s64 %fl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: cvt.rn.f64.s64 %fd{{[0-9]+}}, %rd{{[0-9]+}}
; CHECK: ret
%a = sitofp i64 %x to double
ret double %a
diff --git a/test/CodeGen/NVPTX/convert-int-sm20.ll b/test/CodeGen/NVPTX/convert-int-sm20.ll
index 227cd31..57a2316 100644
--- a/test/CodeGen/NVPTX/convert-int-sm20.ll
+++ b/test/CodeGen/NVPTX/convert-int-sm20.ll
@@ -48,16 +48,16 @@ define i32 @cvt_i32_i64(i64 %x) {
; i64
define i64 @cvt_i64_i16(i16 %x) {
-; CHECK: ld.param.u16 %rl[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
-; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]]
; CHECK: ret
%a = zext i16 %x to i64
ret i64 %a
}
define i64 @cvt_i64_i32(i32 %x) {
-; CHECK: ld.param.u32 %rl[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
-; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]]
; CHECK: ret
%a = zext i32 %x to i64
ret i64 %a
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 4ef1a9a..14b5c45 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
define ptx_device float @t1_f32(float %x, float %y, float %z) {
; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
@@ -9,7 +9,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) {
}
define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
; CHECK: ret;
%a = fmul double %x, %y
%b = fadd double %a, %z
diff --git a/test/CodeGen/NVPTX/fp-contract.ll b/test/CodeGen/NVPTX/fp-contract.ll
new file mode 100644
index 0000000..3f68b18
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp-contract.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+
+target triple = "nvptx64-unknown-cuda"
+
+;; Make sure we are generating proper instruction sequences for fused ops
+;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
+;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
+;; is free to fuse with a multiply if it is able. If fusion is not allowed,
+;; we do not form fma.rn at the PTX level and explicitly generate add.rn
+;; for all adds to prevent ptxas from fusing the ops.
+
+;; FAST-LABEL: @t0
+;; DEFAULT-LABEL: @t0
+define float @t0(float %a, float %b, float %c) {
+;; FAST: fma.rn.f32
+;; DEFAULT: mul.rn.f32
+;; DEFAULT: add.rn.f32
+ %v0 = fmul float %a, %b
+ %v1 = fadd float %v0, %c
+ ret float %v1
+}
+
+;; FAST-LABEL: @t1
+;; DEFAULT-LABEL: @t1
+define float @t1(float %a, float %b) {
+;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
+;; to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+ %v1 = fadd float %a, %b
+ ret float %v1
+}
diff --git a/test/CodeGen/NVPTX/fp-literals.ll b/test/CodeGen/NVPTX/fp-literals.ll
index 0cc2413..755e0f9 100644
--- a/test/CodeGen/NVPTX/fp-literals.ll
+++ b/test/CodeGen/NVPTX/fp-literals.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; Make sure we can properly differentiate between single-precision and
; double-precision FP literals.
@@ -11,7 +14,7 @@ define float @myaddf(float %a) {
}
; CHECK: myaddd
-; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, 0d3FF0000000000000
+; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, 0d3FF0000000000000
define double @myaddd(double %a) {
%ret = fadd double %a, 1.0
ret double %ret
diff --git a/test/CodeGen/NVPTX/fp16.ll b/test/CodeGen/NVPTX/fp16.ll
new file mode 100644
index 0000000..8770399
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp16.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s
+
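+; fp16 values are carried as i16; each conversion intrinsic should lower
+; to the single PTX cvt instruction checked below.
+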
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
+
+; CHECK-LABEL: @test_convert_fp16_to_fp32
+; CHECK: cvt.f32.f16
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16 addrspace(1)* %in, align 2
+ %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+ store float %cvt, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp16_to_fp64
+; CHECK: cvt.f64.f16
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+ %val = load i16 addrspace(1)* %in, align 2
+ %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+ store double %cvt, double addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp32_to_fp16
+; CHECK: cvt.rn.f16.f32
+define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+ %val = load float addrspace(1)* %in, align 2
+ %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
+ store i16 %cvt, i16 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp64_to_fp16
+; CHECK: cvt.rn.f16.f64
+define void @test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+ %val = load double addrspace(1)* %in, align 2
+ %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone
+ store i16 %cvt, i16 addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll
new file mode 100644
index 0000000..aa08cc7
--- /dev/null
+++ b/test/CodeGen/NVPTX/half.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -march=nvptx | FileCheck %s
+
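+; 'half' has no arithmetic here: loads and stores legalize to 16-bit
+; integer (u16) memory operations, and only fpext/fptrunc need a cvt.
+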
+define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: @test_load_store
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+ %val = load half addrspace(1)* %in
+ store half %val, half addrspace(1)* %out
+ ret void
+}
+
+define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) {
+; CHECK-LABEL: @test_bitcast_from_half
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+ %val = load half addrspace(1)* %in
+ %val_int = bitcast half %val to i16
+ store i16 %val_int, i16 addrspace(1)* %out
+ ret void
+}
+
+define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+; CHECK-LABEL: @test_bitcast_to_half
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+ %val = load i16 addrspace(1)* %in
+ %val_fp = bitcast i16 %val to half
+ store half %val_fp, half addrspace(1)* %out
+ ret void
+}
+
+define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
+; CHECK-LABEL: @test_extend32
+; CHECK: cvt.f32.f16
+
+ %val16 = load half addrspace(1)* %in
+ %val32 = fpext half %val16 to float
+ store float %val32, float addrspace(1)* %out
+ ret void
+}
+
+define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
+; CHECK-LABEL: @test_extend64
+; CHECK: cvt.f64.f16
+
+ %val16 = load half addrspace(1)* %in
+ %val64 = fpext half %val16 to double
+ store double %val64, double addrspace(1)* %out
+ ret void
+}
+
+define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: test_trunc32
+; CHECK: cvt.rn.f16.f32
+
+ %val32 = load float addrspace(1)* %in
+ %val16 = fptrunc float %val32 to half
+ store half %val16, half addrspace(1)* %out
+ ret void
+}
+
+define void @test_trunc64(double addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: @test_trunc64
+; CHECK: cvt.rn.f16.f64
+
+ %val32 = load double addrspace(1)* %in
+ %val16 = fptrunc double %val32 to half
+ store half %val16, half addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll
index 06d3d56..2d2c6e5 100644
--- a/test/CodeGen/NVPTX/implicit-def.ll
+++ b/test/CodeGen/NVPTX/implicit-def.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
; CHECK: // implicit-def: %f[[F0:[0-9]+]]
-; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
+; CHECK: add.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
define float @foo(float %a) {
%ret = fadd float %a, undef
ret float %ret
diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll
index af91bb4..3c51776 100644
--- a/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -198,7 +198,7 @@ define ptx_device i32 @test_clock() {
}
define ptx_device i64 @test_clock64() {
-; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64;
+; CHECK: mov.u64 %rd{{[0-9]+}}, %clock64;
; CHECK: ret;
%x = call i64 @llvm.ptx.read.clock64()
ret i64 %x
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index 78e1e77..34b671d 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -9,7 +9,7 @@ define ptx_device float @test_fabsf(float %f) {
}
define ptx_device double @test_fabs(double %d) {
-; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: abs.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}};
; CHECK: ret;
%x = call double @llvm.fabs.f64(double %d)
ret double %x
diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll
index 133ef09..f33659c 100644
--- a/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/test/CodeGen/NVPTX/ld-addrspace.ll
@@ -6,7 +6,7 @@
define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i8 addrspace(1)* %ptr
ret i8 %a
@@ -15,7 +15,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i8 addrspace(3)* %ptr
ret i8 %a
@@ -24,7 +24,7 @@ define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i8 addrspace(5)* %ptr
ret i8 %a
@@ -34,7 +34,7 @@ define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i16 addrspace(1)* %ptr
ret i16 %a
@@ -43,7 +43,7 @@ define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i16 addrspace(3)* %ptr
ret i16 %a
@@ -52,7 +52,7 @@ define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
; PTX32: ld.local.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i16 addrspace(5)* %ptr
ret i16 %a
@@ -62,7 +62,7 @@ define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i32 addrspace(1)* %ptr
ret i32 %a
@@ -71,7 +71,7 @@ define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i32 addrspace(3)* %ptr
ret i32 %a
@@ -80,7 +80,7 @@ define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i32 addrspace(5)* %ptr
ret i32 %a
@@ -88,27 +88,27 @@ define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
;; i64
define i64 @ld_global_i64(i64 addrspace(1)* %ptr) {
-; PTX32: ld.global.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i64 addrspace(1)* %ptr
ret i64 %a
}
define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) {
-; PTX32: ld.shared.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i64 addrspace(3)* %ptr
ret i64 %a
}
define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
-; PTX32: ld.local.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i64 addrspace(5)* %ptr
ret i64 %a
@@ -118,7 +118,7 @@ define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
define float @ld_global_f32(float addrspace(1)* %ptr) {
; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load float addrspace(1)* %ptr
ret float %a
@@ -127,7 +127,7 @@ define float @ld_global_f32(float addrspace(1)* %ptr) {
define float @ld_shared_f32(float addrspace(3)* %ptr) {
; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load float addrspace(3)* %ptr
ret float %a
@@ -136,7 +136,7 @@ define float @ld_shared_f32(float addrspace(3)* %ptr) {
define float @ld_local_f32(float addrspace(5)* %ptr) {
; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load float addrspace(5)* %ptr
ret float %a
@@ -144,27 +144,27 @@ define float @ld_local_f32(float addrspace(5)* %ptr) {
;; f64
define double @ld_global_f64(double addrspace(1)* %ptr) {
-; PTX32: ld.global.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.global.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load double addrspace(1)* %ptr
ret double %a
}
define double @ld_shared_f64(double addrspace(3)* %ptr) {
-; PTX32: ld.shared.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.shared.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load double addrspace(3)* %ptr
ret double %a
}
define double @ld_local_f64(double addrspace(5)* %ptr) {
-; PTX32: ld.local.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.local.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load double addrspace(5)* %ptr
ret double %a
diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll
index 3728268..d629e0e 100644
--- a/test/CodeGen/NVPTX/ld-generic.ll
+++ b/test/CodeGen/NVPTX/ld-generic.ll
@@ -6,7 +6,7 @@
define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i8 addrspace(0)* %ptr
ret i8 %a
@@ -16,7 +16,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i16 addrspace(0)* %ptr
ret i16 %a
@@ -26,7 +26,7 @@ define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i32 addrspace(0)* %ptr
ret i32 %a
@@ -34,9 +34,9 @@ define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
;; i64
define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
-; PTX32: ld.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load i64 addrspace(0)* %ptr
ret i64 %a
@@ -46,7 +46,7 @@ define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
define float @ld_global_f32(float addrspace(0)* %ptr) {
; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load float addrspace(0)* %ptr
ret float %a
@@ -54,9 +54,9 @@ define float @ld_global_f32(float addrspace(0)* %ptr) {
;; f64
define double @ld_global_f64(double addrspace(0)* %ptr) {
-; PTX32: ld.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: ret
-; PTX64: ld.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: ret
%a = load double addrspace(0)* %ptr
ret double %a
diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll
index 9cc6675..36c99b3 100644
--- a/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/test/CodeGen/NVPTX/ldu-i8.ll
@@ -2,15 +2,13 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*, i32)
define i8 @foo(i8* %a) {
; Ensure we properly truncate off the high-order 24 bits
; CHECK: ldu.global.u8
; CHECK: cvt.u32.u16
; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255
- %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a), !align !0
+ %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a, i32 4)
ret i8 %val
}
-
-!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-ldg.ll b/test/CodeGen/NVPTX/ldu-ldg.ll
index 3b0619f..4bfd68c 100644
--- a/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -1,40 +1,36 @@
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
-declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
+declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
; CHECK: func0
define i8 @func0(i8 addrspace(1)* %ptr) {
; ldu.global.u8
- %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+ %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
ret i8 %val
}
; CHECK: func1
define i32 @func1(i32 addrspace(1)* %ptr) {
; ldu.global.u32
- %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+ %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
ret i32 %val
}
; CHECK: func2
define i8 @func2(i8 addrspace(1)* %ptr) {
; ld.global.nc.u8
- %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+ %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
ret i8 %val
}
; CHECK: func3
define i32 @func3(i32 addrspace(1)* %ptr) {
; ld.global.nc.u32
- %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+ %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
ret i32 %val
}
-
-
-
-!0 = metadata !{i32 4}
diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index 55707ea..fd35a75 100644
--- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
@@ -7,15 +7,13 @@ define void @reg_plus_offset(i32* %a) {
; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
%p2 = getelementptr i32* %a, i32 8
- %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2), !align !1
+ %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2, i32 4)
%p3 = getelementptr i32* %a, i32 9
- %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3), !align !1
+ %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3, i32 4)
%t3 = mul i32 %t1, %t2
store i32 %t3, i32* %a
ret void
}
-!1 = metadata !{ i32 4 }
-
-declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*, i32)
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll
index c0d7d1c..377eee9 100644
--- a/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/test/CodeGen/NVPTX/local-stack-frame.ll
@@ -7,8 +7,8 @@
; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}};
; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
-; PTX64: mov.u64 %rl{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX64: cvta.local.u64 %SP, %rl{{[0-9]+}};
+; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}};
+; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}};
; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
define void @foo(i32 %a) {
diff --git a/test/CodeGen/NVPTX/machine-sink.ll b/test/CodeGen/NVPTX/machine-sink.ll
new file mode 100644
index 0000000..3614bea
--- /dev/null
+++ b/test/CodeGen/NVPTX/machine-sink.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+@scalar1 = internal addrspace(3) global float 0.000000e+00, align 4
+@scalar2 = internal addrspace(3) global float 0.000000e+00, align 4
+
+; We shouldn't sink mul.rn.f32 to BB %merge because BB %merge post-dominates
+; BB %entry. Over-sinking created more register pressure on this example. The
+; backend would sink the fmuls to BB %merge, but not the loads for being
+; conservative on sinking memory accesses. As a result, the loads and
+; the two fmuls would be separated to two basic blocks, causing two
+; cross-BB live ranges.
+define float @post_dominate(float %x, i1 %cond) {
+; CHECK-LABEL: post_dominate(
+entry:
+ %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
+ %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
+; CHECK: ld.shared.f32
+; CHECK: ld.shared.f32
+ %2 = fmul float %0, %0
+ %3 = fmul float %1, %2
+; CHECK-NOT: bra
+; CHECK: mul.rn.f32
+; CHECK: mul.rn.f32
+ br i1 %cond, label %then, label %merge
+
+then:
+ %z = fadd float %x, %x
+ br label %then2
+
+then2:
+ %z2 = fadd float %z, %z
+ br label %merge
+
+merge:
+ %y = phi float [ 0.0, %entry ], [ %z2, %then2 ]
+ %w = fadd float %y, %3
+ ret float %w
+}
diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
new file mode 100644
index 0000000..90c9c43
--- /dev/null
+++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
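+; A <4 x float> access may use ld.v4/st.v4 only at 16-byte alignment;
+; lower alignments must degrade to v2, scalar f32, or byte operations.
+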
+; CHECK-LABEL: t1
+define <4 x float> @t1(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK-NOT: ld.f32
+; CHECK: ld.u8
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 1
+ ret <4 x float> %r
+}
+
+; CHECK-LABEL: t2
+define <4 x float> @t2(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK: ld.f32
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 4
+ ret <4 x float> %r
+}
+
+; CHECK-LABEL: t3
+define <4 x float> @t3(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK: ld.v2
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 8
+ ret <4 x float> %r
+}
+
+; CHECK-LABEL: t4
+define <4 x float> @t4(i8* %p1) {
+; CHECK: ld.v4
+ %cast = bitcast i8* %p1 to <4 x float>*
+ %r = load <4 x float>* %cast, align 16
+ ret <4 x float> %r
+}
+
+
+; CHECK-LABEL: s1
+define void @s1(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK-NOT: st.f32
+; CHECK: st.u8
+ store <4 x float> %v, <4 x float>* %p1, align 1
+ ret void
+}
+
+; CHECK-LABEL: s2
+define void @s2(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK: st.f32
+ store <4 x float> %v, <4 x float>* %p1, align 4
+ ret void
+}
+
+; CHECK-LABEL: s3
+define void @s3(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+ store <4 x float> %v, <4 x float>* %p1, align 8
+ ret void
+}
+
+; CHECK-LABEL: s4
+define void @s4(<4 x float>* %p1, <4 x float> %v) {
+; CHECK: st.v4
+ store <4 x float> %v, <4 x float>* %p1, align 16
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/mulwide.ll b/test/CodeGen/NVPTX/mulwide.ll
index 927946c..1ddf973 100644
--- a/test/CodeGen/NVPTX/mulwide.ll
+++ b/test/CodeGen/NVPTX/mulwide.ll
@@ -1,37 +1,90 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
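+
+; With optimizations enabled, ISel folds the (s|z)ext operands into a
+; single mul.wide; at -O0 the multiply stays full-width as mul.lo.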
-; CHECK: mulwide16
+; OPT-LABEL: @mulwide16
+; NOOPT-LABEL: @mulwide16
define i32 @mulwide16(i16 %a, i16 %b) {
-; CHECK: mul.wide.s16
+; OPT: mul.wide.s16
+; NOOPT: mul.lo.s32
%val0 = sext i16 %a to i32
%val1 = sext i16 %b to i32
%val2 = mul i32 %val0, %val1
ret i32 %val2
}
-; CHECK: mulwideu16
+; OPT-LABEL: @mulwideu16
+; NOOPT-LABEL: @mulwideu16
define i32 @mulwideu16(i16 %a, i16 %b) {
-; CHECK: mul.wide.u16
+; OPT: mul.wide.u16
+; NOOPT: mul.lo.s32
%val0 = zext i16 %a to i32
%val1 = zext i16 %b to i32
%val2 = mul i32 %val0, %val1
ret i32 %val2
}
-; CHECK: mulwide32
+; OPT-LABEL: @mulwide8
+; NOOPT-LABEL: @mulwide8
+define i32 @mulwide8(i8 %a, i8 %b) {
+; OPT: mul.wide.s16
+; NOOPT: mul.lo.s32
+ %val0 = sext i8 %a to i32
+ %val1 = sext i8 %b to i32
+ %val2 = mul i32 %val0, %val1
+ ret i32 %val2
+}
+
+; OPT-LABEL: @mulwideu8
+; NOOPT-LABEL: @mulwideu8
+define i32 @mulwideu8(i8 %a, i8 %b) {
+; OPT: mul.wide.u16
+; NOOPT: mul.lo.s32
+ %val0 = zext i8 %a to i32
+ %val1 = zext i8 %b to i32
+ %val2 = mul i32 %val0, %val1
+ ret i32 %val2
+}
+
+; OPT-LABEL: @mulwide32
+; NOOPT-LABEL: @mulwide32
define i64 @mulwide32(i32 %a, i32 %b) {
-; CHECK: mul.wide.s32
+; OPT: mul.wide.s32
+; NOOPT: mul.lo.s64
%val0 = sext i32 %a to i64
%val1 = sext i32 %b to i64
%val2 = mul i64 %val0, %val1
ret i64 %val2
}
-; CHECK: mulwideu32
+; OPT-LABEL: @mulwideu32
+; NOOPT-LABEL: @mulwideu32
define i64 @mulwideu32(i32 %a, i32 %b) {
-; CHECK: mul.wide.u32
+; OPT: mul.wide.u32
+; NOOPT: mul.lo.s64
%val0 = zext i32 %a to i64
%val1 = zext i32 %b to i64
%val2 = mul i64 %val0, %val1
ret i64 %val2
}
+
+; OPT-LABEL: @mulwideu7
+; NOOPT-LABEL: @mulwideu7
+define i64 @mulwideu7(i7 %a, i7 %b) {
+; OPT: mul.wide.u32
+; NOOPT: mul.lo.s64
+ %val0 = zext i7 %a to i64
+ %val1 = zext i7 %b to i64
+ %val2 = mul i64 %val0, %val1
+ ret i64 %val2
+}
+
+; OPT-LABEL: @mulwides7
+; NOOPT-LABEL: @mulwides7
+define i64 @mulwides7(i7 %a, i7 %b) {
+; OPT: mul.wide.s32
+; NOOPT: mul.lo.s64
+ %val0 = sext i7 %a to i64
+ %val1 = sext i7 %b to i64
+ %val2 = mul i64 %val0, %val1
+ ret i64 %val2
+}
diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
index e7a81be..cc67a6f 100644
--- a/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -5,7 +5,7 @@ define ptx_kernel void @t1(i1* %a) {
; PTX32: mov.u16 %rs{{[0-9]+}}, 0;
; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
; PTX64: mov.u16 %rs{{[0-9]+}}, 0;
-; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX64-NEXT: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
store i1 false, i1* %a
ret void
}
@@ -15,7 +15,7 @@ define ptx_kernel void @t2(i1* %a, i8* %b) {
; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
-; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll
index 68c09fe..34a83f3 100644
--- a/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/test/CodeGen/NVPTX/st-addrspace.ll
@@ -7,7 +7,7 @@
define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i8 %a, i8 addrspace(1)* %ptr
ret void
@@ -16,7 +16,7 @@ define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) {
; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i8 %a, i8 addrspace(3)* %ptr
ret void
@@ -25,7 +25,7 @@ define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) {
define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) {
; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i8 %a, i8 addrspace(5)* %ptr
ret void
@@ -36,7 +36,7 @@ define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) {
define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) {
; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.global.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i16 %a, i16 addrspace(1)* %ptr
ret void
@@ -45,7 +45,7 @@ define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) {
define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) {
; PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.shared.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i16 %a, i16 addrspace(3)* %ptr
ret void
@@ -54,7 +54,7 @@ define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) {
define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) {
; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.local.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i16 %a, i16 addrspace(5)* %ptr
ret void
@@ -65,7 +65,7 @@ define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) {
define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) {
; PTX32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
; PTX32: ret
-; PTX64: st.global.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; PTX64: ret
store i32 %a, i32 addrspace(1)* %ptr
ret void
@@ -74,7 +74,7 @@ define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) {
define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) {
; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
; PTX32: ret
-; PTX64: st.shared.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; PTX64: ret
store i32 %a, i32 addrspace(3)* %ptr
ret void
@@ -83,7 +83,7 @@ define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) {
define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) {
; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
; PTX32: ret
-; PTX64: st.local.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; PTX64: ret
store i32 %a, i32 addrspace(5)* %ptr
ret void
@@ -92,27 +92,27 @@ define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) {
;; i64
define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) {
-; PTX32: st.global.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.global.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; PTX64: ret
store i64 %a, i64 addrspace(1)* %ptr
ret void
}
define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) {
-; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.shared.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; PTX64: ret
store i64 %a, i64 addrspace(3)* %ptr
ret void
}
define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) {
-; PTX32: st.local.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.local.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; PTX64: ret
store i64 %a, i64 addrspace(5)* %ptr
ret void
@@ -123,7 +123,7 @@ define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) {
define void @st_global_f32(float addrspace(1)* %ptr, float %a) {
; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
; PTX32: ret
-; PTX64: st.global.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; PTX64: ret
store float %a, float addrspace(1)* %ptr
ret void
@@ -132,7 +132,7 @@ define void @st_global_f32(float addrspace(1)* %ptr, float %a) {
define void @st_shared_f32(float addrspace(3)* %ptr, float %a) {
; PTX32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
; PTX32: ret
-; PTX64: st.shared.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; PTX64: ret
store float %a, float addrspace(3)* %ptr
ret void
@@ -141,7 +141,7 @@ define void @st_shared_f32(float addrspace(3)* %ptr, float %a) {
define void @st_local_f32(float addrspace(5)* %ptr, float %a) {
; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
; PTX32: ret
-; PTX64: st.local.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; PTX64: ret
store float %a, float addrspace(5)* %ptr
ret void
@@ -150,27 +150,27 @@ define void @st_local_f32(float addrspace(5)* %ptr, float %a) {
;; f64
define void @st_global_f64(double addrspace(1)* %ptr, double %a) {
-; PTX32: st.global.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.global.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; PTX64: ret
store double %a, double addrspace(1)* %ptr
ret void
}
define void @st_shared_f64(double addrspace(3)* %ptr, double %a) {
-; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.shared.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; PTX64: ret
store double %a, double addrspace(3)* %ptr
ret void
}
define void @st_local_f64(double addrspace(5)* %ptr, double %a) {
-; PTX32: st.local.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.local.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; PTX64: ret
store double %a, double addrspace(5)* %ptr
ret void
diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll
index b9c616f..022f7ab 100644
--- a/test/CodeGen/NVPTX/st-generic.ll
+++ b/test/CodeGen/NVPTX/st-generic.ll
@@ -7,7 +7,7 @@
define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i8 %a, i8 addrspace(0)* %ptr
ret void
@@ -18,7 +18,7 @@ define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
; PTX32: ret
-; PTX64: st.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; PTX64: ret
store i16 %a, i16 addrspace(0)* %ptr
ret void
@@ -29,7 +29,7 @@ define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
; PTX32: ret
-; PTX64: st.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; PTX64: ret
store i32 %a, i32 addrspace(0)* %ptr
ret void
@@ -38,9 +38,9 @@ define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
;; i64
define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
-; PTX32: st.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; PTX64: ret
store i64 %a, i64 addrspace(0)* %ptr
ret void
@@ -51,7 +51,7 @@ define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
; PTX32: ret
-; PTX64: st.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; PTX64: ret
store float %a, float addrspace(0)* %ptr
ret void
@@ -60,9 +60,9 @@ define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
;; f64
define void @st_global_f64(double addrspace(0)* %ptr, double %a) {
-; PTX32: st.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
; PTX32: ret
-; PTX64: st.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; PTX64: ret
store double %a, double addrspace(0)* %ptr
ret void
diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll
new file mode 100644
index 0000000..10a1ecc
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-read-cuda.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, float* %red, i32 %idx) {
+; SM20: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
+; SM30: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
+ %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
+; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+ %ret = sitofp i32 %val to float
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+ store float %ret, float* %red
+ ret void
+}
+
+@surf0 = internal addrspace(1) global i64 0, align 8
+
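+; On sm_20 a surface handle is a compile-time symbol, so the suld below is
+; checked against the global @surf0 directly; sm_30 and later materialize the
+; handle into a register first, which is why SM30 expects a mov.u64.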
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(float* %red, i32 %idx) {
+; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
+ %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [surf0, {%r{{[0-9]+}}}]
+; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}]
+ %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx)
+; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+ %ret = sitofp i32 %val to float
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+ store float %ret, float* %red
+ ret void
+}
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+
diff --git a/test/CodeGen/NVPTX/surf-write-cuda.ll b/test/CodeGen/NVPTX/surf-write-cuda.ll
new file mode 100644
index 0000000..654c47f
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-write-cuda.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, i32 %val, i32 %idx) {
+; SM20: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM20: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+; SM30: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+ tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
+ ret void
+}
+
+
+@surf0 = internal addrspace(1) global i64 0, align 8
+
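+; As with the loads in surf-read-cuda.ll, sm_20 writes through the surface
+; symbol itself, while sm_30 expects the handle register set up by mov.u64.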
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(i32 %val, i32 %idx) {
+; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
+ %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: sust.b.1d.b32.trap [surf0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+; SM30: sust.b.1d.b32.trap [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+ tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %surfHandle, i32 %idx, i32 %val)
+ ret void
+}
+
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (i32, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+
diff --git a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll
new file mode 100644
index 0000000..ee0cefa
--- /dev/null
+++ b/test/CodeGen/NVPTX/tex-read-cuda.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
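+; These use the "unified" texture intrinsics: sampler state travels with the
+; texture handle, so no separate sampler operand is passed (contrast tex-read.ll).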
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, float* %red, i32 %idx) {
+; SM20: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0];
+; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
+; SM30: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0];
+; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
+ %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx)
+ %ret = extractvalue { float, float, float, float } %val, 0
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+ store float %ret, float* %red
+ ret void
+}
+
+@tex0 = internal addrspace(1) global i64 0, align 8
+
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(float* %red, i32 %idx) {
+; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0
+ %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}]
+; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}]
+ %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
+ %ret = extractvalue { float, float, float, float } %val, 0
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+ store float %ret, float* %red
+ ret void
+}
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll
index 291060b..55e4bfc 100644
--- a/test/CodeGen/NVPTX/tex-read.ll
+++ b/test/CodeGen/NVPTX/tex-read.ll
@@ -2,12 +2,12 @@
target triple = "nvptx-unknown-nvcl"
-declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64, i64, i32)
+declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)
; CHECK: .entry foo
define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
- %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64 %img, i64 %sampler, i32 %idx)
+ %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
%ret = extractvalue { float, float, float, float } %val, 0
; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]]
store float %ret, float* %red
diff --git a/test/CodeGen/NVPTX/texsurf-queries.ll b/test/CodeGen/NVPTX/texsurf-queries.ll
new file mode 100644
index 0000000..c7637cc
--- /dev/null
+++ b/test/CodeGen/NVPTX/texsurf-queries.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+@tex0 = internal addrspace(1) global i64 0, align 8
+@surf0 = internal addrspace(1) global i64 0, align 8
+
+declare i32 @llvm.nvvm.txq.width(i64)
+declare i32 @llvm.nvvm.txq.height(i64)
+declare i32 @llvm.nvvm.suq.width(i64)
+declare i32 @llvm.nvvm.suq.height(i64)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
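+; txq.* and suq.* query texture and surface attributes. The same split as in
+; the suld/sust tests applies: sm_20 can query the global symbol directly,
+; while sm_30 queries the handle register loaded by mov.u64.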
+; SM20-LABEL: @t0
+; SM30-LABEL: @t0
+define i32 @t0(i64 %texHandle) {
+; SM20: txq.width.b32
+; SM30: txq.width.b32
+ %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle)
+ ret i32 %width
+}
+
+; SM20-LABEL: @t1
+; SM30-LABEL: @t1
+define i32 @t1() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
+ %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: txq.width.b32 %r{{[0-9]+}}, [tex0]
+; SM30: txq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+ %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle)
+ ret i32 %width
+}
+
+
+; SM20-LABEL: @t2
+; SM30-LABEL: @t2
+define i32 @t2(i64 %texHandle) {
+; SM20: txq.height.b32
+; SM30: txq.height.b32
+ %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle)
+ ret i32 %height
+}
+
+; SM20-LABEL: @t3
+; SM30-LABEL: @t3
+define i32 @t3() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
+ %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: txq.height.b32 %r{{[0-9]+}}, [tex0]
+; SM30: txq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+ %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle)
+ ret i32 %height
+}
+
+
+; SM20-LABEL: @s0
+; SM30-LABEL: @s0
+define i32 @s0(i64 %surfHandle) {
+; SM20: suq.width.b32
+; SM30: suq.width.b32
+ %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle)
+ ret i32 %width
+}
+
+; SM20-LABEL: @s1
+; SM30-LABEL: @s1
+define i32 @s1() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
+ %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suq.width.b32 %r{{[0-9]+}}, [surf0]
+; SM30: suq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+ %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle)
+ ret i32 %width
+}
+
+
+; SM20-LABEL: @s2
+; SM30-LABEL: @s2
+define i32 @s2(i64 %surfHandle) {
+; SM20: suq.height.b32
+; SM30: suq.height.b32
+ %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle)
+ ret i32 %height
+}
+
+; SM20-LABEL: @s3
+; SM30-LABEL: @s3
+define i32 @s3() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
+ %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suq.height.b32 %r{{[0-9]+}}, [surf0]
+; SM30: suq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+ %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle)
+ ret i32 %height
+}
+
+!nvvm.annotations = !{!1, !2}
+!1 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
+!2 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll
new file mode 100644
index 0000000..a03d7fd
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-call.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @bar(<4 x i32>)
+
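+; A <4 x i32> argument is lowered through the .param space; the single
+; st.param.v4.b32 check verifies it is written with one vectorized store.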
+; CHECK-LABEL: @foo
+define void @foo(<4 x i32> %a) {
+; CHECK: st.param.v4.b32
+ tail call void @bar(<4 x i32> %a)
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/vector-return.ll b/test/CodeGen/NVPTX/vector-return.ll
new file mode 100644
index 0000000..15e50f8
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-return.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+declare <2 x float> @bar(<2 x float> %input)
+
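+; A <2 x float> result comes back through an 8-byte .param buffer (retval0);
+; the caller reloads both lanes with one ld.param.v2.f32 and can then feed
+; them straight into a vectorized st.v2.f32.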
+define void @foo(<2 x float> %input, <2 x float>* %output) {
+; CHECK-LABEL: @foo
+entry:
+ %call = tail call <2 x float> @bar(<2 x float> %input)
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: ld.param.v2.f32 {[[ELEM1:%f[0-9]+]], [[ELEM2:%f[0-9]+]]}, [retval0+0];
+ store <2 x float> %call, <2 x float>* %output, align 8
+; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEM1]], [[ELEM2]]}
+ ret void
+}