diff options
author | Stephen Hines <srhines@google.com> | 2014-12-01 14:51:49 -0800 |
---|---|---|
committer | Stephen Hines <srhines@google.com> | 2014-12-02 16:08:10 -0800 |
commit | 37ed9c199ca639565f6ce88105f9e39e898d82d0 (patch) | |
tree | 8fb36d3910e3ee4c4e1b7422f4f017108efc52f5 /test/CodeGen/NVPTX | |
parent | d2327b22152ced7bc46dc629fc908959e8a52d03 (diff) | |
download | external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.zip external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.gz external_llvm-37ed9c199ca639565f6ce88105f9e39e898d82d0.tar.bz2 |
Update aosp/master LLVM for rebase to r222494.
Change-Id: Ic787f5e0124df789bd26f3f24680f45e678eef2d
Diffstat (limited to 'test/CodeGen/NVPTX')
35 files changed, 837 insertions, 189 deletions
diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll index e474fa4..c167db4 100644 --- a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll +++ b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s ;; These tests should run for all targets @@ -9,28 +9,28 @@ ;;; f64 define double @fadd_f64(double %a, double %b) { -; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret %ret = fadd double %a, %b ret double %ret } define double @fsub_f64(double %a, double %b) { -; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: sub.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret %ret = fsub double %a, %b ret double %ret } define double @fmul_f64(double %a, double %b) { -; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: mul.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret %ret = fmul double %a, %b ret double %ret } define double @fdiv_f64(double %a, double %b) { -; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}} +; CHECK: div.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}} ; CHECK: ret %ret = fdiv double %a, %b ret double %ret diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll index 8d73b7e..b5a2872 100644 --- a/test/CodeGen/NVPTX/arithmetic-int.ll +++ b/test/CodeGen/NVPTX/arithmetic-int.ll @@ -9,70 +9,70 @@ ;;; i64 define i64 @add_i64(i64 %a, i64 %b) { -; CHECK: add.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = add i64 %a, %b ret i64 %ret } define i64 @sub_i64(i64 %a, i64 %b) { -; CHECK: sub.s64 
%rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: sub.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = sub i64 %a, %b ret i64 %ret } define i64 @mul_i64(i64 %a, i64 %b) { -; CHECK: mul.lo.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: mul.lo.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = mul i64 %a, %b ret i64 %ret } define i64 @sdiv_i64(i64 %a, i64 %b) { -; CHECK: div.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: div.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = sdiv i64 %a, %b ret i64 %ret } define i64 @udiv_i64(i64 %a, i64 %b) { -; CHECK: div.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: div.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = udiv i64 %a, %b ret i64 %ret } define i64 @srem_i64(i64 %a, i64 %b) { -; CHECK: rem.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: rem.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = srem i64 %a, %b ret i64 %ret } define i64 @urem_i64(i64 %a, i64 %b) { -; CHECK: rem.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: rem.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = urem i64 %a, %b ret i64 %ret } define i64 @and_i64(i64 %a, i64 %b) { -; CHECK: and.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: and.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = and i64 %a, %b ret i64 %ret } define i64 @or_i64(i64 %a, i64 %b) { -; CHECK: or.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: or.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = or i64 %a, %b ret i64 %ret } define i64 @xor_i64(i64 %a, i64 %b) { -; CHECK: xor.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: xor.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %ret = xor i64 %a, %b ret i64 %ret @@ -80,7 +80,7 @@ define i64 @xor_i64(i64 %a, i64 %b) { define i64 @shl_i64(i64 %a, i64 %b) { ; PTX requires 32-bit shift amount -; CHECK: 
shl.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: shl.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %ret = shl i64 %a, %b ret i64 %ret @@ -88,7 +88,7 @@ define i64 @shl_i64(i64 %a, i64 %b) { define i64 @ashr_i64(i64 %a, i64 %b) { ; PTX requires 32-bit shift amount -; CHECK: shr.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: shr.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %ret = ashr i64 %a, %b ret i64 %ret @@ -96,7 +96,7 @@ define i64 @ashr_i64(i64 %a, i64 %b) { define i64 @lshr_i64(i64 %a, i64 %b) { ; PTX requires 32-bit shift amount -; CHECK: shr.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: shr.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %ret = lshr i64 %a, %b ret i64 %ret diff --git a/test/CodeGen/NVPTX/atomics.ll b/test/CodeGen/NVPTX/atomics.ll index 10ab73d..daadb6e 100644 --- a/test/CodeGen/NVPTX/atomics.ll +++ b/test/CodeGen/NVPTX/atomics.ll @@ -1,21 +1,21 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; CHECK: atom0 +; CHECK-LABEL: atom0 define i32 @atom0(i32* %addr, i32 %val) { ; CHECK: atom.add.u32 %ret = atomicrmw add i32* %addr, i32 %val seq_cst ret i32 %ret } -; CHECK: atom1 +; CHECK-LABEL: atom1 define i64 @atom1(i64* %addr, i64 %val) { ; CHECK: atom.add.u64 %ret = atomicrmw add i64* %addr, i64 %val seq_cst ret i64 %ret } -; CHECK: atom2 +; CHECK-LABEL: atom2 define i32 @atom2(i32* %subr, i32 %val) { ; CHECK: neg.s32 ; CHECK: atom.add.u32 @@ -23,7 +23,7 @@ define i32 @atom2(i32* %subr, i32 %val) { ret i32 %ret } -; CHECK: atom3 +; CHECK-LABEL: atom3 define i64 @atom3(i64* %subr, i64 %val) { ; CHECK: neg.s64 ; CHECK: atom.add.u64 @@ -31,14 +31,14 @@ define i64 @atom3(i64* %subr, i64 %val) { ret i64 %ret } -; CHECK: atom4 +; CHECK-LABEL: atom4 define i32 @atom4(i32* %subr, i32 %val) { ; CHECK: atom.and.b32 %ret = atomicrmw and i32* %subr, i32 %val seq_cst ret i32 %ret } -; CHECK: atom5 +; CHECK-LABEL: atom5 define i64 @atom5(i64* %subr, i64 %val) { ; CHECK: 
atom.and.b64 %ret = atomicrmw and i64* %subr, i64 %val seq_cst @@ -56,86 +56,127 @@ define i64 @atom5(i64* %subr, i64 %val) { ; ret i64 %ret ;} -; CHECK: atom8 +; CHECK-LABEL: atom8 define i32 @atom8(i32* %subr, i32 %val) { ; CHECK: atom.or.b32 %ret = atomicrmw or i32* %subr, i32 %val seq_cst ret i32 %ret } -; CHECK: atom9 +; CHECK-LABEL: atom9 define i64 @atom9(i64* %subr, i64 %val) { ; CHECK: atom.or.b64 %ret = atomicrmw or i64* %subr, i64 %val seq_cst ret i64 %ret } -; CHECK: atom10 +; CHECK-LABEL: atom10 define i32 @atom10(i32* %subr, i32 %val) { ; CHECK: atom.xor.b32 %ret = atomicrmw xor i32* %subr, i32 %val seq_cst ret i32 %ret } -; CHECK: atom11 +; CHECK-LABEL: atom11 define i64 @atom11(i64* %subr, i64 %val) { ; CHECK: atom.xor.b64 %ret = atomicrmw xor i64* %subr, i64 %val seq_cst ret i64 %ret } -; CHECK: atom12 +; CHECK-LABEL: atom12 define i32 @atom12(i32* %subr, i32 %val) { ; CHECK: atom.max.s32 %ret = atomicrmw max i32* %subr, i32 %val seq_cst ret i32 %ret } -; CHECK: atom13 +; CHECK-LABEL: atom13 define i64 @atom13(i64* %subr, i64 %val) { ; CHECK: atom.max.s64 %ret = atomicrmw max i64* %subr, i64 %val seq_cst ret i64 %ret } -; CHECK: atom14 +; CHECK-LABEL: atom14 define i32 @atom14(i32* %subr, i32 %val) { ; CHECK: atom.min.s32 %ret = atomicrmw min i32* %subr, i32 %val seq_cst ret i32 %ret } -; CHECK: atom15 +; CHECK-LABEL: atom15 define i64 @atom15(i64* %subr, i64 %val) { ; CHECK: atom.min.s64 %ret = atomicrmw min i64* %subr, i64 %val seq_cst ret i64 %ret } -; CHECK: atom16 +; CHECK-LABEL: atom16 define i32 @atom16(i32* %subr, i32 %val) { ; CHECK: atom.max.u32 %ret = atomicrmw umax i32* %subr, i32 %val seq_cst ret i32 %ret } -; CHECK: atom17 +; CHECK-LABEL: atom17 define i64 @atom17(i64* %subr, i64 %val) { ; CHECK: atom.max.u64 %ret = atomicrmw umax i64* %subr, i64 %val seq_cst ret i64 %ret } -; CHECK: atom18 +; CHECK-LABEL: atom18 define i32 @atom18(i32* %subr, i32 %val) { ; CHECK: atom.min.u32 %ret = atomicrmw umin i32* %subr, i32 %val seq_cst ret i32 
%ret } -; CHECK: atom19 +; CHECK-LABEL: atom19 define i64 @atom19(i64* %subr, i64 %val) { ; CHECK: atom.min.u64 %ret = atomicrmw umin i64* %subr, i64 %val seq_cst ret i64 %ret } + +declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val) + +; CHECK-LABEL: atomic_add_f32_generic +define float @atomic_add_f32_generic(float* %addr, float %val) { +; CHECK: atom.add.f32 + %ret = call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val) + ret float %ret +} + +declare float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val) + +; CHECK-LABEL: atomic_add_f32_addrspace1 +define float @atomic_add_f32_addrspace1(float addrspace(1)* %addr, float %val) { +; CHECK: atom.global.add.f32 + %ret = call float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val) + ret float %ret +} + +declare float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val) + +; CHECK-LABEL: atomic_add_f32_addrspace3 +define float @atomic_add_f32_addrspace3(float addrspace(3)* %addr, float %val) { +; CHECK: atom.shared.add.f32 + %ret = call float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val) + ret float %ret +} + +; CHECK-LABEL: atomic_cmpxchg_i32 +define i32 @atomic_cmpxchg_i32(i32* %addr, i32 %cmp, i32 %new) { +; CHECK: atom.cas.b32 + %pairold = cmpxchg i32* %addr, i32 %cmp, i32 %new seq_cst seq_cst + ret i32 %new +} + +; CHECK-LABEL: atomic_cmpxchg_i64 +define i64 @atomic_cmpxchg_i64(i64* %addr, i64 %cmp, i64 %new) { +; CHECK: atom.cas.b64 + %pairold = cmpxchg i64* %addr, i64 %cmp, i64 %new seq_cst seq_cst + ret i64 %new +} diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll new file mode 100644 index 0000000..157b28c --- /dev/null +++ b/test/CodeGen/NVPTX/bug21465.ll @@ -0,0 +1,24 @@ +; RUN: opt < %s -nvptx-lower-struct-args -S | FileCheck %s + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-unknown-unknown" + +%struct.S = type { i32, i32 } + +; Function Attrs: nounwind +define void @_Z11TakesStruct1SPi(%struct.S* byval nocapture readonly %input, i32* nocapture %output) #0 { +entry: +; CHECK-LABEL @_Z22TakesStruct1SPi +; CHECK: bitcast %struct.S* %input to i8* +; CHECK: call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8 + %b = getelementptr inbounds %struct.S* %input, i64 0, i32 1 + %0 = load i32* %b, align 4 + store i32 %0, i32* %output, align 4 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!nvvm.annotations = !{!0} + +!0 = metadata !{void (%struct.S*, i32*)* @_Z11TakesStruct1SPi, metadata !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index 28dfa46..83d4916 100644 --- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -20,11 +20,11 @@ entry: %buf = alloca [16 x i8], align 4 ; CHECK: .local .align 4 .b8 __local_depot0[16] -; CHECK: mov.u64 %rl[[BUF_REG:[0-9]+]] -; CHECK: cvta.local.u64 %SP, %rl[[BUF_REG]] +; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]] +; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]] -; CHECK: ld.param.u64 %rl[[A_REG:[0-9]+]], [kernel_func_param_0] -; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rl[[A_REG]]] +; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0] +; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]] ; CHECK: st.f32 [%SP+0], %f[[A0_REG]] %0 = load float* %a, align 4 @@ -46,11 +46,11 @@ entry: %7 = bitcast i8* %arrayidx7 to float* store float %6, float* %7, align 4 -; CHECK: add.u64 %rl[[SP_REG:[0-9]+]], %SP, 0 +; CHECK: add.u64 
%rd[[SP_REG:[0-9]+]], %SP, 0 ; CHECK: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0+0], %rl[[A_REG]] +; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A_REG]] ; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1+0], %rl[[SP_REG]] +; CHECK-NEXT: st.param.b64 [param1+0], %rd[[SP_REG]] ; CHECK-NEXT: call.uni ; CHECK-NEXT: callee, diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll index c595f21..e4e0601 100644 --- a/test/CodeGen/NVPTX/compare-int.ll +++ b/test/CodeGen/NVPTX/compare-int.ll @@ -9,8 +9,8 @@ ;;; i64 define i64 @icmp_eq_i64(i64 %a, i64 %b) { -; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp eq i64 %a, %b %ret = zext i1 %cmp to i64 @@ -18,8 +18,8 @@ define i64 @icmp_eq_i64(i64 %a, i64 %b) { } define i64 @icmp_ne_i64(i64 %a, i64 %b) { -; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ne i64 %a, %b %ret = zext i1 %cmp to i64 @@ -27,8 +27,8 @@ define i64 @icmp_ne_i64(i64 %a, i64 %b) { } define i64 @icmp_ugt_i64(i64 %a, i64 %b) { -; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ugt i64 %a, %b %ret = zext i1 %cmp to i64 @@ -36,8 +36,8 @@ define i64 @icmp_ugt_i64(i64 %a, i64 %b) { } define i64 @icmp_uge_i64(i64 %a, i64 %b) { -; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, 
%rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp uge i64 %a, %b %ret = zext i1 %cmp to i64 @@ -45,8 +45,8 @@ define i64 @icmp_uge_i64(i64 %a, i64 %b) { } define i64 @icmp_ult_i64(i64 %a, i64 %b) { -; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ult i64 %a, %b %ret = zext i1 %cmp to i64 @@ -54,8 +54,8 @@ define i64 @icmp_ult_i64(i64 %a, i64 %b) { } define i64 @icmp_ule_i64(i64 %a, i64 %b) { -; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ule i64 %a, %b %ret = zext i1 %cmp to i64 @@ -63,8 +63,8 @@ define i64 @icmp_ule_i64(i64 %a, i64 %b) { } define i64 @icmp_sgt_i64(i64 %a, i64 %b) { -; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sgt i64 %a, %b %ret = zext i1 %cmp to i64 @@ -72,8 +72,8 @@ define i64 @icmp_sgt_i64(i64 %a, i64 %b) { } define i64 @icmp_sge_i64(i64 %a, i64 %b) { -; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sge i64 %a, %b %ret = zext i1 %cmp to i64 @@ -81,8 +81,8 @@ define i64 @icmp_sge_i64(i64 %a, i64 %b) { } define i64 @icmp_slt_i64(i64 %a, i64 %b) { -; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: 
setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp slt i64 %a, %b %ret = zext i1 %cmp to i64 @@ -90,8 +90,8 @@ define i64 @icmp_slt_i64(i64 %a, i64 %b) { } define i64 @icmp_sle_i64(i64 %a, i64 %b) { -; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}} -; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} +; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sle i64 %a, %b %ret = zext i1 %cmp to i64 diff --git a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll index 1882121..4b5446e 100644 --- a/test/CodeGen/NVPTX/convert-fp.ll +++ b/test/CodeGen/NVPTX/convert-fp.ll @@ -10,7 +10,7 @@ define i16 @cvt_i16_f32(float %x) { } define i16 @cvt_i16_f64(double %x) { -; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i16 ret i16 %a @@ -24,7 +24,7 @@ define i32 @cvt_i32_f32(float %x) { } define i32 @cvt_i32_f64(double %x) { -; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i32 ret i32 %a @@ -32,14 +32,14 @@ define i32 @cvt_i32_f64(double %x) { define i64 @cvt_i64_f32(float %x) { -; CHECK: cvt.rzi.u64.f32 %rl{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %a = fptoui float %x to i64 ret i64 %a } define i64 @cvt_i64_f64(double %x) { -; CHECK: cvt.rzi.u64.f64 %rl{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptoui double %x to i64 ret i64 %a @@ -60,14 +60,14 @@ define float @cvt_f32_i32(i32 %x) { } define float @cvt_f32_i64(i64 %x) { -; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rl{{[0-9]+}}; +; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rd{{[0-9]+}}; ; CHECK: ret; %a = uitofp i64 %x to float ret 
float %a } define float @cvt_f32_f64(double %x) { -; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fptrunc double %x to float ret float %a @@ -88,56 +88,56 @@ define float @cvt_f32_s32(i32 %x) { } define float @cvt_f32_s64(i64 %x) { -; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %a = sitofp i64 %x to float ret float %a } define double @cvt_f64_i16(i16 %x) { -; CHECK: cvt.rn.f64.u16 %fl{{[0-9]+}}, %rs{{[0-9]+}}; +; CHECK: cvt.rn.f64.u16 %fd{{[0-9]+}}, %rs{{[0-9]+}}; ; CHECK: ret; %a = uitofp i16 %x to double ret double %a } define double @cvt_f64_i32(i32 %x) { -; CHECK: cvt.rn.f64.u32 %fl{{[0-9]+}}, %r{{[0-9]+}}; +; CHECK: cvt.rn.f64.u32 %fd{{[0-9]+}}, %r{{[0-9]+}}; ; CHECK: ret; %a = uitofp i32 %x to double ret double %a } define double @cvt_f64_i64(i64 %x) { -; CHECK: cvt.rn.f64.u64 %fl{{[0-9]+}}, %rl{{[0-9]+}}; +; CHECK: cvt.rn.f64.u64 %fd{{[0-9]+}}, %rd{{[0-9]+}}; ; CHECK: ret; %a = uitofp i64 %x to double ret double %a } define double @cvt_f64_f32(float %x) { -; CHECK: cvt.f64.f32 %fl{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: cvt.f64.f32 %fd{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %a = fpext float %x to double ret double %a } define double @cvt_f64_s16(i16 %x) { -; CHECK: cvt.rn.f64.s16 %fl{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: cvt.rn.f64.s16 %fd{{[0-9]+}}, %rs{{[0-9]+}} ; CHECK: ret %a = sitofp i16 %x to double ret double %a } define double @cvt_f64_s32(i32 %x) { -; CHECK: cvt.rn.f64.s32 %fl{{[0-9]+}}, %r{{[0-9]+}} +; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}} ; CHECK: ret %a = sitofp i32 %x to double ret double %a } define double @cvt_f64_s64(i64 %x) { -; CHECK: cvt.rn.f64.s64 %fl{{[0-9]+}}, %rl{{[0-9]+}} +; CHECK: cvt.rn.f64.s64 %fd{{[0-9]+}}, %rd{{[0-9]+}} ; CHECK: ret %a = sitofp i64 %x to double ret double %a diff --git a/test/CodeGen/NVPTX/convert-int-sm20.ll b/test/CodeGen/NVPTX/convert-int-sm20.ll index 
227cd31..57a2316 100644 --- a/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -48,16 +48,16 @@ define i32 @cvt_i32_i64(i64 %x) { ; i64 define i64 @cvt_i64_i16(i16 %x) { -; CHECK: ld.param.u16 %rl[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}] -; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]] +; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}] +; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]] ; CHECK: ret %a = zext i16 %x to i64 ret i64 %a } define i64 @cvt_i64_i32(i32 %x) { -; CHECK: ld.param.u32 %rl[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}] -; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]] +; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}] +; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]] ; CHECK: ret %a = zext i32 %x to i64 ret i64 %a diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll index 4ef1a9a..14b5c45 100644 --- a/test/CodeGen/NVPTX/fma.ll +++ b/test/CodeGen/NVPTX/fma.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s define ptx_device float @t1_f32(float %x, float %y, float %z) { ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; @@ -9,7 +9,7 @@ define ptx_device float @t1_f32(float %x, float %y, float %z) { } define ptx_device double @t1_f64(double %x, double %y, double %z) { -; CHECK: fma.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}; +; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %a = fmul double %x, %y %b = fadd double %a, %z diff --git a/test/CodeGen/NVPTX/fp-contract.ll b/test/CodeGen/NVPTX/fp-contract.ll new file mode 100644 index 0000000..3f68b18 --- /dev/null +++ b/test/CodeGen/NVPTX/fp-contract.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST +; RUN: llc < 
%s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT + +target triple = "nvptx64-unknown-cuda" + +;; Make sure we are generating proper instruction sequences for fused ops +;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit +;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas +;; is free to fuse with a multiply if it is able. If fusion is not allowed, +;; we do not form fma.rn at the PTX level and explicitly generate add.rn +;; for all adds to prevent ptxas from fusion the ops. + +;; FAST-LABEL: @t0 +;; DEFAULT-LABEL: @t0 +define float @t0(float %a, float %b, float %c) { +;; FAST: fma.rn.f32 +;; DEFAULT: mul.rn.f32 +;; DEFAULT: add.rn.f32 + %v0 = fmul float %a, %b + %v1 = fadd float %v0, %c + ret float %v1 +} + +;; FAST-LABEL: @t1 +;; DEFAULT-LABEL: @t1 +define float @t1(float %a, float %b) { +;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32 +;; to prevent ptxas from fusing this with anything else. +;; FAST: add.f32 +;; DEFAULT: add.rn.f32 + %v1 = fadd float %a, %b + ret float %v1 +} diff --git a/test/CodeGen/NVPTX/fp-literals.ll b/test/CodeGen/NVPTX/fp-literals.ll index 0cc2413..755e0f9 100644 --- a/test/CodeGen/NVPTX/fp-literals.ll +++ b/test/CodeGen/NVPTX/fp-literals.ll @@ -1,4 +1,7 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s + +target triple = "nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" ; Make sure we can properly differentiate between single-precision and ; double-precision FP literals. 
@@ -11,7 +14,7 @@ define float @myaddf(float %a) { } ; CHECK: myaddd -; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, 0d3FF0000000000000 +; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, 0d3FF0000000000000 define double @myaddd(double %a) { %ret = fadd double %a, 1.0 ret double %ret diff --git a/test/CodeGen/NVPTX/fp16.ll b/test/CodeGen/NVPTX/fp16.ll new file mode 100644 index 0000000..8770399 --- /dev/null +++ b/test/CodeGen/NVPTX/fp16.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone +declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone + +; CHECK-LABEL: @test_convert_fp16_to_fp32 +; CHECK: cvt.f32.f16 +define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16 addrspace(1)* %in, align 2 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; CHECK-LABEL: @test_convert_fp16_to_fp64 +; CHECK: cvt.f64.f16 +define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16 addrspace(1)* %in, align 2 + %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone + store double %cvt, double addrspace(1)* %out, align 4 + ret void +} + + +; CHECK-LABEL: @test_convert_fp32_to_fp16 +; CHECK: cvt.rn.f16.f32 +define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float addrspace(1)* %in, align 2 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 4 + ret void +} + + +; CHECK-LABEL: @test_convert_fp64_to_fp16 +; CHECK: cvt.rn.f16.f64 +define void 
@test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { + %val = load double addrspace(1)* %in, align 2 + %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll new file mode 100644 index 0000000..aa08cc7 --- /dev/null +++ b/test/CodeGen/NVPTX/half.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -march=nvptx | FileCheck %s + +define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: @test_load_store +; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] + %val = load half addrspace(1)* %in + store half %val, half addrspace(1) * %out + ret void +} + +define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) { +; CHECK-LABEL: @test_bitcast_from_half +; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] + %val = load half addrspace(1) * %in + %val_int = bitcast half %val to i16 + store i16 %val_int, i16 addrspace(1)* %out + ret void +} + +define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) { +; CHECK-LABEL: @test_bitcast_to_half +; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}] +; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]] + %val = load i16 addrspace(1)* %in + %val_fp = bitcast i16 %val to half + store half %val_fp, half addrspace(1)* %out + ret void +} + +define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) { +; CHECK-LABEL: @test_extend32 +; CHECK: cvt.f32.f16 + + %val16 = load half addrspace(1)* %in + %val32 = fpext half %val16 to float + store float %val32, float addrspace(1)* %out + ret void +} + +define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) { +; CHECK-LABEL: @test_extend64 +; CHECK: cvt.f64.f16 + + %val16 = load half 
addrspace(1)* %in + %val64 = fpext half %val16 to double + store double %val64, double addrspace(1)* %out + ret void +} + +define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: test_trunc32 +; CHECK: cvt.rn.f16.f32 + + %val32 = load float addrspace(1)* %in + %val16 = fptrunc float %val32 to half + store half %val16, half addrspace(1)* %out + ret void +} + +define void @test_trunc64(double addrspace(1)* %in, half addrspace(1)* %out) { +; CHECK-LABEL: @test_trunc64 +; CHECK: cvt.rn.f16.f64 + + %val32 = load double addrspace(1)* %in + %val16 = fptrunc double %val32 to half + store half %val16, half addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll index 06d3d56..2d2c6e5 100644 --- a/test/CodeGen/NVPTX/implicit-def.ll +++ b/test/CodeGen/NVPTX/implicit-def.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s ; CHECK: // implicit-def: %f[[F0:[0-9]+]] -; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]]; +; CHECK: add.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]]; define float @foo(float %a) { %ret = fadd float %a, undef ret float %ret diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll index af91bb4..3c51776 100644 --- a/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/test/CodeGen/NVPTX/intrinsic-old.ll @@ -198,7 +198,7 @@ define ptx_device i32 @test_clock() { } define ptx_device i64 @test_clock64() { -; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64; +; CHECK: mov.u64 %rd{{[0-9]+}}, %clock64; ; CHECK: ret; %x = call i64 @llvm.ptx.read.clock64() ret i64 %x diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll index 78e1e77..34b671d 100644 --- a/test/CodeGen/NVPTX/intrinsics.ll +++ b/test/CodeGen/NVPTX/intrinsics.ll @@ -9,7 +9,7 @@ define ptx_device float @test_fabsf(float %f) { } define ptx_device double @test_fabs(double %d) { -; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}; +; 
CHECK: abs.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}; ; CHECK: ret; %x = call double @llvm.fabs.f64(double %d) ret double %x diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll index 133ef09..f33659c 100644 --- a/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/test/CodeGen/NVPTX/ld-addrspace.ll @@ -6,7 +6,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) { ; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(1)* %ptr ret i8 %a @@ -15,7 +15,7 @@ define i8 @ld_global_i8(i8 addrspace(1)* %ptr) { define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) { ; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(3)* %ptr ret i8 %a @@ -24,7 +24,7 @@ define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) { define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { ; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(5)* %ptr ret i8 %a @@ -34,7 +34,7 @@ define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { define i16 @ld_global_i16(i16 addrspace(1)* %ptr) { ; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(1)* %ptr ret i16 %a @@ -43,7 +43,7 @@ define i16 @ld_global_i16(i16 addrspace(1)* %ptr) { define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) { ; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(3)* %ptr ret i16 %a @@ -52,7 +52,7 @@ 
define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) { define i16 @ld_local_i16(i16 addrspace(5)* %ptr) { ; PTX32: ld.local.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(5)* %ptr ret i16 %a @@ -62,7 +62,7 @@ define i16 @ld_local_i16(i16 addrspace(5)* %ptr) { define i32 @ld_global_i32(i32 addrspace(1)* %ptr) { ; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(1)* %ptr ret i32 %a @@ -71,7 +71,7 @@ define i32 @ld_global_i32(i32 addrspace(1)* %ptr) { define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) { ; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(3)* %ptr ret i32 %a @@ -80,7 +80,7 @@ define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) { define i32 @ld_local_i32(i32 addrspace(5)* %ptr) { ; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(5)* %ptr ret i32 %a @@ -88,27 +88,27 @@ define i32 @ld_local_i32(i32 addrspace(5)* %ptr) { ;; i64 define i64 @ld_global_i64(i64 addrspace(1)* %ptr) { -; PTX32: ld.global.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(1)* %ptr ret i64 %a } define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) { -; PTX32: ld.shared.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.shared.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u64 
%rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(3)* %ptr ret i64 %a } define i64 @ld_local_i64(i64 addrspace(5)* %ptr) { -; PTX32: ld.local.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.local.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(5)* %ptr ret i64 %a @@ -118,7 +118,7 @@ define i64 @ld_local_i64(i64 addrspace(5)* %ptr) { define float @ld_global_f32(float addrspace(1)* %ptr) { ; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(1)* %ptr ret float %a @@ -127,7 +127,7 @@ define float @ld_global_f32(float addrspace(1)* %ptr) { define float @ld_shared_f32(float addrspace(3)* %ptr) { ; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(3)* %ptr ret float %a @@ -136,7 +136,7 @@ define float @ld_shared_f32(float addrspace(3)* %ptr) { define float @ld_local_f32(float addrspace(5)* %ptr) { ; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(5)* %ptr ret float %a @@ -144,27 +144,27 @@ define float @ld_local_f32(float addrspace(5)* %ptr) { ;; f64 define double @ld_global_f64(double addrspace(1)* %ptr) { -; PTX32: ld.global.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(1)* %ptr ret 
double %a } define double @ld_shared_f64(double addrspace(3)* %ptr) { -; PTX32: ld.shared.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.shared.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(3)* %ptr ret double %a } define double @ld_local_f64(double addrspace(5)* %ptr) { -; PTX32: ld.local.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.local.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(5)* %ptr ret double %a diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll index 3728268..d629e0e 100644 --- a/test/CodeGen/NVPTX/ld-generic.ll +++ b/test/CodeGen/NVPTX/ld-generic.ll @@ -6,7 +6,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) { ; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(0)* %ptr ret i8 %a @@ -16,7 +16,7 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) { define i16 @ld_global_i16(i16 addrspace(0)* %ptr) { ; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(0)* %ptr ret i16 %a @@ -26,7 +26,7 @@ define i16 @ld_global_i16(i16 addrspace(0)* %ptr) { define i32 @ld_global_i32(i32 addrspace(0)* %ptr) { ; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32 addrspace(0)* %ptr ret i32 %a @@ -34,9 +34,9 @@ define i32 @ld_global_i32(i32 addrspace(0)* %ptr) { ;; i64 define i64 @ld_global_i64(i64 addrspace(0)* %ptr) { -; PTX32: ld.u64 %rl{{[0-9]+}}, 
[%r{{[0-9]+}}] +; PTX32: ld.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64 addrspace(0)* %ptr ret i64 %a @@ -46,7 +46,7 @@ define i64 @ld_global_i64(i64 addrspace(0)* %ptr) { define float @ld_global_f32(float addrspace(0)* %ptr) { ; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float addrspace(0)* %ptr ret float %a @@ -54,9 +54,9 @@ define float @ld_global_f32(float addrspace(0)* %ptr) { ;; f64 define double @ld_global_f64(double addrspace(0)* %ptr) { -; PTX32: ld.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double addrspace(0)* %ptr ret double %a diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll index 9cc6675..36c99b3 100644 --- a/test/CodeGen/NVPTX/ldu-i8.ll +++ b/test/CodeGen/NVPTX/ldu-i8.ll @@ -2,15 +2,13 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*) +declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*, i32) define i8 @foo(i8* %a) { ; Ensure we properly truncate off the high-order 24 bits ; CHECK: ldu.global.u8 ; CHECK: cvt.u32.u16 ; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255 - %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a), !align !0 + %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a, i32 4) ret i8 %val } - -!0 = metadata !{i32 4} diff --git a/test/CodeGen/NVPTX/ldu-ldg.ll b/test/CodeGen/NVPTX/ldu-ldg.ll index 3b0619f..4bfd68c 100644 --- a/test/CodeGen/NVPTX/ldu-ldg.ll +++ b/test/CodeGen/NVPTX/ldu-ldg.ll @@ -1,40 +1,36 @@ ; RUN: llc < %s -march=nvptx 
-mcpu=sm_20 | FileCheck %s -declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr) -declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr) -declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr) -declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr) +declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align) +declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align) +declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align) +declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align) ; CHECK: func0 define i8 @func0(i8 addrspace(1)* %ptr) { ; ldu.global.u8 - %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0 + %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4) ret i8 %val } ; CHECK: func1 define i32 @func1(i32 addrspace(1)* %ptr) { ; ldu.global.u32 - %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0 + %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4) ret i32 %val } ; CHECK: func2 define i8 @func2(i8 addrspace(1)* %ptr) { ; ld.global.nc.u8 - %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0 + %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4) ret i8 %val } ; CHECK: func3 define i32 @func3(i32 addrspace(1)* %ptr) { ; ld.global.nc.u32 - %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0 + %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4) ret i32 %val } - - - -!0 = metadata !{i32 4} diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll index 55707ea..fd35a75 100644 --- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll +++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll @@ -7,15 +7,13 @@ define void @reg_plus_offset(i32* 
%a) { ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32]; ; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36]; %p2 = getelementptr i32* %a, i32 8 - %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2), !align !1 + %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2, i32 4) %p3 = getelementptr i32* %a, i32 9 - %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3), !align !1 + %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3, i32 4) %t3 = mul i32 %t1, %t2 store i32 %t3, i32* %a ret void } -!1 = metadata !{ i32 4 } - -declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*) +declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*, i32) declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll index c0d7d1c..377eee9 100644 --- a/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/test/CodeGen/NVPTX/local-stack-frame.ll @@ -7,8 +7,8 @@ ; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}}; ; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0]; ; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}}; -; PTX64: mov.u64 %rl{{[0-9]+}}, __local_depot{{[0-9]+}}; -; PTX64: cvta.local.u64 %SP, %rl{{[0-9]+}}; +; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}}; +; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}}; ; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0]; ; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}}; define void @foo(i32 %a) { diff --git a/test/CodeGen/NVPTX/machine-sink.ll b/test/CodeGen/NVPTX/machine-sink.ll new file mode 100644 index 0000000..3614bea --- /dev/null +++ b/test/CodeGen/NVPTX/machine-sink.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +@scalar1 = internal addrspace(3) global float 0.000000e+00, align 4 +@scalar2 = internal addrspace(3) global float 0.000000e+00, align 4 + +; We shouldn't sink 
mul.rn.f32 to BB %merge because BB %merge post-dominates +; BB %entry. Over-sinking created more register pressure on this example. The +; backend would sink the fmuls to BB %merge, but not the loads for being +; conservative on sinking memory accesses. As a result, the loads and +; the two fmuls would be separated to two basic blocks, causing two +; cross-BB live ranges. +define float @post_dominate(float %x, i1 %cond) { +; CHECK-LABEL: post_dominate( +entry: + %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4 + %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4 +; CHECK: ld.shared.f32 +; CHECK: ld.shared.f32 + %2 = fmul float %0, %0 + %3 = fmul float %1, %2 +; CHECK-NOT: bra +; CHECK: mul.rn.f32 +; CHECK: mul.rn.f32 + br i1 %cond, label %then, label %merge + +then: + %z = fadd float %x, %x + br label %then2 + +then2: + %z2 = fadd float %z, %z + br label %merge + +merge: + %y = phi float [ 0.0, %entry ], [ %z2, %then2 ] + %w = fadd float %y, %3 + ret float %w +} diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll new file mode 100644 index 0000000..90c9c43 --- /dev/null +++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; CHECK-LABEL: t1 +define <4 x float> @t1(i8* %p1) { +; CHECK-NOT: ld.v4 +; CHECK-NOT: ld.v2 +; CHECK-NOT: ld.f32 +; CHECK: ld.u8 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 1 + ret <4 x float> %r +} + +; CHECK-LABEL: t2 +define <4 x float> @t2(i8* %p1) { +; CHECK-NOT: ld.v4 +; CHECK-NOT: ld.v2 +; CHECK: ld.f32 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 4 + ret <4 x float> %r +} + +; CHECK-LABEL: 
t3 +define <4 x float> @t3(i8* %p1) { +; CHECK-NOT: ld.v4 +; CHECK: ld.v2 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 8 + ret <4 x float> %r +} + +; CHECK-LABEL: t4 +define <4 x float> @t4(i8* %p1) { +; CHECK: ld.v4 + %cast = bitcast i8* %p1 to <4 x float>* + %r = load <4 x float>* %cast, align 16 + ret <4 x float> %r +} + + +; CHECK-LABEL: s1 +define void @s1(<4 x float>* %p1, <4 x float> %v) { +; CHECK-NOT: st.v4 +; CHECK-NOT: st.v2 +; CHECK-NOT: st.f32 +; CHECK: st.u8 + store <4 x float> %v, <4 x float>* %p1, align 1 + ret void +} + +; CHECK-LABEL: s2 +define void @s2(<4 x float>* %p1, <4 x float> %v) { +; CHECK-NOT: st.v4 +; CHECK-NOT: st.v2 +; CHECK: st.f32 + store <4 x float> %v, <4 x float>* %p1, align 4 + ret void +} + +; CHECK-LABEL: s3 +define void @s3(<4 x float>* %p1, <4 x float> %v) { +; CHECK-NOT: st.v4 + store <4 x float> %v, <4 x float>* %p1, align 8 + ret void +} + +; CHECK-LABEL: s4 +define void @s4(<4 x float>* %p1, <4 x float> %v) { +; CHECK: st.v4 + store <4 x float> %v, <4 x float>* %p1, align 16 + ret void +} + diff --git a/test/CodeGen/NVPTX/mulwide.ll b/test/CodeGen/NVPTX/mulwide.ll index 927946c..1ddf973 100644 --- a/test/CodeGen/NVPTX/mulwide.ll +++ b/test/CodeGen/NVPTX/mulwide.ll @@ -1,37 +1,90 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT -; CHECK: mulwide16 +; OPT-LABEL: @mulwide16 +; NOOPT-LABEL: @mulwide16 define i32 @mulwide16(i16 %a, i16 %b) { -; CHECK: mul.wide.s16 +; OPT: mul.wide.s16 +; NOOPT: mul.lo.s32 %val0 = sext i16 %a to i32 %val1 = sext i16 %b to i32 %val2 = mul i32 %val0, %val1 ret i32 %val2 } -; CHECK: mulwideu16 +; OPT-LABEL: @mulwideu16 +; NOOPT-LABEL: @mulwideu16 define i32 @mulwideu16(i16 %a, i16 %b) { -; CHECK: mul.wide.u16 +; OPT: mul.wide.u16 +; NOOPT: mul.lo.s32 %val0 = zext i16 %a to i32 %val1 = zext 
i16 %b to i32 %val2 = mul i32 %val0, %val1 ret i32 %val2 } -; CHECK: mulwide32 +; OPT-LABEL: @mulwide8 +; NOOPT-LABEL: @mulwide8 +define i32 @mulwide8(i8 %a, i8 %b) { +; OPT: mul.wide.s16 +; NOOPT: mul.lo.s32 + %val0 = sext i8 %a to i32 + %val1 = sext i8 %b to i32 + %val2 = mul i32 %val0, %val1 + ret i32 %val2 +} + +; OPT-LABEL: @mulwideu8 +; NOOPT-LABEL: @mulwideu8 +define i32 @mulwideu8(i8 %a, i8 %b) { +; OPT: mul.wide.u16 +; NOOPT: mul.lo.s32 + %val0 = zext i8 %a to i32 + %val1 = zext i8 %b to i32 + %val2 = mul i32 %val0, %val1 + ret i32 %val2 +} + +; OPT-LABEL: @mulwide32 +; NOOPT-LABEL: @mulwide32 define i64 @mulwide32(i32 %a, i32 %b) { -; CHECK: mul.wide.s32 +; OPT: mul.wide.s32 +; NOOPT: mul.lo.s64 %val0 = sext i32 %a to i64 %val1 = sext i32 %b to i64 %val2 = mul i64 %val0, %val1 ret i64 %val2 } -; CHECK: mulwideu32 +; OPT-LABEL: @mulwideu32 +; NOOPT-LABEL: @mulwideu32 define i64 @mulwideu32(i32 %a, i32 %b) { -; CHECK: mul.wide.u32 +; OPT: mul.wide.u32 +; NOOPT: mul.lo.s64 %val0 = zext i32 %a to i64 %val1 = zext i32 %b to i64 %val2 = mul i64 %val0, %val1 ret i64 %val2 } + +; OPT-LABEL: @mulwideu7 +; NOOPT-LABEL: @mulwideu7 +define i64 @mulwideu7(i7 %a, i7 %b) { +; OPT: mul.wide.u32 +; NOOPT: mul.lo.s64 + %val0 = zext i7 %a to i64 + %val1 = zext i7 %b to i64 + %val2 = mul i64 %val0, %val1 + ret i64 %val2 +} + +; OPT-LABEL: @mulwides7 +; NOOPT-LABEL: @mulwides7 +define i64 @mulwides7(i7 %a, i7 %b) { +; OPT: mul.wide.s32 +; NOOPT: mul.lo.s64 + %val0 = sext i7 %a to i64 + %val1 = sext i7 %b to i64 + %val2 = mul i64 %val0, %val1 + ret i64 %val2 +} diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll index e7a81be..cc67a6f 100644 --- a/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -5,7 +5,7 @@ define ptx_kernel void @t1(i1* %a) { ; PTX32: mov.u16 %rs{{[0-9]+}}, 0; ; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; ; PTX64: mov.u16 %rs{{[0-9]+}}, 0; -; PTX64-NEXT: st.u8 
[%rl{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX64-NEXT: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}; store i1 false, i1* %a ret void } @@ -15,7 +15,7 @@ define ptx_kernel void @t2(i1* %a, i8* %b) { ; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; -; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll index 68c09fe..34a83f3 100644 --- a/test/CodeGen/NVPTX/st-addrspace.ll +++ b/test/CodeGen/NVPTX/st-addrspace.ll @@ -7,7 +7,7 @@ define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) { ; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(1)* %ptr ret void @@ -16,7 +16,7 @@ define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) { define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) { ; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(3)* %ptr ret void @@ -25,7 +25,7 @@ define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) { define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) { ; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(5)* %ptr ret void @@ -36,7 +36,7 @@ define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) { define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) { ; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u16 [%rl{{[0-9]+}}], 
%rs{{[0-9]+}} +; PTX64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(1)* %ptr ret void @@ -45,7 +45,7 @@ define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) { define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) { ; PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(3)* %ptr ret void @@ -54,7 +54,7 @@ define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) { define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) { ; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(5)* %ptr ret void @@ -65,7 +65,7 @@ define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) { define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) { ; PTX32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(1)* %ptr ret void @@ -74,7 +74,7 @@ define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) { define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) { ; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(3)* %ptr ret void @@ -83,7 +83,7 @@ define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) { define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) { ; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(5)* %ptr ret void @@ -92,27 
+92,27 @@ define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) { ;; i64 define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) { -; PTX32: st.global.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(1)* %ptr ret void } define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) { -; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(3)* %ptr ret void } define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) { -; PTX32: st.local.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(5)* %ptr ret void @@ -123,7 +123,7 @@ define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) { define void @st_global_f32(float addrspace(1)* %ptr, float %a) { ; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(1)* %ptr ret void @@ -132,7 +132,7 @@ define void @st_global_f32(float addrspace(1)* %ptr, float %a) { define void @st_shared_f32(float addrspace(3)* %ptr, float %a) { ; PTX32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(3)* %ptr ret void @@ -141,7 +141,7 @@ define void @st_shared_f32(float addrspace(3)* %ptr, float %a) { define void 
@st_local_f32(float addrspace(5)* %ptr, float %a) { ; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(5)* %ptr ret void @@ -150,27 +150,27 @@ define void @st_local_f32(float addrspace(5)* %ptr, float %a) { ;; f64 define void @st_global_f64(double addrspace(1)* %ptr, double %a) { -; PTX32: st.global.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(1)* %ptr ret void } define void @st_shared_f64(double addrspace(3)* %ptr, double %a) { -; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(3)* %ptr ret void } define void @st_local_f64(double addrspace(5)* %ptr, double %a) { -; PTX32: st.local.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(5)* %ptr ret void diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll index b9c616f..022f7ab 100644 --- a/test/CodeGen/NVPTX/st-generic.ll +++ b/test/CodeGen/NVPTX/st-generic.ll @@ -7,7 +7,7 @@ define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) { ; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(0)* %ptr ret void @@ -18,7 +18,7 @@ define void @st_global_i8(i8 addrspace(0)* 
%ptr, i8 %a) { define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) { ; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, i16 addrspace(0)* %ptr ret void @@ -29,7 +29,7 @@ define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) { define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) { ; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, i32 addrspace(0)* %ptr ret void @@ -38,9 +38,9 @@ define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) { ;; i64 define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) { -; PTX32: st.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}} +; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}} +; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, i64 addrspace(0)* %ptr ret void @@ -51,7 +51,7 @@ define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) { define void @st_global_f32(float addrspace(0)* %ptr, float %a) { ; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, float addrspace(0)* %ptr ret void @@ -60,9 +60,9 @@ define void @st_global_f32(float addrspace(0)* %ptr, float %a) { ;; f64 define void @st_global_f64(double addrspace(0)* %ptr, double %a) { -; PTX32: st.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}} +; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}} +; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, double addrspace(0)* %ptr ret void diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll new file mode 100644 index 0000000..10a1ecc --- /dev/null +++ 
b/test/CodeGen/NVPTX/surf-read-cuda.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + +target triple = "nvptx-unknown-cuda" + +declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + + +; SM20-LABEL: .entry foo +; SM30-LABEL: .entry foo +define void @foo(i64 %img, float* %red, i32 %idx) { +; SM20: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}] +; SM30: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}] + %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx) +; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] +; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] + %ret = sitofp i32 %val to float +; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]] + store float %ret, float* %red + ret void +} + +@surf0 = internal addrspace(1) global i64 0, align 8 + +; SM20-LABEL: .entry bar +; SM30-LABEL: .entry bar +define void @bar(float* %red, i32 %idx) { +; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [surf0, {%r{{[0-9]+}}}] +; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}] + %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx) +; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] +; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] + %ret = sitofp i32 %val to float +; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]] + store float %ret, float* %red + ret void +} + + + + +!nvvm.annotations = !{!1, !2, !3} +!1 = metadata !{void (i64, float*, i32)* 
@foo, metadata !"kernel", i32 1} +!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1} +!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1} + diff --git a/test/CodeGen/NVPTX/surf-write-cuda.ll b/test/CodeGen/NVPTX/surf-write-cuda.ll new file mode 100644 index 0000000..654c47f --- /dev/null +++ b/test/CodeGen/NVPTX/surf-write-cuda.ll @@ -0,0 +1,42 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + +target triple = "nvptx-unknown-cuda" + +declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + + +; SM20-LABEL: .entry foo +; SM30-LABEL: .entry foo +define void @foo(i64 %img, i32 %val, i32 %idx) { +; SM20: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM20: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}} +; SM30: ld.param.u64 %rd[[SURFREG:[0-9]+]], [foo_param_0]; +; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}} + tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val) + ret void +} + + +@surf0 = internal addrspace(1) global i64 0, align 8 + + + +; SM20-LABEL: .entry bar +; SM30-LABEL: .entry bar +define void @bar(i32 %val, i32 %idx) { +; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: sust.b.1d.b32.trap [surf0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}} +; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}} + tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %surfHandle, i32 %idx, i32 %val) + ret void +} + + +!nvvm.annotations = !{!1, !2, !3} +!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1} +!2 = metadata !{void (i32, i32)* @bar, metadata !"kernel", i32 1} +!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1} + diff --git 
a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll new file mode 100644 index 0000000..ee0cefa --- /dev/null +++ b/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + + +target triple = "nvptx-unknown-cuda" + +declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + +; SM20-LABEL: .entry foo +; SM30-LABEL: .entry foo +define void @foo(i64 %img, float* %red, i32 %idx) { +; SM20: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0]; +; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}] +; SM30: ld.param.u64 %rd[[TEXREG:[0-9]+]], [foo_param_0]; +; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}] + %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx) + %ret = extractvalue { float, float, float, float } %val, 0 +; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]] + store float %ret, float* %red + ret void +} + + +@tex0 = internal addrspace(1) global i64 0, align 8 + +; SM20-LABEL: .entry bar +; SM30-LABEL: .entry bar +define void @bar(float* %red, i32 %idx) { +; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0 + %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0) +; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}] +; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}] + %val = tail call { float, float, float, float } 
@llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx) + %ret = extractvalue { float, float, float, float } %val, 0 +; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]] + store float %ret, float* %red + ret void +} + +!nvvm.annotations = !{!1, !2, !3} +!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1} +!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1} +!3 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1} diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll index 291060b..55e4bfc 100644 --- a/test/CodeGen/NVPTX/tex-read.ll +++ b/test/CodeGen/NVPTX/tex-read.ll @@ -2,12 +2,12 @@ target triple = "nvptx-unknown-nvcl" -declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64, i64, i32) +declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32) ; CHECK: .entry foo define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) { ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}] - %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64 %img, i64 %sampler, i32 %idx) + %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx) %ret = extractvalue { float, float, float, float } %val, 0 ; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]] store float %ret, float* %red diff --git a/test/CodeGen/NVPTX/texsurf-queries.ll b/test/CodeGen/NVPTX/texsurf-queries.ll new file mode 100644 index 0000000..c7637cc --- /dev/null +++ b/test/CodeGen/NVPTX/texsurf-queries.ll @@ -0,0 +1,103 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20 +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30 + +target triple = "nvptx-unknown-cuda" + +@tex0 = internal addrspace(1) global i64 0, align 8 +@surf0 = internal addrspace(1) global i64 
0, align 8 + +declare i32 @llvm.nvvm.txq.width(i64) +declare i32 @llvm.nvvm.txq.height(i64) +declare i32 @llvm.nvvm.suq.width(i64) +declare i32 @llvm.nvvm.suq.height(i64) +declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*) + + +; SM20-LABEL: @t0 +; SM30-LABEL: @t0 +define i32 @t0(i64 %texHandle) { +; SM20: txq.width.b32 +; SM30: txq.width.b32 + %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle) + ret i32 %width +} + +; SM20-LABEL: @t1 +; SM30-LABEL: @t1 +define i32 @t1() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0 + %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0) +; SM20: txq.width.b32 %r{{[0-9]+}}, [tex0] +; SM30: txq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle) + ret i32 %width +} + + +; SM20-LABEL: @t2 +; SM30-LABEL: @t2 +define i32 @t2(i64 %texHandle) { +; SM20: txq.height.b32 +; SM30: txq.height.b32 + %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle) + ret i32 %height +} + +; SM20-LABEL: @t3 +; SM30-LABEL: @t3 +define i32 @t3() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0 + %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0) +; SM20: txq.height.b32 %r{{[0-9]+}}, [tex0] +; SM30: txq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle) + ret i32 %height +} + + +; SM20-LABEL: @s0 +; SM30-LABEL: @s0 +define i32 @s0(i64 %surfHandle) { +; SM20: suq.width.b32 +; SM30: suq.width.b32 + %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle) + ret i32 %width +} + +; SM20-LABEL: @s1 +; SM30-LABEL: @s1 +define i32 @s1() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: suq.width.b32 %r{{[0-9]+}}, [surf0] +; SM30: suq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %width = tail call i32 @llvm.nvvm.suq.width(i64 
%surfHandle) + ret i32 %width +} + + +; SM20-LABEL: @s2 +; SM30-LABEL: @s2 +define i32 @s2(i64 %surfHandle) { +; SM20: suq.height.b32 +; SM30: suq.height.b32 + %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle) + ret i32 %height +} + +; SM20-LABEL: @s3 +; SM30-LABEL: @s3 +define i32 @s3() { +; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0 + %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0) +; SM20: suq.height.b32 %r{{[0-9]+}}, [surf0] +; SM30: suq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]] + %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle) + ret i32 %height +} + + + +!nvvm.annotations = !{!1, !2} +!1 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1} +!2 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1} diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll new file mode 100644 index 0000000..a03d7fd --- /dev/null +++ b/test/CodeGen/NVPTX/vector-call.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target triple = "nvptx-unknown-cuda" + +declare void @bar(<4 x i32>) + +; CHECK-LABEL: @foo +define void @foo(<4 x i32> %a) { +; CHECK: st.param.v4.b32 + tail call void @bar(<4 x i32> %a) + ret void +} diff --git a/test/CodeGen/NVPTX/vector-return.ll b/test/CodeGen/NVPTX/vector-return.ll new file mode 100644 index 0000000..15e50f8 --- /dev/null +++ b/test/CodeGen/NVPTX/vector-return.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s + +declare <2 x float> @bar(<2 x float> %input) + +define void @foo(<2 x float> %input, <2 x float>* %output) { +; CHECK-LABEL: @foo +entry: + %call = tail call <2 x float> @bar(<2 x float> %input) +; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: ld.param.v2.f32 {[[ELEM1:%f[0-9]+]], [[ELEM2:%f[0-9]+]]}, [retval0+0]; + store <2 x float> %call, <2 x float>* %output, align 8 +; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEM1]], [[ELEM2]]} + ret void +} |