aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Che-Liang Chiou <clchiou@gmail.com> 2011-02-28 06:34:09 +0000
committer Che-Liang Chiou <clchiou@gmail.com> 2011-02-28 06:34:09 +0000
commit f71720231f6de9b2b7fe28edd179ae217a105329 (patch)
tree 7ed3e644aac2d4be87b13f146f0467761e8e7600
parent d8d1584c13c554349c235177b2b89cb5117347b2 (diff)
download external_llvm-f71720231f6de9b2b7fe28edd179ae217a105329.zip
external_llvm-f71720231f6de9b2b7fe28edd179ae217a105329.tar.gz
external_llvm-f71720231f6de9b2b7fe28edd179ae217a105329.tar.bz2
Add preliminary support for .f32 in the PTX backend.
- Add appropriate TableGen patterns for fadd, fsub, fmul.
- Add .f32 as the PTX type for the LLVM float type.
- Allow parameters, return values, and global variable declarations to accept the float type.
- Add appropriate test cases.

Patch by Justin Holewinski

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@126636 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- lib/Target/PTX/PTXAsmPrinter.cpp 29
-rw-r--r-- lib/Target/PTX/PTXISelLowering.cpp 22
-rw-r--r-- lib/Target/PTX/PTXInstrInfo.cpp 8
-rw-r--r-- lib/Target/PTX/PTXInstrInfo.td 42
-rw-r--r-- lib/Target/PTX/PTXRegisterInfo.td 40
-rw-r--r-- test/CodeGen/PTX/add.ll 14
-rw-r--r-- test/CodeGen/PTX/ld_float.ll 86
-rw-r--r-- test/CodeGen/PTX/mov.ll 12
-rw-r--r-- test/CodeGen/PTX/mul.ll 25
-rw-r--r-- test/CodeGen/PTX/st_float.ll 78
-rw-r--r-- test/CodeGen/PTX/sub.ll 14
11 files changed, 360 insertions, 10 deletions
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp
index a605997..25f26fa 100644
--- a/lib/Target/PTX/PTXAsmPrinter.cpp
+++ b/lib/Target/PTX/PTXAsmPrinter.cpp
@@ -84,6 +84,7 @@ static const char PARAM_PREFIX[] = "__param_";
static const char *getRegisterTypeName(unsigned RegNo) {
#define TEST_REGCLS(cls, clsstr) \
if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr;
+ TEST_REGCLS(RRegf32, f32);
TEST_REGCLS(RRegs32, s32);
TEST_REGCLS(Preds, pred);
#undef TEST_REGCLS
@@ -115,6 +116,21 @@ static const char *getStateSpaceName(unsigned addressSpace) {
return NULL;
}
+static const char *getTypeName(const Type* type) {
+ while (true) {
+ switch (type->getTypeID()) {
+ default: llvm_unreachable("Unknown type");
+ case Type::FloatTyID: return ".f32";
+ case Type::IntegerTyID: return ".s32"; // TODO: Handle 64-bit types.
+ case Type::ArrayTyID:
+ case Type::PointerTyID:
+ type = dyn_cast<const SequentialType>(type)->getElementType();
+ break;
+ }
+ }
+ return NULL;
+}
+
bool PTXAsmPrinter::doFinalization(Module &M) {
// XXX Temproarily remove global variables so that doFinalization() will not
// emit them again (global variables are emitted at beginning).
@@ -218,6 +234,15 @@ void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
case MachineOperand::MO_Register:
OS << getRegisterName(MO.getReg());
break;
+ case MachineOperand::MO_FPImmediate:
+ APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt();
+ if (constFP.getZExtValue() > 0) {
+ OS << "0F" << constFP.toString(16, false);
+ }
+ else {
+ OS << "0F00000000";
+ }
+ break;
}
}
@@ -265,8 +290,8 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
decl += " ";
}
- // TODO: add types
- decl += ".s32 ";
+ decl += getTypeName(gv->getType());
+ decl += " ";
decl += gvsym->getName();
diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp
index e6d4490..d30c9ec 100644
--- a/lib/Target/PTX/PTXISelLowering.cpp
+++ b/lib/Target/PTX/PTXISelLowering.cpp
@@ -28,9 +28,12 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
// Set up the register classes.
addRegisterClass(MVT::i1, PTX::PredsRegisterClass);
addRegisterClass(MVT::i32, PTX::RRegs32RegisterClass);
-
+ addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass);
+
setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+
// Customize translation of memory addresses
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
@@ -87,7 +90,8 @@ struct argmap_entry {
bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; }
} argmap[] = {
argmap_entry(MVT::i1, PTX::PredsRegisterClass),
- argmap_entry(MVT::i32, PTX::RRegs32RegisterClass)
+ argmap_entry(MVT::i32, PTX::RRegs32RegisterClass),
+ argmap_entry(MVT::f32, PTX::RRegf32RegisterClass)
};
} // end anonymous namespace
@@ -185,10 +189,18 @@ SDValue PTXTargetLowering::
if (Outs.size() == 0)
return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
- assert(Outs[0].VT == MVT::i32 && "Can return only basic types");
-
SDValue Flag;
- unsigned reg = PTX::R0;
+ unsigned reg;
+
+ if (Outs[0].VT == MVT::i32) {
+ reg = PTX::R0;
+ }
+ else if (Outs[0].VT == MVT::f32) {
+ reg = PTX::F0;
+ }
+ else {
+ assert(false && "Can return only basic types");
+ }
MachineFunction &MF = DAG.getMachineFunction();
PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
index 805759b..f2e5e4c 100644
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/lib/Target/PTX/PTXInstrInfo.cpp
@@ -28,6 +28,7 @@ static const struct map_entry {
const int opcode;
} map[] = {
{ &PTX::RRegs32RegClass, PTX::MOVrr },
+ { &PTX::RRegf32RegClass, PTX::MOVrr },
{ &PTX::PredsRegClass, PTX::MOVpp }
};
@@ -35,12 +36,13 @@ void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DstReg, unsigned SrcReg,
bool KillSrc) const {
- for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i)
- if (PTX::RRegs32RegClass.contains(DstReg, SrcReg)) {
+ for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i) {
+ if (map[i].cls->contains(DstReg, SrcReg)) {
BuildMI(MBB, I, DL,
- get(PTX::MOVrr), DstReg).addReg(SrcReg, getKillRegState(KillSrc));
+ get(map[i].opcode), DstReg).addReg(SrcReg, getKillRegState(KillSrc));
return;
}
+ }
llvm_unreachable("Impossible reg-to-reg copy");
}
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td
index 9a74778..9d962b0 100644
--- a/lib/Target/PTX/PTXInstrInfo.td
+++ b/lib/Target/PTX/PTXInstrInfo.td
@@ -143,6 +143,18 @@ def PTXret
// Instruction Class Templates
//===----------------------------------------------------------------------===//
+// Three-operand f32 instruction template
+multiclass FLOAT3<string opcstr, SDNode opnode> {
+ def rr : InstPTX<(outs RRegf32:$d),
+ (ins RRegf32:$a, RRegf32:$b),
+ !strconcat(opcstr, ".%type\t$d, $a, $b"),
+ [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>;
+ def ri : InstPTX<(outs RRegf32:$d),
+ (ins RRegf32:$a, f32imm:$b),
+ !strconcat(opcstr, ".%type\t$d, $a, $b"),
+ [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>;
+}
+
multiclass INT3<string opcstr, SDNode opnode> {
def rr : InstPTX<(outs RRegs32:$d),
(ins RRegs32:$a, RRegs32:$b),
@@ -204,6 +216,12 @@ multiclass PTX_ST<string opstr, RegisterClass RC, PatFrag pat_store> {
// Instructions
//===----------------------------------------------------------------------===//
+///===- Floating-Point Arithmetic Instructions ----------------------------===//
+
+defm FADD : FLOAT3<"add", fadd>;
+defm FSUB : FLOAT3<"sub", fsub>;
+defm FMUL : FLOAT3<"mul", fmul>;
+
///===- Integer Arithmetic Instructions -----------------------------------===//
defm ADD : INT3<"add", add>;
@@ -223,6 +241,8 @@ let neverHasSideEffects = 1 in {
: InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>;
def MOVrr
: InstPTX<(outs RRegs32:$d), (ins RRegs32:$a), "mov.%type\t$d, $a", []>;
+ def FMOVrr
+ : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>;
}
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -232,8 +252,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def MOVri
: InstPTX<(outs RRegs32:$d), (ins i32imm:$a), "mov.s32\t$d, $a",
[(set RRegs32:$d, imm:$a)]>;
+ def FMOVri
+ : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a",
+ [(set RRegf32:$d, fpimm:$a)]>;
}
+// Integer loads
defm LDg : PTX_LD<"ld.global", RRegs32, load_global>;
defm LDc : PTX_LD<"ld.const", RRegs32, load_constant>;
defm LDl : PTX_LD<"ld.local", RRegs32, load_local>;
@@ -243,12 +267,30 @@ defm LDs : PTX_LD<"ld.shared", RRegs32, load_shared>;
def LDpi : InstPTX<(outs RRegs32:$d), (ins MEMpi:$a),
"ld.param.%type\t$d, [$a]", []>;
+// Floating-point loads
+defm FLDg : PTX_LD<"ld.global", RRegf32, load_global>;
+defm FLDc : PTX_LD<"ld.const", RRegf32, load_constant>;
+defm FLDl : PTX_LD<"ld.local", RRegf32, load_local>;
+defm FLDp : PTX_LD<"ld.param", RRegf32, load_parameter>;
+defm FLDs : PTX_LD<"ld.shared", RRegf32, load_shared>;
+
+def FLDpi : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a),
+ "ld.param.%type\t$d, [$a]", []>;
+
+// Integer stores
defm STg : PTX_ST<"st.global", RRegs32, store_global>;
defm STl : PTX_ST<"st.local", RRegs32, store_local>;
// Store to parameter state space requires PTX 2.0 or higher?
// defm STp : PTX_ST<"st.param", RRegs32, store_parameter>;
defm STs : PTX_ST<"st.shared", RRegs32, store_shared>;
+// Floating-point stores
+defm FSTg : PTX_ST<"st.global", RRegf32, store_global>;
+defm FSTl : PTX_ST<"st.local", RRegf32, store_local>;
+// Store to parameter state space requires PTX 2.0 or higher?
+// defm FSTp : PTX_ST<"st.param", RRegf32, store_parameter>;
+defm FSTs : PTX_ST<"st.shared", RRegf32, store_shared>;
+
///===- Control Flow Instructions -----------------------------------------===//
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td
index 22e2b34..9158f0d 100644
--- a/lib/Target/PTX/PTXRegisterInfo.td
+++ b/lib/Target/PTX/PTXRegisterInfo.td
@@ -85,6 +85,40 @@ def R29 : PTXReg<"r29">;
def R30 : PTXReg<"r30">;
def R31 : PTXReg<"r31">;
+def F0 : PTXReg<"f0">;
+def F1 : PTXReg<"f1">;
+def F2 : PTXReg<"f2">;
+def F3 : PTXReg<"f3">;
+def F4 : PTXReg<"f4">;
+def F5 : PTXReg<"f5">;
+def F6 : PTXReg<"f6">;
+def F7 : PTXReg<"f7">;
+def F8 : PTXReg<"f8">;
+def F9 : PTXReg<"f9">;
+def F10 : PTXReg<"f10">;
+def F11 : PTXReg<"f11">;
+def F12 : PTXReg<"f12">;
+def F13 : PTXReg<"f13">;
+def F14 : PTXReg<"f14">;
+def F15 : PTXReg<"f15">;
+def F16 : PTXReg<"f16">;
+def F17 : PTXReg<"f17">;
+def F18 : PTXReg<"f18">;
+def F19 : PTXReg<"f19">;
+def F20 : PTXReg<"f20">;
+def F21 : PTXReg<"f21">;
+def F22 : PTXReg<"f22">;
+def F23 : PTXReg<"f23">;
+def F24 : PTXReg<"f24">;
+def F25 : PTXReg<"f25">;
+def F26 : PTXReg<"f26">;
+def F27 : PTXReg<"f27">;
+def F28 : PTXReg<"f28">;
+def F29 : PTXReg<"f29">;
+def F30 : PTXReg<"f30">;
+def F31 : PTXReg<"f31">;
+
+
//===----------------------------------------------------------------------===//
// Register classes
//===----------------------------------------------------------------------===//
@@ -100,3 +134,9 @@ def RRegs32 : RegisterClass<"PTX", [i32], 32,
R8, R9, R10, R11, R12, R13, R14, R15,
R16, R17, R18, R19, R20, R21, R22, R23,
R24, R25, R26, R27, R28, R29, R30, R31]>;
+
+def RRegf32 : RegisterClass<"PTX", [f32], 32,
+ [F0, F1, F2, F3, F4, F5, F6, F7,
+ F8, F9, F10, F11, F12, F13, F14, F15,
+ F16, F17, F18, F19, F20, F21, F22, F23,
+ F24, F25, F26, F27, F28, F29, F30, F31]>;
diff --git a/test/CodeGen/PTX/add.ll b/test/CodeGen/PTX/add.ll
index 1259d03..9e777ae 100644
--- a/test/CodeGen/PTX/add.ll
+++ b/test/CodeGen/PTX/add.ll
@@ -13,3 +13,17 @@ define ptx_device i32 @t2(i32 %x) {
; CHECK: ret;
ret i32 %z
}
+
+define ptx_device float @t3(float %x, float %y) {
+; CHECK: add.f32 f0, f1, f2
+; CHECK-NEXT: ret;
+ %z = fadd float %x, %y
+ ret float %z
+}
+
+define ptx_device float @t4(float %x) {
+; CHECK: add.f32 f0, f1, 0F3F800000;
+; CHECK-NEXT: ret;
+ %z = fadd float %x, 1.0
+ ret float %z
+}
diff --git a/test/CodeGen/PTX/ld_float.ll b/test/CodeGen/PTX/ld_float.ll
new file mode 100644
index 0000000..62d2c36
--- /dev/null
+++ b/test/CodeGen/PTX/ld_float.ll
@@ -0,0 +1,86 @@
+; RUN: llc < %s -march=ptx | FileCheck %s
+
+;CHECK: .extern .global .f32 array[];
+@array = external global [10 x float]
+
+;CHECK: .extern .const .f32 array_constant[];
+@array_constant = external addrspace(1) constant [10 x float]
+
+;CHECK: .extern .local .f32 array_local[];
+@array_local = external addrspace(2) global [10 x float]
+
+;CHECK: .extern .shared .f32 array_shared[];
+@array_shared = external addrspace(4) global [10 x float]
+
+define ptx_device float @t1(float* %p) {
+entry:
+;CHECK: ld.global.f32 f0, [r1];
+;CHECK-NEXT: ret;
+ %x = load float* %p
+ ret float %x
+}
+
+define ptx_device float @t2(float* %p) {
+entry:
+;CHECK: ld.global.f32 f0, [r1+4];
+;CHECK-NEXT: ret;
+ %i = getelementptr float* %p, i32 1
+ %x = load float* %i
+ ret float %x
+}
+
+define ptx_device float @t3(float* %p, i32 %q) {
+entry:
+;CHECK: shl.b32 r0, r2, 2;
+;CHECK-NEXT: add.s32 r0, r1, r0;
+;CHECK-NEXT: ld.global.f32 f0, [r0];
+;CHECK-NEXT: ret;
+ %i = getelementptr float* %p, i32 %q
+ %x = load float* %i
+ ret float %x
+}
+
+define ptx_device float @t4_global() {
+entry:
+;CHECK: ld.global.f32 f0, [array];
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float]* @array, i32 0, i32 0
+ %x = load float* %i
+ ret float %x
+}
+
+define ptx_device float @t4_const() {
+entry:
+;CHECK: ld.const.f32 f0, [array_constant];
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float] addrspace(1)* @array_constant, i32 0, i32 0
+ %x = load float addrspace(1)* %i
+ ret float %x
+}
+
+define ptx_device float @t4_local() {
+entry:
+;CHECK: ld.local.f32 f0, [array_local];
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0
+ %x = load float addrspace(2)* %i
+ ret float %x
+}
+
+define ptx_device float @t4_shared() {
+entry:
+;CHECK: ld.shared.f32 f0, [array_shared];
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0
+ %x = load float addrspace(4)* %i
+ ret float %x
+}
+
+define ptx_device float @t5() {
+entry:
+;CHECK: ld.global.f32 f0, [array+4];
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float]* @array, i32 0, i32 1
+ %x = load float* %i
+ ret float %x
+}
diff --git a/test/CodeGen/PTX/mov.ll b/test/CodeGen/PTX/mov.ll
index c365e9b..d201a78 100644
--- a/test/CodeGen/PTX/mov.ll
+++ b/test/CodeGen/PTX/mov.ll
@@ -11,3 +11,15 @@ define ptx_device i32 @t2(i32 %x) {
; CHECK: ret;
ret i32 %x
}
+
+define ptx_device float @t3() {
+; CHECK: mov.f32 f0, 0F00000000;
+; CHECK-NEXT: ret;
+ ret float 0.0
+}
+
+define ptx_device float @t4(float %x) {
+; CHECK: mov.f32 f0, f1;
+; CHECK-NEXT: ret;
+ ret float %x
+}
diff --git a/test/CodeGen/PTX/mul.ll b/test/CodeGen/PTX/mul.ll
new file mode 100644
index 0000000..01871da
--- /dev/null
+++ b/test/CodeGen/PTX/mul.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=ptx | FileCheck %s
+
+;define ptx_device i32 @t1(i32 %x, i32 %y) {
+; %z = mul i32 %x, %y
+; ret i32 %z
+;}
+
+;define ptx_device i32 @t2(i32 %x) {
+; %z = mul i32 %x, 1
+; ret i32 %z
+;}
+
+define ptx_device float @t3(float %x, float %y) {
+; CHECK: mul.f32 f0, f1, f2
+; CHECK-NEXT: ret;
+ %z = fmul float %x, %y
+ ret float %z
+}
+
+define ptx_device float @t4(float %x) {
+; CHECK: mul.f32 f0, f1, 0F40A00000;
+; CHECK-NEXT: ret;
+ %z = fmul float %x, 5.0
+ ret float %z
+}
diff --git a/test/CodeGen/PTX/st_float.ll b/test/CodeGen/PTX/st_float.ll
new file mode 100644
index 0000000..f0e0010
--- /dev/null
+++ b/test/CodeGen/PTX/st_float.ll
@@ -0,0 +1,78 @@
+; RUN: llc < %s -march=ptx | FileCheck %s
+
+;CHECK: .extern .global .f32 array[];
+@array = external global [10 x float]
+
+;CHECK: .extern .const .f32 array_constant[];
+@array_constant = external addrspace(1) constant [10 x float]
+
+;CHECK: .extern .local .f32 array_local[];
+@array_local = external addrspace(2) global [10 x float]
+
+;CHECK: .extern .shared .f32 array_shared[];
+@array_shared = external addrspace(4) global [10 x float]
+
+define ptx_device void @t1(float* %p, float %x) {
+entry:
+;CHECK: st.global.f32 [r1], f1;
+;CHECK-NEXT: ret;
+ store float %x, float* %p
+ ret void
+}
+
+define ptx_device void @t2(float* %p, float %x) {
+entry:
+;CHECK: st.global.f32 [r1+4], f1;
+;CHECK-NEXT: ret;
+ %i = getelementptr float* %p, i32 1
+ store float %x, float* %i
+ ret void
+}
+
+define ptx_device void @t3(float* %p, i32 %q, float %x) {
+;CHECK: .reg .s32 r0;
+entry:
+;CHECK: shl.b32 r0, r2, 2;
+;CHECK-NEXT: add.s32 r0, r1, r0;
+;CHECK-NEXT: st.global.f32 [r0], f1;
+;CHECK-NEXT: ret;
+ %i = getelementptr float* %p, i32 %q
+ store float %x, float* %i
+ ret void
+}
+
+define ptx_device void @t4_global(float %x) {
+entry:
+;CHECK: st.global.f32 [array], f1;
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float]* @array, i32 0, i32 0
+ store float %x, float* %i
+ ret void
+}
+
+define ptx_device void @t4_local(float %x) {
+entry:
+;CHECK: st.local.f32 [array_local], f1;
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0
+ store float %x, float addrspace(2)* %i
+ ret void
+}
+
+define ptx_device void @t4_shared(float %x) {
+entry:
+;CHECK: st.shared.f32 [array_shared], f1;
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0
+ store float %x, float addrspace(4)* %i
+ ret void
+}
+
+define ptx_device void @t5(float %x) {
+entry:
+;CHECK: st.global.f32 [array+4], f1;
+;CHECK-NEXT: ret;
+ %i = getelementptr [10 x float]* @array, i32 0, i32 1
+ store float %x, float* %i
+ ret void
+}
diff --git a/test/CodeGen/PTX/sub.ll b/test/CodeGen/PTX/sub.ll
index aab3fda..e11deca 100644
--- a/test/CodeGen/PTX/sub.ll
+++ b/test/CodeGen/PTX/sub.ll
@@ -13,3 +13,17 @@ define ptx_device i32 @t2(i32 %x) {
;CHECK: ret;
ret i32 %z
}
+
+define ptx_device float @t3(float %x, float %y) {
+; CHECK: sub.f32 f0, f1, f2
+; CHECK-NEXT: ret;
+ %z = fsub float %x, %y
+ ret float %z
+}
+
+define ptx_device float @t4(float %x) {
+; CHECK: add.f32 f0, f1, 0FBF800000;
+; CHECK-NEXT: ret;
+ %z = fsub float %x, 1.0
+ ret float %z
+}