16 files changed, 708 insertions, 49 deletions
diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
index 5c981ef..55aba0d 100644
--- a/test/CodeGen/R600/fcmp-cnde-int-args.ll
+++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
@@ -1,10 +1,10 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the
-; chance to optimize the fcmp + select instructions to CNDE was missed
+; chance to optimize the fcmp + select instructions to SET* was missed
 ; due to the fact that the operands to fcmp and select had different types
 
-;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
+; CHECK: SET{{[A-Z]+}}_DX10
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
index 89f5e9e..37f621d 100644
--- a/test/CodeGen/R600/fcmp.ll
+++ b/test/CodeGen/R600/fcmp.ll
@@ -1,8 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: @fcmp_sext
+; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float addrspace(1)* %in
   %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1
@@ -12,3 +13,25 @@ entry:
   store i32 %sext, i32 addrspace(1)* %out
   ret void
 }
+
+; This test checks that a setcc node with f32 operands is lowered to a
+; SET*_DX10 instruction.  Previously we were lowering this to:
+; SET* + FP_TO_SINT
+
+; CHECK: @fcmp_br
+; CHECK: SET{{[N]*}}E_DX10 T{{[0-9]+\.[XYZW], [a-zA-Z0-9, .]+}}(5.0
+
+define void @fcmp_br(i32 addrspace(1)* %out, float %in) {
+entry:
+  %0 = fcmp oeq float %in, 5.0
+  br i1 %0, label %IF, label %ENDIF
+
+IF:
+  %1 = getelementptr i32 addrspace(1)* %out, i32 1
+  store i32 0, i32 addrspace(1)* %1
+  br label %ENDIF
+
+ENDIF:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fdiv.v4f32.ll b/test/CodeGen/R600/fdiv.v4f32.ll
index b013fd6..79e677f 100644
--- a/test/CodeGen/R600/fdiv.v4f32.ll
+++ b/test/CodeGen/R600/fdiv.v4f32.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
index aad44d9..71705a6 100644
--- a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
+++ b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
@@ -4,7 +4,7 @@
 ;to a SETNE_INT.  There should only be one SETNE_INT instruction.
 
 ;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK_NOT: SETNE_INT
+;CHECK-NOT: SETNE_INT
 
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll
index 382f78c..e8e2bf5 100644
--- a/test/CodeGen/R600/kcache-fold.ll
+++ b/test/CodeGen/R600/kcache-fold.ll
@@ -1,38 +1,38 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @main1
 ; CHECK: MOV T{{[0-9]+\.[XYZW], CBuf0\[[0-9]+\]\.[XYZW]}}
-
-define void @main() {
+define void @main1() {
 main_body:
-  %0 = load <4 x float> addrspace(9)* null
+  %0 = load <4 x float> addrspace(8)* null
   %1 = extractelement <4 x float> %0, i32 0
-  %2 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %3 = extractelement <4 x float> %2, i32 0
-  %4 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %5 = extractelement <4 x float> %4, i32 0
   %6 = fcmp ult float %1, 0.000000e+00
   %7 = select i1 %6, float %3, float %5
-  %8 = load <4 x float> addrspace(9)* null
+  %8 = load <4 x float> addrspace(8)* null
   %9 = extractelement <4 x float> %8, i32 1
-  %10 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %11 = extractelement <4 x float> %10, i32 1
-  %12 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %13 = extractelement <4 x float> %12, i32 1
   %14 = fcmp ult float %9, 0.000000e+00
   %15 = select i1 %14, float %11, float %13
-  %16 = load <4 x float> addrspace(9)* null
+  %16 = load <4 x float> addrspace(8)* null
   %17 = extractelement <4 x float> %16, i32 2
-  %18 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %18 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %19 = extractelement <4 x float> %18, i32 2
-  %20 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %21 = extractelement <4 x float> %20, i32 2
   %22 = fcmp ult float %17, 0.000000e+00
   %23 = select i1 %22, float %19, float %21
-  %24 = load <4 x float> addrspace(9)* null
+  %24 = load <4 x float> addrspace(8)* null
   %25 = extractelement <4 x float> %24, i32 3
-  %26 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %27 = extractelement <4 x float> %26, i32 3
-  %28 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %29 = extractelement <4 x float> %28, i32 3
   %30 = fcmp ult float %25, 0.000000e+00
   %31 = select i1 %30, float %27, float %29
@@ -48,5 +48,53 @@ main_body:
   ret void
 }
 
+; CHECK: @main2
+; CHECK-NOT: MOV
+define void @main2() {
+main_body:
+  %0 = load <4 x float> addrspace(8)* null
+  %1 = extractelement <4 x float> %0, i32 0
+  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %3 = extractelement <4 x float> %2, i32 0
+  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %5 = extractelement <4 x float> %4, i32 1
+  %6 = fcmp ult float %1, 0.000000e+00
+  %7 = select i1 %6, float %3, float %5
+  %8 = load <4 x float> addrspace(8)* null
+  %9 = extractelement <4 x float> %8, i32 1
+  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %11 = extractelement <4 x float> %10, i32 0
+  %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %13 = extractelement <4 x float> %12, i32 1
+  %14 = fcmp ult float %9, 0.000000e+00
+  %15 = select i1 %14, float %11, float %13
+  %16 = load <4 x float> addrspace(8)* null
+  %17 = extractelement <4 x float> %16, i32 2
+  %18 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %19 = extractelement <4 x float> %18, i32 3
+  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %21 = extractelement <4 x float> %20, i32 2
+  %22 = fcmp ult float %17, 0.000000e+00
+  %23 = select i1 %22, float %19, float %21
+  %24 = load <4 x float> addrspace(8)* null
+  %25 = extractelement <4 x float> %24, i32 3
+  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %27 = extractelement <4 x float> %26, i32 3
+  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %29 = extractelement <4 x float> %28, i32 2
+  %30 = fcmp ult float %25, 0.000000e+00
+  %31 = select i1 %30, float %27, float %29
+  %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
+  %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
+  %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
+  %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
+  %36 = insertelement <4 x float> undef, float %32, i32 0
+  %37 = insertelement <4 x float> %36, float %33, i32 1
+  %38 = insertelement <4 x float> %37, float %34, i32 2
+  %39 = insertelement <4 x float> %38, float %35, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+  ret void
+}
+
 declare float @llvm.AMDIL.clamp.(float, float, float) readnone
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
new file mode 100644
index 0000000..1aae7f9
--- /dev/null
+++ b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This tests a bug where LegalizeDAG was not checking the target's
+; BooleanContents value and always using one for true, when expanding
+; setcc to select_cc.
+;
+; This bug caused the icmp IR instruction to be expanded to two machine
+; instructions, when only one is needed.
+;
+
+; CHECK: @setcc_expand
+; CHECK: SET
+; CHECK-NOT: CND
+define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = icmp eq i32 %in, 5
+  br i1 %0, label %IF, label %ENDIF
+IF:
+  %1 = getelementptr i32 addrspace(1)* %out, i32 1
+  store i32 0, i32 addrspace(1)* %1
+  br label %ENDIF
+
+ENDIF:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
index be62342..e69f64e 100644
--- a/test/CodeGen/R600/literals.ll
+++ b/test/CodeGen/R600/literals.ll
@@ -6,7 +6,7 @@
 ; or
 ; ADD_INT literal.x REG, 5
 
-; CHECK; @i32_literal
+; CHECK: @i32_literal
 ; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
 entry:
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
index 0c19f14..a8f604a 100644
--- a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
@@ -3,20 +3,18 @@
 ;CHECK: S_MOV_B32
 ;CHECK-NEXT: V_INTERP_MOV_F32
 
-define void @main() {
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:
-  call void @llvm.AMDGPU.shader.type(i32 0)
-  %0 = load i32 addrspace(8)* inttoptr (i32 6 to i32 addrspace(8)*)
-  %1 = call float @llvm.SI.fs.interp.constant(i32 0, i32 0, i32 %0)
-  %2 = call i32 @llvm.SI.packf16(float %1, float %1)
-  %3 = bitcast i32 %2 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  %5 = call i32 @llvm.SI.packf16(float %4, float %4)
+  %6 = bitcast i32 %5 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
   ret void
 }
 
 declare void @llvm.AMDGPU.shader.type(i32)
 
-declare float @llvm.SI.fs.interp.constant(i32, i32, i32) readonly
+declare float @llvm.SI.fs.constant(i32, i32, i32) readonly
 
 declare i32 @llvm.SI.packf16(float, float) readnone
 
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index 34d1935..d397f3b 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -1,35 +1,35 @@
 ;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
 
 ;CHECK: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE_C
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE_C
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE_C
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE_C
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE_C
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE_C
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
-;CHECK-NEXT: S_WAITCNT 1792
+;CHECK-NEXT: S_WAITCNT 1904
 ;CHECK-NEXT: IMAGE_SAMPLE
 
 define void @test() {
diff --git a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
new file mode 100644
index 0000000..ba9620c
--- /dev/null
+++ b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
@@ -0,0 +1,83 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+  %0 = call float @llvm.R600.interp.input(i32 0, i32 0)
+  %1 = call float @llvm.R600.interp.input(i32 1, i32 0)
+  %2 = call float @llvm.R600.interp.input(i32 2, i32 0)
+  %3 = call float @llvm.R600.interp.input(i32 3, i32 0)
+  %4 = fcmp ult float %1, 0.000000e+00
+  %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
+  %6 = fsub float -0.000000e+00, %5
+  %7 = fptosi float %6 to i32
+  %8 = bitcast i32 %7 to float
+  %9 = fcmp ult float %0, 5.700000e+01
+  %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00
+  %11 = fsub float -0.000000e+00, %10
+  %12 = fptosi float %11 to i32
+  %13 = bitcast i32 %12 to float
+  %14 = bitcast float %8 to i32
+  %15 = bitcast float %13 to i32
+  %16 = and i32 %14, %15
+  %17 = bitcast i32 %16 to float
+  %18 = bitcast float %17 to i32
+  %19 = icmp ne i32 %18, 0
+  %20 = fcmp ult float %0, 0.000000e+00
+  %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00
+  %22 = fsub float -0.000000e+00, %21
+  %23 = fptosi float %22 to i32
+  %24 = bitcast i32 %23 to float
+  %25 = bitcast float %24 to i32
+  %26 = icmp ne i32 %25, 0
+  br i1 %19, label %IF, label %ELSE
+
+IF:                                               ; preds = %main_body
+  %. = select i1 %26, float 0.000000e+00, float 1.000000e+00
+  %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00
+  br label %ENDIF
+
+ELSE:                                             ; preds = %main_body
+  br i1 %26, label %ENDIF, label %ELSE17
+
+ENDIF:                                            ; preds = %ELSE17, %ELSE, %IF
+  %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+  %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
+  %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+  %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00)
+  %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
+  %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
+  %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+  %31 = insertelement <4 x float> undef, float %27, i32 0
+  %32 = insertelement <4 x float> %31, float %28, i32 1
+  %33 = insertelement <4 x float> %32, float %29, i32 2
+  %34 = insertelement <4 x float> %33, float %30, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0)
+  ret void
+
+ELSE17:                                           ; preds = %ELSE
+  %35 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %36 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %37 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %38 = fadd float %35, 0x3FC99999A0000000
+  %39 = fadd float %36, 0x3FC99999A0000000
+  %40 = fadd float %37, 0x3FC99999A0000000
+  %41 = fadd float %38, 0x3FC99999A0000000
+  %42 = fadd float %39, 0x3FC99999A0000000
+  %43 = fadd float %40, 0x3FC99999A0000000
+  %44 = fadd float %41, 0x3FC99999A0000000
+  %45 = fadd float %42, 0x3FC99999A0000000
+  %46 = fadd float %43, 0x3FC99999A0000000
+  %47 = fadd float %44, 0x3FC99999A0000000
+  %48 = fadd float %45, 0x3FC99999A0000000
+  %49 = fadd float %46, 0x3FC99999A0000000
+  br label %ENDIF
+}
+
+declare float @llvm.R600.interp.input(i32, i32) #0
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/schedule-fs-loop-nested.ll b/test/CodeGen/R600/schedule-fs-loop-nested.ll
new file mode 100644
index 0000000..5e875c4
--- /dev/null
+++ b/test/CodeGen/R600/schedule-fs-loop-nested.ll
@@ -0,0 +1,88 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+  %0 = load <4 x float> addrspace(9)* null
+  %1 = extractelement <4 x float> %0, i32 3
+  %2 = fptosi float %1 to i32
+  %3 = bitcast i32 %2 to float
+  %4 = bitcast float %3 to i32
+  %5 = sdiv i32 %4, 4
+  %6 = bitcast i32 %5 to float
+  %7 = bitcast float %6 to i32
+  %8 = mul i32 %7, 4
+  %9 = bitcast i32 %8 to float
+  %10 = bitcast float %9 to i32
+  %11 = sub i32 0, %10
+  %12 = bitcast i32 %11 to float
+  %13 = bitcast float %3 to i32
+  %14 = bitcast float %12 to i32
+  %15 = add i32 %13, %14
+  %16 = bitcast i32 %15 to float
+  %17 = load <4 x float> addrspace(9)* null
+  %18 = extractelement <4 x float> %17, i32 0
+  %19 = load <4 x float> addrspace(9)* null
+  %20 = extractelement <4 x float> %19, i32 1
+  %21 = load <4 x float> addrspace(9)* null
+  %22 = extractelement <4 x float> %21, i32 2
+  br label %LOOP
+
+LOOP:                                             ; preds = %IF31, %main_body
+  %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ]
+  %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ]
+  %temp5.0 = phi float [ %20, %main_body ], [ %temp5.1, %IF31 ]
+  %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ]
+  %23 = bitcast float %temp12.0 to i32
+  %24 = bitcast float %6 to i32
+  %25 = icmp sge i32 %23, %24
+  %26 = sext i1 %25 to i32
+  %27 = bitcast i32 %26 to float
+  %28 = bitcast float %27 to i32
+  %29 = icmp ne i32 %28, 0
+  br i1 %29, label %IF, label %LOOP29
+
+IF:                                               ; preds = %LOOP
+  %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+  %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+  %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+  %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+  %34 = insertelement <4 x float> undef, float %30, i32 0
+  %35 = insertelement <4 x float> %34, float %31, i32 1
+  %36 = insertelement <4 x float> %35, float %32, i32 2
+  %37 = insertelement <4 x float> %36, float %33, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0)
+  ret void
+
+LOOP29:                                           ; preds = %LOOP, %ENDIF30
+  %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ]
+  %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ]
+  %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ]
+  %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ]
+  %38 = bitcast float %temp20.0 to i32
+  %39 = bitcast float %16 to i32
+  %40 = icmp sge i32 %38, %39
+  %41 = sext i1 %40 to i32
+  %42 = bitcast i32 %41 to float
+  %43 = bitcast float %42 to i32
+  %44 = icmp ne i32 %43, 0
+  br i1 %44, label %IF31, label %ENDIF30
+
+IF31:                                             ; preds = %LOOP29
+  %45 = bitcast float %temp12.0 to i32
+  %46 = add i32 %45, 1
+  %47 = bitcast i32 %46 to float
+  br label %LOOP
+
+ENDIF30:                                          ; preds = %LOOP29
+  %48 = bitcast float %temp20.0 to i32
+  %49 = add i32 %48, 1
+  %50 = bitcast i32 %49 to float
+  br label %LOOP29
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/schedule-fs-loop.ll b/test/CodeGen/R600/schedule-fs-loop.ll
new file mode 100644
index 0000000..d142cac
--- /dev/null
+++ b/test/CodeGen/R600/schedule-fs-loop.ll
@@ -0,0 +1,55 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+  %0 = load <4 x float> addrspace(9)* null
+  %1 = extractelement <4 x float> %0, i32 3
+  %2 = fptosi float %1 to i32
+  %3 = bitcast i32 %2 to float
+  %4 = load <4 x float> addrspace(9)* null
+  %5 = extractelement <4 x float> %4, i32 0
+  %6 = load <4 x float> addrspace(9)* null
+  %7 = extractelement <4 x float> %6, i32 1
+  %8 = load <4 x float> addrspace(9)* null
+  %9 = extractelement <4 x float> %8, i32 2
+  br label %LOOP
+
+LOOP:                                             ; preds = %ENDIF, %main_body
+  %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ]
+  %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ]
+  %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ]
+  %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ]
+  %10 = bitcast float %temp8.0 to i32
+  %11 = bitcast float %3 to i32
+  %12 = icmp sge i32 %10, %11
+  %13 = sext i1 %12 to i32
+  %14 = bitcast i32 %13 to float
+  %15 = bitcast float %14 to i32
+  %16 = icmp ne i32 %15, 0
+  br i1 %16, label %IF, label %ENDIF
+
+IF:                                               ; preds = %LOOP
+  %17 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
+  %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+  %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+  %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+  %21 = insertelement <4 x float> undef, float %17, i32 0
+  %22 = insertelement <4 x float> %21, float %18, i32 1
+  %23 = insertelement <4 x float> %22, float %19, i32 2
+  %24 = insertelement <4 x float> %23, float %20, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+  ret void
+
+ENDIF:                                            ; preds = %LOOP
+  %25 = bitcast float %temp8.0 to i32
+  %26 = add i32 %25, 1
+  %27 = bitcast i32 %26 to float
+  br label %LOOP
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/schedule-if-2.ll b/test/CodeGen/R600/schedule-if-2.ll
new file mode 100644
index 0000000..6afd677
--- /dev/null
+++ b/test/CodeGen/R600/schedule-if-2.ll
@@ -0,0 +1,94 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+  %0 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %1 = extractelement <4 x float> %0, i32 0
+  %2 = fadd float 1.000000e+03, %1
+  %3 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %4 = extractelement <4 x float> %3, i32 0
+  %5 = bitcast float %4 to i32
+  %6 = icmp eq i32 %5, 0
+  %7 = sext i1 %6 to i32
+  %8 = bitcast i32 %7 to float
+  %9 = bitcast float %8 to i32
+  %10 = icmp ne i32 %9, 0
+  br i1 %10, label %IF, label %ELSE
+
+IF:                                               ; preds = %main_body
+  %11 = call float @fabs(float %2)
+  %12 = fcmp ueq float %11, 0x7FF0000000000000
+  %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00
+  %14 = fsub float -0.000000e+00, %13
+  %15 = fptosi float %14 to i32
+  %16 = bitcast i32 %15 to float
+  %17 = bitcast float %16 to i32
+  %18 = icmp ne i32 %17, 0
+  %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00
+  %19 = fcmp une float %2, %2
+  %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00
+  %21 = fsub float -0.000000e+00, %20
+  %22 = fptosi float %21 to i32
+  %23 = bitcast i32 %22 to float
+  %24 = bitcast float %23 to i32
+  %25 = icmp ne i32 %24, 0
+  %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00
+  %26 = bitcast float %. to i32
+  %27 = sitofp i32 %26 to float
+  %28 = bitcast float %temp8.0 to i32
+  %29 = sitofp i32 %28 to float
+  %30 = fcmp ugt float %2, 0.000000e+00
+  %31 = select i1 %30, float 1.000000e+00, float %2
+  %32 = fcmp uge float %31, 0.000000e+00
+  %33 = select i1 %32, float %31, float -1.000000e+00
+  %34 = fadd float %33, 1.000000e+00
+  %35 = fmul float %34, 5.000000e-01
+  br label %ENDIF
+
+ELSE:                                             ; preds = %main_body
+  %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %37 = extractelement <4 x float> %36, i32 0
+  %38 = bitcast float %37 to i32
+  %39 = icmp eq i32 %38, 1
+  %40 = sext i1 %39 to i32
+  %41 = bitcast i32 %40 to float
+  %42 = bitcast float %41 to i32
+  %43 = icmp ne i32 %42, 0
+  br i1 %43, label %IF23, label %ENDIF
+
+ENDIF:                                            ; preds = %IF23, %ELSE, %IF
+  %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ]
+  %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ]
+  %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ]
+  %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ]
+  %44 = insertelement <4 x float> undef, float %temp4.0, i32 0
+  %45 = insertelement <4 x float> %44, float %temp5.0, i32 1
+  %46 = insertelement <4 x float> %45, float %temp6.0, i32 2
+  %47 = insertelement <4 x float> %46, float %temp7.0, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0)
+  ret void
+
+IF23:                                             ; preds = %ELSE
+  %48 = fcmp ult float 0.000000e+00, %2
+  %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00
+  %50 = fsub float -0.000000e+00, %49
+  %51 = fptosi float %50 to i32
+  %52 = bitcast i32 %51 to float
+  %53 = bitcast float %52 to i32
+  %54 = icmp ne i32 %53, 0
+  %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00
+  %55 = bitcast float %.28 to i32
+  %56 = sitofp i32 %55 to float
+  %57 = load <4 x float> addrspace(8)* null
+  %58 = extractelement <4 x float> %57, i32 0
+  %59 = fsub float -0.000000e+00, %58
+  %60 = fadd float %2, %59
+  br label %ENDIF
+}
+
+declare float @fabs(float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readonly }
diff --git a/test/CodeGen/R600/schedule-if.ll b/test/CodeGen/R600/schedule-if.ll
new file mode 100644
index 0000000..347d92f
--- /dev/null
+++ b/test/CodeGen/R600/schedule-if.ll
@@ -0,0 +1,46 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+  %0 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %1 = extractelement <4 x float> %0, i32 0
+  %2 = bitcast float %1 to i32
+  %3 = icmp eq i32 %2, 0
+  %4 = sext i1 %3 to i32
+  %5 = bitcast i32 %4 to float
+  %6 = bitcast float %5 to i32
+  %7 = icmp ne i32 %6, 0
+  br i1 %7, label %ENDIF, label %ELSE
+
+ELSE:                                             ; preds = %main_body
+  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %9 = extractelement <4 x float> %8, i32 0
+  %10 = bitcast float %9 to i32
+  %11 = icmp eq i32 %10, 1
+  %12 = sext i1 %11 to i32
+  %13 = bitcast i32 %12 to float
+  %14 = bitcast float %13 to i32
+  %15 = icmp ne i32 %14, 0
+  br i1 %15, label %IF13, label %ENDIF
+
+ENDIF:                                            ; preds = %IF13, %ELSE, %main_body
+  %temp.0 = phi float [ 1.000000e+03, %main_body ], [ 1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ]
+  %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ]
+  %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ]
+  %16 = insertelement <4 x float> undef, float %temp.0, i32 0
+  %17 = insertelement <4 x float> %16, float %temp1.0, i32 1
+  %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2
+  %19 = insertelement <4 x float> %18, float %temp3.0, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0)
+  ret void
+
+IF13:                                             ; preds = %ELSE
+  %20 = load <4 x float> addrspace(8)* null
+  %21 = extractelement <4 x float> %20, i32 0
+  %22 = fsub float -0.000000e+00, %21
+  %23 = fadd float 1.000000e+03, %22
+  br label %ENDIF
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
new file mode 100644
index 0000000..44b7c2f
--- /dev/null
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
@@ -0,0 +1,134 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;REQUIRES: asserts
+
+define void @main() {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = fcmp ult float %0, 0.000000e+00
+  %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
+  %6 = fsub float -0.000000e+00, %5
+  %7 = fptosi float %6 to i32
+  %8 = bitcast i32 %7 to float
+  %9 = bitcast float %8 to i32
+  %10 = icmp ne i32 %9, 0
+  br i1 %10, label %LOOP, label %ENDIF
+
+ENDIF:                                            ; preds = %ENDIF16, %LOOP, %main_body
+  %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ]
+  %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ]
+  %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ]
+  %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ]
+  %11 = load <4 x float> addrspace(9)* null
+  %12 = extractelement <4 x float> %11, i32 0
+  %13 = fmul float %12, %0
+  %14 = load <4 x float> addrspace(9)* null
+  %15 = extractelement <4 x float> %14, i32 1
+  %16 = fmul float %15, %0
+  %17 = load <4 x float> addrspace(9)* null
+  %18 = extractelement <4 x float> %17, i32 2
+  %19 = fmul float %18, %0
+  %20 = load <4 x float> addrspace(9)* null
+  %21 = extractelement <4 x float> %20, i32 3
+  %22 = fmul float %21, %0
+  %23 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %24 = extractelement <4 x float> %23, i32 0
+  %25 = fmul float %24, %1
+  %26 = fadd float %25, %13
+  %27 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %28 = extractelement <4 x float> %27, i32 1
+  %29 = fmul float %28, %1
+  %30 = fadd float %29, %16
+  %31 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %32 = extractelement <4 x float> %31, i32 2
+  %33 = fmul float %32, %1
+  %34 = fadd float %33, %19
+  %35 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %36 = extractelement <4 x float> %35, i32 3
+  %37 = fmul float %36, %1
+  %38 = fadd float %37, %22
+  %39 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %40 = extractelement <4 x float> %39, i32 0
+  %41 = fmul float %40, %2
+  %42 = fadd float %41, %26
+  %43 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %44 = extractelement <4 x float> %43, i32 1
+  %45 = fmul float %44, %2
+  %46 = fadd float %45, %30
+  %47 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %48 = extractelement <4 x float> %47, i32 2
+  %49 = fmul float %48, %2
+  %50 = fadd float %49, %34
+  %51 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %52 = extractelement <4 x float> %51, i32 3
+  %53 = fmul float %52, %2
+  %54 = fadd float %53, %38
+  %55 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %56 = extractelement <4 x float> %55, i32 0
+  %57 = fmul float %56, %3
+  %58 = fadd float %57, %42
+  %59 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %60 = extractelement <4 x float> %59, i32 1
+  %61 = fmul float %60, %3
+  %62 = fadd float %61, %46
+  %63 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %64 = extractelement <4 x float> %63, i32 2
+  %65 = fmul float %64, %3
+  %66 = fadd float %65, %50
+  %67 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %68 = extractelement <4 x float> %67, i32 3
+  %69 = fmul float %68, %3
+  %70 = fadd float %69, %54
+  %71 = insertelement <4 x float> undef, float %58, i32 0
+  %72 = insertelement <4 x float> %71, float %62, i32 1
+  %73 = insertelement <4 x float> %72, float %66, i32 2
+  %74 = insertelement <4 x float> %73, float %70, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1)
+  %75 = insertelement <4 x float> undef, float %temp.0, i32 0
+  %76 = insertelement <4 x float> %75, float %temp1.0, i32 1
+  %77 = insertelement <4 x float> %76, float %temp2.0, i32 2
+  %78 = insertelement <4 x float> %77, float %temp3.0, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2)
+  ret void
+
+LOOP:                                             ; preds = %main_body, %ENDIF19
+  %temp.1 = phi float [ %93, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+  %temp1.1 = phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ]
+  %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+  %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+  %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ]
+  %79 = fcmp uge float %temp4.0, %0
+  %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00
+  %81 = fsub float -0.000000e+00, %80
+  %82 = fptosi float %81 to i32
+  %83 = bitcast i32 %82 to float
+  %84 = bitcast float %83 to i32
+  %85 = icmp ne i32 %84, 0
+  br i1 %85, label %ENDIF, label %ENDIF16
+
+ENDIF16:                                          ; preds = %LOOP
+  %86 = fcmp une float %2, %temp4.0
+  %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00
+  %88 = fsub float -0.000000e+00, %87
+  %89 = fptosi float %88 to i32
+  %90 = bitcast i32 %89 to float
+  %91 = bitcast float %90 to i32
+  %92 = icmp ne i32 %91, 0
+  br i1 %92, label %ENDIF, label %ENDIF19
+
+ENDIF19:                                          ; preds = %ENDIF16
+  %93 = fadd float %temp.1, 1.000000e+00
+  %94 = fadd float %temp1.1, 0.000000e+00
+  %95 = fadd float %temp2.1, 0.000000e+00
+  %96 = fadd float %temp3.1, 0.000000e+00
+  %97 = fadd float %temp4.0, 1.000000e+00
+  br label %LOOP
+}
+
+declare float @llvm.R600.load.input(i32) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
new file mode 100644
index 0000000..02d9353
--- /dev/null
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @test_a
+; CHECK-NOT: CND
+; CHECK: SET{{[NEQGTL]+}}_DX10
+
+define void @test_a(i32 addrspace(1)* %out, float %in) {
+entry:
+  %0 = fcmp ult float %in, 0.000000e+00
+  %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+  %2 = fsub float -0.000000e+00, %1
+  %3 = fptosi float %2 to i32
+  %4 = bitcast i32 %3 to float
+  %5 = bitcast float %4 to i32
+  %6 = icmp ne i32 %5, 0
+  br i1 %6, label %IF, label %ENDIF
+
+IF:
+  %7 = getelementptr i32 addrspace(1)* %out, i32 1
+  store i32 0, i32 addrspace(1)* %7
+  br label %ENDIF
+
+ENDIF:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
+
+; Same as test_a, but the branch labels are swapped to produce the inverse cc
+; for the icmp instruction
+
+; CHECK: @test_b
+; CHECK: SET{{[GTEQN]+}}_DX10
+; CHECK-NEXT: PRED_
+define void @test_b(i32 addrspace(1)* %out, float %in) {
+entry:
+  %0 = fcmp ult float %in, 0.0
+  %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
+  %2 = fsub float -0.000000e+00, %1
+  %3 = fptosi float %2 to i32
+  %4 = bitcast i32 %3 to float
+  %5 = bitcast float %4 to i32
+  %6 = icmp ne i32 %5, 0
+  br i1 %6, label %ENDIF, label %IF
+
+IF:
+  %7 = getelementptr i32 addrspace(1)* %out, i32 1
+  store i32 0, i32 addrspace(1)* %7
+  br label %ENDIF
+
+ENDIF:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
+
+; Test a CND*_INT instruction with float true/false values
+; CHECK: @test_c
+; CHECK: CND{{[GTE]+}}_INT
+define void @test_c(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = icmp sgt i32 %in, 0
+  %1 = select i1 %0, float 2.0, float 3.0
+  store float %1, float addrspace(1)* %out
+  ret void
+}