80 files changed, 2534 insertions, 1261 deletions
diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
index 114f9e7..5c14270 100644
--- a/test/CodeGen/R600/128bit-kernel-args.ll
+++ b/test/CodeGen/R600/128bit-kernel-args.ll
@@ -1,16 +1,26 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; CHECK: @v4i32_kernel_arg
-; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
 
+; R600-CHECK: @v4i32_kernel_arg
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
+; SI-CHECK: @v4i32_kernel_arg
+; SI-CHECK: BUFFER_STORE_DWORDX4
 define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32>  %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; CHECK: @v4f32_kernel_arg
-; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40
+; R600-CHECK: @v4f32_kernel_arg
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
+; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
+; SI-CHECK: @v4f32_kernel_arg
+; SI-CHECK: BUFFER_STORE_DWORDX4
 define void @v4f32_kernel_args(<4 x float> addrspace(1)* %out, <4 x float>  %in) {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
new file mode 100644
index 0000000..34a0a87
--- /dev/null
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s --check-prefix=SI-CHECK
+
+; SI-CHECK: @f64_kernel_arg
+; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 9
+; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 11
+; SI-CHECK: BUFFER_STORE_DWORDX2
+define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
+entry:
+  store double %in, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 185998b..16f7f97 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -1,11 +1,36 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: ADD_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: ADD_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: ADD_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @test2
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+;SI-CHECK: @test2
+;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = add <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test4
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test4
+;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/alu-split.ll b/test/CodeGen/R600/alu-split.ll
deleted file mode 100644
index 48496f6..0000000
--- a/test/CodeGen/R600/alu-split.ll
+++ /dev/null
@@ -1,851 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: ALU
-;CHECK: ALU
-;CHECK: ALU
-;CHECK-NOT: ALU
-;CHECK: CF_END
-
-define void @main() #0 {
-main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
-  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
-  %5 = extractelement <4 x float> %4, i32 0
-  %6 = fcmp une float 0x4016F2B020000000, %5
-  %7 = select i1 %6, float 1.000000e+00, float 0.000000e+00
-  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
-  %9 = extractelement <4 x float> %8, i32 1
-  %10 = fcmp une float 0x401FDCC640000000, %9
-  %11 = select i1 %10, float 1.000000e+00, float 0.000000e+00
-  %12 = fsub float -0.000000e+00, %7
-  %13 = fptosi float %12 to i32
-  %14 = fsub float -0.000000e+00, %11
-  %15 = fptosi float %14 to i32
-  %16 = bitcast i32 %13 to float
-  %17 = bitcast i32 %15 to float
-  %18 = bitcast float %16 to i32
-  %19 = bitcast float %17 to i32
-  %20 = or i32 %18, %19
-  %21 = bitcast i32 %20 to float
-  %22 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 17)
-  %23 = extractelement <4 x float> %22, i32 0
-  %24 = fcmp une float 0xC00574BC60000000, %23
-  %25 = select i1 %24, float 1.000000e+00, float 0.000000e+00
-  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 17)
-  %27 = extractelement <4 x float> %26, i32 1
-  %28 = fcmp une float 0x40210068E0000000, %27
-  %29 = select i1 %28, float 1.000000e+00, float 0.000000e+00
-  %30 = fsub float -0.000000e+00, %25
-  %31 = fptosi float %30 to i32
-  %32 = fsub float -0.000000e+00, %29
-  %33 = fptosi float %32 to i32
-  %34 = bitcast i32 %31 to float
-  %35 = bitcast i32 %33 to float
-  %36 = bitcast float %34 to i32
-  %37 = bitcast float %35 to i32
-  %38 = or i32 %36, %37
-  %39 = bitcast i32 %38 to float
-  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 18)
-  %41 = extractelement <4 x float> %40, i32 0
-  %42 = fcmp une float 0xBFC9A6B500000000, %41
-  %43 = select i1 %42, float 1.000000e+00, float 0.000000e+00
-  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 18)
-  %45 = extractelement <4 x float> %44, i32 1
-  %46 = fcmp une float 0xC0119BDA60000000, %45
-  %47 = select i1 %46, float 1.000000e+00, float 0.000000e+00
-  %48 = fsub float -0.000000e+00, %43
-  %49 = fptosi float %48 to i32
-  %50 = fsub float -0.000000e+00, %47
-  %51 = fptosi float %50 to i32
-  %52 = bitcast i32 %49 to float
-  %53 = bitcast i32 %51 to float
-  %54 = bitcast float %52 to i32
-  %55 = bitcast float %53 to i32
-  %56 = or i32 %54, %55
-  %57 = bitcast i32 %56 to float
-  %58 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 19)
-  %59 = extractelement <4 x float> %58, i32 0
-  %60 = fcmp une float 0xC02085D640000000, %59
-  %61 = select i1 %60, float 1.000000e+00, float 0.000000e+00
-  %62 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 19)
-  %63 = extractelement <4 x float> %62, i32 1
-  %64 = fcmp une float 0xBFD7C1BDA0000000, %63
-  %65 = select i1 %64, float 1.000000e+00, float 0.000000e+00
-  %66 = fsub float -0.000000e+00, %61
-  %67 = fptosi float %66 to i32
-  %68 = fsub float -0.000000e+00, %65
-  %69 = fptosi float %68 to i32
-  %70 = bitcast i32 %67 to float
-  %71 = bitcast i32 %69 to float
-  %72 = bitcast float %70 to i32
-  %73 = bitcast float %71 to i32
-  %74 = or i32 %72, %73
-  %75 = bitcast i32 %74 to float
-  %76 = insertelement <4 x float> undef, float %21, i32 0
-  %77 = insertelement <4 x float> %76, float %39, i32 1
-  %78 = insertelement <4 x float> %77, float %57, i32 2
-  %79 = insertelement <4 x float> %78, float %75, i32 3
-  %80 = insertelement <4 x float> undef, float %21, i32 0
-  %81 = insertelement <4 x float> %80, float %39, i32 1
-  %82 = insertelement <4 x float> %81, float %57, i32 2
-  %83 = insertelement <4 x float> %82, float %75, i32 3
-  %84 = call float @llvm.AMDGPU.dp4(<4 x float> %79, <4 x float> %83)
-  %85 = bitcast float %84 to i32
-  %86 = icmp ne i32 %85, 0
-  %87 = sext i1 %86 to i32
-  %88 = bitcast i32 %87 to float
-  %89 = bitcast float %88 to i32
-  %90 = xor i32 %89, -1
-  %91 = bitcast i32 %90 to float
-  %92 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 20)
-  %93 = extractelement <4 x float> %92, i32 0
-  %94 = fcmp une float 0x401FDCC640000000, %93
-  %95 = select i1 %94, float 1.000000e+00, float 0.000000e+00
-  %96 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 20)
-  %97 = extractelement <4 x float> %96, i32 1
-  %98 = fcmp une float 0xC00574BC60000000, %97
-  %99 = select i1 %98, float 1.000000e+00, float 0.000000e+00
-  %100 = fsub float -0.000000e+00, %95
-  %101 = fptosi float %100 to i32
-  %102 = fsub float -0.000000e+00, %99
-  %103 = fptosi float %102 to i32
-  %104 = bitcast i32 %101 to float
-  %105 = bitcast i32 %103 to float
-  %106 = bitcast float %104 to i32
-  %107 = bitcast float %105 to i32
-  %108 = or i32 %106, %107
-  %109 = bitcast i32 %108 to float
-  %110 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 21)
-  %111 = extractelement <4 x float> %110, i32 0
-  %112 = fcmp une float 0x40210068E0000000, %111
-  %113 = select i1 %112, float 1.000000e+00, float 0.000000e+00
-  %114 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 21)
-  %115 = extractelement <4 x float> %114, i32 1
-  %116 = fcmp une float 0xBFC9A6B500000000, %115
-  %117 = select i1 %116, float 1.000000e+00, float 0.000000e+00
-  %118 = fsub float -0.000000e+00, %113
-  %119 = fptosi float %118 to i32
-  %120 = fsub float -0.000000e+00, %117
-  %121 = fptosi float %120 to i32
-  %122 = bitcast i32 %119 to float
-  %123 = bitcast i32 %121 to float
-  %124 = bitcast float %122 to i32
-  %125 = bitcast float %123 to i32
-  %126 = or i32 %124, %125
-  %127 = bitcast i32 %126 to float
-  %128 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 22)
-  %129 = extractelement <4 x float> %128, i32 0
-  %130 = fcmp une float 0xC0119BDA60000000, %129
-  %131 = select i1 %130, float 1.000000e+00, float 0.000000e+00
-  %132 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 22)
-  %133 = extractelement <4 x float> %132, i32 1
-  %134 = fcmp une float 0xC02085D640000000, %133
-  %135 = select i1 %134, float 1.000000e+00, float 0.000000e+00
-  %136 = fsub float -0.000000e+00, %131
-  %137 = fptosi float %136 to i32
-  %138 = fsub float -0.000000e+00, %135
-  %139 = fptosi float %138 to i32
-  %140 = bitcast i32 %137 to float
-  %141 = bitcast i32 %139 to float
-  %142 = bitcast float %140 to i32
-  %143 = bitcast float %141 to i32
-  %144 = or i32 %142, %143
-  %145 = bitcast i32 %144 to float
-  %146 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
-  %147 = extractelement <4 x float> %146, i32 0
-  %148 = fcmp une float 0xBFD7C1BDA0000000, %147
-  %149 = select i1 %148, float 1.000000e+00, float 0.000000e+00
-  %150 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
-  %151 = extractelement <4 x float> %150, i32 1
-  %152 = fcmp une float 0x401E1D7DC0000000, %151
-  %153 = select i1 %152, float 1.000000e+00, float 0.000000e+00
-  %154 = fsub float -0.000000e+00, %149
-  %155 = fptosi float %154 to i32
-  %156 = fsub float -0.000000e+00, %153
-  %157 = fptosi float %156 to i32
-  %158 = bitcast i32 %155 to float
-  %159 = bitcast i32 %157 to float
-  %160 = bitcast float %158 to i32
-  %161 = bitcast float %159 to i32
-  %162 = or i32 %160, %161
-  %163 = bitcast i32 %162 to float
-  %164 = insertelement <4 x float> undef, float %109, i32 0
-  %165 = insertelement <4 x float> %164, float %127, i32 1
-  %166 = insertelement <4 x float> %165, float %145, i32 2
-  %167 = insertelement <4 x float> %166, float %163, i32 3
-  %168 = insertelement <4 x float> undef, float %109, i32 0
-  %169 = insertelement <4 x float> %168, float %127, i32 1
-  %170 = insertelement <4 x float> %169, float %145, i32 2
-  %171 = insertelement <4 x float> %170, float %163, i32 3
-  %172 = call float @llvm.AMDGPU.dp4(<4 x float> %167, <4 x float> %171)
-  %173 = bitcast float %172 to i32
-  %174 = icmp ne i32 %173, 0
-  %175 = sext i1 %174 to i32
-  %176 = bitcast i32 %175 to float
-  %177 = bitcast float %176 to i32
-  %178 = xor i32 %177, -1
-  %179 = bitcast i32 %178 to float
-  %180 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %181 = extractelement <4 x float> %180, i32 0
-  %182 = fcmp une float 0x401FDCC640000000, %181
-  %183 = select i1 %182, float 1.000000e+00, float 0.000000e+00
-  %184 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %185 = extractelement <4 x float> %184, i32 1
-  %186 = fcmp une float 0xC00574BC60000000, %185
-  %187 = select i1 %186, float 1.000000e+00, float 0.000000e+00
-  %188 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %189 = extractelement <4 x float> %188, i32 2
-  %190 = fcmp une float 0x40210068E0000000, %189
-  %191 = select i1 %190, float 1.000000e+00, float 0.000000e+00
-  %192 = fsub float -0.000000e+00, %183
-  %193 = fptosi float %192 to i32
-  %194 = fsub float -0.000000e+00, %187
-  %195 = fptosi float %194 to i32
-  %196 = fsub float -0.000000e+00, %191
-  %197 = fptosi float %196 to i32
-  %198 = bitcast i32 %193 to float
-  %199 = bitcast i32 %195 to float
-  %200 = bitcast i32 %197 to float
-  %201 = bitcast float %199 to i32
-  %202 = bitcast float %200 to i32
-  %203 = or i32 %201, %202
-  %204 = bitcast i32 %203 to float
-  %205 = bitcast float %198 to i32
-  %206 = bitcast float %204 to i32
-  %207 = or i32 %205, %206
-  %208 = bitcast i32 %207 to float
-  %209 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %210 = extractelement <4 x float> %209, i32 0
-  %211 = fcmp une float 0xBFC9A6B500000000, %210
-  %212 = select i1 %211, float 1.000000e+00, float 0.000000e+00
-  %213 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %214 = extractelement <4 x float> %213, i32 1
-  %215 = fcmp une float 0xC0119BDA60000000, %214
-  %216 = select i1 %215, float 1.000000e+00, float 0.000000e+00
-  %217 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %218 = extractelement <4 x float> %217, i32 2
-  %219 = fcmp une float 0xC02085D640000000, %218
-  %220 = select i1 %219, float 1.000000e+00, float 0.000000e+00
-  %221 = fsub float -0.000000e+00, %212
-  %222 = fptosi float %221 to i32
-  %223 = fsub float -0.000000e+00, %216
-  %224 = fptosi float %223 to i32
-  %225 = fsub float -0.000000e+00, %220
-  %226 = fptosi float %225 to i32
-  %227 = bitcast i32 %222 to float
-  %228 = bitcast i32 %224 to float
-  %229 = bitcast i32 %226 to float
-  %230 = bitcast float %228 to i32
-  %231 = bitcast float %229 to i32
-  %232 = or i32 %230, %231
-  %233 = bitcast i32 %232 to float
-  %234 = bitcast float %227 to i32
-  %235 = bitcast float %233 to i32
-  %236 = or i32 %234, %235
-  %237 = bitcast i32 %236 to float
-  %238 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
-  %239 = extractelement <4 x float> %238, i32 0
-  %240 = fcmp une float 0xBFD7C1BDA0000000, %239
-  %241 = select i1 %240, float 1.000000e+00, float 0.000000e+00
-  %242 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
-  %243 = extractelement <4 x float> %242, i32 1
-  %244 = fcmp une float 0x401E1D7DC0000000, %243
-  %245 = select i1 %244, float 1.000000e+00, float 0.000000e+00
-  %246 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
-  %247 = extractelement <4 x float> %246, i32 2
-  %248 = fcmp une float 0xC019893740000000, %247
-  %249 = select i1 %248, float 1.000000e+00, float 0.000000e+00
-  %250 = fsub float -0.000000e+00, %241
-  %251 = fptosi float %250 to i32
-  %252 = fsub float -0.000000e+00, %245
-  %253 = fptosi float %252 to i32
-  %254 = fsub float -0.000000e+00, %249
-  %255 = fptosi float %254 to i32
-  %256 = bitcast i32 %251 to float
-  %257 = bitcast i32 %253 to float
-  %258 = bitcast i32 %255 to float
-  %259 = bitcast float %257 to i32
-  %260 = bitcast float %258 to i32
-  %261 = or i32 %259, %260
-  %262 = bitcast i32 %261 to float
-  %263 = bitcast float %256 to i32
-  %264 = bitcast float %262 to i32
-  %265 = or i32 %263, %264
-  %266 = bitcast i32 %265 to float
-  %267 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
-  %268 = extractelement <4 x float> %267, i32 0
-  %269 = fcmp une float 0x40220F0D80000000, %268
-  %270 = select i1 %269, float 1.000000e+00, float 0.000000e+00
-  %271 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
-  %272 = extractelement <4 x float> %271, i32 1
-  %273 = fcmp une float 0xC018E2EB20000000, %272
-  %274 = select i1 %273, float 1.000000e+00, float 0.000000e+00
-  %275 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
-  %276 = extractelement <4 x float> %275, i32 2
-  %277 = fcmp une float 0xBFEA8DB8C0000000, %276
-  %278 = select i1 %277, float 1.000000e+00, float 0.000000e+00
-  %279 = fsub float -0.000000e+00, %270
-  %280 = fptosi float %279 to i32
-  %281 = fsub float -0.000000e+00, %274
-  %282 = fptosi float %281 to i32
-  %283 = fsub float -0.000000e+00, %278
-  %284 = fptosi float %283 to i32
-  %285 = bitcast i32 %280 to float
-  %286 = bitcast i32 %282 to float
-  %287 = bitcast i32 %284 to float
-  %288 = bitcast float %286 to i32
-  %289 = bitcast float %287 to i32
-  %290 = or i32 %288, %289
-  %291 = bitcast i32 %290 to float
-  %292 = bitcast float %285 to i32
-  %293 = bitcast float %291 to i32
-  %294 = or i32 %292, %293
-  %295 = bitcast i32 %294 to float
-  %296 = insertelement <4 x float> undef, float %208, i32 0
-  %297 = insertelement <4 x float> %296, float %237, i32 1
-  %298 = insertelement <4 x float> %297, float %266, i32 2
-  %299 = insertelement <4 x float> %298, float %295, i32 3
-  %300 = insertelement <4 x float> undef, float %208, i32 0
-  %301 = insertelement <4 x float> %300, float %237, i32 1
-  %302 = insertelement <4 x float> %301, float %266, i32 2
-  %303 = insertelement <4 x float> %302, float %295, i32 3
-  %304 = call float @llvm.AMDGPU.dp4(<4 x float> %299, <4 x float> %303)
-  %305 = bitcast float %304 to i32
-  %306 = icmp ne i32 %305, 0
-  %307 = sext i1 %306 to i32
-  %308 = bitcast i32 %307 to float
-  %309 = bitcast float %308 to i32
-  %310 = xor i32 %309, -1
-  %311 = bitcast i32 %310 to float
-  %312 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
-  %313 = extractelement <4 x float> %312, i32 0
-  %314 = fcmp une float 0xC00574BC60000000, %313
-  %315 = select i1 %314, float 1.000000e+00, float 0.000000e+00
-  %316 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
-  %317 = extractelement <4 x float> %316, i32 1
-  %318 = fcmp une float 0x40210068E0000000, %317
-  %319 = select i1 %318, float 1.000000e+00, float 0.000000e+00
-  %320 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
-  %321 = extractelement <4 x float> %320, i32 2
-  %322 = fcmp une float 0xBFC9A6B500000000, %321
-  %323 = select i1 %322, float 1.000000e+00, float 0.000000e+00
-  %324 = fsub float -0.000000e+00, %315
-  %325 = fptosi float %324 to i32
-  %326 = fsub float -0.000000e+00, %319
-  %327 = fptosi float %326 to i32
-  %328 = fsub float -0.000000e+00, %323
-  %329 = fptosi float %328 to i32
-  %330 = bitcast i32 %325 to float
-  %331 = bitcast i32 %327 to float
-  %332 = bitcast i32 %329 to float
-  %333 = bitcast float %331 to i32
-  %334 = bitcast float %332 to i32
-  %335 = or i32 %333, %334
-  %336 = bitcast i32 %335 to float
-  %337 = bitcast float %330 to i32
-  %338 = bitcast float %336 to i32
-  %339 = or i32 %337, %338
-  %340 = bitcast i32 %339 to float
-  %341 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
-  %342 = extractelement <4 x float> %341, i32 0
-  %343 = fcmp une float 0xC0119BDA60000000, %342
-  %344 = select i1 %343, float 1.000000e+00, float 0.000000e+00
-  %345 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
-  %346 = extractelement <4 x float> %345, i32 1
-  %347 = fcmp une float 0xC02085D640000000, %346
-  %348 = select i1 %347, float 1.000000e+00, float 0.000000e+00
-  %349 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13)
-  %350 = extractelement <4 x float> %349, i32 2
-  %351 = fcmp une float 0xBFD7C1BDA0000000, %350
-  %352 = select i1 %351, float 1.000000e+00, float 0.000000e+00
-  %353 = fsub float -0.000000e+00, %344
-  %354 = fptosi float %353 to i32
-  %355 = fsub float -0.000000e+00, %348
-  %356 = fptosi float %355 to i32
-  %357 = fsub float -0.000000e+00, %352
-  %358 = fptosi float %357 to i32
-  %359 = bitcast i32 %354 to float
-  %360 = bitcast i32 %356 to float
-  %361 = bitcast i32 %358 to float
-  %362 = bitcast float %360 to i32
-  %363 = bitcast float %361 to i32
-  %364 = or i32 %362, %363
-  %365 = bitcast i32 %364 to float
-  %366 = bitcast float %359 to i32
-  %367 = bitcast float %365 to i32
-  %368 = or i32 %366, %367
-  %369 = bitcast i32 %368 to float
-  %370 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
-  %371 = extractelement <4 x float> %370, i32 0
-  %372 = fcmp une float 0x401E1D7DC0000000, %371
-  %373 = select i1 %372, float 1.000000e+00, float 0.000000e+00
-  %374 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
-  %375 = extractelement <4 x float> %374, i32 1
-  %376 = fcmp une float 0xC019893740000000, %375
-  %377 = select i1 %376, float 1.000000e+00, float 0.000000e+00
-  %378 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
-  %379 = extractelement <4 x float> %378, i32 2
-  %380 = fcmp une float 0x40220F0D80000000, %379
-  %381 = select i1 %380, float 1.000000e+00, float 0.000000e+00
-  %382 = fsub float -0.000000e+00, %373
-  %383 = fptosi float %382 to i32
-  %384 = fsub float -0.000000e+00, %377
-  %385 = fptosi float %384 to i32
-  %386 = fsub float -0.000000e+00, %381
-  %387 = fptosi float %386 to i32
-  %388 = bitcast i32 %383 to float
-  %389 = bitcast i32 %385 to float
-  %390 = bitcast i32 %387 to float
-  %391 = bitcast float %389 to i32
-  %392 = bitcast float %390 to i32
-  %393 = or i32 %391, %392
-  %394 = bitcast i32 %393 to float
-  %395 = bitcast float %388 to i32
-  %396 = bitcast float %394 to i32
-  %397 = or i32 %395, %396
-  %398 = bitcast i32 %397 to float
-  %399 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
-  %400 = extractelement <4 x float> %399, i32 0
-  %401 = fcmp une float 0xC018E2EB20000000, %400
-  %402 = select i1 %401, float 1.000000e+00, float 0.000000e+00
-  %403 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
-  %404 = extractelement <4 x float> %403, i32 1
-  %405 = fcmp une float 0xBFEA8DB8C0000000, %404
-  %406 = select i1 %405, float 1.000000e+00, float 0.000000e+00
-  %407 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15)
-  %408 = extractelement <4 x float> %407, i32 2
-  %409 = fcmp une float 0x4015236E20000000, %408
-  %410 = select i1 %409, float 1.000000e+00, float 0.000000e+00
-  %411 = fsub float -0.000000e+00, %402
-  %412 = fptosi float %411 to i32
-  %413 = fsub float -0.000000e+00, %406
-  %414 = fptosi float %413 to i32
-  %415 = fsub float -0.000000e+00, %410
-  %416 = fptosi float %415 to i32
-  %417 = bitcast i32 %412 to float
-  %418 = bitcast i32 %414 to float
-  %419 = bitcast i32 %416 to float
-  %420 = bitcast float %418 to i32
-  %421 = bitcast float %419 to i32
-  %422 = or i32 %420, %421
-  %423 = bitcast i32 %422 to float
-  %424 = bitcast float %417 to i32
-  %425 = bitcast float %423 to i32
-  %426 = or i32 %424, %425
-  %427 = bitcast i32 %426 to float
-  %428 = insertelement <4 x float> undef, float %340, i32 0
-  %429 = insertelement <4 x float> %428, float %369, i32 1
-  %430 = insertelement <4 x float> %429, float %398, i32 2
-  %431 = insertelement <4 x float> %430, float %427, i32 3
-  %432 = insertelement <4 x float> undef, float %340, i32 0
-  %433 = insertelement <4 x float> %432, float %369, i32 1
-  %434 = insertelement <4 x float> %433, float %398, i32 2
-  %435 = insertelement <4 x float> %434, float %427, i32 3
-  %436 = call float @llvm.AMDGPU.dp4(<4 x float> %431, <4 x float> %435)
-  %437 = bitcast float %436 to i32
-  %438 = icmp ne i32 %437, 0
-  %439 = sext i1 %438 to i32
-  %440 = bitcast i32 %439 to float
-  %441 = bitcast float %440 to i32
-  %442 = xor i32 %441, -1
-  %443 = bitcast i32 %442 to float
-  %444 = load <4 x float> addrspace(8)* null
-  %445 = extractelement <4 x float> %444, i32 0
-  %446 = fcmp une float 0xC00574BC60000000, %445
-  %447 = select i1 %446, float 1.000000e+00, float 0.000000e+00
-  %448 = load <4 x float> addrspace(8)* null
-  %449 = extractelement <4 x float> %448, i32 1
-  %450 = fcmp une float 0x40210068E0000000, %449
-  %451 = select i1 %450, float 1.000000e+00, float 0.000000e+00
-  %452 = load <4 x float> addrspace(8)* null
-  %453 = extractelement <4 x float> %452, i32 2
-  %454 = fcmp une float 0xBFC9A6B500000000, %453
-  %455 = select i1 %454, float 1.000000e+00, float 0.000000e+00
-  %456 = load <4 x float> addrspace(8)* null
-  %457 = extractelement <4 x float> %456, i32 3
-  %458 = fcmp une float 0xC0119BDA60000000, %457
-  %459 = select i1 %458, float 1.000000e+00, float 0.000000e+00
-  %460 = fsub float -0.000000e+00, %447
-  %461 = fptosi float %460 to i32
-  %462 = fsub float -0.000000e+00, %451
-  %463 = fptosi float %462 to i32
-  %464 = fsub float -0.000000e+00, %455
-  %465 = fptosi float %464 to i32
-  %466 = fsub float -0.000000e+00, %459
-  %467 = fptosi float %466 to i32
-  %468 = bitcast i32 %461 to float
-  %469 = bitcast i32 %463 to float
-  %470 = bitcast i32 %465 to float
-  %471 = bitcast i32 %467 to float
-  %472 = bitcast float %468 to i32
-  %473 = bitcast float %469 to i32
-  %474 = or i32 %472, %473
-  %475 = bitcast i32 %474 to float
-  %476 = bitcast float %470 to i32
-  %477 = bitcast float %471 to i32
-  %478 = or i32 %476, %477
-  %479 = bitcast i32 %478 to float
-  %480 = bitcast float %475 to i32
-  %481 = bitcast float %479 to i32
-  %482 = or i32 %480, %481
-  %483 = bitcast i32 %482 to float
-  %484 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %485 = extractelement <4 x float> %484, i32 0
-  %486 = fcmp une float 0xC02085D640000000, %485
-  %487 = select i1 %486, float 1.000000e+00, float 0.000000e+00
-  %488 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %489 = extractelement <4 x float> %488, i32 1
-  %490 = fcmp une float 0xBFD7C1BDA0000000, %489
-  %491 = select i1 %490, float 1.000000e+00, float 0.000000e+00
-  %492 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %493 = extractelement <4 x float> %492, i32 2
-  %494 = fcmp une float 0x401E1D7DC0000000, %493
-  %495 = select i1 %494, float 1.000000e+00, float 0.000000e+00
-  %496 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %497 = extractelement <4 x float> %496, i32 3
-  %498 = fcmp une float 0xC019893740000000, %497
-  %499 = select i1 %498, float 1.000000e+00, float 0.000000e+00
-  %500 = fsub float -0.000000e+00, %487
-  %501 = fptosi float %500 to i32
-  %502 = fsub float -0.000000e+00, %491
-  %503 = fptosi float %502 to i32
-  %504 = fsub float -0.000000e+00, %495
-  %505 = fptosi float %504 to i32
-  %506 = fsub float -0.000000e+00, %499
-  %507 = fptosi float %506 to i32
-  %508 = bitcast i32 %501 to float
-  %509 = bitcast i32 %503 to float
-  %510 = bitcast i32 %505 to float
-  %511 = bitcast i32 %507 to float
-  %512 = bitcast float %508 to i32
-  %513 = bitcast float %509 to i32
-  %514 = or i32 %512, %513
-  %515 = bitcast i32 %514 to float
-  %516 = bitcast float %510 to i32
-  %517 = bitcast float %511 to i32
-  %518 = or i32 %516, %517
-  %519 = bitcast i32 %518 to float
-  %520 = bitcast float %515 to i32
-  %521 = bitcast float %519 to i32
-  %522 = or i32 %520, %521
-  %523 = bitcast i32 %522 to float
-  %524 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %525 = extractelement <4 x float> %524, i32 0
-  %526 = fcmp une float 0x40220F0D80000000, %525
-  %527 = select i1 %526, float 1.000000e+00, float 0.000000e+00
-  %528 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %529 = extractelement <4 x float> %528, i32 1
-  %530 = fcmp une float 0xC018E2EB20000000, %529
-  %531 = select i1 %530, float 1.000000e+00, float 0.000000e+00
-  %532 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %533 = extractelement <4 x float> %532, i32 2
-  %534 = fcmp une float 0xBFEA8DB8C0000000, %533
-  %535 = select i1 %534, float 1.000000e+00, float 0.000000e+00
-  %536 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %537 = extractelement <4 x float> %536, i32 3
-  %538 = fcmp une float 0x4015236E20000000, %537
-  %539 = select i1 %538, float 1.000000e+00, float 0.000000e+00
-  %540 = fsub float -0.000000e+00, %527
-  %541 = fptosi float %540 to i32
-  %542 = fsub float -0.000000e+00, %531
-  %543 = fptosi float %542 to i32
-  %544 = fsub float -0.000000e+00, %535
-  %545 = fptosi float %544 to i32
-  %546 = fsub float -0.000000e+00, %539
-  %547 = fptosi float %546 to i32
-  %548 = bitcast i32 %541 to float
-  %549 = bitcast i32 %543 to float
-  %550 = bitcast i32 %545 to float
-  %551 = bitcast i32 %547 to float
-  %552 = bitcast float %548 to i32
-  %553 = bitcast float %549 to i32
-  %554 = or i32 %552, %553
-  %555 = bitcast i32 %554 to float
-  %556 = bitcast float %550 to i32
-  %557 = bitcast float %551 to i32
-  %558 = or i32 %556, %557
-  %559 = bitcast i32 %558 to float
-  %560 = bitcast float %555 to i32
-  %561 = bitcast float %559 to i32
-  %562 = or i32 %560, %561
-  %563 = bitcast i32 %562 to float
-  %564 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %565 = extractelement <4 x float> %564, i32 0
-  %566 = fcmp une float 0x4016ED5D00000000, %565
-  %567 = select i1 %566, float 1.000000e+00, float 0.000000e+00
-  %568 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %569 = extractelement <4 x float> %568, i32 1
-  %570 = fcmp une float 0x402332FEC0000000, %569
-  %571 = select i1 %570, float 1.000000e+00, float 0.000000e+00
-  %572 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %573 = extractelement <4 x float> %572, i32 2
-  %574 = fcmp une float 0xC01484B5E0000000, %573
-  %575 = select i1 %574, float 1.000000e+00, float 0.000000e+00
-  %576 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %577 = extractelement <4 x float> %576, i32 3
-  %578 = fcmp une float 0x400179A6C0000000, %577
-  %579 = select i1 %578, float 1.000000e+00, float 0.000000e+00
-  %580 = fsub float -0.000000e+00, %567
-  %581 = fptosi float %580 to i32
-  %582 = fsub float -0.000000e+00, %571
-  %583 = fptosi float %582 to i32
-  %584 = fsub float -0.000000e+00, %575
-  %585 = fptosi float %584 to i32
-  %586 = fsub float -0.000000e+00, %579
-  %587 = fptosi float %586 to i32
-  %588 = bitcast i32 %581 to float
-  %589 = bitcast i32 %583 to float
-  %590 = bitcast i32 %585 to float
-  %591 = bitcast i32 %587 to float
-  %592 = bitcast float %588 to i32
-  %593 = bitcast float %589 to i32
-  %594 = or i32 %592, %593
-  %595 = bitcast i32 %594 to float
-  %596 = bitcast float %590 to i32
-  %597 = bitcast float %591 to i32
-  %598 = or i32 %596, %597
-  %599 = bitcast i32 %598 to float
-  %600 = bitcast float %595 to i32
-  %601 = bitcast float %599 to i32
-  %602 = or i32 %600, %601
-  %603 = bitcast i32 %602 to float
-  %604 = insertelement <4 x float> undef, float %483, i32 0
-  %605 = insertelement <4 x float> %604, float %523, i32 1
-  %606 = insertelement <4 x float> %605, float %563, i32 2
-  %607 = insertelement <4 x float> %606, float %603, i32 3
-  %608 = insertelement <4 x float> undef, float %483, i32 0
-  %609 = insertelement <4 x float> %608, float %523, i32 1
-  %610 = insertelement <4 x float> %609, float %563, i32 2
-  %611 = insertelement <4 x float> %610, float %603, i32 3
-  %612 = call float @llvm.AMDGPU.dp4(<4 x float> %607, <4 x float> %611)
-  %613 = bitcast float %612 to i32
-  %614 = icmp ne i32 %613, 0
-  %615 = sext i1 %614 to i32
-  %616 = bitcast i32 %615 to float
-  %617 = bitcast float %616 to i32
-  %618 = xor i32 %617, -1
-  %619 = bitcast i32 %618 to float
-  %620 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %621 = extractelement <4 x float> %620, i32 0
-  %622 = fcmp une float 0x40210068E0000000, %621
-  %623 = select i1 %622, float 1.000000e+00, float 0.000000e+00
-  %624 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %625 = extractelement <4 x float> %624, i32 1
-  %626 = fcmp une float 0xBFC9A6B500000000, %625
-  %627 = select i1 %626, float 1.000000e+00, float 0.000000e+00
-  %628 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %629 = extractelement <4 x float> %628, i32 2
-  %630 = fcmp une float 0xC0119BDA60000000, %629
-  %631 = select i1 %630, float 1.000000e+00, float 0.000000e+00
-  %632 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %633 = extractelement <4 x float> %632, i32 3
-  %634 = fcmp une float 0xC02085D640000000, %633
-  %635 = select i1 %634, float 1.000000e+00, float 0.000000e+00
-  %636 = fsub float -0.000000e+00, %623
-  %637 = fptosi float %636 to i32
-  %638 = fsub float -0.000000e+00, %627
-  %639 = fptosi float %638 to i32
-  %640 = fsub float -0.000000e+00, %631
-  %641 = fptosi float %640 to i32
-  %642 = fsub float -0.000000e+00, %635
-  %643 = fptosi float %642 to i32
-  %644 = bitcast i32 %637 to float
-  %645 = bitcast i32 %639 to float
-  %646 = bitcast i32 %641 to float
-  %647 = bitcast i32 %643 to float
-  %648 = bitcast float %644 to i32
-  %649 = bitcast float %645 to i32
-  %650 = or i32 %648, %649
-  %651 = bitcast i32 %650 to float
-  %652 = bitcast float %646 to i32
-  %653 = bitcast float %647 to i32
-  %654 = or i32 %652, %653
-  %655 = bitcast i32 %654 to float
-  %656 = bitcast float %651 to i32
-  %657 = bitcast float %655 to i32
-  %658 = or i32 %656, %657
-  %659 = bitcast i32 %658 to float
-  %660 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %661 = extractelement <4 x float> %660, i32 0
-  %662 = fcmp une float 0xBFD7C1BDA0000000, %661
-  %663 = select i1 %662, float 1.000000e+00, float 0.000000e+00
-  %664 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %665 = extractelement <4 x float> %664, i32 1
-  %666 = fcmp une float 0x401E1D7DC0000000, %665
-  %667 = select i1 %666, float 1.000000e+00, float 0.000000e+00
-  %668 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %669 = extractelement <4 x float> %668, i32 2
-  %670 = fcmp une float 0xC019893740000000, %669
-  %671 = select i1 %670, float 1.000000e+00, float 0.000000e+00
-  %672 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %673 = extractelement <4 x float> %672, i32 3
-  %674 = fcmp une float 0x40220F0D80000000, %673
-  %675 = select i1 %674, float 1.000000e+00, float 0.000000e+00
-  %676 = fsub float -0.000000e+00, %663
-  %677 = fptosi float %676 to i32
-  %678 = fsub float -0.000000e+00, %667
-  %679 = fptosi float %678 to i32
-  %680 = fsub float -0.000000e+00, %671
-  %681 = fptosi float %680 to i32
-  %682 = fsub float -0.000000e+00, %675
-  %683 = fptosi float %682 to i32
-  %684 = bitcast i32 %677 to float
-  %685 = bitcast i32 %679 to float
-  %686 = bitcast i32 %681 to float
-  %687 = bitcast i32 %683 to float
-  %688 = bitcast float %684 to i32
-  %689 = bitcast float %685 to i32
-  %690 = or i32 %688, %689
-  %691 = bitcast i32 %690 to float
-  %692 = bitcast float %686 to i32
-  %693 = bitcast float %687 to i32
-  %694 = or i32 %692, %693
-  %695 = bitcast i32 %694 to float
-  %696 = bitcast float %691 to i32
-  %697 = bitcast float %695 to i32
-  %698 = or i32 %696, %697
-  %699 = bitcast i32 %698 to float
-  %700 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
-  %701 = extractelement <4 x float> %700, i32 0
-  %702 = fcmp une float 0xC018E2EB20000000, %701
-  %703 = select i1 %702, float 1.000000e+00, float 0.000000e+00
-  %704 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
-  %705 = extractelement <4 x float> %704, i32 1
-  %706 = fcmp une float 0xBFEA8DB8C0000000, %705
-  %707 = select i1 %706, float 1.000000e+00, float 0.000000e+00
-  %708 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
-  %709 = extractelement <4 x float> %708, i32 2
-  %710 = fcmp une float 0x4015236E20000000, %709
-  %711 = select i1 %710, float 1.000000e+00, float 0.000000e+00
-  %712 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
-  %713 = extractelement <4 x float> %712, i32 3
-  %714 = fcmp une float 0x4016ED5D00000000, %713
-  %715 = select i1 %714, float 1.000000e+00, float 0.000000e+00
-  %716 = fsub float -0.000000e+00, %703
-  %717 = fptosi float %716 to i32
-  %718 = fsub float -0.000000e+00, %707
-  %719 = fptosi float %718 to i32
-  %720 = fsub float -0.000000e+00, %711
-  %721 = fptosi float %720 to i32
-  %722 = fsub float -0.000000e+00, %715
-  %723 = fptosi float %722 to i32
-  %724 = bitcast i32 %717 to float
-  %725 = bitcast i32 %719 to float
-  %726 = bitcast i32 %721 to float
-  %727 = bitcast i32 %723 to float
-  %728 = bitcast float %724 to i32
-  %729 = bitcast float %725 to i32
-  %730 = or i32 %728, %729
-  %731 = bitcast i32 %730 to float
-  %732 = bitcast float %726 to i32
-  %733 = bitcast float %727 to i32
-  %734 = or i32 %732, %733
-  %735 = bitcast i32 %734 to float
-  %736 = bitcast float %731 to i32
-  %737 = bitcast float %735 to i32
-  %738 = or i32 %736, %737
-  %739 = bitcast i32 %738 to float
-  %740 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %741 = extractelement <4 x float> %740, i32 0
-  %742 = fcmp une float 0x402332FEC0000000, %741
-  %743 = select i1 %742, float 1.000000e+00, float 0.000000e+00
-  %744 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %745 = extractelement <4 x float> %744, i32 1
-  %746 = fcmp une float 0xC01484B5E0000000, %745
-  %747 = select i1 %746, float 1.000000e+00, float 0.000000e+00
-  %748 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %749 = extractelement <4 x float> %748, i32 2
-  %750 = fcmp une float 0x400179A6C0000000, %749
-  %751 = select i1 %750, float 1.000000e+00, float 0.000000e+00
-  %752 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %753 = extractelement <4 x float> %752, i32 3
-  %754 = fcmp une float 0xBFEE752540000000, %753
-  %755 = select i1 %754, float 1.000000e+00, float 0.000000e+00
-  %756 = fsub float -0.000000e+00, %743
-  %757 = fptosi float %756 to i32
-  %758 = fsub float -0.000000e+00, %747
-  %759 = fptosi float %758 to i32
-  %760 = fsub float -0.000000e+00, %751
-  %761 = fptosi float %760 to i32
-  %762 = fsub float -0.000000e+00, %755
-  %763 = fptosi float %762 to i32
-  %764 = bitcast i32 %757 to float
-  %765 = bitcast i32 %759 to float
-  %766 = bitcast i32 %761 to float
-  %767 = bitcast i32 %763 to float
-  %768 = bitcast float %764 to i32
-  %769 = bitcast float %765 to i32
-  %770 = or i32 %768, %769
-  %771 = bitcast i32 %770 to float
-  %772 = bitcast float %766 to i32
-  %773 = bitcast float %767 to i32
-  %774 = or i32 %772, %773
-  %775 = bitcast i32 %774 to float
-  %776 = bitcast float %771 to i32
-  %777 = bitcast float %775 to i32
-  %778 = or i32 %776, %777
-  %779 = bitcast i32 %778 to float
-  %780 = insertelement <4 x float> undef, float %659, i32 0
-  %781 = insertelement <4 x float> %780, float %699, i32 1
-  %782 = insertelement <4 x float> %781, float %739, i32 2
-  %783 = insertelement <4 x float> %782, float %779, i32 3
-  %784 = insertelement <4 x float> undef, float %659, i32 0
-  %785 = insertelement <4 x float> %784, float %699, i32 1
-  %786 = insertelement <4 x float> %785, float %739, i32 2
-  %787 = insertelement <4 x float> %786, float %779, i32 3
-  %788 = call float @llvm.AMDGPU.dp4(<4 x float> %783, <4 x float> %787)
-  %789 = bitcast float %788 to i32
-  %790 = icmp ne i32 %789, 0
-  %791 = sext i1 %790 to i32
-  %792 = bitcast i32 %791 to float
-  %793 = bitcast float %792 to i32
-  %794 = xor i32 %793, -1
-  %795 = bitcast i32 %794 to float
-  %796 = bitcast float %91 to i32
-  %797 = bitcast float %179 to i32
-  %798 = and i32 %796, %797
-  %799 = bitcast i32 %798 to float
-  %800 = bitcast float %311 to i32
-  %801 = bitcast float %443 to i32
-  %802 = and i32 %800, %801
-  %803 = bitcast i32 %802 to float
-  %804 = bitcast float %799 to i32
-  %805 = bitcast float %803 to i32
-  %806 = and i32 %804, %805
-  %807 = bitcast i32 %806 to float
-  %808 = bitcast float %619 to i32
-  %809 = bitcast float %795 to i32
-  %810 = and i32 %808, %809
-  %811 = bitcast i32 %810 to float
-  %812 = bitcast float %807 to i32
-  %813 = bitcast float %811 to i32
-  %814 = and i32 %812, %813
-  %815 = bitcast i32 %814 to float
-  %816 = bitcast float %815 to i32
-  %817 = icmp ne i32 %816, 0
-  %. = select i1 %817, float 1.000000e+00, float 0.000000e+00
-  %.32 = select i1 %817, float 0.000000e+00, float 1.000000e+00
-  %818 = insertelement <4 x float> undef, float %0, i32 0
-  %819 = insertelement <4 x float> %818, float %1, i32 1
-  %820 = insertelement <4 x float> %819, float %2, i32 2
-  %821 = insertelement <4 x float> %820, float %3, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %821, i32 60, i32 1)
-  %822 = insertelement <4 x float> undef, float %.32, i32 0
-  %823 = insertelement <4 x float> %822, float %., i32 1
-  %824 = insertelement <4 x float> %823, float 0.000000e+00, i32 2
-  %825 = insertelement <4 x float> %824, float 1.000000e+00, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %825, i32 0, i32 2)
-  ret void
-}
-
-declare float @llvm.R600.load.input(i32) #1
-
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
-
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index 166af2d..44c21bd 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll
@@ -1,11 +1,36 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @test2
+;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+;SI-CHECK: @test2
+;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = and <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test4
+;EG-CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test4
+;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index a1bd09a..cdccdfa 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -36,9 +36,9 @@ entry:
 ; SHA-256 Ma function
 ; ((x & z) | (y & (x | z)))
 ; R600-CHECK: @bfi_sha256_ma
-; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: V_XOR_B32_e64 [[DST:VGPR[0-9]+]], {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}
+; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
+; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
+; SI-CHECK: V_XOR_B32_e64 [[DST:VGPR[0-9]+]], {{[SV]GPR[0-9]+, VGPR[0-9]+}}
 ; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}
 
 define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll
new file mode 100644
index 0000000..9b738a2
--- /dev/null
+++ b/test/CodeGen/R600/build_vector.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK: @build_vector2
+; R600-CHECK: MOV
+; R600-CHECK: MOV
+; R600-CHECK-NOT: MOV
+; SI-CHECK: @build_vector2
+; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
+; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
+; SI-CHECK: BUFFER_STORE_DWORDX2 [[X]]_[[Y]]
+define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+entry:
+  store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @build_vector4
+; R600-CHECK: MOV
+; R600-CHECK: MOV
+; R600-CHECK: MOV
+; R600-CHECK: MOV
+; R600-CHECK-NOT: MOV
+; SI-CHECK: @build_vector4
+; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
+; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
+; SI-CHECK-DAG: V_MOV_B32_e32 [[Z:VGPR[0-9]]], 7
+; SI-CHECK-DAG: V_MOV_B32_e32 [[W:VGPR[0-9]]], 8
+; SI-CHECK: BUFFER_STORE_DWORDX4 [[X]]_[[Y]]_[[Z]]_[[W]]
+define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+entry:
+  store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index 85f2882..78ffd57 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -1,16 +1,22 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
 
-;CHECK: MOV * T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}}
+; DAGCombiner will transform:
+; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
+; unless isFabsFree returns true
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @fabs( float %r0)
-   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
-   ret void
-}
-
-declare float @llvm.R600.load.input(i32) readnone
+; R600-CHECK: @fabs_free
+; R600-CHECK-NOT: AND
+; R600-CHECK: |PV.{{[XYZW]}}|
+; SI-CHECK: @fabs_free
+; SI-CHECK: V_ADD_F32_e64 VGPR{{[0-9]}}, SGPR{{[0-9]}}, 0, 1, 0, 0, 0
 
-declare void @llvm.AMDGPU.store.output(float, i32)
+define void @fabs_free(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = bitcast i32 %in to float
+  %1 = call float @fabs(float %0)
+  store float %1, float addrspace(1)* %out
+  ret void
+}
 
 declare float @fabs(float ) readnone
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 9a67232..97dbe44 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
+; CHECK: @fadd_v2f32
+; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fadd <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fadd_v4f32
 ; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll
new file mode 100644
index 0000000..130302f
--- /dev/null
+++ b/test/CodeGen/R600/fadd64.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; CHECK: @fadd_f64
+; CHECK: V_ADD_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                      double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fadd double %r0, %r1
+   store double %r2, double addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll
new file mode 100644
index 0000000..8f2513b
--- /dev/null
+++ b/test/CodeGen/R600/fcmp64.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; CHECK: @flt_f64
+; CHECK: V_CMP_LT_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fcmp ult double %r0, %r1
+   %r3 = select i1 %r2, double %r0, double %r1
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+; CHECK: @fle_f64
+; CHECK: V_CMP_LE_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fcmp ule double %r0, %r1
+   %r3 = select i1 %r2, double %r0, double %r1
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+; CHECK: @fgt_f64
+; CHECK: V_CMP_GT_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fcmp ugt double %r0, %r1
+   %r3 = select i1 %r2, double %r0, double %r1
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+; CHECK: @fge_f64
+; CHECK: V_CMP_GE_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fcmp uge double %r0, %r1
+   %r3 = select i1 %r2, double %r0, double %r1
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+; CHECK: @fne_f64
+; CHECK: V_CMP_NEQ_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fcmp une double %r0, %r1
+   %r3 = select i1 %r2, double %r0, double %r1
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+; CHECK: @feq_f64
+; CHECK: V_CMP_EQ_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fcmp ueq double %r0, %r1
+   %r3 = select i1 %r2, double %r0, double %r1
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
new file mode 100644
index 0000000..2402a9c
--- /dev/null
+++ b/test/CodeGen/R600/fconst64.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; CHECK: @fconst_f64
+; CHECK: V_MOV_B32_e32 {{VGPR[0-9]+}}, 0.000000e+00
+; CHECK-NEXT: V_MOV_B32_e32 {{VGPR[0-9]+}}, 2.312500e+00
+
+define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+   %r1 = load double addrspace(1)* %in
+   %r2 = fadd double %r1, 5.000000e+00
+   store double %r2, double addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 003590b..6798eac 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,15 +1,32 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; These tests check that fdiv is expanded correctly and also test that the
+; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
+; instruction groups.
 
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; CHECK: @fdiv_v2f32
+; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fdiv <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @fdiv_v4f32
+; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
   %b = load <4 x float> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/fdiv64.ll b/test/CodeGen/R600/fdiv64.ll
new file mode 100644
index 0000000..76c5ca3
--- /dev/null
+++ b/test/CodeGen/R600/fdiv64.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; CHECK: @fdiv_f64
+; CHECK: V_RCP_F64_e32 {{VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_MUL_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                      double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fdiv double %r0, %r1
+   store double %r2, double addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index a40e818..6ef3a11 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -15,6 +15,16 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
+; CHECK: @fmul_v2f32
+; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
+; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
+define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fmul <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fmul_v4f32
 ; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll
new file mode 100644
index 0000000..8a57d4a
--- /dev/null
+++ b/test/CodeGen/R600/fmul64.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; CHECK: @fmul_f64
+; CHECK: V_MUL_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                      double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fmul double %r0, %r1
+   store double %r2, double addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
new file mode 100644
index 0000000..799db0c
--- /dev/null
+++ b/test/CodeGen/R600/fneg.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @fneg_v2
+; CHECK: -PV
+; CHECK: -PV
+define void @fneg_v2(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+entry:
+  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @fneg_v4
+; CHECK: -PV
+; CHECK: -PV
+; CHECK: -PV
+; CHECK: -PV
+define void @fneg_v4(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+entry:
+  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; DAGCombiner will transform:
+; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
+; unless the target returns true for isNegFree()
+
+; CHECK-NOT: XOR
+; CHECK: -KC0[2].Z
+
+define void @fneg_free(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = bitcast i32 %in to float
+  %1 = fsub float 0.0, %0
+  store float %1, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index f5716e1..6471270 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,11 +1,28 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @fp_to_sint_v4i32
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: @fp_to_sint_v2i32
+; R600-CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI-CHECK: @fp_to_sint_v2i32
+; SI-CHECK: V_CVT_I32_F32_e32
+; SI-CHECK: V_CVT_I32_F32_e32
+define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+  %result = fptosi <2 x float> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
 
+; R600-CHECK: @fp_to_sint_v4i32
+; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI-CHECK: @fp_to_sint_v4i32
+; SI-CHECK: V_CVT_I32_F32_e32
+; SI-CHECK: V_CVT_I32_F32_e32
+; SI-CHECK: V_CVT_I32_F32_e32
+; SI-CHECK: V_CVT_I32_F32_e32
 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
   %result = fptosi <4 x float> %value to <4 x i32>
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 1c3c0c6..2a365f9 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,10 +1,20 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
+; CHECK: @fp_to_uint_v2i32
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+
+define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+  %result = fptoui <2 x float> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; CHECK: @fp_to_uint_v4i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
new file mode 100644
index 0000000..2613805
--- /dev/null
+++ b/test/CodeGen/R600/fsqrt.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; CHECK: @fsqrt_f32
+; CHECK: V_SQRT_F32_e32 {{VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+   %r0 = load float addrspace(1)* %in
+   %r1 = call float @llvm.sqrt.f32(float %r0)
+   store float %r1, float addrspace(1)* %out
+   ret void
+}
+
+; CHECK: @fsqrt_f64
+; CHECK: V_SQRT_F64_e32 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+
+define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+   %r0 = load double addrspace(1)* %in
+   %r1 = call double @llvm.sqrt.f64(double %r0)
+   store double %r1, double addrspace(1)* %out
+   ret void
+}
+
+declare float @llvm.sqrt.f32(float %Val)
+declare double @llvm.sqrt.f64(double %Val)
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index f784cde..0fc5860 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -15,12 +15,21 @@ declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; CHECK: @fsub_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: @fsub_v2f32
+; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
+; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+entry:
+  %0 = fsub <2 x float> %a, %b
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
 
+; CHECK: @fsub_v4f32
+; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
 define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
new file mode 100644
index 0000000..fa59dcc
--- /dev/null
+++ b/test/CodeGen/R600/fsub64.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; CHECK: @fsub_f64
+; CHECK: V_ADD_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}, 0, 0, 0, 0, 2
+
+define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                      double addrspace(1)* %in2) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = fsub double %r0, %r1
+   store double %r2, double addrspace(1)* %out
+   ret void
+}
diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll
new file mode 100644
index 0000000..ba5de22
--- /dev/null
+++ b/test/CodeGen/R600/indirect-addressing-si.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+; Tests for indirect addressing on SI, which is implemented using dynamic
+; indexing of vectors.
+
+; CHECK: extract_w_offset
+; CHECK: S_MOV_B32 M0
+; CHECK-NEXT: V_MOVRELS_B32_e32
+define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %0
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK: extract_wo_offset
+; CHECK: S_MOV_B32 M0
+; CHECK-NEXT: V_MOVRELS_B32_e32
+define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK: insert_w_offset
+; CHECK: S_MOV_B32 M0
+; CHECK-NEXT: V_MOVRELD_B32_e32
+define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
+  %2 = extractelement <4 x float> %1, i32 2
+  store float %2, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK: insert_wo_offset
+; CHECK: S_MOV_B32 M0
+; CHECK-NEXT: V_MOVRELD_B32_e32
+define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
+  %1 = extractelement <4 x float> %0, i32 2
+  store float %1, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll
index ae9c8bb..26c298b 100644
--- a/test/CodeGen/R600/jump-address.ll
+++ b/test/CodeGen/R600/jump-address.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: JUMP @3
+; CHECK: JUMP @5
 ; CHECK: EXPORT
 ; CHECK-NOT: EXPORT
 
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
index 21e5d4c..77b168e 100644
--- a/test/CodeGen/R600/literals.ll
+++ b/test/CodeGen/R600/literals.ll
@@ -2,12 +2,12 @@
 
 ; Test using an integer literal constant.
 ; Generated ASM should be:
-; ADD_INT REG literal.x, 5
+; ADD_INT KC0[2].Z literal.x, 5
 ; or
-; ADD_INT literal.x REG, 5
+; ADD_INT literal.x KC0[2].Z, 5
 
 ; CHECK: @i32_literal
-; CHECK: ADD_INT * {{[A-Z0-9,. ]*}}literal.x
+; CHECK: ADD_INT * T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: 5
 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -18,12 +18,12 @@ entry:
 
 ; Test using a float literal constant.
 ; Generated ASM should be:
-; ADD REG literal.x, 5.0
+; ADD KC0[2].Z literal.x, 5.0
 ; or
-; ADD literal.x REG, 5.0
+; ADD literal.x KC0[2].Z, 5.0
 
 ; CHECK: @float_literal
-; CHECK: ADD * {{[A-Z0-9,. ]*}}literal.x
+; CHECK: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: 1084227584(5.0
 define void @float_literal(float addrspace(1)* %out, float %in) {
 entry:
@@ -31,169 +31,3 @@ entry:
   store float %0, float addrspace(1)* %out
   ret void
 }
-
-; CHECK: @main
-; CHECK: -2147483648
-; CHECK-NEXT-NOT: -2147483648
-
-define void @main() #0 {
-main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
-  %4 = call float @llvm.R600.load.input(i32 8)
-  %5 = call float @llvm.R600.load.input(i32 9)
-  %6 = call float @llvm.R600.load.input(i32 10)
-  %7 = call float @llvm.R600.load.input(i32 11)
-  %8 = call float @llvm.R600.load.input(i32 12)
-  %9 = call float @llvm.R600.load.input(i32 13)
-  %10 = call float @llvm.R600.load.input(i32 14)
-  %11 = call float @llvm.R600.load.input(i32 15)
-  %12 = load <4 x float> addrspace(8)* null
-  %13 = extractelement <4 x float> %12, i32 0
-  %14 = fsub float -0.000000e+00, %13
-  %15 = fadd float %0, %14
-  %16 = load <4 x float> addrspace(8)* null
-  %17 = extractelement <4 x float> %16, i32 1
-  %18 = fsub float -0.000000e+00, %17
-  %19 = fadd float %1, %18
-  %20 = load <4 x float> addrspace(8)* null
-  %21 = extractelement <4 x float> %20, i32 2
-  %22 = fsub float -0.000000e+00, %21
-  %23 = fadd float %2, %22
-  %24 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %25 = extractelement <4 x float> %24, i32 0
-  %26 = fmul float %25, %0
-  %27 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %28 = extractelement <4 x float> %27, i32 1
-  %29 = fmul float %28, %0
-  %30 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %31 = extractelement <4 x float> %30, i32 2
-  %32 = fmul float %31, %0
-  %33 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %34 = extractelement <4 x float> %33, i32 3
-  %35 = fmul float %34, %0
-  %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %37 = extractelement <4 x float> %36, i32 0
-  %38 = fmul float %37, %1
-  %39 = fadd float %38, %26
-  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %41 = extractelement <4 x float> %40, i32 1
-  %42 = fmul float %41, %1
-  %43 = fadd float %42, %29
-  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %45 = extractelement <4 x float> %44, i32 2
-  %46 = fmul float %45, %1
-  %47 = fadd float %46, %32
-  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %49 = extractelement <4 x float> %48, i32 3
-  %50 = fmul float %49, %1
-  %51 = fadd float %50, %35
-  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %53 = extractelement <4 x float> %52, i32 0
-  %54 = fmul float %53, %2
-  %55 = fadd float %54, %39
-  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %57 = extractelement <4 x float> %56, i32 1
-  %58 = fmul float %57, %2
-  %59 = fadd float %58, %43
-  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %61 = extractelement <4 x float> %60, i32 2
-  %62 = fmul float %61, %2
-  %63 = fadd float %62, %47
-  %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %65 = extractelement <4 x float> %64, i32 3
-  %66 = fmul float %65, %2
-  %67 = fadd float %66, %51
-  %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %69 = extractelement <4 x float> %68, i32 0
-  %70 = fmul float %69, %3
-  %71 = fadd float %70, %55
-  %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %73 = extractelement <4 x float> %72, i32 1
-  %74 = fmul float %73, %3
-  %75 = fadd float %74, %59
-  %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %77 = extractelement <4 x float> %76, i32 2
-  %78 = fmul float %77, %3
-  %79 = fadd float %78, %63
-  %80 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %81 = extractelement <4 x float> %80, i32 3
-  %82 = fmul float %81, %3
-  %83 = fadd float %82, %67
-  %84 = insertelement <4 x float> undef, float %15, i32 0
-  %85 = insertelement <4 x float> %84, float %19, i32 1
-  %86 = insertelement <4 x float> %85, float %23, i32 2
-  %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 3
-  %88 = insertelement <4 x float> undef, float %15, i32 0
-  %89 = insertelement <4 x float> %88, float %19, i32 1
-  %90 = insertelement <4 x float> %89, float %23, i32 2
-  %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3
-  %92 = call float @llvm.AMDGPU.dp4(<4 x float> %87, <4 x float> %91)
-  %93 = call float @fabs(float %92)
-  %94 = call float @llvm.AMDGPU.rsq(float %93)
-  %95 = fmul float %15, %94
-  %96 = fmul float %19, %94
-  %97 = fmul float %23, %94
-  %98 = insertelement <4 x float> undef, float %4, i32 0
-  %99 = insertelement <4 x float> %98, float %5, i32 1
-  %100 = insertelement <4 x float> %99, float %6, i32 2
-  %101 = insertelement <4 x float> %100, float 0.000000e+00, i32 3
-  %102 = insertelement <4 x float> undef, float %4, i32 0
-  %103 = insertelement <4 x float> %102, float %5, i32 1
-  %104 = insertelement <4 x float> %103, float %6, i32 2
-  %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3
-  %106 = call float @llvm.AMDGPU.dp4(<4 x float> %101, <4 x float> %105)
-  %107 = call float @fabs(float %106)
-  %108 = call float @llvm.AMDGPU.rsq(float %107)
-  %109 = fmul float %4, %108
-  %110 = fmul float %5, %108
-  %111 = fmul float %6, %108
-  %112 = insertelement <4 x float> undef, float %95, i32 0
-  %113 = insertelement <4 x float> %112, float %96, i32 1
-  %114 = insertelement <4 x float> %113, float %97, i32 2
-  %115 = insertelement <4 x float> %114, float 0.000000e+00, i32 3
-  %116 = insertelement <4 x float> undef, float %109, i32 0
-  %117 = insertelement <4 x float> %116, float %110, i32 1
-  %118 = insertelement <4 x float> %117, float %111, i32 2
-  %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3
-  %120 = call float @llvm.AMDGPU.dp4(<4 x float> %115, <4 x float> %119)
-  %121 = fsub float -0.000000e+00, %120
-  %122 = fcmp uge float 0.000000e+00, %121
-  %123 = select i1 %122, float 0.000000e+00, float %121
-  %124 = insertelement <4 x float> undef, float %8, i32 0
-  %125 = insertelement <4 x float> %124, float %9, i32 1
-  %126 = insertelement <4 x float> %125, float 5.000000e-01, i32 2
-  %127 = insertelement <4 x float> %126, float 1.000000e+00, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %127, i32 60, i32 1)
-  %128 = insertelement <4 x float> undef, float %71, i32 0
-  %129 = insertelement <4 x float> %128, float %75, i32 1
-  %130 = insertelement <4 x float> %129, float %79, i32 2
-  %131 = insertelement <4 x float> %130, float %83, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %131, i32 0, i32 2)
-  %132 = insertelement <4 x float> undef, float %123, i32 0
-  %133 = insertelement <4 x float> %132, float %96, i32 1
-  %134 = insertelement <4 x float> %133, float %97, i32 2
-  %135 = insertelement <4 x float> %134, float 0.000000e+00, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %135, i32 1, i32 2)
-  ret void
-}
-
-; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
-
-; Function Attrs: readonly
-declare float @fabs(float) #2
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq(float) #1
-
-declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
-attributes #2 = { readonly }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
new file mode 100644
index 0000000..8d3c9ca
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: GROUP_BARRIER
+
+define void @test(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.x()
+  %1 = getelementptr i32 addrspace(1)* %out, i32 %0
+  store i32 %0, i32 addrspace(1)* %1
+  call void @llvm.AMDGPU.barrier.local()
+  %2 = call i32 @llvm.r600.read.local.size.x()
+  %3 = sub i32 %2, 1
+  %4 = sub i32 %3, %0
+  %5 = getelementptr i32 addrspace(1)* %out, i32 %4
+  %6 = load i32 addrspace(1)* %5
+  store i32 %6, i32 addrspace(1)* %1
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+declare i32 @llvm.r600.read.local.size.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.cube.ll b/test/CodeGen/R600/llvm.AMDGPU.cube.ll
new file mode 100644
index 0000000..110bbfd
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.cube.ll
@@ -0,0 +1,59 @@
+
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @cube
+; CHECK: CUBE T{{[0-9]}}.X
+; CHECK: CUBE T{{[0-9]}}.Y
+; CHECK: CUBE T{{[0-9]}}.Z
+; CHECK: CUBE * T{{[0-9]}}.W
+define void @cube() #0 {
+main_body:
+  %0 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %1 = extractelement <4 x float> %0, i32 3
+  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %3 = extractelement <4 x float> %2, i32 0
+  %4 = fdiv float %3, %1
+  %5 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %6 = extractelement <4 x float> %5, i32 1
+  %7 = fdiv float %6, %1
+  %8 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %9 = extractelement <4 x float> %8, i32 2
+  %10 = fdiv float %9, %1
+  %11 = insertelement <4 x float> undef, float %4, i32 0
+  %12 = insertelement <4 x float> %11, float %7, i32 1
+  %13 = insertelement <4 x float> %12, float %10, i32 2
+  %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3
+  %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14)
+  %16 = extractelement <4 x float> %15, i32 0
+  %17 = extractelement <4 x float> %15, i32 1
+  %18 = extractelement <4 x float> %15, i32 2
+  %19 = extractelement <4 x float> %15, i32 3
+  %20 = call float @fabs(float %18)
+  %21 = fdiv float 1.000000e+00, %20
+  %22 = fmul float %16, %21
+  %23 = fadd float %22, 1.500000e+00
+  %24 = fmul float %17, %21
+  %25 = fadd float %24, 1.500000e+00
+  %26 = insertelement <4 x float> undef, float %25, i32 0
+  %27 = insertelement <4 x float> %26, float %23, i32 1
+  %28 = insertelement <4 x float> %27, float %19, i32 2
+  %29 = insertelement <4 x float> %28, float %25, i32 3
+  %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4)
+  call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
+
+; Function Attrs: readnone
+declare float @fabs(float) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { readnone }
+
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
index cdc03f8..7627783 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
 ; R600-CHECK: @amdgpu_trunc
-; R600-CHECK: TRUNC * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: TRUNC * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI-CHECK: @amdgpu_trunc
 ; SI-CHECK: V_TRUNC_F32
 
diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 6b321f0..0adcdfc 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll
@@ -1,15 +1,15 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
 
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15, 0, 0, -1
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 3, 0, 0, 0
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 2, 0, 0, 0
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 1, 0, 0, 0
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 4, 0, 0, 0
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, 0
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 5, 0, 0, 0
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 12, 0, 0, -1
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7, 0, 0, 0
-;CHECK: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 3, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 2, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 1, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 4, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 5, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 12, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, -1
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index de06354..7655996 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -1,21 +1,21 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
 
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 3
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+}}, 2
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+}}, 1
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+}}, 4
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
-;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 5
-;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 9
-;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 6
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 10
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 12
-;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
-;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
-;CHECK: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
-;CHECK: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 3
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 2
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 1
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 4
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 5
+;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 9
+;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 6
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 10
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 12
+;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
+;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
+;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
+;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
new file mode 100644
index 0000000..3b05551
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -0,0 +1,140 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
+;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
+
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+   %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
+   %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1
+   %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2
+   %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3
+   %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0
+   %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1
+   %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2
+   %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3
+   %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0
+   %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1
+   %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2
+   %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3
+   %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0
+   %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1
+   %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
+   %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
+   %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
+      <8 x i32> undef, <4 x i32> undef, i32 1)
+   %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
+      <8 x i32> undef, <4 x i32> undef, i32 2)
+   %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
+      <8 x i32> undef, <4 x i32> undef, i32 3)
+   %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
+      <8 x i32> undef, <4 x i32> undef, i32 4)
+   %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
+      <8 x i32> undef, <4 x i32> undef, i32 5)
+   %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
+      <8 x i32> undef, <4 x i32> undef, i32 6)
+   %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
+      <8 x i32> undef, <4 x i32> undef, i32 7)
+   %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
+      <8 x i32> undef, <4 x i32> undef, i32 8)
+   %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
+      <8 x i32> undef, <4 x i32> undef, i32 9)
+   %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
+      <8 x i32> undef, <4 x i32> undef, i32 10)
+   %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
+      <8 x i32> undef, <4 x i32> undef, i32 11)
+   %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
+      <8 x i32> undef, <4 x i32> undef, i32 12)
+   %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
+      <8 x i32> undef, <4 x i32> undef, i32 13)
+   %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
+      <8 x i32> undef, <4 x i32> undef, i32 14)
+   %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
+      <8 x i32> undef, <4 x i32> undef, i32 15)
+   %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
+      <8 x i32> undef, <4 x i32> undef, i32 16)
+   %e1 = extractelement <4 x float> %res1, i32 0
+   %e2 = extractelement <4 x float> %res2, i32 1
+   %e3 = extractelement <4 x float> %res3, i32 2
+   %e4 = extractelement <4 x float> %res4, i32 3
+   %t0 = extractelement <4 x float> %res5, i32 0
+   %t1 = extractelement <4 x float> %res5, i32 1
+   %e5 = fadd float %t0, %t1
+   %t2 = extractelement <4 x float> %res6, i32 0
+   %t3 = extractelement <4 x float> %res6, i32 2
+   %e6 = fadd float %t2, %t3
+   %t4 = extractelement <4 x float> %res7, i32 0
+   %t5 = extractelement <4 x float> %res7, i32 3
+   %e7 = fadd float %t4, %t5
+   %t6 = extractelement <4 x float> %res8, i32 1
+   %t7 = extractelement <4 x float> %res8, i32 2
+   %e8 = fadd float %t6, %t7
+   %t8 = extractelement <4 x float> %res9, i32 1
+   %t9 = extractelement <4 x float> %res9, i32 3
+   %e9 = fadd float %t8, %t9
+   %t10 = extractelement <4 x float> %res10, i32 2
+   %t11 = extractelement <4 x float> %res10, i32 3
+   %e10 = fadd float %t10, %t11
+   %t12 = extractelement <4 x float> %res11, i32 0
+   %t13 = extractelement <4 x float> %res11, i32 1
+   %t14 = extractelement <4 x float> %res11, i32 2
+   %t15 = fadd float %t12, %t13
+   %e11 = fadd float %t14, %t15
+   %t16 = extractelement <4 x float> %res12, i32 0
+   %t17 = extractelement <4 x float> %res12, i32 1
+   %t18 = extractelement <4 x float> %res12, i32 3
+   %t19 = fadd float %t16, %t17
+   %e12 = fadd float %t18, %t19
+   %t20 = extractelement <4 x float> %res13, i32 0
+   %t21 = extractelement <4 x float> %res13, i32 2
+   %t22 = extractelement <4 x float> %res13, i32 3
+   %t23 = fadd float %t20, %t21
+   %e13 = fadd float %t22, %t23
+   %t24 = extractelement <4 x float> %res14, i32 1
+   %t25 = extractelement <4 x float> %res14, i32 2
+   %t26 = extractelement <4 x float> %res14, i32 3
+   %t27 = fadd float %t24, %t25
+   %e14 = fadd float %t26, %t27
+   %t28 = extractelement <4 x float> %res15, i32 0
+   %t29 = extractelement <4 x float> %res15, i32 1
+   %t30 = extractelement <4 x float> %res15, i32 2
+   %t31 = extractelement <4 x float> %res15, i32 3
+   %t32 = fadd float %t28, %t29
+   %t33 = fadd float %t30, %t31
+   %e15 = fadd float %t32, %t33
+   %e16 = extractelement <4 x float> %res16, i32 3
+   %s1 = fadd float %e1, %e2
+   %s2 = fadd float %s1, %e3
+   %s3 = fadd float %s2, %e4
+   %s4 = fadd float %s3, %e5
+   %s5 = fadd float %s4, %e6
+   %s6 = fadd float %s5, %e7
+   %s7 = fadd float %s6, %e8
+   %s8 = fadd float %s7, %e9
+   %s9 = fadd float %s8, %e10
+   %s10 = fadd float %s9, %e11
+   %s11 = fadd float %s10, %e12
+   %s12 = fadd float %s11, %e13
+   %s13 = fadd float %s12, %e14
+   %s14 = fadd float %s13, %e15
+   %s15 = fadd float %s14, %e16
+   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15)
+   ret void
+}
+
+declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
new file mode 100644
index 0000000..238d9f2
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.tid.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+;CHECK: V_MBCNT_LO_U32_B32_e64
+;CHECK: V_MBCNT_HI_U32_B32_e32
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+main_body:
+  %4 = call i32 @llvm.SI.tid()
+  %5 = bitcast i32 %4 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
+  ret void
+}
+
+declare i32 @llvm.SI.tid() readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 9b28167..8fb4559 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -1,6 +1,9 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: COS * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MULADD_IEEE *
+;CHECK: FRACT *
+;CHECK: ADD *
+;CHECK: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @test() {
    %r0 = call float @llvm.R600.load.input(i32 0)
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
index 1422083..0f51cf4 100644
--- a/test/CodeGen/R600/llvm.pow.ll
+++ b/test/CodeGen/R600/llvm.pow.ll
@@ -1,8 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
+;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @test() {
    %r0 = call float @llvm.R600.load.input(i32 0)
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index 803dc2d..e94c2ba 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -1,6 +1,9 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: SIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MULADD_IEEE *
+;CHECK: FRACT *
+;CHECK: ADD *
+;CHECK: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
 define void @test() {
    %r0 = call float @llvm.R600.load.input(i32 0)
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index ff774ec..f478ef5 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,6 +1,11 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK  %s
 
+;===------------------------------------------------------------------------===;
+; GLOBAL ADDRESS SPACE
+;===------------------------------------------------------------------------===;
+
 ; Load an i8 value from the global address space.
 ; R600-CHECK: @load_i8
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
@@ -14,6 +19,51 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   ret void
 }
 
+; R600-CHECK: @load_i8_sext
+; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600-CHECK: 24
+; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600-CHECK: 24
+; SI-CHECK: @load_i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE
+define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+  %0 = load i8 addrspace(1)* %in
+  %1 = sext i8 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Load an i16 value from the global address space.
+; R600-CHECK: @load_i16
+; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK: @load_i16
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+entry:
+  %0 = load i16	 addrspace(1)* %in
+  %1 = zext i16 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @load_i16_sext
+; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600-CHECK: 16
+; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600-CHECK: 16
+; SI-CHECK: @load_i16_sext
+; SI-CHECK: BUFFER_LOAD_SSHORT
+define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+entry:
+  %0 = load i16 addrspace(1)* %in
+  %1 = sext i16 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
 ; load an i32 value from the global address space.
 ; R600-CHECK: @load_i32
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
@@ -40,6 +90,153 @@ entry:
   ret void
 }
 
+; load a v2f32 value from the global address space
+; R600-CHECK: @load_v2f32
+; R600-CHECK: VTX_READ_64
+
+; SI-CHECK: @load_v2f32
+; SI-CHECK: BUFFER_LOAD_DWORDX2
+define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+entry:
+  %0 = load <2 x float> addrspace(1)* %in
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @load_i64
+; R600-CHECK: RAT
+; R600-CHECK: RAT
+
+; SI-CHECK: @load_i64
+; SI-CHECK: BUFFER_LOAD_DWORDX2
+define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+entry:
+  %0 = load i64 addrspace(1)* %in
+  store i64 %0, i64 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @load_i64_sext
+; R600-CHECK: RAT
+; R600-CHECK: RAT
+; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.x
+; R600-CHECK: 31
+; SI-CHECK: @load_i64_sext
+; SI-CHECK: BUFFER_LOAD_DWORDX2 [[VAL:VGPR[0-9]_VGPR[0-9]]]
+; SI-CHECK: V_LSHL_B64 [[LSHL:VGPR[0-9]_VGPR[0-9]]], [[VAL]], 32
+; SI-CHECK: V_ASHR_I64 VGPR{{[0-9]}}_VGPR{{[0-9]}}, [[LSHL]], 32
+
+define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %0 = load i32 addrspace(1)* %in
+  %1 = sext i32 %0 to i64
+  store i64 %1, i64 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @load_i64_zext
+; R600-CHECK: RAT
+; R600-CHECK: RAT
+define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %0 = load i32 addrspace(1)* %in
+  %1 = zext i32 %0 to i64
+  store i64 %1, i64 addrspace(1)* %out
+  ret void
+}
+
+;===------------------------------------------------------------------------===;
+; CONSTANT ADDRESS SPACE
+;===------------------------------------------------------------------------===;
+
+; Load a sign-extended i8 value
+; R600-CHECK: @load_const_i8_sext
+; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600-CHECK: 24
+; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600-CHECK: 24
+; SI-CHECK: @load_const_i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE VGPR{{[0-9]+}},
+define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+entry:
+  %0 = load i8 addrspace(2)* %in
+  %1 = sext i8 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Load an aligned i8 value
+; R600-CHECK: @load_const_i8_aligned
+; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK: @load_const_i8_aligned
+; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+entry:
+  %0 = load i8 addrspace(2)* %in
+  %1 = zext i8 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Load an un-aligned i8 value
+; R600-CHECK: @load_const_i8_unaligned
+; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK: @load_const_i8_unaligned
+; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+entry:
+  %0 = getelementptr i8 addrspace(2)* %in, i32 1
+  %1 = load i8 addrspace(2)* %0
+  %2 = zext i8 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
+
+; Load a sign-extended i16 value
+; R600-CHECK: @load_const_i16_sext
+; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
+; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
+; R600-CHECK: 16
+; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
+; R600-CHECK: 16
+; SI-CHECK: @load_const_i16_sext
+; SI-CHECK: BUFFER_LOAD_SSHORT
+define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+  %0 = load i16 addrspace(2)* %in
+  %1 = sext i16 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Load an aligned i16 value
+; R600-CHECK: @load_const_i16_aligned
+; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK: @load_const_i16_aligned
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+  %0 = load i16 addrspace(2)* %in
+  %1 = zext i16 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; Load an un-aligned i16 value
+; R600-CHECK: @load_const_i16_unaligned
+; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK: @load_const_i16_unaligned
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
+entry:
+  %0 = getelementptr i16 addrspace(2)* %in, i32 1
+  %1 = load i16 addrspace(2)* %0
+  %2 = zext i16 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
+
 ; Load an i32 value from the constant address space.
 ; R600-CHECK: @load_const_addrspace_i32
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
@@ -64,3 +261,4 @@ define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(
   store float %1, float addrspace(1)* %out
   ret void
 }
+
diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll
new file mode 100644
index 0000000..8cba0b6
--- /dev/null
+++ b/test/CodeGen/R600/load.vec.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK  %s
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK  %s
+
+; load a v2i32 value from the global address space.
+; EG-CHECK: @load_v2i32
+; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
+; SI-CHECK: @load_v2i32
+; SI-CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
+define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %a = load <2 x i32> addrspace(1) * %in
+  store <2 x i32> %a, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; load a v4i32 value from the global address space.
+; EG-CHECK: @load_v4i32
+; EG-CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
+; SI-CHECK: @load_v4i32
+; SI-CHECK: BUFFER_LOAD_DWORDX4 VGPR{{[0-9]+}}
+define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %a = load <4 x i32> addrspace(1) * %in
+  store <4 x i32> %a, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
new file mode 100644
index 0000000..3b4a8f8
--- /dev/null
+++ b/test/CodeGen/R600/load64.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+
+; load a f64 value from the global address space.
+; CHECK: @load_f64
+; CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
+define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+entry:
+  %0 = load double addrspace(1)* %in
+  store double %0, double addrspace(1)* %out
+  ret void
+}
+
+; Load a f64 value from the constant address space.
+; CHECK: @load_const_addrspace_f64
+; CHECK: S_LOAD_DWORDX2 SGPR{{[0-9]+}}
+define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
+  %1 = load double addrspace(2)* %in
+  store double %1, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
new file mode 100644
index 0000000..6d3610e
--- /dev/null
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; TODO: Add RUN and CHECK lines for SI once this test works there
+
+@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
+@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
+
+; CHECK: @local_memory_two_objects
+
+; Check that the LDS size emitted correctly
+; CHECK: .long 166120
+; CHECK-NEXT: .long 8
+
+; Make sure the lds writes are using different addresses.
+; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]]
+; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]]
+
+; GROUP_BARRIER must be the last instruction in a clause
+; CHECK: GROUP_BARRIER
+; CHECK-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses.
+; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) {
+entry:
+  %x.i = call i32 @llvm.r600.read.tidig.x() #0
+  %arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+  store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+  %mul = shl nsw i32 %x.i, 1
+  %arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+  store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+  %sub = sub nsw i32 3, %x.i
+  call void @llvm.AMDGPU.barrier.local()
+  %arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+  %0 = load i32 addrspace(3)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+  %1 = load i32 addrspace(3)* %arrayidx4, align 4
+  %add = add nsw i32 %x.i, 4
+  %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add
+  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
new file mode 100644
index 0000000..5458fb9
--- /dev/null
+++ b/test/CodeGen/R600/local-memory.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+
+@local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4
+
+; EG-CHECK: @local_memory
+; SI-CHECK: @local_memory
+
+; Check that the LDS size emitted correctly
+; EG-CHECK: .long 166120
+; EG-CHECK-NEXT: .long 16
+; SI-CHECK: .long 47180
+; SI-CHECK-NEXT: .long 32768
+
+; EG-CHECK: LDS_WRITE
+; SI-CHECK: DS_WRITE_B32
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG-CHECK: GROUP_BARRIER
+; EG-CHECK-NEXT: ALU clause
+; SI-CHECK: S_BARRIER
+
+; EG-CHECK: LDS_READ_RET
+; SI-CHECK: DS_READ_B32
+
+define void @local_memory(i32 addrspace(1)* %out) {
+entry:
+  %y.i = call i32 @llvm.r600.read.tidig.x() #0
+  %arrayidx = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+  %add = add nsw i32 %y.i, 1
+  %cmp = icmp eq i32 %add, 16
+  %.add = select i1 %cmp, i32 0, i32 %add
+  call void @llvm.AMDGPU.barrier.local()
+  %arrayidx1 = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+  %0 = load i32 addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/loop-address.ll b/test/CodeGen/R600/loop-address.ll
index 8a5458b..b46d8e9 100644
--- a/test/CodeGen/R600/loop-address.ll
+++ b/test/CodeGen/R600/loop-address.ll
@@ -1,13 +1,9 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-;CHECK: TEX
 ;CHECK: ALU_PUSH
-;CHECK: JUMP @4
-;CHECK: ELSE @16
-;CHECK: TEX
-;CHECK: LOOP_START_DX10 @15
-;CHECK: LOOP_BREAK @14
-;CHECK: POP @16
+;CHECK: LOOP_START_DX10 @11
+;CHECK: LOOP_BREAK @10
+;CHECK: POP @10
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
 target triple = "r600--"
diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll
index 9e29b0d..806e681 100644
--- a/test/CodeGen/R600/lshl.ll
+++ b/test/CodeGen/R600/lshl.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
 
-;CHECK: V_LSHL_B32_e64 VGPR{{[0-9]+}}, {{[SV]GPR[0-9]+}}, 1
+;CHECK: V_LSHL_B32_e64 VGPR{{[0-9]}}, SGPR{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = mul i32 %p, 2
diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll
index eab3fbf..cfbcc34 100644
--- a/test/CodeGen/R600/lshr.ll
+++ b/test/CodeGen/R600/lshr.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
 
-;CHECK: V_LSHR_B32_e64 {{VGPR[0-9]+}}, {{[SV]GPR[0-9]+}}, 1
+;CHECK: V_LSHR_B32_e64 {{VGPR[0-9]}}, SGPR{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = udiv i32 %p, 2
diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
new file mode 100644
index 0000000..ce42ae7
--- /dev/null
+++ b/test/CodeGen/R600/mad_int24.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; EG-CHECK: @i32_mad24
+; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
+; EG-CHECK: MULLO_INT
+; CM-CHECK: MULADD_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
+; SI-CHECK: V_MAD_I32_I24
+define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+  %0 = shl i32 %a, 8
+  %a_24 = ashr i32 %0, 8
+  %1 = shl i32 %b, 8
+  %b_24 = ashr i32 %1, 8
+  %2 = mul i32 %a_24, %b_24
+  %3 = add i32 %2, %c
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
new file mode 100644
index 0000000..00aa64a
--- /dev/null
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; EG-CHECK: @u32_mad24
+; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
+; SI-CHECK: @u32_mad24
+; SI-CHECK: V_MAD_U32_U24
+
+define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+  %0 = shl i32 %a, 8
+  %a_24 = lshr i32 %0, 8
+  %1 = shl i32 %b, 8
+  %b_24 = lshr i32 %1, 8
+  %2 = mul i32 %a_24, %b_24
+  %3 = add i32 %2, %c
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK: @i16_mad24
+; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
+; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
+; EG-CHECK-DAG: VTX_READ_16 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
+; The order of A and B does not matter.
+; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
+; The result must be sign-extended
+; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MAD_CHAN]], literal.x
+; EG-CHECK: 16
+; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: 16
+; SI-CHECK: @i16_mad24
+; SI-CHECK: V_MAD_U32_U24 [[MAD:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 16, [[MAD]]
+; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 16, [[LSHL]]
+
+define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+entry:
+  %0 = mul i16 %a, %b
+  %1 = add i16 %0, %c
+  %2 = sext i16 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK: @i8_mad24
+; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
+; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
+; EG-CHECK-DAG: VTX_READ_8 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
+; The order of A and B does not matter.
+; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
+; The result must be sign-extended
+; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MAD_CHAN]], literal.x
+; EG-CHECK: 24
+; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: 24
+; SI-CHECK: @i8_mad24
+; SI-CHECK: V_MAD_U32_U24 [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 24, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 24, [[LSHL]]
+
+define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+entry:
+  %0 = mul i8 %a, %b
+  %1 = add i8 %0, %c
+  %2 = sext i8 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/max-literals.ll b/test/CodeGen/R600/max-literals.ll
new file mode 100644
index 0000000..c31b7c0
--- /dev/null
+++ b/test/CodeGen/R600/max-literals.ll
@@ -0,0 +1,68 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @main
+; CHECK: ADD *
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = call float @llvm.R600.load.input(i32 8)
+  %5 = fadd float %0, 2.0
+  %6 = fadd float %1, 3.0
+  %7 = fadd float %2, 4.0
+  %8 = fadd float %3, 5.0
+  %9 = bitcast float %4 to i32
+  %10 = mul i32 %9, 6
+  %11 = bitcast i32 %10 to float
+  %12 = insertelement <4 x float> undef, float %5, i32 0
+  %13 = insertelement <4 x float> %12, float %6, i32 1
+  %14 = insertelement <4 x float> %13, float %7, i32 2
+  %15 = insertelement <4 x float> %14, float %8, i32 3
+  %16 = insertelement <4 x float> %15, float %11, i32 3
+
+  %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+  %18 = insertelement <4 x float> undef, float %17, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+  ret void
+}
+
+; CHECK: @main
+; CHECK-NOT: ADD *
+
+define void @main2() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = call float @llvm.R600.load.input(i32 8)
+  %5 = fadd float %0, 2.0
+  %6 = fadd float %1, 3.0
+  %7 = fadd float %2, 4.0
+  %8 = fadd float %3, 2.0
+  %9 = bitcast float %4 to i32
+  %10 = mul i32 %9, 6
+  %11 = bitcast i32 %10 to float
+  %12 = insertelement <4 x float> undef, float %5, i32 0
+  %13 = insertelement <4 x float> %12, float %6, i32 1
+  %14 = insertelement <4 x float> %13, float %7, i32 2
+  %15 = insertelement <4 x float> %14, float %8, i32 3
+  %16 = insertelement <4 x float> %15, float %11, i32 3
+
+  %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16)
+  %18 = insertelement <4 x float> undef, float %17, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+  ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.R600.load.input(i32) #1
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index 7278e90..18a17b6 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -1,12 +1,38 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
 ; mul24 and mad24 are affected
-;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+;EG-CHECK: @test2
+;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test2
+;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = mul <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test4
+;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test4
+;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
new file mode 100644
index 0000000..16ae760
--- /dev/null
+++ b/test/CodeGen/R600/mul_int24.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; EG-CHECK: @i32_mul24
+; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
+; EG-CHECK: MULLO_INT
+; CM-CHECK: MUL_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI-CHECK: V_MUL_I32_I24
+define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = shl i32 %a, 8
+  %a_24 = ashr i32 %0, 8
+  %1 = shl i32 %b, 8
+  %b_24 = ashr i32 %1, 8
+  %2 = mul i32 %a_24, %b_24
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
new file mode 100644
index 0000000..b1a7f94
--- /dev/null
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; EG-CHECK: @u32_mul24
+; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI-CHECK: @u32_mul24
+; SI-CHECK: V_MUL_U32_U24
+
+define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = shl i32 %a, 8
+  %a_24 = lshr i32 %0, 8
+  %1 = shl i32 %b, 8
+  %b_24 = lshr i32 %1, 8
+  %2 = mul i32 %a_24, %b_24
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK: @i16_mul24
+; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
+; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
+; The order of A and B does not matter.
+; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
+; The result must be sign-extended
+; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MUL_CHAN]], literal.x
+; EG-CHECK: 16
+; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: 16
+; SI-CHECK: @i16_mul24
+; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 16, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 16, [[LSHL]]
+
+define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+entry:
+  %0 = mul i16 %a, %b
+  %1 = sext i16 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK: @i8_mul24
+; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
+; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
+; The order of A and B does not matter.
+; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
+; The result must be sign-extended
+; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MUL_CHAN]], literal.x
+; EG-CHECK: 24
+; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: 24
+; SI-CHECK: @i8_mul24
+; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 24, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 24, [[LSHL]]
+
+define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
+entry:
+  %0 = mul i8 %a, %b
+  %1 = sext i8 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index b0dbb02..4a4e892 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -1,12 +1,39 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-; CHECK: @or_v4i32
-; CHECK: OR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: OR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: OR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: OR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK: @or_v2i32
+; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+;SI-CHECK: @or_v2i32
+;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = or <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK: @or_v4i32
+; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @or_v4i32
+;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
   %result = or <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/packetizer.ll b/test/CodeGen/R600/packetizer.ll
new file mode 100644
index 0000000..0a405c5
--- /dev/null
+++ b/test/CodeGen/R600/packetizer.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+; CHECK: @test
+; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
+; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y
+; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
+; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
+
+define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+entry:
+  %shl = sub i32 32, %e
+  %x = add i32 %x_arg, 1
+  %x.0 = shl i32 %x, %shl
+  %x.1 = lshr i32 %x, %e
+  %x.2 = or i32 %x.0, %x.1
+  %y = add i32 %y_arg, 1
+  %y.0 = shl i32 %y, %shl
+  %y.1 = lshr i32 %y, %e
+  %y.2 = or i32 %y.0, %y.1
+  %z = add i32 %z_arg, 1
+  %z.0 = shl i32 %z, %shl
+  %z.1 = lshr i32 %z, %e
+  %z.2 = or i32 %z.0, %z.1
+  %w = add i32 %w_arg, 1
+  %w.0 = shl i32 %w, %shl
+  %w.1 = lshr i32 %w, %e
+  %w.2 = or i32 %w.0, %w.1
+  %xy = or i32 %x.2, %y.2
+  %zw = or i32 %z.2, %w.2
+  %xyzw = or i32 %xy, %zw
+  store i32 %xyzw, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll
new file mode 100644
index 0000000..4afaf68
--- /dev/null
+++ b/test/CodeGen/R600/parallelandifcollapse.ll
@@ -0,0 +1,54 @@
+; Function Attrs: nounwind
+; RUN: llc < %s -march=r600 -mcpu=redwood  | FileCheck %s
+;
+; CFG flattening should use parallel-and mode to generate branch conditions and
+; then merge if-regions with the same bodies.
+;
+; CHECK: AND_INT
+; CHECK-NEXT: AND_INT
+; CHECK-NEXT: OR_INT
+define void @_Z9chk1D_512v() #0 {
+entry:
+  %a0 = alloca i32, align 4
+  %b0 = alloca i32, align 4
+  %c0 = alloca i32, align 4
+  %d0 = alloca i32, align 4
+  %a1 = alloca i32, align 4
+  %b1 = alloca i32, align 4
+  %c1 = alloca i32, align 4
+  %d1 = alloca i32, align 4
+  %data = alloca i32, align 4
+  %0 = load i32* %a0, align 4
+  %1 = load i32* %b0, align 4
+  %cmp = icmp ne i32 %0, %1
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %2 = load i32* %c0, align 4
+  %3 = load i32* %d0, align 4
+  %cmp1 = icmp ne i32 %2, %3
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %land.lhs.true
+  store i32 1, i32* %data, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %land.lhs.true, %entry
+  %4 = load i32* %a1, align 4
+  %5 = load i32* %b1, align 4
+  %cmp2 = icmp ne i32 %4, %5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end6
+
+land.lhs.true3:                                   ; preds = %if.end
+  %6 = load i32* %c1, align 4
+  %7 = load i32* %d1, align 4
+  %cmp4 = icmp ne i32 %6, %7
+  br i1 %cmp4, label %if.then5, label %if.end6
+
+if.then5:                                         ; preds = %land.lhs.true3
+  store i32 1, i32* %data, align 4
+  br label %if.end6
+
+if.end6:                                          ; preds = %if.then5, %land.lhs.true3, %if.end
+  ret void
+}
diff --git a/test/CodeGen/R600/parallelorifcollapse.ll b/test/CodeGen/R600/parallelorifcollapse.ll
new file mode 100644
index 0000000..b0db7cd
--- /dev/null
+++ b/test/CodeGen/R600/parallelorifcollapse.ll
@@ -0,0 +1,61 @@
+; Function Attrs: nounwind
+; RUN: llc < %s -march=r600 -mcpu=redwood  | FileCheck %s
+;
+; CFG flattening should use parallel-or to generate branch conditions and
+; then merge if-regions with the same bodies.
+;
+; CHECK: OR_INT
+; CHECK-NEXT: OR_INT
+; CHECK-NEXT: OR_INT
+define void @_Z9chk1D_512v() #0 {
+entry:
+  %a0 = alloca i32, align 4
+  %b0 = alloca i32, align 4
+  %c0 = alloca i32, align 4
+  %d0 = alloca i32, align 4
+  %a1 = alloca i32, align 4
+  %b1 = alloca i32, align 4
+  %c1 = alloca i32, align 4
+  %d1 = alloca i32, align 4
+  %data = alloca i32, align 4
+  %0 = load i32* %a0, align 4
+  %1 = load i32* %b0, align 4
+  %cmp = icmp ne i32 %0, %1
+  br i1 %cmp, label %land.lhs.true, label %if.else
+
+land.lhs.true:                                    ; preds = %entry
+  %2 = load i32* %c0, align 4
+  %3 = load i32* %d0, align 4
+  %cmp1 = icmp ne i32 %2, %3
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %land.lhs.true
+  br label %if.end
+
+if.else:                                          ; preds = %land.lhs.true, %entry
+  store i32 1, i32* %data, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %4 = load i32* %a1, align 4
+  %5 = load i32* %b1, align 4
+  %cmp2 = icmp ne i32 %4, %5
+  br i1 %cmp2, label %land.lhs.true3, label %if.else6
+
+land.lhs.true3:                                   ; preds = %if.end
+  %6 = load i32* %c1, align 4
+  %7 = load i32* %d1, align 4
+  %cmp4 = icmp ne i32 %6, %7
+  br i1 %cmp4, label %if.then5, label %if.else6
+
+if.then5:                                         ; preds = %land.lhs.true3
+  br label %if.end7
+
+if.else6:                                         ; preds = %land.lhs.true3, %if.end
+  store i32 1, i32* %data, align 4
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.else6, %if.then5
+  ret void
+}
+
diff --git a/test/CodeGen/R600/pv-packing.ll b/test/CodeGen/R600/pv-packing.ll
new file mode 100644
index 0000000..03fc204
--- /dev/null
+++ b/test/CodeGen/R600/pv-packing.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+;CHECK: DOT4  T{{[0-9]\.X}}
+;CHECK: MULADD_IEEE * T{{[0-9]\.W}}
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 8)
+  %4 = call float @llvm.R600.load.input(i32 9)
+  %5 = call float @llvm.R600.load.input(i32 10)
+  %6 = call float @llvm.R600.load.input(i32 12)
+  %7 = call float @llvm.R600.load.input(i32 13)
+  %8 = call float @llvm.R600.load.input(i32 14)
+  %9 = load <4 x float> addrspace(8)* null
+  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
+  %12 = fmul float %0, %3
+  %13 = fadd float %12, %6
+  %14 = fmul float %1, %4
+  %15 = fadd float %14, %7
+  %16 = fmul float %2, %5
+  %17 = fadd float %16, %8
+  %18 = fmul float %11, %11
+  %19 = fadd float %18, %0
+  %20 = insertelement <4 x float> undef, float %13, i32 0
+  %21 = insertelement <4 x float> %20, float %15, i32 1
+  %22 = insertelement <4 x float> %21, float %17, i32 2
+  %23 = insertelement <4 x float> %22, float %19, i32 3
+  %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10)
+  %25 = insertelement <4 x float> undef, float %24, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2)
+  ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.R600.load.input(i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
+attributes #2 = { readonly }
+attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll
new file mode 100644
index 0000000..895ad5e
--- /dev/null
+++ b/test/CodeGen/R600/r600cfg.ll
@@ -0,0 +1,124 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood
+;REQUIRES: asserts
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = bitcast float %0 to i32
+  %5 = icmp eq i32 %4, 0
+  %6 = sext i1 %5 to i32
+  %7 = bitcast i32 %6 to float
+  %8 = bitcast float %7 to i32
+  %9 = icmp ne i32 %8, 0
+  %. = select i1 %9, float 0x36A0000000000000, float %0
+  br label %LOOP
+
+LOOP:                                             ; preds = %LOOP47, %main_body
+  %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ]
+  %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ]
+  %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ]
+  %10 = bitcast float %temp4.1 to i32
+  %11 = icmp eq i32 %10, 1
+  %12 = sext i1 %11 to i32
+  %13 = bitcast i32 %12 to float
+  %14 = bitcast float %13 to i32
+  %15 = icmp ne i32 %14, 0
+  br i1 %15, label %IF41, label %ENDIF40
+
+IF41:                                             ; preds = %LOOP
+  %16 = insertelement <4 x float> undef, float %0, i32 0
+  %17 = insertelement <4 x float> %16, float %temp8.0, i32 1
+  %18 = insertelement <4 x float> %17, float %temp12.0, i32 2
+  %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1)
+  %20 = insertelement <4 x float> undef, float %0, i32 0
+  %21 = insertelement <4 x float> %20, float %temp8.0, i32 1
+  %22 = insertelement <4 x float> %21, float %temp12.0, i32 2
+  %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2)
+  %24 = insertelement <4 x float> undef, float %0, i32 0
+  %25 = insertelement <4 x float> %24, float %temp8.0, i32 1
+  %26 = insertelement <4 x float> %25, float %temp12.0, i32 2
+  %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4)
+  %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+  %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1
+  %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2
+  %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1)
+  %32 = insertelement <4 x float> undef, float %0, i32 0
+  %33 = insertelement <4 x float> %32, float %temp8.0, i32 1
+  %34 = insertelement <4 x float> %33, float %temp12.0, i32 2
+  %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2)
+  ret void
+
+ENDIF40:                                          ; preds = %LOOP
+  %36 = bitcast float %temp8.0 to i32
+  %37 = add i32 %36, 1
+  %38 = bitcast i32 %37 to float
+  %39 = bitcast float %temp4.1 to i32
+  %40 = urem i32 %39, 2
+  %41 = bitcast i32 %40 to float
+  %42 = bitcast float %41 to i32
+  %43 = icmp eq i32 %42, 0
+  %44 = sext i1 %43 to i32
+  %45 = bitcast i32 %44 to float
+  %46 = bitcast float %45 to i32
+  %47 = icmp ne i32 %46, 0
+  %48 = bitcast float %temp4.1 to i32
+  br i1 %47, label %IF44, label %ELSE45
+
+IF44:                                             ; preds = %ENDIF40
+  %49 = udiv i32 %48, 2
+  br label %ENDIF43
+
+ELSE45:                                           ; preds = %ENDIF40
+  %50 = mul i32 3, %48
+  %51 = add i32 %50, 1
+  br label %ENDIF43
+
+ENDIF43:                                          ; preds = %ELSE45, %IF44
+  %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ]
+  %52 = bitcast i32 %.sink to float
+  %53 = load <4 x float> addrspace(8)* null
+  %54 = extractelement <4 x float> %53, i32 0
+  %55 = bitcast float %54 to i32
+  br label %LOOP47
+
+LOOP47:                                           ; preds = %ENDIF48, %ENDIF43
+  %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ]
+  %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ]
+  %56 = bitcast float %temp28.0 to i32
+  %57 = icmp uge i32 %56, %55
+  %58 = sext i1 %57 to i32
+  %59 = bitcast i32 %58 to float
+  %60 = bitcast float %59 to i32
+  %61 = icmp ne i32 %60, 0
+  br i1 %61, label %LOOP, label %ENDIF48
+
+ENDIF48:                                          ; preds = %LOOP47
+  %62 = bitcast float %temp12.1 to i32
+  %63 = mul i32 %62, 2
+  %64 = bitcast i32 %63 to float
+  %65 = bitcast float %64 to i32
+  %66 = urem i32 %65, 2147483647
+  %67 = bitcast i32 %66 to float
+  %68 = bitcast float %temp28.0 to i32
+  %69 = add i32 %68, 1
+  %70 = bitcast i32 %69 to float
+  br label %LOOP47
+}
+
+; Function Attrs: readnone
+declare float @llvm.R600.load.input(i32) #1
+
+declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32)
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index 960d30d..5c4c4e9 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -19,7 +19,7 @@ entry:
 ; R600-CHECK: @rotl
 ; R600-CHECK: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
 ; R600-CHECK-NEXT: 32
-; R600-CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PV.[XYZW]}}
+; R600-CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
 
 ; SI-CHECK: @rotl
 ; SI-CHECK: V_SUB_I32_e64 [[DST:VGPR[0-9]+]], 32, {{[SV]GPR[0-9]+}}
diff --git a/test/CodeGen/R600/rv7x0_count3.ll b/test/CodeGen/R600/rv7x0_count3.ll
new file mode 100644
index 0000000..474d6ba
--- /dev/null
+++ b/test/CodeGen/R600/rv7x0_count3.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding  -mcpu=rv710 | FileCheck %s
+
+; CHECK: TEX 9 @4 ;  encoding: [0x04,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+   %1 = call float @llvm.R600.load.input(i32 4)
+   %2 = call float @llvm.R600.load.input(i32 5)
+   %3 = call float @llvm.R600.load.input(i32 6)
+   %4 = call float @llvm.R600.load.input(i32 7)
+   %5 = insertelement <4 x float> undef, float %1, i32 0
+   %6 = insertelement <4 x float> %5, float %2, i32 1
+   %7 = insertelement <4 x float> %6, float %3, i32 2
+   %8 = insertelement <4 x float> %7, float %4, i32 3
+   %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1)
+   %10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1)
+   %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1)
+   %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1)
+   %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1)
+   %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1)
+   %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1)
+   %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1)
+   %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1)
+   %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1)
+   %19 = fadd <4 x float> %9, %10
+   %20 = fadd <4 x float> %19, %11
+   %21 = fadd <4 x float> %20, %12
+   %22 = fadd <4 x float> %21, %13
+   %23 = fadd <4 x float> %22, %14
+   %24 = fadd <4 x float> %23, %15
+   %25 = fadd <4 x float> %24, %16
+   %26 = fadd <4 x float> %25, %17
+   %27 = fadd <4 x float> %26, %18
+   call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2)
+   ret void
+}
+
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
+
+; Function Attrs: readnone
+declare float @llvm.R600.load.input(i32) #1
+
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 7f568fc..7e2d559 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -29,7 +29,6 @@ ENDIF:
 ; for the icmp instruction
 
 ; CHECK: @test_b
-; CHECK: VTX_READ
 ; CHECK: SET{{[GTEQN]+}}_DX10
 ; CHECK-NEXT: PRED_
 ; CHECK-NEXT: ALU clause starting
diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll
index eb6e9d2..291a7bd 100644
--- a/test/CodeGen/R600/set-dx10.ll
+++ b/test/CodeGen/R600/set-dx10.ll
@@ -5,7 +5,7 @@
 ; SET*DX10 instructions.
 
 ; CHECK: @fcmp_une_select_fptosi
-; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -18,7 +18,7 @@ entry:
 }
 
 ; CHECK: @fcmp_une_select_i32
-; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -29,7 +29,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ueq_select_fptosi
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ueq_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -42,7 +42,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ueq_select_i32
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ueq_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -53,7 +53,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ugt_select_fptosi
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ugt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -66,7 +66,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ugt_select_i32
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ugt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -77,7 +77,7 @@ entry:
 }
 
 ; CHECK: @fcmp_uge_select_fptosi
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_uge_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -90,7 +90,7 @@ entry:
 }
 
 ; CHECK: @fcmp_uge_select_i32
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x,
+; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_uge_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -101,7 +101,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ule_select_fptosi
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}},
+; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ule_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -114,7 +114,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ule_select_i32
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}},
+; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ule_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -125,7 +125,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ult_select_fptosi
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}},
+; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ult_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -138,7 +138,7 @@ entry:
 }
 
 ; CHECK: @fcmp_ult_select_i32
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}},
+; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_ult_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 0752f2e..992de70 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,7 +1,23 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+; CHECK: @setcc_v2i32
+; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
+
+define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
+  %result = icmp eq <2 x i32> %a, %b
+  %sext = sext <2 x i1> %result to <2 x i32>
+  store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @setcc_v4i32
+; EG-CHECK-DAG: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
new file mode 100644
index 0000000..b0d4549
--- /dev/null
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+; This test checks that no VGPR to SGPR copies are created by the register
+; allocator.
+; CHECK: @main
+; CHECK: S_BUFFER_LOAD_DWORD [[DST:SGPR[0-9]]], {{[SGPR_[0-9]+}}, 0
+; CHECK: V_MOV_B32_e32 VGPR{{[0-9]}}, [[DST]]
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
+  %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+  %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
+  %25 = fptosi float %23 to i32
+  %26 = icmp ne i32 %25, 0
+  br i1 %26, label %ENDIF, label %ELSE
+
+ELSE:                                             ; preds = %main_body
+  %27 = fsub float -0.000000e+00, %22
+  br label %ENDIF
+
+ENDIF:                                            ; preds = %main_body, %ELSE
+  %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ]
+  %28 = fadd float %temp.0, %24
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00)
+  ret void
+}
+
+; We just want ot make sure the program doesn't crash
+; CHECK: @loop
+
+define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
+  %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
+  %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
+  %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12)
+  %26 = fptosi float %25 to i32
+  %27 = bitcast i32 %26 to float
+  %28 = bitcast float %27 to i32
+  br label %LOOP
+
+LOOP:                                             ; preds = %ENDIF, %main_body
+  %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ]
+  %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ]
+  %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ]
+  %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ]
+  %29 = bitcast float %temp8.0 to i32
+  %30 = icmp sge i32 %29, %28
+  %31 = sext i1 %30 to i32
+  %32 = bitcast i32 %31 to float
+  %33 = bitcast float %32 to i32
+  %34 = icmp ne i32 %33, 0
+  br i1 %34, label %IF, label %ENDIF
+
+IF:                                               ; preds = %LOOP
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
+  ret void
+
+ENDIF:                                            ; preds = %LOOP
+  %35 = bitcast float %temp8.0 to i32
+  %36 = add i32 %35, 1
+  %37 = bitcast i32 %36 to float
+  br label %LOOP
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readonly }
+
+!0 = metadata !{metadata !"const", null, i32 1}
+
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index db970e9..d99e325 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -1,12 +1,39 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-; CHECK: @shl_v4i32
-; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @shl_v2i32
+;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+;SI-CHECK: @shl_v2i32
+;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = shl <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @shl_v4i32
+;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @shl_v4i32
+;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
   %result = shl <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/short-args.ll b/test/CodeGen/R600/short-args.ll
index b69e327..20d0ae4 100644
--- a/test/CodeGen/R600/short-args.ll
+++ b/test/CodeGen/R600/short-args.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @i8_arg
-; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; EG-CHECK: @i8_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK: BUFFER_LOAD_UBYTE
 
 define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
@@ -10,8 +13,9 @@ entry:
   ret void
 }
 
-; CHECK: @i8_zext_arg
-; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
+; EG-CHECK: @i8_zext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
 
 define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
@@ -20,8 +24,20 @@ entry:
   ret void
 }
 
-; CHECK: @i16_arg
-; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; EG-CHECK: @i8_sext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
+
+define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+entry:
+  %0 = sext i8 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK: @i16_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK: BUFFER_LOAD_USHORT
 
 define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
@@ -30,8 +46,9 @@ entry:
   ret void
 }
 
-; CHECK: @i16_zext_arg
-; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
+; EG-CHECK: @i16_zext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
 
 define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
@@ -39,3 +56,14 @@ entry:
   store i32 %0, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; EG-CHECK: @i16_sext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
+
+define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+entry:
+  %0 = sext i16 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 91a8eb7..4e88494 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,11 +1,28 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @sint_to_fp_v4i32
-; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: @sint_to_fp_v2i32
+; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+; SI-CHECK: @sint_to_fp_v2i32
+; SI-CHECK: V_CVT_F32_I32_e32
+; SI-CHECK: V_CVT_F32_I32_e32
+define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+  %result = sitofp <2 x i32> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
 
+; R600-CHECK: @sint_to_fp_v4i32
+; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @sint_to_fp_v4i32
+; SI-CHECK: V_CVT_F32_I32_e32
+; SI-CHECK: V_CVT_F32_I32_e32
+; SI-CHECK: V_CVT_F32_I32_e32
+; SI-CHECK: V_CVT_F32_I32_e32
 define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %value = load <4 x i32> addrspace(1) * %in
   %result = sitofp <4 x i32> %value to <4 x float>
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index 972542d..5220a96 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll
@@ -1,13 +1,54 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-; CHECK: @ashr_v4i32
-; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @ashr_v2i32
+;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+;SI-CHECK: @ashr_v2i32
+;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = ashr <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @ashr_v4i32
+;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @ashr_v4i32
+;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
   %result = ashr <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+;EG-CHECK: @ashr_i64
+;EG-CHECK: ASHR
+
+;SI-CHECK: @ashr_i64
+;SI-CHECK: V_ASHR_I64
+define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = sext i32 %in to i64
+  %1 = ashr i64 %0, 8
+  store i64 %1, i64 addrspace(1)* %out
+  ret void
+}
+
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index 5f63600..d1dcd7f 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll
@@ -1,12 +1,40 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-; CHECK: @lshr_v4i32
-; CHECK: LSHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: LSHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: LSHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: LSHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @lshr_v2i32
+;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+;SI-CHECK: @lshr_v2i32
+;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = lshr <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+
+;EG-CHECK: @lshr_v4i32
+;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @lshr_v4i32
+;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32> addrspace(1) * %b_ptr
   %result = lshr <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index 4d673f3..1bda5e6 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -1,9 +1,12 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
 ; floating-point store
 ; EG-CHECK: @store_f32
 ; EG-CHECK: RAT_WRITE_CACHELESS_32_eg T{{[0-9]+\.X, T[0-9]+\.X}}, 1
+; CM-CHECK: @store_f32
+; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
 ; SI-CHECK: @store_f32
 ; SI-CHECK: BUFFER_STORE_DWORD
 
@@ -11,3 +14,49 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
   store float %in, float addrspace(1)* %out
   ret void
 }
+
+; vec2 floating-point stores
+; EG-CHECK: @store_v2f32
+; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
+; CM-CHECK: @store_v2f32
+; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
+; SI-CHECK: @store_v2f32
+; SI-CHECK: BUFFER_STORE_DWORDX2
+
+define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
+  %1 = insertelement <2 x float> %0, float %b, i32 0
+  store <2 x float> %1, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; The stores in this function are combined by the optimizer to create a
+; 64-bit store with 32-bit alignment.  This is legal for SI and the legalizer
+; should not try to split the 64-bit store back into 2 32-bit stores.
+;
+; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
+; be two 32-bit stores.
+
+; EG-CHECK: @vecload2
+; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
+; CM-CHECK: @vecload2
+; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
+; SI-CHECK: @vecload2
+; SI-CHECK: BUFFER_STORE_DWORDX2
+define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+entry:
+  %0 = load i32 addrspace(2)* %mem, align 4, !tbaa !5
+  %arrayidx1.i = getelementptr inbounds i32 addrspace(2)* %mem, i64 1
+  %1 = load i32 addrspace(2)* %arrayidx1.i, align 4, !tbaa !5
+  store i32 %0, i32 addrspace(1)* %out, align 4, !tbaa !5
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
+  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4, !tbaa !5
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!5 = metadata !{metadata !"int", metadata !6}
+!6 = metadata !{metadata !"omnipotent char", metadata !7}
+!7 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index 12bfba3..3bd4cb8 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -1,11 +1,36 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-;CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @test2
+;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+;SI-CHECK: @test2
+;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = sub <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test4
+;EG-CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test4
+;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll
new file mode 100644
index 0000000..b2175af
--- /dev/null
+++ b/test/CodeGen/R600/swizzle-export.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+
+;EG-CHECK: @main
+;EG-CHECK: EXPORT T{{[0-9]+}}.XYXX
+;EG-CHECK: EXPORT T{{[0-9]+}}.ZXXX
+;EG-CHECK: EXPORT T{{[0-9]+}}.XXWX
+;EG-CHECK: EXPORT T{{[0-9]+}}.XXXW
+
+define void @main() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = load <4 x float> addrspace(8)* null
+  %5 = extractelement <4 x float> %4, i32 1
+  %6 = load <4 x float> addrspace(8)* null
+  %7 = extractelement <4 x float> %6, i32 2
+  %8 = load <4 x float> addrspace(8)* null
+  %9 = extractelement <4 x float> %8, i32 0
+  %10 = fmul float 0.000000e+00, %9
+  %11 = load <4 x float> addrspace(8)* null
+  %12 = extractelement <4 x float> %11, i32 0
+  %13 = fmul float %5, %12
+  %14 = load <4 x float> addrspace(8)* null
+  %15 = extractelement <4 x float> %14, i32 0
+  %16 = fmul float 0.000000e+00, %15
+  %17 = load <4 x float> addrspace(8)* null
+  %18 = extractelement <4 x float> %17, i32 0
+  %19 = fmul float 0.000000e+00, %18
+  %20 = load <4 x float> addrspace(8)* null
+  %21 = extractelement <4 x float> %20, i32 0
+  %22 = fmul float %7, %21
+  %23 = load <4 x float> addrspace(8)* null
+  %24 = extractelement <4 x float> %23, i32 0
+  %25 = fmul float 0.000000e+00, %24
+  %26 = load <4 x float> addrspace(8)* null
+  %27 = extractelement <4 x float> %26, i32 0
+  %28 = fmul float 0.000000e+00, %27
+  %29 = load <4 x float> addrspace(8)* null
+  %30 = extractelement <4 x float> %29, i32 0
+  %31 = fmul float 0.000000e+00, %30
+  %32 = load <4 x float> addrspace(8)* null
+  %33 = extractelement <4 x float> %32, i32 0
+  %34 = fmul float 0.000000e+00, %33
+  %35 = load <4 x float> addrspace(8)* null
+  %36 = extractelement <4 x float> %35, i32 0
+  %37 = fmul float 0.000000e+00, %36
+  %38 = load <4 x float> addrspace(8)* null
+  %39 = extractelement <4 x float> %38, i32 0
+  %40 = fmul float 1.000000e+00, %39
+  %41 = load <4 x float> addrspace(8)* null
+  %42 = extractelement <4 x float> %41, i32 0
+  %43 = fmul float 0.000000e+00, %42
+  %44 = load <4 x float> addrspace(8)* null
+  %45 = extractelement <4 x float> %44, i32 0
+  %46 = fmul float 0.000000e+00, %45
+  %47 = load <4 x float> addrspace(8)* null
+  %48 = extractelement <4 x float> %47, i32 0
+  %49 = fmul float 0.000000e+00, %48
+  %50 = load <4 x float> addrspace(8)* null
+  %51 = extractelement <4 x float> %50, i32 0
+  %52 = fmul float 0.000000e+00, %51
+  %53 = load <4 x float> addrspace(8)* null
+  %54 = extractelement <4 x float> %53, i32 0
+  %55 = fmul float 1.000000e+00, %54
+  %56 = insertelement <4 x float> undef, float %0, i32 0
+  %57 = insertelement <4 x float> %56, float %1, i32 1
+  %58 = insertelement <4 x float> %57, float %2, i32 2
+  %59 = insertelement <4 x float> %58, float %3, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1)
+  %60 = insertelement <4 x float> undef, float %10, i32 0
+  %61 = insertelement <4 x float> %60, float %13, i32 1
+  %62 = insertelement <4 x float> %61, float %16, i32 2
+  %63 = insertelement <4 x float> %62, float %19, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2)
+  %64 = insertelement <4 x float> undef, float %22, i32 0
+  %65 = insertelement <4 x float> %64, float %25, i32 1
+  %66 = insertelement <4 x float> %65, float %28, i32 2
+  %67 = insertelement <4 x float> %66, float %31, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2)
+  %68 = insertelement <4 x float> undef, float %34, i32 0
+  %69 = insertelement <4 x float> %68, float %37, i32 1
+  %70 = insertelement <4 x float> %69, float %40, i32 2
+  %71 = insertelement <4 x float> %70, float %43, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2)
+  %72 = insertelement <4 x float> undef, float %46, i32 0
+  %73 = insertelement <4 x float> %72, float %49, i32 1
+  %74 = insertelement <4 x float> %73, float %52, i32 2
+  %75 = insertelement <4 x float> %74, float %55, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2)
+  ret void
+}
+
+; EG-CHECK: @main2
+; EG-CHECK: T{{[0-9]+}}.ZXY0
+
+define void @main2() #0 {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %5 = extractelement <4 x float> %4, i32 0
+  %6 = call float @llvm.cos.f32(float %5)
+  %7 = load <4 x float> addrspace(8)* null
+  %8 = extractelement <4 x float> %7, i32 0
+  %9 = load <4 x float> addrspace(8)* null
+  %10 = extractelement <4 x float> %9, i32 1
+  %11 = insertelement <4 x float> undef, float %0, i32 0
+  %12 = insertelement <4 x float> %11, float %1, i32 1
+  %13 = insertelement <4 x float> %12, float %2, i32 2
+  %14 = insertelement <4 x float> %13, float %3, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %14, i32 60, i32 1)
+  %15 = insertelement <4 x float> undef, float %6, i32 0
+  %16 = insertelement <4 x float> %15, float %8, i32 1
+  %17 = insertelement <4 x float> %16, float %10, i32 2
+  %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+  ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.R600.load.input(i32) #1
+
+; Function Attrs: nounwind readonly
+declare float @llvm.cos.f32(float) #2
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index b81e366..08fe2ef 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,11 +1,30 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
 ;The code generated by udiv is long and complex and may frequently change.
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
 ;a v4i32 udiv
-;CHECK: CF_END
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+;EG-CHECK: @test2
+;EG-CHECK: CF_END
+;SI-CHECK: @test2
+;SI-CHECK: S_ENDPGM
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = udiv <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test4
+;EG-CHECK: CF_END
+;SI-CHECK: @test4
+;SI-CHECK: S_ENDPGM
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index 9054fc4..faac77a 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,11 +1,28 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @uint_to_fp_v4i32
-; CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: @uint_to_fp_v2i32
+; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+; SI-CHECK: @uint_to_fp_v2i32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_CVT_F32_U32_e32
+define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
+  %result = uitofp <2 x i32> %in to <2 x float>
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
 
+; R600-CHECK: @uint_to_fp_v4i32
+; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @uint_to_fp_v4i32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_CVT_F32_U32_e32
 define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %value = load <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
diff --git a/test/CodeGen/R600/uitofp.ll b/test/CodeGen/R600/uitofp.ll
deleted file mode 100644
index 6cf9e6a..0000000
--- a/test/CodeGen/R600/uitofp.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
-
-;CHECK: V_CVT_F32_U32_e32
-
-define void @main(i32 %p) #0 {
-main_body:
-  %0 = uitofp i32 %p to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %0, float %0, float %0, float %0)
-  ret void
-}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { "ShaderType"="0" }
-
-!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll
index b311f4c..cf29833 100644
--- a/test/CodeGen/R600/unsupported-cc.ll
+++ b/test/CodeGen/R600/unsupported-cc.ll
@@ -3,7 +3,7 @@
 ; These tests are for condition codes that are not supported by the hardware
 
 ; CHECK: @slt
-; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @slt(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -14,7 +14,7 @@ entry:
 }
 
 ; CHECK: @ult_i32
-; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -25,7 +25,7 @@ entry:
 }
 
 ; CHECK: @ult_float
-; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @ult_float(float addrspace(1)* %out, float %in) {
 entry:
@@ -36,7 +36,7 @@ entry:
 }
 
 ; CHECK: @olt
-; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ;CHECK-NEXT: 1084227584(5.000000e+00)
 define void @olt(float addrspace(1)* %out, float %in) {
 entry:
@@ -47,7 +47,7 @@ entry:
 }
 
 ; CHECK: @sle
-; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @sle(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -58,7 +58,7 @@ entry:
 }
 
 ; CHECK: @ule_i32
-; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -69,7 +69,7 @@ entry:
 }
 
 ; CHECK: @ule_float
-; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @ule_float(float addrspace(1)* %out, float %in) {
 entry:
@@ -80,7 +80,7 @@ entry:
 }
 
 ; CHECK: @ole
-; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}},
+; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT:1084227584(5.000000e+00)
 define void @ole(float addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index a2cc0bd..cf3474c 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,11 +1,30 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
 ;The code generated by urem is long and complex and may frequently change.
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
-;a v4i32 urem
-;CHECK: CF_END
+;a v2i32/v4i32 urem
 
-define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+;EG-CHECK: @test2
+;EG-CHECK: CF_END
+;SI-CHECK: @test2
+;SI-CHECK: S_ENDPGM
+
+define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %result = urem <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test4
+;EG-CHECK: CF_END
+;SI-CHECK: @test4
+;SI-CHECK: S_ENDPGM
+
+define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll
new file mode 100644
index 0000000..d892229
--- /dev/null
+++ b/test/CodeGen/R600/vertex-fetch-encoding.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI-CHECK %s
+; RUN: not llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
+
+; NI-CHECK: @vtx_fetch32
+; NI-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
+; CM-CHECK: @vtx_fetch32
+; CM-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
+
+define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %0 = load i32 addrspace(1)* %in
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; NI-CHECK: @vtx_fetch128
+; NI-CHECK: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
+; XXX: Add a case for Cayman when v4i32 stores are supported.
+
+define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+entry:
+  %0 = load <4 x i32> addrspace(1)* %in
+  store <4 x i32> %0, <4 x i32> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
index edd7ba0..72a9084 100644
--- a/test/CodeGen/R600/vselect.ll
+++ b/test/CodeGen/R600/vselect.ll
@@ -1,10 +1,53 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
 
-; CHECK: @test_select_v4i32
-; CHECK: CNDE_INT T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: CNDE_INT T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @test_select_v2i32
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test_select_v2i32
+;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: V_CNDMASK_B32_e64
+
+define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+entry:
+  %0 = load <2 x i32> addrspace(1)* %in0
+  %1 = load <2 x i32> addrspace(1)* %in1
+  %cmp = icmp ne <2 x i32> %0, %1
+  %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test_select_v2f32
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test_select_v2f32
+;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: V_CNDMASK_B32_e64
+
+define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
+entry:
+  %0 = load <2 x float> addrspace(1)* %in0
+  %1 = load <2 x float> addrspace(1)* %in1
+  %cmp = fcmp one <2 x float> %0, %1
+  %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1
+  store <2 x float> %result, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @test_select_v4i32
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @test_select_v4i32
+;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: V_CNDMASK_B32_e64
 
 define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
 entry:
@@ -15,3 +58,19 @@ entry:
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+;EG-CHECK: @test_select_v4f32
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+entry:
+  %0 = load <4 x float> addrspace(1)* %in0
+  %1 = load <4 x float> addrspace(1)* %in1
+  %cmp = fcmp one <4 x float> %0, %1
+  %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/vtx-schedule.ll b/test/CodeGen/R600/vtx-schedule.ll
index a0c79e3..97d37ed 100644
--- a/test/CodeGen/R600/vtx-schedule.ll
+++ b/test/CodeGen/R600/vtx-schedule.ll
@@ -6,17 +6,13 @@
 
 ; CHECK: @test
 ; CHECK: Fetch clause
-; CHECK_VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 40
-; CHECK_VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 44
-; CHECK: Fetch clause
 ; CHECK_VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
+; CHECK: Fetch clause
 ; CHECK_VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
-define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in0, i32 addrspace(1)* nocapture %in1) {
+define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
 entry:
-  %0 = load i32 addrspace(1)* %in0, align 4
-  %1 = load i32 addrspace(1)* %in1, align 4
-  %cmp.i = icmp slt i32 %0, %1
-  %cond.i = select i1 %cmp.i, i32 %0, i32 %1
-  store i32 %cond.i, i32 addrspace(1)* %out, align 4
+  %0 = load i32 addrspace(1)* addrspace(1)* %in0
+  %1 = load i32 addrspace(1)* %0
+  store i32 %1, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 46e3e54..7998983 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -3,7 +3,7 @@
 
 ; R600-CHECK: @ngroups_x
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 0
+; R600-CHECK: MOV * [[VAL]], KC0[0].X
 ; SI-CHECK: @ngroups_x
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 0
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -17,7 +17,7 @@ entry:
 
 ; R600-CHECK: @ngroups_y
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 4
+; R600-CHECK: MOV * [[VAL]], KC0[0].Y
 ; SI-CHECK: @ngroups_y
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 1
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -31,7 +31,7 @@ entry:
 
 ; R600-CHECK: @ngroups_z
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 8
+; R600-CHECK: MOV * [[VAL]], KC0[0].Z
 ; SI-CHECK: @ngroups_z
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 2
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -45,7 +45,7 @@ entry:
 
 ; R600-CHECK: @global_size_x
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 12
+; R600-CHECK: MOV * [[VAL]], KC0[0].W
 ; SI-CHECK: @global_size_x
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 3
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -59,7 +59,7 @@ entry:
 
 ; R600-CHECK: @global_size_y
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 16
+; R600-CHECK: MOV * [[VAL]], KC0[1].X
 ; SI-CHECK: @global_size_y
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 4
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -73,7 +73,7 @@ entry:
 
 ; R600-CHECK: @global_size_z
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 20
+; R600-CHECK: MOV * [[VAL]], KC0[1].Y
 ; SI-CHECK: @global_size_z
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 5
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -87,7 +87,7 @@ entry:
 
 ; R600-CHECK: @local_size_x
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 24
+; R600-CHECK: MOV * [[VAL]], KC0[1].Z
 ; SI-CHECK: @local_size_x
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 6
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -101,7 +101,7 @@ entry:
 
 ; R600-CHECK: @local_size_y
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 28
+; R600-CHECK: MOV * [[VAL]], KC0[1].W
 ; SI-CHECK: @local_size_y
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 7
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
@@ -115,7 +115,7 @@ entry:
 
 ; R600-CHECK: @local_size_z
 ; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: VTX_READ_32 [[VAL]], [[VAL]], 32
+; R600-CHECK: MOV * [[VAL]], KC0[2].X
 ; SI-CHECK: @local_size_z
 ; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 8
 ; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index cf612e0..f52729d 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -1,12 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
 
-; CHECK: @xor_v4i32
-; CHECK: XOR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: XOR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: XOR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: XOR_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: @xor_v2i32
+;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+;SI-CHECK: @xor_v2i32
+;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+
+define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+  %a = load <2 x i32> addrspace(1) * %in0
+  %b = load <2 x i32> addrspace(1) * %in1
+  %result = xor <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK: @xor_v4i32
+;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK: @xor_v4i32
+;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+
+define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+  %a = load <4 x i32> addrspace(1) * %in0
+  %b = load <4 x i32> addrspace(1) * %in1
   %result = xor <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
new file mode 100644
index 0000000..413b849
--- /dev/null
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK: @test
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg
+; R600-CHECK: RAT_WRITE_CACHELESS_32_eg
+
+; SI-CHECK: @test
+; SI-CHECK: V_MOV_B32_e32 [[ZERO:VGPR[0-9]]], 0
+; SI-CHECK: BUFFER_STORE_DWORDX2 VGPR0_[[ZERO]]
+define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+  %0 = mul i32 %a, %b
+  %1 = add i32 %0, %c
+  %2 = zext i32 %1 to i64
+  store i64 %2, i64 addrspace(1)* %out
+  ret void
+}