23 files changed, 467 insertions, 0 deletions
diff --git a/test/CodeGen/CellSPU/and_ops.ll b/test/CodeGen/CellSPU/and_ops.ll
index f23355e..6858dba 100644
--- a/test/CodeGen/CellSPU/and_ops.ll
+++ b/test/CodeGen/CellSPU/and_ops.ll
@@ -4,6 +4,8 @@
 ; RUN: grep andi   %t1.s | count 36
 ; RUN: grep andhi  %t1.s | count 30
 ; RUN: grep andbi  %t1.s | count 4
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 ; AND instruction generation:
 define <4 x i32> @and_v4i32_1(<4 x i32> %arg1, <4 x i32> %arg2) {
diff --git a/test/CodeGen/CellSPU/call_indirect.ll b/test/CodeGen/CellSPU/call_indirect.ll
new file mode 100644
index 0000000..7aa8abc
--- /dev/null
+++ b/test/CodeGen/CellSPU/call_indirect.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep bisl    %t1.s | count 6 &&
+; RUN: grep ila     %t1.s | count 1 &&
+; RUN: grep rotqbyi %t1.s | count 4 &&
+; RUN: grep lqa     %t1.s | count 4 &&
+; RUN: grep lqd     %t1.s | count 6 &&
+; RUN: grep dispatch_tab %t1.s | count 10
+; ModuleID = 'call_indirect.bc'
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128"
+target triple = "spu-unknown-elf"
+
+@dispatch_tab = global [6 x void (i32, float)*] zeroinitializer, align 16
+
+define void @dispatcher(i32 %i_arg, float %f_arg) {
+entry:
+	%tmp2 = load void (i32, float)** getelementptr ([6 x void (i32, float)*]* @dispatch_tab, i32 0, i32 0), align 16
+	tail call void %tmp2( i32 %i_arg, float %f_arg )
+	%tmp2.1 = load void (i32, float)** getelementptr ([6 x void (i32, float)*]* @dispatch_tab, i32 0, i32 1), align 4
+	tail call void %tmp2.1( i32 %i_arg, float %f_arg )
+	%tmp2.2 = load void (i32, float)** getelementptr ([6 x void (i32, float)*]* @dispatch_tab, i32 0, i32 2), align 4
+	tail call void %tmp2.2( i32 %i_arg, float %f_arg )
+	%tmp2.3 = load void (i32, float)** getelementptr ([6 x void (i32, float)*]* @dispatch_tab, i32 0, i32 3), align 4
+	tail call void %tmp2.3( i32 %i_arg, float %f_arg )
+	%tmp2.4 = load void (i32, float)** getelementptr ([6 x void (i32, float)*]* @dispatch_tab, i32 0, i32 4), align 4
+	tail call void %tmp2.4( i32 %i_arg, float %f_arg )
+	%tmp2.5 = load void (i32, float)** getelementptr ([6 x void (i32, float)*]* @dispatch_tab, i32 0, i32 5), align 4
+	tail call void %tmp2.5( i32 %i_arg, float %f_arg )
+	ret void
+}
diff --git a/test/CodeGen/CellSPU/ctpop.ll b/test/CodeGen/CellSPU/ctpop.ll
index 3e2bc64..406a20a 100644
--- a/test/CodeGen/CellSPU/ctpop.ll
+++ b/test/CodeGen/CellSPU/ctpop.ll
@@ -3,6 +3,8 @@
 ; RUN: grep andi    %t1.s | count 3 &&
 ; RUN: grep rotmi   %t1.s | count 2 &&
 ; RUN: grep rothmi  %t1.s | count 1
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 declare i32 @llvm.ctpop.i8(i8)
 declare i32 @llvm.ctpop.i16(i16)
diff --git a/test/CodeGen/CellSPU/dp_farith.ll b/test/CodeGen/CellSPU/dp_farith.ll
index 58c56e1..5cdb33e 100644
--- a/test/CodeGen/CellSPU/dp_farith.ll
+++ b/test/CodeGen/CellSPU/dp_farith.ll
@@ -7,6 +7,8 @@
 ; RUN: grep dfnms  %t1.s | count 4
 ;
 ; This file includes double precision floating point arithmetic instructions
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define double @fadd(double %arg1, double %arg2) {
 	%A = add double %arg1, %arg2
diff --git a/test/CodeGen/CellSPU/eqv.ll b/test/CodeGen/CellSPU/eqv.ll
index a4d6dbb..0f02180 100644
--- a/test/CodeGen/CellSPU/eqv.ll
+++ b/test/CodeGen/CellSPU/eqv.ll
@@ -10,6 +10,8 @@
 ; Alternatively, a ^ ~b, which the compiler will also match.
 
 ; ModuleID = 'eqv.bc'
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define <4 x i32> @equiv_v4i32_1(<4 x i32> %arg1, <4 x i32> %arg2) {
 	%A = and <4 x i32> %arg1, %arg2		; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/CellSPU/extract_elt.ll b/test/CodeGen/CellSPU/extract_elt.ll
index ab485a8..f9cc32e 100644
--- a/test/CodeGen/CellSPU/extract_elt.ll
+++ b/test/CodeGen/CellSPU/extract_elt.ll
@@ -5,6 +5,8 @@
 ; RUN: grep   lqx %t2.s | count 27 &&
 ; RUN: grep space %t1.s | count 8 &&
 ; RUN: grep  byte %t1.s | count 424
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define i32 @i32_extract_0(<4 x i32> %v) {
 entry:
diff --git a/test/CodeGen/CellSPU/fcmp.ll b/test/CodeGen/CellSPU/fcmp.ll
index 8ae97e6..f4406d6 100644
--- a/test/CodeGen/CellSPU/fcmp.ll
+++ b/test/CodeGen/CellSPU/fcmp.ll
@@ -3,6 +3,8 @@
 ; RUN: grep fcmeq %t1.s | count 1
 ;
 ; This file includes standard floating point arithmetic instructions
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 declare double @fabs(double)
 declare float @fabsf(float)
diff --git a/test/CodeGen/CellSPU/fdiv.ll b/test/CodeGen/CellSPU/fdiv.ll
index d55b12b..a107bbe 100644
--- a/test/CodeGen/CellSPU/fdiv.ll
+++ b/test/CodeGen/CellSPU/fdiv.ll
@@ -6,6 +6,8 @@
 ; RUN: grep fnms     %t1.s | count 2
 ;
 ; This file includes standard floating point arithmetic instructions
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define float @fdiv32(float %arg1, float %arg2) {
 	%A = fdiv float %arg1,  %arg2
diff --git a/test/CodeGen/CellSPU/fneg-fabs.ll b/test/CodeGen/CellSPU/fneg-fabs.ll
index 1abdcf6..a183483 100644
--- a/test/CodeGen/CellSPU/fneg-fabs.ll
+++ b/test/CodeGen/CellSPU/fneg-fabs.ll
@@ -4,6 +4,8 @@
 ; RUN: grep xor     %t1.s | count 4 &&
 ; RUN: grep and     %t1.s | count 5 &&
 ; RUN: grep andbi   %t1.s | count 3
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define double @fneg_dp(double %X) {
 	%Y = sub double -0.000000e+00, %X
diff --git a/test/CodeGen/CellSPU/immed16.ll b/test/CodeGen/CellSPU/immed16.ll
index 19cabc4..603ec05 100644
--- a/test/CodeGen/CellSPU/immed16.ll
+++ b/test/CodeGen/CellSPU/immed16.ll
@@ -1,5 +1,7 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
 ; RUN: grep "ilh" %t1.s | count 5
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define i16 @test_1() {
   %x = alloca i16, align 16
diff --git a/test/CodeGen/CellSPU/immed32.ll b/test/CodeGen/CellSPU/immed32.ll
index 6a5a361..4bf5bbd 100644
--- a/test/CodeGen/CellSPU/immed32.ll
+++ b/test/CodeGen/CellSPU/immed32.ll
@@ -12,6 +12,8 @@
 ; RUN: grep 49077 %t1.s | count 1 &&
 ; RUN: grep  1267 %t1.s | count 2 &&
 ; RUN: grep 16309 %t1.s | count 1
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define i32 @test_1() {
   ret i32 4784128		;; ILHU via pattern (0x49000)
diff --git a/test/CodeGen/CellSPU/immed64.ll b/test/CodeGen/CellSPU/immed64.ll
index c4eec8b..4d388b1 100644
--- a/test/CodeGen/CellSPU/immed64.ll
+++ b/test/CodeGen/CellSPU/immed64.ll
@@ -11,6 +11,9 @@
 ; RUN: grep      128 %t1.s | count 30 &&
 ; RUN: grep      224 %t1.s | count 2
 
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
 ;  1311768467750121234 => 0x 12345678 abcdef12 (4660,22136/43981,61202)
 ; 18446744073709551591 => 0x ffffffff ffffffe7 (-25)
 ; 18446744073708516742 => 0x ffffffff fff03586 (-1034874)
diff --git a/test/CodeGen/CellSPU/int2fp.ll b/test/CodeGen/CellSPU/int2fp.ll
index 95a4984..b4cfea8 100644
--- a/test/CodeGen/CellSPU/int2fp.ll
+++ b/test/CodeGen/CellSPU/int2fp.ll
@@ -7,6 +7,9 @@
 ; RUN: grep andi  %t1.s | count 1 &&
 ; RUN: grep ila   %t1.s | count 1
 
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
 define float @sitofp_i32(i32 %arg1) {
 	%A = sitofp i32 %arg1 to float		; <float> [#uses=1]
 	ret float %A
diff --git a/test/CodeGen/CellSPU/intrinsics_branch.ll b/test/CodeGen/CellSPU/intrinsics_branch.ll
new file mode 100644
index 0000000..5051cd5
--- /dev/null
+++ b/test/CodeGen/CellSPU/intrinsics_branch.ll
@@ -0,0 +1,150 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep ceq     %t1.s | count 30 &&
+; RUN: grep ceqb    %t1.s | count 10 &&
+; RUN: grep ceqhi   %t1.s | count 5 &&
+; RUN: grep ceqi    %t1.s | count 5 &&
+; RUN: grep cgt     %t1.s | count 30 &&
+; RUN: grep cgtb    %t1.s | count 10 &&
+; RUN: grep cgthi   %t1.s | count 5 &&
+; RUN: grep cgti    %t1.s | count 5
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
+declare <4 x i32> @llvm.spu.si.shli(<4 x i32>, i8)
+
+declare <4 x i32> @llvm.spu.si.ceq(<4 x i32>, <4 x i32>)
+declare <16 x i8> @llvm.spu.si.ceqb(<16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.spu.si.ceqh(<8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.spu.si.ceqi(<4 x i32>, i16)
+declare <8 x i16> @llvm.spu.si.ceqhi(<8 x i16>, i16)
+declare <16 x i8> @llvm.spu.si.ceqbi(<16 x i8>, i8)
+
+declare <4 x i32> @llvm.spu.si.cgt(<4 x i32>, <4 x i32>)
+declare <16 x i8> @llvm.spu.si.cgtb(<16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.spu.si.cgth(<8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.spu.si.cgti(<4 x i32>, i16)
+declare <8 x i16> @llvm.spu.si.cgthi(<8 x i16>, i16)
+declare <16 x i8> @llvm.spu.si.cgtbi(<16 x i8>, i8)
+
+declare <4 x i32> @llvm.spu.si.clgt(<4 x i32>, <4 x i32>)
+declare <16 x i8> @llvm.spu.si.clgtb(<16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.spu.si.clgth(<8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.spu.si.clgti(<4 x i32>, i16)
+declare <8 x i16> @llvm.spu.si.clgthi(<8 x i16>, i16)
+declare <16 x i8> @llvm.spu.si.clgtbi(<16 x i8>, i8)
+
+
+
+define <4 x i32> @test(<4 x i32> %A) {
+	call <4 x i32> @llvm.spu.si.shli(<4 x i32> %A, i8 3)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <4 x i32> @ceqtest(<4 x i32> %A, <4 x i32> %B) {
+	call <4 x i32> @llvm.spu.si.ceq(<4 x i32> %A, <4 x i32> %B)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <8 x i16> @ceqhtest(<8 x i16> %A, <8 x i16> %B) {
+	call <8 x i16> @llvm.spu.si.ceqh(<8 x i16> %A, <8 x i16> %B)
+	%Y = bitcast <8 x i16> %1 to <8 x i16>
+	ret <8 x i16> %Y
+}
+
+define <16 x i8> @ceqbtest(<16 x i8> %A, <16 x i8> %B) {
+	call <16 x i8> @llvm.spu.si.ceqb(<16 x i8> %A, <16 x i8> %B)
+	%Y = bitcast <16 x i8> %1 to <16 x i8>
+	ret <16 x i8> %Y
+}
+
+define <4 x i32> @ceqitest(<4 x i32> %A) {
+	call <4 x i32> @llvm.spu.si.ceqi(<4 x i32> %A, i16 65)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <8 x i16> @ceqhitest(<8 x i16> %A) {
+	call <8 x i16> @llvm.spu.si.ceqhi(<8 x i16> %A, i16 65)
+	%Y = bitcast <8 x i16> %1 to <8 x i16>
+	ret <8 x i16> %Y
+}
+
+define <16 x i8> @ceqbitest(<16 x i8> %A) {
+	call <16 x i8> @llvm.spu.si.ceqbi(<16 x i8> %A, i8 65)
+	%Y = bitcast <16 x i8> %1 to <16 x i8>
+	ret <16 x i8> %Y
+}
+
+define <4 x i32> @cgttest(<4 x i32> %A, <4 x i32> %B) {
+	call <4 x i32> @llvm.spu.si.cgt(<4 x i32> %A, <4 x i32> %B)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <8 x i16> @cgthtest(<8 x i16> %A, <8 x i16> %B) {
+	call <8 x i16> @llvm.spu.si.cgth(<8 x i16> %A, <8 x i16> %B)
+	%Y = bitcast <8 x i16> %1 to <8 x i16>
+	ret <8 x i16> %Y
+}
+
+define <16 x i8> @cgtbtest(<16 x i8> %A, <16 x i8> %B) {
+	call <16 x i8> @llvm.spu.si.cgtb(<16 x i8> %A, <16 x i8> %B)
+	%Y = bitcast <16 x i8> %1 to <16 x i8>
+	ret <16 x i8> %Y
+}
+
+define <4 x i32> @cgtitest(<4 x i32> %A) {
+	call <4 x i32> @llvm.spu.si.cgti(<4 x i32> %A, i16 65)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <8 x i16> @cgthitest(<8 x i16> %A) {
+	call <8 x i16> @llvm.spu.si.cgthi(<8 x i16> %A, i16 65)
+	%Y = bitcast <8 x i16> %1 to <8 x i16>
+	ret <8 x i16> %Y
+}
+
+define <16 x i8> @cgtbitest(<16 x i8> %A) {
+	call <16 x i8> @llvm.spu.si.cgtbi(<16 x i8> %A, i8 65)
+	%Y = bitcast <16 x i8> %1 to <16 x i8>
+	ret <16 x i8> %Y
+}
+
+define <4 x i32> @clgttest(<4 x i32> %A, <4 x i32> %B) {
+	call <4 x i32> @llvm.spu.si.clgt(<4 x i32> %A, <4 x i32> %B)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <8 x i16> @clgthtest(<8 x i16> %A, <8 x i16> %B) {
+	call <8 x i16> @llvm.spu.si.clgth(<8 x i16> %A, <8 x i16> %B)
+	%Y = bitcast <8 x i16> %1 to <8 x i16>
+	ret <8 x i16> %Y
+}
+
+define <16 x i8> @clgtbtest(<16 x i8> %A, <16 x i8> %B) {
+	call <16 x i8> @llvm.spu.si.clgtb(<16 x i8> %A, <16 x i8> %B)
+	%Y = bitcast <16 x i8> %1 to <16 x i8>
+	ret <16 x i8> %Y
+}
+
+define <4 x i32> @clgtitest(<4 x i32> %A) {
+	call <4 x i32> @llvm.spu.si.clgti(<4 x i32> %A, i16 65)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <8 x i16> @clgthitest(<8 x i16> %A) {
+	call <8 x i16> @llvm.spu.si.clgthi(<8 x i16> %A, i16 65)
+	%Y = bitcast <8 x i16> %1 to <8 x i16>
+	ret <8 x i16> %Y
+}
+
+define <16 x i8> @clgtbitest(<16 x i8> %A) {
+	call <16 x i8> @llvm.spu.si.clgtbi(<16 x i8> %A, i8 65)
+	%Y = bitcast <16 x i8> %1 to <16 x i8>
+	ret <16 x i8> %Y
+}
diff --git a/test/CodeGen/CellSPU/intrinsics_float.ll b/test/CodeGen/CellSPU/intrinsics_float.ll
new file mode 100644
index 0000000..f5a192a
--- /dev/null
+++ b/test/CodeGen/CellSPU/intrinsics_float.ll
@@ -0,0 +1,94 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep fa      %t1.s | count 5 &&
+; RUN: grep fs      %t1.s | count 5 &&
+; RUN: grep fm      %t1.s | count 15 &&
+; RUN: grep fceq    %t1.s | count 5 &&
+; RUN: grep fcmeq   %t1.s | count 5 &&
+; RUN: grep fcgt    %t1.s | count 5 &&
+; RUN: grep fcmgt   %t1.s | count 5 &&
+; RUN: grep fma     %t1.s | count 5 &&
+; RUN: grep fnms    %t1.s | count 5 &&
+; RUN: grep fms     %t1.s | count 5
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
+declare <4 x i32> @llvm.spu.si.shli(<4 x i32>, i8)
+
+declare <4 x float> @llvm.spu.si.fa(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.spu.si.fs(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.spu.si.fm(<4 x float>, <4 x float>)
+
+declare <4 x float> @llvm.spu.si.fceq(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.spu.si.fcmeq(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.spu.si.fcgt(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.spu.si.fcmgt(<4 x float>, <4 x float>)
+
+declare <4 x float> @llvm.spu.si.fma(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.spu.si.fnms(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.spu.si.fms(<4 x float>, <4 x float>, <4 x float>)
+
+define <4 x i32> @test(<4 x i32> %A) {
+	call <4 x i32> @llvm.spu.si.shli(<4 x i32> %A, i8 3)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <4 x float> @fatest(<4 x float> %A, <4 x float> %B) {
+	call <4 x float> @llvm.spu.si.fa(<4 x float> %A, <4 x float> %B)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fstest(<4 x float> %A, <4 x float> %B) {
+        call <4 x float> @llvm.spu.si.fs(<4 x float> %A, <4 x float> %B)
+        %Y = bitcast <4 x float> %1 to <4 x float>
+        ret <4 x float> %Y
+}
+
+define <4 x float> @fmtest(<4 x float> %A, <4 x float> %B) {
+	call <4 x float> @llvm.spu.si.fm(<4 x float> %A, <4 x float> %B)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fceqtest(<4 x float> %A, <4 x float> %B) {
+	call <4 x float> @llvm.spu.si.fceq(<4 x float> %A, <4 x float> %B)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fcmeqtest(<4 x float> %A, <4 x float> %B) {
+	call <4 x float> @llvm.spu.si.fcmeq(<4 x float> %A, <4 x float> %B)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fcgttest(<4 x float> %A, <4 x float> %B) {
+	call <4 x float> @llvm.spu.si.fcgt(<4 x float> %A, <4 x float> %B)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fcmgttest(<4 x float> %A, <4 x float> %B) {
+	call <4 x float> @llvm.spu.si.fcmgt(<4 x float> %A, <4 x float> %B)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fmatest(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+	call <4 x float> @llvm.spu.si.fma(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fnmstest(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+	call <4 x float> @llvm.spu.si.fnms(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+
+define <4 x float> @fmstest(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+	call <4 x float> @llvm.spu.si.fms(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+	%Y = bitcast <4 x float> %1 to <4 x float>
+	ret <4 x float> %Y
+}
+\ No newline at end of file
diff --git a/test/CodeGen/CellSPU/intrinsics_logical.ll b/test/CodeGen/CellSPU/intrinsics_logical.ll
new file mode 100644
index 0000000..e43558c
--- /dev/null
+++ b/test/CodeGen/CellSPU/intrinsics_logical.ll
@@ -0,0 +1,49 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep and       %t1.s | count 20 &&
+; RUN: grep andc      %t1.s | count 5
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
+declare <4 x i32> @llvm.spu.si.and(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.spu.si.andc(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.spu.si.andi(<4 x i32>, i16)
+declare <8 x i16> @llvm.spu.si.andhi(<8 x i16>, i16)
+declare <16 x i8> @llvm.spu.si.andbi(<16 x i8>, i8)
+
+declare <4 x i32> @llvm.spu.si.or(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.spu.si.orc(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.spu.si.ori(<4 x i32>, i16)
+declare <8 x i16> @llvm.spu.si.orhi(<8 x i16>, i16)
+declare <16 x i8> @llvm.spu.si.orbi(<16 x i8>, i8)
+
+declare <4 x i32> @llvm.spu.si.xor(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.spu.si.xori(<4 x i32>, i16)
+declare <8 x i16> @llvm.spu.si.xorhi(<8 x i16>, i16)
+declare <16 x i8> @llvm.spu.si.xorbi(<16 x i8>, i8)
+
+declare <4 x i32> @llvm.spu.si.nand(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.spu.si.nor(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @andtest(<4 x i32> %A, <4 x i32> %B) {
+	call <4 x i32> @llvm.spu.si.and(<4 x i32> %A, <4 x i32> %B)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <4 x i32> @andctest(<4 x i32> %A, <4 x i32> %B) {
+        call <4 x i32> @llvm.spu.si.andc(<4 x i32> %A, <4 x i32> %B)
+        %Y = bitcast <4 x i32> %1 to <4 x i32>
+        ret <4 x i32> %Y
+}
+
+define <4 x i32> @anditest(<4 x i32> %A) {
+	call <4 x i32> @llvm.spu.si.andi(<4 x i32> %A, i16 65)
+	%Y = bitcast <4 x i32> %1 to <4 x i32>
+	ret <4 x i32> %Y
+}
+
+define <8 x i16> @andhitest(<8 x i16> %A) {
+	call <8 x i16> @llvm.spu.si.andhi(<8 x i16> %A, i16 65)
+	%Y = bitcast <8 x i16> %1 to <8 x i16>
+	ret <8 x i16> %Y
+}
diff --git a/test/CodeGen/CellSPU/nand.ll b/test/CodeGen/CellSPU/nand.ll
index 091f4b2..841a3ec 100644
--- a/test/CodeGen/CellSPU/nand.ll
+++ b/test/CodeGen/CellSPU/nand.ll
@@ -3,6 +3,8 @@
 ; RUN: grep and    %t1.s | count 94
 ; RUN: grep xsbh   %t1.s | count 2
 ; RUN: grep xshw   %t1.s | count 4
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define <4 x i32> @nand_v4i32_1(<4 x i32> %arg1, <4 x i32> %arg2) {
         %A = and <4 x i32> %arg2, %arg1      ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/CellSPU/or_ops.ll b/test/CodeGen/CellSPU/or_ops.ll
index 6c46b41..91e3e21 100644
--- a/test/CodeGen/CellSPU/or_ops.ll
+++ b/test/CodeGen/CellSPU/or_ops.ll
@@ -4,6 +4,8 @@
 ; RUN: grep ori    %t1.s | count 30
 ; RUN: grep orhi   %t1.s | count 30
 ; RUN: grep orbi   %t1.s | count 15
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 ; OR instruction generation:
 define <4 x i32> @or_v4i32_1(<4 x i32> %arg1, <4 x i32> %arg2) {
diff --git a/test/CodeGen/CellSPU/rotate_ops.ll b/test/CodeGen/CellSPU/rotate_ops.ll
index 6983c18..0386838 100644
--- a/test/CodeGen/CellSPU/rotate_ops.ll
+++ b/test/CodeGen/CellSPU/rotate_ops.ll
@@ -8,6 +8,8 @@
 ; RUN grep rothi.*,.3    %t1.s | count 1
 ; RUN: grep andhi        %t1.s | count 4
 ; RUN: grep shlhi        %t1.s | count 4
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 ; Vector rotates are not currently supported in gcc or llvm assembly. These are
 ; not tested.
diff --git a/test/CodeGen/CellSPU/select_bits.ll b/test/CodeGen/CellSPU/select_bits.ll
index 3cbb7a0..b1600bf 100644
--- a/test/CodeGen/CellSPU/select_bits.ll
+++ b/test/CodeGen/CellSPU/select_bits.ll
@@ -3,6 +3,8 @@
 ; RUN: grep and    %t1.s | count 2
 ; RUN: grep xsbh   %t1.s | count 1
 ; RUN: grep xshw   %t1.s | count 2
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define <16 x i8> @selb_v16i8_1(<16 x i8> %arg1, <16 x i8> %arg2, <16 x i8> %arg3) {
 	%A = xor <16 x i8> %arg3, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
diff --git a/test/CodeGen/CellSPU/shift_ops.ll b/test/CodeGen/CellSPU/shift_ops.ll
index 162ca16..4256d91 100644
--- a/test/CodeGen/CellSPU/shift_ops.ll
+++ b/test/CodeGen/CellSPU/shift_ops.ll
@@ -5,6 +5,8 @@
 ; RUN: grep shli   %t1.s | count 51
 ; RUN: grep xshw   %t1.s | count 5
 ; RUN: grep and    %t1.s | count 5
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 ; Vector shifts are not currently supported in gcc or llvm assembly. These are
 ; not tested.
diff --git a/test/CodeGen/CellSPU/sp_farith.ll b/test/CodeGen/CellSPU/sp_farith.ll
index c7e7199..473e9a3 100644
--- a/test/CodeGen/CellSPU/sp_farith.ll
+++ b/test/CodeGen/CellSPU/sp_farith.ll
@@ -8,6 +8,8 @@
 ;
 ; This file includes standard floating point arithmetic instructions
 ; NOTE fdiv is tested separately since it is a compound operation
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
 
 define float @fp_add(float %arg1, float %arg2) {
 	%A = add float %arg1, %arg2 	; <float> [#uses=1]
diff --git a/test/CodeGen/CellSPU/struct_1.ll b/test/CodeGen/CellSPU/struct_1.ll
new file mode 100644
index 0000000..1159b55
--- /dev/null
+++ b/test/CodeGen/CellSPU/struct_1.ll
@@ -0,0 +1,107 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep lqa     %t1.s | count 10 &&
+; RUN: grep lqd     %t1.s | count 2 &&
+; RUN: grep rotqbyi %t1.s | count 5 &&
+; RUN: grep xshw    %t1.s | count 1 &&
+; RUN: grep andi    %t1.s | count 4 &&
+; RUN: grep cbd     %t1.s | count 3 &&
+; RUN: grep chd     %t1.s | count 1 &&
+; RUN: grep cwd     %t1.s | count 1 &&
+; RUN: grep shufb   %t1.s | count 5 &&
+; RUN: grep stqa    %t1.s | count 5
+; ModuleID = 'struct_1.bc'
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
+; struct hackstate {
+;   unsigned char c1;   // offset 0 (rotate left by 13 bytes to byte 3)
+;   unsigned char c2;   // offset 1 (rotate left by 14 bytes to byte 3)
+;   unsigned char c3;   // offset 2 (rotate left by 15 bytes to byte 3)
+;   int           i1;   // offset 4 (rotate left by 4 bytes to byte 0)
+;   short         s1;   // offset 8 (rotate left by 6 bytes to byte 2)
+;   int           i2;   // offset 12 [ignored]
+;   unsigned char c4;   // offset 16 [ignored]
+;   unsigned char c5;   // offset 17 [ignored]
+;   unsigned char c6;   // offset 18 [ignored]
+;   unsigned char c7;   // offset 19 (no rotate, in preferred slot)
+;   int           i3;   // offset 20 [ignored]
+;   int           i4;   // offset 24 [ignored]
+;   int           i5;   // offset 28 [ignored]
+;   int           i6;   // offset 32 (no rotate, in preferred slot)
+; }
+%struct.hackstate = type { i8, i8, i8, i32, i16, i32, i8, i8, i8, i8, i32, i32, i32, i32 }
+
+; struct hackstate state = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+@state = global %struct.hackstate zeroinitializer, align 16
+
+define i8 @get_hackstate_c1() zeroext  {
+entry:
+        %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 0), align 16
+        ret i8 %tmp2
+}
+
+define i8 @get_hackstate_c2() zeroext  {
+entry:
+        %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 1), align 16
+        ret i8 %tmp2
+}
+
+define i8 @get_hackstate_c3() zeroext  {
+entry:
+        %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 2), align 16
+        ret i8 %tmp2
+}
+
+define i32 @get_hackstate_i1() {
+entry:
+        %tmp2 = load i32* getelementptr (%struct.hackstate* @state, i32 0, i32 3), align 16
+        ret i32 %tmp2
+}
+
+define i16 @get_hackstate_s1() signext  {
+entry:
+        %tmp2 = load i16* getelementptr (%struct.hackstate* @state, i32 0, i32 4), align 16
+        ret i16 %tmp2
+}
+
+define i8 @get_hackstate_c7() zeroext  {
+entry:
+        %tmp2 = load i8* getelementptr (%struct.hackstate* @state, i32 0, i32 9), align 16
+        ret i8 %tmp2
+}
+
+define i32 @get_hackstate_i6() zeroext  {
+entry:
+        %tmp2 = load i32* getelementptr (%struct.hackstate* @state, i32 0, i32 13), align 16
+        ret i32 %tmp2
+}
+
+define void @set_hackstate_c1(i8 zeroext  %c) {
+entry:
+        store i8 %c, i8* getelementptr (%struct.hackstate* @state, i32 0, i32 0), align 16
+        ret void
+}
+
+define void @set_hackstate_c2(i8 zeroext  %c) {
+entry:
+        store i8 %c, i8* getelementptr (%struct.hackstate* @state, i32 0, i32 1), align 16
+        ret void
+}
+
+define void @set_hackstate_c3(i8 zeroext  %c) {
+entry:
+        store i8 %c, i8* getelementptr (%struct.hackstate* @state, i32 0, i32 2), align 16
+        ret void
+}
+
+define void @set_hackstate_i1(i32 %i) {
+entry:
+        store i32 %i, i32* getelementptr (%struct.hackstate* @state, i32 0, i32 3), align 16
+        ret void
+}
+
+define void @set_hackstate_s1(i16 signext  %s) {
+entry:
+        store i16 %s, i16* getelementptr (%struct.hackstate* @state, i32 0, i32 4), align 16
+        ret void
+}