Diffstat (limited to 'test/Transforms')
61 files changed, 1910 insertions, 543 deletions
diff --git a/test/Transforms/BBVectorize/no-ldstr-conn.ll b/test/Transforms/BBVectorize/no-ldstr-conn.ll
new file mode 100644
index 0000000..ada2a71
--- /dev/null
+++ b/test/Transforms/BBVectorize/no-ldstr-conn.ll
@@ -0,0 +1,23 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=2 -instcombine -gvn -S | FileCheck %s
+
+; Make sure that things (specifically getelementptr) are not connected to loads
+; and stores via the address operand (which would be bad because the address
+; is really a scalar even after vectorization)
+define i64 @test2(i64 %a) nounwind uwtable readonly {
+entry:
+  %a1 = inttoptr i64 %a to i64*
+  %a2 = getelementptr i64* %a1, i64 1
+  %a3 = getelementptr i64* %a1, i64 2
+  %v2 = load i64* %a2, align 8
+  %v3 = load i64* %a3, align 8
+  %v2a = add i64 %v2, 5
+  %v3a = add i64 %v3, 7
+  store i64 %v2a, i64* %a2, align 8
+  store i64 %v3a, i64* %a3, align 8
+  %r = add i64 %v2, %v3
+  ret i64 %r
+; CHECK: @test2
+; CHECK-NOT: getelementptr <2 x i64*>
+}
+
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
index b2ef27b..6844977 100644
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -3,6 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 declare double @llvm.fma.f64(double, double, double)
 declare double @llvm.cos.f64(double)
+declare double @llvm.powi.f64(double, i32)
 
 ; Basic depth-3 chain with fma
 define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
@@ -54,6 +55,49 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: ret double %R
 }
 
+; Basic depth-3 chain with powi
+define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
+
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+  %Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
+  %Y2 = call double @llvm.powi.f64(double %X2, i32 %P)
+  %Z1 = fadd double %Y1, %B1
+  %Z2 = fadd double %Y2, %B2
+  %R = fmul double %Z1, %Z2
+  ret double %R
+; CHECK: @test3
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = call <2 x double> @llvm.powi.v2f64(<2 x double> %X1, i32 %P)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with powi (different powers: should not vectorize)
+define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) {
+
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+  %P2 = add i32 %P, 1
+  %Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
+  %Y2 = call double @llvm.powi.f64(double %X2, i32 %P2)
+  %Z1 = fadd double %Y1, %B1
+  %Z2 = fadd double %Y2, %B2
+  %R = fmul double %Z1, %Z2
+  ret double %R
+; CHECK: @test4
+; CHECK-NOT: <2 x double>
+; CHECK: ret double %R
+}
+
 ; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
 ; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly
+; CHECK: declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32) nounwind readonly
diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
new file mode 100644
index 0000000..f992d41
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
@@ -0,0 +1,81 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
+
+; Simple 3-pair chain also with loads and stores (using ptrs and gep)
+define double @test1(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load i64* %a, align 8
+  %i1 = load i64* %b, align 8
+  %mul = mul i64 %i0, %i1
+  %arrayidx3 = getelementptr inbounds i64* %a, i64 1
+  %i3 = load i64* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds i64* %b, i64 1
+  %i4 = load i64* %arrayidx4, align 8
+  %mul5 = mul i64 %i3, %i4
+  %ptr = inttoptr i64 %mul to double*
+  %ptr5 = inttoptr i64 %mul5 to double*
+  %aptr = getelementptr inbounds double* %ptr, i64 2
+  %aptr5 = getelementptr inbounds double* %ptr5, i64 3
+  %av = load double* %aptr, align 16
+  %av5 = load double* %aptr5, align 16
+  %r = fmul double %av, %av5
+  store i64 %mul, i64* %c, align 8
+  %arrayidx5 = getelementptr inbounds i64* %c, i64 1
+  store i64 %mul5, i64* %arrayidx5, align 8
+  ret double %r
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast i64* %a to <2 x i64>*
+; CHECK: %i1.v.i0 = bitcast i64* %b to <2 x i64>*
+; CHECK: %i0 = load <2 x i64>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x i64>* %i1.v.i0, align 8
+; CHECK: %mul = mul <2 x i64> %i0, %i1
+; CHECK: %ptr = inttoptr <2 x i64> %mul to <2 x double*>
+; CHECK: %aptr = getelementptr inbounds <2 x double*> %ptr, <2 x i64> <i64 2, i64 3>
+; CHECK: %aptr.v.r1 = extractelement <2 x double*> %aptr, i32 0
+; CHECK: %aptr.v.r2 = extractelement <2 x double*> %aptr, i32 1
+; CHECK: %av = load double* %aptr.v.r1, align 16
+; CHECK: %av5 = load double* %aptr.v.r2, align 16
+; CHECK: %r = fmul double %av, %av5
+; CHECK: %0 = bitcast i64* %c to <2 x i64>*
+; CHECK: store <2 x i64> %mul, <2 x i64>* %0, align 8
+; CHECK: ret double %r
+; CHECK-AO: @test1
+; CHECK-AO-NOT: load <2 x
+}
+
+; Simple 3-pair chain with loads and stores (using ptrs and gep)
+define void @test2(i64** %a, i64** %b, i64** %c) nounwind uwtable readonly {
+entry:
+  %i0 = load i64** %a, align 8
+  %i1 = load i64** %b, align 8
+  %arrayidx3 = getelementptr inbounds i64** %a, i64 1
+  %i3 = load i64** %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds i64** %b, i64 1
+  %i4 = load i64** %arrayidx4, align 8
+  %o1 = load i64* %i1, align 8
+  %o4 = load i64* %i4, align 8
+  %ptr0 = getelementptr inbounds i64* %i0, i64 %o1
+  %ptr3 = getelementptr inbounds i64* %i3, i64 %o4
+  store i64* %ptr0, i64** %c, align 8
+  %arrayidx5 = getelementptr inbounds i64** %c, i64 1
+  store i64* %ptr3, i64** %arrayidx5, align 8
+  ret void
+; CHECK: @test2
+; CHECK: %i0.v.i0 = bitcast i64** %a to <2 x i64*>*
+; CHECK: %i1 = load i64** %b, align 8
+; CHECK: %i0 = load <2 x i64*>* %i0.v.i0, align 8
+; CHECK: %arrayidx4 = getelementptr inbounds i64** %b, i64 1
+; CHECK: %i4 = load i64** %arrayidx4, align 8
+; CHECK: %o1 = load i64* %i1, align 8
+; CHECK: %o4 = load i64* %i4, align 8
+; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0
+; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1
+; CHECK: %ptr0 = getelementptr inbounds <2 x i64*> %i0, <2 x i64> %ptr0.v.i1.2
+; CHECK: %0 = bitcast i64** %c to <2 x i64*>*
+; CHECK: store <2 x i64*> %ptr0, <2 x i64*>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test2
+; CHECK-AO-NOT: <2 x
+}
+
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll
new file mode 100644
index 0000000..4daa571
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple-sel.ll
@@ -0,0 +1,30 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain with select
+define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
+; CHECK: @test1
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+  %Z1 = select i1 %C1, double %Y1, double %B1
+  %Z2 = select i1 %C2, double %Y2, double %B2
+; CHECK: %Z1.v.i0.1 = insertelement <2 x i1> undef, i1 %C1, i32 0
+; CHECK: %Z1.v.i0.2 = insertelement <2 x i1> %Z1.v.i0.1, i1 %C2, i32 1
+; CHECK: %Z1 = select <2 x i1> %Z1.v.i0.2, <2 x double> %Y1, <2 x double> %X1.v.i1.2
+  %R = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+  ret double %R
+; CHECK: ret double %R
+}
+
diff --git a/test/Transforms/GVN/pre-compare.ll b/test/Transforms/GVN/pre-compare.ll
new file mode 100644
index 0000000..18d0c2e
--- /dev/null
+++ b/test/Transforms/GVN/pre-compare.ll
@@ -0,0 +1,68 @@
+; RUN: opt -gvn -S < %s | FileCheck %s
+
+; C source:
+;
+;   void f(int x) {
+;     if (x != 1)
+;       puts (x == 2 ? "a" : "b");
+;     for (;;) {
+;       puts("step 1");
+;       if (x == 2)
+;         continue;
+;       printf("step 2: %d\n", x);
+;     }
+;   }
+;
+; If we PRE %cmp3, CodeGenPrepare won't be able to sink the compare down to its
+; uses, and we are forced to keep both %x and %cmp3 in registers in the loop.
+;
+; It is just as cheap to recompute the icmp against %x as it is to compare a
+; GPR against 0. On x86-64, the br i1 %cmp3 becomes:
+;
+;   testb %r12b, %r12b
+;   jne LBB0_3
+;
+; The sunk icmp is:
+;
+;   cmpl $2, %ebx
+;   je LBB0_3
+;
+; This is just as good, and it doesn't require a separate register.
+;
+; CHECK-NOT: phi i1
+
+@.str = private unnamed_addr constant [2 x i8] c"a\00", align 1
+@.str1 = private unnamed_addr constant [2 x i8] c"b\00", align 1
+@.str2 = private unnamed_addr constant [7 x i8] c"step 1\00", align 1
+@.str3 = private unnamed_addr constant [12 x i8] c"step 2: %d\0A\00", align 1
+
+define void @f(i32 %x) noreturn nounwind uwtable ssp {
+entry:
+  %cmp = icmp eq i32 %x, 1
+  br i1 %cmp, label %for.cond.preheader, label %if.then
+
+if.then: ; preds = %entry
+  %cmp1 = icmp eq i32 %x, 2
+  %cond = select i1 %cmp1, i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)
+  %call = tail call i32 @puts(i8* %cond) nounwind
+  br label %for.cond.preheader
+
+for.cond.preheader: ; preds = %entry, %if.then
+  %cmp3 = icmp eq i32 %x, 2
+  br label %for.cond
+
+for.cond: ; preds = %for.cond.backedge, %for.cond.preheader
+  %call2 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @.str2, i64 0, i64 0)) nounwind
+  br i1 %cmp3, label %for.cond.backedge, label %if.end5
+
+if.end5: ; preds = %for.cond
+  %call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str3, i64 0, i64 0), i32 %x) nounwind
+  br label %for.cond.backedge
+
+for.cond.backedge: ; preds = %if.end5, %for.cond
+  br label %for.cond
+}
+
+declare i32 @puts(i8* nocapture) nounwind
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/Transforms/GlobalOpt/zeroinitializer-gep-load.ll b/test/Transforms/GlobalOpt/zeroinitializer-gep-load.ll
new file mode 100644
index 0000000..d613601
--- /dev/null
+++ b/test/Transforms/GlobalOpt/zeroinitializer-gep-load.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s -S -globalopt | FileCheck %s
+
+@zero = internal global [10 x i32] zeroinitializer
+
+define i32 @test1(i64 %idx) nounwind {
+  %arrayidx = getelementptr inbounds [10 x i32]* @zero, i64 0, i64 %idx
+  %l = load i32* %arrayidx
+  ret i32 %l
+; CHECK: @test1
+; CHECK: ret i32 0
+}
diff --git a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
index 4ad63aa..af9f1b3 100644
--- a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
+++ b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
+; RUN: opt < %s -indvars -S | FileCheck %s
 ; Test WidenIV::GetExtendedOperandRecurrence.
 ; add219 should be extended to i64 because it is nsw, even though its
 ; sext cannot be hoisted outside the loop.
diff --git a/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll b/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
index c7809b8..c0c508f 100644
--- a/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
+++ b/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -indvars -S -enable-iv-rewrite=false "-default-data-layout=e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" | FileCheck %s
-; RUN: opt < %s -indvars -S -enable-iv-rewrite=true "-default-data-layout=e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" | FileCheck %s
-; RUN: opt < %s -indvars -S -enable-iv-rewrite=false "-default-data-layout=e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" | FileCheck %s
-; RUN: opt < %s -indvars -S -enable-iv-rewrite=true "-default-data-layout=e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" | FileCheck %s
+; RUN: opt < %s -indvars -S "-default-data-layout=e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" | FileCheck %s
+; RUN: opt < %s -indvars -S "-default-data-layout=e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" | FileCheck %s
 ;
 ; PR11279: Assertion !IVLimit->getType()->isPointerTy()
 ;
diff --git a/test/Transforms/IndVarSimplify/ada-loops.ll b/test/Transforms/IndVarSimplify/ada-loops.ll
index 154de6f..c093298 100644
--- a/test/Transforms/IndVarSimplify/ada-loops.ll
+++ b/test/Transforms/IndVarSimplify/ada-loops.ll
@@ -1,5 +1,4 @@
 ; RUN: opt < %s -indvars -S | FileCheck %s
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
 ;
 ; PR1301
diff --git a/test/Transforms/IndVarSimplify/complex-scev.ll b/test/Transforms/IndVarSimplify/complex-scev.ll
deleted file mode 100644
index 395377e..0000000
--- a/test/Transforms/IndVarSimplify/complex-scev.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; The i induction variable looks like a wrap-around, but it really is just
-; a simple affine IV.  Make sure that indvars eliminates it.
-
-; RUN: opt < %s -indvars -S -enable-iv-rewrite | FileCheck %s
-; CHECK: phi
-; CHECK-NOT: phi
-
-define void @foo() {
-entry:
-  br label %bb6
-
-bb6: ; preds = %cond_true, %entry
-  %j.0 = phi i32 [ 1, %entry ], [ %tmp5, %cond_true ] ; <i32> [#uses=3]
-  %i.0 = phi i32 [ 0, %entry ], [ %j.0, %cond_true ] ; <i32> [#uses=1]
-  %tmp7 = call i32 (...)* @foo2( ) ; <i32> [#uses=1]
-  %tmp = icmp ne i32 %tmp7, 0 ; <i1> [#uses=1]
-  br i1 %tmp, label %cond_true, label %return
-
-cond_true: ; preds = %bb6
-  %tmp2 = call i32 (...)* @bar( i32 %i.0, i32 %j.0 ) ; <i32> [#uses=0]
-  %tmp5 = add i32 %j.0, 1 ; <i32> [#uses=1]
-  br label %bb6
-
-return: ; preds = %bb6
-  ret void
-}
-
-declare i32 @bar(...)
-
-declare i32 @foo2(...)
-
diff --git a/test/Transforms/IndVarSimplify/elim-extend.ll b/test/Transforms/IndVarSimplify/elim-extend.ll
index 43c162f..ad5679f 100644
--- a/test/Transforms/IndVarSimplify/elim-extend.ll
+++ b/test/Transforms/IndVarSimplify/elim-extend.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
+; RUN: opt < %s -indvars -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Transforms/IndVarSimplify/gep-with-mul-base.ll b/test/Transforms/IndVarSimplify/gep-with-mul-base.ll
deleted file mode 100644
index 7e1e2a3..0000000
--- a/test/Transforms/IndVarSimplify/gep-with-mul-base.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: opt < %s -indvars -S -enable-iv-rewrite | FileCheck %s
-; CHECK: define void @foo
-; CHECK: mul
-; CHECK: mul
-; CHECK: mul
-; CHECK: add
-; CHECK: sub
-; CHECK: define void @bar
-; CHECK: mul
-; CHECK: mul
-; CHECK: mul
-; CHECK: add
-; CHECK: sub
-
-define void @foo(i64 %n, i64 %m, i64 %o, double* nocapture %p) nounwind {
-entry:
-  %tmp = icmp sgt i64 %n, 0 ; <i1> [#uses=1]
-  br i1 %tmp, label %bb.nph, label %return
-
-bb.nph: ; preds = %entry
-  %tmp1 = mul i64 %n, 37 ; <i64> [#uses=1]
-  %tmp2 = mul i64 %tmp1, %m ; <i64> [#uses=1]
-  %tmp3 = mul i64 %tmp2, %o ; <i64> [#uses=1]
-  br label %bb
-
-bb: ; preds = %bb, %bb.nph
-  %i.01 = phi i64 [ %tmp3, %bb.nph ], [ %tmp13, %bb ] ; <i64> [#uses=3]
-  %tmp9 = getelementptr double* %p, i64 %i.01 ; <double*> [#uses=1]
-  %tmp10 = load double* %tmp9, align 8 ; <double> [#uses=1]
-  %tmp11 = fdiv double %tmp10, 2.100000e+00 ; <double> [#uses=1]
-  store double %tmp11, double* %tmp9, align 8
-  %tmp13 = add i64 %i.01, 1 ; <i64> [#uses=2]
-  %tmp14 = icmp slt i64 %tmp13, %n ; <i1> [#uses=1]
-  br i1 %tmp14, label %bb, label %return.loopexit
-
-return.loopexit: ; preds = %bb
-  br label %return
-
-return: ; preds = %return.loopexit, %entry
-  ret void
-}
-define void @bar(i64 %n, i64 %m, i64 %o, i64 %q, double* nocapture %p) nounwind {
-entry:
-  %tmp = icmp sgt i64 %n, 0 ; <i1> [#uses=1]
-  br i1 %tmp, label %bb.nph, label %return
-
-bb.nph: ; preds = %entry
-  %tmp1 = mul i64 %n, %q ; <i64> [#uses=1]
-  %tmp2 = mul i64 %tmp1, %m ; <i64> [#uses=1]
-  %tmp3 = mul i64 %tmp2, %o ; <i64> [#uses=1]
-  br label %bb
-
-bb: ; preds = %bb, %bb.nph
-  %i.01 = phi i64 [ %tmp3, %bb.nph ], [ %tmp13, %bb ] ; <i64> [#uses=3]
-  %tmp9 = getelementptr double* %p, i64 %i.01 ; <double*> [#uses=1]
-  %tmp10 = load double* %tmp9, align 8 ; <double> [#uses=1]
-  %tmp11 = fdiv double %tmp10, 2.100000e+00 ; <double> [#uses=1]
-  store double %tmp11, double* %tmp9, align 8
-  %tmp13 = add i64 %i.01, 1 ; <i64> [#uses=2]
-  %tmp14 = icmp slt i64 %tmp13, %n ; <i1> [#uses=1]
-  br i1 %tmp14, label %bb, label %return.loopexit
-
-return.loopexit: ; preds = %bb
-  br label %return
-
-return: ; preds = %return.loopexit, %entry
-  ret void
-}
diff --git a/test/Transforms/IndVarSimplify/iv-fold.ll b/test/Transforms/IndVarSimplify/iv-fold.ll
index 2e19118..e0b05cd 100644
--- a/test/Transforms/IndVarSimplify/iv-fold.ll
+++ b/test/Transforms/IndVarSimplify/iv-fold.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
+; RUN: opt < %s -indvars -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n32:64"
diff --git a/test/Transforms/IndVarSimplify/iv-zext.ll b/test/Transforms/IndVarSimplify/iv-zext.ll
index 646e6c0..2e0f70c 100644
--- a/test/Transforms/IndVarSimplify/iv-zext.ll
+++ b/test/Transforms/IndVarSimplify/iv-zext.ll
@@ -1,5 +1,4 @@
 ; RUN: opt < %s -indvars -S | FileCheck %s
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
 ; CHECK-NOT: and
 ; CHECK-NOT: zext
diff --git a/test/Transforms/IndVarSimplify/lftr-reuse.ll b/test/Transforms/IndVarSimplify/lftr-reuse.ll
index 490eee9..9abfe13 100644
--- a/test/Transforms/IndVarSimplify/lftr-reuse.ll
+++ b/test/Transforms/IndVarSimplify/lftr-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
+; RUN: opt < %s -indvars -S | FileCheck %s
 ;
 ; Make sure that indvars can perform LFTR without a canonical IV.
diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
index 23fdc87..bfdd000 100644
--- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
+++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
+; RUN: opt < %s -indvars -S | FileCheck %s
 ;
 ; Make sure that indvars isn't inserting canonical IVs.
 ; This is kinda hard to do until linear function test replacement is removed.
diff --git a/test/Transforms/IndVarSimplify/preserve-gep-nested.ll b/test/Transforms/IndVarSimplify/preserve-gep-nested.ll
deleted file mode 100644
index cdcaaa0..0000000
--- a/test/Transforms/IndVarSimplify/preserve-gep-nested.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; RUN: opt < %s -indvars -S -enable-iv-rewrite | FileCheck %s
-; No explicit integer multiplications!
-; No i8* arithmetic or pointer casting anywhere!
-; CHECK-NOT: = {{= mul|i8\*|bitcast|inttoptr|ptrtoint}}
-; Exactly one getelementptr for each load+store.
-; Each getelementptr using %struct.Q* %s as a base and not i8*.
-; CHECK: getelementptr %struct.Q* %s,
-; CHECK: getelementptr %struct.Q* %s,
-; CHECK: getelementptr %struct.Q* %s,
-; CHECK: getelementptr %struct.Q* %s,
-; CHECK: getelementptr %struct.Q* %s,
-; CHECK: getelementptr %struct.Q* %s,
-; CHECK-NOT: = {{= mul|i8\*|bitcast|inttoptr|ptrtoint}}
-
-; FIXME: This test should pass with or without TargetData. Until opt
-; supports running tests without targetdata, just hardware this in.
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n32:64"
-
-%struct.Q = type { [10 x %struct.N] }
-%struct.N = type { %struct.S }
-%struct.S = type { [100 x double], [100 x double] }
-
-define void @foo(%struct.Q* %s, i64 %n) nounwind {
-entry:
-  br label %bb1
-
-bb1:
-  %i = phi i64 [ 2, %entry ], [ %i.next, %bb ]
-  %j = phi i64 [ 0, %entry ], [ %j.next, %bb ]
-  %t5 = icmp slt i64 %i, %n
-  br i1 %t5, label %bb, label %return
-
-bb:
-  %t0 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 0, i32 0, i32 0, i64 %i
-  %t1 = load double* %t0, align 8
-  %t2 = fmul double %t1, 3.200000e+00
-  %t3 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 0, i32 0, i32 0, i64 %i
-  store double %t2, double* %t3, align 8
-
-  %s0 = getelementptr inbounds %struct.Q* %s, i64 13, i32 0, i64 7, i32 0, i32 1, i64 %i
-  %s1 = load double* %s0, align 8
-  %s2 = fmul double %s1, 3.200000e+00
-  %s3 = getelementptr inbounds %struct.Q* %s, i64 13, i32 0, i64 7, i32 0, i32 1, i64 %i
-  store double %s2, double* %s3, align 8
-
-  %u0 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 7, i32 0, i32 1, i64 %j
-  %u1 = load double* %u0, align 8
-  %u2 = fmul double %u1, 3.200000e+00
-  %u3 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 7, i32 0, i32 1, i64 %j
-  store double %u2, double* %u3, align 8
-
-  %v0 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 0, i32 0, i32 1, i64 %i
-  %v1 = load double* %v0, align 8
-  %v2 = fmul double %v1, 3.200000e+00
-  %v3 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 0, i32 0, i32 1, i64 %i
-  store double %v2, double* %v3, align 8
-
-  %w0 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 0, i32 0, i32 0, i64 %j
-  %w1 = load double* %w0, align 8
-  %w2 = fmul double %w1, 3.200000e+00
-  %w3 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 0, i32 0, i32 0, i64 %j
-  store double %w2, double* %w3, align 8
-
-  %x0 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 3, i32 0, i32 0, i64 %i
-  %x1 = load double* %x0, align 8
-  %x2 = fmul double %x1, 3.200000e+00
-  %x3 = getelementptr inbounds %struct.Q* %s, i64 0, i32 0, i64 3, i32 0, i32 0, i64 %i
-  store double %x2, double* %x3, align 8
-
-  %i.next = add i64 %i, 1
-  %j.next = add i64 %j, 1
-  br label %bb1
-
-return:
-  ret void
-}
diff --git a/test/Transforms/IndVarSimplify/preserve-gep-remainder.ll b/test/Transforms/IndVarSimplify/preserve-gep-remainder.ll
deleted file mode 100644
index a62943d..0000000
--- a/test/Transforms/IndVarSimplify/preserve-gep-remainder.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: opt < %s -indvars -S -enable-iv-rewrite | FileCheck %s
-; CHECK: %p.2.ip.1 = getelementptr [3 x [3 x double]]* %p, i64 2, i64 %0, i64 1
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n32:64"
-
-; Indvars shouldn't expand this to
-; %p.2.ip.1 = getelementptr [3 x [3 x double]]* %p, i64 0, i64 %tmp, i64 19
-; or something. That's valid, but more obscure.
-
-define void @foo([3 x [3 x double]]* noalias %p) nounwind {
-entry:
-  br label %loop
-
-loop:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
-  %ip = add i64 %i, 1
-  %p.2.ip.1 = getelementptr [3 x [3 x double]]* %p, i64 2, i64 %ip, i64 1
-  store volatile double 0.0, double* %p.2.ip.1
-  %i.next = add i64 %i, 1
-  br label %loop
-}
diff --git a/test/Transforms/IndVarSimplify/preserve-gep.ll b/test/Transforms/IndVarSimplify/preserve-gep.ll
deleted file mode 100644
index fec8a28..0000000
--- a/test/Transforms/IndVarSimplify/preserve-gep.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: opt < %s -indvars -S -enable-iv-rewrite | FileCheck %s
-; CHECK-NOT: {{ptrtoint|inttoptr}}
-; CHECK: getelementptr
-; CHECK-NOT: {{ptrtoint|inttoptr|getelementptr}}
-
-; Indvars shouldn't leave getelementptrs expanded out as
-; inttoptr+ptrtoint in its output in common cases.
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n32:64"
-target triple = "x86_64-unknown-linux-gnu"
-  %struct.Foo = type { i32, i32, [10 x i32], i32 }
-
-define void @me(%struct.Foo* nocapture %Bar) nounwind {
-entry:
-  br i1 false, label %return, label %bb.nph
-
-bb.nph: ; preds = %entry
-  br label %bb
-
-bb: ; preds = %bb1, %bb.nph
-  %i.01 = phi i64 [ %4, %bb1 ], [ 0, %bb.nph ] ; <i64> [#uses=3]
-  %0 = getelementptr %struct.Foo* %Bar, i64 %i.01, i32 2, i64 3 ; <i32*> [#uses=1]
-  %1 = load i32* %0, align 4 ; <i32> [#uses=1]
-  %2 = mul i32 %1, 113 ; <i32> [#uses=1]
-  %3 = getelementptr %struct.Foo* %Bar, i64 %i.01, i32 2, i64 3 ; <i32*> [#uses=1]
-  store i32 %2, i32* %3, align 4
-  %4 = add i64 %i.01, 1 ; <i64> [#uses=2]
-  br label %bb1
-
-bb1: ; preds = %bb
-  %phitmp = icmp sgt i64 %4, 19999 ; <i1> [#uses=1]
-  br i1 %phitmp, label %bb1.return_crit_edge, label %bb
-
-bb1.return_crit_edge: ; preds = %bb1
-  br label %return
-
-return: ; preds = %bb1.return_crit_edge, %entry
-  ret void
-}
diff --git a/test/Transforms/IndVarSimplify/preserve-signed-wrap.ll b/test/Transforms/IndVarSimplify/preserve-signed-wrap.ll
index 22e2092..f619e8d 100644
--- a/test/Transforms/IndVarSimplify/preserve-signed-wrap.ll
+++ b/test/Transforms/IndVarSimplify/preserve-signed-wrap.ll
@@ -1,5 +1,4 @@
 ; RUN: opt < %s -indvars -S | FileCheck %s
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -S | FileCheck %s
 
 ; Indvars should insert a 64-bit induction variable to eliminate the
 ; sext for the addressing, however it shouldn't eliminate the sext
diff --git a/test/Transforms/IndVarSimplify/variable-stride-ivs-0.ll b/test/Transforms/IndVarSimplify/variable-stride-ivs-0.ll
index fc906cd..fb9ef22 100644
--- a/test/Transforms/IndVarSimplify/variable-stride-ivs-0.ll
+++ b/test/Transforms/IndVarSimplify/variable-stride-ivs-0.ll
@@ -1,5 +1,4 @@
 ; RUN: opt < %s -indvars -instcombine -S | FileCheck %s
-; RUN: opt < %s -indvars -enable-iv-rewrite=false -instcombine -S | FileCheck %s
 ;
 ; Test that -indvars can reduce variable stride IVs. If it can reduce variable
 ; stride iv's, it will make %iv. and %m.0.0 isomorphic to each other without
diff --git a/test/Transforms/Inline/2008-09-02-AlwaysInline.ll b/test/Transforms/Inline/2008-09-02-AlwaysInline.ll
deleted file mode 100644
index 39095c4..0000000
--- a/test/Transforms/Inline/2008-09-02-AlwaysInline.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: opt < %s -inline-threshold=0 -inline -S | not grep call
-
-define i32 @fn2() alwaysinline {
-  ret i32 1
-}
-
-define i32 @fn3() {
-  %r = call i32 @fn2()
-  ret i32 %r
-}
diff --git a/test/Transforms/Inline/2008-10-30-AlwaysInline.ll b/test/Transforms/Inline/2008-10-30-AlwaysInline.ll
deleted file mode 100644
index 11e5012..0000000
--- a/test/Transforms/Inline/2008-10-30-AlwaysInline.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: opt < %s -always-inline -S | not grep call
-
-; Ensure that threshold doesn't disrupt always inline.
-; RUN: opt < %s -inline-threshold=-2000000001 -always-inline -S | not grep call
-
-
-define internal i32 @if0() alwaysinline {
-  ret i32 1
-}
-
-define i32 @f0() {
-  %r = call i32 @if0()
-  ret i32 %r
-}
diff --git a/test/Transforms/Inline/2008-11-04-AlwaysInline.ll b/test/Transforms/Inline/2008-11-04-AlwaysInline.ll
deleted file mode 100644
index bc9787b..0000000
--- a/test/Transforms/Inline/2008-11-04-AlwaysInline.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: opt < %s -always-inline -S | grep {@foo}
-; Ensure that foo is not removed by always inliner
-; PR 2945
-
-define internal i32 @foo() nounwind {
-  ret i32 0
-}
diff --git a/test/Transforms/Inline/alloca-bonus.ll b/test/Transforms/Inline/alloca-bonus.ll
index fb4062f..d04d54e 100644
--- a/test/Transforms/Inline/alloca-bonus.ll
+++ b/test/Transforms/Inline/alloca-bonus.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -inline < %s -S -o - -inline-threshold=8 | FileCheck %s
 
+target datalayout = "p:32:32"
+
 declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
 
 @glbl = external global i32
@@ -15,8 +17,8 @@ define void @outer1() {
 define void @inner1(i32 *%ptr) {
   %A = load i32* %ptr
   store i32 0, i32* %ptr
-  %C = getelementptr i32* %ptr, i32 0
-  %D = getelementptr i32* %ptr, i32 1
+  %C = getelementptr inbounds i32* %ptr, i32 0
+  %D = getelementptr inbounds i32* %ptr, i32 1
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
   call void @llvm.lifetime.start(i64 0, i8* %E)
@@ -35,8 +37,8 @@ define void @outer2() {
 define void @inner2(i32 *%ptr) {
   %A = load i32* %ptr
   store i32 0, i32* %ptr
-  %C = getelementptr i32* %ptr, i32 0
-  %D = getelementptr i32* %ptr, i32 %A
+  %C = getelementptr inbounds i32* %ptr, i32 0
+  %D = getelementptr inbounds i32* %ptr, i32 %A
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
   call void @llvm.lifetime.start(i64 0, i8* %E)
@@ -90,12 +92,12 @@ define void @outer4(i32 %A) {
   ret void
 }
 
-; %D poisons this call, scalar-repl can't handle that instruction. However, we
+; %B poisons this call, scalar-repl can't handle that instruction. However, we
 ; still want to detect that the icmp and branch *can* be handled.
 define void @inner4(i32 *%ptr, i32 %A) {
-  %B = getelementptr i32* %ptr, i32 %A
-  %E = icmp eq i32* %ptr, null
-  br i1 %E, label %bb.true, label %bb.false
+  %B = getelementptr inbounds i32* %ptr, i32 %A
+  %C = icmp eq i32* %ptr, null
+  br i1 %C, label %bb.true, label %bb.false
 bb.true:
   ; This block musn't be counted in the inline cost.
   %t1 = load i32* %ptr
@@ -122,3 +124,32 @@ bb.true:
 bb.false:
   ret void
 }
+
+define void @outer5() {
+; CHECK: @outer5
+; CHECK-NOT: call void @inner5
+  %ptr = alloca i32
+  call void @inner5(i1 false, i32* %ptr)
+  ret void
+}
+
+; %D poisons this call, scalar-repl can't handle that instruction. However, if
+; the flag is set appropriately, the poisoning instruction is inside of dead
+; code, and so shouldn't be counted.
+define void @inner5(i1 %flag, i32 *%ptr) {
+  %A = load i32* %ptr
+  store i32 0, i32* %ptr
+  %C = getelementptr inbounds i32* %ptr, i32 0
+  br i1 %flag, label %if.then, label %exit
+
+if.then:
+  %D = getelementptr inbounds i32* %ptr, i32 %A
+  %E = bitcast i32* %ptr to i8*
+  %F = select i1 false, i32* %ptr, i32* @glbl
+  call void @llvm.lifetime.start(i64 0, i8* %E)
+  ret void
+
+exit:
+  ret void
+}
+
diff --git a/test/Transforms/Inline/always-inline.ll b/test/Transforms/Inline/always-inline.ll
new file mode 100644
index 0000000..e0be41f
--- /dev/null
+++ b/test/Transforms/Inline/always-inline.ll
@@ -0,0 +1,125 @@
+; RUN: opt < %s -inline-threshold=0 -always-inline -S | FileCheck %s
+;
+; Ensure the threshold has no impact on these decisions.
+; RUN: opt < %s -inline-threshold=20000000 -always-inline -S | FileCheck %s
+; RUN: opt < %s -inline-threshold=-20000000 -always-inline -S | FileCheck %s
+
+define i32 @inner1() alwaysinline {
+  ret i32 1
+}
+define i32 @outer1() {
+; CHECK: @outer1
+; CHECK-NOT: call
+; CHECK: ret
+
+  %r = call i32 @inner1()
+  ret i32 %r
+}
+
+; The always inliner can't DCE internal functions. PR2945
+; CHECK: @pr2945
+define internal i32 @pr2945() nounwind {
+  ret i32 0
+}
+
+define internal void @inner2(i32 %N) alwaysinline {
+  %P = alloca i32, i32 %N
+  ret void
+}
+define void @outer2(i32 %N) {
+; The always inliner (unlike the normal one) should be willing to inline
+; a function with a dynamic alloca into one without a dynamic alloca.
+; rdar://6655932
+;
+; CHECK: @outer2
+; CHECK-NOT: call void @inner2
+; CHECK: alloca i32, i32 %N
+; CHECK-NOT: call void @inner2
+; CHECK: ret void
+
+  call void @inner2( i32 %N )
+  ret void
+}
+
+declare i32 @a() returns_twice
+declare i32 @b() returns_twice
+
+define i32 @inner3() alwaysinline {
+entry:
+  %call = call i32 @a() returns_twice
+  %add = add nsw i32 1, %call
+  ret i32 %add
+}
+define i32 @outer3() {
+entry:
+; CHECK: @outer3
+; CHECK-NOT: call i32 @a
+; CHECK: ret
+
+  %call = call i32 @inner3()
+  %add = add nsw i32 1, %call
+  ret i32 %add
+}
+
+define i32 @inner4() alwaysinline returns_twice {
+entry:
+  %call = call i32 @b() returns_twice
+  %add = add nsw i32 1, %call
+  ret i32 %add
+}
+
+define i32 @outer4() {
+entry:
+; CHECK: @outer4
+; CHECK: call i32 @b()
+; CHECK: ret
+
+  %call = call i32 @inner4() returns_twice
+  %add = add nsw i32 1, %call
+  ret i32 %add
+}
+
+define i32 @inner5(i8* %addr) alwaysinline {
+entry:
+  indirectbr i8* %addr, [ label %one, label %two ]
+
+one:
+  ret i32 42
+
+two:
+  ret i32 44
+}
+define i32 @outer5(i32 %x) {
+; CHECK: @outer5
+; CHECK: call i32 @inner5
+; CHECK: ret
+
+  %cmp = icmp slt i32 %x, 42
+  %addr = select i1 %cmp, i8* blockaddress(@inner5, %one), i8* blockaddress(@inner5, %two)
+  %call = call i32 @inner5(i8* %addr)
+  ret i32 %call
+}
+
+define void @inner6(i32 %x) alwaysinline {
+entry:
+  %icmp = icmp slt i32 %x, 0
+  br i1 %icmp, label %return, label %bb
+
+bb:
+  %sub = sub nsw i32 %x, 1
+  call void @inner6(i32 %sub)
+  ret void
+
+return:
+  ret void
+}
+define void @outer6() {
+; CHECK: @outer6
+; CHECK: call void @inner6(i32 42)
+; CHECK: ret
+
+entry:
+  call void @inner6(i32 42)
+  ret void
+}
+
diff --git a/test/Transforms/Inline/always_inline_dyn_alloca.ll b/test/Transforms/Inline/always_inline_dyn_alloca.ll
deleted file mode 100644
index 25cfc49..0000000
--- a/test/Transforms/Inline/always_inline_dyn_alloca.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: opt < %s -inline -S | not grep callee
-; rdar://6655932
-
-; If callee is marked alwaysinline, inline it! Even if callee has dynamic
-; alloca and caller does not,
-
-define internal void @callee(i32 %N) alwaysinline {
-  %P = alloca i32, i32 %N
-  ret void
-}
-
-define void @foo(i32 %N) {
-  call void @callee( i32 %N )
-  ret void
-}
diff --git a/test/Transforms/Inline/dynamic_alloca_test.ll b/test/Transforms/Inline/dynamic_alloca_test.ll
index 0286535..15a5c66 100644
--- a/test/Transforms/Inline/dynamic_alloca_test.ll
+++ b/test/Transforms/Inline/dynamic_alloca_test.ll
@@ -3,33 +3,43 @@
 ; Functions with dynamic allocas can only be inlined into functions that
 ; already have dynamic allocas.
 
-; RUN: opt < %s -inline -S | \
-; RUN:   grep llvm.stacksave
-; RUN: opt < %s -inline -S | not grep callee
-
+; RUN: opt < %s -inline -S | FileCheck %s
+;
+; FIXME: This test is xfailed because the inline cost rewrite disabled *all*
+; inlining of functions which contain a dynamic alloca. It should be re-enabled
+; once that functionality is restored.
+; XFAIL: *
 
 declare void @ext(i32*)
 
 define internal void @callee(i32 %N) {
-  %P = alloca i32, i32 %N ; <i32*> [#uses=1]
-  call void @ext( i32* %P )
-  ret void
+  %P = alloca i32, i32 %N
+  call void @ext(i32* %P)
+  ret void
 }
 
 define void @foo(i32 %N) {
-; <label>:0
-  %P = alloca i32, i32 %N ; <i32*> [#uses=1]
-  call void @ext( i32* %P )
-  br label %Loop
-
-Loop: ; preds = %Loop, %0
-  %count = phi i32 [ 0, %0 ], [ %next, %Loop ] ; <i32> [#uses=2]
-  %next = add i32 %count, 1 ; <i32> [#uses=1]
-  call void @callee( i32 %N )
-  %cond = icmp eq i32 %count, 100000 ; <i1> [#uses=1]
-  br i1 %cond, label %out, label %Loop
-
-out: ; preds = %Loop
-  ret void
+; CHECK: @foo
+; CHECK: alloca i32, i32 %{{.*}}
+; CHECK: call i8* @llvm.stacksave()
+; CHECK: alloca i32, i32 %{{.*}}
+; CHECK: call void @ext
+; CHECK: call void @llvm.stackrestore
+; CHECK: ret
+
+entry:
+  %P = alloca i32, i32 %N
+  call void @ext(i32* %P)
+  br label %loop
+
+loop:
+  %count = phi i32 [ 0, %entry ], [ %next, %loop ]
+  %next = add i32 %count, 1
+  call void @callee(i32 %N)
+  %cond = icmp eq i32 %count, 100000
+  br i1 %cond, label %out, label %loop
+
+out:
+  ret void
 }
diff --git a/test/Transforms/Inline/inline_cleanup.ll b/test/Transforms/Inline/inline_cleanup.ll
index 4c64721..3898aa7 100644
--- a/test/Transforms/Inline/inline_cleanup.ll
+++ b/test/Transforms/Inline/inline_cleanup.ll
@@ -1,10 +1,8 @@
 ; Test that the inliner doesn't leave around dead allocas, and that it folds
 ; uncond branches away after it is done specializing.
 
-; RUN: opt < %s -inline -S | \
-; RUN:    not grep {alloca.*uses=0}
-; RUN: opt < %s -inline -S | \
-; RUN:    not grep {br label}
+; RUN: opt < %s -inline -S | FileCheck %s
+
 @A = weak global i32 0 ; <i32*> [#uses=1]
 @B = weak global i32 0 ; <i32*> [#uses=1]
 @C = weak global i32 0 ; <i32*> [#uses=1]
@@ -54,6 +52,18 @@ UnifiedReturnBlock: ; preds = %cond_next13
 
 declare void @ext(i32*)
 
 define void @test() {
+; CHECK: @test
+; CHECK-NOT: ret
+;
+; FIXME: This should be a CHECK-NOT, but currently we have a bug that causes us
+; to not nuke unused allocas.
+; CHECK: alloca
+; CHECK-NOT: ret
+;
+; No branches should survive the inliner's cleanup.
+; CHECK-NOT: br
+; CHECK: ret void
+
 entry:
   tail call fastcc void @foo( i32 1 )
   tail call fastcc void @foo( i32 2 )
@@ -61,3 +71,143 @@ entry:
   tail call fastcc void @foo( i32 8 )
   ret void
 }
+
+declare void @f(i32 %x)
+
+define void @inner2(i32 %x, i32 %y, i32 %z, i1 %b) {
+entry:
+  %cmp1 = icmp ne i32 %x, 0
+  br i1 %cmp1, label %then1, label %end1
+
+then1:
+  call void @f(i32 %x)
+  br label %end1
+
+end1:
+  %x2 = and i32 %x, %z
+  %cmp2 = icmp sgt i32 %x2, 1
+  br i1 %cmp2, label %then2, label %end2
+
+then2:
+  call void @f(i32 %x2)
+  br label %end2
+
+end2:
+  %y2 = or i32 %y, %z
+  %cmp3 = icmp sgt i32 %y2, 0
+  br i1 %cmp3, label %then3, label %end3
+
+then3:
+  call void @f(i32 %y2)
+  br label %end3
+
+end3:
+  br i1 %b, label %end3.1, label %end3.2
+
+end3.1:
+  %x3.1 = or i32 %x, 10
+  br label %end3.3
+
+end3.2:
+  %x3.2 = or i32 %x, 10
+  br label %end3.3
+
+end3.3:
+  %x3.3 = phi i32 [ %x3.1, %end3.1 ], [ %x3.2, %end3.2 ]
+  %cmp4 = icmp slt i32 %x3.3, 1
+  br i1 %cmp4, label %then4, label %end4
+
+then4:
+  call void @f(i32 %x3.3)
+  br label %end4
+
+end4:
+  ret void
+}
+
+define void @outer2(i32 %z, i1 %b) {
+; Ensure that after inlining, none of the blocks with a call to @f actually
+; make it through inlining.
+; CHECK: define void @outer2
+; CHECK-NOT: call
+; CHECK: ret void
+
+entry:
+  call void @inner2(i32 0, i32 -1, i32 %z, i1 %b)
+  ret void
+}
+
+define void @PR12470_inner(i16 signext %p1) nounwind uwtable {
+entry:
+  br i1 undef, label %cond.true, label %cond.false
+
+cond.true:
+  br label %cond.end
+
+cond.false:
+  %conv = sext i16 %p1 to i32
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ undef, %cond.true ], [ 0, %cond.false ]
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end5, label %if.then
+
+if.then:
+  ret void
+
+if.end5:
+  ret void
+}
+
+define void @PR12470_outer() {
+; This previously crashed during inliner cleanup and folding inner return
+; instructions. Check that we don't crash and we produce a function with a single
+; return instruction due to merging the returns of the inlined function.
+; CHECK: define void @PR12470_outer
+; CHECK-NOT: call
+; CHECK: ret void
+; CHECK-NOT: ret void
+; CHECK: }
+
+entry:
+  call void @PR12470_inner(i16 signext 1)
+  ret void
+}
+
+define void @crasher_inner() nounwind uwtable {
+entry:
+  br i1 false, label %for.end28, label %for.body6
+
+for.body6:
+  br i1 undef, label %for.body6, label %for.cond12.for.inc26_crit_edge
+
+for.cond12.for.inc26_crit_edge:
+  br label %for.body6.1
+
+for.end28:
+  ret void
+
+for.body6.1:
+  br i1 undef, label %for.body6.1, label %for.cond12.for.inc26_crit_edge.1
+
+for.cond12.for.inc26_crit_edge.1:
+  br label %for.body6.2
+
+for.body6.2:
+  br i1 undef, label %for.body6.2, label %for.cond12.for.inc26_crit_edge.2
+
+for.cond12.for.inc26_crit_edge.2:
+  br label %for.end28
+}
+
+define void @crasher_outer() {
+; CHECK: @crasher_outer
+; CHECK-NOT: call
+; CHECK: ret void
+; CHECK-NOT: ret
+; CHECK: }
+entry:
+  tail call void @crasher_inner()
+  ret void
+}
diff --git a/test/Transforms/Inline/inline_constprop.ll b/test/Transforms/Inline/inline_constprop.ll
index cc7aaac..dc35b60 100644
--- a/test/Transforms/Inline/inline_constprop.ll
+++ b/test/Transforms/Inline/inline_constprop.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -inline -S | FileCheck %s
+; RUN: opt < %s -inline -inline-threshold=20 -S | FileCheck %s
 
 define internal i32 @callee1(i32 %A, i32 %B) {
   %C = sdiv i32 %A, %B
@@ -14,17 +14,18 @@ define i32 @caller1() {
 }
 
 define i32 @caller2() {
+; Check that we can constant-prop through instructions after inlining callee21
+; to get constants in the inlined callsite to callee22.
+; FIXME: Currently, the threshold is fixed at 20 because we don't perform
+; *recursive* cost analysis to realize that the nested call site will definitely
+; inline and be cheap. We should eventually do that and lower the threshold here
+; to 1.
+;
 ; CHECK: @caller2
 ; CHECK-NOT: call void @callee2
 ; CHECK: ret
 
-; We contrive to make this hard for *just* the inline pass to do in order to
-; simulate what can actually happen with large, complex functions getting
-; inlined.
-  %a = add i32 42, 0
-  %b = add i32 48, 0
-
-  %x = call i32 @callee21(i32 %a, i32 %b)
+  %x = call i32 @callee21(i32 42, i32 48)
   ret i32 %x
 }
 
@@ -41,49 +42,71 @@ define i32 @callee22(i32 %x) {
   br i1 %icmp, label %bb.true, label %bb.false
 bb.true:
   ; This block musn't be counted in the inline cost.
-  %ptr = call i8* @getptr()
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
-  load volatile i8* %ptr
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  %x4 = add i32 %x3, 1
+  %x5 = add i32 %x4, 1
+  %x6 = add i32 %x5, 1
+  %x7 = add i32 %x6, 1
+  %x8 = add i32 %x7, 1
-  ret i32 %x
+
+  ret i32 %x8
 
 bb.false:
   ret i32 %x
 }
+
+define i32 @caller3() {
+; Check that even if the expensive path is hidden behind several basic blocks,
+; it doesn't count toward the inline cost when constant-prop proves those paths
+; dead.
+;
+; CHECK: @caller3
+; CHECK-NOT: call
+; CHECK: ret i32 6
+
+entry:
+  %x = call i32 @callee3(i32 42, i32 48)
+  ret i32 %x
+}
+
+define i32 @callee3(i32 %x, i32 %y) {
+  %sub = sub i32 %y, %x
+  %icmp = icmp ugt i32 %sub, 42
+  br i1 %icmp, label %bb.true, label %bb.false
+
+bb.true:
+  %icmp2 = icmp ult i32 %sub, 64
+  br i1 %icmp2, label %bb.true.true, label %bb.true.false
+
+bb.true.true:
+  ; This block mustn't be counted in the inline cost.
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  %x4 = add i32 %x3, 1
+  %x5 = add i32 %x4, 1
+  %x6 = add i32 %x5, 1
+  %x7 = add i32 %x6, 1
+  %x8 = add i32 %x7, 1
+  br label %bb.merge
+
+bb.true.false:
+  ; This block mustn't be counted in the inline cost.
+  %y1 = add i32 %y, 1
+  %y2 = add i32 %y1, 1
+  %y3 = add i32 %y2, 1
+  %y4 = add i32 %y3, 1
+  %y5 = add i32 %y4, 1
+  %y6 = add i32 %y5, 1
+  %y7 = add i32 %y6, 1
+  %y8 = add i32 %y7, 1
+  br label %bb.merge
+
+bb.merge:
+  %result = phi i32 [ %x8, %bb.true.true ], [ %y8, %bb.true.false ]
+  ret i32 %result
+
+bb.false:
+  ret i32 %sub
+}
diff --git a/test/Transforms/Inline/noinline-recursive-fn.ll b/test/Transforms/Inline/noinline-recursive-fn.ll
index d56b390..6cde0e2 100644
--- a/test/Transforms/Inline/noinline-recursive-fn.ll
+++ b/test/Transforms/Inline/noinline-recursive-fn.ll
@@ -71,3 +71,40 @@ entry:
   call void @f2(i32 123, i8* bitcast (void (i32, i8*, i8*)* @f1 to i8*), i8* bitcast (void (i32, i8*, i8*)* @f2 to i8*)) nounwind ssp
   ret void
 }
+
+
+; Check that a recursive function, when called with a constant that makes the
+; recursive path dead code can actually be inlined.
+define i32 @fib(i32 %i) { +entry: + %is.zero = icmp eq i32 %i, 0 + br i1 %is.zero, label %zero.then, label %zero.else + +zero.then: + ret i32 0 + +zero.else: + %is.one = icmp eq i32 %i, 1 + br i1 %is.one, label %one.then, label %one.else + +one.then: + ret i32 1 + +one.else: + %i1 = sub i32 %i, 1 + %f1 = call i32 @fib(i32 %i1) + %i2 = sub i32 %i, 2 + %f2 = call i32 @fib(i32 %i2) + %f = add i32 %f1, %f2 + ret i32 %f +} + +define i32 @fib_caller() { +; CHECK: @fib_caller +; CHECK-NOT: call +; CHECK: ret + %f1 = call i32 @fib(i32 0) + %f2 = call i32 @fib(i32 1) + %result = add i32 %f1, %f2 + ret i32 %result +} diff --git a/test/Transforms/Inline/ptr-diff.ll b/test/Transforms/Inline/ptr-diff.ll index 0b431d6..60fc3e2 100644 --- a/test/Transforms/Inline/ptr-diff.ll +++ b/test/Transforms/Inline/ptr-diff.ll @@ -1,5 +1,7 @@ ; RUN: opt -inline < %s -S -o - -inline-threshold=10 | FileCheck %s +target datalayout = "p:32:32" + define i32 @outer1() { ; CHECK: @outer1 ; CHECK-NOT: call diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll index e4d1367..ef7185c 100644 --- a/test/Transforms/InstCombine/alloca.ll +++ b/test/Transforms/InstCombine/alloca.ll @@ -44,3 +44,47 @@ define i32* @test4(i32 %n) { %A = alloca i32, i32 %n ret i32* %A } + +; Allocas which are only used by GEPs, bitcasts, and stores (transitively) +; should be deleted. +define void @test5() { +; CHECK: @test5 +; CHECK-NOT: alloca +; CHECK-NOT: store +; CHECK: ret + +entry: + %a = alloca { i32 } + %b = alloca i32* + %a.1 = getelementptr { i32 }* %a, i32 0, i32 0 + store i32 123, i32* %a.1 + store i32* %a.1, i32** %b + %b.1 = bitcast i32** %b to i32* + store i32 123, i32* %b.1 + %a.2 = getelementptr { i32 }* %a, i32 0, i32 0 + store atomic i32 2, i32* %a.2 unordered, align 4 + %a.3 = getelementptr { i32 }* %a, i32 0, i32 0 + store atomic i32 3, i32* %a.3 release, align 4 + %a.4 = getelementptr { i32 }* %a, i32 0, i32 0 + store atomic i32 4, i32* %a.4 seq_cst, align 4 + ret void +} + +declare void @f(i32* %p) + +; Check that we don't delete allocas in some erroneous cases. +define void @test6() { +; CHECK: @test6 +; CHECK-NOT: ret +; CHECK: alloca +; CHECK-NEXT: alloca +; CHECK: ret + +entry: + %a = alloca { i32 } + %b = alloca i32 + %a.1 = getelementptr { i32 }* %a, i32 0, i32 0 + store volatile i32 123, i32* %a.1 + tail call void @f(i32* %b) + ret void +} diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll index 55243a6..0ea73a0 100644 --- a/test/Transforms/InstCombine/apint-shift.ll +++ b/test/Transforms/InstCombine/apint-shift.ll @@ -1,70 +1,93 @@ -; This test makes sure that shit instructions are properly eliminated +; This test makes sure that shift instructions are properly eliminated ; even with arbitrary precision integers. -; RUN: opt < %s -instcombine -S | not grep sh -; END. 
+; RUN: opt < %s -instcombine -S | FileCheck %s +; CHECK: @test1 +; CHECK-NOT: sh define i47 @test1(i47 %A) { %B = shl i47 %A, 0 ; <i47> [#uses=1] ret i47 %B } +; CHECK: @test2 +; CHECK-NOT: sh define i41 @test2(i7 %X) { %A = zext i7 %X to i41 ; <i41> [#uses=1] %B = shl i41 0, %A ; <i41> [#uses=1] ret i41 %B } +; CHECK: @test3 +; CHECK-NOT: sh define i41 @test3(i41 %A) { %B = ashr i41 %A, 0 ; <i41> [#uses=1] ret i41 %B } +; CHECK: @test4 +; CHECK-NOT: sh define i39 @test4(i7 %X) { %A = zext i7 %X to i39 ; <i39> [#uses=1] %B = ashr i39 0, %A ; <i39> [#uses=1] ret i39 %B } +; CHECK: @test5 +; CHECK-NOT: sh define i55 @test5(i55 %A) { %B = lshr i55 %A, 55 ; <i55> [#uses=1] ret i55 %B } +; CHECK: @test5a +; CHECK-NOT: sh define i32 @test5a(i32 %A) { %B = shl i32 %A, 32 ; <i32> [#uses=1] ret i32 %B } +; CHECK: @test6 +; CHECK-NOT: sh define i55 @test6(i55 %A) { %B = shl i55 %A, 1 ; <i55> [#uses=1] %C = mul i55 %B, 3 ; <i55> [#uses=1] ret i55 %C } +; CHECK: @test7 +; CHECK-NOT: sh define i29 @test7(i8 %X) { %A = zext i8 %X to i29 ; <i29> [#uses=1] %B = ashr i29 -1, %A ; <i29> [#uses=1] ret i29 %B } +; CHECK: @test8 +; CHECK-NOT: sh define i7 @test8(i7 %A) { %B = shl i7 %A, 4 ; <i7> [#uses=1] %C = shl i7 %B, 3 ; <i7> [#uses=1] ret i7 %C } +; CHECK: @test9 +; CHECK-NOT: sh define i17 @test9(i17 %A) { %B = shl i17 %A, 16 ; <i17> [#uses=1] %C = lshr i17 %B, 16 ; <i17> [#uses=1] ret i17 %C } +; CHECK: @test10 +; CHECK-NOT: sh define i19 @test10(i19 %A) { %B = lshr i19 %A, 18 ; <i19> [#uses=1] %C = shl i19 %B, 18 ; <i19> [#uses=1] ret i19 %C } +; CHECK: @test11 +; CHECK-NOT: sh define i23 @test11(i23 %A) { %a = mul i23 %A, 3 ; <i23> [#uses=1] %B = lshr i23 %a, 11 ; <i23> [#uses=1] @@ -72,12 +95,16 @@ define i23 @test11(i23 %A) { ret i23 %C } +; CHECK: @test12 +; CHECK-NOT: sh define i47 @test12(i47 %A) { %B = ashr i47 %A, 8 ; <i47> [#uses=1] %C = shl i47 %B, 8 ; <i47> [#uses=1] ret i47 %C } +; CHECK: @test13 +; CHECK-NOT: sh define i18 @test13(i18 %A) { %a = mul i18 %A, 3 ; <i18> [#uses=1] %B = ashr i18 %a, 8 ; <i18> [#uses=1] @@ -85,6 +112,8 @@ define i18 @test13(i18 %A) { ret i18 %C } +; CHECK: @test14 +; CHECK-NOT: sh define i35 @test14(i35 %A) { %B = lshr i35 %A, 4 ; <i35> [#uses=1] %C = or i35 %B, 1234 ; <i35> [#uses=1] @@ -92,6 +121,8 @@ define i35 @test14(i35 %A) { ret i35 %D } +; CHECK: @test14a +; CHECK-NOT: sh define i79 @test14a(i79 %A) { %B = shl i79 %A, 4 ; <i79> [#uses=1] %C = and i79 %B, 1234 ; <i79> [#uses=1] @@ -99,12 +130,16 @@ define i79 @test14a(i79 %A) { ret i79 %D } +; CHECK: @test15 +; CHECK-NOT: sh define i45 @test15(i1 %C) { %A = select i1 %C, i45 3, i45 1 ; <i45> [#uses=1] %V = shl i45 %A, 2 ; <i45> [#uses=1] ret i45 %V } +; CHECK: @test15a +; CHECK-NOT: sh define i53 @test15a(i1 %X) { %A = select i1 %X, i8 3, i8 1 ; <i8> [#uses=1] %B = zext i8 %A to i53 ; <i53> [#uses=1] @@ -112,6 +147,8 @@ define i53 @test15a(i1 %X) { ret i53 %V } +; CHECK: @test16 +; CHECK-NOT: sh define i1 @test16(i84 %X) { %tmp.3 = ashr i84 %X, 4 ; <i84> [#uses=1] %tmp.6 = and i84 %tmp.3, 1 ; <i84> [#uses=1] @@ -119,48 +156,64 @@ define i1 @test16(i84 %X) { ret i1 %tmp.7 } +; CHECK: @test17 +; CHECK-NOT: sh define i1 @test17(i106 %A) { %B = lshr i106 %A, 3 ; <i106> [#uses=1] %C = icmp eq i106 %B, 1234 ; <i1> [#uses=1] ret i1 %C } +; CHECK: @test18 +; CHECK-NOT: sh define i1 @test18(i11 %A) { %B = lshr i11 %A, 10 ; <i11> [#uses=1] %C = icmp eq i11 %B, 123 ; <i1> [#uses=1] ret i1 %C } +; CHECK: @test19 +; CHECK-NOT: sh define i1 @test19(i37 %A) { %B = ashr i37 %A, 2 ; <i37> [#uses=1] %C = icmp eq i37 %B, 0 
; <i1> [#uses=1] ret i1 %C } +; CHECK: @test19a +; CHECK-NOT: sh define i1 @test19a(i39 %A) { %B = ashr i39 %A, 2 ; <i39> [#uses=1] %C = icmp eq i39 %B, -1 ; <i1> [#uses=1] ret i1 %C } +; CHECK: @test20 +; CHECK-NOT: sh define i1 @test20(i13 %A) { %B = ashr i13 %A, 12 ; <i13> [#uses=1] %C = icmp eq i13 %B, 123 ; <i1> [#uses=1] ret i1 %C } +; CHECK: @test21 +; CHECK-NOT: sh define i1 @test21(i12 %A) { %B = shl i12 %A, 6 ; <i12> [#uses=1] %C = icmp eq i12 %B, -128 ; <i1> [#uses=1] ret i1 %C } +; CHECK: @test22 +; CHECK-NOT: sh define i1 @test22(i14 %A) { %B = shl i14 %A, 7 ; <i14> [#uses=1] %C = icmp eq i14 %B, 0 ; <i1> [#uses=1] ret i1 %C } +; CHECK: @test23 +; CHECK-NOT: sh define i11 @test23(i44 %A) { %B = shl i44 %A, 33 ; <i44> [#uses=1] %C = ashr i44 %B, 33 ; <i44> [#uses=1] @@ -168,6 +221,8 @@ define i11 @test23(i44 %A) { ret i11 %D } +; CHECK: @test25 +; CHECK-NOT: sh define i37 @test25(i37 %tmp.2, i37 %AA) { %x = lshr i37 %AA, 17 ; <i37> [#uses=1] %tmp.3 = lshr i37 %tmp.2, 17 ; <i37> [#uses=1] @@ -176,6 +231,8 @@ define i37 @test25(i37 %tmp.2, i37 %AA) { ret i37 %tmp.6 } +; CHECK: @test26 +; CHECK-NOT: sh define i40 @test26(i40 %A) { %B = lshr i40 %A, 1 ; <i40> [#uses=1] %C = bitcast i40 %B to i40 ; <i40> [#uses=1] diff --git a/test/Transforms/InstCombine/apint-shl-trunc.ll b/test/Transforms/InstCombine/apint-shl-trunc.ll index 8163e6d..f2dc7d5 100644 --- a/test/Transforms/InstCombine/apint-shl-trunc.ll +++ b/test/Transforms/InstCombine/apint-shl-trunc.ll @@ -1,13 +1,24 @@ -; RUN: opt < %s -instcombine -S | grep shl -; END. +; RUN: opt < %s -instcombine -S | FileCheck %s define i1 @test0(i39 %X, i39 %A) { +; CHECK: @test0 +; CHECK: %[[V1:.*]] = shl i39 1, %A +; CHECK: %[[V2:.*]] = and i39 %[[V1]], %X +; CHECK: %[[V3:.*]] = icmp ne i39 %[[V2]], 0 +; CHECK: ret i1 %[[V3]] + %B = lshr i39 %X, %A %D = trunc i39 %B to i1 ret i1 %D } define i1 @test1(i799 %X, i799 %A) { +; CHECK: @test1 +; CHECK: %[[V1:.*]] = shl i799 1, %A +; CHECK: %[[V2:.*]] = and i799 %[[V1]], %X +; CHECK: %[[V3:.*]] = icmp ne i799 %[[V2]], 0 +; CHECK: ret i1 %[[V3]] + %B = lshr i799 %X, %A %D = trunc i799 %B to i1 ret i1 %D diff --git a/test/Transforms/InstCombine/pr12251.ll b/test/Transforms/InstCombine/pr12251.ll new file mode 100644 index 0000000..74a41eb --- /dev/null +++ b/test/Transforms/InstCombine/pr12251.ll @@ -0,0 +1,15 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +define zeroext i1 @_Z3fooPb(i8* nocapture %x) { +entry: + %a = load i8* %x, align 1, !range !0 + %b = and i8 %a, 1 + %tobool = icmp ne i8 %b, 0 + ret i1 %tobool +} + +; CHECK: %a = load i8* %x, align 1, !range !0 +; CHECK-NEXT: %tobool = icmp ne i8 %a, 0 +; CHECK-NEXT: ret i1 %tobool + +!0 = metadata !{i8 0, i8 2} diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll index e15bfaa..ced74bd 100644 --- a/test/Transforms/InstSimplify/compare.ll +++ b/test/Transforms/InstSimplify/compare.ll @@ -103,6 +103,68 @@ define i1 @gep8(%gept* %x) { ; CHECK: ret i1 %equal } +define i1 @gep9(i8* %ptr) { +; CHECK: @gep9 +; CHECK-NOT: ret +; CHECK: ret i1 true + +entry: + %first1 = getelementptr inbounds i8* %ptr, i32 0 + %first2 = getelementptr inbounds i8* %first1, i32 1 + %first3 = getelementptr inbounds i8* %first2, i32 2 + %first4 = getelementptr inbounds i8* %first3, i32 4 + %last1 = getelementptr inbounds i8* %first2, i32 48 + %last2 = getelementptr inbounds i8* %last1, i32 8 + %last3 = getelementptr inbounds i8* %last2, i32 -4 + %last4 = getelementptr inbounds i8* %last3, i32 -4 + %first.int = ptrtoint i8* 
%first4 to i32 + %last.int = ptrtoint i8* %last4 to i32 + %cmp = icmp ne i32 %last.int, %first.int + ret i1 %cmp +} + +define i1 @gep10(i8* %ptr) { +; CHECK: @gep10 +; CHECK-NOT: ret +; CHECK: ret i1 true + +entry: + %first1 = getelementptr inbounds i8* %ptr, i32 -2 + %first2 = getelementptr inbounds i8* %first1, i32 44 + %last1 = getelementptr inbounds i8* %ptr, i32 48 + %last2 = getelementptr inbounds i8* %last1, i32 -6 + %first.int = ptrtoint i8* %first2 to i32 + %last.int = ptrtoint i8* %last2 to i32 + %cmp = icmp eq i32 %last.int, %first.int + ret i1 %cmp +} + +define i1 @gep11(i8* %ptr) { +; CHECK: @gep11 +; CHECK-NOT: ret +; CHECK: ret i1 true + +entry: + %first1 = getelementptr inbounds i8* %ptr, i32 -2 + %last1 = getelementptr inbounds i8* %ptr, i32 48 + %last2 = getelementptr inbounds i8* %last1, i32 -6 + %cmp = icmp ult i8* %first1, %last2 + ret i1 %cmp +} + +define i1 @gep12(i8* %ptr) { +; CHECK: @gep12 +; CHECK-NOT: ret +; CHECK: ret i1 %cmp + +entry: + %first1 = getelementptr inbounds i8* %ptr, i32 -2 + %last1 = getelementptr inbounds i8* %ptr, i32 48 + %last2 = getelementptr inbounds i8* %last1, i32 -6 + %cmp = icmp slt i8* %first1, %last2 + ret i1 %cmp +} + define i1 @zext(i32 %x) { ; CHECK: @zext %e1 = zext i32 %x to i64 diff --git a/test/Transforms/InstSimplify/ptr_diff.ll b/test/Transforms/InstSimplify/ptr_diff.ll index 013964c..1eb1fd4 100644 --- a/test/Transforms/InstSimplify/ptr_diff.ll +++ b/test/Transforms/InstSimplify/ptr_diff.ll @@ -6,8 +6,8 @@ define i64 @ptrdiff1(i8* %ptr) { ; CHECK: @ptrdiff1 ; CHECK-NEXT: ret i64 42 - %first = getelementptr i8* %ptr, i32 0 - %last = getelementptr i8* %ptr, i32 42 + %first = getelementptr inbounds i8* %ptr, i32 0 + %last = getelementptr inbounds i8* %ptr, i32 42 %first.int = ptrtoint i8* %first to i64 %last.int = ptrtoint i8* %last to i64 %diff = sub i64 %last.int, %first.int @@ -18,16 +18,31 @@ define i64 @ptrdiff2(i8* %ptr) { ; CHECK: @ptrdiff2 ; CHECK-NEXT: ret i64 42 - %first1 = getelementptr i8* %ptr, i32 0 - %first2 = getelementptr i8* %first1, i32 1 - %first3 = getelementptr i8* %first2, i32 2 - %first4 = getelementptr i8* %first3, i32 4 - %last1 = getelementptr i8* %first2, i32 48 - %last2 = getelementptr i8* %last1, i32 8 - %last3 = getelementptr i8* %last2, i32 -4 - %last4 = getelementptr i8* %last3, i32 -4 + %first1 = getelementptr inbounds i8* %ptr, i32 0 + %first2 = getelementptr inbounds i8* %first1, i32 1 + %first3 = getelementptr inbounds i8* %first2, i32 2 + %first4 = getelementptr inbounds i8* %first3, i32 4 + %last1 = getelementptr inbounds i8* %first2, i32 48 + %last2 = getelementptr inbounds i8* %last1, i32 8 + %last3 = getelementptr inbounds i8* %last2, i32 -4 + %last4 = getelementptr inbounds i8* %last3, i32 -4 %first.int = ptrtoint i8* %first4 to i64 %last.int = ptrtoint i8* %last4 to i64 %diff = sub i64 %last.int, %first.int ret i64 %diff } + +define i64 @ptrdiff3(i8* %ptr) { +; Don't bother with non-inbounds GEPs. 
+; CHECK: @ptrdiff3 +; CHECK: getelementptr +; CHECK: sub +; CHECK: ret + + %first = getelementptr i8* %ptr, i32 0 + %last = getelementptr i8* %ptr, i32 42 + %first.int = ptrtoint i8* %first to i64 + %last.int = ptrtoint i8* %last to i64 + %diff = sub i64 %last.int, %first.int + ret i64 %diff +} diff --git a/test/Transforms/LoopRotate/dbgvalue.ll b/test/Transforms/LoopRotate/dbgvalue.ll index 9287178..b32ee82 100644 --- a/test/Transforms/LoopRotate/dbgvalue.ll +++ b/test/Transforms/LoopRotate/dbgvalue.ll @@ -1,11 +1,13 @@ ; RUN: opt -S -loop-rotate %s | FileCheck %s -; CHECK: entry -; CHECK-NEXT: call void @llvm.dbg.value(metadata !{i32 %x} - declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp { +; CHECK: define i32 @tak +; CHECK: entry +; CHECK-NEXT: call void @llvm.dbg.value(metadata !{i32 %x} + entry: br label %tailrecurse @@ -35,7 +37,45 @@ return: ; preds = %if.end ret i32 %z.tr, !dbg !17 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +@channelColumns = external global i64 +@horzPlane = external global i8*, align 8 + +define void @FindFreeHorzSeg(i64 %startCol, i64 %row, i64* %rowStart) { +; Ensure that the loop increment basic block is rotated into the tail of the +; body, even though it contains a debug intrinsic call. +; CHECK: define void @FindFreeHorzSeg +; CHECK: %dec = add +; CHECK-NEXT: tail call void @llvm.dbg.value +; CHECK-NEXT: br i1 %tobool, label %for.cond, label %for.end + +entry: + br label %for.cond + +for.cond: + %i.0 = phi i64 [ %startCol, %entry ], [ %dec, %for.inc ] + %cmp = icmp eq i64 %i.0, 0 + br i1 %cmp, label %for.end, label %for.body + +for.body: + %0 = load i64* @channelColumns, align 8 + %mul = mul i64 %0, %row + %add = add i64 %mul, %i.0 + %1 = load i8** @horzPlane, align 8 + %arrayidx = getelementptr inbounds i8* %1, i64 %add + %2 = load i8* %arrayidx, align 1 + %tobool = icmp eq i8 %2, 0 + br i1 %tobool, label %for.inc, label %for.end + +for.inc: + %dec = add i64 %i.0, -1 + tail call void @llvm.dbg.value(metadata !{i64 %dec}, i64 0, metadata undef) + br label %for.cond + +for.end: + %add1 = add i64 %i.0, 1 + store i64 %add1, i64* %rowStart, align 8 + ret void +} !llvm.dbg.sp = !{!0} diff --git a/test/Transforms/LoopStrengthReduce/2012-03-26-constexpr.ll b/test/Transforms/LoopStrengthReduce/2012-03-26-constexpr.ll new file mode 100644 index 0000000..c9b11a9 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/2012-03-26-constexpr.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s -loop-reduce -S +; PR11950: isHighCostExpansion crashes on ConstExpr +; +; The crash happened during IVChain analysis (CollectChains). We don't +; really care how LSR decides to transform this loop, so we don't +; check it. As long as the analysis doesn't crash we're ok. 
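A note on the IR shape involved: a ConstantExpr is a constant that encodes an operation (here a getelementptr) and sits directly in an instruction's operand list, with no defining instruction for the analysis to walk, which is why expansion-cost analysis has to handle it specially. A minimal sketch of that shape, with made-up names (@table, @take):

@table = global [8 x i32] zeroinitializer

declare void @take(i32*)

define void @constexpr_operand() {
entry:
  ; the argument is a ConstantExpr GEP, not an instruction
  call void @take(i32* getelementptr inbounds ([8 x i32]* @table, i32 0, i32 3))
  ret void
}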
+target datalayout = "e-p:64:64:64-n32:64" + +%struct.this_structure_s.0.5 = type { [6144 x [8 x i32]], [6144 x [8 x i32]], [6147 x [4 x i32]], [8 x i32], [2 x i8*], [2 x i8*], [6144 x i8], [6144 x i32], [6144 x i32], [4 x [4 x i8]] } + +define internal fastcc void @someFunction(%struct.this_structure_s.0.5* nocapture %scratch, i32 %stage, i32 %cbSize) nounwind { +entry: + %0 = getelementptr inbounds %struct.this_structure_s.0.5* %scratch, i32 0, i32 4, i32 %stage + %1 = load i8** %0, align 4 + %2 = getelementptr inbounds %struct.this_structure_s.0.5* %scratch, i32 0, i32 5, i32 %stage + %3 = load i8** %2, align 4 + %4 = getelementptr inbounds %struct.this_structure_s.0.5* %scratch, i32 0, i32 2, i32 0, i32 0 + %tmp11 = shl i32 %stage, 1 + %tmp1325 = or i32 %tmp11, 1 + br label %__label_D_1608 + +__label_D_1608: ; preds = %__label_D_1608, %entry + %i.12 = phi i32 [ 0, %entry ], [ %10, %__label_D_1608 ] + %tmp = shl i32 %i.12, 2 + %lvar_g.13 = getelementptr i32* %4, i32 %tmp + %tmp626 = or i32 %tmp, 1 + %scevgep = getelementptr i32* %4, i32 %tmp626 + %tmp727 = or i32 %tmp, 2 + %scevgep8 = getelementptr i32* %4, i32 %tmp727 + %tmp928 = or i32 %tmp, 3 + %scevgep10 = getelementptr i32* %4, i32 %tmp928 + %scevgep12 = getelementptr %struct.this_structure_s.0.5* %scratch, i32 0, i32 9, i32 %tmp11, i32 %i.12 + %scevgep14 = getelementptr %struct.this_structure_s.0.5* %scratch, i32 0, i32 9, i32 %tmp1325, i32 %i.12 + %5 = load i8* %scevgep12, align 1 + %6 = sext i8 %5 to i32 + %7 = load i8* %scevgep14, align 1 + %8 = sext i8 %7 to i32 + store i32 0, i32* %lvar_g.13, align 4 + store i32 %8, i32* %scevgep, align 4 + store i32 %6, i32* %scevgep8, align 4 + %9 = add nsw i32 %8, %6 + store i32 %9, i32* %scevgep10, align 4 + %10 = add nsw i32 %i.12, 1 + %exitcond = icmp eq i32 %10, 3 + br i1 %exitcond, label %return, label %__label_D_1608 + +return: ; preds = %__label_D_1608 + ret void +} diff --git a/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg b/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg index d622529..bac2ffa 100644 --- a/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg +++ b/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg @@ -1,13 +1,6 @@ config.suffixes = ['.ll'] -def getRoot(config): - if not config.parent: - return config - return getRoot(config.parent) - -root = getRoot(config) - -targets = set(root.targets_to_build.split()) +targets = set(config.root.targets_to_build.split()) if not 'ARM' in targets: config.unsupported = True diff --git a/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll b/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll index 2dcaab8..ed32ca8 100644 --- a/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll +++ b/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll @@ -61,7 +61,7 @@ exit: ; preds = %cond.true29.i, %cond.true.i ; CHECK: @test2 ; CHECK: %entry ; CHECK-NOT: mov -; CHECK: jne +; CHECK: je define void @test2(i32 %n) nounwind uwtable { entry: br i1 undef, label %while.end, label %for.cond468 diff --git a/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg b/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg index 84bd88c..da2db5a 100644 --- a/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg +++ b/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg @@ -1,13 +1,6 @@ config.suffixes = ['.ll'] -def getRoot(config): - if not config.parent: - return config - return getRoot(config.parent) - -root = getRoot(config) - -targets = set(root.targets_to_build.split()) +targets = 
set(config.root.targets_to_build.split()) if not 'X86' in targets: config.unsupported = True diff --git a/test/Transforms/IndVarSimplify/addrec-gep.ll b/test/Transforms/LoopStrengthReduce/addrec-gep.ll index b62d093..3e4e369 100644 --- a/test/Transforms/IndVarSimplify/addrec-gep.ll +++ b/test/Transforms/LoopStrengthReduce/addrec-gep.ll @@ -1,13 +1,17 @@ -; RUN: opt < %s -indvars -S -enable-iv-rewrite | FileCheck %s -; CHECK: getelementptr -; CHECK: mul {{.*}}, 37 -; CHECK: add {{.*}}, 5203 +; RUN: opt < %s -loop-reduce -S | FileCheck %s +; CHECK: bb1: +; CHECK: load double* [[IV:%[^,]+]] +; CHECK: store double {{.*}}, double* [[IV]] +; CHECK: getelementptr double* ; CHECK-NOT: cast +; CHECK: br {{.*}} label %bb1 ; This test tests several things. The load and store should use the ; same address instead of having it computed twice, and SCEVExpander should ; be able to reconstruct the full getelementptr, despite it having a few ; obstacles set in its way. +; We only check that the inner loop (bb1-bb2) is "reduced" because LSR +; currently only operates on inner loops. target datalayout = "e-p:64:64:64-n32:64" diff --git a/test/Transforms/IndVarSimplify/preserve-gep-loop-variant.ll b/test/Transforms/LoopStrengthReduce/preserve-gep-loop-variant.ll index 251d34e..f90d030 100644 --- a/test/Transforms/IndVarSimplify/preserve-gep-loop-variant.ll +++ b/test/Transforms/LoopStrengthReduce/preserve-gep-loop-variant.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -indvars -S -enable-iv-rewrite | FileCheck %s +; RUN: opt < %s -loop-reduce -S | FileCheck %s ; CHECK-NOT: {{inttoptr|ptrtoint}} ; CHECK: scevgep ; CHECK-NOT: {{inttoptr|ptrtoint}} diff --git a/test/Transforms/LoopUnroll/2011-08-09-IVSimplify.ll b/test/Transforms/LoopUnroll/2011-08-09-IVSimplify.ll index 59551d5..a43a4ff 100644 --- a/test/Transforms/LoopUnroll/2011-08-09-IVSimplify.ll +++ b/test/Transforms/LoopUnroll/2011-08-09-IVSimplify.ll @@ -1,4 +1,4 @@ -; RUN: opt -S < %s -loop-unroll -unroll-count=4 -enable-iv-rewrite=false | FileCheck %s +; RUN: opt -S < %s -loop-unroll -unroll-count=4 | FileCheck %s ; ; Test induction variable simplify after loop unrolling. It should ; expose nice opportunities for GVN. diff --git a/test/Transforms/LoopUnroll/2012-04-09-unroll-indirectbr.ll b/test/Transforms/LoopUnroll/2012-04-09-unroll-indirectbr.ll new file mode 100644 index 0000000..8946a23 --- /dev/null +++ b/test/Transforms/LoopUnroll/2012-04-09-unroll-indirectbr.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -S -loop-unroll -simplifycfg | FileCheck %s +; PR12513: Loop unrolling breaks with indirect branches. +; If loop unrolling attempts to transform this loop, it replaces the +; indirectbr successors. SimplifyCFG then considers them to be unreachable. 
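Roughly what goes wrong: unrolling clones the loop body and rewrites branch successors in the clones, but blockaddress constants keep naming the original blocks, so the cloned indirectbr and its blockaddress operands fall out of sync. A minimal sketch of the problematic construct, with illustrative names (@pick stands in for the test's @funca):

declare i8* @pick(i8*, i8*)

define void @indirectbr_loop() {
entry:
  br label %loop

loop:
  ; cloning %loop cannot retarget these blockaddress constants,
  ; which is why unrolling must leave such loops alone
  %t = call i8* @pick(i8* blockaddress(@indirectbr_loop, %loop), i8* blockaddress(@indirectbr_loop, %exit))
  indirectbr i8* %t, [label %loop, label %exit]

exit:
  ret void
}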
+declare void @subtract() nounwind uwtable + +; CHECK-NOT: unreachable +define i32 @main(i32 %argc, i8** nocapture %argv) nounwind uwtable { +entry: + %vals19 = alloca [5 x i32], align 16 + %x20 = alloca i32, align 4 + store i32 135, i32* %x20, align 4 + br label %for.body + +for.body: ; preds = ; %call2_termjoin, %call3_termjoin + %indvars.iv = phi i64 [ 0, %entry ], [ %joinphi15.in.in, %call2_termjoin ] + %a6 = call coldcc i8* @funca(i8* blockaddress(@main, %for.body_code), i8* +blockaddress(@main, %for.body_codeprime)) nounwind + indirectbr i8* %a6, [label %for.body_code, label %for.body_codeprime] + +for.body_code: ; preds = %for.body + call void @subtract() + br label %call2_termjoin + +call2_termjoin: ; preds = %for.body_codeprime, %for.body_code + %joinphi15.in.in = add i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %joinphi15.in.in, 5 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %call2_termjoin + ret i32 0 + +for.body_codeprime: ; preds = %for.body + call void @subtract_v2(i64 %indvars.iv) + br label %call2_termjoin +} + +declare coldcc i8* @funca(i8*, i8*) readonly + +declare void @subtract_v2(i64) nounwind uwtable diff --git a/test/Transforms/LoopUnroll/partial-unroll-optsize.ll b/test/Transforms/LoopUnroll/partial-unroll-optsize.ll new file mode 100644 index 0000000..3179d55 --- /dev/null +++ b/test/Transforms/LoopUnroll/partial-unroll-optsize.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -S -loop-unroll -unroll-allow-partial | FileCheck %s +; Loop size = 3. When the function has the optsize attribute, the +; OptSizeUnrollThreshold (50) is used instead, so the loop should be unrolled +; by a factor of 16, since 3 * 16 = 48 < 50. +define void @unroll_opt_for_size() nounwind optsize { +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %iv, 1 + %exitcnd = icmp uge i32 %inc, 1024 + br i1 %exitcnd, label %exit, label %loop + +exit: + ret void +} + +; CHECK: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: icmp diff --git a/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll b/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll new file mode 100644 index 0000000..c92f0a2 --- /dev/null +++ b/test/Transforms/LoopUnswitch/2012-04-02-IndirectBr.ll @@ -0,0 +1,41 @@ +; RUN: opt < %s -S -loop-unswitch -verify-loop-info -verify-dom-info | FileCheck %s +; PR12343: -loop-unswitch crash on indirect branch + +; CHECK: %0 = icmp eq i64 undef, 0 +; CHECK-NEXT: br i1 %0, label %"5", label %"4" + +; CHECK: "5": ; preds = %entry +; CHECK-NEXT: br label %"16" + +; CHECK: "16": ; preds = %"22", %"5" +; CHECK-NEXT: indirectbr i8* undef, [label %"22", label %"33"] + +; CHECK: "22": ; preds = %"16" +; CHECK-NEXT: br i1 %0, label %"16", label %"26" + +; CHECK: "26": ; preds = %"22" +; CHECK-NEXT: unreachable + +define void @foo() { +entry: + %0 = icmp eq i64 undef, 0 + br i1 %0, label %"5", label %"4" + +"4": ; preds = %entry + unreachable + +"5": ; preds = %entry + br label %"16" + +"16": ; preds = %"22", %"5" + indirectbr i8* undef, [label %"22", label %"33"] + +"22": ; preds = %"16" + br i1 %0, label %"16", label %"26" + +"26": ; preds = %"22" + unreachable + +"33": ; preds = %"16" + unreachable +} diff --git a/test/Transforms/ObjCARC/basic.ll index
08bd8c0..ba2f778 100644 --- a/test/Transforms/ObjCARC/basic.ll +++ b/test/Transforms/ObjCARC/basic.ll @@ -3,10 +3,12 @@ target datalayout = "e-p:64:64:64" declare i8* @objc_retain(i8*) +declare i8* @objc_retainAutoreleasedReturnValue(i8*) declare void @objc_release(i8*) declare i8* @objc_autorelease(i8*) +declare i8* @objc_autoreleaseReturnValue(i8*) declare void @objc_autoreleasePoolPop(i8*) -declare void @objc_autoreleasePoolPush() +declare i8* @objc_autoreleasePoolPush() declare i8* @objc_retainBlock(i8*) declare i8* @objc_retainedObject(i8*) @@ -526,7 +528,7 @@ entry: define void @test13d(i8* %x, i64 %n) { entry: call i8* @objc_retain(i8* %x) nounwind - call void @objc_autoreleasePoolPush() + call i8* @objc_autoreleasePoolPush() call i8* @objc_retain(i8* %x) nounwind call void @use_pointer(i8* %x) call void @use_pointer(i8* %x) @@ -1400,7 +1402,7 @@ entry: ; CHECK-NEXT: call i8* @objc_autorelease(i8* %p) ; CHECK-NEXT: call void @use_pointer(i8* %p) ; CHECK-NEXT: call void @use_pointer(i8* %p) -; CHECK-NEXT: call void @objc_autoreleasePoolPush() +; CHECK-NEXT: call i8* @objc_autoreleasePoolPush() ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test43b(i8* %p) { @@ -1410,7 +1412,7 @@ entry: call i8* @objc_retain(i8* %p) call void @use_pointer(i8* %p) call void @use_pointer(i8* %p) - call void @objc_autoreleasePoolPush() + call i8* @objc_autoreleasePoolPush() call void @objc_release(i8* %p) ret void } @@ -1797,6 +1799,78 @@ exit: ret void } +; Move an autorelease past a phi with a null. + +; CHECK: define i8* @test65( +; CHECK: if.then: +; CHECK: call i8* @objc_autorelease( +; CHECK: return: +; CHECK-NOT: @objc_autorelease +; CHECK: } +define i8* @test65(i1 %x) { +entry: + br i1 %x, label %return, label %if.then + +if.then: ; preds = %entry + %c = call i8* @returner() + %s = call i8* @objc_retainAutoreleasedReturnValue(i8* %c) nounwind + br label %return + +return: ; preds = %if.then, %entry + %retval = phi i8* [ %s, %if.then ], [ null, %entry ] + %q = call i8* @objc_autorelease(i8* %retval) nounwind + ret i8* %retval +} + +; Don't move an autorelease past an autorelease pool boundary. + +; CHECK: define i8* @test65b( +; CHECK: if.then: +; CHECK-NOT: @objc_autorelease +; CHECK: return: +; CHECK: call i8* @objc_autorelease( +; CHECK: } +define i8* @test65b(i1 %x) { +entry: + %t = call i8* @objc_autoreleasePoolPush() + br i1 %x, label %return, label %if.then + +if.then: ; preds = %entry + %c = call i8* @returner() + %s = call i8* @objc_retainAutoreleasedReturnValue(i8* %c) nounwind + br label %return + +return: ; preds = %if.then, %entry + %retval = phi i8* [ %s, %if.then ], [ null, %entry ] + call void @objc_autoreleasePoolPop(i8* %t) + %q = call i8* @objc_autorelease(i8* %retval) nounwind + ret i8* %retval +} + +; Don't move an autoreleaseReturnValue, which would break +; the RV optimization. + +; CHECK: define i8* @test65c( +; CHECK: if.then: +; CHECK-NOT: @objc_autorelease +; CHECK: return: +; CHECK: call i8* @objc_autoreleaseReturnValue( +; CHECK: } +define i8* @test65c(i1 %x) { +entry: + br i1 %x, label %return, label %if.then + +if.then: ; preds = %entry + %c = call i8* @returner() + %s = call i8* @objc_retainAutoreleasedReturnValue(i8* %c) nounwind + br label %return + +return: ; preds = %if.then, %entry + %retval = phi i8* [ %s, %if.then ], [ null, %entry ] + %q = call i8* @objc_autoreleaseReturnValue(i8* %retval) nounwind + ret i8* %retval +} + declare void @bar(i32 ()*) ; A few real-world testcases.
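The @test65 transformation above hinges on one fact: autoreleasing a null pointer does nothing. A condensed before/after sketch of the same pattern (the @make helper is illustrative):

declare i8* @objc_autorelease(i8*)
declare i8* @make()

; before: the autorelease applies to a phi that merges %s with null
define i8* @before(i1 %x) {
entry:
  br i1 %x, label %return, label %if.then

if.then:
  %s = call i8* @make()
  br label %return

return:
  %retval = phi i8* [ %s, %if.then ], [ null, %entry ]
  %q = call i8* @objc_autorelease(i8* %retval)
  ret i8* %retval
}

; after: since autorelease of null is a no-op, the call moves into the
; arm that produces the non-null value
define i8* @after(i1 %x) {
entry:
  br i1 %x, label %return, label %if.then

if.then:
  %s = call i8* @make()
  %q = call i8* @objc_autorelease(i8* %s)
  br label %return

return:
  %retval = phi i8* [ %s, %if.then ], [ null, %entry ]
  ret i8* %retval
}

As test65b and test65c show, the movement is blocked by an intervening autoreleasePoolPop and is never applied to autoreleaseReturnValue, whose position is significant.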
diff --git a/test/Transforms/ObjCARC/contract.ll b/test/Transforms/ObjCARC/contract.ll index 04ae3ca..c48f8a5 100644 --- a/test/Transforms/ObjCARC/contract.ll +++ b/test/Transforms/ObjCARC/contract.ll @@ -143,3 +143,21 @@ define i8* @test7(i8* %p) { %2 = tail call i8* @objc_autoreleaseReturnValue(i8* %p) ret i8* %p } + +; Do the return value substitution for PHI nodes too. + +; CHECK: define i8* @test8( +; CHECK: %retval = phi i8* [ %p, %if.then ], [ null, %entry ] +; CHECK: } +define i8* @test8(i1 %x, i8* %c) { +entry: + br i1 %x, label %return, label %if.then + +if.then: ; preds = %entry + %p = call i8* @objc_retain(i8* %c) nounwind + br label %return + +return: ; preds = %if.then, %entry + %retval = phi i8* [ %c, %if.then ], [ null, %entry ] + ret i8* %retval +} diff --git a/test/Transforms/ObjCARC/escape.ll b/test/Transforms/ObjCARC/escape.ll new file mode 100644 index 0000000..3f694cf --- /dev/null +++ b/test/Transforms/ObjCARC/escape.ll @@ -0,0 +1,131 @@ +; RUN: opt -objc-arc -S < %s | FileCheck %s +; rdar://11229925 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +%struct.__block_byref_weakLogNTimes = type { i8*, %struct.__block_byref_weakLogNTimes*, i32, i32, i8*, i8*, void (...)* } +%struct.__block_descriptor = type { i64, i64 } + +; Don't optimize away the retainBlock, because the object's address "escapes" +; with the objc_storeWeak call. + +; CHECK: define void @test0( +; CHECK: %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0 +; CHECK: call void @objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0 +; CHECK: } +define void @test0() nounwind { +entry: + %weakLogNTimes = alloca %struct.__block_byref_weakLogNTimes, align 8 + %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, align 8 + %byref.isa = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 0 + store i8* null, i8** %byref.isa, align 8 + %byref.forwarding = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 1 + store %struct.__block_byref_weakLogNTimes* %weakLogNTimes, %struct.__block_byref_weakLogNTimes** %byref.forwarding, align 8 + %byref.flags = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 2 + store i32 33554432, i32* %byref.flags, align 8 + %byref.size = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 3 + store i32 48, i32* %byref.size, align 4 + %tmp1 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 4 + store i8* bitcast (void (i8*, i8*)* @__Block_byref_object_copy_ to i8*), i8** %tmp1, align 8 + %tmp2 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 5 + store i8* bitcast (void (i8*)* @__Block_byref_object_dispose_ to i8*), i8** %tmp2, align 8 + %weakLogNTimes1 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 6 + %tmp3 = bitcast void (...)** %weakLogNTimes1 to i8** + %tmp4 = call i8* @objc_initWeak(i8** %tmp3, i8* null) nounwind + %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 0 + store i8* null, i8** %block.isa, align 8 + %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 1 + store i32 1107296256, i32* %block.flags, align 8 + 
%block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 2 + store i32 0, i32* %block.reserved, align 4 + %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 3 + store i8* bitcast (void (i8*, i32)* @__main_block_invoke_0 to i8*), i8** %block.invoke, align 8 + %block.descriptor = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 4 + store %struct.__block_descriptor* null, %struct.__block_descriptor** %block.descriptor, align 8 + %block.captured = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 5 + %tmp5 = bitcast %struct.__block_byref_weakLogNTimes* %weakLogNTimes to i8* + store i8* %tmp5, i8** %block.captured, align 8 + %tmp6 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block to i8* + %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0 + %tmp8 = load %struct.__block_byref_weakLogNTimes** %byref.forwarding, align 8 + %weakLogNTimes3 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %tmp8, i64 0, i32 6 + %tmp9 = bitcast void (...)** %weakLogNTimes3 to i8** + %tmp10 = call i8* @objc_storeWeak(i8** %tmp9, i8* %tmp7) nounwind + %tmp11 = getelementptr inbounds i8* %tmp7, i64 16 + %tmp12 = bitcast i8* %tmp11 to i8** + %tmp13 = load i8** %tmp12, align 8 + %tmp14 = bitcast i8* %tmp13 to void (i8*, i32)* + call void %tmp14(i8* %tmp7, i32 10) nounwind, !clang.arc.no_objc_arc_exceptions !0 + call void @objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0 + call void @_Block_object_dispose(i8* %tmp5, i32 8) nounwind + call void @objc_destroyWeak(i8** %tmp3) nounwind + ret void +} + +; Like test0, but it makes a regular call instead of a storeWeak call, +; so the optimization is valid. 
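Distilled, the rule is: a retainBlock marked !clang.arc.copy_on_escape is removable as long as the block pointer is not published the way objc_storeWeak publishes it; merely passing the pointer as a call argument, as test1 below does, is not treated as an escape. A minimal sketch of the removable shape (@use_block is illustrative):

declare i8* @objc_retainBlock(i8*)
declare void @objc_release(i8*)
declare void @use_block(i8*)

define void @no_escape(i8* %blk) {
entry:
  ; %b is only ever a call argument, so the pair below can be removed
  %b = call i8* @objc_retainBlock(i8* %blk) nounwind, !clang.arc.copy_on_escape !0
  call void @use_block(i8* %b)
  call void @objc_release(i8* %b) nounwind, !clang.imprecise_release !0
  ret void
}

!0 = metadata !{}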
+ +; CHECK: define void @test1( +; CHECK-NOT: @objc_retainBlock +; CHECK: } +define void @test1() nounwind { +entry: + %weakLogNTimes = alloca %struct.__block_byref_weakLogNTimes, align 8 + %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, align 8 + %byref.isa = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 0 + store i8* null, i8** %byref.isa, align 8 + %byref.forwarding = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 1 + store %struct.__block_byref_weakLogNTimes* %weakLogNTimes, %struct.__block_byref_weakLogNTimes** %byref.forwarding, align 8 + %byref.flags = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 2 + store i32 33554432, i32* %byref.flags, align 8 + %byref.size = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 3 + store i32 48, i32* %byref.size, align 4 + %tmp1 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 4 + store i8* bitcast (void (i8*, i8*)* @__Block_byref_object_copy_ to i8*), i8** %tmp1, align 8 + %tmp2 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 5 + store i8* bitcast (void (i8*)* @__Block_byref_object_dispose_ to i8*), i8** %tmp2, align 8 + %weakLogNTimes1 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %weakLogNTimes, i64 0, i32 6 + %tmp3 = bitcast void (...)** %weakLogNTimes1 to i8** + %tmp4 = call i8* @objc_initWeak(i8** %tmp3, i8* null) nounwind + %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 0 + store i8* null, i8** %block.isa, align 8 + %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 1 + store i32 1107296256, i32* %block.flags, align 8 + %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 2 + store i32 0, i32* %block.reserved, align 4 + %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 3 + store i8* bitcast (void (i8*, i32)* @__main_block_invoke_0 to i8*), i8** %block.invoke, align 8 + %block.descriptor = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 4 + store %struct.__block_descriptor* null, %struct.__block_descriptor** %block.descriptor, align 8 + %block.captured = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 5 + %tmp5 = bitcast %struct.__block_byref_weakLogNTimes* %weakLogNTimes to i8* + store i8* %tmp5, i8** %block.captured, align 8 + %tmp6 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block to i8* + %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0 + %tmp8 = load %struct.__block_byref_weakLogNTimes** %byref.forwarding, align 8 + %weakLogNTimes3 = getelementptr inbounds %struct.__block_byref_weakLogNTimes* %tmp8, i64 0, i32 6 + %tmp9 = bitcast void (...)** %weakLogNTimes3 to i8** + %tmp10 = call i8* @not_really_objc_storeWeak(i8** %tmp9, i8* %tmp7) nounwind + %tmp11 = getelementptr inbounds i8* %tmp7, i64 16 + %tmp12 = bitcast i8* %tmp11 to i8** + %tmp13 = load i8** %tmp12, align 8 + %tmp14 = bitcast i8* %tmp13 to void (i8*, i32)* + call void %tmp14(i8* %tmp7, i32 10) nounwind, !clang.arc.no_objc_arc_exceptions !0 + call void 
@objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0 + call void @_Block_object_dispose(i8* %tmp5, i32 8) nounwind + call void @objc_destroyWeak(i8** %tmp3) nounwind + ret void +} + +declare void @__Block_byref_object_copy_(i8*, i8*) nounwind +declare void @__Block_byref_object_dispose_(i8*) nounwind +declare void @objc_destroyWeak(i8**) +declare i8* @objc_initWeak(i8**, i8*) +declare void @__main_block_invoke_0(i8* nocapture, i32) nounwind ssp +declare void @_Block_object_dispose(i8*, i32) +declare i8* @objc_retainBlock(i8*) +declare i8* @objc_storeWeak(i8**, i8*) +declare i8* @not_really_objc_storeWeak(i8**, i8*) +declare void @objc_release(i8*) + +!0 = metadata !{} diff --git a/test/Transforms/ObjCARC/invoke.ll b/test/Transforms/ObjCARC/invoke.ll index 9e26209..76e82a5 100644 --- a/test/Transforms/ObjCARC/invoke.ll +++ b/test/Transforms/ObjCARC/invoke.ll @@ -6,6 +6,7 @@ declare i8* @objc_retainAutoreleasedReturnValue(i8*) declare i8* @objc_msgSend(i8*, i8*, ...) declare void @use_pointer(i8*) declare void @callee() +declare i8* @returner() ; ARCOpt shouldn't try to move the releases to the block containing the invoke. @@ -103,6 +104,114 @@ finally.rethrow: ; preds = %invoke.cont, %entry unreachable } +; Don't try to place code on invoke critical edges. + +; CHECK: define void @test3( +; CHECK: if.end: +; CHECK-NEXT: call void @objc_release(i8* %p) nounwind +; CHECK-NEXT: ret void +define void @test3(i8* %p, i1 %b) { +entry: + %0 = call i8* @objc_retain(i8* %p) + call void @callee() + br i1 %b, label %if.else, label %if.then + +if.then: + invoke void @use_pointer(i8* %p) + to label %if.end unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0 + +if.else: + invoke void @use_pointer(i8* %p) + to label %if.end unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0 + +lpad: + %r = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) + cleanup + ret void + +if.end: + call void @objc_release(i8* %p) + ret void +} + +; Like test3, but with ARC-relevant exception handling. + +; CHECK: define void @test4( +; CHECK: lpad: +; CHECK-NEXT: %r = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) +; CHECK-NEXT: cleanup +; CHECK-NEXT: call void @objc_release(i8* %p) nounwind +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: call void @objc_release(i8* %p) nounwind +; CHECK-NEXT: ret void +define void @test4(i8* %p, i1 %b) { +entry: + %0 = call i8* @objc_retain(i8* %p) + call void @callee() + br i1 %b, label %if.else, label %if.then + +if.then: + invoke void @use_pointer(i8* %p) + to label %if.end unwind label %lpad + +if.else: + invoke void @use_pointer(i8* %p) + to label %if.end unwind label %lpad + +lpad: + %r = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) + cleanup + call void @objc_release(i8* %p) + ret void + +if.end: + call void @objc_release(i8* %p) + ret void +} + +; Don't turn the retainAutoreleasedReturnValue into retain, because it's +; for an invoke which we can assume codegen will put immediately prior.
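Roughly, the return-value optimization is a handshake: objc_retainAutoreleasedReturnValue only gets its cheap semantics when it executes immediately after the call or invoke producing the value, and for an invoke that position is the front of the normal destination block, which is where test5 below keeps it. A straight-line sketch of the required adjacency:

declare i8* @returner()
declare i8* @objc_retainAutoreleasedReturnValue(i8*)

define i8* @handshake() {
entry:
  %r = call i8* @returner()
  ; must directly follow the producing call; with intervening code
  ; (as in test6 below) only a plain objc_retain is safe
  %s = call i8* @objc_retainAutoreleasedReturnValue(i8* %r) nounwind
  ret i8* %s
}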
+ +; CHECK: define void @test5( +; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %z) +; CHECK: } +define void @test5() { +entry: + %z = invoke i8* @returner() + to label %if.end unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0 + +lpad: + %r13 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) + cleanup + ret void + +if.end: + call i8* @objc_retainAutoreleasedReturnValue(i8* %z) + ret void +} + +; Like test5, but there's intervening code. + +; CHECK: define void @test6( +; CHECK: call i8* @objc_retain(i8* %z) +; CHECK: } +define void @test6() { +entry: + %z = invoke i8* @returner() + to label %if.end unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0 + +lpad: + %r13 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) + cleanup + ret void + +if.end: + call void @callee() + call i8* @objc_retainAutoreleasedReturnValue(i8* %z) + ret void +} + declare i32 @__gxx_personality_v0(...) declare i32 @__objc_personality_v0(...) diff --git a/test/Transforms/ObjCARC/pr12270.ll b/test/Transforms/ObjCARC/pr12270.ll index 30610f8..1faae5f 100644 --- a/test/Transforms/ObjCARC/pr12270.ll +++ b/test/Transforms/ObjCARC/pr12270.ll @@ -9,7 +9,13 @@ entry: return: ; No predecessors! %bar = bitcast %2* %x to i8* %foo = call i8* @objc_autoreleaseReturnValue(i8* %bar) nounwind + call void @callee() + call void @use_pointer(i8* %foo) + call void @objc_release(i8* %foo) nounwind ret void } declare i8* @objc_autoreleaseReturnValue(i8*) +declare void @objc_release(i8*) +declare void @callee() +declare void @use_pointer(i8*) diff --git a/test/Transforms/ObjCARC/retain-not-declared.ll b/test/Transforms/ObjCARC/retain-not-declared.ll index 41bde01..f876e51 100644 --- a/test/Transforms/ObjCARC/retain-not-declared.ll +++ b/test/Transforms/ObjCARC/retain-not-declared.ll @@ -30,7 +30,7 @@ entry: ; CHECK: @test1( ; CHECK: @objc_retain( -; CHECK: @objc_retain( +; CHECK: @objc_retainAutoreleasedReturnValue( ; CHECK: @objc_release( ; CHECK: @objc_release( ; CHECK: } diff --git a/test/Transforms/PhaseOrdering/PR6627.ll b/test/Transforms/PhaseOrdering/PR6627.ll new file mode 100644 index 0000000..ef9947f --- /dev/null +++ b/test/Transforms/PhaseOrdering/PR6627.ll @@ -0,0 +1,93 @@ +; RUN: opt -O3 -S %s | FileCheck %s +; XFAIL: * + +declare i32 @doo(...) + +; PR6627 - This whole nasty sequence should be flattened down to a single +; 32-bit comparison. 
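For reference, the magic constant in the CHECK lines falls straight out of the byte values being compared: 127, 69, 76 and 70 are '\7F', 'E', 'L', 'F' (the ELF magic), and a single little-endian i32 load of those four bytes gives (70 << 24) + (76 << 16) + (69 << 8) + 127 = 0x464C457F = 1179403647, which is exactly the value the flattened compare is checked against.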
+define void @test2(i8* %arrayidx) nounwind ssp { +entry: + %xx = bitcast i8* %arrayidx to i32* + %x1 = load i32* %xx, align 4 + %tmp = trunc i32 %x1 to i8 + %conv = zext i8 %tmp to i32 + %cmp = icmp eq i32 %conv, 127 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1 + %tmp5 = load i8* %arrayidx4, align 1 + %conv6 = zext i8 %tmp5 to i32 + %cmp7 = icmp eq i32 %conv6, 69 + br i1 %cmp7, label %land.lhs.true9, label %if.end + +land.lhs.true9: ; preds = %land.lhs.true + %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2 + %tmp13 = load i8* %arrayidx12, align 1 + %conv14 = zext i8 %tmp13 to i32 + %cmp15 = icmp eq i32 %conv14, 76 + br i1 %cmp15, label %land.lhs.true17, label %if.end + +land.lhs.true17: ; preds = %land.lhs.true9 + %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3 + %tmp21 = load i8* %arrayidx20, align 1 + %conv22 = zext i8 %tmp21 to i32 + %cmp23 = icmp eq i32 %conv22, 70 + br i1 %cmp23, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true17 + %call25 = call i32 (...)* @doo() + br label %if.end + +if.end: + ret void + +; CHECK: @test2 +; CHECK: %x1 = load i32* %xx, align 4 +; CHECK-NEXT: icmp eq i32 %x1, 1179403647 +; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end +} + +; PR6627 - This should all be flattened down to one compare. This is the same +; as test2, except that the initial load is done as an i8 instead of i32, thus +; requiring widening. +define void @test2a(i8* %arrayidx) nounwind ssp { +entry: + %x1 = load i8* %arrayidx, align 4 + %conv = zext i8 %x1 to i32 + %cmp = icmp eq i32 %conv, 127 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1 + %tmp5 = load i8* %arrayidx4, align 1 + %conv6 = zext i8 %tmp5 to i32 + %cmp7 = icmp eq i32 %conv6, 69 + br i1 %cmp7, label %land.lhs.true9, label %if.end + +land.lhs.true9: ; preds = %land.lhs.true + %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2 + %tmp13 = load i8* %arrayidx12, align 1 + %conv14 = zext i8 %tmp13 to i32 + %cmp15 = icmp eq i32 %conv14, 76 + br i1 %cmp15, label %land.lhs.true17, label %if.end + +land.lhs.true17: ; preds = %land.lhs.true9 + %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3 + %tmp21 = load i8* %arrayidx20, align 1 + %conv22 = zext i8 %tmp21 to i32 + %cmp23 = icmp eq i32 %conv22, 70 + br i1 %cmp23, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true17 + %call25 = call i32 (...)* @doo() + br label %if.end + +if.end: + ret void + +; CHECK: @test2a +; CHECK: %x1 = load i32* {{.*}}, align 4 +; CHECK-NEXT: icmp eq i32 %x1, 1179403647 +; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end +} diff --git a/test/Transforms/PhaseOrdering/basic.ll b/test/Transforms/PhaseOrdering/basic.ll index e5b2ba4..2d52ae5 100644 --- a/test/Transforms/PhaseOrdering/basic.ll +++ b/test/Transforms/PhaseOrdering/basic.ll @@ -1,5 +1,4 @@ ; RUN: opt -O3 -S %s | FileCheck %s -; XFAIL: * target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-macosx10.6.7" @@ -23,96 +22,3 @@ define void @test1() nounwind ssp { ; CHECK: @test1 ; CHECK-NEXT: ret void } - - -; PR6627 - This whole nasty sequence should be flattened down to a single -; 32-bit comparison. 
-define void @test2(i8* %arrayidx) nounwind ssp { -entry: - %xx = bitcast i8* %arrayidx to i32* - %x1 = load i32* %xx, align 4 - %tmp = trunc i32 %x1 to i8 - %conv = zext i8 %tmp to i32 - %cmp = icmp eq i32 %conv, 127 - br i1 %cmp, label %land.lhs.true, label %if.end - -land.lhs.true: ; preds = %entry - %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1 - %tmp5 = load i8* %arrayidx4, align 1 - %conv6 = zext i8 %tmp5 to i32 - %cmp7 = icmp eq i32 %conv6, 69 - br i1 %cmp7, label %land.lhs.true9, label %if.end - -land.lhs.true9: ; preds = %land.lhs.true - %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2 - %tmp13 = load i8* %arrayidx12, align 1 - %conv14 = zext i8 %tmp13 to i32 - %cmp15 = icmp eq i32 %conv14, 76 - br i1 %cmp15, label %land.lhs.true17, label %if.end - -land.lhs.true17: ; preds = %land.lhs.true9 - %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3 - %tmp21 = load i8* %arrayidx20, align 1 - %conv22 = zext i8 %tmp21 to i32 - %cmp23 = icmp eq i32 %conv22, 70 - br i1 %cmp23, label %if.then, label %if.end - -if.then: ; preds = %land.lhs.true17 - %call25 = call i32 (...)* @doo() - br label %if.end - -if.end: - ret void - -; CHECK: @test2 -; CHECK: %x1 = load i32* %xx, align 4 -; CHECK-NEXT: icmp eq i32 %x1, 1179403647 -; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end -} - -declare i32 @doo(...) - -; PR6627 - This should all be flattened down to one compare. This is the same -; as test2, except that the initial load is done as an i8 instead of i32, thus -; requiring widening. -define void @test2a(i8* %arrayidx) nounwind ssp { -entry: - %x1 = load i8* %arrayidx, align 4 - %conv = zext i8 %x1 to i32 - %cmp = icmp eq i32 %conv, 127 - br i1 %cmp, label %land.lhs.true, label %if.end - -land.lhs.true: ; preds = %entry - %arrayidx4 = getelementptr inbounds i8* %arrayidx, i64 1 - %tmp5 = load i8* %arrayidx4, align 1 - %conv6 = zext i8 %tmp5 to i32 - %cmp7 = icmp eq i32 %conv6, 69 - br i1 %cmp7, label %land.lhs.true9, label %if.end - -land.lhs.true9: ; preds = %land.lhs.true - %arrayidx12 = getelementptr inbounds i8* %arrayidx, i64 2 - %tmp13 = load i8* %arrayidx12, align 1 - %conv14 = zext i8 %tmp13 to i32 - %cmp15 = icmp eq i32 %conv14, 76 - br i1 %cmp15, label %land.lhs.true17, label %if.end - -land.lhs.true17: ; preds = %land.lhs.true9 - %arrayidx20 = getelementptr inbounds i8* %arrayidx, i64 3 - %tmp21 = load i8* %arrayidx20, align 1 - %conv22 = zext i8 %tmp21 to i32 - %cmp23 = icmp eq i32 %conv22, 70 - br i1 %cmp23, label %if.then, label %if.end - -if.then: ; preds = %land.lhs.true17 - %call25 = call i32 (...)* @doo() - br label %if.end - -if.end: - ret void - -; CHECK: @test2a -; CHECK: %x1 = load i32* {{.*}}, align 4 -; CHECK-NEXT: icmp eq i32 %x1, 1179403647 -; CHECK-NEXT: br i1 {{.*}}, label %if.then, label %if.end -} - diff --git a/test/Transforms/SimplifyLibCalls/floor.ll b/test/Transforms/SimplifyLibCalls/floor.ll index 8780e32..03dcdf5 100644 --- a/test/Transforms/SimplifyLibCalls/floor.ll +++ b/test/Transforms/SimplifyLibCalls/floor.ll @@ -1,16 +1,31 @@ -; RUN: opt < %s -simplify-libcalls -S > %t -; RUN: not grep {call.*floor(} %t -; RUN: grep {call.*floorf(} %t -; RUN: not grep {call.*ceil(} %t -; RUN: grep {call.*ceilf(} %t -; RUN: not grep {call.*nearbyint(} %t -; RUN: grep {call.*nearbyintf(} %t -; XFAIL: sparc +; RUN: opt < %s -simplify-libcalls -S -mtriple "i386-pc-linux" | FileCheck -check-prefix=DO-SIMPLIFY %s +; RUN: opt < %s -simplify-libcalls -S -mtriple "i386-pc-win32" | FileCheck -check-prefix=DONT-SIMPLIFY %s +; RUN: opt < %s 
-simplify-libcalls -S -mtriple "x86_64-pc-win32" | FileCheck -check-prefix=C89-SIMPLIFY %s +; RUN: opt < %s -simplify-libcalls -S -mtriple "i386-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY %s +; RUN: opt < %s -simplify-libcalls -S -mtriple "x86_64-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY %s +; RUN: opt < %s -simplify-libcalls -S -mtriple "sparc-sun-solaris" | FileCheck -check-prefix=DO-SIMPLIFY %s + +; DO-SIMPLIFY: call float @floorf( +; DO-SIMPLIFY: call float @ceilf( +; DO-SIMPLIFY: call float @roundf( +; DO-SIMPLIFY: call float @nearbyintf( + +; C89-SIMPLIFY: call float @floorf( +; C89-SIMPLIFY: call float @ceilf( +; C89-SIMPLIFY: call double @round( +; C89-SIMPLIFY: call double @nearbyint( + +; DONT-SIMPLIFY: call double @floor( +; DONT-SIMPLIFY: call double @ceil( +; DONT-SIMPLIFY: call double @round( +; DONT-SIMPLIFY: call double @nearbyint( declare double @floor(double) declare double @ceil(double) +declare double @round(double) + declare double @nearbyint(double) define float @test_floor(float %C) { @@ -29,8 +44,14 @@ define float @test_ceil(float %C) { ret float %F } -; PR8466 -; XFAIL: win32 +define float @test_round(float %C) { + %D = fpext float %C to double ; <double> [#uses=1] + ; --> roundf + %E = call double @round( double %D ) ; <double> [#uses=1] + %F = fptrunc double %E to float ; <float> [#uses=1] + ret float %F +} + define float @test_nearbyint(float %C) { %D = fpext float %C to double ; <double> [#uses=1] ; --> nearbyintf diff --git a/test/Transforms/SimplifyLibCalls/win-math.ll b/test/Transforms/SimplifyLibCalls/win-math.ll new file mode 100644 index 0000000..367e5b8 --- /dev/null +++ b/test/Transforms/SimplifyLibCalls/win-math.ll @@ -0,0 +1,274 @@ +; RUN: opt -O2 -S -mtriple=i386-pc-win32 < %s | FileCheck %s -check-prefix=WIN32 +; RUN: opt -O2 -S -mtriple=x86_64-pc-win32 < %s | FileCheck %s -check-prefix=WIN64 +; RUN: opt -O2 -S -mtriple=i386-pc-mingw32 < %s | FileCheck %s -check-prefix=MINGW32 +; RUN: opt -O2 -S -mtriple=x86_64-pc-mingw32 < %s | FileCheck %s -check-prefix=MINGW64 + +; x86 win32 msvcrt does not provide entry points for single-precision libm. +; x86-64 win32 msvcrt does (except for fabsf). +; msvcrt does not provide C99 math, but mingw32 does.
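Every function below probes the same shrinking pattern: single-precision math gets modeled as promote to double, call the double-precision routine, truncate back, and the simplifier may collapse that to the float entry point only when the target's C library is known to provide one. Schematically, with floor as the example (the @promoted/@shrunk names are illustrative):

declare double @floor(double)
declare float @floorf(float)

; before: float value routed through the double-precision routine
define float @promoted(float %x) {
  %d = fpext float %x to double
  %e = call double @floor(double %d)
  %f = fptrunc double %e to float
  ret float %f
}

; after, only on targets whose runtime supplies floorf:
define float @shrunk(float %x) {
  %f = call float @floorf(float %x)
  ret float %f
}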
+ +declare double @acos(double %x) +define float @float_acos(float %x) nounwind readnone { +; WIN32: @float_acos +; WIN32-NOT: float @acosf +; WIN32: double @acos + %1 = fpext float %x to double + %2 = call double @acos(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @asin(double %x) +define float @float_asin(float %x) nounwind readnone { +; WIN32: @float_asin +; WIN32-NOT: float @asinf +; WIN32: double @asin + %1 = fpext float %x to double + %2 = call double @asin(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @atan(double %x) +define float @float_atan(float %x) nounwind readnone { +; WIN32: @float_atan +; WIN32-NOT: float @atanf +; WIN32: double @atan + %1 = fpext float %x to double + %2 = call double @atan(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @atan2(double %x, double %y) +define float @float_atan2(float %x, float %y) nounwind readnone { +; WIN32: @float_atan2 +; WIN32-NOT: float @atan2f +; WIN32: double @atan2 + %1 = fpext float %x to double + %2 = fpext float %y to double + %3 = call double @atan2(double %1, double %2) + %4 = fptrunc double %3 to float + ret float %4 +} + +declare double @ceil(double %x) +define float @float_ceil(float %x) nounwind readnone { +; WIN32: @float_ceil +; WIN32-NOT: float @ceilf +; WIN32: double @ceil +; WIN64: @float_ceil +; WIN64: float @ceilf +; WIN64-NOT: double @ceil +; MINGW32: @float_ceil +; MINGW32: float @ceilf +; MINGW32-NOT: double @ceil +; MINGW64: @float_ceil +; MINGW64: float @ceilf +; MINGW64-NOT: double @ceil + %1 = fpext float %x to double + %2 = call double @ceil(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @_copysign(double %x, double %y) +define float @float_copysign(float %x, float %y) nounwind readnone { +; WIN32: @float_copysign +; WIN32-NOT: float @copysignf +; WIN32-NOT: float @_copysignf +; WIN32: double @_copysign + %1 = fpext float %x to double + %2 = fpext float %y to double + %3 = call double @_copysign(double %1, double %2) + %4 = fptrunc double %3 to float + ret float %4 +} + +declare double @cos(double %x) +define float @float_cos(float %x) nounwind readnone { +; WIN32: @float_cos +; WIN32-NOT: float @cosf +; WIN32: double @cos + %1 = fpext float %x to double + %2 = call double @cos(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @cosh(double %x) +define float @float_cosh(float %x) nounwind readnone { +; WIN32: @float_cosh +; WIN32-NOT: float @coshf +; WIN32: double @cosh + %1 = fpext float %x to double + %2 = call double @cosh(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @exp(double %x) +define float @float_exp(float %x) nounwind readnone { +; WIN32: @float_exp +; WIN32-NOT: float @expf +; WIN32: double @exp + %1 = fpext float %x to double + %2 = call double @exp(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @fabs(double %x) +define float @float_fabs(float %x) nounwind readnone { +; WIN32: @float_fabs +; WIN32-NOT: float @fabsf +; WIN32: double @fabs +; WIN64: @float_fabs +; WIN64-NOT: float @fabsf +; WIN64: double @fabs + %1 = fpext float %x to double + %2 = call double @fabs(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @floor(double %x) +define float @float_floor(float %x) nounwind readnone { +; WIN32: @float_floor +; WIN32-NOT: float @floorf +; WIN32: double @floor +; WIN64:
@float_floor +; WIN64: float @floorf +; WIN64-NOT: double @floor +; MINGW32: @float_floor +; MINGW32: float @floorf +; MINGW32-NOT: double @floor +; MINGW64: @float_floor +; MINGW64: float @floorf +; MINGW64-NOT: double @floor + %1 = fpext float %x to double + %2 = call double @floor(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @fmod(double %x, double %y) +define float @float_fmod(float %x, float %y) nounwind readnone { +; WIN32: @float_fmod +; WIN32-NOT: float @fmodf +; WIN32: double @fmod + %1 = fpext float %x to double + %2 = fpext float %y to double + %3 = call double @fmod(double %1, double %2) + %4 = fptrunc double %3 to float + ret float %4 +} + +declare double @log(double %x) +define float @float_log(float %x) nounwind readnone { +; WIN32: @float_log +; WIN32-NOT: float @logf +; WIN32: double @log + %1 = fpext float %x to double + %2 = call double @log(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @pow(double %x, double %y) +define float @float_pow(float %x, float %y) nounwind readnone { +; WIN32: @float_pow +; WIN32-NOT: float @powf +; WIN32: double @pow + %1 = fpext float %x to double + %2 = fpext float %y to double + %3 = call double @pow(double %1, double %2) + %4 = fptrunc double %3 to float + ret float %4 +} + +declare double @sin(double %x) +define float @float_sin(float %x) nounwind readnone { +; WIN32: @float_sin +; WIN32-NOT: float @sinf +; WIN32: double @sin + %1 = fpext float %x to double + %2 = call double @sin(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @sinh(double %x) +define float @float_sinh(float %x) nounwind readnone { +; WIN32: @float_sinh +; WIN32-NOT: float @sinhf +; WIN32: double @sinh + %1 = fpext float %x to double + %2 = call double @sinh(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @sqrt(double %x) +define float @float_sqrt(float %x) nounwind readnone { +; WIN32: @float_sqrt +; WIN32-NOT: float @sqrtf +; WIN32: double @sqrt +; WIN64: @float_sqrt +; WIN64: float @sqrtf +; WIN64-NOT: double @sqrt +; MINGW32: @float_sqrt +; MINGW32: float @sqrtf +; MINGW32-NOT: double @sqrt +; MINGW64: @float_sqrt +; MINGW64: float @sqrtf +; MINGW64-NOT: double @sqrt + %1 = fpext float %x to double + %2 = call double @sqrt(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @tan(double %x) +define float @float_tan(float %x) nounwind readnone { +; WIN32: @float_tan +; WIN32-NOT: float @tanf +; WIN32: double @tan + %1 = fpext float %x to double + %2 = call double @tan(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +declare double @tanh(double %x) +define float @float_tanh(float %x) nounwind readnone { +; WIN32: @float_tanh +; WIN32-NOT: float @tanhf +; WIN32: double @tanh + %1 = fpext float %x to double + %2 = call double @tanh(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + +; win32 does not have round; mingw32 does +declare double @round(double %x) +define float @float_round(float %x) nounwind readnone { +; WIN32: @float_round +; WIN32-NOT: float @roundf +; WIN32: double @round +; WIN64: @float_round +; WIN64-NOT: float @roundf +; WIN64: double @round +; MINGW32: @float_round +; MINGW32: float @roundf +; MINGW32-NOT: double @round +; MINGW64: @float_round +; MINGW64: float @roundf +; MINGW64-NOT: double @round + %1 = fpext float %x to double + %2 = call double @round(double %1) + %3 = fptrunc double %2 to float + ret float %3 +} + diff --git 
a/test/Transforms/TailDup/X86/lit.local.cfg b/test/Transforms/TailDup/X86/lit.local.cfg index 84bd88c..da2db5a 100644 --- a/test/Transforms/TailDup/X86/lit.local.cfg +++ b/test/Transforms/TailDup/X86/lit.local.cfg @@ -1,13 +1,6 @@ config.suffixes = ['.ll'] -def getRoot(config): - if not config.parent: - return config - return getRoot(config.parent) - -root = getRoot(config) - -targets = set(root.targets_to_build.split()) +targets = set(config.root.targets_to_build.split()) if not 'X86' in targets: config.unsupported = True diff --git a/test/Transforms/TailDup/lit.local.cfg b/test/Transforms/TailDup/lit.local.cfg index 39c8039..18c604a 100644 --- a/test/Transforms/TailDup/lit.local.cfg +++ b/test/Transforms/TailDup/lit.local.cfg @@ -1,12 +1,5 @@ config.suffixes = ['.ll', '.c', '.cpp'] -def getRoot(config): - if not config.parent: - return config - return getRoot(config.parent) - -root = getRoot(config) - -targets = set(root.targets_to_build.split()) +targets = set(config.root.targets_to_build.split()) if not 'X86' in targets: config.unsupported = True