Diffstat (limited to 'test/CodeGen/ARM')
55 files changed, 1358 insertions, 85 deletions
diff --git a/test/CodeGen/ARM/2007-03-13-InstrSched.ll b/test/CodeGen/ARM/2007-03-13-InstrSched.ll index 33f935e..a63cdd4 100644 --- a/test/CodeGen/ARM/2007-03-13-InstrSched.ll +++ b/test/CodeGen/ARM/2007-03-13-InstrSched.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \ ; RUN: -mattr=+v6 | grep r9 ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \ -; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats |& grep asm-printer +; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats 2>&1 | grep asm-printer ; | grep 35 define void @test(i32 %tmp56222, i32 %tmp36224, i32 %tmp46223, i32 %i.0196.0.ph, i32 %tmp8, i32* %tmp1011, i32** %tmp1, i32* %d2.1.out, i32* %d3.1.out, i32* %d0.1.out, i32* %d1.1.out) { diff --git a/test/CodeGen/ARM/2007-04-03-PEIBug.ll b/test/CodeGen/ARM/2007-04-03-PEIBug.ll index b543c57..8d3337c 100644 --- a/test/CodeGen/ARM/2007-04-03-PEIBug.ll +++ b/test/CodeGen/ARM/2007-04-03-PEIBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm | not grep {add.*#0} +; RUN: llc < %s -march=arm | not grep "add.*#0" define i32 @foo() { entry: diff --git a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll index d2eb85d..670048b 100644 --- a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll +++ b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm | not grep {str.*\\!} +; RUN: llc < %s -march=arm | not grep "str.*\!" %struct.shape_edge_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32 } %struct.shape_path_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32, i32, i32 } diff --git a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll index fd2f462..3754db0 100644 --- a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll +++ b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast +; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast -optimize-regalloc=0 ; PR1925 %struct.encode_aux_nearestmatch = type { i32*, i32*, i32*, i32*, i32, i32 } diff --git a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll index 44da8e7..5fbed0d 100644 --- a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll +++ b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast +; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast -optimize-regalloc=0 ; PR1925 %"struct.kc::impl_Ccode_option" = type { %"struct.kc::impl_abstract_phylum" } diff --git a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll index 3526722..7342f69 100644 --- a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll +++ b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm | grep {swi 107} +; RUN: llc < %s -march=arm | grep "swi 107" define i32 @_swilseek(i32) nounwind { entry: diff --git a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll index 813bf3c..7d4cc6e 100644 --- a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll +++ b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -regalloc=fast -verify-machineinstrs +; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs target triple = "arm-pc-linux-gnu" ; This test case would accidentally use the same physreg for two virtregs diff --git 
a/test/CodeGen/ARM/2011-12-14-machine-sink.ll b/test/CodeGen/ARM/2011-12-14-machine-sink.ll index 5ce600d..b21bb00 100644 --- a/test/CodeGen/ARM/2011-12-14-machine-sink.ll +++ b/test/CodeGen/ARM/2011-12-14-machine-sink.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -o /dev/null -stats |& FileCheck %s -check-prefix=STATS +; RUN: llc < %s -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS ; Radar 10266272 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = "thumbv7-apple-ios4.0.0" diff --git a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll index 872eca3..f1c85f1 100644 --- a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll +++ b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll @@ -60,8 +60,16 @@ for.end: ; preds = %entry ret void } +; Check that pseudo-expansion preserves <undef> flags. +define void @foo3(i8* %p) nounwind ssp { +entry: + tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4) + ret void +} + declare arm_aapcs_vfpcc void @bar(i8*, float, float, float) declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind !0 = metadata !{metadata !"omnipotent char", metadata !1} !1 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll new file mode 100644 index 0000000..b3a7e34 --- /dev/null +++ b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll @@ -0,0 +1,71 @@ +; RUN: llc -mtriple=thumbv7-apple-ios -relocation-model=pic -disable-fp-elim -mcpu=cortex-a8 < %s + +; CodeGen SplitCriticalEdge() shouldn't try to break edge to a landing pad. +; rdar://11300144 + +%0 = type opaque +%class.FunctionInterpreter.3.15.31 = type { %class.Parser.1.13.29, %class.Parser.1.13.29*, %struct.ParserVariable.2.14.30*, i32 } +%class.Parser.1.13.29 = type { i32 (...)**, %class.Parser.1.13.29* } +%struct.ParserVariable.2.14.30 = type opaque +%struct.ParseErrorMsg.0.12.28 = type { i32, i32, i32 } + +@_ZTI13ParseErrorMsg = external hidden unnamed_addr constant { i8*, i8* } +@"OBJC_IVAR_$_MUMathExpressionDoubleBased.mInterpreter" = external hidden global i32, section "__DATA, __objc_ivar", align 4 +@"\01L_OBJC_SELECTOR_REFERENCES_14" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip" + +declare i8* @objc_msgSend(i8*, i8*, ...) 
+ +declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone + +declare i8* @__cxa_begin_catch(i8*) + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind + +declare void @__cxa_end_catch() + +declare void @_ZSt9terminatev() + +define hidden double @t(%0* %self, i8* nocapture %_cmd) optsize ssp { +entry: + %call = invoke double undef(%class.FunctionInterpreter.3.15.31* undef) optsize + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) + catch i8* bitcast ({ i8*, i8* }* @_ZTI13ParseErrorMsg to i8*) + br i1 undef, label %catch, label %eh.resume + +catch: ; preds = %lpad + invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %struct.ParseErrorMsg.0.12.28*)*)(i8* undef, i8* undef, %struct.ParseErrorMsg.0.12.28* undef) optsize + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %catch + br label %try.cont + +try.cont: ; preds = %invoke.cont2, %entry + %value.0 = phi double [ 0x7FF8000000000000, %invoke.cont2 ], [ %call, %entry ] + ret double %value.0 + +lpad1: ; preds = %catch + %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: ; preds = %lpad1, %lpad + resume { i8*, i32 } undef + +terminate.lpad: ; preds = %lpad1 + %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) + catch i8* null + unreachable +} + +declare i32 @__gxx_personality_sj0(...) + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} diff --git a/test/CodeGen/ARM/2012-05-29-TailDupBug.ll b/test/CodeGen/ARM/2012-05-29-TailDupBug.ll new file mode 100644 index 0000000..1a57f04 --- /dev/null +++ b/test/CodeGen/ARM/2012-05-29-TailDupBug.ll @@ -0,0 +1,140 @@ +; RUN: llc -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -verify-machineinstrs < %s + +; Teach taildup to update livein set to appease verifier. 
+; rdar://11538365 + +%struct.__CFString.2 = type opaque + +declare void @CFRelease(i8*) + +define hidden fastcc i32 @t() ssp { +entry: + %mylocale.i.i = alloca [256 x i8], align 1 + br i1 undef, label %return, label %CFStringIsHyphenationAvailableForLocale.exit + +CFStringIsHyphenationAvailableForLocale.exit: ; preds = %entry + br i1 undef, label %return, label %if.end + +if.end: ; preds = %CFStringIsHyphenationAvailableForLocale.exit + br i1 undef, label %if.end8.thread.i, label %if.then.i + +if.then.i: ; preds = %if.end + br i1 undef, label %if.end8.thread.i, label %if.end8.i + +if.end8.thread.i: ; preds = %if.then.i, %if.end + unreachable + +if.end8.i: ; preds = %if.then.i + br i1 undef, label %if.then11.i, label %__CFHyphenationPullTokenizer.exit + +if.then11.i: ; preds = %if.end8.i + unreachable + +__CFHyphenationPullTokenizer.exit: ; preds = %if.end8.i + br i1 undef, label %if.end68, label %if.then3 + +if.then3: ; preds = %__CFHyphenationPullTokenizer.exit + br i1 undef, label %cond.end, label %cond.false + +cond.false: ; preds = %if.then3 + br label %cond.end + +cond.end: ; preds = %cond.false, %if.then3 + br i1 undef, label %while.end, label %while.body + +while.body: ; preds = %cond.end + unreachable + +while.end: ; preds = %cond.end + br i1 undef, label %if.end5.i, label %if.then.i16 + +if.then.i16: ; preds = %while.end + br i1 undef, label %if.then4.i, label %if.end5.i + +if.then4.i: ; preds = %if.then.i16 + br i1 false, label %cleanup.thread, label %if.end.i20 + +if.end5.i: ; preds = %if.then.i16, %while.end + unreachable + +if.end.i20: ; preds = %if.then4.i + br label %for.body.i146.i + +for.body.i146.i: ; preds = %for.body.i146.i, %if.end.i20 + br i1 undef, label %if.end20.i, label %for.body.i146.i + +if.end20.i: ; preds = %for.body.i146.i + br i1 undef, label %cleanup.thread, label %if.end23.i + +if.end23.i: ; preds = %if.end20.i + br label %for.body.i94.i + +for.body.i94.i: ; preds = %for.body.i94.i, %if.end23.i + br i1 undef, label %if.then28.i, label %for.body.i94.i + +if.then28.i: ; preds = %for.body.i94.i + br i1 undef, label %cond.true.i26, label %land.lhs.true + +cond.true.i26: ; preds = %if.then28.i + br label %land.lhs.true + +land.lhs.true: ; preds = %cond.true.i26, %if.then28.i + br i1 false, label %cleanup.thread, label %if.end35 + +if.end35: ; preds = %land.lhs.true + br i1 undef, label %cleanup.thread, label %if.end45 + +if.end45: ; preds = %if.end35 + br i1 undef, label %if.then50, label %if.end.i37 + +if.end.i37: ; preds = %if.end45 + br label %if.then50 + +if.then50: ; preds = %if.end.i37, %if.end45 + br i1 undef, label %__CFHyphenationGetHyphensForString.exit, label %if.end.i + +if.end.i: ; preds = %if.then50 + br i1 undef, label %cleanup.i, label %cond.true.i + +cond.true.i: ; preds = %if.end.i + br i1 undef, label %for.cond16.preheader.i, label %for.cond57.preheader.i + +for.cond16.preheader.i: ; preds = %cond.true.i + %cmp1791.i = icmp sgt i32 undef, 1 + br i1 %cmp1791.i, label %for.body18.i, label %for.cond57.preheader.i + +for.cond57.preheader.i: ; preds = %for.cond16.preheader.i, %cond.true.i + %sub69.i = add i32 undef, -2 + br label %cleanup.i + +for.body18.i: ; preds = %for.cond16.preheader.i + store i16 0, i16* undef, align 2 + br label %while.body.i + +while.body.i: ; preds = %while.body.i, %for.body18.i + br label %while.body.i + +cleanup.i: ; preds = %for.cond57.preheader.i, %if.end.i + br label %__CFHyphenationGetHyphensForString.exit + +__CFHyphenationGetHyphensForString.exit: ; preds = %cleanup.i, %if.then50 + %retval.1.i = phi i32 [ 0, 
%cleanup.i ], [ -1, %if.then50 ] + %phitmp = bitcast %struct.__CFString.2* null to i8* + br label %if.end68 + +cleanup.thread: ; preds = %if.end35, %land.lhs.true, %if.end20.i, %if.then4.i + call void @llvm.stackrestore(i8* null) + br label %return + +if.end68: ; preds = %__CFHyphenationGetHyphensForString.exit, %__CFHyphenationPullTokenizer.exit + %hyphenCount.2 = phi i32 [ %retval.1.i, %__CFHyphenationGetHyphensForString.exit ], [ 0, %__CFHyphenationPullTokenizer.exit ] + %_token.1 = phi i8* [ %phitmp, %__CFHyphenationGetHyphensForString.exit ], [ undef, %__CFHyphenationPullTokenizer.exit ] + call void @CFRelease(i8* %_token.1) + br label %return + +return: ; preds = %if.end68, %cleanup.thread, %CFStringIsHyphenationAvailableForLocale.exit, %entry + %retval.1 = phi i32 [ %hyphenCount.2, %if.end68 ], [ -1, %CFStringIsHyphenationAvailableForLocale.exit ], [ -1, %cleanup.thread ], [ -1, %entry ] + ret i32 %retval.1 +} + +declare void @llvm.stackrestore(i8*) nounwind diff --git a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll new file mode 100644 index 0000000..b05ec63 --- /dev/null +++ b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -o /dev/null "-mtriple=thumbv7-apple-ios" -debug-only=post-RA-sched 2> %t +; RUN: FileCheck %s < %t +; REQUIRES: asserts +; Make sure that mayalias store-load dependencies have one cycle +; latency regardless of whether they are barriers or not. + +; CHECK: ** List Scheduling +; CHECK: SU(2){{.*}}STR{{.*}}Volatile +; CHECK-NOT: ch SU +; CHECK: ch SU(3): Latency=1 +; CHECK-NOT: ch SU +; CHECK: SU(3){{.*}}LDR{{.*}}Volatile +; CHECK-NOT: ch SU +; CHECK: ch SU(2): Latency=1 +; CHECK-NOT: ch SU +; CHECK: ** List Scheduling +; CHECK: SU(2){{.*}}STR{{.*}} +; CHECK-NOT: ch SU +; CHECK: ch SU(3): Latency=1 +; CHECK-NOT: ch SU +; CHECK: SU(3){{.*}}LDR{{.*}} +; CHECK-NOT: ch SU +; CHECK: ch SU(2): Latency=1 +; CHECK-NOT: ch SU +define i32 @f1(i32* nocapture %p1, i32* nocapture %p2) nounwind { +entry: + store volatile i32 65540, i32* %p1, align 4, !tbaa !0 + %0 = load volatile i32* %p2, align 4, !tbaa !0 + ret i32 %0 +} + +define i32 @f2(i32* nocapture %p1, i32* nocapture %p2) nounwind { +entry: + store i32 65540, i32* %p1, align 4, !tbaa !0 + %0 = load i32* %p2, align 4, !tbaa !0 + ret i32 %0 +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/ARM/2012-08-09-neon-extload.ll b/test/CodeGen/ARM/2012-08-09-neon-extload.ll new file mode 100644 index 0000000..b55f1ca --- /dev/null +++ b/test/CodeGen/ARM/2012-08-09-neon-extload.ll @@ -0,0 +1,102 @@ +; RUN: llc -mtriple=armv7-none-linux-gnueabi < %s | FileCheck %s + +@var_v2i8 = global <2 x i8> zeroinitializer +@var_v4i8 = global <4 x i8> zeroinitializer + +@var_v2i16 = global <2 x i16> zeroinitializer +@var_v4i16 = global <4 x i16> zeroinitializer + +@var_v2i32 = global <2 x i32> zeroinitializer +@var_v4i32 = global <4 x i32> zeroinitializer + +@var_v2i64 = global <2 x i64> zeroinitializer + +define void @test_v2i8tov2i32() { +; CHECK: test_v2i8tov2i32: + + %i8val = load <2 x i8>* @var_v2i8 + + %i32val = sext <2 x i8> %i8val to <2 x i32> + store <2 x i32> %i32val, <2 x i32>* @var_v2i32 +; CHECK: vld1.16 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :16] +; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] +; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}} + + ret void +} + +define void @test_v2i8tov2i64() { +; CHECK: test_v2i8tov2i64: + + 
%i8val = load <2 x i8>* @var_v2i8 + + %i64val = sext <2 x i8> %i8val to <2 x i64> + store <2 x i64> %i64val, <2 x i64>* @var_v2i64 +; CHECK: vld1.16 {d{{[0-9]+}}[0]}, [{{r[0-9]+}}, :16] +; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] +; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}} +; CHECK: vmovl.s32 {{q[0-9]+}}, {{d[0-9]+}} + +; %i64val = sext <2 x i8> %i8val to <2 x i64> +; store <2 x i64> %i64val, <2 x i64>* @var_v2i64 + + ret void +} + +define void @test_v4i8tov4i16() { +; CHECK: test_v4i8tov4i16: + + %i8val = load <4 x i8>* @var_v4i8 + + %i16val = sext <4 x i8> %i8val to <4 x i16> + store <4 x i16> %i16val, <4 x i16>* @var_v4i16 +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] +; CHECK-NOT: vmovl.s16 + + ret void +; CHECK: bx lr +} + +define void @test_v4i8tov4i32() { +; CHECK: test_v4i8tov4i32: + + %i8val = load <4 x i8>* @var_v4i8 + + %i16val = sext <4 x i8> %i8val to <4 x i32> + store <4 x i32> %i16val, <4 x i32>* @var_v4i32 +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] +; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}} + + ret void +} + +define void @test_v2i16tov2i32() { +; CHECK: test_v2i16tov2i32: + + %i16val = load <2 x i16>* @var_v2i16 + + %i32val = sext <2 x i16> %i16val to <2 x i32> + store <2 x i32> %i32val, <2 x i32>* @var_v2i32 +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]] +; CHECK-NOT: vmovl + + ret void +; CHECK: bx lr +} + +define void @test_v2i16tov2i64() { +; CHECK: test_v2i16tov2i64: + + %i16val = load <2 x i16>* @var_v2i16 + + %i64val = sext <2 x i16> %i16val to <2 x i64> + store <2 x i64> %i64val, <2 x i64>* @var_v2i64 +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]] +; CHECK: vmovl.s32 {{q[0-9]+}}, d[[LOAD]] + + ret void +} diff --git a/test/CodeGen/ARM/addrmode.ll b/test/CodeGen/ARM/addrmode.ll index 9ccff07..6da9089 100644 --- a/test/CodeGen/ARM/addrmode.ll +++ b/test/CodeGen/ARM/addrmode.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm -stats |& grep asm-printer | grep 4 +; RUN: llc < %s -march=arm -stats 2>&1 | grep asm-printer | grep 4 define i32 @t1(i32 %a) { %b = mul i32 %a, 9 diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll index 31c5007..d668334 100644 --- a/test/CodeGen/ARM/aliases.ll +++ b/test/CodeGen/ARM/aliases.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=arm-linux-gnueabi -o %t -; RUN: grep { = } %t | count 5 +; RUN: grep " = " %t | count 5 ; RUN: grep globl %t | count 4 ; RUN: grep weak %t | count 1 diff --git a/test/CodeGen/ARM/bicZext.ll b/test/CodeGen/ARM/bicZext.ll new file mode 100644 index 0000000..cf4b7ba --- /dev/null +++ b/test/CodeGen/ARM/bicZext.ll @@ -0,0 +1,19 @@ +; RUN: llc %s -o - | FileCheck %s +; ModuleID = 'bic.c' +target triple = "thumbv7-apple-ios3.0.0" + +define zeroext i16 @foo16(i16 zeroext %f) nounwind readnone optsize ssp { +entry: + ; CHECK: .thumb_func _foo16 + ; CHECK: {{bic[^#]*#3}} + %and = and i16 %f, -4 + ret i16 %and +} + +define i32 @foo32(i32 %f) nounwind readnone optsize ssp { +entry: + ; CHECK: .thumb_func _foo32 + ; CHECK: {{bic[^#]*#3}} + %and = and i32 %f, -4 + ret i32 %and +} diff --git a/test/CodeGen/ARM/call_nolink.ll b/test/CodeGen/ARM/call_nolink.ll index efe29d8..00b16888 100644 --- a/test/CodeGen/ARM/call_nolink.ll +++ b/test/CodeGen/ARM/call_nolink.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \ -; RUN: not grep {bx lr} +; RUN: not grep "bx lr" 
%struct.anon = type { i32 (i32, i32, i32)*, i32, i32, [3 x i32], i8*, i8*, i8* } @r = external global [14 x i32] ; <[14 x i32]*> [#uses=4] diff --git a/test/CodeGen/ARM/cmn.ll b/test/CodeGen/ARM/cmn.ll new file mode 100644 index 0000000..ef73165 --- /dev/null +++ b/test/CodeGen/ARM/cmn.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple thumbv7-apple-ios | FileCheck %s +; <rdar://problem/7569620> + +define i32 @compare_i_gt(i32 %a) { +entry: +; CHECK: compare_i_gt +; CHECK-NOT: mvn +; CHECK: cmn + %cmp = icmp sgt i32 %a, -78 + %. = zext i1 %cmp to i32 + ret i32 %. +} + +define i32 @compare_r_eq(i32 %a, i32 %b) { +entry: +; CHECK: compare_r_eq +; CHECK: cmn + %sub = sub nsw i32 0, %b + %cmp = icmp eq i32 %a, %sub + %. = zext i1 %cmp to i32 + ret i32 %. +} diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll new file mode 100644 index 0000000..fb0f4c6 --- /dev/null +++ b/test/CodeGen/ARM/coalesce-subregs.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios0.0.0" + +; CHECK: f +; The vld2 and vst2 are not aligned wrt each other, the second Q loaded is the +; first one stored. +; The coalescer must find a super-register larger than QQ to eliminate the copy +; setting up the vst2 data. +; CHECK: vld2 +; CHECK-NOT: vorr +; CHECK-NOT: vmov +; CHECK: vst2 +define void @f(float* %p, i32 %c) nounwind ssp { +entry: + %0 = bitcast float* %p to i8* + %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4) + %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1 + %add.ptr = getelementptr inbounds float* %p, i32 8 + %1 = bitcast float* %add.ptr to i8* + tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> undef, i32 4) + ret void +} + +; CHECK: f1 +; FIXME: This function still has copies. +define void @f1(float* %p, i32 %c) nounwind ssp { +entry: + %0 = bitcast float* %p to i8* + %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4) + %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1 + %add.ptr = getelementptr inbounds float* %p, i32 8 + %1 = bitcast float* %add.ptr to i8* + %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4) + %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0 + tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> %vld2215, i32 4) + ret void +} + +; CHECK: f2 +; FIXME: This function still has copies. 
+define void @f2(float* %p, i32 %c) nounwind ssp { +entry: + %0 = bitcast float* %p to i8* + %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4) + %vld224 = extractvalue { <4 x float>, <4 x float> } %vld2, 1 + br label %do.body + +do.body: ; preds = %do.body, %entry + %qq0.0.1.0 = phi <4 x float> [ %vld224, %entry ], [ %vld2216, %do.body ] + %c.addr.0 = phi i32 [ %c, %entry ], [ %dec, %do.body ] + %p.addr.0 = phi float* [ %p, %entry ], [ %add.ptr, %do.body ] + %add.ptr = getelementptr inbounds float* %p.addr.0, i32 8 + %1 = bitcast float* %add.ptr to i8* + %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4) + %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0 + %vld2216 = extractvalue { <4 x float>, <4 x float> } %vld22, 1 + tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %qq0.0.1.0, <4 x float> %vld2215, i32 4) + %dec = add nsw i32 %c.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: ; preds = %do.body + ret void +} + +declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind diff --git a/test/CodeGen/ARM/cse-libcalls.ll b/test/CodeGen/ARM/cse-libcalls.ll index 1d011be..62b9e43 100644 --- a/test/CodeGen/ARM/cse-libcalls.ll +++ b/test/CodeGen/ARM/cse-libcalls.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm | grep {bl.\*__ltdf} | count 1 +; RUN: llc < %s -march=arm | grep "bl.*__ltdf" | count 1 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin8" diff --git a/test/CodeGen/ARM/data-in-code-annotations.ll b/test/CodeGen/ARM/data-in-code-annotations.ll new file mode 100644 index 0000000..a66a9d1 --- /dev/null +++ b/test/CodeGen/ARM/data-in-code-annotations.ll @@ -0,0 +1,42 @@ +; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s + +define double @f1() nounwind { +; CHECK: f1: +; CHECK: .data_region +; CHECK: .long 1413754129 +; CHECK: .long 1074340347 +; CHECK: .end_data_region + ret double 0x400921FB54442D11 +} + + +define i32 @f2() { +; CHECK: f2: +; CHECK: .data_region jt32 +; CHECK: .end_data_region + +entry: + switch i32 undef, label %return [ + i32 1, label %sw.bb + i32 2, label %sw.bb6 + i32 3, label %sw.bb13 + i32 4, label %sw.bb20 + ] + +sw.bb: ; preds = %entry + br label %return + +sw.bb6: ; preds = %entry + br label %return + +sw.bb13: ; preds = %entry + br label %return + +sw.bb20: ; preds = %entry + %div = sdiv i32 undef, undef + br label %return + +return: ; preds = %sw.bb20, %sw.bb13, %sw.bb6, %sw.bb, %entry + %retval.0 = phi i32 [ %div, %sw.bb20 ], [ undef, %sw.bb13 ], [ undef, %sw.bb6 ], [ undef, %sw.bb ], [ 0, %entry ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll index 49c4103..7fbf8f4 100644 --- a/test/CodeGen/ARM/divmod.ll +++ b/test/CodeGen/ARM/divmod.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm-apple-ios5.0 | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp { entry: @@ -56,3 +56,17 @@ bb1: declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readnone declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind + +; rdar://11714607 +define i32 @howmany(i32 %x, i32 %y) nounwind { +entry: +; CHECK: howmany: +; CHECK: bl ___udivmodsi4 +; 
CHECK-NOT: ___udivsi3 + %rem = urem i32 %x, %y + %div = udiv i32 %x, %y + %not.cmp = icmp ne i32 %rem, 0 + %add = zext i1 %not.cmp to i32 + %cond = add i32 %add, %div + ret i32 %cond +} diff --git a/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll b/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll new file mode 100644 index 0000000..14721a4 --- /dev/null +++ b/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM +; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB + +; Fast-isel can't handle non-double multi-reg retvals. +; This test just check to make sure we don't hit the assert in FinishCall. +define <16 x i8> @foo() nounwind ssp { +entry: + ret <16 x i8> zeroinitializer +} + +define void @t1() nounwind ssp { +entry: +; ARM: @t1 +; THUMB: @t1 + %call = call <16 x i8> @foo() + ret void +} diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll index dd460b2..edc805a 100644 --- a/test/CodeGen/ARM/fast-isel-call.ll +++ b/test/CodeGen/ARM/fast-isel-call.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG define i32 @t0(i1 zeroext %a) nounwind { %1 = zext i1 %a to i32 @@ -99,6 +101,11 @@ entry: ; ARM: uxtb r9, r12 ; ARM: str r9, [sp, #4] ; ARM: bl _bar +; ARM-LONG: @t10 +; ARM-LONG: movw lr, :lower16:L_bar$non_lazy_ptr +; ARM-LONG: movt lr, :upper16:L_bar$non_lazy_ptr +; ARM-LONG: ldr lr, [lr] +; ARM-LONG: blx lr ; THUMB: @t10 ; THUMB: movs r0, #0 ; THUMB: movt r0, #0 @@ -121,8 +128,96 @@ entry: ; THUMB: uxtb.w r9, r12 ; THUMB: str.w r9, [sp, #4] ; THUMB: bl _bar +; THUMB-LONG: @t10 +; THUMB-LONG: movw lr, :lower16:L_bar$non_lazy_ptr +; THUMB-LONG: movt lr, :upper16:L_bar$non_lazy_ptr +; THUMB-LONG: ldr.w lr, [lr] +; THUMB-LONG: blx lr %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70) ret i32 0 } declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext) + +define i32 @bar0(i32 %i) nounwind { + ret i32 0 +} + +define void @foo3() uwtable { +; ARM: movw r0, #0 +; ARM: movw r1, :lower16:_bar0 +; ARM: movt r1, :upper16:_bar0 +; ARM: blx r1 +; THUMB: movs r0, #0 +; THUMB: movw r1, :lower16:_bar0 +; THUMB: movt r1, :upper16:_bar0 +; THUMB: blx r1 + %fptr = alloca i32 (i32)*, align 8 + store i32 (i32)* @bar0, i32 (i32)** %fptr, align 8 + %1 = load i32 (i32)** %fptr, align 8 + %call = call i32 %1(i32 0) + ret void +} + +define i32 @LibCall(i32 %a, i32 %b) { +entry: +; ARM: LibCall +; ARM: bl ___udivsi3 +; ARM-LONG: LibCall +; ARM-LONG: movw r2, :lower16:L___udivsi3$non_lazy_ptr +; ARM-LONG: movt r2, :upper16:L___udivsi3$non_lazy_ptr +; ARM-LONG: ldr r2, [r2] +; ARM-LONG: blx r2 +; THUMB: LibCall +; THUMB: bl ___udivsi3 +; THUMB-LONG: LibCall +; THUMB-LONG: movw r2, :lower16:L___udivsi3$non_lazy_ptr +; THUMB-LONG: movt r2, :upper16:L___udivsi3$non_lazy_ptr +; 
THUMB-LONG: ldr r2, [r2] +; THUMB-LONG: blx r2 + %tmp1 = udiv i32 %a, %b ; <i32> [#uses=1] + ret i32 %tmp1 +} + +define i32 @VarArg() nounwind { +entry: + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + %m = alloca i32, align 4 + %n = alloca i32, align 4 + %tmp = alloca i32, align 4 + %0 = load i32* %i, align 4 + %1 = load i32* %j, align 4 + %2 = load i32* %k, align 4 + %3 = load i32* %m, align 4 + %4 = load i32* %n, align 4 +; ARM: VarArg +; ARM: mov r7, sp +; ARM: movw r0, #5 +; ARM: ldr r1, [r7, #-4] +; ARM: ldr r2, [r7, #-8] +; ARM: ldr r3, [r7, #-12] +; ARM: ldr r9, [sp, #16] +; ARM: ldr r12, [sp, #12] +; ARM: str r9, [sp] +; ARM: str r12, [sp, #4] +; ARM: bl _CallVariadic +; THUMB: mov r7, sp +; THUMB: movs r0, #5 +; THUMB: movt r0, #0 +; THUMB: ldr r1, [sp, #28] +; THUMB: ldr r2, [sp, #24] +; THUMB: ldr r3, [sp, #20] +; THUMB: ldr.w r9, [sp, #16] +; THUMB: ldr.w r12, [sp, #12] +; THUMB: str.w r9, [sp] +; THUMB: str.w r12, [sp, #4] +; THUMB: bl _CallVariadic + %call = call i32 (i32, ...)* @CallVariadic(i32 5, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) + store i32 %call, i32* %tmp, align 4 + %5 = load i32* %tmp, align 4 + ret i32 %5 +} + +declare i32 @CallVariadic(i32, ...) diff --git a/test/CodeGen/ARM/fast-isel-frameaddr.ll b/test/CodeGen/ARM/fast-isel-frameaddr.ll new file mode 100644 index 0000000..8f7b294 --- /dev/null +++ b/test/CodeGen/ARM/fast-isel-frameaddr.ll @@ -0,0 +1,100 @@ +; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-ARM +; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-ARM +; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-THUMB2 +; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-THUMB2 + +define i8* @frameaddr_index0() nounwind { +entry: +; DARWIN-ARM: frameaddr_index0: +; DARWIN-ARM: push {r7} +; DARWIN-ARM: mov r7, sp +; DARWIN-ARM: mov r0, r7 + +; DARWIN-THUMB2: frameaddr_index0: +; DARWIN-THUMB2: str r7, [sp, #-4]! +; DARWIN-THUMB2: mov r7, sp +; DARWIN-THUMB2: mov r0, r7 + +; LINUX-ARM: frameaddr_index0: +; LINUX-ARM: push {r11} +; LINUX-ARM: mov r11, sp +; LINUX-ARM: mov r0, r11 + +; LINUX-THUMB2: frameaddr_index0: +; LINUX-THUMB2: str r7, [sp, #-4]! +; LINUX-THUMB2: mov r7, sp +; LINUX-THUMB2: mov r0, r7 + + %0 = call i8* @llvm.frameaddress(i32 0) + ret i8* %0 +} + +define i8* @frameaddr_index1() nounwind { +entry: +; DARWIN-ARM: frameaddr_index1: +; DARWIN-ARM: push {r7} +; DARWIN-ARM: mov r7, sp +; DARWIN-ARM: mov r0, r7 +; DARWIN-ARM: ldr r0, [r0] + +; DARWIN-THUMB2: frameaddr_index1: +; DARWIN-THUMB2: str r7, [sp, #-4]! +; DARWIN-THUMB2: mov r7, sp +; DARWIN-THUMB2: mov r0, r7 +; DARWIN-THUMB2: ldr r0, [r0] + +; LINUX-ARM: frameaddr_index1: +; LINUX-ARM: push {r11} +; LINUX-ARM: mov r11, sp +; LINUX-ARM: mov r0, r11 +; LINUX-ARM: ldr r0, [r0] + +; LINUX-THUMB2: frameaddr_index1: +; LINUX-THUMB2: str r7, [sp, #-4]! +; LINUX-THUMB2: mov r7, sp +; LINUX-THUMB2: mov r0, r7 +; LINUX-THUMB2: ldr r0, [r0] + + %0 = call i8* @llvm.frameaddress(i32 1) + ret i8* %0 +} + +define i8* @frameaddr_index3() nounwind { +entry: +; DARWIN-ARM: frameaddr_index3: +; DARWIN-ARM: push {r7} +; DARWIN-ARM: mov r7, sp +; DARWIN-ARM: mov r0, r7 +; DARWIN-ARM: ldr r0, [r0] +; DARWIN-ARM: ldr r0, [r0] +; DARWIN-ARM: ldr r0, [r0] + +; DARWIN-THUMB2: frameaddr_index3: +; DARWIN-THUMB2: str r7, [sp, #-4]! 
+; DARWIN-THUMB2: mov r7, sp +; DARWIN-THUMB2: mov r0, r7 +; DARWIN-THUMB2: ldr r0, [r0] +; DARWIN-THUMB2: ldr r0, [r0] +; DARWIN-THUMB2: ldr r0, [r0] + +; LINUX-ARM: frameaddr_index3: +; LINUX-ARM: push {r11} +; LINUX-ARM: mov r11, sp +; LINUX-ARM: mov r0, r11 +; LINUX-ARM: ldr r0, [r0] +; LINUX-ARM: ldr r0, [r0] +; LINUX-ARM: ldr r0, [r0] + +; LINUX-THUMB2: frameaddr_index3: +; LINUX-THUMB2: str r7, [sp, #-4]! +; LINUX-THUMB2: mov r7, sp +; LINUX-THUMB2: mov r0, r7 +; LINUX-THUMB2: ldr r0, [r0] +; LINUX-THUMB2: ldr r0, [r0] +; LINUX-THUMB2: ldr r0, [r0] + + %0 = call i8* @llvm.frameaddress(i32 3) + ret i8* %0 +} + +declare i8* @llvm.frameaddress(i32) nounwind readnone diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll index e6bdfa7..b73fcef 100644 --- a/test/CodeGen/ARM/fast-isel-intrinsic.ll +++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG @message1 = global [60 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 1 @temp = common global [60 x i8] zeroinitializer, align 1 @@ -13,6 +15,11 @@ define void @t1() nounwind ssp { ; ARM: movw r2, #10 ; ARM: uxtb r1, r1 ; ARM: bl _memset +; ARM-LONG: t1 +; ARM-LONG: movw r3, :lower16:L_memset$non_lazy_ptr +; ARM-LONG: movt r3, :upper16:L_memset$non_lazy_ptr +; ARM-LONG: ldr r3, [r3] +; ARM-LONG: blx r3 ; THUMB: t1 ; THUMB: movw r0, :lower16:_message1 ; THUMB: movt r0, :upper16:_message1 @@ -23,6 +30,11 @@ define void @t1() nounwind ssp { ; THUMB: movt r2, #0 ; THUMB: uxtb r1, r1 ; THUMB: bl _memset +; THUMB-LONG: t1 +; THUMB-LONG: movw r3, :lower16:L_memset$non_lazy_ptr +; THUMB-LONG: movt r3, :upper16:L_memset$non_lazy_ptr +; THUMB-LONG: ldr r3, [r3] +; THUMB-LONG: blx r3 call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @message1, i32 0, i32 5), i8 64, i32 10, i32 1, i1 false) ret void } @@ -41,6 +53,11 @@ define void @t2() nounwind ssp { ; ARM: mov r0, r1 ; ARM: ldr r1, [sp] @ 4-byte Reload ; ARM: bl _memcpy +; ARM-LONG: t2 +; ARM-LONG: movw r3, :lower16:L_memcpy$non_lazy_ptr +; ARM-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr +; ARM-LONG: ldr r3, [r3] +; ARM-LONG: blx r3 ; THUMB: t2 ; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr ; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr @@ -51,6 +68,11 @@ define void @t2() nounwind ssp { ; THUMB: movt r2, #0 ; THUMB: mov r0, r1 ; THUMB: bl _memcpy +; THUMB-LONG: t2 +; THUMB-LONG: movw r3, :lower16:L_memcpy$non_lazy_ptr +; THUMB-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr +; THUMB-LONG: ldr r3, [r3] +; THUMB-LONG: blx r3 call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 17, i32 1, i1 false) ret void } @@ -67,6 +89,11 @@ define void @t3() nounwind ssp { ; ARM: movw r2, #10 ; ARM: mov r0, r1 ; ARM: bl _memmove +; ARM-LONG: t3 +; ARM-LONG: movw r3, :lower16:L_memmove$non_lazy_ptr 
+; ARM-LONG: movt r3, :upper16:L_memmove$non_lazy_ptr +; ARM-LONG: ldr r3, [r3] +; ARM-LONG: blx r3 ; THUMB: t3 ; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr ; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr @@ -77,6 +104,11 @@ define void @t3() nounwind ssp { ; THUMB: movt r2, #0 ; THUMB: mov r0, r1 ; THUMB: bl _memmove +; THUMB-LONG: t3 +; THUMB-LONG: movw r3, :lower16:L_memmove$non_lazy_ptr +; THUMB-LONG: movt r3, :upper16:L_memmove$non_lazy_ptr +; THUMB-LONG: ldr r3, [r3] +; THUMB-LONG: blx r3 call void @llvm.memmove.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false) ret void } diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll index 417e2d9..ecd5fe2 100644 --- a/test/CodeGen/ARM/fast-isel.ll +++ b/test/CodeGen/ARM/fast-isel.ll @@ -226,3 +226,15 @@ define i32 @urem_fold(i32 %a) nounwind { %rem = urem i32 %a, 32 ret i32 %rem } + +define i32 @test7() noreturn nounwind { +entry: +; ARM: @test7 +; THUMB: @test7 +; ARM: trap +; THUMB: trap + tail call void @llvm.trap( ) + unreachable +} + +declare void @llvm.trap() nounwind diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll index bc118b8..3c3182b 100644 --- a/test/CodeGen/ARM/fmuls.ll +++ b/test/CodeGen/ARM/fmuls.ll @@ -21,3 +21,12 @@ entry: ; CORTEXA8: vmul.f32 d0, d1, d0 ; CORTEXA9: test: ; CORTEXA9: vmul.f32 s{{.}}, s{{.}}, s{{.}} + +; VFP2: test2 +define float @test2(float %a) nounwind { +; CHECK-NOT: mul +; CHECK: mov pc, lr + %ret = fmul float %a, 1.0 + ret float %ret +} + diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll index 802d1b8..303d165 100644 --- a/test/CodeGen/ARM/fusedMAC.ll +++ b/test/CodeGen/ARM/fusedMAC.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 | FileCheck %s +; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -fp-contract=fast | FileCheck %s ; Check generated fused MAC and MLS. 
define double @fusedMACTest1(double %d1, double %d2, double %d3) { @@ -138,8 +138,16 @@ entry: ; CHECK: vfms.f64 %tmp1 = fsub double -0.0, %b %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone - %tmp3 = fsub double -0.0, %tmp2 - ret double %tmp3 + ret double %tmp2 +} + +define float @test_fnms_f32(float %a, float %b, float* %c) nounwind readnone ssp { +; CHECK: test_fnms_f32 +; CHECK: vfnms.f32 + %tmp1 = load float* %c, align 4 + %tmp2 = fsub float -0.0, %tmp1 + %tmp3 = tail call float @llvm.fma.f32(float %a, float %b, float %tmp2) nounwind readnone + ret float %tmp3 } define double @test_fnms_f64(double %a, double %b, double %c) nounwind readnone ssp { @@ -158,7 +166,8 @@ entry: ; CHECK: vfnms.f64 %tmp1 = fsub double -0.0, %b %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone - ret double %tmp2 + %tmp3 = fsub double -0.0, %tmp2 + ret double %tmp3 } define double @test_fnma_f64(double %a, double %b, double %c) nounwind readnone ssp { @@ -180,6 +189,36 @@ entry: ret double %tmp3 } +define float @test_fma_const_fold(float %a, float %b) nounwind { +; CHECK: test_fma_const_fold +; CHECK-NOT: vfma +; CHECK-NOT: vmul +; CHECK: vadd + %ret = call float @llvm.fma.f32(float %a, float 1.0, float %b) + ret float %ret +} + +define float @test_fma_canonicalize(float %a, float %b) nounwind { +; CHECK: test_fma_canonicalize +; CHECK: vmov.f32 [[R1:s[0-9]+]], #2.000000e+00 +; CHECK: vfma.f32 {{s[0-9]+}}, {{s[0-9]+}}, [[R1]] + %ret = call float @llvm.fma.f32(float 2.0, float %a, float %b) + ret float %ret +} + +; Check that very wide vector fma's can be split into legal fma's. +define void @test_fma_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>* %p) nounwind readnone ssp { +; CHECK: test_fma_v8f32 +; CHECK: vfma.f32 +; CHECK: vfma.f32 +entry: + %call = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone + store <8 x float> %call, <8 x float>* %p, align 16 + ret void +} + + declare float @llvm.fma.f32(float, float, float) nounwind readnone declare double @llvm.fma.f64(double, double, double) nounwind readnone declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone +declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/iabs.ll b/test/CodeGen/ARM/iabs.ll index 89e309d..600a8c2 100644 --- a/test/CodeGen/ARM/iabs.ll +++ b/test/CodeGen/ARM/iabs.ll @@ -10,7 +10,25 @@ define i32 @test(i32 %a) { %b = icmp sgt i32 %a, -1 %abs = select i1 %b, i32 %a, i32 %tmp1neg ret i32 %abs -; CHECK: movs r0, r0 +; CHECK: cmp ; CHECK: rsbmi r0, r0, #0 ; CHECK: bx lr } + +; rdar://11633193 +;; 3 instructions will be generated for abs(a-b): +;; subs +;; rsbmi +;; bx +define i32 @test2(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK: test2 +; CHECK: subs +; CHECK-NEXT: rsbmi +; CHECK-NEXT: bx + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, -1 + %sub1 = sub nsw i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub1 + ret i32 %cond +} diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll index 3f8fd75..73b546d 100644 --- a/test/CodeGen/ARM/ldrd.ll +++ b/test/CodeGen/ARM/ldrd.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast | FileCheck %s -check-prefix=A8 -; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast | FileCheck %s -check-prefix=M3 +; RUN: llc < %s -mtriple=thumbv7-apple-ios 
-mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=A8 +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 ; rdar://6949835 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY @@ -18,7 +18,6 @@ entry: ; M3: t: ; M3-NOT: ldrd -; M3: ldm.w r2, {r2, r3} %0 = load i64** @b, align 4 %1 = load i64* %0, align 4 diff --git a/test/CodeGen/ARM/lsr-scale-addr-mode.ll b/test/CodeGen/ARM/lsr-scale-addr-mode.ll index 8130019..0c8d387 100644 --- a/test/CodeGen/ARM/lsr-scale-addr-mode.ll +++ b/test/CodeGen/ARM/lsr-scale-addr-mode.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm | grep lsl | grep -F {lsl #2\]} +; RUN: llc < %s -march=arm | grep lsl | grep -F "lsl #2]" ; Should use scaled addressing mode. define void @sintzero(i32* %a) nounwind { diff --git a/test/CodeGen/ARM/movt-movw-global.ll b/test/CodeGen/ARM/movt-movw-global.ll index 991d728..bbedea1 100644 --- a/test/CodeGen/ARM/movt-movw-global.ll +++ b/test/CodeGen/ARM/movt-movw-global.ll @@ -1,7 +1,7 @@ -; RUN: llc < %s -mtriple=armv7-eabi | FileCheck %s -check-prefix=EABI -; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS -; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=pic | FileCheck %s -check-prefix=IOS-PIC -; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=static | FileCheck %s -check-prefix=IOS-STATIC +; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-eabi | FileCheck %s -check-prefix=EABI +; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS +; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=pic | FileCheck %s -check-prefix=IOS-PIC +; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=static | FileCheck %s -check-prefix=IOS-STATIC @foo = common global i32 0 diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll index de48fee..4a82c36 100644 --- a/test/CodeGen/ARM/neon_div.ll +++ b/test/CodeGen/ARM/neon_div.ll @@ -1,9 +1,9 @@ -; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source | FileCheck %s +; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source -disable-post-ra | FileCheck %s define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vrecpe.f32 -;CHECK: vrecpe.f32 ;CHECK: vmovn.i32 +;CHECK: vrecpe.f32 ;CHECK: vmovn.i32 ;CHECK: vmovn.i16 %tmp1 = load <8 x i8>* %A @@ -15,10 +15,10 @@ define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { define <8 x i8> @udivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vrecpe.f32 ;CHECK: vrecps.f32 +;CHECK: vmovn.i32 ;CHECK: vrecpe.f32 ;CHECK: vrecps.f32 ;CHECK: vmovn.i32 -;CHECK: vmovn.i32 ;CHECK: vqmovun.s16 %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll index b4da552..df98e23 100644 --- a/test/CodeGen/ARM/opt-shuff-tstore.ll +++ b/test/CodeGen/ARM/opt-shuff-tstore.ll @@ -1,4 +1,4 @@ -; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s +; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -mattr=+neon < %s | FileCheck %s ; CHECK: func_4_8 ; CHECK: vst1.32 diff --git a/test/CodeGen/ARM/pr13249.ll b/test/CodeGen/ARM/pr13249.ll 
new file mode 100644 index 0000000..4bc8810 --- /dev/null +++ b/test/CodeGen/ARM/pr13249.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple armv7--linux-gnueabi + +define arm_aapcscc i8* @__strtok_r_1c(i8* %arg, i8 signext %arg1, i8** nocapture %arg2) nounwind { +bb: + br label %bb3 + +bb3: ; preds = %bb3, %bb + %tmp = phi i8* [ %tmp5, %bb3 ], [ %arg, %bb ] + %tmp4 = load i8* %tmp, align 1 + %tmp5 = getelementptr inbounds i8* %tmp, i32 1 + br i1 undef, label %bb3, label %bb7 + +bb7: ; preds = %bb13, %bb3 + %tmp8 = phi i8 [ %tmp14, %bb13 ], [ %tmp4, %bb3 ] + %tmp9 = phi i8* [ %tmp12, %bb13 ], [ %tmp, %bb3 ] + %tmp10 = icmp ne i8 %tmp8, %arg1 + %tmp12 = getelementptr inbounds i8* %tmp9, i32 1 + br i1 %tmp10, label %bb13, label %bb15 + +bb13: ; preds = %bb7 + %tmp14 = load i8* %tmp12, align 1 + br label %bb7 + +bb15: ; preds = %bb7 + store i8* %tmp9, i8** %arg2, align 4 + ret i8* %tmp +} diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll index 3e07da8..418d4f3 100644 --- a/test/CodeGen/ARM/select.ll +++ b/test/CodeGen/ARM/select.ll @@ -113,3 +113,29 @@ entry: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, [2 x i32], i32, float)*)(i8* undef, i8* undef, [2 x i32] %tmp493, i32 0, float 1.000000e+00) optsize ret void } + +; CHECK: f10 +define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp { +; CHECK-NOT: floatsisf + %1 = icmp eq i32 %a, %b + %2 = zext i1 %1 to i32 + %3 = sitofp i32 %2 to float + ret float %3 +} + +; CHECK: f11 +define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp { +; CHECK-NOT: floatsisf + %1 = icmp eq i32 %a, %b + %2 = sitofp i1 %1 to float + ret float %2 +} + +; CHECK: f12 +define float @f12(i32 %a, i32 %b) nounwind uwtable readnone ssp { +; CHECK-NOT: floatunsisf + %1 = icmp eq i32 %a, %b + %2 = uitofp i1 %1 to float + ret float %2 +} + diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll new file mode 100644 index 0000000..99df0d4 --- /dev/null +++ b/test/CodeGen/ARM/smml.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s +define i32 @f(i32 %a, i32 %b, i32 %c) nounwind readnone ssp { +entry: +; CHECK-NOT: smmls + %conv4 = zext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %conv2 = sext i32 %c to i64 + %mul = mul nsw i64 %conv2, %conv1 + %shr5 = lshr i64 %mul, 32 + %sub = sub nsw i64 %conv4, %shr5 + %conv3 = trunc i64 %sub to i32 + ret i32 %conv3 +} diff --git a/test/CodeGen/ARM/str_pre-2.ll b/test/CodeGen/ARM/str_pre-2.ll index 983ba45..5ce2bce 100644 --- a/test/CodeGen/ARM/str_pre-2.ll +++ b/test/CodeGen/ARM/str_pre-2.ll @@ -1,13 +1,12 @@ -; RUN: llc < %s -mtriple=armv6-linux-gnu -regalloc=basic | FileCheck %s - -; The greedy register allocator uses a single CSR here, invalidating the test. +; RUN: llc < %s -mtriple=armv6-linux-gnu | FileCheck %s @b = external global i64* define i64 @t(i64 %a) nounwind readonly { entry: -; CHECK: push {lr} -; CHECK: pop {lr} +; CHECK: push {r4, r5, lr} +; CHECK: pop {r4, r5, pc} + call void asm sideeffect "", "~{r4},~{r5}"() nounwind %0 = load i64** @b, align 4 %1 = load i64* %0, align 4 %2 = mul i64 %1, %a diff --git a/test/CodeGen/ARM/str_pre.ll b/test/CodeGen/ARM/str_pre.ll index e56e3f2..d8b3f0e 100644 --- a/test/CodeGen/ARM/str_pre.ll +++ b/test/CodeGen/ARM/str_pre.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=arm | \ -; RUN: grep {str.*\\!} | count 2 +; RUN: grep "str.*\!" 
| count 2 define void @test1(i32* %X, i32* %A, i32** %dest) { %B = load i32* %A ; <i32> [#uses=1] diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll new file mode 100644 index 0000000..99ba475 --- /dev/null +++ b/test/CodeGen/ARM/struct_byval.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s + +; rdar://9877866 +%struct.SmallStruct = type { i32, [8 x i32], [37 x i8] } +%struct.LargeStruct = type { i32, [1001 x i8], [300 x i32] } + +define i32 @f() nounwind ssp { +entry: +; CHECK: f: +; CHECK: ldr +; CHECK: str +; CHECK-NOT:bne + %st = alloca %struct.SmallStruct, align 4 + %call = call i32 @e1(%struct.SmallStruct* byval %st) + ret i32 0 +} + +; Generate a loop for large struct byval +define i32 @g() nounwind ssp { +entry: +; CHECK: g: +; CHECK: ldr +; CHECK: sub +; CHECK: str +; CHECK: bne + %st = alloca %struct.LargeStruct, align 4 + %call = call i32 @e2(%struct.LargeStruct* byval %st) + ret i32 0 +} + +; Generate a loop using NEON instructions +define i32 @h() nounwind ssp { +entry: +; CHECK: h: +; CHECK: vld1 +; CHECK: sub +; CHECK: vst1 +; CHECK: bne + %st = alloca %struct.LargeStruct, align 16 + %call = call i32 @e3(%struct.LargeStruct* byval align 16 %st) + ret i32 0 +} + +declare i32 @e1(%struct.SmallStruct* nocapture byval %in) nounwind +declare i32 @e2(%struct.LargeStruct* nocapture byval %in) nounwind +declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll new file mode 100644 index 0000000..6fcbdee --- /dev/null +++ b/test/CodeGen/ARM/sub-cmp-peephole.ll @@ -0,0 +1,65 @@ +; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s + +define i32 @f(i32 %a, i32 %b) nounwind ssp { +entry: +; CHECK: f: +; CHECK: subs +; CHECK-NOT: cmp + %cmp = icmp sgt i32 %a, %b + %sub = sub nsw i32 %a, %b + %sub. = select i1 %cmp, i32 %sub, i32 0 + ret i32 %sub. +} + +define i32 @g(i32 %a, i32 %b) nounwind ssp { +entry: +; CHECK: g: +; CHECK: subs +; CHECK-NOT: cmp + %cmp = icmp slt i32 %a, %b + %sub = sub nsw i32 %b, %a + %sub. = select i1 %cmp, i32 %sub, i32 0 + ret i32 %sub. +} + +define i32 @h(i32 %a, i32 %b) nounwind ssp { +entry: +; CHECK: h: +; CHECK: subs +; CHECK-NOT: cmp + %cmp = icmp sgt i32 %a, 3 + %sub = sub nsw i32 %a, 3 + %sub. = select i1 %cmp, i32 %sub, i32 %b + ret i32 %sub. +} + +; rdar://11725965 +define i32 @i(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK: i: +; CHECK: subs +; CHECK-NOT: cmp + %cmp = icmp ult i32 %a, %b + %sub = sub i32 %b, %a + %sub. = select i1 %cmp, i32 %sub, i32 0 + ret i32 %sub. +} +; If CPSR is live-out, we can't remove cmp if there exists +; a swapped sub. 
+define i32 @j(i32 %a, i32 %b) nounwind { +entry: +; CHECK: j: +; CHECK: sub +; CHECK: cmp + %cmp = icmp eq i32 %b, %a + %sub = sub nsw i32 %a, %b + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = icmp sgt i32 %b, %a + %sel = select i1 %cmp2, i32 %sub, i32 %a + ret i32 %sel + +if.else: + ret i32 %sub +} diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll index 06ea703..474043a 100644 --- a/test/CodeGen/ARM/sub.ll +++ b/test/CodeGen/ARM/sub.ll @@ -36,3 +36,15 @@ entry: %sel = select i1 %cmp, i32 1, i32 %sub ret i32 %sel } + +; rdar://11726136 +define i32 @f5(i32 %x) { +entry: +; CHECK: f5 +; CHECK: movw r1, #65535 +; CHECK-NOT: movt +; CHECK-NOT: add +; CHECK: sub r0, r0, r1 + %sub = add i32 %x, -65535 + ret i32 %sub +} diff --git a/test/CodeGen/ARM/thread_pointer.ll b/test/CodeGen/ARM/thread_pointer.ll index 3143387..c403fa5 100644 --- a/test/CodeGen/ARM/thread_pointer.ll +++ b/test/CodeGen/ARM/thread_pointer.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \ -; RUN: grep {__aeabi_read_tp} +; RUN: grep "__aeabi_read_tp" define i8* @test() { entry: diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll index 28fd469..a25352c 100644 --- a/test/CodeGen/ARM/thumb2-it-block.ll +++ b/test/CodeGen/ARM/thumb2-it-block.ll @@ -3,10 +3,10 @@ define i32 @test(i32 %a, i32 %b) { entry: -; CHECK: movs.w +; CHECK: cmp ; CHECK-NEXT: it mi ; CHECK-NEXT: rsbmi -; CHECK-NEXT: movs.w +; CHECK-NEXT: cmp ; CHECK-NEXT: it mi ; CHECK-NEXT: rsbmi %cmp1 = icmp slt i32 %a, 0 diff --git a/test/CodeGen/ARM/tls-models.ll b/test/CodeGen/ARM/tls-models.ll new file mode 100644 index 0000000..a5f3c90 --- /dev/null +++ b/test/CodeGen/ARM/tls-models.ll @@ -0,0 +1,117 @@ +; RUN: llc -march=arm -mtriple=arm-linux-gnueabi < %s | FileCheck -check-prefix=CHECK-NONPIC %s +; RUN: llc -march=arm -mtriple=arm-linux-gnueabi -relocation-model=pic < %s | FileCheck -check-prefix=CHECK-PIC %s + + +@external_gd = external thread_local global i32 +@internal_gd = internal thread_local global i32 42 + +@external_ld = external thread_local(localdynamic) global i32 +@internal_ld = internal thread_local(localdynamic) global i32 42 + +@external_ie = external thread_local(initialexec) global i32 +@internal_ie = internal thread_local(initialexec) global i32 42 + +@external_le = external thread_local(localexec) global i32 +@internal_le = internal thread_local(localexec) global i32 42 + +; ----- no model specified ----- + +define i32* @f1() { +entry: + ret i32* @external_gd + + ; Non-PIC code can use initial-exec, PIC code has to use general dynamic. + ; CHECK-NONPIC: f1: + ; CHECK-NONPIC: external_gd(gottpoff) + ; CHECK-PIC: f1: + ; CHECK-PIC: external_gd(tlsgd) +} + +define i32* @f2() { +entry: + ret i32* @internal_gd + + ; Non-PIC code can use local exec, PIC code can use local dynamic, + ; but that is not implemented, so falls back to general dynamic. + ; CHECK-NONPIC: f2: + ; CHECK-NONPIC: internal_gd(tpoff) + ; CHECK-PIC: f2: + ; CHECK-PIC: internal_gd(tlsgd) +} + + +; ----- localdynamic specified ----- + +define i32* @f3() { +entry: + ret i32* @external_ld + + ; Non-PIC code can use initial exec, PIC should use local dynamic, + ; but that is not implemented, so falls back to general dynamic. 
+ ; CHECK-NONPIC: f3: + ; CHECK-NONPIC: external_ld(gottpoff) + ; CHECK-PIC: f3: + ; CHECK-PIC: external_ld(tlsgd) +} + +define i32* @f4() { +entry: + ret i32* @internal_ld + + ; Non-PIC code can use local exec, PIC code can use local dynamic, + ; but that is not implemented, so it falls back to general dynamic. + ; CHECK-NONPIC: f4: + ; CHECK-NONPIC: internal_ld(tpoff) + ; CHECK-PIC: f4: + ; CHECK-PIC: internal_ld(tlsgd) +} + + +; ----- initialexec specified ----- + +define i32* @f5() { +entry: + ret i32* @external_ie + + ; Non-PIC and PIC code will use initial exec as specified. + ; CHECK-NONPIC: f5: + ; CHECK-NONPIC: external_ie(gottpoff) + ; CHECK-PIC: f5: + ; CHECK-PIC: external_ie(gottpoff) +} + +define i32* @f6() { +entry: + ret i32* @internal_ie + + ; Non-PIC code can use local exec, PIC code use initial exec as specified. + ; CHECK-NONPIC: f6: + ; CHECK-NONPIC: internal_ie(tpoff) + ; CHECK-PIC: f6: + ; CHECK-PIC: internal_ie(gottpoff) +} + + +; ----- localexec specified ----- + +define i32* @f7() { +entry: + ret i32* @external_le + + ; Non-PIC and PIC code will use local exec as specified. + ; CHECK-NONPIC: f7: + ; CHECK-NONPIC: external_le(tpoff) + ; CHECK-PIC: f7: + ; CHECK-PIC: external_le(tpoff) +} + +define i32* @f8() { +entry: + ret i32* @internal_le + + ; Non-PIC and PIC code will use local exec as specified. + ; CHECK-NONPIC: f8: + ; CHECK-NONPIC: internal_le(tpoff) + ; CHECK-PIC: f8: + ; CHECK-PIC: internal_le(tpoff) +} diff --git a/test/CodeGen/ARM/tls1.ll b/test/CodeGen/ARM/tls1.ll index 1087094..ec4278c 100644 --- a/test/CodeGen/ARM/tls1.ll +++ b/test/CodeGen/ARM/tls1.ll @@ -1,9 +1,9 @@ ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \ -; RUN: grep {i(tpoff)} +; RUN: grep "i(tpoff)" ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \ -; RUN: grep {__aeabi_read_tp} +; RUN: grep "__aeabi_read_tp" ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi \ -; RUN: -relocation-model=pic | grep {__tls_get_addr} +; RUN: -relocation-model=pic | grep "__tls_get_addr" @i = thread_local global i32 15 ; <i32*> [#uses=2] diff --git a/test/CodeGen/ARM/tls3.ll b/test/CodeGen/ARM/tls3.ll index df7a4ca..e0e944f 100644 --- a/test/CodeGen/ARM/tls3.ll +++ b/test/CodeGen/ARM/tls3.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \ -; RUN: grep {tbss} +; RUN: grep "tbss" %struct.anon = type { i32, i32 } @teste = internal thread_local global %struct.anon zeroinitializer ; <%struct.anon*> [#uses=1] diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll new file mode 100644 index 0000000..4e227dd --- /dev/null +++ b/test/CodeGen/ARM/twoaddrinstr.ll @@ -0,0 +1,21 @@ +; Tests for the two-address instruction pass. +; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s + +define void @PR13378() nounwind { +; This was orriginally a crasher trying to schedule the instructions. 
+; CHECK: PR13378: +; CHECK: vldmia +; CHECK-NEXT: vmov.f32 +; CHECK-NEXT: vstmia +; CHECK-NEXT: vstmia +; CHECK-NEXT: vmov.f32 +; CHECK-NEXT: vstmia + +entry: + %0 = load <4 x float>* undef + store <4 x float> zeroinitializer, <4 x float>* undef + store <4 x float> %0, <4 x float>* undef + %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 3 + store <4 x float> %1, <4 x float>* undef + unreachable +} diff --git a/test/CodeGen/ARM/unsafe-fsub.ll b/test/CodeGen/ARM/unsafe-fsub.ll new file mode 100644 index 0000000..3a4477d --- /dev/null +++ b/test/CodeGen/ARM/unsafe-fsub.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=SAFE %s +; RUN: llc -march=arm -mcpu=cortex-a9 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=FAST %s + +target triple = "armv7-apple-ios" + +; SAFE: test +; FAST: test +define float @test(float %x, float %y) { +entry: +; SAFE: vmul.f32 +; SAFE: vsub.f32 +; FAST: mov r0, #0 + %0 = fmul float %x, %y + %1 = fsub float %0, %0 + ret float %1 +} + + diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll index 450f90d..9f55c24 100644 --- a/test/CodeGen/ARM/vcnt.ll +++ b/test/CodeGen/ARM/vcnt.ll @@ -1,79 +1,80 @@ ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; NB: this tests vcnt, vclz, and vcls define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind { ;CHECK: vcnt8: -;CHECK: vcnt.8 +;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}} %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1) + %tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1) ret <8 x i8> %tmp2 } define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind { ;CHECK: vcntQ8: -;CHECK: vcnt.8 +;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}} %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1) + %tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1) ret <16 x i8> %tmp2 } -declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone +declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone +declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone define <8 x i8> @vclz8(<8 x i8>* %A) nounwind { ;CHECK: vclz8: -;CHECK: vclz.i8 +;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}} %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1) + %tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0) ret <8 x i8> %tmp2 } define <4 x i16> @vclz16(<4 x i16>* %A) nounwind { ;CHECK: vclz16: -;CHECK: vclz.i16 +;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}} %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1) + %tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0) ret <4 x i16> %tmp2 } define <2 x i32> @vclz32(<2 x i32>* %A) nounwind { ;CHECK: vclz32: -;CHECK: vclz.i32 +;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}} %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1) + %tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0) ret <2 x i32> %tmp2 } define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind { ;CHECK: vclzQ8: -;CHECK: vclz.i8 +;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}} %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1) + %tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0) ret <16 x i8> %tmp2 } define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind { ;CHECK: vclzQ16: -;CHECK: vclz.i16 +;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}} %tmp1 = load <8 x i16>* %A - %tmp2 = 
call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1) + %tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0) ret <8 x i16> %tmp2 } define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind { ;CHECK: vclzQ32: -;CHECK: vclz.i32 +;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}} %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1) + %tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0) ret <4 x i32> %tmp2 } -declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone +declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone +declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone +declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone +declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone +declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone define <8 x i8> @vclss8(<8 x i8>* %A) nounwind { ;CHECK: vclss8: diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll index 1ec36da..8fd3db2 100644 --- a/test/CodeGen/ARM/vector-extend-narrow.ll +++ b/test/CodeGen/ARM/vector-extend-narrow.ll @@ -20,7 +20,9 @@ define float @f(<4 x i16>* nocapture %in) { ; CHECK: g: define float @g(<4 x i8>* nocapture %in) { - ; CHECK: vldr +; Note: vld1 here is reasonably important. Mixing VFP and NEON +; instructions is bad on some cores + ; CHECK: vld1 ; CHECK: vmovl.u8 ; CHECK: vmovl.u16 %1 = load <4 x i8>* %in @@ -47,7 +49,9 @@ define <4 x i8> @h(<4 x float> %v) { ; CHECK: i: define <4 x i8> @i(<4 x i8>* %x) { - ; CHECK: vldr +; Note: vld1 here is reasonably important. Mixing VFP and NEON +; instructions is bad on some cores + ; CHECK: vld1 ; CHECK: vmovl.s8 ; CHECK: vmovl.s16 ; CHECK: vrecpe diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll index 61d73c1..c69473f 100644 --- a/test/CodeGen/ARM/vlddup.ll +++ b/test/CodeGen/ARM/vlddup.ll @@ -75,12 +75,12 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind { ret <8 x i8> %tmp5 } -define <4 x i16> @vld2dupi16(i16* %A) nounwind { +define <4 x i16> @vld2dupi16(i8* %A) nounwind { ;CHECK: vld2dupi16: ;Check that a power-of-two alignment smaller than the total size of the memory ;being loaded is ignored. ;CHECK: vld2.16 {d16[], d17[]}, [r0] - %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1 @@ -94,7 +94,8 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind { ;CHECK: vld2dupi16_update: ;CHECK: vld2.16 {d16[], d17[]}, [r1]! 
%A = load i16** %ptr - %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %A2 = bitcast i16* %A to i8* + %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1 @@ -105,11 +106,11 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind { ret <4 x i16> %tmp5 } -define <2 x i32> @vld2dupi32(i32* %A) nounwind { +define <2 x i32> @vld2dupi32(i8* %A) nounwind { ;CHECK: vld2dupi32: ;Check the alignment value. Max for this instruction is 64 bits: ;CHECK: vld2.32 {d16[], d17[]}, [r0, :64] - %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16) + %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16) %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer %tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1 @@ -119,8 +120,8 @@ define <2 x i32> @vld2dupi32(i32* %A) nounwind { } declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly -declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly -declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly +declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly +declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> } @@ -144,11 +145,11 @@ define <8 x i8> @vld3dupi8_update(i8** %ptr, i32 %inc) nounwind { ret <8 x i8> %tmp8 } -define <4 x i16> @vld3dupi16(i16* %A) nounwind { +define <4 x i16> @vld3dupi16(i8* %A) nounwind { ;CHECK: vld3dupi16: ;Check the (default) alignment value. VLD3 does not support alignment. 
;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0] - %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8) + %tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8) %tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1 @@ -161,7 +162,7 @@ define <4 x i16> @vld3dupi16(i16* %A) nounwind { } declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly -declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly +declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @@ -171,7 +172,8 @@ define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind { ;CHECK: vld4dupi16_update: ;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]! %A = load i16** %ptr - %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1) + %A2 = bitcast i16* %A to i8* + %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1) %tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1 @@ -188,12 +190,12 @@ define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind { ret <4 x i16> %tmp11 } -define <2 x i32> @vld4dupi32(i32* %A) nounwind { +define <2 x i32> @vld4dupi32(i8* %A) nounwind { ;CHECK: vld4dupi32: ;Check the alignment value. An 8-byte alignment is allowed here even though ;it is smaller than the total size of the memory being loaded. 
;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0, :64] - %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8) + %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8) %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1 @@ -208,5 +210,5 @@ define <2 x i32> @vld4dupi32(i32* %A) nounwind { ret <2 x i32> %tmp11 } -declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly -declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly +declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly +declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll index 61d89bb..74628f0 100644 --- a/test/CodeGen/ARM/vmul.ll +++ b/test/CodeGen/ARM/vmul.ll @@ -525,3 +525,77 @@ define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) { %3 = extractelement <8 x i16> %2, i32 0 ret i16 %3 } + +; A constant build_vector created for a vmull with half-width elements must +; not introduce illegal types. <rdar://problem/11324364> +define void @vmull_buildvector() nounwind optsize ssp align 2 { +; CHECK: vmull_buildvector +entry: + br i1 undef, label %for.end179, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.loopexit: ; preds = %for.body33, %for.body + br i1 undef, label %for.end179, label %for.body + +for.body: ; preds = %for.cond.loopexit, %for.body.lr.ph + br i1 undef, label %for.cond.loopexit, label %for.body33.lr.ph + +for.body33.lr.ph: ; preds = %for.body + %.sub = select i1 undef, i32 0, i32 undef + br label %for.body33 + +for.body33: ; preds = %for.body33, %for.body33.lr.ph + %add45 = add i32 undef, undef + %vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* undef, i32 1) + %0 = load i32** undef, align 4 + %shuffle.i250 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer + %1 = bitcast <1 x i64> %shuffle.i250 to <8 x i8> + %vmovl.i249 = zext <8 x i8> %1 to <8 x i16> + %shuffle.i246 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer + %shuffle.i240 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> <i32 1> + %2 = bitcast <1 x i64> %shuffle.i240 to <8 x i8> + %3 = bitcast <16 x i8> undef to <2 x i64> + %vmovl.i237 = zext <8 x i8> undef to <8 x i16> + %shuffle.i234 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer + %shuffle.i226 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer + %vmovl.i225 = zext <8 x i8> undef to <8 x i16> + %mul.i223 = mul <8 x i16> %vmovl.i249, %vmovl.i249 + %vshl_n = shl <8 x i16> %mul.i223, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> + %vqsub2.i216 = tail call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind + %mul.i209 = mul <8 x i16> undef, <i16 80, i16 80, i16 
80, i16 80, i16 80, i16 80, i16 80, i16 80> + %vshr_n130 = lshr <8 x i16> undef, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %vshr_n134 = lshr <8 x i16> %mul.i209, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> + %sub.i205 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n130 + %sub.i203 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n134 + %add.i200 = add <8 x i16> %sub.i205, <i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96> + %add.i198 = add <8 x i16> %add.i200, %sub.i203 + %mul.i194 = mul <8 x i16> %add.i198, %vmovl.i237 + %mul.i191 = mul <8 x i16> %vshr_n130, undef + %add.i192 = add <8 x i16> %mul.i191, %mul.i194 + %mul.i187 = mul <8 x i16> %vshr_n134, undef + %add.i188 = add <8 x i16> %mul.i187, %add.i192 + %mul.i185 = mul <8 x i16> undef, undef + %add.i186 = add <8 x i16> %mul.i185, undef + %vrshr_n160 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i188, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>) + %vrshr_n163 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i186, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>) + %mul.i184 = mul <8 x i16> undef, %vrshr_n160 + %mul.i181 = mul <8 x i16> undef, %vmovl.i225 + %add.i182 = add <8 x i16> %mul.i181, %mul.i184 + %vrshr_n170 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i182, <8 x i16> <i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7>) + %vqmovn1.i180 = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %vrshr_n170) nounwind + %4 = bitcast <8 x i8> %vqmovn1.i180 to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %4, <1 x i64> undef, <2 x i32> <i32 0, i32 1> + %5 = bitcast <2 x i64> %shuffle.i to <16 x i8> + store <16 x i8> %5, <16 x i8>* undef, align 16 + %add177 = add nsw i32 undef, 16 + br i1 undef, label %for.body33, label %for.cond.loopexit + +for.end179: ; preds = %for.cond.loopexit, %entry + ret void +} + +declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll index e3372a0..f117ab2 100644 --- a/test/CodeGen/ARM/vst3.ll +++ b/test/CodeGen/ARM/vst3.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm -mattr=+neon -disable-arm-fast-isel -O0 | FileCheck %s +; RUN: llc < %s -march=arm -mattr=+neon -fast-isel=0 -O0 | FileCheck %s define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst3i8: |
