212 files changed, 3425 insertions, 972 deletions
diff --git a/test/CodeGen/X86/2006-10-13-CycleInDAG.ll b/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
index 664da5e..c45469d 100644
--- a/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
@@ -4,7 +4,7 @@
 define void @test() {
 bb.i:
 	%tmp.i660 = load <4 x float>, <4 x float>* null		; <<4 x float>> [#uses=1]
-	call void (i32, ...)* @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
+	call void (i32, ...) @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
 	%tmp152.i = load <4 x i32>, <4 x i32>* null		; <<4 x i32>> [#uses=1]
 	%tmp156.i = bitcast <4 x i32> %tmp152.i to <4 x i32>		; <<4 x i32>> [#uses=1]
 	%tmp175.i = bitcast <4 x float> %tmp.i660 to <4 x i32>		; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll b/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
index 6b062d5..dd67064 100644
--- a/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
+++ b/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
@@ -15,11 +15,11 @@ entry:
 	]
 
 bb:		; preds = %entry
-	%tmp1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([14 x i8], [14 x i8]* @str, i32 0, i64 0) )		; <i32> [#uses=0]
+	%tmp1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([14 x i8], [14 x i8]* @str, i32 0, i64 0) )		; <i32> [#uses=0]
 	ret i32 0
 
 bb2:		; preds = %entry
-	%tmp4 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([13 x i8], [13 x i8]* @str.upgrd.1, i32 0, i64 0) )		; <i32> [#uses=0]
+	%tmp4 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([13 x i8], [13 x i8]* @str.upgrd.1, i32 0, i64 0) )		; <i32> [#uses=0]
 	ret i32 0
 
 UnifiedReturnBlock:		; preds = %entry
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index 9adfff3..b6a8fc0 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -51,7 +51,7 @@ entry:
         %tmp20 = getelementptr { double, double }, { double, double }* %z, i64 0, i32 0             ; <double*> [#uses=1]
         %tmp21 = load double, double* %tmp20            ; <double> [#uses=1]
         %tmp.upgrd.6 = getelementptr [9 x i8], [9 x i8]* @str, i32 0, i64 0               ; <i8*> [#uses=1]
-        %tmp.upgrd.7 = call i32 (i8*, ...)* @printf( i8* %tmp.upgrd.6, double %tmp21, double %tmp19 )           ; <i32> [#uses=0]
+        %tmp.upgrd.7 = call i32 (i8*, ...) @printf( i8* %tmp.upgrd.6, double %tmp21, double %tmp19 )           ; <i32> [#uses=0]
         br label %finish
 finish:
         %retval.upgrd.8 = load i32, i32* %retval             ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-12-19-IntelSyntax.ll b/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
index f81b303..2c3c5c9 100644
--- a/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
+++ b/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
@@ -21,55 +21,55 @@ entry:
 	]
 
 bb:		; preds = %entry
-	call void (...)* @foo1( )
+	call void (...) @foo1( )
 	ret void
 
 bb1:		; preds = %entry
-	call void (...)* @foo2( )
+	call void (...) @foo2( )
 	ret void
 
 bb2:		; preds = %entry
-	call void (...)* @foo6( )
+	call void (...) @foo6( )
 	ret void
 
 bb3:		; preds = %entry
-	call void (...)* @foo3( )
+	call void (...) @foo3( )
 	ret void
 
 bb4:		; preds = %entry
-	call void (...)* @foo4( )
+	call void (...) @foo4( )
 	ret void
 
 bb5:		; preds = %entry
-	call void (...)* @foo5( )
+	call void (...) @foo5( )
 	ret void
 
 bb6:		; preds = %entry
-	call void (...)* @foo1( )
+	call void (...) @foo1( )
 	ret void
 
 bb7:		; preds = %entry
-	call void (...)* @foo2( )
+	call void (...) @foo2( )
 	ret void
 
 bb8:		; preds = %entry
-	call void (...)* @foo6( )
+	call void (...) @foo6( )
 	ret void
 
 bb9:		; preds = %entry
-	call void (...)* @foo3( )
+	call void (...) @foo3( )
 	ret void
 
 bb10:		; preds = %entry
-	call void (...)* @foo4( )
+	call void (...) @foo4( )
 	ret void
 
 bb11:		; preds = %entry
-	call void (...)* @foo5( )
+	call void (...) @foo5( )
 	ret void
 
 bb12:		; preds = %entry
-	call void (...)* @foo6( )
+	call void (...) @foo6( )
 	ret void
 }
 
diff --git a/test/CodeGen/X86/2007-02-16-BranchFold.ll b/test/CodeGen/X86/2007-02-16-BranchFold.ll
index 596021a..22e0a4e 100644
--- a/test/CodeGen/X86/2007-02-16-BranchFold.ll
+++ b/test/CodeGen/X86/2007-02-16-BranchFold.ll
@@ -60,7 +60,7 @@ bb.i9.i.i932.ce:		; preds = %newFuncRoot
 	%tmp1.i6.i = getelementptr %struct.operator, %struct.operator* %tmp66.i62.i, i32 0, i32 2		; <i32*> [#uses=1]
 	%tmp2.i7.i = load i32, i32* %tmp1.i6.i		; <i32> [#uses=1]
 	%tmp3.i8.i = load %struct.FILE*, %struct.FILE** @outfile		; <%struct.FILE*> [#uses=1]
-	%tmp5.i9.i = call i32 (%struct.FILE*, i8*, ...)* @fprintf( %struct.FILE* %tmp3.i8.i, i8* getelementptr ([11 x i8], [11 x i8]* @str1, i32 0, i32 0), i32 %tmp2.i7.i )		; <i32> [#uses=0]
+	%tmp5.i9.i = call i32 (%struct.FILE*, i8*, ...) @fprintf( %struct.FILE* %tmp3.i8.i, i8* getelementptr ([11 x i8], [11 x i8]* @str1, i32 0, i32 0), i32 %tmp2.i7.i )		; <i32> [#uses=0]
 	%tmp7.i10.i = getelementptr %struct.operator, %struct.operator* %tmp66.i62.i, i32 0, i32 5		; <i32*> [#uses=1]
 	%tmp8.i11.i = load i32, i32* %tmp7.i10.i		; <i32> [#uses=7]
 	br label %NodeBlock5
diff --git a/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
index 5d2c01a..a9b85b9 100644
--- a/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
@@ -7,7 +7,7 @@
 
 define void @__eprintf(i8* %string, i8* %expression, i32 %line, i8* %filename) {
 	%tmp = load %struct._IO_FILE*, %struct._IO_FILE** @stderr
-	%tmp5 = tail call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf( %struct._IO_FILE* %tmp, i8* %string, i8* %expression, i32 %line, i8* %filename )
+	%tmp5 = tail call i32 (%struct._IO_FILE*, i8*, ...) @fprintf( %struct._IO_FILE* %tmp, i8* %string, i8* %expression, i32 %line, i8* %filename )
 	%tmp6 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr
 	%tmp7 = tail call i32 @fflush( %struct._IO_FILE* %tmp6 )
 	tail call void @abort( )
diff --git a/test/CodeGen/X86/2007-05-05-VecCastExpand.ll b/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
index e6eaa57..0edf139 100644
--- a/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
+++ b/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
@@ -6,7 +6,7 @@
 define void @test() {
 bb.i:
 	%tmp.i660 = load <4 x float>, <4 x float>* null		; <<4 x float>> [#uses=1]
-	call void (i32, ...)* @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
+	call void (i32, ...) @printf( i32 0, i8* getelementptr ([18 x i8], [18 x i8]* @str, i32 0, i64 0), double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00 )
 	%tmp152.i = load <4 x i32>, <4 x i32>* null		; <<4 x i32>> [#uses=1]
 	%tmp156.i = bitcast <4 x i32> %tmp152.i to <4 x i32>		; <<4 x i32>> [#uses=1]
 	%tmp175.i = bitcast <4 x float> %tmp.i660 to <4 x i32>		; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
index ecc5835..9ce5f5a 100644
--- a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
@@ -19,7 +19,7 @@ cond_true109:		; preds = %entry
 
 cond_next164:		; preds = %cond_true109
 	%tmp176 = call signext i16 @GetParamDesc( %struct.XDesc* null, i32 1701999219, i32 1413830740, %struct.XDesc* null ) 
-	call void (i64, i8*, ...)* @r_raise( i64 0, i8* null )
+	call void (i64, i8*, ...) @r_raise( i64 0, i8* null )
 	unreachable
 
 cond_true239:		; preds = %cond_true109
diff --git a/test/CodeGen/X86/2007-07-10-StackerAssert.ll b/test/CodeGen/X86/2007-07-10-StackerAssert.ll
index b19f445..c8660f7 100644
--- a/test/CodeGen/X86/2007-07-10-StackerAssert.ll
+++ b/test/CodeGen/X86/2007-07-10-StackerAssert.ll
@@ -30,7 +30,7 @@ cond_true425:		; preds = %bb383
 	%tmp432 = fsub float %tmp430, %tmp408		; <float> [#uses=1]
 	%tmp432433 = fpext float %tmp432 to double		; <double> [#uses=1]
 	%tmp434435 = fpext float %tmp408 to double		; <double> [#uses=1]
-	call void (i8*, ...)* @PR_LogPrint( i8* getelementptr ([56 x i8], [56 x i8]* @.str97, i32 0, i32 0), double 0.000000e+00, double %tmp434435, double %tmp432433 )
+	call void (i8*, ...) @PR_LogPrint( i8* getelementptr ([56 x i8], [56 x i8]* @.str97, i32 0, i32 0), double 0.000000e+00, double %tmp434435, double %tmp432433 )
 	ret i32 0
 
 cond_next443:		; preds = %bb383
diff --git a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
index f2ae922..c6eb6f0 100644
--- a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
@@ -362,7 +362,7 @@ bb1159:		; preds = %cond_next1150
 
 cond_true1169:		; preds = %bb1159
 	%tmp11741175 = trunc i64 %lsum.11225.0 to i32		; <i32> [#uses=1]
-	%tmp1178 = tail call i32 (%struct._IO_FILE*  , i8*  , ...)* @fprintf( %struct._IO_FILE* noalias %file  , i8* getelementptr ([49 x i8], [49 x i8]* @.str32, i32 0, i64 0)  , i32 %tmp11741175, i32 0 )		; <i32> [#uses=0]
+	%tmp1178 = tail call i32 (%struct._IO_FILE*  , i8*  , ...) @fprintf( %struct._IO_FILE* noalias %file  , i8* getelementptr ([49 x i8], [49 x i8]* @.str32, i32 0, i64 0)  , i32 %tmp11741175, i32 0 )		; <i32> [#uses=0]
 	ret void
 
 UnifiedReturnBlock:		; preds = %bb1159
diff --git a/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll b/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll
index 019c442..a20fb47 100644
--- a/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll
+++ b/test/CodeGen/X86/2007-11-04-LiveIntervalCrash.ll
@@ -30,7 +30,7 @@ bb37:           ; preds = %bb37.loopexit, %entry
         %hash.0.reg2mem.1 = phi i32 [ %phitmp, %bb37.loopexit ], [ 0, %entry ]          ; <i32> [#uses=1]
         store i32 %hash.0.reg2mem.1, i32* null, align 8
         %tmp75 = tail call i32 null( %struct.dentry* %dir, %struct.qstr* %name )                ; <i32> [#uses=0]
-        %tmp84 = tail call i32 (...)* @d_lookup( %struct.dentry* %dir, %struct.qstr* %name )            ; <i32> [#uses=0]
+        %tmp84 = tail call i32 (...) @d_lookup( %struct.dentry* %dir, %struct.qstr* %name )            ; <i32> [#uses=0]
         ret %struct.dentry* null
 }
 
diff --git a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
index efb87f2..ef69bd0 100644
--- a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
+++ b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
@@ -213,7 +213,7 @@ bb456:		; preds = %bb448, %bb425, %bb417, %bb395, %bb385, %bb371
 	%tmp460461 = fpext float %iftmp.7.0 to double		; <double> [#uses=1]
 	%tmp462463 = fpext float %iftmp.14.0 to double		; <double> [#uses=1]
 	%tmp464465 = fpext float %iftmp.0.0 to double		; <double> [#uses=1]
-	%tmp467 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([48 x i8], [48 x i8]* @.str, i32 0, i32 0), double %tmp464465, double %tmp462463, double %tmp460461, double %tmp458459 ) nounwind 		; <i32> [#uses=0]
+	%tmp467 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([48 x i8], [48 x i8]* @.str, i32 0, i32 0), double %tmp464465, double %tmp462463, double %tmp460461, double %tmp458459 ) nounwind 		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/2008-04-09-BranchFolding.ll b/test/CodeGen/X86/2008-04-09-BranchFolding.ll
index a758fed..f21a6f3 100644
--- a/test/CodeGen/X86/2008-04-09-BranchFolding.ll
+++ b/test/CodeGen/X86/2008-04-09-BranchFolding.ll
@@ -39,7 +39,7 @@ bb226.i:		; preds = %bb73.i
 bb273.i:		; preds = %bb226.i
 	ret %struct.tree_node* null
 bb260:		; preds = %bb226.i
-	tail call void (i8*, i32, ...)* @pedwarn_with_file_and_line( i8* %file.0, i32 %line.0, i8* null ) nounwind 
+	tail call void (i8*, i32, ...) @pedwarn_with_file_and_line( i8* %file.0, i32 %line.0, i8* null ) nounwind 
 	ret %struct.tree_node* null
 bb344:		; preds = %bb174
 	ret %struct.tree_node* null
diff --git a/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll b/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll
index f83c990..b526591 100644
--- a/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll
+++ b/test/CodeGen/X86/2008-04-15-LiveVariableBug.ll
@@ -43,7 +43,7 @@ entry:
 	%tmp105 = load %struct.NSArray*, %struct.NSArray** null, align 8		; <%struct.NSArray*> [#uses=1]
 	%tmp107 = load %struct.NSObject*, %struct.NSObject** null, align 8		; <%struct.NSObject*> [#uses=1]
 	call void null( %struct.NSObject* %tmp107, %struct._message_ref_t* @"\01L_OBJC_MESSAGE_REF_228", %struct.NSArray* %tmp105, i8 signext  0 )
-	%tmp111 = call %struct.NSObject* (%struct.NSObject*, %struct.objc_selector*, ...)* @objc_msgSend( %struct.NSObject* null, %struct.objc_selector* null, i32 0, i8* null )		; <%struct.NSObject*> [#uses=0]
+	%tmp111 = call %struct.NSObject* (%struct.NSObject*, %struct.objc_selector*, ...) @objc_msgSend( %struct.NSObject* null, %struct.objc_selector* null, i32 0, i8* null )		; <%struct.NSObject*> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/2008-05-12-tailmerge-5.ll b/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
index df5ceb0..0669a32 100644
--- a/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
+++ b/test/CodeGen/X86/2008-05-12-tailmerge-5.ll
@@ -64,7 +64,7 @@ entry:
 	br i1 %toBool, label %bb, label %bb27
 
 bb:		; preds = %entry
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb27:		; preds = %entry
@@ -77,7 +77,7 @@ bb27:		; preds = %entry
 	br i1 %toBool33, label %bb34, label %bb35
 
 bb34:		; preds = %bb27
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb35:		; preds = %bb27
@@ -98,7 +98,7 @@ bb35:		; preds = %bb27
 	br i1 %toBool49, label %bb50, label %bb51
 
 bb50:		; preds = %bb35
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb51:		; preds = %bb35
@@ -119,7 +119,7 @@ bb51:		; preds = %bb35
 	br i1 %toBool65, label %bb66, label %bb67
 
 bb66:		; preds = %bb51
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb67:		; preds = %bb51
@@ -132,7 +132,7 @@ bb67:		; preds = %bb51
 	br i1 %toBool73, label %bb74, label %bb75
 
 bb74:		; preds = %bb67
-	call void (...)* @abort( ) noreturn nounwind 
+	call void (...) @abort( ) noreturn nounwind 
 	unreachable
 
 bb75:		; preds = %bb67
diff --git a/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll b/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll
index 42752eb..a1b9d9d 100644
--- a/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2008-07-16-CoalescerCrash.ll
@@ -26,7 +26,7 @@ bb31:		; preds = %bb6
 	br label %bb33
 
 bb33:		; preds = %bb31, %bb
-	tail call void (%struct.SV*, i8*, ...)* @Perl_sv_catpvf( %struct.SV* %dsv, i8* getelementptr ([8 x i8], [8 x i8]* @"\01LC25", i32 0, i64 0), i64 %0 ) nounwind 
+	tail call void (%struct.SV*, i8*, ...) @Perl_sv_catpvf( %struct.SV* %dsv, i8* getelementptr ([8 x i8], [8 x i8]* @"\01LC25", i32 0, i64 0), i64 %0 ) nounwind 
 	unreachable
 
 bb40:		; preds = %entry
diff --git a/test/CodeGen/X86/2008-08-06-CmpStride.ll b/test/CodeGen/X86/2008-08-06-CmpStride.ll
index 3a74b48..a030fbe 100644
--- a/test/CodeGen/X86/2008-08-06-CmpStride.ll
+++ b/test/CodeGen/X86/2008-08-06-CmpStride.ll
@@ -13,7 +13,7 @@ forbody:
         %sub14 = sub i32 1027, %i.0             ; <i32> [#uses=1]
         %mul15 = mul i32 %sub14, 10             ; <i32> [#uses=1]
         %add166 = or i32 %mul15, 1              ; <i32> [#uses=1] *
-        call i32 (i8*, ...)* @printf( i8* noalias  getelementptr ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind
+        call i32 (i8*, ...) @printf( i8* noalias  getelementptr ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind
         %inc = add i32 %i.0, 1          ; <i32> [#uses=3]
         %cmp = icmp ne i32 %inc, 1027          ; <i1> [#uses=1]
         br i1 %cmp, label %forbody, label %afterfor
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
index d939207..2910902 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
@@ -37,7 +37,7 @@ if.then:                                          ; preds = %entry
   ret i32 0
 
 if.end:                                           ; preds = %entry
-  %call = tail call i32 (...)* @_Unwind_ForcedUnwind_Phase2() nounwind
+  %call = tail call i32 (...) @_Unwind_ForcedUnwind_Phase2() nounwind
   store i32 %call, i32* @a, align 4
   %tobool1 = icmp eq i32 %call, 0
   br i1 %tobool1, label %cond.end, label %cond.true
diff --git a/test/CodeGen/X86/2008-09-09-LinearScanBug.ll b/test/CodeGen/X86/2008-09-09-LinearScanBug.ll
index c80fbdd..9a1a3dd 100644
--- a/test/CodeGen/X86/2008-09-09-LinearScanBug.ll
+++ b/test/CodeGen/X86/2008-09-09-LinearScanBug.ll
@@ -58,7 +58,7 @@ ifend.i:		; preds = %lor_rhs.i
 safe_mod_int16_t_s_s.exit:		; preds = %ifend.i, %lor_rhs.i, %func_106.exit27
 	%call31 = phi i16 [ %conv8.i, %ifend.i ], [ %conv, %func_106.exit27 ], [ %conv, %lor_rhs.i ]		; <i16> [#uses=1]
 	%conv4 = sext i16 %call31 to i32		; <i32> [#uses=1]
-	%call5 = tail call i32 (...)* @func_104( i32 %conv4 )		; <i32> [#uses=0]
+	%call5 = tail call i32 (...) @func_104( i32 %conv4 )		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
index 635194f..8c46bb3 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
@@ -9,7 +9,7 @@ entry:
 	%1 = load i16, i16* @g_15, align 2		; <i16> [#uses=1]
 	%2 = zext i16 %1 to i32		; <i32> [#uses=1]
 	%3 = and i32 %2, 1		; <i32> [#uses=1]
-	%4 = tail call i32 (...)* @rshift_u_s( i32 1 ) nounwind		; <i32> [#uses=1]
+	%4 = tail call i32 (...) @rshift_u_s( i32 1 ) nounwind		; <i32> [#uses=1]
 	%5 = icmp slt i32 %4, 2		; <i1> [#uses=1]
 	%6 = zext i1 %5 to i32		; <i32> [#uses=1]
 	%7 = icmp sge i32 %3, %6		; <i1> [#uses=1]
@@ -17,7 +17,7 @@ entry:
 	%9 = load i16, i16* @g_15, align 2		; <i16> [#uses=1]
 	%10 = icmp eq i16 %9, 0		; <i1> [#uses=1]
 	%11 = zext i1 %10 to i32		; <i32> [#uses=1]
-	%12 = tail call i32 (...)* @func_20( i32 1 ) nounwind		; <i32> [#uses=1]
+	%12 = tail call i32 (...) @func_20( i32 1 ) nounwind		; <i32> [#uses=1]
 	%13 = icmp sge i32 %11, %12		; <i1> [#uses=1]
 	%14 = zext i1 %13 to i32		; <i32> [#uses=1]
 	%15 = sub i32 %8, %14		; <i32> [#uses=1]
@@ -27,7 +27,7 @@ entry:
 	%or.cond = or i1 false, %18		; <i1> [#uses=1]
 	%19 = select i1 %or.cond, i32 0, i32 %0		; <i32> [#uses=1]
 	%.0 = lshr i32 %17, %19		; <i32> [#uses=1]
-	%20 = tail call i32 (...)* @func_7( i32 %.0 ) nounwind		; <i32> [#uses=0]
+	%20 = tail call i32 (...) @func_7( i32 %.0 ) nounwind		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
index 92eb1c8..757dff4 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
@@ -38,7 +38,7 @@ bb12:		; preds = %bb11, %entry
 	%.014.in = phi i8 [ %10, %bb11 ], [ %7, %entry ]		; <i8> [#uses=1]
 	%11 = icmp ne i8 %.014.in, 0		; <i1> [#uses=1]
 	%12 = zext i1 %11 to i32		; <i32> [#uses=1]
-	%13 = tail call i32 (...)* @func_48( i32 %12, i32 %3, i32 0 ) nounwind		; <i32> [#uses=0]
+	%13 = tail call i32 (...) @func_48( i32 %12, i32 %3, i32 0 ) nounwind		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-10-11-CallCrash.ll b/test/CodeGen/X86/2008-10-11-CallCrash.ll
index a859bc6..9ad7ab2 100644
--- a/test/CodeGen/X86/2008-10-11-CallCrash.ll
+++ b/test/CodeGen/X86/2008-10-11-CallCrash.ll
@@ -6,13 +6,13 @@ target triple = "i386-apple-darwin7"
 
 define i32 @func_45(i64 %p_46, i32 %p_48) nounwind {
 entry:
-	%0 = tail call i32 (...)* @lshift_s_u(i64 %p_46, i64 0) nounwind		; <i32> [#uses=0]
+	%0 = tail call i32 (...) @lshift_s_u(i64 %p_46, i64 0) nounwind		; <i32> [#uses=0]
 	%1 = load i32, i32* @g_385, align 4		; <i32> [#uses=1]
 	%2 = shl i32 %1, 1		; <i32> [#uses=1]
 	%3 = and i32 %2, 32		; <i32> [#uses=1]
-	%4 = tail call i32 (...)* @func_87(i32 undef, i32 %p_48, i32 1) nounwind		; <i32> [#uses=1]
+	%4 = tail call i32 (...) @func_87(i32 undef, i32 %p_48, i32 1) nounwind		; <i32> [#uses=1]
 	%5 = add i32 %3, %4		; <i32> [#uses=1]
-	%6 = tail call i32 (...)* @div_rhs(i32 %5) nounwind		; <i32> [#uses=0]
+	%6 = tail call i32 (...) @div_rhs(i32 %5) nounwind		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2008-10-13-CoalescerBug.ll b/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
index 4d3f8c2..c285ae4 100644
--- a/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
@@ -3,7 +3,7 @@
 
 define i32 @func_77(i8 zeroext %p_79) nounwind {
 entry:
-	%0 = tail call i32 (...)* @func_43(i32 1) nounwind		; <i32> [#uses=1]
+	%0 = tail call i32 (...) @func_43(i32 1) nounwind		; <i32> [#uses=1]
 	%1 = icmp eq i32 %0, 0		; <i1> [#uses=1]
 	br i1 %1, label %bb3, label %bb
 
@@ -14,7 +14,7 @@ bb3:		; preds = %bb, %entry
 	%p_79_addr.0 = phi i8 [ 0, %bb ], [ %p_79, %entry ]		; <i8> [#uses=1]
 	%2 = zext i8 %p_79_addr.0 to i32		; <i32> [#uses=2]
 	%3 = zext i1 false to i32		; <i32> [#uses=2]
-	%4 = tail call i32 (...)* @rshift_u_s(i32 1) nounwind		; <i32> [#uses=0]
+	%4 = tail call i32 (...) @rshift_u_s(i32 1) nounwind		; <i32> [#uses=0]
 	%5 = lshr i32 %2, %2		; <i32> [#uses=3]
 	%6 = icmp eq i32 0, 0		; <i1> [#uses=1]
 	br i1 %6, label %bb6, label %bb9
diff --git a/test/CodeGen/X86/2008-11-06-testb.ll b/test/CodeGen/X86/2008-11-06-testb.ll
index 4ee4b4a..c8fad06 100644
--- a/test/CodeGen/X86/2008-11-06-testb.ll
+++ b/test/CodeGen/X86/2008-11-06-testb.ll
@@ -18,7 +18,7 @@ entry:
 	br i1 %4, label %bb5, label %bb
 
 bb:		; preds = %entry
-	%5 = tail call i32 (...)* @xx() nounwind		; <i32> [#uses=1]
+	%5 = tail call i32 (...) @xx() nounwind		; <i32> [#uses=1]
 	ret i32 %5
 
 bb5:		; preds = %entry
diff --git a/test/CodeGen/X86/2008-11-29-ULT-Sign.ll b/test/CodeGen/X86/2008-11-29-ULT-Sign.ll
index 6dca141..03442d6 100644
--- a/test/CodeGen/X86/2008-11-29-ULT-Sign.ll
+++ b/test/CodeGen/X86/2008-11-29-ULT-Sign.ll
@@ -8,7 +8,7 @@ entry:
 	br i1 %cmp, label %if.end, label %if.then
 
 if.then:		; preds = %entry
-	%call = call i32 (...)* @b()		; <i32> [#uses=0]
+	%call = call i32 (...) @b()		; <i32> [#uses=0]
 	br label %if.end
 
 if.end:		; preds = %if.then, %entry
diff --git a/test/CodeGen/X86/2008-12-01-SpillerAssert.ll b/test/CodeGen/X86/2008-12-01-SpillerAssert.ll
index 105489e..cf292e3 100644
--- a/test/CodeGen/X86/2008-12-01-SpillerAssert.ll
+++ b/test/CodeGen/X86/2008-12-01-SpillerAssert.ll
@@ -10,6 +10,6 @@ declare i32 @printk(i8*, ...)
 define void @display_cacheinfo(%struct.cpuinfo_x86* %c) nounwind section ".cpuinit.text" {
 entry:
         %asmtmp = tail call { i32, i32, i32, i32 } asm "cpuid", "={ax},={bx},={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 -2147483643, i32 0) nounwind          ; <{ i32, i32, i32, i32 }> [#uses=0]
-        %0 = tail call i32 (i8*, ...)* @printk(i8* getelementptr ([70 x i8], [70 x i8]* @.str10, i32 0, i64 0), i32 0, i32 0, i32 0, i32 0) nounwind           ; <i32> [#uses=0]
+        %0 = tail call i32 (i8*, ...) @printk(i8* getelementptr ([70 x i8], [70 x i8]* @.str10, i32 0, i64 0), i32 0, i32 0, i32 0, i32 0) nounwind           ; <i32> [#uses=0]
         unreachable
 }
diff --git a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
index 7ac2cd2..6bb29fd 100644
--- a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
+++ b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
@@ -16,7 +16,7 @@ entry:
 	%1 = trunc i64 %u to i32		; <i32> [#uses=4]
 	%2 = lshr i64 %u, 32		; <i64> [#uses=1]
 	%3 = trunc i64 %2 to i32		; <i32> [#uses=2]
-	%4 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i32 0), i32 %1) nounwind		; <i32> [#uses=0]
+	%4 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i32 0), i32 %1) nounwind		; <i32> [#uses=0]
 	%5 = icmp ult i32 %1, %0		; <i1> [#uses=1]
 	br i1 %5, label %bb2, label %bb
 
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index db31333..172a00a 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machine-licm"
 ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
@@ -21,9 +21,9 @@ bb4:		; preds = %bb.i, %bb26, %bb4, %entry
 ; CHECK: xorl
 ; CHECK: movq
 
-	%0 = call i32 (...)* @xxGetOffsetForCode(i32 undef) nounwind		; <i32> [#uses=0]
+	%0 = call i32 (...) @xxGetOffsetForCode(i32 undef) nounwind		; <i32> [#uses=0]
 	%ins = or i64 %p, 2097152		; <i64> [#uses=1]
-	%1 = call i32 (...)* @xxCalculateMidType(%struct.Key* %desc, i32 0) nounwind		; <i32> [#uses=1]
+	%1 = call i32 (...) @xxCalculateMidType(%struct.Key* %desc, i32 0) nounwind		; <i32> [#uses=1]
 	%cond = icmp eq i32 %1, 1		; <i1> [#uses=1]
 	br i1 %cond, label %bb26, label %bb4
 
diff --git a/test/CodeGen/X86/2009-03-25-TestBug.ll b/test/CodeGen/X86/2009-03-25-TestBug.ll
index 79c0863..367a6d2 100644
--- a/test/CodeGen/X86/2009-03-25-TestBug.ll
+++ b/test/CodeGen/X86/2009-03-25-TestBug.ll
@@ -15,11 +15,11 @@ bb1579.i.i:		; preds = %bb1514.i.i, %bb191.i.i
         br i1 %tmp178, label %hello, label %world
 
 hello:
-	%h = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([7 x i8], [7 x i8]* @hello, i32 0, i32 0))
+	%h = tail call i32 (i8*, ...) @printf( i8* getelementptr ([7 x i8], [7 x i8]* @hello, i32 0, i32 0))
         ret void
 
 world:
-	%w = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([7 x i8], [7 x i8]* @world, i32 0, i32 0))
+	%w = tail call i32 (i8*, ...) @printf( i8* getelementptr ([7 x i8], [7 x i8]* @world, i32 0, i32 0))
         ret void
 }
 
diff --git a/test/CodeGen/X86/2009-04-13-2AddrAssert.ll b/test/CodeGen/X86/2009-04-13-2AddrAssert.ll
index 4362ba4..a3607c6 100644
--- a/test/CodeGen/X86/2009-04-13-2AddrAssert.ll
+++ b/test/CodeGen/X86/2009-04-13-2AddrAssert.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-undermydesk-freebsd8.0"
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
-        %call = tail call i32 (...)* @getpid()          ; <i32> [#uses=1]
+        %call = tail call i32 (...) @getpid()          ; <i32> [#uses=1]
         %conv = trunc i32 %call to i16          ; <i16> [#uses=1]
         %0 = tail call i16 asm "xchgb ${0:h}, ${0:b}","=Q,0,~{dirflag},~{fpsr},~{flags}"(i16 %conv) nounwind           ; <i16> [#uses=0]
         ret i32 undef
diff --git a/test/CodeGen/X86/2009-04-14-IllegalRegs.ll b/test/CodeGen/X86/2009-04-14-IllegalRegs.ll
index 1e5e933..8055ea8 100644
--- a/test/CodeGen/X86/2009-04-14-IllegalRegs.ll
+++ b/test/CodeGen/X86/2009-04-14-IllegalRegs.ll
@@ -21,7 +21,7 @@ entry:
 	store i8 %5, i8* %7, align 1
 	%8 = getelementptr %struct.X, %struct.X* %xxx, i32 0, i32 0		; <i8*> [#uses=1]
 	store i8 15, i8* %8, align 1
-	%9 = call i32 (...)* bitcast (i32 (%struct.X*, %struct.X*)* @f to i32 (...)*)(%struct.X* byval align 4 %xxx, %struct.X* byval align 4 %xxx) nounwind		; <i32> [#uses=1]
+	%9 = call i32 (...) bitcast (i32 (%struct.X*, %struct.X*)* @f to i32 (...)*)(%struct.X* byval align 4 %xxx, %struct.X* byval align 4 %xxx) nounwind		; <i32> [#uses=1]
 	store i32 %9, i32* %0, align 4
 	%10 = load i32, i32* %0, align 4		; <i32> [#uses=1]
 	store i32 %10, i32* %retval, align 4
diff --git a/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll b/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
index 6e062fb..89cd24d 100644
--- a/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
+++ b/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
@@ -7,7 +7,7 @@ entry:
         %tmp5.i = extractelement <1 x i64> %a, i32 0
         %tmp11 = bitcast i64 %tmp5.i to <1 x i64>
         %tmp8 = extractelement <1 x i64> %tmp11, i32 0
-        %call6 = call i32 (i64)* @foo(i64 %tmp8)
+        %call6 = call i32 (i64) @foo(i64 %tmp8)
         ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll b/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
index fac6a66..45e770f 100644
--- a/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
+++ b/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
@@ -53,7 +53,7 @@ bb5:                                              ; preds = %bb4, %bb3
 bb6.preheader:                                    ; preds = %bb5
   %21 = sext i8 %p_52 to i32                      ; <i32> [#uses=1]
   %22 = load volatile i32, i32* @uint8, align 4        ; <i32> [#uses=0]
-  %23 = tail call i32 (...)* @safefuncts(i32 %21, i32 1) nounwind; <i32> [#uses=0]
+  %23 = tail call i32 (...) @safefuncts(i32 %21, i32 1) nounwind; <i32> [#uses=0]
   unreachable
 
 return:                                           ; preds = %bb5
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index c783ee9..374c696 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !MDExpression()), !dbg !MDLocation(scope: !5)
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index e0fd9b0..b03556a 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -51,5 +51,5 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !17 = distinct !MDLexicalBlock(line: 11, column: 0, file: !19, scope: !1)
 !18 = !{!1}
 !19 = !MDFile(filename: "b2.c", directory: "/tmp/")
-!20 = !{i32 0}
+!20 = !{}
 !21 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index ced3708..7c0fd68 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -8,7 +8,7 @@
 
 define i32 @"main(tart.core.String[])->int32"(i32 %args) {
 entry:
-  tail call void @llvm.dbg.value(metadata %tart.reflect.ComplexType* @.type.SwitchStmtTest, i64 0, metadata !8, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %tart.reflect.ComplexType* @.type.SwitchStmtTest, i64 0, metadata !8, metadata !MDExpression()), !dbg !MDLocation(scope: !9)
   tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2]
   ret i32 3
 }
diff --git a/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll b/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
index 4e4e006..6fe31b6 100644
--- a/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
+++ b/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
@@ -23,7 +23,7 @@ for.body:                                         ; preds = %if.end40, %entry
 
 if.then:                                          ; preds = %for.body
   %conv18 = sext i8 %tmp6 to i32                  ; <i32> [#uses=1]
-  %call = tail call i32 (...)* @invalid(i32 0, i32 0, i32 %conv18) nounwind ; <i32> [#uses=0]
+  %call = tail call i32 (...) @invalid(i32 0, i32 0, i32 %conv18) nounwind ; <i32> [#uses=0]
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
@@ -34,7 +34,7 @@ if.end:                                           ; preds = %if.then, %for.body
 
 if.then36:                                        ; preds = %if.end
   %conv38 = sext i8 %tmp24 to i32                 ; <i32> [#uses=1]
-  %call39 = tail call i32 (...)* @invalid(i32 0, i32 0, i32 %conv38) nounwind ; <i32> [#uses=0]
+  %call39 = tail call i32 (...) @invalid(i32 0, i32 0, i32 %conv38) nounwind ; <i32> [#uses=0]
   br label %if.end40
 
 if.end40:                                         ; preds = %if.then36, %if.end
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 43f05ca..29df291 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -10,10 +10,10 @@
 
 define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata float %a, i64 0, metadata !0, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata float %b, i64 0, metadata !11, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata float %c, i64 0, metadata !12, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata float %d, i64 0, metadata !13, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata float %a, i64 0, metadata !0, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  tail call void @llvm.dbg.value(metadata float %b, i64 0, metadata !11, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  tail call void @llvm.dbg.value(metadata float %c, i64 0, metadata !12, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  tail call void @llvm.dbg.value(metadata float %d, i64 0, metadata !13, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
   %0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %2 = fcmp olt float %0, %1, !dbg !19            ; <i1> [#uses=1]
@@ -247,5 +247,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !44 = !{!1}
 !45 = !MDFile(filename: "libgcc2.c", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
 !46 = !MDFile(filename: "libgcc2.h", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
-!47 = !{i32 0}
+!47 = !{}
 !48 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index b8f7ba2..fe68711 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10"
 
 define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata %struct.a* %myvar, i64 0, metadata !8, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.a* %myvar, i64 0, metadata !8, metadata !MDExpression()), !dbg !MDLocation(scope: !9)
   %0 = getelementptr inbounds %struct.a, %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1]
   %1 = load i32, i32* %0, align 8, !dbg !28            ; <i32> [#uses=1]
   tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index f2e8dbd..097cd24 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -4,8 +4,8 @@
 
 define i32 @foo(i32 %y) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !0, metadata !MDExpression())
-  %0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
+  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !0, metadata !MDExpression()), !dbg !MDLocation(scope: !1)
+  %0 = tail call i32 (...) @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
   ret i32 %0, !dbg !9
 }
 
@@ -15,9 +15,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 
 define i32 @bar(i32 %x) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !7, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !MDExpression()) nounwind
-  %0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !7, metadata !MDExpression()), !dbg !MDLocation(scope: !8)
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !MDExpression()) nounwind, !dbg !MDLocation(scope: !1)
+  %0 = tail call i32 (...) @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
   %1 = add nsw i32 %0, %x, !dbg !13               ; <i32> [#uses=1]
   ret i32 %1, !dbg !13
 }
@@ -44,7 +44,7 @@ entry:
 !16 = !{!7}
 !17 = !{!1, !8}
 !18 = !MDFile(filename: "f.c", directory: "/tmp")
-!19 = !{i32 0}
+!19 = !{}
 
 ;CHECK: DEBUG_VALUE: bar:x <- E
 ;CHECK: Ltmp
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index b0185ba..942faf4 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -10,8 +10,8 @@ target triple = "x86_64-apple-darwin10.2"
 define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 {
 ;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}}
 entry:
-  tail call void @llvm.dbg.value(metadata %struct.foo* %this, i64 0, metadata !15, metadata !MDExpression())
-  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !16, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.foo* %this, i64 0, metadata !15, metadata !MDExpression()), !dbg !MDLocation(scope: !8)
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !16, metadata !MDExpression()), !dbg !MDLocation(scope: !8)
   %0 = mul nsw i32 %x, 7, !dbg !29                ; <i32> [#uses=1]
   %1 = add nsw i32 %0, 1, !dbg !29                ; <i32> [#uses=1]
   ret i32 %1, !dbg !29
@@ -55,6 +55,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !29 = !MDLocation(line: 16, scope: !30)
 !30 = distinct !MDLexicalBlock(line: 15, column: 0, file: !31, scope: !8)
 !31 = !MDFile(filename: "foo.cp", directory: "/tmp/")
-!32 = !{i32 0}
+!32 = !{}
 !33 = !{!1, !8, !18}
 !34 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 198eb31..0b1c36f 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -18,7 +18,7 @@ entry:
   %0 = call i32 asm "bsr   $1, $0\0A\09cmovz $2, $0", "=&r,ro,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i32 %zero, i32 -1) nounwind, !srcloc !0 ; <i32> [#uses=1]
   store i32 %0, i32* %v
   %tmp = load i32, i32* %v                             ; <i32> [#uses=1]
-  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 %tmp) ; <i32> [#uses=0]
+  %call1 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 %tmp) ; <i32> [#uses=0]
   store i32 0, i32* %retval
   %1 = load i32, i32* %retval                          ; <i32> [#uses=1]
   ret i32 %0
diff --git a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
index 1a05d0a..ab9715d 100644
--- a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
+++ b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
@@ -29,7 +29,7 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %entry.if.end_crit_edge, %if.then
   %tmp4 = phi i32 [ %tmp4.pre, %entry.if.end_crit_edge ], [ 1, %if.then ] ; <i32> [#uses=1]
-  %call5 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %tmp4) nounwind ; <i32> [#uses=0]
+  %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %tmp4) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 6bd1217..d3ad860 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -125,5 +125,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !44 = !MDLocalVariable(tag: DW_TAG_auto_variable, name: "k", line: 26, scope: !39, file: !2, type: !13)
 !45 = !MDLocation(line: 27, scope: !39)
 !47 = !MDFile(filename: "small.cc", directory: "/Users/manav/R8248330")
-!48 = !{i32 0}
+!48 = !{}
 !49 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index fa4fd75..30abdc5 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -31,5 +31,5 @@ entry:
 !13 = !{!0, !6}
 !14 = !MDFile(filename: "", directory: "/private/tmp")
 !15 = !MDFile(filename: "bug.c", directory: "/private/tmp")
-!16 = !{i32 0}
+!16 = !{}
 !17 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index 783b34d..d920513 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -36,5 +36,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !15 = !{!0}
 !16 = !{!6}
 !17 = !MDFile(filename: "one.c", directory: "/private/tmp")
-!18 = !{i32 0}
+!18 = !{}
 !19 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index c6a3a78..c02bd2d 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -60,7 +60,7 @@ cond.end:                                         ; preds = %entry, %cond.true
 
 if.then:                                          ; preds = %cond.end
   %puts = tail call i32 @puts(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @str, i64 0, i64 0))
-  %call12 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i64 0, i64 0), i32 %call, i32 %cond) nounwind optsize, !dbg !26
+  %call12 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i64 0, i64 0), i32 %call, i32 %cond) nounwind optsize, !dbg !26
   ret i32 1, !dbg !27
 
 return:                                           ; preds = %cond.end
@@ -110,5 +110,5 @@ declare i32 @puts(i8* nocapture) nounwind
 !29 = !{!10, !11, !12}
 !30 = !{!14, !17}
 !31 = !MDFile(filename: "rem_small.c", directory: "/private/tmp")
-!32 = !{i32 0}
+!32 = !{}
 !33 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2011-02-23-UnfoldBug.ll b/test/CodeGen/X86/2011-02-23-UnfoldBug.ll
index 900106a..90b90d7 100644
--- a/test/CodeGen/X86/2011-02-23-UnfoldBug.ll
+++ b/test/CodeGen/X86/2011-02-23-UnfoldBug.ll
@@ -22,7 +22,7 @@ for.body33.lr.ph:                                 ; preds = %for.body
 for.end:                                          ; preds = %for.body
   %vecins.i94 = insertelement <2 x double> undef, double 0.000000e+00, i32 0
   %cmpsd.i = tail call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %vecins.i94, <2 x double> <double 0x3FE984B204153B34, double 0x3FE984B204153B34>, i8 2) nounwind
-  tail call void (...)* @_mm_movemask_pd(<2 x double> %cmpsd.i) nounwind
+  tail call void (...) @_mm_movemask_pd(<2 x double> %cmpsd.i) nounwind
   br i1 undef, label %if.then67, label %if.end71
 
 if.then67:                                        ; preds = %for.end
diff --git a/test/CodeGen/X86/2011-03-02-DAGCombiner.ll b/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
index 86e579a..d25fbf7 100644
--- a/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
+++ b/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
@@ -43,7 +43,7 @@ entry:
   %14 = and i32 %13, -129
   %15 = or i32 %14, %12
   store i32 %15, i32* %10, align 4
-  %call = call i32 (...)* @iequals(i32 1841, i32 %bf.value, i32 0)
+  %call = call i32 (...) @iequals(i32 1841, i32 %bf.value, i32 0)
   %16 = load i32, i32* %retval
   ret i32 %16
 }
diff --git a/test/CodeGen/X86/2011-09-14-valcoalesce.ll b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
index a086a79..b8e5100 100644
--- a/test/CodeGen/X86/2011-09-14-valcoalesce.ll
+++ b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
@@ -144,7 +144,7 @@ if.end117.i:                                      ; preds = %if.then108.i, %land
   br i1 undef, label %if.then122.i, label %for.cond138.preheader.i
 
 if.then122.i:                                     ; preds = %if.end117.i
-  call void (...)* @fprintf(i32 undef, i32 %gs.0526.i, i32 %ge.1.i, i32 %aFreq.1.i, double undef) nounwind
+  call void (...) @fprintf(i32 undef, i32 %gs.0526.i, i32 %ge.1.i, i32 %aFreq.1.i, double undef) nounwind
   br label %for.cond138.preheader.i
 
 for.cond138.preheader.i:                          ; preds = %if.then122.i, %if.end117.i
diff --git a/test/CodeGen/X86/2011-10-12-MachineCSE.ll b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
index 5018db7..341a14b 100644
--- a/test/CodeGen/X86/2011-10-12-MachineCSE.ll
+++ b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
@@ -102,7 +102,7 @@ if.end:                                           ; preds = %lor.lhs.false23
   %arrayidx38 = getelementptr inbounds [0 x %struct.insn_data], [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom37
   %genfun = getelementptr inbounds %struct.insn_data, %struct.insn_data* %arrayidx38, i32 0, i32 2
   %23 = load %struct.rtx_def* (%struct.rtx_def*, ...)*, %struct.rtx_def* (%struct.rtx_def*, ...)** %genfun, align 8
-  %call39 = tail call %struct.rtx_def* (%struct.rtx_def*, ...)* %23(%struct.rtx_def* %r0, %struct.rtx_def* %r1, %struct.rtx_def* %c)
+  %call39 = tail call %struct.rtx_def* (%struct.rtx_def*, ...) %23(%struct.rtx_def* %r0, %struct.rtx_def* %r1, %struct.rtx_def* %c)
   br label %return
 
 return:                                           ; preds = %if.end, %if.then
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index da3c322..07dff95 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -26,7 +26,7 @@ entry:
 }
 
 ; CHECK-LABEL: zero_test
-; CHECK: pxor %xmm0, %xmm0
+; CHECK: xorps %xmm0, %xmm0
 ; CHECK: ret
 
 define void @zero_test() {
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 75409f2..677c902 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -1,12 +1,25 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
 
-; CHECK: endless_loop
 define void @endless_loop() {
+; CHECK-LABEL: endless_loop:
+; CHECK-NEXT:     # BB#0:
+; CHECK-NEXT:	vmovaps	(%eax), %ymm0
+; CHECK-NEXT:	vextractf128	$1, %ymm0, %xmm0
+; CHECK-NEXT:	vmovsldup	%xmm0, %xmm0    # xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT:	vmovddup	%xmm0, %xmm1    # xmm1 = xmm0[0,0]
+; CHECK-NEXT:	vinsertf128	$1, %xmm1, %ymm0, %ymm1
+; CHECK-NEXT:	vxorps	%xmm2, %xmm2, %xmm2
+; CHECK-NEXT:	vblendps	$128, %ymm1, %ymm2, %ymm1 # ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; CHECK-NEXT:	vxorps	%ymm2, %ymm2, %ymm2
+; CHECK-NEXT:	vblendps	$1, %ymm0, %ymm2, %ymm0 # ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
+; CHECK-NEXT:	vmovaps	%ymm0, (%eax)
+; CHECK-NEXT:	vmovaps	%ymm1, (%eax)
+; CHECK-NEXT:	vzeroupper
+; CHECK-NEXT:	retl
 entry:
   %0 = load <8 x i32>, <8 x i32> addrspace(1)* undef, align 32
   %1 = shufflevector <8 x i32> %0, <8 x i32> undef, <16 x i32> <i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %2 = shufflevector <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef>, <16 x i32> %1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 17>
   store <16 x i32> %2, <16 x i32> addrspace(1)* undef, align 64
   ret void
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index f33fc8c..a366102 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -6,7 +6,7 @@ entry:
 ; CHECK: pmovzxwd
   %A27 = load <4 x i16>, <4 x i16>* %in, align 4
   %A28 = add <4 x i16> %A27, %A27
-; CHECK: movlpd
+; CHECK: movq
   store <4 x i16> %A28, <4 x i16>* %in, align 4
   ret void
 ; CHECK: ret
@@ -18,7 +18,7 @@ define void @store_64(<2 x i32>* %ptr) {
 BB:
   store <2 x i32> zeroinitializer, <2 x i32>* %ptr
   ret void
-;CHECK: movlpd
+;CHECK: movlps
 ;CHECK: ret
 }
 
diff --git a/test/CodeGen/X86/2012-09-28-CGPBug.ll b/test/CodeGen/X86/2012-09-28-CGPBug.ll
index 57af20e..a8e0625 100644
--- a/test/CodeGen/X86/2012-09-28-CGPBug.ll
+++ b/test/CodeGen/X86/2012-09-28-CGPBug.ll
@@ -35,7 +35,7 @@ define void @h(i8*) nounwind ssp {
   indirectbr i8* %16, [label %17, label %18]
 
 ; <label>:17                                      ; preds = %11
-  tail call void (i8*, ...)* @g(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str40, i32 0, i32 0))
+  tail call void (i8*, ...) @g(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str40, i32 0, i32 0))
   br label %22
 
 ; <label>:18                                      ; preds = %11
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index a9b8cc6..d1c0266 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -1,27 +1,28 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mattr=+avx -mtriple=i686-unknown-unknown | FileCheck %s
 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
-target triple = "i686-pc-win32"
-
-;CHECK-LABEL: bad_cast:
 define void @bad_cast() {
-entry:
+; CHECK-LABEL: bad_cast:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovaps %xmm0, (%eax)
+; CHECK-NEXT:    movl $0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32> <i32 0, i32 1, i32 undef>
   %vecinit8.i = shufflevector <3 x i64> zeroinitializer, <3 x i64> %vext.i, <3 x i32> <i32 0, i32 3, i32 4>
   store <3 x i64> %vecinit8.i, <3 x i64>* undef, align 32
-;CHECK: ret
   ret void
 }
 
-
-;CHECK-LABEL: bad_insert:
 define void @bad_insert(i32 %t) {
-entry:
-;CHECK: vxorps %ymm1, %ymm1, %ymm1
-;CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; CHECK-LABEL: bad_insert:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovaps %ymm0, (%eax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %v2 = insertelement <8 x i32> zeroinitializer, i32 %t, i32 0
   store <8 x i32> %v2, <8 x i32> addrspace(1)* undef, align 32
-;CHECK: ret
   ret void
 }
 
diff --git a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
index df4f028..ed1daad 100644
--- a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
+++ b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: merge_stores_can
 ; CHECK: callq foo
-; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK: xorps %xmm0, %xmm0
 ; CHECK-NEXT: movups  %xmm0
 ; CHECK: callq foo
 ; CHECK: ret
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 84e77a8..d175fab 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -16,7 +16,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
 entry:
-  call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !MDExpression()), !dbg !MDLocation(scope: !14)
   %type = getelementptr inbounds %struct.node.0.27, %struct.node.0.27* %p, i64 0, i32 0
   %0 = load i16, i16* %type, align 2
   %cmp = icmp eq i16 %0, 1
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index b7124c9..08ade9c 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -43,14 +43,14 @@ if.then3344:
   br label %if.then4073
 
 if.then4073:                                      ; preds = %if.then3344
-  call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !MDExpression()), !dbg !MDLocation(scope: !5)
   %arraydecay4078 = getelementptr inbounds [20 x i8], [20 x i8]* %num14075, i64 0, i64 0
   %0 = load i32, i32* undef, align 4
   %add4093 = add nsw i32 %0, 0
   %conv4094 = sitofp i32 %add4093 to float
   %div4095 = fdiv float %conv4094, 5.670000e+02
   %conv4096 = fpext float %div4095 to double
-  %call4097 = call i32 (i8*, i32, i64, i8*, ...)* @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind
+  %call4097 = call i32 (i8*, i32, i64, i8*, ...) @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind
   br i1 %cmp1733, label %if.then4107, label %if.else4114
 
 if.then4107:                                      ; preds = %if.then4073
@@ -108,7 +108,7 @@ cond.true:                                        ; preds = %entry
   unreachable
 
 cond.end:                                         ; preds = %entry
-  call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !MDExpression()), !dbg !MDLocation(scope: !37)
   %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map", %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
   invoke void @_Znwm()
           to label %exit.i unwind label %lpad2.i.i.i.i
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 5bcff57..871c68f 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -20,7 +20,7 @@ if.then:                                          ; preds = %entry
   unreachable
 
 if.end:                                           ; preds = %entry
-  call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !MDExpression())
+  call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !MDExpression()), !dbg !MDLocation(scope: !2)
   %m = getelementptr inbounds %struct.btCompoundLeafCallback, %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
   store i32 0, i32* undef, align 8
   %cmp12447 = icmp sgt i32 undef, 0
diff --git a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
index 3d9dc57..120eba7 100644
--- a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
+++ b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
@@ -36,7 +36,7 @@ print_shadow_bytes.exit.i: ; preds = %print_shadow_bytes.exit.i, %0
   %reg16 = getelementptr inbounds [3 x i8], [3 x i8]* %.str..str1.i, i64 0, i64 0
   %reg17 = shl i64 %iv.i, 1
   %reg19 = inttoptr i64 %reg17 to i8*
-  call void (i64*, i8*, ...)* @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19)
+  call void (i64*, i8*, ...) @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19)
   %iv.next.i = add nsw i64 %iv.i, 0
   br label %print_shadow_bytes.exit.i
 }
diff --git a/test/CodeGen/X86/GC/dynamic-frame-size.ll b/test/CodeGen/X86/GC/dynamic-frame-size.ll
new file mode 100644
index 0000000..a3583d4
--- /dev/null
+++ b/test/CodeGen/X86/GC/dynamic-frame-size.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare void @use(<4 x i8*>*)
+
+; Test that a frame which requires dynamic relocation produces a stack map
+; with a size of UINT64_MAX.
+define void @test(i8* %ptr) gc "erlang" {
+   ; 32 byte alignment (for the alloca) is larger than the default
+   ; 16 byte alignment
+   %slot = alloca <4 x i8*>
+   call void @use(<4 x i8*>* %slot);
+   ret void
+}
+
+; CHECK: .note.gc
+; CHECK-NEXT: .align 8
+; safe point count
+; CHECK: .short	1
+; CHECK: .long	.Ltmp0
+; stack frame size (in words)
+; CHECK: .short	-1
+; stack arity (arguments on the stack)
+; CHECK: .short	0
+; live root count
+; CHECK: .short	0
+
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 265fec4..79db445 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -49,5 +49,5 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
 !18 = !{!1}
 !19 = !{!6, !7, !10}
 !20 = !MDFile(filename: "a.c", directory: "/private/tmp")
-!21 = !{i32 0}
+!21 = !{}
 !22 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index da4d58a..7ac08d1 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -17,7 +17,7 @@ entry:
 for.body:
   call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
   call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
-  call void @llvm.dbg.declare(metadata i8* %x.i, metadata !22, metadata !MDExpression()) nounwind
+  call void @llvm.dbg.declare(metadata i8* %x.i, metadata !22, metadata !MDExpression()) nounwind, !dbg !MDLocation(scope: !2)
   br label %for.body
 }
 
@@ -27,7 +27,7 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
-!0 = !MDCompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2)
+!0 = !MDCompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: 0, file: !1, enums: !{}, retainedTypes: !{})
 !1 = !MDFile(filename: "t.c", directory: "")
 !16 = !MDBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
 !2 = !MDSubprogram()
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index 9c24be4..44b587a 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -4,43 +4,26 @@
 define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: cmpl %ecx, %eax 
-; CHECK-NOT: addl
-; CHECK: adcl $0, %eax
-  %add4 = add i32 %x, %sum
-  %cmp = icmp ult i32 %add4, %x
-  %inc = zext i1 %cmp to i32
-  %z.0 = add i32 %add4, %inc
-  ret i32 %z.0
-}
-
-; Instcombine transforms test1 into test2:
-; CHECK-LABEL: test2:
 ; CHECK: movl
 ; CHECK-NEXT: addl
 ; CHECK-NEXT: adcl $0
 ; CHECK-NEXT: ret
-define i32 @test2(i32 %sum, i32 %x) nounwind readnone ssp {
-entry:
-  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %sum)
-  %0 = extractvalue { i32, i1 } %uadd, 0
-  %cmp = extractvalue { i32, i1 } %uadd, 1
+  %add4 = add i32 %x, %sum
+  %cmp = icmp ult i32 %add4, %x
   %inc = zext i1 %cmp to i32
-  %z.0 = add i32 %0, %inc
+  %z.0 = add i32 %add4, %inc
   ret i32 %z.0
 }
 
 ; <rdar://problem/12579915>
-define i32 @test3(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
+define i32 @test2(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
 entry:
   %cmp = icmp ugt i32 %x, %y
   %dec = sext i1 %cmp to i32
   %dec.res = add nsw i32 %dec, %res
   ret i32 %dec.res
-; CHECK-LABEL: test3:
+; CHECK-LABEL: test2:
 ; CHECK: cmpl
 ; CHECK: sbbl
 ; CHECK: ret
 }
-
-declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index 360f141..3f19a06 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -69,7 +69,7 @@ entry:
    %tmp0 = load i32, i32* @bar_i
    %tmp2 = call i32 @foo_f()
    %tmp3 = add i32 %tmp, %tmp2
-   %tmp4 = call %FunTy* @bar_f()
+   %tmp4 = call i32 @bar_f()
    %tmp5 = add i32 %tmp3, %tmp4
    %tmp6 = add i32 %tmp1, %tmp5
    %tmp7 = add i32 %tmp6, %tmp0
diff --git a/test/CodeGen/X86/and-or-fold.ll b/test/CodeGen/X86/and-or-fold.ll
index 836b5f1..ec39522 100644
--- a/test/CodeGen/X86/and-or-fold.ll
+++ b/test/CodeGen/X86/and-or-fold.ll
@@ -21,6 +21,6 @@ entry:
   %tmp1 = and i64 %x, 123127
   %tmp2 = or i64 %tmp1, 3
   ret i64 %tmp2
-; DARWIN-OPT:       andq $123124
+; DARWIN-OPT:       andl $123124
 ; DARWIN-OPT-NEXT:  leaq 3
 }
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index 640237d..d9e676a 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -17,3 +17,15 @@ define void @foo(i64 %zed, i64* %x) nounwind {
   store i64 %t2, i64* %x, align 8
   ret void
 }
+
+define i64 @bar(i64 %zed) nounwind {
+; CHECK:  andl     $42, %edi               # encoding: [0x83,0xe7,0x2a]
+  %t1 = and i64 %zed, 42
+  ret i64 %t1
+}
+
+define i64 @baz(i64 %zed) nounwind {
+; CHECK:  andl $2147483647, %edi      # encoding: [0x81,0xe7,0xff,0xff,0xff,0x7f]
+  %t1 = and i64 %zed, 2147483647
+  ret i64 %t1
+}
diff --git a/test/CodeGen/X86/anyregcc-crash.ll b/test/CodeGen/X86/anyregcc-crash.ll
index 3abe3d1..a7c104e 100644
--- a/test/CodeGen/X86/anyregcc-crash.ll
+++ b/test/CodeGen/X86/anyregcc-crash.ll
@@ -7,7 +7,7 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
                         i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
                         i64 %v13, i64 %v14, i64 %v15, i64 %v16) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
                 i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
                 i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
                 i64 %v13, i64 %v14, i64 %v15, i64 %v16)
diff --git a/test/CodeGen/X86/anyregcc.ll b/test/CodeGen/X86/anyregcc.ll
index 98ba17c..129aadf 100644
--- a/test/CodeGen/X86/anyregcc.ll
+++ b/test/CodeGen/X86/anyregcc.ll
@@ -60,7 +60,7 @@
 ; CHECK-NEXT:   .long 3
 define i64 @test() nounwind ssp uwtable {
 entry:
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
   ret i64 0
 }
 
@@ -82,7 +82,7 @@ entry:
 define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 15, i8* %f, i32 1, i8* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 15, i8* %f, i32 1, i8* %obj)
   ret i64 %ret
 }
 
@@ -105,7 +105,7 @@ define i64 @property_access2() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %f, i32 1, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %f, i32 1, i64* %obj)
   ret i64 %ret
 }
 
@@ -128,7 +128,7 @@ define i64 @property_access3() nounwind ssp uwtable {
 entry:
   %obj = alloca i64, align 8
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 15, i8* %f, i32 0, i64* %obj)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 15, i8* %f, i32 0, i64* %obj)
   ret i64 %ret
 }
 
@@ -210,7 +210,7 @@ entry:
 define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -292,7 +292,7 @@ entry:
 define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
 entry:
   %f = inttoptr i64 12297829382473034410 to i8*
-  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
   ret i64 %ret
 }
 
@@ -320,7 +320,7 @@ entry:
 ; CHECK-NEXT: .long  0
 define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
   ret i64 %result
 }
@@ -360,7 +360,7 @@ entry:
 define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
index 11b4e68..c6b1c39 100644
--- a/test/CodeGen/X86/atomic64.ll
+++ b/test/CodeGen/X86/atomic64.ll
@@ -48,7 +48,7 @@ define void @atomic_fetch_and64() nounwind {
 ; X64:       lock
 ; X64:       andq $3
   %t2 = atomicrmw and  i64* @sc64, i64 5 acquire
-; X64:       andq
+; X64:       andl
 ; X64:       lock
 ; X64:       cmpxchgq
   %t3 = atomicrmw and  i64* @sc64, i64 %t2 acquire
diff --git a/test/CodeGen/X86/avoid-loop-align.ll b/test/CodeGen/X86/avoid-loop-align.ll
index 5d00ed0..d82cf94 100644
--- a/test/CodeGen/X86/avoid-loop-align.ll
+++ b/test/CodeGen/X86/avoid-loop-align.ll
@@ -11,7 +11,7 @@
 
 define i8* @test(i8* %Q, i32* %L) nounwind {
 entry:
-	%tmp = tail call i32 (...)* @foo() nounwind		; <i32> [#uses=2]
+	%tmp = tail call i32 (...) @foo() nounwind		; <i32> [#uses=2]
 	%tmp1 = inttoptr i32 %tmp to i8*		; <i8*> [#uses=1]
 	br label %bb1
 
diff --git a/test/CodeGen/X86/avx-bitcast.ll b/test/CodeGen/X86/avx-bitcast.ll
index bb3e5a5..e34c20f 100644
--- a/test/CodeGen/X86/avx-bitcast.ll
+++ b/test/CodeGen/X86/avx-bitcast.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
 
-; CHECK: vmovsd (%
-; CHECK-NEXT: vmovq %xmm
 define i64 @bitcasti64tof64() {
+; CHECK-LABEL: bitcasti64tof64:
+; CHECK:       # BB#0:
+; CHECK:         vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovq %xmm0, %rax
+; CHECK-NEXT:    retq
   %a = load double, double* undef
   %b = bitcast double %a to i64
   ret i64 %b
diff --git a/test/CodeGen/X86/avx-cvt-2.ll b/test/CodeGen/X86/avx-cvt-2.ll
index 8cc7190..583c7d5 100644
--- a/test/CodeGen/X86/avx-cvt-2.ll
+++ b/test/CodeGen/X86/avx-cvt-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
 
 ; Check that we generate vector conversion from float to narrower int types
 
@@ -8,8 +8,16 @@
 
 define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
 ; CHECK-LABEL: fptoui16:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptoui %f32vec_t %a to %i16vec_t
   store %i16vec_t %b, %i16vec_t * %p
   ret void
@@ -17,8 +25,16 @@ define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
 
 define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
 ; CHECK-LABEL: fptosi16:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vmovdqa %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptosi %f32vec_t %a to %i16vec_t
   store %i16vec_t %b, %i16vec_t * %p
   ret void
@@ -26,8 +42,17 @@ define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
 
 define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
 ; CHECK-LABEL: fptoui8:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vmovq %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptoui %f32vec_t %a to %i8vec_t
   store %i8vec_t %b, %i8vec_t * %p
   ret void
@@ -35,8 +60,17 @@ define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
 
 define void @fptosi8(%f32vec_t %a, %i8vec_t *%p) {
 ; CHECK-LABEL: fptosi8:
-; CHECK: vcvttps2dq %ymm
-; CHECK-NOT: vcvttss2si
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    vmovq %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptosi %f32vec_t %a to %i8vec_t
   store %i8vec_t %b, %i8vec_t * %p
   ret void
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index 9f154ab..6df3e53 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -1,84 +1,122 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
 
-; CHECK: vcvtdq2ps %ymm
 define <8 x float> @sitofp00(<8 x i32> %a) nounwind {
+; CHECK-LABEL: sitofp00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %b = sitofp <8 x i32> %a to <8 x float>
   ret <8 x float> %b
 }
 
-; CHECK: vcvttps2dq %ymm
 define <8 x i32> @fptosi00(<8 x float> %a) nounwind {
+; CHECK-LABEL: fptosi00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %b = fptosi <8 x float> %a to <8 x i32>
   ret <8 x i32> %b
 }
 
-; CHECK: vcvtdq2pd %xmm
 define <4 x double> @sitofp01(<4 x i32> %a) {
+; CHECK-LABEL: sitofp01:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; CHECK-NEXT:    retq
   %b = sitofp <4 x i32> %a to <4 x double>
   ret <4 x double> %b
 }
 
-; CHECK: vcvtdq2ps %ymm
 define <8 x float> @sitofp02(<8 x i16> %a) {
+; CHECK-LABEL: sitofp02:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %b = sitofp <8 x i16> %a to <8 x float>
   ret <8 x float> %b
 }
 
-; CHECK: vcvttpd2dqy %ymm
 define <4 x i32> @fptosi01(<4 x double> %a) {
+; CHECK-LABEL: fptosi01:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2dqy %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %b = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
 
-; CHECK: vcvtpd2psy %ymm
-; CHECK-NEXT: vcvtpd2psy %ymm
-; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @fptrunc00(<8 x double> %b) nounwind {
+; CHECK-LABEL: fptrunc00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtpd2psy %ymm0, %xmm0
+; CHECK-NEXT:    vcvtpd2psy %ymm1, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %a = fptrunc <8 x double> %b to <8 x float>
   ret <8 x float> %a
 }
 
-; CHECK: vcvtps2pd %xmm
 define <4 x double> @fpext00(<4 x float> %b) nounwind {
+; CHECK-LABEL: fpext00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
+; CHECK-NEXT:    retq
   %a = fpext <4 x float> %b to <4 x double>
   ret <4 x double> %a
 }
 
-; CHECK: vcvtsi2sdq (%
 define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcA:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i64, i64* %e, align 8
   %conv = sitofp i64 %tmp1 to double
   ret double %conv
 }
 
-; CHECK: vcvtsi2sdl (%
 define double @funcB(i32* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcB:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i32, i32* %e, align 4
   %conv = sitofp i32 %tmp1 to double
   ret double %conv
 }
 
-; CHECK: vcvtsi2ssl (%
 define float @funcC(i32* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcC:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i32, i32* %e, align 4
   %conv = sitofp i32 %tmp1 to float
   ret float %conv
 }
 
-; CHECK: vcvtsi2ssq  (%
 define float @funcD(i64* nocapture %e) nounwind uwtable readonly ssp {
-entry:
+; CHECK-LABEL: funcD:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %tmp1 = load i64, i64* %e, align 8
   %conv = sitofp i64 %tmp1 to float
   ret float %conv
 }
 
-; CHECK: vcvtss2sd
 define void @fpext() nounwind uwtable {
-entry:
+; CHECK-LABEL: fpext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
   %f = alloca float, align 4
   %d = alloca double, align 8
   %tmp = load float, float* %f, align 4
@@ -88,16 +126,20 @@ entry:
 }
 
 define double @nearbyint_f64(double %a) {
-; CHECK-LABEL: nearbyint_f64
-; CHECK: vroundsd $12
+; CHECK-LABEL: nearbyint_f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %res = call double @llvm.nearbyint.f64(double %a)
   ret double %res
 }
 declare double @llvm.nearbyint.f64(double %p)
 
 define float @floor_f32(float %a) {
-; CHECK-LABEL: floor_f32
-; CHECK: vroundss $1
+; CHECK-LABEL: floor_f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vroundss $1, %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %res = call float @llvm.floor.f32(float %a)
   ret float %res
 }
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index a70d45a..83585b5 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -1,147 +1,224 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
 
 ;;; Shift left
-; CHECK: vpslld
-; CHECK: vpslld
-define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
+define <8 x i32> @vshift00(<8 x i32> %a) {
+; CHECK-LABEL: vshift00:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpslld $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsllw
-; CHECK: vpsllw
-define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
+define <16 x i16> @vshift01(<16 x i16> %a) {
+; CHECK-LABEL: vshift01:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsllw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsllq
-; CHECK: vpsllq
-define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
+define <4 x i64> @vshift02(<4 x i64> %a) {
+; CHECK-LABEL: vshift02:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsllq $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
   ret <4 x i64> %s
 }
 
 ;;; Logical Shift right
-; CHECK: vpsrld
-; CHECK: vpsrld
-define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
+define <8 x i32> @vshift03(<8 x i32> %a) {
+; CHECK-LABEL: vshift03:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrld $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrld $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsrlw
-; CHECK: vpsrlw
-define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
+define <16 x i16> @vshift04(<16 x i16> %a) {
+; CHECK-LABEL: vshift04:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsrlq
-; CHECK: vpsrlq
-define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
+define <4 x i64> @vshift05(<4 x i64> %a) {
+; CHECK-LABEL: vshift05:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrlq $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrlq $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
   ret <4 x i64> %s
 }
 
 ;;; Arithmetic Shift right
-; CHECK: vpsrad
-; CHECK: vpsrad
-define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
+define <8 x i32> @vshift06(<8 x i32> %a) {
+; CHECK-LABEL: vshift06:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrad $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsrad $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
 2>
   ret <8 x i32> %s
 }
 
-; CHECK: vpsraw
-; CHECK: vpsraw
-define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
+define <16 x i16> @vshift07(<16 x i16> %a) {
+; CHECK-LABEL: vshift07:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsraw $2, %xmm0, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsraw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   ret <16 x i16> %s
 }
 
-; CHECK: vpsrlw
-; CHECK: pand
-; CHECK: pxor
-; CHECK: psubb
-; CHECK: vpsrlw
-; CHECK: pand
-; CHECK: pxor
-; CHECK: psubb
-define <32 x i8> @vshift09(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift09(<32 x i8> %a) {
+; CHECK-LABEL: vshift09:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; CHECK-NEXT:    vpxor %xmm3, %xmm1, %xmm1
+; CHECK-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %s
 }
 
-; CHECK: pxor
-; CHECK: pcmpgtb
-; CHECK: pcmpgtb
-define <32 x i8> @vshift10(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift10(<32 x i8> %a) {
+; CHECK-LABEL: vshift10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = ashr <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
   ret <32 x i8> %s
 }
 
-; CHECK: vpsrlw
-; CHECK: pand
-; CHECK: vpsrlw
-; CHECK: pand
-define <32 x i8> @vshift11(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift11(<32 x i8> %a) {
+; CHECK-LABEL: vshift11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = lshr <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %s
 }
 
-; CHECK: vpsllw
-; CHECK: pand
-; CHECK: vpsllw
-; CHECK: pand
-define <32 x i8> @vshift12(<32 x i8> %a) nounwind readnone {
+define <32 x i8> @vshift12(<32 x i8> %a) {
+; CHECK-LABEL: vshift12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpsllw $2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpsllw $2, %xmm0, %xmm0
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %s = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
   ret <32 x i8> %s
 }
 
 ;;; Support variable shifts
-; CHECK: _vshift08
-; CHECK: vpslld $23
-; CHECK: vextractf128 $1
-; CHECK: vpslld $23
-; CHECK: ret
-define <8 x i32> @vshift08(<8 x i32> %a) nounwind {
+define <8 x i32> @vshift08(<8 x i32> %a) { ; per-lane shift of constant 1 by the variable amount %a
+; CHECK-LABEL: vshift08:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $23, %xmm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vcvttps2dq %xmm1, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpslld $23, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %bitop = shl <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %a
   ret <8 x i32> %bitop
 }
 
 ; PR15141
-; CHECK: _vshift13:
-; CHECK-NOT: vpsll
-; CHECK-NOT: vcvttps2dq
-; CHECK: vpmulld
 define <4 x i32> @vshift13(<4 x i32> %in) {
+; CHECK-LABEL: vshift13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %T
 }
 
 ;;; Uses shifts for sign extension
-; CHECK: _sext_v16i16
-; CHECK: vpsllw
-; CHECK: vpsraw
-; CHECK: vpsllw
-; CHECK: vpsraw
-; CHECK: vinsertf128
-define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind {
+define <16 x i16> @sext_v16i16(<16 x i16> %a) { ; trunc to i8 then sext back to i16 in every lane
+; CHECK-LABEL: sext_v16i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm1
+; CHECK-NEXT:    vpsraw $8, %xmm1, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
+; CHECK-NEXT:    vpsraw $8, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %b = trunc <16 x i16> %a to <16 x i8>
   %c = sext <16 x i8> %b to <16 x i16>
   ret <16 x i16> %c
 }
 
-; CHECK: _sext_v8i32
-; CHECK: vpslld
-; CHECK: vpsrad
-; CHECK: vpslld
-; CHECK: vpsrad
-; CHECK: vinsertf128
-define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
+define <8 x i32> @sext_v8i32(<8 x i32> %a) { ; trunc to i16 then sext back to i32 in every lane
+; CHECK-LABEL: sext_v8i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $16, %xmm0, %xmm1
+; CHECK-NEXT:    vpsrad $16, %xmm1, %xmm1
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpslld $16, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrad $16, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %b = trunc <8 x i32> %a to <8 x i16>
   %c = sext <8 x i16> %b to <8 x i32>
   ret <8 x i32> %c
diff --git a/test/CodeGen/X86/avx-varargs-x86_64.ll b/test/CodeGen/X86/avx-varargs-x86_64.ll
index f550733..7ce5e19 100644
--- a/test/CodeGen/X86/avx-varargs-x86_64.ll
+++ b/test/CodeGen/X86/avx-varargs-x86_64.ll
@@ -10,6 +10,6 @@ declare i32 @f(i32, ...)
 define void @test1() nounwind uwtable ssp {
 entry:
   %0 = load <8 x float>, <8 x float>* @x, align 32
-  %call = call i32 (i32, ...)* @f(i32 1, <8 x float> %0)
+  %call = call i32 (i32, ...) @f(i32 1, <8 x float> %0)
   ret void
 }
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index 9b82c88..9814a61 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,50 +1,8 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
 
-define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfmadd_ps_z
-  ; CHECK: vfmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_mask_vfmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfmadd_ps
-  ; CHECK: vfmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
-  ret <16 x float> %res
-}
-
-define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfmadd_pd_z
-  ; CHECK: vfmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_mask_fmadd_pd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
-; CHECK-LABEL: test_mask_fmadd_pd:
-; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
-  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
-  ret <8 x double> %res
-}
-
+declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
-
-define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfmsubps_z
-  ; CHECK: vfmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_mask_vfmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfmsub_ps
-  ; CHECK: vfmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
-  ret <16 x float> %res
-}
+declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubpd_z
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 46581f7..07d984a 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -515,14 +515,6 @@ define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
 }
 declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
-define <8 x i64> @test_vpmuludq(<16 x i32> %a0, <16 x i32> %a1) {
-  ; CHECK: vpmuludq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a0, <16 x i32> %a1,
-                    <8 x i64>zeroinitializer, i8 -1)
-  ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
-
 define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
   ; CHECK: vptestmq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1]
   %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
@@ -1606,3 +1598,568 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8
                     <8 x double> zeroinitializer, i8 %mask, i32 3)
   ret <8 x double> %res
 }
+
+define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) { ; pxor.d intrinsic on two register arguments, no masking
+  ;CHECK-LABEL: test_xor_epi32
+  ;CHECK: vpxord {{.*}}encoding: [0x62,0xf1,0x7d,0x48,0xef,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ; zero pass-through with an all-ones mask
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) { ; pxor.d intrinsic with caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_xor_epi32
+  ;CHECK: vpxord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xef,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) { ; por.d intrinsic on two register arguments, no masking
+  ;CHECK-LABEL: test_or_epi32
+  ;CHECK: vpord {{.*}}encoding: [0x62,0xf1,0x7d,0x48,0xeb,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ; zero pass-through with an all-ones mask
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) { ; por.d intrinsic with caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_or_epi32
+  ;CHECK: vpord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xeb,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) { ; pand.d intrinsic on two register arguments, no masking
+  ;CHECK-LABEL: test_and_epi32
+  ;CHECK: vpandd {{.*}}encoding: [0x62,0xf1,0x7d,0x48,0xdb,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) ; zero pass-through with an all-ones mask
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) { ; pand.d intrinsic with caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_and_epi32
+  ;CHECK: vpandd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xdb,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) { ; pxor.q intrinsic on two register arguments, no masking
+  ;CHECK-LABEL: test_xor_epi64
+  ;CHECK: vpxorq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) { ; pxor.q intrinsic with caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_xor_epi64
+  ;CHECK: vpxorq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xef,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) { ; por.q intrinsic on two register arguments, no masking
+  ;CHECK-LABEL: test_or_epi64
+  ;CHECK: vporq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) { ; por.q intrinsic with caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_or_epi64
+  ;CHECK: vporq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xeb,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) { ; pand.q intrinsic on two register arguments, no masking
+  ;CHECK-LABEL: test_and_epi64
+  ;CHECK: vpandq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) { ; pand.q intrinsic with caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_and_epi64
+  ;CHECK: vpandq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xdb,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+
+define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) { ; padd.d intrinsic, both operands in registers, no masking
+  ;CHECK-LABEL: test_mask_add_epi32_rr
+  ;CHECK: vpaddd %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) ; zero pass-through with an all-ones mask
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) { ; padd.d intrinsic, register operands, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_add_epi32_rrk
+  ;CHECK: vpaddd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; padd.d intrinsic, register operands, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_add_epi32_rrkz
+  ;CHECK: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) { ; padd.d intrinsic, second operand is a full vector read through %ptr_b, no masking
+  ;CHECK-LABEL: test_mask_add_epi32_rm
+  ;CHECK: vpaddd (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) ; zero pass-through with an all-ones mask
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) { ; padd.d intrinsic, memory second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_add_epi32_rmk
+  ;CHECK: vpaddd (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm writes %zmm1 under {%k1}, without {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) { ; padd.d intrinsic, memory second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_add_epi32_rmkz
+  ;CHECK: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) { ; padd.d intrinsic, second operand is one i32 from %ptr_b splatted to all lanes, no masking
+  ;CHECK-LABEL: test_mask_add_epi32_rmb
+  ;CHECK: vpaddd (%rdi){1to16}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0x7d,0x58,0xfe,0x07]
+  %q = load i32, i32* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) ; expected asm takes the splat as a (%rdi){1to16} operand
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) { ; padd.d intrinsic, splatted i32 second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_add_epi32_rmbk
+  ;CHECK: vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfe,0x0f]
+  %q = load i32, i32* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm: {1to16} operand, %zmm1 written under {%k1}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) { ; padd.d intrinsic, splatted i32 second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_add_epi32_rmbkz
+  ;CHECK: vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfe,0x07]
+  %q = load i32, i32* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) ; expected asm: {1to16} operand with {%k1} {z}
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) { ; psub.d intrinsic, both operands in registers, no masking
+  ;CHECK-LABEL: test_mask_sub_epi32_rr
+  ;CHECK: vpsubd %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xfa,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) ; zero pass-through with an all-ones mask
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) { ; psub.d intrinsic, register operands, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_sub_epi32_rrk
+  ;CHECK: vpsubd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0xd1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; psub.d intrinsic, register operands, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_sub_epi32_rrkz
+  ;CHECK: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0xc1]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) { ; psub.d intrinsic, second operand is a full vector read through %ptr_b, no masking
+  ;CHECK-LABEL: test_mask_sub_epi32_rm
+  ;CHECK: vpsubd (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xfa,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) ; zero pass-through with an all-ones mask
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) { ; psub.d intrinsic, memory second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_sub_epi32_rmk
+  ;CHECK: vpsubd (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm writes %zmm1 under {%k1}, without {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) { ; psub.d intrinsic, memory second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_sub_epi32_rmkz
+  ;CHECK: vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) { ; psub.d intrinsic, second operand is one i32 from %ptr_b splatted to all lanes, no masking
+  ;CHECK-LABEL: test_mask_sub_epi32_rmb
+  ;CHECK: vpsubd (%rdi){1to16}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0x7d,0x58,0xfa,0x07]
+  %q = load i32, i32* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) ; expected asm takes the splat as a (%rdi){1to16} operand
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) { ; psub.d intrinsic, splatted i32 second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_sub_epi32_rmbk
+  ;CHECK: vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfa,0x0f]
+  %q = load i32, i32* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ; expected asm: {1to16} operand, %zmm1 written under {%k1}
+  ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) { ; psub.d intrinsic, splatted i32 second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_sub_epi32_rmbkz
+  ;CHECK: vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfa,0x07]
+  %q = load i32, i32* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) ; expected asm: {1to16} operand with {%k1} {z}
+  ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) { ; padd.q intrinsic, both operands in registers, no masking
+  ;CHECK-LABEL: test_mask_add_epi64_rr
+  ;CHECK: vpaddq %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) { ; padd.q intrinsic, register operands, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_add_epi64_rrk
+  ;CHECK: vpaddq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; padd.q intrinsic, register operands, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_add_epi64_rrkz
+  ;CHECK: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) { ; padd.q intrinsic, second operand is a full vector read through %ptr_b, no masking
+  ;CHECK-LABEL: test_mask_add_epi64_rm
+  ;CHECK: vpaddq (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; padd.q intrinsic, memory second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_add_epi64_rmk
+  ;CHECK: vpaddq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0x0f]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm1 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) { ; padd.q intrinsic, memory second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_add_epi64_rmkz
+  ;CHECK: vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; padd.q intrinsic, second operand is one i64 from %ptr_b splatted to all lanes, no masking
+  ;CHECK-LABEL: test_mask_add_epi64_rmb
+  ;CHECK: vpaddq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x58,0xd4,0x07]
+  %q = load i64, i64* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) ; expected asm takes the splat as a (%rdi){1to8} operand
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; padd.q intrinsic, splatted i64 second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_add_epi64_rmbk
+  ;CHECK: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xd4,0x0f]
+  %q = load i64, i64* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm: {1to8} operand, %zmm1 written under {%k1}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) { ; padd.q intrinsic, splatted i64 second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_add_epi64_rmbkz
+  ;CHECK: vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xd4,0x07]
+  %q = load i64, i64* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) ; expected asm: {1to8} operand with {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) { ; psub.q intrinsic, both operands in registers, no masking
+  ;CHECK-LABEL: test_mask_sub_epi64_rr
+  ;CHECK: vpsubq %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) { ; psub.q intrinsic, register operands, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_sub_epi64_rrk
+  ;CHECK: vpsubq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; psub.q intrinsic, register operands, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_sub_epi64_rrkz
+  ;CHECK: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) { ; psub.q intrinsic, second operand is a full vector read through %ptr_b, no masking
+  ;CHECK-LABEL: test_mask_sub_epi64_rm
+  ;CHECK: vpsubq (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; psub.q intrinsic, memory second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_sub_epi64_rmk
+  ;CHECK: vpsubq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0x0f]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm1 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) { ; psub.q intrinsic, memory second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_sub_epi64_rmkz
+  ;CHECK: vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0x07]
+  %b = load <8 x i64>, <8 x i64>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; psub.q intrinsic, second operand is one i64 from %ptr_b splatted to all lanes, no masking
+  ;CHECK-LABEL: test_mask_sub_epi64_rmb
+  ;CHECK: vpsubq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x58,0xfb,0x07]
+  %q = load i64, i64* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) ; expected asm takes the splat as a (%rdi){1to8} operand
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; psub.q intrinsic, splatted i64 second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_sub_epi64_rmbk
+  ;CHECK: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xfb,0x0f]
+  %q = load i64, i64* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ; expected asm: {1to8} operand, %zmm1 written under {%k1}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) { ; psub.q intrinsic, splatted i64 second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_sub_epi64_rmbkz
+  ;CHECK: vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xfb,0x07]
+  %q = load i64, i64* %ptr_b ; the scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) ; expected asm: {1to8} operand with {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) { ; pmul.dq intrinsic: 16 x i32 inputs, 8 x i64 result; register operands, no masking
+  ;CHECK-LABEL: test_mask_mul_epi32_rr
+  ;CHECK: vpmuldq %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) { ; pmul.dq intrinsic, register operands, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_mul_epi32_rrk
+  ;CHECK: vpmuldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) { ; pmul.dq intrinsic, register operands, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_mul_epi32_rrkz
+  ;CHECK: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) { ; pmul.dq intrinsic, second operand is a full vector read through %ptr_b, no masking
+  ;CHECK-LABEL: test_mask_mul_epi32_rm
+  ;CHECK: vpmuldq (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; pmul.dq intrinsic, memory second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_mul_epi32_rmk
+  ;CHECK: vpmuldq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm1 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) { ; pmul.dq intrinsic, memory second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_mul_epi32_rmkz
+  ;CHECK: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) { ; pmul.dq intrinsic, second operand is one i64 from %ptr_b splatted and reinterpreted as 16 x i32, no masking
+  ;CHECK-LABEL: test_mask_mul_epi32_rmb
+  ;CHECK: vpmuldq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf2,0xfd,0x58,0x28,0x07]
+  %q = load i64, i64* %ptr_b ; the 64-bit scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %b = bitcast <8 x i64> %b64 to <16 x i32> ; same bits viewed as the 16 x i32 operand type the intrinsic takes
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) ; expected asm takes the splat as a (%rdi){1to8} operand
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; pmul.dq intrinsic, splatted i64 second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_mul_epi32_rmbk
+  ;CHECK: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x28,0x0f]
+  %q = load i64, i64* %ptr_b ; the 64-bit scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %b = bitcast <8 x i64> %b64 to <16 x i32> ; same bits viewed as the 16 x i32 operand type the intrinsic takes
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ; expected asm: {1to8} operand, %zmm1 written under {%k1}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) { ; pmul.dq intrinsic, splatted i64 second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_mul_epi32_rmbkz
+  ;CHECK: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x28,0x07]
+  %q = load i64, i64* %ptr_b ; the 64-bit scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %b = bitcast <8 x i64> %b64 to <16 x i32> ; same bits viewed as the 16 x i32 operand type the intrinsic takes
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) ; expected asm: {1to8} operand with {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) { ; pmulu.dq intrinsic: 16 x i32 inputs, 8 x i64 result; register operands, no masking
+  ;CHECK-LABEL: test_mask_mul_epu32_rr
+  ;CHECK: vpmuludq %zmm1, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) { ; pmulu.dq intrinsic, register operands, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_mul_epu32_rrk
+  ;CHECK: vpmuludq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xd1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm2 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) { ; pmulu.dq intrinsic, register operands, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_mul_epu32_rrkz
+  ;CHECK: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) { ; pmulu.dq intrinsic, second operand is a full vector read through %ptr_b, no masking
+  ;CHECK-LABEL: test_mask_mul_epu32_rm
+  ;CHECK: vpmuludq (%rdi), %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) ; zero pass-through with an all-ones mask
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; pmulu.dq intrinsic, memory second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_mul_epu32_rmk
+  ;CHECK: vpmuludq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x0f]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ; expected asm writes %zmm1 under {%k1}, without {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) { ; pmulu.dq intrinsic, memory second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_mul_epu32_rmkz
+  ;CHECK: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x07]
+  %b = load <16 x i32>, <16 x i32>* %ptr_b ; expected asm uses (%rdi) directly as the memory operand
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) ; zeroinitializer pass-through: expected asm carries {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) { ; pmulu.dq intrinsic, second operand is one i64 from %ptr_b splatted and reinterpreted as 16 x i32, no masking
+  ;CHECK-LABEL: test_mask_mul_epu32_rmb
+  ;CHECK: vpmuludq (%rdi){1to8}, %zmm0, %zmm0  ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x07]
+  %q = load i64, i64* %ptr_b ; the 64-bit scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %b = bitcast <8 x i64> %b64 to <16 x i32> ; same bits viewed as the 16 x i32 operand type the intrinsic takes
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) ; expected asm takes the splat as a (%rdi){1to8} operand
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) { ; pmulu.dq intrinsic, splatted i64 second operand, caller-supplied mask and pass-through
+  ;CHECK-LABEL: test_mask_mul_epu32_rmbk
+  ;CHECK: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x0f]
+  %q = load i64, i64* %ptr_b ; the 64-bit scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %b = bitcast <8 x i64> %b64 to <16 x i32> ; same bits viewed as the 16 x i32 operand type the intrinsic takes
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ; expected asm: {1to8} operand, %zmm1 written under {%k1}
+  ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) { ; pmulu.dq intrinsic, splatted i64 second operand, caller-supplied mask over a zero pass-through
+  ;CHECK-LABEL: test_mask_mul_epu32_rmbkz
+  ;CHECK: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x07]
+  %q = load i64, i64* %ptr_b ; the 64-bit scalar that gets replicated
+  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 ; place the scalar in lane 0
+  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask copies lane 0 into every lane
+  %b = bitcast <8 x i64> %b64 to <16 x i32> ; same bits viewed as the 16 x i32 operand type the intrinsic takes
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) ; expected asm: {1to8} operand with {%k1} {z}
+  ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index f1ef9ef..8b13e96 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -260,7 +260,7 @@ entry:
   %and = and i64 %x, 2147483647
   ret i64 %and
 ; CHECK-LABEL: bzhi64_small_constant_mask:
-; CHECK: andq  $2147483647, %r[[ARG1]]
+; CHECK: andl  $2147483647, %e[[ARG1]]
 }
 
 define i32 @blsi32(i32 %x) nounwind readnone {
diff --git a/test/CodeGen/X86/bool-zext.ll b/test/CodeGen/X86/bool-zext.ll
index 3558376..c98ad9e 100644
--- a/test/CodeGen/X86/bool-zext.ll
+++ b/test/CodeGen/X86/bool-zext.ll
@@ -10,7 +10,7 @@
 define void @bar1(i1 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i1 %v1 to i32
-  %call = tail call i32 (...)* @foo1(i32 %conv) nounwind
+  %call = tail call i32 (...) @foo1(i32 %conv) nounwind
   ret void
 }
 
@@ -23,7 +23,7 @@ entry:
 define void @bar2(i8 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i8 %v1 to i32
-  %call = tail call i32 (...)* @foo1(i32 %conv) nounwind
+  %call = tail call i32 (...) @foo1(i32 %conv) nounwind
   ret void
 }
 
diff --git a/test/CodeGen/X86/brcond.ll b/test/CodeGen/X86/brcond.ll
index 3ebe1a1..f4db3ba 100644
--- a/test/CodeGen/X86/brcond.ll
+++ b/test/CodeGen/X86/brcond.ll
@@ -17,11 +17,11 @@ entry:
   br i1 %4, label %bb1, label %bb
 
 bb:                                               ; preds = %entry
-  %5 = tail call i32 (...)* @foo() nounwind       ; <i32> [#uses=1]
+  %5 = tail call i32 (...) @foo() nounwind       ; <i32> [#uses=1]
   ret i32 %5
 
 bb1:                                              ; preds = %entry
-  %6 = tail call i32 (...)* @bar() nounwind       ; <i32> [#uses=1]
+  %6 = tail call i32 (...) @bar() nounwind       ; <i32> [#uses=1]
   ret i32 %6
 }
 
diff --git a/test/CodeGen/X86/byval-align.ll b/test/CodeGen/X86/byval-align.ll
index ac0ab75..8366ae3 100644
--- a/test/CodeGen/X86/byval-align.ll
+++ b/test/CodeGen/X86/byval-align.ll
@@ -18,7 +18,7 @@ entry:
   %1 = ptrtoint i8* %0 to i64                     ; <i64> [#uses=1]
   store i64 %1, i64* %p, align 8
   %2 = load i8*, i8** %ptr, align 8                    ; <i8*> [#uses=1]
-  %3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i64 0, i64 0), i8* %2) nounwind ; <i32> [#uses=0]
+  %3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i64 0, i64 0), i8* %2) nounwind ; <i32> [#uses=0]
   %4 = load i64, i64* %p, align 8                      ; <i64> [#uses=1]
   %5 = and i64 %4, 140737488355264                ; <i64> [#uses=1]
   %6 = load i64, i64* %p, align 8                      ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/byval6.ll b/test/CodeGen/X86/byval6.ll
index 2d39901..c3e7b7e 100644
--- a/test/CodeGen/X86/byval6.ll
+++ b/test/CodeGen/X86/byval6.ll
@@ -6,8 +6,8 @@
 
 define i32 @main() nounwind  {
 entry:
-	tail call void (i32, ...)* @bar( i32 3, %struct.W* byval  @.cpx ) nounwind 
-	tail call void (i32, ...)* @baz( i32 3, %struct.W* byval  @B ) nounwind 
+	tail call void (i32, ...) @bar( i32 3, %struct.W* byval  @.cpx ) nounwind 
+	tail call void (i32, ...) @baz( i32 3, %struct.W* byval  @B ) nounwind 
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/cache-intrinsic.ll b/test/CodeGen/X86/cache-intrinsic.ll
index c023047..0b9d77a 100644
--- a/test/CodeGen/X86/cache-intrinsic.ll
+++ b/test/CodeGen/X86/cache-intrinsic.ll
@@ -10,10 +10,10 @@ define i32 @main() {
 entry:
   %retval = alloca i32, align 4
   store i32 0, i32* %retval
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str1, i32 0, i32 0)) #3
   call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8], [32 x i8]* @buffer, i32 0, i32 0))
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index a885183..f2f36b1 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -108,7 +108,7 @@ func_1.exit:                                      ; preds = %bb.i.i, %func_4.exi
   %g_96.tmp.0.i = phi i8 [ %g_96.promoted.i, %bb.i.i ], [ %.mux.i, %func_4.exit.i ] ; <i8> [#uses=2]
   store i8 %g_96.tmp.0.i, i8* @g_96
   %6 = zext i8 %g_96.tmp.0.i to i32               ; <i32> [#uses=1]
-  %7 = tail call i32 (i8*, ...)* @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ; <i32> [#uses=0]
+  %7 = tail call i32 (i8*, ...) @printf(i8* noalias getelementptr ([15 x i8], [15 x i8]* @_2E_str, i64 0, i64 0), i32 %6) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 818138a..584179a 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -75,7 +75,7 @@ define i32 @test5(double %A) nounwind  {
  br i1 %bothcond, label %bb8, label %bb12
 
  bb8:; preds = %entry
- %tmp9 = tail call i32 (...)* @foo( ) nounwind ; <i32> [#uses=1]
+ %tmp9 = tail call i32 (...) @foo( ) nounwind ; <i32> [#uses=1]
  ret i32 %tmp9
 
  bb12:; preds = %entry
diff --git a/test/CodeGen/X86/coalescer-remat.ll b/test/CodeGen/X86/coalescer-remat.ll
index 13fb46b..62e0562 100644
--- a/test/CodeGen/X86/coalescer-remat.ll
+++ b/test/CodeGen/X86/coalescer-remat.ll
@@ -7,7 +7,7 @@ define i32 @main() nounwind {
 entry:
   %t0 = cmpxchg i64* @val, i64 0, i64 1 monotonic monotonic
   %0 = extractvalue { i64, i1 } %t0, 0
-  %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i64 0), i64 %0) nounwind
+  %1 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([7 x i8], [7 x i8]* @"\01LC", i32 0, i64 0), i64 %0) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 5fe5dc5..a95b84d 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -439,7 +439,7 @@ entry:
   %conv = uitofp i64 %sub to float
   %div = fmul float %conv, 5.000000e-01
   %conv2 = fpext float %div to double
-  tail call void (...)* @_Z6PrintFz(i8* getelementptr inbounds ({ [1 x i8], [63 x i8] }, { [1 x i8], [63 x i8] }* @.str, i64 0, i32 0, i64 0), double %conv2)
+  tail call void (...) @_Z6PrintFz(i8* getelementptr inbounds ({ [1 x i8], [63 x i8] }, { [1 x i8], [63 x i8] }* @.str, i64 0, i32 0, i64 0), double %conv2)
   ret void
 }
 declare void @_Z6PrintFz(...)
@@ -462,7 +462,7 @@ for.cond:                                         ; preds = %for.inc, %entry
   %cmp = icmp eq i32* undef, %3
   %conv2 = zext i1 %cmp to i32
   %and = and i32 %conv2, %0
-  tail call void (...)* @fn3(i32 %and) nounwind
+  tail call void (...) @fn3(i32 %and) nounwind
   %tobool = icmp eq i32 undef, 0
   br i1 %tobool, label %for.inc, label %if.then
 
diff --git a/test/CodeGen/X86/dag-optnone.ll b/test/CodeGen/X86/dag-optnone.ll
new file mode 100644
index 0000000..f7774e6
--- /dev/null
+++ b/test/CodeGen/X86/dag-optnone.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -O0 -mattr=+avx | FileCheck %s
+
+; Background:
+; If fast-isel bails out to normal selection, then the DAG combiner will run,
+; even at -O0. In principle this should not happen (those are optimizations,
+; and we said -O0) but as a practical matter there are some instruction
+; selection patterns that depend on the legalizations and transforms that the
+; DAG combiner does.
+;
+; The 'optnone' attribute implicitly sets -O0 and fast-isel for the function.
+; The DAG combiner was disabled for 'optnone' (but not -O0) by r221168, then
+; re-enabled in r233153 because of problems with instruction selection patterns
+; mentioned above. (Note: because 'optnone' is supposed to match -O0, r221168
+; really should have disabled the combiner for both.)
+;
+; If instruction selection eventually becomes smart enough to run without the DAG
+; combiner, then the combiner can be turned off for -O0 (not just 'optnone')
+; and this test can go away. (To be replaced by a different test that verifies
+; the DAG combiner does *not* run at -O0 or for 'optnone' functions.)
+;
+; In the meantime, this test wants to make sure the combiner stays enabled for
+; 'optnone' functions, just as it is for -O0.
+
+
+; The test cases @foo[WithOptnone] prove that the same DAG combine happens
+; with -O0 and with 'optnone' set.  To prove this, we use a Windows triple to
+; cause fast-isel to bail out (because something about the calling convention
+; is not handled in fast-isel).  Then we have a repeated fadd that can be
+; combined into an fmul.  We show that this happens in both the non-optnone
+; function and the optnone function.
+
+define float @foo(float %x) #0 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @foo
+; CHECK-NOT:   add
+; CHECK:       mul
+; CHECK-NEXT:  ret
+
+define float @fooWithOptnone(float %x) #1 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @fooWithOptnone
+; CHECK-NOT:   add
+; CHECK:       mul
+; CHECK-NEXT:  ret
+
+
+; The test case @bar is derived from an instruction selection failure case
+; that was solved by r233153. It depends on -mattr=+avx.
+; Really all we're trying to prove is that it doesn't crash any more.
+
+@id84 = common global <16 x i32> zeroinitializer, align 64
+
+define void @bar() #1 {
+entry:
+  %id83 = alloca <16 x i8>, align 16
+  %0 = load <16 x i32>, <16 x i32>* @id84, align 64
+  %conv = trunc <16 x i32> %0 to <16 x i8>
+  store <16 x i8> %conv, <16 x i8>* %id83, align 16
+  ret void
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/dagcombine-and-setcc.ll b/test/CodeGen/X86/dagcombine-and-setcc.ll
index bb2bfbe..57adc8b 100644
--- a/test/CodeGen/X86/dagcombine-and-setcc.ll
+++ b/test/CodeGen/X86/dagcombine-and-setcc.ll
@@ -39,7 +39,7 @@ ret2:
 define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
   %res = alloca i32, align 4
   %t = call i32 @foo(i32 1, i32 2, i32* %res) #3
-  %v = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %t)
+  %v = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %t)
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index fe502bb..c5085a2 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -169,36 +169,36 @@ attributes #2 = { nounwind readnone }
 !53 = distinct !MDLexicalBlock(line: 14, column: 0, file: !1, scope: !51)
 !54 = !MDLocation(line: 16, scope: !53)
 !55 = !MDLocation(line: 17, scope: !24)
-!56 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38, inlinedAt: !55)
+!56 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
 !57 = !MDLocation(line: 0, scope: !40, inlinedAt: !55)
 !58 = !{i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)}
-!59 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15, inlinedAt: !55)
+!59 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
 !60 = !MDLocation(line: 5, scope: !40, inlinedAt: !55)
 !61 = !MDLocation(line: 5, scope: !62, inlinedAt: !55)
 !62 = distinct !MDLexicalBlock(line: 5, column: 0, file: !1, scope: !40)
 !63 = !MDLocation(line: 18, scope: !24)
-!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38, inlinedAt: !63)
+!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
 !65 = !MDLocation(line: 0, scope: !40, inlinedAt: !63)
-!66 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15, inlinedAt: !63)
+!66 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
 !67 = !MDLocation(line: 5, scope: !40, inlinedAt: !63)
 !68 = !MDLocation(line: 5, scope: !62, inlinedAt: !63)
 !69 = !MDLocation(line: 20, scope: !70)
 !70 = distinct !MDLexicalBlock(line: 20, column: 0, file: !1, scope: !24)
-!71 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38, inlinedAt: !72)
+!71 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
 !72 = !MDLocation(line: 21, scope: !70)
 !73 = !MDLocation(line: 0, scope: !35, inlinedAt: !72)
 !74 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i64 0, i64 0)}
-!75 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15, inlinedAt: !72)
+!75 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
 !76 = !MDLocation(line: 6, scope: !35, inlinedAt: !72)
-!77 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38, inlinedAt: !78)
+!77 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
 !78 = !MDLocation(line: 23, scope: !70)
 !79 = !MDLocation(line: 0, scope: !35, inlinedAt: !78)
 !80 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0)}
-!81 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15, inlinedAt: !78)
+!81 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
 !82 = !MDLocation(line: 6, scope: !35, inlinedAt: !78)
-!83 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38, inlinedAt: !84)
+!83 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
 !84 = !MDLocation(line: 24, scope: !24)
 !85 = !MDLocation(line: 0, scope: !35, inlinedAt: !84)
-!86 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15, inlinedAt: !84)
+!86 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
 !87 = !MDLocation(line: 6, scope: !35, inlinedAt: !84)
 !88 = !MDLocation(line: 25, scope: !24)
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
index 6cdfdc2..8f95338 100644
--- a/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -44,7 +44,7 @@
 define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
 entry:
   %0 = load %struct.Foo*, %struct.Foo** @pfoo, align 8
-  tail call void @llvm.dbg.value(metadata %struct.Foo* %0, i64 0, metadata !62, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.Foo* %0, i64 0, metadata !62, metadata !MDExpression()), !dbg !MDLocation(scope: !MDSubprogram())
   %cmp.i = icmp eq %struct.Foo* %0, %this
   ret i1 %cmp.i
 }
@@ -53,7 +53,7 @@ entry:
 define void @_Z3bazv() #1 {
 entry:
   %0 = load %struct.Wibble*, %struct.Wibble** @wibble1, align 8
-  tail call void @llvm.dbg.value(metadata %struct.Flibble* undef, i64 0, metadata !65, metadata !MDExpression())
+  tail call void @llvm.dbg.value(metadata %struct.Flibble* undef, i64 0, metadata !65, metadata !MDExpression()), !dbg !MDLocation(scope: !MDSubprogram())
   %1 = load %struct.Wibble*, %struct.Wibble** @wibble2, align 8
   %cmp.i = icmp ugt %struct.Wibble* %1, %0
   br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
@@ -78,6 +78,6 @@ attributes #2 = { nounwind readnone }
 
 !17 = !MDDerivedType(tag: DW_TAG_reference_type, baseType: null)
 !45 = !MDDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: null)
-!62 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "arg", line: 4, arg: 2, scope: null, type: !17)
+!62 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "arg", line: 4, arg: 2, scope: !MDSubprogram(), type: !17)
 !64 = !{%struct.Flibble* undef}
-!65 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", line: 13, arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: null, type: !45)
+!65 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", line: 13, arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !MDSubprogram(), type: !45)
diff --git a/test/CodeGen/X86/discontiguous-loops.ll b/test/CodeGen/X86/discontiguous-loops.ll
index fa7692b..20db750 100644
--- a/test/CodeGen/X86/discontiguous-loops.ll
+++ b/test/CodeGen/X86/discontiguous-loops.ll
@@ -40,7 +40,7 @@ ybb8:                                              ; preds = %ybb1
 
 bb10:                                             ; preds = %ybb8
   %tmp11 = load i8*, i8** undef, align 8               ; <i8*> [#uses=1]
-  call void (i8*, ...)* @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* %tmp11) nounwind
+  call void (i8*, ...) @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* %tmp11) nounwind
   unreachable
 
 ybb12:                                             ; preds = %ybb8
@@ -51,7 +51,7 @@ ybb13:                                             ; preds = %ybb12
   br i1 %tmp14, label %bb16, label %ybb1
 
 bb15:                                             ; preds = %ybb12
-  call void (i8*, ...)* @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* undef) nounwind
+  call void (i8*, ...) @fatal(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str96, i64 0, i64 0), i8* undef) nounwind
   unreachable
 
 bb16:                                             ; preds = %ybb13
diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
index af15a86..7ee6b43 100644
--- a/test/CodeGen/X86/dllimport-x86_64.ll
+++ b/test/CodeGen/X86/dllimport-x86_64.ll
@@ -36,13 +36,13 @@ define void @use() nounwind {
 ; OPT-NOT: call void @inline1()
 ; OPT-NOT: call void @inline2()
 ; OPT-NOT: load i32, i32* @Var2
-; OPT: call void (...)* @dummy(i32 %1, i32 1)
+; OPT: call void (...) @dummy(i32 %1, i32 1)
 
 ; CHECK-DAG: movq __imp_Var1(%rip), [[R1:%[a-z]{3}]]
 ; CHECK-DAG: movq __imp_Var2(%rip), [[R2:%[a-z]{3}]]
   %1 = load i32, i32* @Var1
   %2 = load i32, i32* @Var2
-  call void(...)* @dummy(i32 %1, i32 %2)
+  call void(...) @dummy(i32 %1, i32 %2)
 
   ret void
 }
diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
index eb9484c..9db654f 100644
--- a/test/CodeGen/X86/dllimport.ll
+++ b/test/CodeGen/X86/dllimport.ll
@@ -47,13 +47,13 @@ define void @use() nounwind {
 ; OPT-NOT: call void @inline1()
 ; OPT-NOT: call void @inline2()
 ; OPT-NOT: load i32, i32* @Var2
-; OPT: call void (...)* @dummy(i32 %1, i32 1)
+; OPT: call void (...) @dummy(i32 %1, i32 1)
 
 ; CHECK-DAG: movl __imp__Var1, [[R1:%[a-z]{3}]]
 ; CHECK-DAG: movl __imp__Var2, [[R2:%[a-z]{3}]]
   %1 = load i32, i32* @Var1
   %2 = load i32, i32* @Var2
-  call void(...)* @dummy(i32 %1, i32 %2)
+  call void(...) @dummy(i32 %1, i32 %2)
 
   ret void
 }
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
index 6215519..7fcd530 100644
--- a/test/CodeGen/X86/early-ifcvt.ll
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -62,7 +62,7 @@ if.then37:
 
 if.end41:
   %exit_status.0 = phi i32 [ 2, %if.then29 ], [ 0, %if.then37 ], [ 66, %entry ]
-  call void (...)* @fprintf(i32 %exit_status.0) nounwind
+  call void (...) @fprintf(i32 %exit_status.0) nounwind
   unreachable
 }
 
diff --git a/test/CodeGen/X86/exedeps-movq.ll b/test/CodeGen/X86/exedeps-movq.ll
new file mode 100644
index 0000000..b702c87
--- /dev/null
+++ b/test/CodeGen/X86/exedeps-movq.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX
+
+; Verify that we select the correct version of the instruction that stores the low 64 bits
+; of a 128-bit vector. We want to avoid int/fp domain crossing penalties, so ignore the
+; bitcast ops and choose:
+;
+; movlps for floats
+; movlpd for doubles
+; movq for integers
+
+define void @store_floats(<4 x float> %x, i64* %p) {
+; SSE-LABEL: store_floats:
+; SSE:       # BB#0:
+; SSE-NEXT:    addps %xmm0, %xmm0
+; SSE-NEXT:    movlps %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: store_floats:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddps %xmm0, %xmm0, %xmm0
+
+
+; !!! FIXME - the AVX version is not handled correctly: floats should use vmovlps, not vmovq.
+; AVX-NEXT:    vmovq %xmm0, (%rdi)
+
+
+; AVX-NEXT:    retq
+  %a = fadd <4 x float> %x, %x
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %c = bitcast <2 x float> %b to i64
+  store i64 %c, i64* %p
+  ret void
+}
+
+define void @store_double(<2 x double> %x, i64* %p) {
+; SSE-LABEL: store_double:
+; SSE:       # BB#0:
+; SSE-NEXT:    addpd %xmm0, %xmm0
+; SSE-NEXT:    movlpd %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: store_double:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovlpd %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  %a = fadd <2 x double> %x, %x
+  %b = extractelement <2 x double> %a, i32 0
+  %c = bitcast double %b to i64
+  store i64 %c, i64* %p
+  ret void
+}
+
+define void @store_int(<4 x i32> %x, <2 x float>* %p) {
+; SSE-LABEL: store_int:
+; SSE:       # BB#0:
+; SSE-NEXT:    paddd %xmm0, %xmm0
+; SSE-NEXT:    movq %xmm0, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: store_int:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovq %xmm0, (%rdi)
+; AVX-NEXT:    retq
+  %a = add <4 x i32> %x, %x
+  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %c = bitcast <2 x i32> %b to <2 x float>
+  store <2 x float> %c, <2 x float>* %p
+  ret void
+}
+
diff --git a/test/CodeGen/X86/extern_weak.ll b/test/CodeGen/X86/extern_weak.ll
index 01e32aa..c2ff09f 100644
--- a/test/CodeGen/X86/extern_weak.ll
+++ b/test/CodeGen/X86/extern_weak.ll
@@ -5,7 +5,7 @@
 declare extern_weak i32 @X(i8*)
 
 define void @bar() {
-        tail call void (...)* @foo( )
+        tail call void (...) @foo( )
         ret void
 }
 
diff --git a/test/CodeGen/X86/fast-isel-i1.ll b/test/CodeGen/X86/fast-isel-i1.ll
index d72a31c..589de76 100644
--- a/test/CodeGen/X86/fast-isel-i1.ll
+++ b/test/CodeGen/X86/fast-isel-i1.ll
@@ -23,14 +23,15 @@ exit:		; preds = %next
 
 define void @test2(i8* %a) nounwind {
 entry:
+; clang uses i8 constants for booleans, so we test with an i8 1.
 ; CHECK-LABEL: test2:
 ; CHECK: movb {{.*}} %al
 ; CHECK-NEXT: xorb $1, %al
 ; CHECK-NEXT: testb $1
   %tmp = load i8, i8* %a, align 1
-  %tobool = trunc i8 %tmp to i1
-  %tobool2 = xor i1 %tobool, true
-  br i1 %tobool2, label %if.then, label %if.end
+  %xor = xor i8 %tmp, 1
+  %tobool = trunc i8 %xor to i1
+  br i1 %tobool, label %if.then, label %if.end
 
 if.then:
   call void @test2(i8* null)
diff --git a/test/CodeGen/X86/fast-isel-sext.ll b/test/CodeGen/X86/fast-isel-sext.ll
new file mode 100644
index 0000000..ca1558e
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-sext.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux -fast-isel -show-mc-encoding < %s | FileCheck %s
+
+; CHECK-LABEL: f:
+; CHECK:       addl $-2, %eax         # encoding: [0x83,0xc0,0xfe]
+define i32 @f(i32* %y) {
+  %x = load i32, i32* %y
+  %dec = add i32 %x, -2
+  ret i32 %dec
+}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d4bbb63..d748cba 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -190,7 +190,7 @@ define void @test16() nounwind {
 ; CHECK: movl $1, %edi
 ; CHECK: movb $0, %al
 ; CHECK: callq _test16callee
-  call void (...)* @test16callee(i32 1)
+  call void (...) @test16callee(i32 1)
   br label %block2
 
 block2:
@@ -201,7 +201,7 @@ block2:
 ; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
 ; AVX: movb $1, %al
 ; AVX: callq _test16callee
-  call void (...)* @test16callee(double 1.000000e+00)
+  call void (...) @test16callee(double 1.000000e+00)
   ret void
 }
 
diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll
new file mode 100644
index 0000000..279bb06
--- /dev/null
+++ b/test/CodeGen/X86/fdiv-combine.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Anything more than one division using a single divisor operand
+; should be converted into a reciprocal and multiplication.
+
+define float @div1_arcp(float %x, float %y, float %z) #0 {
+; CHECK-LABEL: div1_arcp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    divss %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %div1 = fdiv arcp float %x, %y
+  ret float %div1
+}
+
+define float @div2_arcp(float %x, float %y, float %z) #0 {
+; CHECK-LABEL: div2_arcp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm2, %xmm3
+; CHECK-NEXT:    mulss %xmm3, %xmm0
+; CHECK-NEXT:    mulss %xmm1, %xmm0
+; CHECK-NEXT:    mulss %xmm3, %xmm0
+; CHECK-NEXT:    retq
+  %div1 = fdiv arcp float %x, %z
+  %mul = fmul arcp float %div1, %y
+  %div2 = fdiv arcp float %mul, %z
+  ret float %div2
+}
+
+; FIXME: If the backend understands 'arcp', then this attribute is unnecessary.
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fltused.ll b/test/CodeGen/X86/fltused.ll
index dcc1382..6c5d8ce 100644
--- a/test/CodeGen/X86/fltused.ll
+++ b/test/CodeGen/X86/fltused.ll
@@ -11,7 +11,7 @@
 
 define i32 @main() nounwind {
 entry:
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/fltused_function_pointer.ll b/test/CodeGen/X86/fltused_function_pointer.ll
index ba5879a..a41ae48 100644
--- a/test/CodeGen/X86/fltused_function_pointer.ll
+++ b/test/CodeGen/X86/fltused_function_pointer.ll
@@ -11,7 +11,7 @@
 
 define i32 @foo(i32 (i8*, ...)* %f) nounwind {
 entry:
-  %call = tail call i32 (i8*, ...)* %f(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
+  %call = tail call i32 (i8*, ...) %f(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double 1.000000e+000) nounwind
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/fp-stack-O0.ll b/test/CodeGen/X86/fp-stack-O0.ll
index df90254..79ef28b 100644
--- a/test/CodeGen/X86/fp-stack-O0.ll
+++ b/test/CodeGen/X86/fp-stack-O0.ll
@@ -17,7 +17,7 @@ declare i32 @x2(x86_fp80, x86_fp80) nounwind
 ; CHECK-NEXT: x2
 define i32 @test1() nounwind uwtable ssp {
 entry:
-  %call = call x86_fp80 (...)* bitcast (x86_fp80 (i32)* @x1 to x86_fp80 (...)*)(i32 -1)
+  %call = call x86_fp80 (...) bitcast (x86_fp80 (i32)* @x1 to x86_fp80 (...)*)(i32 -1)
   %call1 = call i32 @x2(x86_fp80 %call, x86_fp80 0xK401EFFFFFFFF00000000)
   ret i32 %call1
 }
diff --git a/test/CodeGen/X86/fp-stack-ret-store.ll b/test/CodeGen/X86/fp-stack-ret-store.ll
index 05dfc54..c7cbb2a 100644
--- a/test/CodeGen/X86/fp-stack-ret-store.ll
+++ b/test/CodeGen/X86/fp-stack-ret-store.ll
@@ -7,7 +7,7 @@ target triple = "i686-apple-darwin8"
 
 define void @bar(double* %P) {
 entry:
-	%tmp = tail call double (...)* @foo( )		; <double> [#uses=1]
+	%tmp = tail call double (...) @foo( )		; <double> [#uses=1]
 	store double %tmp, double* %P, align 8
 	ret void
 }
@@ -16,7 +16,7 @@ declare double @foo(...)
 
 define void @bar2(float* %P) {
 entry:
-	%tmp = tail call double (...)* @foo2( )		; <double> [#uses=1]
+	%tmp = tail call double (...) @foo2( )		; <double> [#uses=1]
 	%tmp1 = fptrunc double %tmp to float		; <float> [#uses=1]
 	store float %tmp1, float* %P, align 4
 	ret void
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
index 56c8c27..8eb452a 100644
--- a/test/CodeGen/X86/fpstack-debuginstr-kill.ll
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -32,7 +32,7 @@ sw.bb735:                                         ; preds = %if.end511
   unreachable
 
 if.end41.i2210:                                   ; preds = %if.end511
-  call void @llvm.dbg.value(metadata x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280, i64 0, metadata !20, metadata !MDExpression())
+  call void @llvm.dbg.value(metadata x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280, i64 0, metadata !20, metadata !MDExpression()), !dbg !MDLocation(scope: !4)
   unreachable
 
 sw.bb992:                                         ; preds = %if.end511
diff --git a/test/CodeGen/X86/frameescape.ll b/test/CodeGen/X86/frameescape.ll
index 40eeb0e..3a624ae 100644
--- a/test/CodeGen/X86/frameescape.ll
+++ b/test/CodeGen/X86/frameescape.ll
@@ -12,11 +12,11 @@ define void @print_framealloc_from_fp(i8* %fp) {
   %a.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 0)
   %a = bitcast i8* %a.i8 to i32*
   %a.val = load i32, i32* %a
-  call i32 (i8*, ...)* @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %a.val)
+  call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %a.val)
   %b.i8 = call i8* @llvm.framerecover(i8* bitcast (void()* @alloc_func to i8*), i8* %fp, i32 1)
   %b = bitcast i8* %b.i8 to i32*
   %b.val = load i32, i32* %b
-  call i32 (i8*, ...)* @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %b.val)
+  call i32 (i8*, ...) @printf(i8* getelementptr ([10 x i8], [10 x i8]* @str, i32 0, i32 0), i32 %b.val)
   store i32 42, i32* %b
   ret void
 }
@@ -53,7 +53,7 @@ define void @print_framealloc_from_fp(i8* %fp) {
 define void @alloc_func() {
   %a = alloca i32
   %b = alloca i32
-  call void (...)* @llvm.frameescape(i32* %a, i32* %b)
+  call void (...) @llvm.frameescape(i32* %a, i32* %b)
   store i32 42, i32* %a
   store i32 13, i32* %b
   %fp = call i8* @llvm.frameaddress(i32 0)
@@ -97,7 +97,7 @@ define i32 @main() {
 define void @alloc_func_no_frameaddr() {
   %a = alloca i32
   %b = alloca i32
-  call void (...)* @llvm.frameescape(i32* %a, i32* %b)
+  call void (...) @llvm.frameescape(i32* %a, i32* %b)
   store i32 42, i32* %a
   store i32 13, i32* %b
   call void @print_framealloc_from_fp(i8* null)
diff --git a/test/CodeGen/X86/h-registers-3.ll b/test/CodeGen/X86/h-registers-3.ll
index 8a0b07b..29d0c28 100644
--- a/test/CodeGen/X86/h-registers-3.ll
+++ b/test/CodeGen/X86/h-registers-3.ll
@@ -3,7 +3,7 @@
 
 define zeroext i8 @foo() nounwind ssp {
 entry:
-  %0 = tail call zeroext i16 (...)* @bar() nounwind
+  %0 = tail call zeroext i16 (...) @bar() nounwind
   %1 = lshr i16 %0, 8
   %2 = trunc i16 %1 to i8
   ret i8 %2
diff --git a/test/CodeGen/X86/hoist-common.ll b/test/CodeGen/X86/hoist-common.ll
index 01d1b8c..65f8340 100644
--- a/test/CodeGen/X86/hoist-common.ll
+++ b/test/CodeGen/X86/hoist-common.ll
@@ -26,7 +26,7 @@ entry:
 
 if.then:
 ; CHECK: callq
-  %call = tail call zeroext i1 (...)* @foo() nounwind
+  %call = tail call zeroext i1 (...) @foo() nounwind
   br label %return
 
 return:
diff --git a/test/CodeGen/X86/inline-asm-duplicated-constraint.ll b/test/CodeGen/X86/inline-asm-duplicated-constraint.ll
new file mode 100644
index 0000000..2ef5474
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-duplicated-constraint.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86-64 -no-integrated-as -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; CHECK-LABEL: test1:
+; CHECK: movl	(%rdi), %eax
+; CHECK: nop
+; CHECK: movl	%eax, (%rdi)
+; CHECK: ret
+define void @test1(i32* %l) {
+  %load = load i32, i32* %l
+  call void asm "nop", "=*rmrm,0m0m,~{dirflag},~{fpsr},~{flags}"(i32* %l, i32 %load)
+  ret void
+}
diff --git a/test/CodeGen/X86/invalid-shift-immediate.ll b/test/CodeGen/X86/invalid-shift-immediate.ll
index 21ad6e8..1fb80c7 100644
--- a/test/CodeGen/X86/invalid-shift-immediate.ll
+++ b/test/CodeGen/X86/invalid-shift-immediate.ll
@@ -17,7 +17,7 @@ entry:
 	br i1 %toBool, label %bb, label %bb5
 
 bb:		; preds = %entry
-	%tmp4 = call i32 (...)* @bar( ) nounwind 		; <i32> [#uses=0]
+	%tmp4 = call i32 (...) @bar( ) nounwind 		; <i32> [#uses=0]
 	br label %bb5
 
 bb5:		; preds = %bb, %entry
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index 31a7af3..ca3e8bf 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -9,11 +9,11 @@ entry:
 	br i1 %tmp, label %cond_true, label %cond_next
 
 cond_true:		; preds = %entry
-	%tmp2 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp2 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	br label %cond_next
 
 cond_next:		; preds = %cond_true, %entry
-	%tmp3 = tail call i32 (...)* @baz( )		; <i32> [#uses=0]
+	%tmp3 = tail call i32 (...) @baz( )		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/licm-nested.ll b/test/CodeGen/X86/licm-nested.ll
index 4ec2b52..42e6d12 100644
--- a/test/CodeGen/X86/licm-nested.ll
+++ b/test/CodeGen/X86/licm-nested.ll
@@ -81,7 +81,7 @@ for.inc35:                                        ; preds = %for.body15, %for.en
 
 while.end:                                        ; preds = %while.cond.loopexit, %while.cond.preheader
   %count.0.lcssa = phi i32 [ 0, %while.cond.preheader ], [ %count.1, %while.cond.loopexit ] ; <i32> [#uses=1]
-  %call40 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i32 %count.0.lcssa) nounwind ; <i32> [#uses=0]
+  %call40 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i32 %count.0.lcssa) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/licm-regpressure.ll b/test/CodeGen/X86/licm-regpressure.ll
new file mode 100644
index 0000000..0ab6554
--- /dev/null
+++ b/test/CodeGen/X86/licm-regpressure.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; This test currently fails as MachineLICM does not compute register pressure
+; correctly. More details: llvm.org/PR23143
+; XFAIL: *
+
+; MachineLICM should take register pressure into account.
+; CHECK-NOT: Spill
+
+%struct.A = type { i32, i32, i32, i32, i32, i32, i32 }
+
+define void @test(i1 %b, %struct.A* %a) nounwind {
+entry:
+  br label %loop-header
+
+loop-header:
+  br label %loop-body
+
+loop-body:
+  %0 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
+  %1 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 1
+  %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 2
+  %3 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 3
+  %4 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 4
+  %5 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 5
+  %6 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 6
+  call void @assign(i32* %0)
+  call void @assign(i32* %1)
+  call void @assign(i32* %2)
+  call void @assign(i32* %3)
+  call void @assign(i32* %4)
+  call void @assign(i32* %5)
+  call void @assign(i32* %6)
+  br i1 %b, label %loop-body, label %loop-exit
+
+loop-exit:
+  ret void
+}
+
+declare void @assign(i32*)
diff --git a/test/CodeGen/X86/licm-symbol.ll b/test/CodeGen/X86/licm-symbol.ll
index 854ea0b..0f115dd 100644
--- a/test/CodeGen/X86/licm-symbol.ll
+++ b/test/CodeGen/X86/licm-symbol.ll
@@ -29,11 +29,11 @@ bb151:                                            ; preds = %bb59, %bb56, %bb14
   br i1 undef, label %bb56, label %bb59
 
 bb56:                                             ; preds = %bb151
-  %t0 = call i32 (%struct.FILE*)* @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
+  %t0 = call i32 (%struct.FILE*) @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
   br label %bb151
 
 bb59:                                             ; preds = %bb151
-  %t1 = call i32 (%struct.FILE*)* @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
+  %t1 = call i32 (%struct.FILE*) @fprintf(%struct.FILE* getelementptr inbounds ([0 x %struct.FILE], [0 x %struct.FILE]* @__sF, i32 0, i32 2)) nounwind
   br label %bb151
 }
 
diff --git a/test/CodeGen/X86/lsr-normalization.ll b/test/CodeGen/X86/lsr-normalization.ll
index e75f5b2..09c892c 100644
--- a/test/CodeGen/X86/lsr-normalization.ll
+++ b/test/CodeGen/X86/lsr-normalization.ll
@@ -71,7 +71,7 @@ bb25:                                             ; preds = %bb25, %bb23
 
 bb32:                                             ; preds = %bb25
   %tmp33 = mul i64 %tmp31, %tmp24                 ; <i64> [#uses=1]
-  %tmp34 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @0, i64 0, i64 0), i64 %tmp33) nounwind
+  %tmp34 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @0, i64 0, i64 0), i64 %tmp33) nounwind
   br label %bb35
 
 bb35:                                             ; preds = %bb32, %bb14
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index ce3ab4c..c6876d2 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -62,7 +62,7 @@ if.end34:                                         ; preds = %sw.bb
 ; CHECK: %if.end34
 ; CHECK: leal
 ; CHECK-NOT: imull
-  tail call void (...)* @printf(i32 %test_case, i32 %mul20) nounwind
+  tail call void (...) @printf(i32 %test_case, i32 %mul20) nounwind
   %tmp = mul i32 %scale, %test_case
   %tmp752 = mul i32 %tmp, 3
   %tmp753 = zext i32 %tmp752 to i64
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index d5a3d8e..e5f1f52 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -11,7 +11,7 @@ declare i32 @memcmp(...)
 
 define void @memcmp2(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 2) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 2) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -31,7 +31,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -49,7 +49,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp4(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 4) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 4) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -66,7 +66,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -82,7 +82,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* %Y, i32 8) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 8) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
@@ -99,7 +99,7 @@ return:                                           ; preds = %entry
 
 define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind {
 entry:
-  %0 = tail call i32 (...)* @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 8) nounwind ; <i32> [#uses=1]
+  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 8) nounwind ; <i32> [#uses=1]
   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
   br i1 %1, label %return, label %bb
 
diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll
index 256db8b..2cc70e1 100644
--- a/test/CodeGen/X86/misched-code-difference-with-debug.ll
+++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll
@@ -34,9 +34,9 @@ entry:
   %c = alloca %class.C, align 1
   %0 = load i8, i8* @argc, align 1
   %conv = sext i8 %0 to i32
-  %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
+  %call = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
   %1 = load i8, i8* @argc, align 1
-  %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
+  %call2 = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
   ret void
 }
 
@@ -47,13 +47,13 @@ define void @test_with_debug() {
 entry:
   %c = alloca %class.C, align 1
   %0 = load i8, i8* @argc, align 1
-  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29)
+  tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29), !dbg !MDLocation(scope: !13)
   %conv = sext i8 %0 to i32
-  tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29)
-  %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
+  tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !MDLocation(scope: !13)
+  %call = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
   %1 = load i8, i8* @argc, align 1
-  call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29)
-  %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
+  call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !MDLocation(scope: !13)
+  %call2 = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
   ret void
 }
 
diff --git a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index 36ccfe9..2727e3e 100644
--- a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -14,7 +14,7 @@ define void @t3() nounwind  {
 ; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
   %tmp3 = load <8 x i8>, <8 x i8>* @g_v8qi, align 8
   %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-  %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
   ret void
 }
 
@@ -34,7 +34,7 @@ define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind  {
   %v2b = bitcast x86_mmx %v2 to <8 x i8>
   %tmp3 = add <8 x i8> %v1a, %v2b
   %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
-  %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
+  %tmp4 = tail call i32 (...) @pass_v8qi( x86_mmx %tmp3a ) nounwind
   ret void
 }
 
diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll
index 4aa10a9..00c8039 100644
--- a/test/CodeGen/X86/mmx-bitcast.ll
+++ b/test/CodeGen/X86/mmx-bitcast.ll
@@ -75,8 +75,7 @@ define i64 @t5(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-NEXT:    movd
 ; CHECK-NEXT:    movd
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; CHECK-NEXT:    movd %xmm0, %rax
+; CHECK-NEXT:    movd %xmm1, %rax
 ; CHECK-NEXT:    retq
   %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index 4278910..f89e524 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -265,7 +265,7 @@ define void @test10() optsize {
   store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr
   %good_ptr = load volatile void (i32, i32, i32, i32)*, void (i32, i32, i32, i32)** %stack_fptr
   call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"()
-  call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4)
+  call void (i32, i32, i32, i32) %good_ptr(i32 1, i32 2, i32 3, i32 4)
   ret void
 }
 
diff --git a/test/CodeGen/X86/musttail-fastcall.ll b/test/CodeGen/X86/musttail-fastcall.ll
index ed3668d..a95e0ff 100644
--- a/test/CodeGen/X86/musttail-fastcall.ll
+++ b/test/CodeGen/X86/musttail-fastcall.ll
@@ -9,13 +9,13 @@
 declare void @puts(i8*)
 
 define i32 @call_fast_thunk() {
-  %r = call x86_fastcallcc i32 (...)* @fast_thunk(i32 inreg 1, i32 inreg 2, i32 3)
+  %r = call x86_fastcallcc i32 (...) @fast_thunk(i32 inreg 1, i32 inreg 2, i32 3)
   ret i32 %r
 }
 
 define x86_fastcallcc i32 @fast_thunk(...) {
   call void @puts(i8* getelementptr ([4 x i8], [4 x i8]* @asdf, i32 0, i32 0))
-  %r = musttail call x86_fastcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @fast_target to i32 (...)*) (...)
+  %r = musttail call x86_fastcallcc i32 (...) bitcast (i32 (i32, i32, i32)* @fast_target to i32 (...)*) (...)
   ret i32 %r
 }
 
@@ -38,13 +38,13 @@ define x86_fastcallcc i32 @fast_target(i32 inreg %a, i32 inreg %b, i32 %c) {
 ; Repeat the test for vectorcall, which has XMM registers.
 
 define i32 @call_vector_thunk() {
-  %r = call x86_vectorcallcc i32 (...)* @vector_thunk(i32 inreg 1, i32 inreg 2, i32 3)
+  %r = call x86_vectorcallcc i32 (...) @vector_thunk(i32 inreg 1, i32 inreg 2, i32 3)
   ret i32 %r
 }
 
 define x86_vectorcallcc i32 @vector_thunk(...) {
   call void @puts(i8* getelementptr ([4 x i8], [4 x i8]* @asdf, i32 0, i32 0))
-  %r = musttail call x86_vectorcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @vector_target to i32 (...)*) (...)
+  %r = musttail call x86_vectorcallcc i32 (...) bitcast (i32 (i32, i32, i32)* @vector_target to i32 (...)*) (...)
   ret i32 %r
 }
 
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
index 52115b2..3613f4c 100644
--- a/test/CodeGen/X86/musttail-varargs.ll
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -16,8 +16,8 @@ define void @f_thunk(i8* %this, ...) {
   %ap_i8 = bitcast [4 x i8*]* %ap to i8*
   call void @llvm.va_start(i8* %ap_i8)
 
-  %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
-  musttail call void (i8*, ...)* %fptr(i8* %this, ...)
+  %fptr = call void(i8*, ...)*(i8*) @get_f(i8* %this)
+  musttail call void (i8*, ...) %fptr(i8* %this, ...)
   ret void
 }
 
@@ -84,7 +84,7 @@ define void @f_thunk(i8* %this, ...) {
 
 define void @g_thunk(i8* %fptr_i8, ...) {
   %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
-  musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...)
+  musttail call void (i8*, ...) %fptr(i8* %fptr_i8, ...)
   ret void
 }
 
@@ -114,7 +114,7 @@ then:
   %a_p = getelementptr %struct.Foo, %struct.Foo* %this, i32 0, i32 1
   %a_i8 = load i8*, i8** %a_p
   %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
-  musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...)
+  musttail call void (%struct.Foo*, ...) %a(%struct.Foo* %this, ...)
   ret void
 
 else:
@@ -122,7 +122,7 @@ else:
   %b_i8 = load i8*, i8** %b_p
   %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
   store i32 42, i32* @g
-  musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...)
+  musttail call void (%struct.Foo*, ...) %b(%struct.Foo* %this, ...)
   ret void
 }
 
diff --git a/test/CodeGen/X86/narrow-shl-cst.ll b/test/CodeGen/X86/narrow-shl-cst.ll
index 40b9760..c9e9a3d 100644
--- a/test/CodeGen/X86/narrow-shl-cst.ll
+++ b/test/CodeGen/X86/narrow-shl-cst.ll
@@ -99,3 +99,26 @@ define i64 @test11(i64 %x) nounwind {
 ; CHECK: xorq $-65536
 ; CHECK: shlq $33
 }
+
+; PR23098
+define i32 @test12(i32 %x, i32* %y) nounwind {
+  %shifted = shl i32 %x, 1            ; x << 1
+  %masked = and i32 %shifted, 255     ; keep only the low 8 bits of the shifted value
+  store i32 %masked, i32* %y          ; the masked value is stored through %y ...
+  ret i32 %masked                     ; ... and also returned
+; CHECK-LABEL: test12:
+; CHECK: andl $127
+; CHECK-NEXT: addl
+; CHECK-NOT: shl
+}
+
+define i64 @test13(i64 %x, i64* %y) nounwind {
+  %shifted = shl i64 %x, 1            ; x << 1
+  %masked = and i64 %shifted, 255     ; keep only the low 8 bits of the shifted value
+  store i64 %masked, i64* %y          ; the masked value is stored through %y ...
+  ret i64 %masked                     ; ... and also returned
+; CHECK-LABEL: test13:
+; CHECK: andq $127
+; CHECK-NEXT: addq
+; CHECK-NOT: shl
+}
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index f62f372..8c08b3c 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -1,31 +1,303 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
 
 ; Make sure that we generate non-temporal stores for the test cases below.
+; We use xorps for zeroing, so domain information isn't available anymore.
 
-define void @test1(<4 x float>* %dst) {
-; CHECK-LABEL: test1:
+define void @test_zero_v4f32(<4 x float>* %dst) {
+; CHECK-LABEL: test_zero_v4f32:
 ; SSE: movntps
 ; AVX: vmovntps
   store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
   ret void
 }
 
-define void @test2(<4 x i32>* %dst) {
-; CHECK-LABEL: test2:
+define void @test_zero_v4i32(<4 x i32>* %dst) {
+; CHECK-LABEL: test_zero_v4i32:
 ; SSE: movntps
 ; AVX: vmovntps
   store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
   ret void
 }
 
-define void @test3(<2 x double>* %dst) {
-; CHECK-LABEL: test3:
+define void @test_zero_v2f64(<2 x double>* %dst) {
+; CHECK-LABEL: test_zero_v2f64:
 ; SSE: movntps
 ; AVX: vmovntps
   store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
   ret void
 }
 
+define void @test_zero_v2i64(<2 x i64>* %dst) {
+; CHECK-LABEL: test_zero_v2i64:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v8i16(<8 x i16>* %dst) {
+; CHECK-LABEL: test_zero_v8i16:
+; SSE: movntps
+; AVX: vmovntps
+  store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v16i8(<16 x i8>* %dst) {
+; CHECK-LABEL: test_zero_v16i8:
+; SSE: movntps
+; AVX: vmovntps
+  store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+; And now YMM versions.
+
+define void @test_zero_v8f32(<8 x float>* %dst) {
+; CHECK-LABEL: test_zero_v8f32:
+; AVX: vmovntps %ymm
+  store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v8i32(<8 x i32>* %dst) {
+; CHECK-LABEL: test_zero_v8i32:
+; AVX2: vmovntps %ymm
+  store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v4f64(<4 x double>* %dst) {
+; CHECK-LABEL: test_zero_v4f64:
+; AVX: vmovntps %ymm
+  store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v4i64(<4 x i64>* %dst) {
+; CHECK-LABEL: test_zero_v4i64:
+; AVX2: vmovntps %ymm
+  store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v16i16(<16 x i16>* %dst) {
+; CHECK-LABEL: test_zero_v16i16:
+; AVX2: vmovntps %ymm
+  store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_zero_v32i8(<32 x i8>* %dst) {
+; CHECK-LABEL: test_zero_v32i8:
+; AVX2: vmovntps %ymm
+  store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+
+; Check that we also handle arguments.  Here the type survives longer.
+
+define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
+; CHECK-LABEL: test_arg_v4f32:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
+; CHECK-LABEL: test_arg_v4i32:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
+; CHECK-LABEL: test_arg_v2f64:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
+; CHECK-LABEL: test_arg_v2i64:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
+; CHECK-LABEL: test_arg_v8i16:
+; SSE: movntps
+; AVX: vmovntps
+  store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
+; CHECK-LABEL: test_arg_v16i8:
+; SSE: movntps
+; AVX: vmovntps
+  store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+; And now YMM versions.
+
+define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
+; CHECK-LABEL: test_arg_v8f32:
+; AVX: vmovntps %ymm
+  store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
+; CHECK-LABEL: test_arg_v8i32:
+; AVX2: vmovntps %ymm
+  store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
+; CHECK-LABEL: test_arg_v4f64:
+; AVX: vmovntps %ymm
+  store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
+; CHECK-LABEL: test_arg_v4i64:
+; AVX2: vmovntps %ymm
+  store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
+; CHECK-LABEL: test_arg_v16i16:
+; AVX2: vmovntps %ymm
+  store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
+; CHECK-LABEL: test_arg_v32i8:
+; AVX2: vmovntps %ymm
+  store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+
+; Now check that if the execution domain is trivially visible, we use it.
+; We use an add to make the type survive all the way to the MOVNT.
+
+define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
+; CHECK-LABEL: test_op_v4f32:
+; SSE: movntps
+; AVX: vmovntps
+  %r = fadd <4 x float> %a, %b
+  store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
+; CHECK-LABEL: test_op_v4i32:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <4 x i32> %a, %b
+  store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
+; CHECK-LABEL: test_op_v2f64:
+; SSE: movntpd
+; AVX: vmovntpd
+  %r = fadd <2 x double> %a, %b
+  store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
+; CHECK-LABEL: test_op_v2i64:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <2 x i64> %a, %b
+  store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
+; CHECK-LABEL: test_op_v8i16:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <8 x i16> %a, %b
+  store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
+; CHECK-LABEL: test_op_v16i8:
+; SSE: movntdq
+; AVX: vmovntdq
+  %r = add <16 x i8> %a, %b
+  store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+; And now YMM versions.
+
+define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
+; CHECK-LABEL: test_op_v8f32:
+; AVX: vmovntps %ymm
+  %r = fadd <8 x float> %a, %b
+  store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
+; CHECK-LABEL: test_op_v8i32:
+; AVX2: vmovntdq %ymm
+  %r = add <8 x i32> %a, %b
+  store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
+; CHECK-LABEL: test_op_v4f64:
+; AVX: vmovntpd %ymm
+  %r = fadd <4 x double> %a, %b
+  store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
+; CHECK-LABEL: test_op_v4i64:
+; AVX2: vmovntdq %ymm
+  %r = add <4 x i64> %a, %b
+  store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
+; CHECK-LABEL: test_op_v16i16:
+; AVX2: vmovntdq %ymm
+  %r = add <16 x i16> %a, %b
+  store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
+define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
+; CHECK-LABEL: test_op_v32i8:
+; AVX2: vmovntdq %ymm
+  %r = add <32 x i8> %a, %b
+  store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
+  ret void
+}
+
 !1 = !{i32 1}
diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll
index 9ebf890..ae3ed3f 100644
--- a/test/CodeGen/X86/or-branch.ll
+++ b/test/CodeGen/X86/or-branch.ll
@@ -2,14 +2,14 @@
 
 define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind {
 entry:
-	%tmp = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	%tmp.upgrd.1 = icmp eq i32 %X, 0		; <i1> [#uses=1]
 	%tmp3 = icmp slt i32 %Y, 5		; <i1> [#uses=1]
 	%tmp4 = or i1 %tmp3, %tmp.upgrd.1		; <i1> [#uses=1]
 	br i1 %tmp4, label %cond_true, label %UnifiedReturnBlock
 
 cond_true:		; preds = %entry
-	%tmp5 = tail call i32 (...)* @bar( )		; <i32> [#uses=0]
+	%tmp5 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
 	ret void
 
 UnifiedReturnBlock:		; preds = %entry
diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
index 37bdd7d..5c39438 100644
--- a/test/CodeGen/X86/patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
@@ -25,9 +25,9 @@ entry:
 ; FAST:       movq %rax, (%rsp)
 ; FAST:       callq
   %resolveCall2 = inttoptr i64 -559038736 to i8*
-  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
   %resolveCall3 = inttoptr i64 -559038737 to i8*
-  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
   ret void
 }
 
@@ -51,7 +51,7 @@ entry:
 ; FAST-NEXT:  movabsq $-559038736, %r11
 ; FAST-NEXT:  callq *%r11
   %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
   ret i64 %result
 }
 
@@ -79,7 +79,7 @@ entry:
 ; FAST-NEXT:  movabsq $-559038736, %r11
 ; FAST-NEXT:  callq *%r11
   %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index 24e324f..eda13fd 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -15,9 +15,9 @@ entry:
 ; CHECK:      movq %[[REG]], %rax
 ; CHECK:      ret
   %resolveCall2 = inttoptr i64 -559038736 to i8*
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
   %resolveCall3 = inttoptr i64 -559038737 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
   ret i64 %result
 }
 
@@ -35,7 +35,7 @@ entry:
   store i64 11, i64* %metadata
   store i64 12, i64* %metadata
   store i64 13, i64* %metadata
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
   ret void
 }
 
@@ -48,14 +48,14 @@ entry:
   %tmp80 = add i64 %tmp79, -16
   %tmp81 = inttoptr i64 %tmp80 to i64*
   %tmp82 = load i64, i64* %tmp81, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
   %tmp83 = load i64, i64* %tmp33, align 8
   %tmp84 = add i64 %tmp83, -24
   %tmp85 = inttoptr i64 %tmp84 to i64*
   %tmp86 = load i64, i64* %tmp85, align 8
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
   ret i64 10
 }
 
@@ -67,7 +67,7 @@ entry:
 ; CHECK:      nopl 8(%rax,%rax)
 ; CHECK-NEXT: popq
 ; CHECK-NEXT: ret
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
   ret void
 }
 
@@ -78,7 +78,7 @@ entry:
 ; CHECK:      movabsq $6153737369414576827, %r11
 ; CHECK-NEXT: callq *%r11
   %resolveCall2 = inttoptr i64 6153737369414576827 to i8*
-  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0)
+  %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0)
   ret i64 %result
 }
 
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index ca364f2..a0adba0 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -51,7 +51,7 @@ entry:
   %3 = call i32 asm "", "={ax}"() nounwind        ; <i32> [#uses=1]
   call void asm sideeffect alignstack "movl $0, $1", "{eax},*m,~{dirflag},~{fpsr},~{flags},~{memory}"(i32 %3, i32* %result) nounwind
   %4 = load i32, i32* %result, align 4                 ; <i32> [#uses=1]
-  %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 %4) nounwind ; <i32> [#uses=0]
+  %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 %4) nounwind ; <i32> [#uses=0]
   store i32 0, i32* %0, align 4
   %6 = load i32, i32* %0, align 4                      ; <i32> [#uses=1]
   store i32 %6, i32* %retval, align 4
diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll
index faaf73b..d543deb 100644
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@@ -69,10 +69,10 @@ entry:
 
 define void @test3() nounwind {
 entry:
-    %tmp = call void(...)*(...)* @afoo()
+    %tmp = call void(...)*(...) @afoo()
     store void(...)* %tmp, void(...)** @pfoo
     %tmp1 = load void(...)*, void(...)** @pfoo
-    call void(...)* %tmp1()
+    call void(...) %tmp1()
     ret void
 ; LINUX-LABEL: test3:
 ; LINUX: 	calll	.L3$pb
@@ -88,7 +88,7 @@ declare void(...)* @afoo(...)
 
 define void @test4() nounwind {
 entry:
-    call void(...)* @foo()
+    call void(...) @foo()
     ret void
 ; LINUX-LABEL: test4:
 ; LINUX: calll	.L4$pb
@@ -146,43 +146,43 @@ define void @test7(i32 %n.u) nounwind {
 entry:
     switch i32 %n.u, label %bb12 [i32 1, label %bb i32 2, label %bb6 i32 4, label %bb7 i32 5, label %bb8 i32 6, label %bb10 i32 7, label %bb1 i32 8, label %bb3 i32 9, label %bb4 i32 10, label %bb9 i32 11, label %bb2 i32 12, label %bb5 i32 13, label %bb11 ]
 bb:
-    tail call void(...)* @foo1()
+    tail call void(...) @foo1()
     ret void
 bb1:
-    tail call void(...)* @foo2()
+    tail call void(...) @foo2()
     ret void
 bb2:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
 bb3:
-    tail call void(...)* @foo3()
+    tail call void(...) @foo3()
     ret void
 bb4:
-    tail call void(...)* @foo4()
+    tail call void(...) @foo4()
     ret void
 bb5:
-    tail call void(...)* @foo5()
+    tail call void(...) @foo5()
     ret void
 bb6:
-    tail call void(...)* @foo1()
+    tail call void(...) @foo1()
     ret void
 bb7:
-    tail call void(...)* @foo2()
+    tail call void(...) @foo2()
     ret void
 bb8:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
 bb9:
-    tail call void(...)* @foo3()
+    tail call void(...) @foo3()
     ret void
 bb10:
-    tail call void(...)* @foo4()
+    tail call void(...) @foo4()
     ret void
 bb11:
-    tail call void(...)* @foo5()
+    tail call void(...) @foo5()
     ret void
 bb12:
-    tail call void(...)* @foo6()
+    tail call void(...) @foo6()
     ret void
     
 ; LINUX-LABEL: test7:
diff --git a/test/CodeGen/X86/pr1489.ll b/test/CodeGen/X86/pr1489.ll
index d37b9a2..13ced2a 100644
--- a/test/CodeGen/X86/pr1489.ll
+++ b/test/CodeGen/X86/pr1489.ll
@@ -48,7 +48,7 @@ entry:
 	%tmp1 = tail call i32 @bar( )		; <i32> [#uses=1]
 	%tmp2 = tail call i32 @foo( )		; <i32> [#uses=1]
 	%tmp3 = tail call i32 @quux( )		; <i32> [#uses=1]
-	%tmp5 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %tmp3, i32 %tmp2, i32 %tmp1, i32 %tmp )		; <i32> [#uses=0]
+	%tmp5 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([13 x i8], [13 x i8]* @.str, i32 0, i32 0), i32 %tmp3, i32 %tmp2, i32 %tmp1, i32 %tmp )		; <i32> [#uses=0]
 	ret i32 undef
 }
 
diff --git a/test/CodeGen/X86/pr18023.ll b/test/CodeGen/X86/pr18023.ll
index ed3d6a0..c7ea20c 100644
--- a/test/CodeGen/X86/pr18023.ll
+++ b/test/CodeGen/X86/pr18023.ll
@@ -24,7 +24,7 @@ define void @func() {
   %3 = load volatile i32, i32* @b, align 4
   store i32 3, i32* @c, align 4
   %4 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4)
   ret void
 }
 
diff --git a/test/CodeGen/X86/pr23246.ll b/test/CodeGen/X86/pr23246.ll
new file mode 100644
index 0000000..6eb24a6
--- /dev/null
+++ b/test/CodeGen/X86/pr23246.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple x86_64-unknown-unknown | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR23246
+; We're really only interested in doing something sane with the shuffle.
+
+; CHECK-LABEL: test:
+; CHECK:      movq2dq %mm0, %xmm0
+; CHECK-NEXT: pshufd {{.*}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT: retq
+define <2 x i64> @test(x86_mmx %a) #0 {
+entry:
+  %b = bitcast x86_mmx %a to <1 x i64>
+  %s = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 undef, i32 0>
+  ret <2 x i64> %s
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/pr2326.ll b/test/CodeGen/X86/pr2326.ll
index 9cf83bb..88c7bb5 100644
--- a/test/CodeGen/X86/pr2326.ll
+++ b/test/CodeGen/X86/pr2326.ll
@@ -17,7 +17,7 @@ entry:
 	%tmp25 = and i1 %toBool23, %toBool24		; <i1> [#uses=1]
 	%tmp2526 = zext i1 %tmp25 to i8		; <i8> [#uses=1]
 	%tmp252627 = zext i8 %tmp2526 to i32		; <i32> [#uses=1]
-	%tmp29 = call i32 (...)* @func_15( i32 %tmp252627, i32 0 ) nounwind 		; <i32> [#uses=0]
+	%tmp29 = call i32 (...) @func_15( i32 %tmp252627, i32 0 ) nounwind 		; <i32> [#uses=0]
 	unreachable
 }
 
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index 6f31c5f..9a162d7 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -19,7 +19,7 @@ entry:
 	%conv = fpext float %neg to double		; <double> [#uses=1]
 	%neg4 = fsub float -0.000000e+00, %tmp3		; <float> [#uses=1]
 	%conv5 = fpext float %neg4 to double		; <double> [#uses=1]
-	%call = call i32 (...)* @printf( i8* getelementptr ([17 x i8], [17 x i8]* @.str, i32 0, i32 0), double %conv, double %conv5 )		; <i32> [#uses=0]
+	%call = call i32 (...) @printf( i8* getelementptr ([17 x i8], [17 x i8]* @.str, i32 0, i32 0), double %conv, double %conv5 )		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/pr2982.ll b/test/CodeGen/X86/pr2982.ll
index ab46005..b7902b8 100644
--- a/test/CodeGen/X86/pr2982.ll
+++ b/test/CodeGen/X86/pr2982.ll
@@ -20,7 +20,7 @@ entry:
         %5 = sext i8 %4 to i32          ; <i32> [#uses=1]
         %6 = add i32 %2, %3             ; <i32> [#uses=1]
         %7 = add i32 %6, %5             ; <i32> [#uses=1]
-        %8 = tail call i32 (...)* @rshift_u_u(i32 %7, i32 0) nounwind          
+        %8 = tail call i32 (...) @rshift_u_u(i32 %7, i32 0) nounwind          
 ; <i32> [#uses=0]
         ret void
 }
diff --git a/test/CodeGen/X86/pr3244.ll b/test/CodeGen/X86/pr3244.ll
index b08a223..c6419d8 100644
--- a/test/CodeGen/X86/pr3244.ll
+++ b/test/CodeGen/X86/pr3244.ll
@@ -10,7 +10,7 @@ entry:
         %1 = load i32, i32* @g_487, align 4          ; <i32> [#uses=1]
         %2 = trunc i16 %0 to i8         ; <i8> [#uses=1]
         %3 = trunc i32 %1 to i8         ; <i8> [#uses=1]
-        %4 = tail call i32 (...)* @func_7(i64 -4455561449541442965, i32 1)
+        %4 = tail call i32 (...) @func_7(i64 -4455561449541442965, i32 1)
 nounwind             ; <i32> [#uses=1]
         %5 = trunc i32 %4 to i8         ; <i8> [#uses=1]
         %6 = mul i8 %3, %2              ; <i8> [#uses=1]
diff --git a/test/CodeGen/X86/pr3250.ll b/test/CodeGen/X86/pr3250.ll
index cccbf54..4ab989e 100644
--- a/test/CodeGen/X86/pr3250.ll
+++ b/test/CodeGen/X86/pr3250.ll
@@ -5,7 +5,7 @@ declare i32 @safe_sub_func_short_u_u(i16 signext, i16 signext) nounwind
 
 define i32 @func_106(i32 %p_107) nounwind {
 entry:
-        %0 = tail call i32 (...)* @safe_div_(i32 %p_107, i32 1) nounwind       
+        %0 = tail call i32 (...) @safe_div_(i32 %p_107, i32 1) nounwind       
         ; <i32> [#uses=1]
         %1 = lshr i32 %0, -9            ; <i32> [#uses=1]
         %2 = trunc i32 %1 to i16                ; <i16> [#uses=1]
diff --git a/test/CodeGen/X86/pr3457.ll b/test/CodeGen/X86/pr3457.ll
index 7264bcd..d4c0020 100644
--- a/test/CodeGen/X86/pr3457.ll
+++ b/test/CodeGen/X86/pr3457.ll
@@ -4,8 +4,8 @@
 
 define void @foo(double* nocapture %P) nounwind {
 entry:
-	%0 = tail call double (...)* @test() nounwind		; <double> [#uses=2]
-	%1 = tail call double (...)* @test() nounwind		; <double> [#uses=2]
+	%0 = tail call double (...) @test() nounwind		; <double> [#uses=2]
+	%1 = tail call double (...) @test() nounwind		; <double> [#uses=2]
 	%2 = fmul double %0, %0		; <double> [#uses=1]
 	%3 = fmul double %1, %1		; <double> [#uses=1]
 	%4 = fadd double %2, %3		; <double> [#uses=1]
diff --git a/test/CodeGen/X86/rd-mod-wr-eflags.ll b/test/CodeGen/X86/rd-mod-wr-eflags.ll
index afa1962..9723721 100644
--- a/test/CodeGen/X86/rd-mod-wr-eflags.ll
+++ b/test/CodeGen/X86/rd-mod-wr-eflags.ll
@@ -39,7 +39,7 @@ store i64 %dec.i, i64* @c, align 8
 %tobool.i = icmp ne i64 %dec.i, 0
 %lor.ext.i = zext i1 %tobool.i to i32
 store i32 %lor.ext.i, i32* @a, align 4
-%call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
+%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
 ret i32 0
 }
 
@@ -53,7 +53,7 @@ store i64 %dec.i, i64* @c, align 8
 %tobool.i = icmp ne i64 %0, 0
 %lor.ext.i = zext i1 %tobool.i to i32
 store i32 %lor.ext.i, i32* @a, align 4
-%call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
+%call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %dec.i) nounwind
 ret i32 0
 }
 
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 83b86ac..fcd0770 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
 
 ; If the target's divss/divps instructions are substantially
 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
@@ -20,13 +20,13 @@ define float @reciprocal_estimate(float %x) #0 {
 ; CHECK-NEXT: movaps
 ; CHECK-NEXT: retq
 
-; BTVER2-LABEL: reciprocal_estimate:
-; BTVER2: vrcpss
-; BTVER2: vmulss
-; BTVER2: vsubss
-; BTVER2: vmulss
-; BTVER2: vaddss
-; BTVER2-NEXT: retq
+; RECIP-LABEL: reciprocal_estimate:
+; RECIP: vrcpss
+; RECIP: vmulss
+; RECIP: vsubss
+; RECIP: vmulss
+; RECIP: vaddss
+; RECIP-NEXT: retq
 
 ; REFINE-LABEL: reciprocal_estimate:
 ; REFINE: vrcpss
@@ -51,13 +51,13 @@ define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
 ; CHECK-NEXT: movaps
 ; CHECK-NEXT: retq
 
-; BTVER2-LABEL: reciprocal_estimate_v4f32:
-; BTVER2: vrcpps
-; BTVER2: vmulps
-; BTVER2: vsubps
-; BTVER2: vmulps
-; BTVER2: vaddps
-; BTVER2-NEXT: retq
+; RECIP-LABEL: reciprocal_estimate_v4f32:
+; RECIP: vrcpps
+; RECIP: vmulps
+; RECIP: vsubps
+; RECIP: vmulps
+; RECIP: vaddps
+; RECIP-NEXT: retq
 
 ; REFINE-LABEL: reciprocal_estimate_v4f32:
 ; REFINE: vrcpps
@@ -85,13 +85,13 @@ define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
 ; CHECK-NEXT: movaps
 ; CHECK-NEXT: retq
 
-; BTVER2-LABEL: reciprocal_estimate_v8f32:
-; BTVER2: vrcpps
-; BTVER2: vmulps
-; BTVER2: vsubps
-; BTVER2: vmulps
-; BTVER2: vaddps
-; BTVER2-NEXT: retq
+; RECIP-LABEL: reciprocal_estimate_v8f32:
+; RECIP: vrcpps
+; RECIP: vmulps
+; RECIP: vsubps
+; RECIP: vmulps
+; RECIP: vaddps
+; RECIP-NEXT: retq
 
 ; REFINE-LABEL: reciprocal_estimate_v8f32:
 ; REFINE: vrcpps
diff --git a/test/CodeGen/X86/scalarize-bitcast.ll b/test/CodeGen/X86/scalarize-bitcast.ll
index 6de511f..60650f4 100644
--- a/test/CodeGen/X86/scalarize-bitcast.ll
+++ b/test/CodeGen/X86/scalarize-bitcast.ll
@@ -21,7 +21,7 @@ entry:
 	%tmp24.i = extractelement <1 x i64> %tmp10.i, i32 0		; <i64> [#uses=1]
 	%tmp10 = bitcast i64 %tmp24.i to <1 x i64>		; <<1 x i64>> [#uses=1]
 	%tmp7 = extractelement <1 x i64> %tmp10, i32 0		; <i64> [#uses=1]
-	%call6 = tail call i32 (...)* @store8888(i64 %tmp7)		; <i32> [#uses=1]
+	%call6 = tail call i32 (...) @store8888(i64 %tmp7)		; <i32> [#uses=1]
 	store i32 %call6, i32* %src
 	ret void
 }
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
new file mode 100644
index 0000000..98471ee
--- /dev/null
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-ilp    | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-hybrid | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=source      | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-burr   | FileCheck %s
+; RUN: llc -march=x86-64 < %s -pre-RA-sched=linearize   | FileCheck %s
+
+; PR22304 https://llvm.org/bugs/show_bug.cgi?id=22304
+; Tests checking backtracking in source scheduler. llc used to crash on them.
+
+; CHECK-LABEL: test1
+define i256 @test1(i256 %a) {
+  %b = add i256 %a, 1 
+  %m = shl i256 %b, 1
+  %p = add i256 %m, 1
+  %v = lshr i256 %b, %p
+  %t = trunc i256 %v to i1
+  %c = shl i256 1, %p
+  %f = select i1 %t, i256 undef, i256 %c
+  ret i256 %f
+}
+
+; CHECK-LABEL: test2
+define i256 @test2(i256 %a) {
+  %b = sub i256 0, %a
+  %c = and i256 %b, %a
+  %d = call i256 @llvm.ctlz.i256(i256 %c, i1 false)
+  ret i256 %d
+}
+
+; CHECK-LABEL: test3
+define i256 @test3(i256 %n) {
+  %m = sub i256 -1, %n
+  %x = sub i256 0, %n
+  %y = and i256 %x, %m
+  %z = call i256 @llvm.ctlz.i256(i256 %y, i1 false)
+  ret i256 %z
+}
+
+declare i256 @llvm.ctlz.i256(i256, i1) nounwind readnone
+
+; CHECK-LABEL: test4
+define i64 @test4(i64 %a, i64 %b) {
+  %r = zext i64 %b to i256
+  %u = add i256 %r, 1
+  %w = and i256 %u, 1461501637330902918203684832716283019655932542975
+  %x = zext i64 %a to i256
+  %c = icmp uge i256 %w, %x
+  %y = select i1 %c, i64 0, i64 1
+  %z = add i64 %y, 1
+  ret i64 %z
+}
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 4127288..55eaab9 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -11,16 +11,16 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
 
 ; We used to crash with filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -filetype=obj -o /dev/null
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj -o /dev/null
 
 ; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris 2> %t.log
 ; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-Solaris
diff --git a/test/CodeGen/X86/seh-safe-div.ll b/test/CodeGen/X86/seh-safe-div.ll
index 477ad36..ba54f1c 100644
--- a/test/CodeGen/X86/seh-safe-div.ll
+++ b/test/CodeGen/X86/seh-safe-div.ll
@@ -173,15 +173,15 @@ define i32 @main() {
   store i32 10, i32* %n.addr, align 4
   store i32 2, i32* %d.addr, align 4
   %r1 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
-  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r1)
+  call void (i8*, ...) @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r1)
 
   store i32 10, i32* %n.addr, align 4
   store i32 0, i32* %d.addr, align 4
   %r2 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
-  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r2)
+  call void (i8*, ...) @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r2)
 
   %r3 = call i32 @safe_div(i32* %n.addr, i32* null)
-  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r3)
+  call void (i8*, ...) @printf(i8* getelementptr ([21 x i8], [21 x i8]* @str_result, i32 0, i32 0), i32 %r3)
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/setcc.ll b/test/CodeGen/X86/setcc.ll
index 2454af9..6f1ddbd 100644
--- a/test/CodeGen/X86/setcc.ll
+++ b/test/CodeGen/X86/setcc.ll
@@ -29,7 +29,7 @@ define i64 @t3(i64 %x) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: t3:
 ; CHECK: sbbq %rax, %rax
-; CHECK: andq $64, %rax
+; CHECK: andl $64, %eax
   %0 = icmp ult i64 %x, 18                        ; <i1> [#uses=1]
   %iftmp.2.0 = select i1 %0, i64 64, i64 0        ; <i64> [#uses=1]
   ret i64 %iftmp.2.0
diff --git a/test/CodeGen/X86/shift-pair.ll b/test/CodeGen/X86/shift-pair.ll
index 24ba1fc..62e51f0 100644
--- a/test/CodeGen/X86/shift-pair.ll
+++ b/test/CodeGen/X86/shift-pair.ll
@@ -3,7 +3,7 @@
 define i64 @test(i64 %A) {
 ; CHECK: @test
 ; CHECK: shrq $54
-; CHECK: andq $1020
+; CHECK: andl $1020
 ; CHECK: ret
     %B = lshr i64 %A, 56
     %C = shl i64 %B, 2
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index d32e567..b94960a 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -349,7 +349,7 @@ entry:
 ; X32ABI-LABEL: t17:
 ; X32ABI: xorl %eax, %eax
 ; X32ABI: jmp {{_?}}bar5
-  tail call void (...)* @bar5() nounwind
+  tail call void (...) @bar5() nounwind
   ret void
 }
 
@@ -369,7 +369,7 @@ entry:
 ; X32ABI-LABEL: t18:
 ; X32ABI: xorl %eax, %eax
 ; X32ABI: jmp {{_?}}bar6
-  %0 = tail call double (...)* @bar6() nounwind
+  %0 = tail call double (...) @bar6() nounwind
   ret void
 }
 
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 55aa6aa..2b21f4f 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -11,11 +11,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 ; CHECK-LABEL: test1:
 ; CHECK: imull
@@ -30,11 +30,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 ; CHECK-LABEL: test2:
 ; CHECK: imull
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 24b175e..4c6b521 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -1,132 +1,141 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE
 
-; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from
-; #include <math.h>
-; 
-; double fd(double d){
-;   return sqrt(d);
-; }
-; 
-; float ff(float f){
-;   return sqrtf(f);
-; }
-; 
-; long double fld(long double ld){
-;   return sqrtl(ld);
-; }
-;
-; Tests conversion of sqrt function calls into sqrt instructions when
-; -ffast-math is in effect.
+declare double @__sqrt_finite(double) #0
+declare float @__sqrtf_finite(float) #0
+declare x86_fp80 @__sqrtl_finite(x86_fp80) #0
+declare float @llvm.sqrt.f32(float) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #0
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
 
-; ModuleID = 'sqrt.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
 
-; Function Attrs: nounwind readnone uwtable
 define double @fd(double %d) #0 {
-entry:
-; CHECK: sqrtsd
-  %call = tail call double @__sqrt_finite(double %d) #2
+; CHECK-LABEL: fd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtsd %xmm0, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: fd:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
+  %call = tail call double @__sqrt_finite(double %d) #1
   ret double %call
 }
 
-; Function Attrs: nounwind readnone
-declare double @__sqrt_finite(double) #1
 
-; Function Attrs: nounwind readnone uwtable
 define float @ff(float %f) #0 {
-entry:
-; CHECK: sqrtss
-  %call = tail call float @__sqrtf_finite(float %f) #2
+; CHECK-LABEL: ff:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: ff:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
+; ESTIMATE-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm2
+; ESTIMATE-NEXT:    vmulss %xmm1, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm0, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm2, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm1, %xmm0, %xmm1
+; ESTIMATE-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; ESTIMATE-NEXT:    vcmpeqss %xmm2, %xmm0, %xmm0
+; ESTIMATE-NEXT:    vandnps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
+  %call = tail call float @__sqrtf_finite(float %f) #1
   ret float %call
 }
 
-; Function Attrs: nounwind readnone
-declare float @__sqrtf_finite(float) #1
 
-; Function Attrs: nounwind readnone uwtable
 define x86_fp80 @fld(x86_fp80 %ld) #0 {
-entry:
-; CHECK: fsqrt
-  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
+; CHECK-LABEL: fld:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fsqrt
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: fld:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    fldt {{[0-9]+}}(%rsp)
+; ESTIMATE-NEXT:    fsqrt
+; ESTIMATE-NEXT:    retq
+  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #1
   ret x86_fp80 %call
 }
 
-declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
-
-declare float @llvm.sqrt.f32(float) #1
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
 
-; If the target's sqrtss and divss instructions are substantially
-; slower than rsqrtss with a Newton-Raphson refinement, we should
-; generate the estimate sequence.
 
 define float @reciprocal_square_root(float %x) #0 {
+; CHECK-LABEL: reciprocal_square_root:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: reciprocal_square_root:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
+; ESTIMATE-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm2
+; ESTIMATE-NEXT:    vmulss %xmm1, %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; ESTIMATE-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT:    vmulss %xmm2, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
   %sqrt = tail call float @llvm.sqrt.f32(float %x)
   %div = fdiv fast float 1.0, %sqrt
   ret float %div
-
-; CHECK-LABEL: reciprocal_square_root:
-; CHECK: sqrtss
-; CHECK-NEXT: movss
-; CHECK-NEXT: divss
-; CHECK-NEXT: retq
-; BTVER2-LABEL: reciprocal_square_root:
-; BTVER2: vrsqrtss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: vaddss
-; BTVER2-NEXT: vmulss
-; BTVER2-NEXT: retq
 }
 
 define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
+; CHECK-LABEL: reciprocal_square_root_v4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtps %xmm0, %xmm1
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    divps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtps %xmm0, %xmm1
+; ESTIMATE-NEXT:    vmulps %xmm1, %xmm1, %xmm2
+; ESTIMATE-NEXT:    vmulps %xmm0, %xmm2, %xmm0
+; ESTIMATE-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT:    vmulps {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT:    retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
   ret <4 x float> %div
-
-; CHECK-LABEL: reciprocal_square_root_v4f32:
-; CHECK: sqrtps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: retq
-; BTVER2-LABEL: reciprocal_square_root_v4f32:
-; BTVER2: vrsqrtps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vaddps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: retq
 }
 
 define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
+; CHECK-LABEL: reciprocal_square_root_v8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sqrtps %xmm1, %xmm2
+; CHECK-NEXT:    sqrtps %xmm0, %xmm3
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    divps %xmm3, %xmm0
+; CHECK-NEXT:    divps %xmm2, %xmm1
+; CHECK-NEXT:    retq
+;
+; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
+; ESTIMATE:       # BB#0:
+; ESTIMATE-NEXT:    vrsqrtps %ymm0, %ymm1
+; ESTIMATE-NEXT:    vmulps %ymm1, %ymm1, %ymm2
+; ESTIMATE-NEXT:    vmulps %ymm0, %ymm2, %ymm0
+; ESTIMATE-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
+; ESTIMATE-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
+; ESTIMATE-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; ESTIMATE-NEXT:    retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
   ret <8 x float> %div
-
-; CHECK-LABEL: reciprocal_square_root_v8f32:
-; CHECK: sqrtps
-; CHECK-NEXT: sqrtps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: movaps
-; CHECK-NEXT: divps
-; CHECK-NEXT: divps
-; CHECK-NEXT: retq
-; BTVER2-LABEL: reciprocal_square_root_v8f32:
-; BTVER2: vrsqrtps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: vaddps
-; BTVER2-NEXT: vmulps
-; BTVER2-NEXT: retq
 }
 
 
-attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
diff --git a/test/CodeGen/X86/sret-implicit.ll b/test/CodeGen/X86/sret-implicit.ll
index 3fade1d..5680952 100644
--- a/test/CodeGen/X86/sret-implicit.ll
+++ b/test/CodeGen/X86/sret-implicit.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin8 < %s | FileCheck %s
 ; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin8 -terminal-rule < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -terminal-rule < %s | FileCheck %s
 
 ; CHECK-LABEL: return32
 ; CHECK-DAG: movq	$0, (%rdi)
diff --git a/test/CodeGen/X86/sse-varargs.ll b/test/CodeGen/X86/sse-varargs.ll
index da38f0e..7c3c781 100644
--- a/test/CodeGen/X86/sse-varargs.ll
+++ b/test/CodeGen/X86/sse-varargs.ll
@@ -2,7 +2,7 @@
 
 define i32 @t() nounwind  {
 entry:
-	tail call void (i32, ...)* @foo( i32 1, <4 x i32> < i32 10, i32 11, i32 12, i32 13 > ) nounwind 
+	tail call void (i32, ...) @foo( i32 1, <4 x i32> < i32 10, i32 11, i32 12, i32 13 > ) nounwind 
 	ret i32 0
 }
 
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index cab62a3..5afebd2 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -581,7 +581,7 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
 define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
   ; CHECK: test_x86_sse2_storel_dq
   ; CHECK: movl
-  ; CHECK: movq
+  ; CHECK: movlps
   call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
   ret void
 }
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index ca13392..3bde991 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1026,29 +1026,24 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
 }
 
 ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
-define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
+define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
 ; X32-LABEL: insertps_pr20411:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X32-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; X32-NEXT:    movdqu %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_pr20411:
-; X64:       ## BB#0:
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = mem[3,1,2,3]
-; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; X64-NEXT:    movdqu %xmm1, (%rdi)
+; X64-LABEL: insertps_pr20411:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; X64-NEXT:    movdqu %xmm1, (%rdi)
 ; X64-NEXT:    retq
-  %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; 4 5 6 7
-  %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x
-  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x
-  %ptrcast = bitcast i32* %RET to <4 x i32>*
-  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
+  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
+  %ptrcast = bitcast i32* %RET to <4 x i32>*
+  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
   ret void
 }
 
diff --git a/test/CodeGen/X86/stack-folding-3dnow.ll b/test/CodeGen/X86/stack-folding-3dnow.ll
new file mode 100644
index 0000000..955bf44
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-3dnow.ll
@@ -0,0 +1,217 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+3dnow | FileCheck %s
+
+define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pavgusb
+  ;CHECK:       pavgusb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pf2id(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pf2id
+  ;CHECK:       pf2id {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pf2iw(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pf2iw
+  ;CHECK:       pf2iw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfacc(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfacc
+  ;CHECK:       pfacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfadd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfadd
+  ;CHECK:       pfadd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfcmpeq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfcmpeq
+  ;CHECK:       pfcmpeq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfcmpge(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfcmpge
+  ;CHECK:       pfcmpge {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfcmpgt(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfcmpgt
+  ;CHECK:       pfcmpgt {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfmax(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfmax
+  ;CHECK:       pfmax {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfmin(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfmin
+  ;CHECK:       pfmin {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfmul(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfmul
+  ;CHECK:       pfmul {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfnacc(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfnacc
+  ;CHECK:       pfnacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfpnacc(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfpnacc
+  ;CHECK:       pfpnacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrcp(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pfrcp
+  ;CHECK:       pfrcp {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrcpit1(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfrcpit1
+  ;CHECK:       pfrcpit1 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrcpit2(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfrcpit2
+  ;CHECK:       pfrcpit2 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrsqit1(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfrsqit1
+  ;CHECK:       pfrsqit1 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfrsqrt(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pfrsqrt
+  ;CHECK:       pfrsqrt {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfsub(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfsub
+  ;CHECK:       pfsub {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pfsubr(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pfsubr
+  ;CHECK:       pfsubr {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pi2fd(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pi2fd
+  ;CHECK:       pi2fd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pi2fw(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pi2fw
+  ;CHECK:       pi2fw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmulhrw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pmulhrw
+  ;CHECK:       pmulhrw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pswapd(x86_mmx %a) {
+  ;CHECK-LABEL: stack_fold_pswapd
+  ;CHECK:       pswapd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %a) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/stack-folding-int-avx1.ll b/test/CodeGen/X86/stack-folding-int-avx1.ll
index a9a21c2..fec297d 100644
--- a/test/CodeGen/X86/stack-folding-int-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx1.ll
@@ -87,15 +87,19 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
-  ret <2 x i64> %2
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
 }
 
 define i64 @stack_fold_movq_store(<2 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_movq_store
   ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
-  %1 = extractelement <2 x i64> %a0, i32 0
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i64 %1
+  ; add forces execution domain
+  %1 = add <2 x i64> %a0, <i64 1, i64 1>
+  %2 = extractelement <2 x i64> %1, i32 0
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %2
 }
 
 define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
index 6aa2601..e814ae6 100644
--- a/test/CodeGen/X86/stack-folding-int-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -62,6 +62,33 @@ define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
 }
 declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
 
+;TODO stack_fold_crc32_32_8
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
+
+;TODO stack_fold_crc32_32_16
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
+
+define i32 @stack_fold_crc32_32_32(i32 %a0, i32 %a1) { ; folding test: the pattern below expects crc32l to read a 4-byte reload directly from the stack into eax
+  ;CHECK-LABEL: stack_fold_crc32_32_32
+  ;CHECK:       crc32l {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() ; clobbers every general-purpose register except rax and rsp, so both arguments cannot stay in registers across this point
+  %2 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+
+;TODO stack_fold_crc32_64_8
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
+
+define i64 @stack_fold_crc32_64_64(i64 %a0, i64 %a1) { ; folding test: the pattern below expects crc32q to read an 8-byte reload directly from the stack into rax
+  ;CHECK-LABEL: stack_fold_crc32_64_64
+  ;CHECK:       crc32q {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() ; clobbers every general-purpose register except rax and rsp, so both arguments cannot stay in registers across this point
+  %2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
+  ret i64 %2
+}
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
+
 define <4 x i32> @stack_fold_movd_load(i32 %a0) {
   ;CHECK-LABEL: stack_fold_movd_load
   ;CHECK:       movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
@@ -87,15 +114,19 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
-  ret <2 x i64> %2
+  ; add forces execution domain
+  %3 = add <2 x i64> %2, <i64 1, i64 1>
+  ret <2 x i64> %3
 }
 
 define i64 @stack_fold_movq_store(<2 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_movq_store
   ;CHECK:       movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
-  %1 = extractelement <2 x i64> %a0, i32 0
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
-  ret i64 %1
+  ; add forces execution domain
+  %1 = add <2 x i64> %a0, <i64 1, i64 1>
+  %2 = extractelement <2 x i64> %1, i32 0
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %2
 }
 
 define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
diff --git a/test/CodeGen/X86/stack-folding-mmx.ll b/test/CodeGen/X86/stack-folding-mmx.ll
new file mode 100644
index 0000000..8a5d4e2
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-mmx.ll
@@ -0,0 +1,566 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s
+
+define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) { ; folding test: the pattern below expects cvtpd2pi to take its source from a 16-byte stack reload
+  ;CHECK-LABEL: stack_fold_cvtpd2pi
+  ;CHECK:       cvtpd2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() ; clobbers xmm1-xmm15 and flags, each listed once; xmm0 is the only XMM register left for the "=x" result
+  %2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+
+define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) { ; folding test: the pattern below expects cvtpi2pd to take its source from an 8-byte stack reload
+  ;CHECK-LABEL: stack_fold_cvtpi2pd
+  ;CHECK:       cvtpi2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; clobbers mm1-mm7, each listed once; mm0 is the only MMX register left for the "=y" result
+  %2 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %a0) nounwind readnone
+  ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+
+define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) { ; folding test: the pattern below expects cvtpi2ps to take its MMX source from an 8-byte stack reload
+  ;CHECK-LABEL: stack_fold_cvtpi2ps
+  ;CHECK:       cvtpi2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; clobbers mm1-mm7, each listed once; mm0 is the only MMX register left for the "=y" result
+  %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %a1) nounwind readnone
+  ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) { ; folding test: the pattern below expects cvtps2pi to take its source from a 16-byte stack reload
+  ;CHECK-LABEL: stack_fold_cvtps2pi
+  ;CHECK:       cvtps2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() ; clobbers xmm1-xmm15 and flags, each listed once; xmm0 is the only XMM register left for the "=x" result
+  %2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+
+define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) { ; folding test: the pattern below expects cvttpd2pi to take its source from a 16-byte stack reload
+  ;CHECK-LABEL: stack_fold_cvttpd2pi
+  ;CHECK:       cvttpd2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() ; clobbers xmm1-xmm15 and flags, each listed once; xmm0 is the only XMM register left for the "=x" result
+  %2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+
+define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) { ; folding test: the pattern below expects cvttps2pi to take its source from a 16-byte stack reload
+  ;CHECK-LABEL: stack_fold_cvttps2pi
+  ;CHECK:       cvttps2pi {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() ; clobbers xmm1-xmm15 and flags, each listed once; xmm0 is the only XMM register left for the "=x" result
+  %2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
+
+; TODO stack_fold_movd_load
+; TODO stack_fold_movd_store
+; TODO stack_fold_movq_load
+; TODO stack_fold_movq_store
+
+define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       packssdw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       packsswb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       packuswb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddb
+  ;CHECK:       paddb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddd
+  ;CHECK:       paddd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddq
+  ;CHECK:       paddq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddsb
+  ;CHECK:       paddsb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddsw
+  ;CHECK:       paddsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddusb
+  ;CHECK:       paddusb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddusw
+  ;CHECK:       paddusw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_paddw
+  ;CHECK:       paddw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pand
+  ;CHECK:       pand {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pandn
+  ;CHECK:       pandn {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pavgb
+  ;CHECK:       pavgb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pavgw
+  ;CHECK:       pavgw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpeqb
+  ;CHECK:       pcmpeqb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpeqd
+  ;CHECK:       pcmpeqd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpeqw
+  ;CHECK:       pcmpeqw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpgtb
+  ;CHECK:       pcmpgtb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpgtd
+  ;CHECK:       pcmpgtd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) {
+  ;CHECK-LABEL: stack_fold_pcmpgtw
+  ;CHECK:       pcmpgtw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
+  %2 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+
+; TODO stack_fold_pinsrw
+
+define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) { ; test: pmaddwd must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pmaddwd
+  ;CHECK:       pmaddwd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) { ; test: pmaxsw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pmaxsw
+  ;CHECK:       pmaxsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) { ; test: pmaxub must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pmaxub
+  ;CHECK:       pmaxub {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) { ; test: pminsw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pminsw
+  ;CHECK:       pminsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) { ; test: pminub must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pminub
+  ;CHECK:       pminub {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) { ; test: pmulhuw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pmulhuw
+  ;CHECK:       pmulhuw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) { ; test: pmulhw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pmulhw
+  ;CHECK:       pmulhw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) { ; test: pmullw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pmullw
+  ;CHECK:       pmullw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) { ; test: pmuludq must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pmuludq
+  ;CHECK:       pmuludq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) { ; test: por must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_por
+  ;CHECK:       por {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) { ; test: psadbw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psadbw
+  ;CHECK:       psadbw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pshufw(x86_mmx %a, x86_mmx %b) { ; test: pshufw $1 must take its source as a folded 8-byte reload from the stack; %b is unused
+  ;CHECK-LABEL: stack_fold_pshufw
+  ;CHECK:       pshufw $1, {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm1-mm7 (one more than the two-operand tests); presumably forces %a to be spilled
+  %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone ; operation under test, shuffle immediate 1
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+
+define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) { ; test: pslld must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pslld
+  ;CHECK:       pslld {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) { ; test: psllq must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psllq
+  ;CHECK:       psllq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) { ; test: psllw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psllw
+  ;CHECK:       psllw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) { ; test: psrad must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psrad
+  ;CHECK:       psrad {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) { ; test: psraw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psraw
+  ;CHECK:       psraw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) { ; test: psrld must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psrld
+  ;CHECK:       psrld {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) { ; test: psrlq must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psrlq
+  ;CHECK:       psrlq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) { ; test: psrlw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psrlw
+  ;CHECK:       psrlw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) { ; test: psubb must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubb
+  ;CHECK:       psubb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) { ; test: psubd must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubd
+  ;CHECK:       psubd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) { ; test: psubq must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK:       psubq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) { ; test: psubsb must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubsb
+  ;CHECK:       psubsb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) { ; test: psubsw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubsw
+  ;CHECK:       psubsw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) { ; test: psubusb must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK:       psubusb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) { ; test: psubusw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK:       psubusw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) { ; test: psubw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_psubw
+  ;CHECK:       psubw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) { ; test: punpckhbw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_punpckhbw
+  ;CHECK:       punpckhbw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) { ; test: punpckhdq must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_punpckhdq
+  ;CHECK:       punpckhdq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) { ; test: punpckhwd must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_punpckhwd
+  ;CHECK:       punpckhwd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) { ; test: punpcklbw must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_punpcklbw
+  ;CHECK:       punpcklbw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) { ; test: punpckldq must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_punpckldq
+  ;CHECK:       punpckldq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) { ; test: punpcklwd must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_punpcklwd
+  ;CHECK:       punpcklwd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+
+define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) { ; test: pxor must take its memory operand as a folded 8-byte reload from the stack
+  ;CHECK-LABEL: stack_fold_pxor
+  ;CHECK:       pxor {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"() ; nop asm clobbering mm2-mm7; presumably forces an MMX operand to be spilled
+  %2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone ; operation under test
+  ret x86_mmx %2
+}
+declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index 0a4a4f2..6275c8d 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -48,7 +48,7 @@ attributes #0 = { sspreq }
 !20 = !{}
 !21 = !{i32 2, !"Dwarf Version", i32 2}
 !22 = !{i64* getelementptr inbounds ({ i64, [56 x i8] }, { i64, [56 x i8] }* @a, i32 0, i32 0)}
-!23 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p2", line: 12, arg: 2, scope: !24, file: !10, type: !32, inlinedAt: !38)
+!23 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p2", line: 12, arg: 2, scope: !24, file: !10, type: !32)
 !24 = !MDSubprogram(name: "min<unsigned long long>", linkageName: "_ZN3__13minIyEERKT_S3_RS1_", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 12, file: !1, scope: !25, type: !27, templateParams: !33, variables: !35)
 !25 = !MDNamespace(name: "__1", line: 1, file: !26, scope: null)
 !26 = !MDFile(filename: "main.cpp", directory: "/Users/matt/ryan_bug")
@@ -71,13 +71,12 @@ attributes #0 = { sspreq }
 !43 = !{!29, !29, !32, !44}
 !44 = !MDCompositeType(tag: DW_TAG_structure_type, name: "A", size: 8, align: 8, file: !1, scope: !25, elements: !45)
 !45 = !{!46}
-!46 = !MDSubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !44, type: !47, variables: !52)
+!46 = !MDSubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !44, type: !47)
 !47 = !MDSubroutineType(types: !48)
 !48 = !{!13, !49, !50, !50}
 !49 = !MDDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !44)
 !50 = !MDDerivedType(tag: DW_TAG_reference_type, baseType: !51)
 !51 = !MDDerivedType(tag: DW_TAG_const_type, baseType: !13)
-!52 = !{i32 786468}
 !53 = !{!34, !54}
 !54 = !MDTemplateTypeParameter(name: "_Compare", type: !44)
 !55 = !{!56, !57, !58}
@@ -86,7 +85,7 @@ attributes #0 = { sspreq }
 !58 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p3", line: 8, arg: 3, scope: !41, file: !10, type: !44)
 !59 = !MDLocation(line: 13, scope: !24, inlinedAt: !38)
 !63 = !{i32 undef}
-!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p1", line: 1, arg: 2, scope: !65, file: !10, type: !50, inlinedAt: !40)
+!64 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "p1", line: 1, arg: 2, scope: !65, file: !10, type: !50)
 !65 = !MDSubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !25, type: !47, declaration: !46, variables: !66)
 !66 = !{!67, !69, !70}
 !67 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !65, type: !68)
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index a88acf0..acaba6d 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -47,7 +47,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -83,7 +83,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -115,7 +115,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -147,7 +147,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [16 x i8], [16 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -180,7 +180,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -214,7 +214,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -248,7 +248,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -282,7 +282,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo, %struct.foo* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [16 x i8], [16 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -313,7 +313,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -345,7 +345,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -377,7 +377,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -409,7 +409,7 @@ entry:
   %0 = load i8*, i8** %a.addr, align 8
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %arraydecay1 = getelementptr inbounds [4 x i8], [4 x i8]* %buf, i32 0, i32 0
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1)
   ret void
 }
 
@@ -442,7 +442,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -476,7 +476,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -510,7 +510,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -544,7 +544,7 @@ entry:
   %call = call i8* @strcpy(i8* %arraydecay, i8* %0)
   %buf1 = getelementptr inbounds %struct.foo.0, %struct.foo.0* %b, i32 0, i32 0
   %arraydecay2 = getelementptr inbounds [4 x i8], [4 x i8]* %buf1, i32 0, i32 0
-  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
+  %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2)
   ret void
 }
 
@@ -571,7 +571,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -599,7 +599,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -627,7 +627,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -655,7 +655,7 @@ entry:
   %a.addr = alloca i8*, align 8
   store i8* %a, i8** %a.addr, align 8
   %0 = load i8*, i8** %a.addr, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %0)
   ret void
 }
 
@@ -808,7 +808,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -835,7 +835,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -862,7 +862,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -889,7 +889,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %0 = ptrtoint i32* %a to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1021,7 +1021,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1051,7 +1051,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1081,7 +1081,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1111,7 +1111,7 @@ entry:
   store double %call, double* %x, align 8
   %cmp2 = fcmp ogt double %call, 0.000000e+00
   %y.1 = select i1 %cmp2, double* %x, double* null
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
+  %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), double* %y.1)
   ret void
 }
 
@@ -1155,7 +1155,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1200,7 +1200,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1245,7 +1245,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1290,7 +1290,7 @@ if.then3:                                         ; preds = %if.else
 
 if.end4:                                          ; preds = %if.else, %if.then3, %if.then
   %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
+  %call5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double* %y.0)
   ret void
 }
 
@@ -1319,7 +1319,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1349,7 +1349,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1379,7 +1379,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1409,7 +1409,7 @@ entry:
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   store i32* %y, i32** %b, align 8
   %0 = load i32*, i32** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32* %0)
   ret void
 }
 
@@ -1437,7 +1437,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1466,7 +1466,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1494,7 +1494,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1523,7 +1523,7 @@ entry:
   %b = alloca i32*, align 8
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
   %0 = ptrtoint i32* %y to i64
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
   ret void
 }
 
@@ -1549,7 +1549,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1576,7 +1576,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1603,7 +1603,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1630,7 +1630,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %c = alloca %struct.pair, align 4
   %y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
   ret void
 }
 
@@ -1656,7 +1656,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1683,7 +1683,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1710,7 +1710,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1737,7 +1737,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %a = alloca i32, align 4
   %add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
   ret void
 }
 
@@ -1768,7 +1768,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -1800,7 +1800,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -1832,7 +1832,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -1864,7 +1864,7 @@ entry:
   %0 = bitcast i32* %a to float*
   store float* %0, float** %b, align 8
   %1 = load float*, float** %b, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), float* %1)
   ret void
 }
 
@@ -2006,7 +2006,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -2034,7 +2034,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -2062,7 +2062,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -2090,7 +2090,7 @@ entry:
   %c = alloca %struct.vec, align 16
   %y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
   %add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
   ret void
 }
 
@@ -3152,7 +3152,7 @@ entry:
   %b = getelementptr inbounds %struct.nest, %struct.nest* %c, i32 0, i32 1
   %_a = getelementptr inbounds %struct.pair, %struct.pair* %b, i32 0, i32 0
   %0 = load i32, i32* %_a, align 4
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0)
   ret void
 }
 
@@ -3181,7 +3181,7 @@ bb:
 ; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
 ; DARWIN-X64: callq ___stack_chk_fail
   %tmp = alloca %struct.small*, align 8
-  %tmp1 = call i32 (...)* @dummy(%struct.small** %tmp)
+  %tmp1 = call i32 (...) @dummy(%struct.small** %tmp)
   %tmp2 = load %struct.small*, %struct.small** %tmp, align 8
   %tmp3 = ptrtoint %struct.small* %tmp2 to i64
   %tmp4 = trunc i64 %tmp3 to i32
@@ -3209,7 +3209,7 @@ bb17:                                             ; preds = %bb6
 
 bb21:                                             ; preds = %bb6, %bb
   %tmp22 = phi i32 [ %tmp1, %bb ], [ %tmp14, %bb6 ]
-  %tmp23 = call i32 (...)* @dummy(i32 %tmp22)
+  %tmp23 = call i32 (...) @dummy(i32 %tmp22)
   ret i32 undef
 }
 
@@ -3235,7 +3235,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %test = alloca [32 x i8], align 16
   %arraydecay = getelementptr inbounds [32 x i8], [32 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3261,7 +3261,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %test = alloca [33 x i8], align 16
   %arraydecay = getelementptr inbounds [33 x i8], [33 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3287,7 +3287,7 @@ entry:
 ; DARWIN-X64: .cfi_endproc
   %test = alloca [4 x i8], align 1
   %arraydecay = getelementptr inbounds [4 x i8], [4 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3313,7 +3313,7 @@ entry:
 ; DARWIN-X64: callq ___stack_chk_fail
   %test = alloca [5 x i8], align 1
   %arraydecay = getelementptr inbounds [5 x i8], [5 x i8]* %test, i32 0, i32 0
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
   ret i32 %call
 }
 
@@ -3347,7 +3347,7 @@ entry:
   %3 = load i64, i64* %2, align 1
   %4 = getelementptr { i64, i8 }, { i64, i8 }* %test.coerce, i32 0, i32 1
   %5 = load i8, i8* %4, align 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
   ret i32 %call
 }
 
@@ -3381,7 +3381,7 @@ entry:
   %3 = load i64, i64* %2, align 1
   %4 = getelementptr { i64, i8 }, { i64, i8 }* %test.coerce, i32 0, i32 1
   %5 = load i8, i8* %4, align 1
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
   ret i32 %call
 }
 
@@ -3410,7 +3410,7 @@ entry:
   %0 = alloca i8, i64 4
   store i8* %0, i8** %test, align 8
   %1 = load i8*, i8** %test, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
   ret i32 %call
 }
 
@@ -3438,7 +3438,7 @@ entry:
   %0 = alloca i8, i64 5
   store i8* %0, i8** %test, align 8
   %1 = load i8*, i8** %test, align 8
-  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %1)
   ret i32 %call
 }
 
diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll
index d2155bd..1392e5b 100644
--- a/test/CodeGen/X86/stackmap-fast-isel.ll
+++ b/test/CodeGen/X86/stackmap-fast-isel.ll
@@ -99,7 +99,7 @@
 
 define void @constantargs() {
 entry:
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 15, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 1, i32 15, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
   ret void
 }
 
@@ -116,7 +116,7 @@ entry:
 ; CHECK-NEXT:   .long   33
 
 define void @liveConstant() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
   ret void
 }
 
@@ -139,7 +139,7 @@ entry:
   store i64 11, i64* %metadata1
   store i64 12, i64* %metadata1
   store i64 13, i64* %metadata1
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
   ret void
 }
 
@@ -155,10 +155,10 @@ entry:
 ; CHECK-LABEL:  .long L{{.*}}-_longid
 define void @longid() {
 entry:
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4294967295, i32 0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4294967296, i32 0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 9223372036854775807, i32 0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 -1, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4294967295, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4294967296, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 9223372036854775807, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 -1, i32 0)
   ret void
 }
 
diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll
index 73ee4f3..a38b920 100644
--- a/test/CodeGen/X86/stackmap-large-constants.ll
+++ b/test/CodeGen/X86/stackmap-large-constants.ll
@@ -51,7 +51,7 @@
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 
 define void @foo() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
   ret void
 }
 
@@ -78,6 +78,6 @@ define void @foo() {
 
 
 define void @bar() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
   ret void
 }
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 31553c0..599b626 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -50,7 +50,7 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 12, i8* null, i32 0)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 12, i8* null, i32 0)
   %a2 = call i64 asm sideeffect "", "={r8}"() nounwind
   %a3 = call i8 asm sideeffect "", "={ah}"() nounwind
   %a4 = call <4 x double> asm sideeffect "", "={ymm0}"() nounwind
@@ -97,7 +97,7 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 2, i32 12, i8* null, i32 0)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 2, i32 12, i8* null, i32 0)
   call void asm sideeffect "", "{r8},{ah},{ymm0},{ymm1}"(i64 %a2, i8 %a3, <4 x double> %a4, <4 x double> %a5) nounwind
 
 ; StackMap 3 (no liveness information available)
@@ -129,7 +129,7 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 12, i8* null, i32 0)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 12, i8* null, i32 0)
   call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
   ret void
 }
@@ -166,8 +166,8 @@ entry:
 ; PATCH-NEXT:   .byte 16
 ; Align
 ; PATCH-NEXT:   .align  3
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 5)
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 5)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
   call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
   ret void
 }
diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
index 7932c0d..08fee2e 100644
--- a/test/CodeGen/X86/stackmap-nops.ll
+++ b/test/CodeGen/X86/stackmap-nops.ll
@@ -193,41 +193,41 @@ entry:
 ; CHECK-NEXT: .byte 102
 ; CHECK-NEXT: .byte 102
 ; CHECK-NEXT: nopw %cs:512(%rax,%rax)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  0, i32  0)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  1, i32  1)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  2, i32  2)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  3, i32  3)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  4, i32  4)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  5, i32  5)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  6, i32  6)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  7, i32  7)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  8, i32  8)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64  9, i32  9)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 10, i32 10)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 11, i32 11)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 12)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 13)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 14)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 15)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 16)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 17)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 18, i32 18)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 19, i32 19)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 20, i32 20)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 21, i32 21)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 22, i32 22)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 23, i32 23)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 24, i32 24)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 25, i32 25)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 26, i32 26)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 27, i32 27)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  0, i32  0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  1, i32  1)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  2, i32  2)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  3, i32  3)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  4, i32  4)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  5, i32  5)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  6, i32  6)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  7, i32  7)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  8, i32  8)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64  9, i32  9)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 10, i32 10)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 11, i32 11)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 12)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 13, i32 13)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 14)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 15)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 16)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 17)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 18, i32 18)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 19, i32 19)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 20, i32 20)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 21, i32 21)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 22, i32 22)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 23, i32 23)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 24, i32 24)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 25, i32 25)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 26, i32 26)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 27, i32 27)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 28, i32 28)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 29, i32 29)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 30, i32 30)
 ; Add an extra stackmap with a zero-length shadow to thwart the shadow
 ; optimization. This will force all 15 bytes of the previous shadow to be
 ; padded with nops.
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 31, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 31, i32 0)
   ret void
 }
 
diff --git a/test/CodeGen/X86/stackmap-shadow-optimization.ll b/test/CodeGen/X86/stackmap-shadow-optimization.ll
index a3725f2..001d8d9 100644
--- a/test/CodeGen/X86/stackmap-shadow-optimization.ll
+++ b/test/CodeGen/X86/stackmap-shadow-optimization.ll
@@ -18,7 +18,7 @@ entry:
 ; CHECK:        callq   _bar
 ; CHECK-NOT:    nop
   call void @bar()
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 8)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 8)
   call void @bar()
   call void @bar()
   ret void
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
index fc958ec..0805e81 100644
--- a/test/CodeGen/X86/stackmap.ll
+++ b/test/CodeGen/X86/stackmap.ll
@@ -125,7 +125,7 @@
 define void @constantargs() {
 entry:
   %0 = inttoptr i64 12345 to i8*
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 15, i8* %0, i32 0, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 15, i8* %0, i32 0, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
   ret void
 }
 
@@ -147,7 +147,7 @@ entry:
   ; Runtime void->void call.
   call void inttoptr (i64 -559038737 to void ()*)()
   ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
   ret void
 }
 
@@ -173,7 +173,7 @@ entry:
 cold:
   ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
   %thunk = inttoptr i64 -559038737 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
   unreachable
 ret:
   ret void
@@ -194,7 +194,7 @@ ret:
 define i64 @propertyRead(i64* %obj) {
 entry:
   %resolveRead = inttoptr i64 -559038737 to i8*
-  %result = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
+  %result = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -214,7 +214,7 @@ entry:
 define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
 entry:
   %resolveWrite = inttoptr i64 -559038737 to i8*
-  call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+  call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
   ret void
 }
 
@@ -236,7 +236,7 @@ entry:
 define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 -559038737 to i8*
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   ret void
 }
 
@@ -258,7 +258,7 @@ entry:
 define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
 entry:
   %resolveCall = inttoptr i64 -559038737 to i8*
-  %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
   %add = add i64 %result, 3
   ret i64 %add
 }
@@ -278,7 +278,7 @@ entry:
 ; CHECK-NEXT:   .short 6
 define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
 entry:
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
   ret void
 }
 
@@ -297,7 +297,7 @@ entry:
 ; CHECK-NEXT:   .short 6
 define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
 entry:
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
   ret void
 }
 
@@ -333,7 +333,7 @@ bb17:
 
 bb60:
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 5, i32 %tmp32)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 13, i32 5, i32 %tmp32)
   unreachable
 
 bb61:
@@ -367,7 +367,7 @@ define void @subRegOffset(i16 %arg) {
   %arghi = lshr i16 %v, 8
   %a1 = trunc i16 %arghi to i8
   tail call void asm sideeffect "nop", "~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i8 %a0, i8 %a1)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 5, i8 %a0, i8 %a1)
   ret void
 }
 
@@ -384,7 +384,7 @@ define void @subRegOffset(i16 %arg) {
 ; CHECK-NEXT:   .long   33
 
 define void @liveConstant() {
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
   ret void
 }
 
@@ -422,10 +422,10 @@ entry:
   store i64 11, i64* %metadata1
   store i64 12, i64* %metadata1
   store i64 13, i64* %metadata1
-  call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
   %metadata2 = alloca i8, i32 4, align 8
   %metadata3 = alloca i16, i32 4, align 8
-  call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 17, i32 5, i8* null, i32 0, i8* %metadata2, i16* %metadata3)
+  call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 17, i32 5, i8* null, i32 0, i8* %metadata2, i16* %metadata3)
   ret void
 }
 
@@ -441,10 +441,10 @@ entry:
 ; CHECK-LABEL:  .long L{{.*}}-_longid
 define void @longid() {
 entry:
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967295, i32 0, i8* null, i32 0)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967296, i32 0, i8* null, i32 0)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 9223372036854775807, i32 0, i8* null, i32 0)
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 -1, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4294967295, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4294967296, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 9223372036854775807, i32 0, i8* null, i32 0)
+  tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 -1, i32 0, i8* null, i32 0)
   ret void
 }
 
@@ -462,7 +462,7 @@ entry:
 ; CHECK-NEXT:   .long   -{{[0-9]+}}
 define void @clobberScratch(i32 %a) {
   tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r12},~{r13},~{r14},~{r15}"() nounwind
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
   ret void
 }
 
@@ -474,11 +474,11 @@ define void @clobberScratch(i32 %a) {
 ; CHECK-NEXT:   .short 0
 define void @needsStackRealignment() {
   %val = alloca i64, i32 3, align 128
-  tail call void (...)* @escape_values(i64* %val)
+  tail call void (...) @escape_values(i64* %val)
 ; Note: Adding any non-constant to the stackmap would fail because we
 ; expected to be able to address off the frame pointer.  In a realigned
 ; frame, we must use the stack pointer instead.  This is a separate bug.
-  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0)
+  tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 0)
   ret void
 }
 declare void @escape_values(...)
diff --git a/test/CodeGen/X86/statepoint-allocas.ll b/test/CodeGen/X86/statepoint-allocas.ll
new file mode 100644
index 0000000..eb34a42
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-allocas.ll
@@ -0,0 +1,121 @@
+; RUN: llc < %s | FileCheck %s
+; Check that we can lower a use of an alloca both as a deopt value (where the
+; exact meaning is up to the consumer of the stackmap) and as an explicit spill
+; slot used for GC.  
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare zeroext i1 @return_i1()
+
+; Explicit relocation slot: %ptr is spilled to an alloca that is handed to the
+; statepoint, and the result is reloaded from that same slot after the call.
+define i32 addrspace(1)* @test(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test
+; CHECK: pushq  %rax
+; CHECK: movq   %rdi, (%rsp)
+; CHECK: callq return_i1
+; CHECK: movq   (%rsp), %rax
+; CHECK: popq   %rdx
+; CHECK: retq
+entry:
+  %alloca = alloca i32 addrspace(1)*, align 8  ; stack slot that holds %ptr across the call
+  store i32 addrspace(1)* %ptr, i32 addrspace(1)** %alloca
+  call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)** %alloca)  ; NOTE(review): the last i32 (0 here, 1 in @test2) looks like the deopt-argument count - confirm
+  %rel = load i32 addrspace(1)*, i32 addrspace(1)** %alloca  ; read the pointer back from the slot after the statepoint
+  ret i32 addrspace(1)* %rel
+}
+
+; Can we handle an alloca as a deopt value?  The slot is written but never reloaded.
+define i32 addrspace(1)* @test2(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test2
+; CHECK: pushq  %rax
+; CHECK: movq   %rdi, (%rsp)
+; CHECK: callq return_i1
+; CHECK: xorl   %eax, %eax
+; CHECK: popq   %rdx
+; CHECK: retq
+entry:
+  %alloca = alloca i32 addrspace(1)*, align 8  ; stack slot that holds %ptr across the call
+  store i32 addrspace(1)* %ptr, i32 addrspace(1)** %alloca
+  call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 1, i32 addrspace(1)** %alloca)  ; NOTE(review): the i32 1 presumably marks %alloca as the single deopt argument - confirm
+  ret i32 addrspace(1)* null  ; the slot is not read back; a constant null is returned
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+
+
+; CHECK-LABEL: .section .llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 2
+; Num LargeConstants
+; CHECK-NEXT:   .long 0
+; Num Callsites
+; CHECK-NEXT:   .long 2
+
+; Functions and stack size
+; CHECK-NEXT:   .quad test
+; CHECK-NEXT:   .quad 8
+; CHECK-NEXT:   .quad test2
+; CHECK-NEXT:   .quad 8
+
+; Large Constants
+; Statepoint ID only
+; CHECK: .quad	2882400000
+
+; Callsites
+; The GC one
+; CHECK: .long	.Ltmp1-test
+; CHECK: .short	0
+; CHECK: .short	3
+; SmallConstant (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; SmallConstant (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; Direct Spill Slot [RSP+0]
+; CHECK: .byte	2
+; CHECK: .byte	8
+; CHECK: .short	7
+; CHECK: .long	0
+; No Padding or LiveOuts
+; CHECK: .short	0
+; CHECK: .short	0
+; CHECK: .align	8
+
+; The Deopt one
+; CHECK: .long	.Ltmp3-test2
+; CHECK: .short	0
+; CHECK: .short	3
+; SmallConstant (0)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	0
+; SmallConstant (1)
+; CHECK: .byte	4
+; CHECK: .byte	8
+; CHECK: .short	0
+; CHECK: .long	1
+; Direct Spill Slot [RSP+0]
+; CHECK: .byte	2
+; CHECK: .byte	8
+; CHECK: .short	7
+; CHECK: .long	0
+
+; No Padding or LiveOuts
+; CHECK: .short	0
+; CHECK: .short	0
+; CHECK: .align	8
+
+
diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll
index 22049cf..9f6e4f9 100644
--- a/test/CodeGen/X86/statepoint-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -20,7 +20,7 @@ define i1 @test_i1_return() gc "statepoint-example" {
 ; CHECK: popq %rdx
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0)
   %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
   ret i1 %call1
 }
@@ -32,7 +32,7 @@ define i32 @test_i32_return() gc "statepoint-example" {
 ; CHECK: popq %rdx
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (i32 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()* @return_i32, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (i32 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()* @return_i32, i32 0, i32 0, i32 0)
   %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(i32 %safepoint_token)
   ret i32 %call1
 }
@@ -44,7 +44,7 @@ define i32* @test_i32ptr_return() gc "statepoint-example" {
 ; CHECK: popq %rdx
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (i32* ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()* @return_i32ptr, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (i32* ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()* @return_i32ptr, i32 0, i32 0, i32 0)
   %call1 = call i32* @llvm.experimental.gc.result.p0i32(i32 %safepoint_token)
   ret i32* %call1
 }
@@ -56,7 +56,7 @@ define float @test_float_return() gc "statepoint-example" {
 ; CHECK: popq %rax
 ; CHECK: retq
 entry:
-  %safepoint_token = tail call i32 (float ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_f32f(float ()* @return_float, i32 0, i32 0, i32 0)
+  %safepoint_token = tail call i32 (float ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_f32f(float ()* @return_float, i32 0, i32 0, i32 0)
   %call1 = call float @llvm.experimental.gc.result.f32(i32 %safepoint_token)
   ret float %call1
 }
@@ -70,7 +70,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
 ; CHECK-NEXT: popq %rdx
 ; CHECK-NEXT: retq
 entry:
-  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
   %call1 = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 4, i32 4)
   %call2 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
   ret i1 %call2
@@ -81,7 +81,7 @@ define void @test_void_vararg() gc "statepoint-example" {
 ; Check a statepoint wrapping a *void* returning vararg function works
 ; CHECK: callq varargf
 entry:
-  %safepoint_token = tail call i32 (void (i32, ...)*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0)
+  %safepoint_token = tail call i32 (void (i32, ...)*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidi32varargf(void (i32, ...)* @varargf, i32 2, i32 0, i32 42, i32 43, i32 0)
   ;; if we try to use the result from a statepoint wrapping a
   ;; non-void-returning varargf, we will experience a crash.
   ret void
diff --git a/test/CodeGen/X86/statepoint-forward.ll b/test/CodeGen/X86/statepoint-forward.ll
index 5a1b18a..0b296cf 100644
--- a/test/CodeGen/X86/statepoint-forward.ll
+++ b/test/CodeGen/X86/statepoint-forward.ll
@@ -25,7 +25,7 @@ entry:
   %before = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %p
   %cmp1 = call i1 @f(i32 addrspace(1)* %before)
   call void @llvm.assume(i1 %cmp1)
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
   %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
   %after = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %pnew
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
@@ -44,7 +44,7 @@ entry:
   %cmp1 = call i1 @f(i32 addrspace(1)* %v)
   call void @llvm.assume(i1 %cmp1)
   store i32 addrspace(1)* %v, i32 addrspace(1)* addrspace(1)* %p
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
   %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
   %after = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %pnew
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
@@ -72,7 +72,7 @@ entry:
   %before = load i32 addrspace(1)*, i32 addrspace(1)** %p
   %cmp1 = call i1 @f(i32 addrspace(1)* %before)
   call void @llvm.assume(i1 %cmp1)
-  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
   %after = load i32 addrspace(1)*, i32 addrspace(1)** %p
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
   ret i1 %cmp2
@@ -90,7 +90,7 @@ entry:
   %cmp1 = call i1 @f(i32 addrspace(1)* %v)
   call void @llvm.assume(i1 %cmp1)
   store i32 addrspace(1)* %v, i32 addrspace(1)** %p
-  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
   %after = load i32 addrspace(1)*, i32 addrspace(1)** %p
   %cmp2 = call i1 @f(i32 addrspace(1)* %after)
   ret i1 %cmp2
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 91bf46a..177eb96 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -6,7 +6,9 @@ declare i64 addrspace(1)* @"some_other_call"(i64 addrspace(1)*)
 
 declare i32 @"personality_function"()
 
-define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj, i64 addrspace(1)* %obj1) {
+define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj, 
+                                      i64 addrspace(1)* %obj1)
+  gc "statepoint-example" {
 entry:
   ; CHECK: .Ltmp{{[0-9]+}}:
   ; CHECK: callq some_other_call
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
index 3ecef33..a968c03 100644
--- a/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -8,20 +8,20 @@ target triple = "x86_64-pc-linux-gnu"
 ; of GC arguments differ, niave lowering code would insert loads and 
 ; stores to rearrange items on the stack.  We need to make sure (for
 ; performance) that this doesn't happen.
-define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 {
+define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
 ; CHECK-LABEL: back_to_back_calls
 ; The exact stores don't matter, but there need to be three stack slots created
 ; CHECK: movq	%rdx, 16(%rsp)
 ; CHECK: movq	%rdi, 8(%rsp)
 ; CHECK: movq	%rsi, (%rsp)
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
   %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 9)
   %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 10)
   %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 11)
 ; CHECK: callq
 ; This is the key check.  There should NOT be any memory moves here
 ; CHECK-NOT: movq
-  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
   %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 11)
   %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 10)
   %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 9)
@@ -31,20 +31,20 @@ define i32 @back_to_back_calls(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 a
 
 ; This test simply checks that minor changes in vm state don't prevent slots
 ; being reused for gc values.  
-define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 {
+define i32 @reserve_first(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
 ; CHECK-LABEL: reserve_first
 ; The exact stores don't matter, but there need to be three stack slots created
 ; CHECK: movq	%rdx, 16(%rsp)
 ; CHECK: movq	%rdi, 8(%rsp)
 ; CHECK: movq	%rsi, (%rsp)
-  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
   %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 9)
   %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 10)
   %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 9, i32 11)
 ; CHECK: callq
 ; This is the key check.  There should NOT be any memory moves here
 ; CHECK-NOT: movq
-  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 addrspace(1)* %a1, i32 0, i32 addrspace(1)* %c1, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
+  %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 addrspace(1)* %a1, i32 0, i32 addrspace(1)* %c1, i32 0, i32 0, i32 addrspace(1)* %c1, i32 addrspace(1)* %b1, i32 addrspace(1)* %a1)
   %a2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 11)
   %b2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 10)
   %c2 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token2, i32 9, i32 9)
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
index e452a63..9593c40 100644
--- a/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -21,7 +21,7 @@ define i1 @test(i32 addrspace(1)* %ptr) gc "statepoint-example" {
 entry:
   %metadata1 = alloca i32 addrspace(1)*, i32 2, align 8
   store i32 addrspace(1)* null, i32 addrspace(1)** %metadata1
-  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 addrspace(1)* null, i32 addrspace(1)* %ptr, i32 addrspace(1)* null)
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 addrspace(1)* null, i32 addrspace(1)* %ptr, i32 addrspace(1)* null)
   %call1 = call zeroext i1 @llvm.experimental.gc.result.i1(i32 %safepoint_token)
   %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 6, i32 6)
   %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7)
diff --git a/test/CodeGen/X86/sub-with-overflow.ll b/test/CodeGen/X86/sub-with-overflow.ll
index 34f4066..fa00d6f 100644
--- a/test/CodeGen/X86/sub-with-overflow.ll
+++ b/test/CodeGen/X86/sub-with-overflow.ll
@@ -11,11 +11,11 @@ entry:
   br i1 %obit, label %overflow, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 
 ; CHECK-LABEL: func1:
@@ -31,11 +31,11 @@ entry:
   br i1 %obit, label %carry, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 carry:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 
 ; CHECK-LABEL: func2:
diff --git a/test/CodeGen/X86/switch-crit-edge-constant.ll b/test/CodeGen/X86/switch-crit-edge-constant.ll
index b44385c..e9a208d 100644
--- a/test/CodeGen/X86/switch-crit-edge-constant.ll
+++ b/test/CodeGen/X86/switch-crit-edge-constant.ll
@@ -35,19 +35,19 @@ cond_true:		; preds = %bb2
 
 blahaha:		; preds = %cond_true, %bb2, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry, %entry
 	%s.0 = phi i8* [ getelementptr ([8 x i8], [8 x i8]* @str, i32 0, i64 0), %cond_true ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str1, i32 0, i64 0), %entry ], [ getelementptr ([5 x i8], [5 x i8]* @str2, i32 0, i64 0), %bb2 ]		; <i8*> [#uses=13]
-	%tmp8 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp10 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp12 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp14 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp16 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp18 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp20 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp22 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp24 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp26 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp28 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp30 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
-	%tmp32 = tail call i32 (i8*, ...)* @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp8 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp10 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp12 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp14 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp16 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp18 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp20 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp22 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp24 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp26 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp28 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp30 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
+	%tmp32 = tail call i32 (i8*, ...) @printf( i8* %s.0 )		; <i32> [#uses=0]
 	ret void
 }
 
diff --git a/test/CodeGen/X86/switch-or.ll b/test/CodeGen/X86/switch-or.ll
index 75832c7..6e6b013 100644
--- a/test/CodeGen/X86/switch-or.ll
+++ b/test/CodeGen/X86/switch-or.ll
@@ -12,7 +12,7 @@ entry:
   ]
 
 if.then:
-  %call = tail call i32 (...)* @bar() nounwind
+  %call = tail call i32 (...) @bar() nounwind
   ret void
 
 if.end:
diff --git a/test/CodeGen/X86/tail-call-win64.ll b/test/CodeGen/X86/tail-call-win64.ll
index 8811b75..fb10d5d 100644
--- a/test/CodeGen/X86/tail-call-win64.ll
+++ b/test/CodeGen/X86/tail-call-win64.ll
@@ -4,7 +4,7 @@
 ; in-function jumps from function exiting jumps.
 
 define void @tail_jmp_reg(i32, i32, void ()* %fptr) {
-  tail call void ()* %fptr()
+  tail call void () %fptr()
   ret void
 }
 
@@ -28,7 +28,7 @@ define void @tail_jmp_imm() {
 
 define void @tail_jmp_mem() {
   %fptr = load void ()*, void ()** @g_fptr
-  tail call void ()* %fptr()
+  tail call void () %fptr()
   ret void
 }
 
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index f4d51c2..9e054fe 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -215,7 +215,7 @@ entry:
   %idxprom = sext i32 %n to i64
   %arrayidx = getelementptr inbounds [0 x i32 (i8*, ...)*], [0 x i32 (i8*, ...)*]* @funcs, i64 0, i64 %idxprom
   %0 = load i32 (i8*, ...)*, i32 (i8*, ...)** %arrayidx, align 8
-  %call = tail call i32 (i8*, ...)* %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
+  %call = tail call i32 (i8*, ...) %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
   ret i32 %call
 }
 
diff --git a/test/CodeGen/X86/tailcall-fastisel.ll b/test/CodeGen/X86/tailcall-fastisel.ll
index 4e1fc43..f69e75c 100644
--- a/test/CodeGen/X86/tailcall-fastisel.ll
+++ b/test/CodeGen/X86/tailcall-fastisel.ll
@@ -11,7 +11,7 @@ fail:                                             ; preds = %entry
 
 define i32 @foo() nounwind {
 entry:
- %0 = tail call i32 (...)* @bar() nounwind       ; <i32> [#uses=1]
+ %0 = tail call i32 (...) @bar() nounwind       ; <i32> [#uses=1]
  ret i32 %0
 }
 
diff --git a/test/CodeGen/X86/tailcall-mem-intrinsics.ll b/test/CodeGen/X86/tailcall-mem-intrinsics.ll
new file mode 100644
index 0000000..0e0ab5c
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy
+; CHECK: jmp memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)  ; the CHECK above expects this to be emitted as a jmp to memcpy
+  ret void
+}
+
+; CHECK-LABEL: tail_memmove
+; CHECK: jmp memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+  tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)  ; the CHECK above expects this to be emitted as a jmp to memmove
+  ret void
+}
+
+; CHECK-LABEL: tail_memset
+; CHECK: jmp memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+  tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)  ; the CHECK above expects this to be emitted as a jmp to memset
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/twoaddr-coalesce.ll b/test/CodeGen/X86/twoaddr-coalesce.ll
index d3498a4..38685ec 100644
--- a/test/CodeGen/X86/twoaddr-coalesce.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce.ll
@@ -12,7 +12,7 @@ bb1:		; preds = %bb1, %bb1.thread
 	%0 = trunc i32 %i.0.reg2mem.0 to i8		; <i8> [#uses=1]
 	%1 = sdiv i8 %0, 2		; <i8> [#uses=1]
 	%2 = sext i8 %1 to i32		; <i32> [#uses=1]
-	%3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), i32 %2) nounwind		; <i32> [#uses=0]
+	%3 = tail call i32 (i8*, ...) @printf(i8* getelementptr ([4 x i8], [4 x i8]* @"\01LC", i32 0, i32 0), i32 %2) nounwind		; <i32> [#uses=0]
 	%indvar.next = add i32 %i.0.reg2mem.0, 1		; <i32> [#uses=2]
 	%exitcond = icmp eq i32 %indvar.next, 258		; <i1> [#uses=1]
 	br i1 %exitcond, label %bb2, label %bb1
diff --git a/test/CodeGen/X86/uint64-to-float.ll b/test/CodeGen/X86/uint64-to-float.ll
index ca764e7..a1074a6 100644
--- a/test/CodeGen/X86/uint64-to-float.ll
+++ b/test/CodeGen/X86/uint64-to-float.ll
@@ -6,13 +6,13 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
 
-; CHECK: testq %rdi, %rdi
+; CHECK: andl
+; CHECK-NEXT: testq %rdi, %rdi
 ; CHECK-NEXT: js LBB0_1
 ; CHECK: cvtsi2ss
 ; CHECK-NEXT: ret
 ; CHECK: LBB0_1
 ; CHECK: shrq
-; CHECK-NEXT: andq
 ; CHECK-NEXT: orq
 ; CHECK-NEXT: cvtsi2ss
 define float @test(i64 %a) {
diff --git a/test/CodeGen/X86/umul-with-carry.ll b/test/CodeGen/X86/umul-with-carry.ll
index c930c16..6435760 100644
--- a/test/CodeGen/X86/umul-with-carry.ll
+++ b/test/CodeGen/X86/umul-with-carry.ll
@@ -14,11 +14,11 @@ entry:
   br i1 %obit, label %carry, label %normal
 
 normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  %t1 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
   ret i1 true
 
 carry:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
+  %t2 = tail call i32 (i8*, ...) @printf( i8* getelementptr ([4 x i8], [4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
 }
 
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index 47d0811..2ed1acf 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -32,5 +32,5 @@ entry:
 !8 = !MDLocation(line: 4, column: 3, scope: !7)
 !9 = !{!1}
 !10 = !MDFile(filename: "test.c", directory: "/dir")
-!11 = !{i32 0}
+!11 = !{}
 !12 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/utf16-cfstrings.ll b/test/CodeGen/X86/utf16-cfstrings.ll
index b49eecf..5f0e78f 100644
--- a/test/CodeGen/X86/utf16-cfstrings.ll
+++ b/test/CodeGen/X86/utf16-cfstrings.ll
@@ -21,7 +21,7 @@ define i32 @main() uwtable ssp {
 entry:
   %retval = alloca i32, align 4
   store i32 0, i32* %retval
-  call void (%0*, ...)* @NSLog(%0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to %0*))
+  call void (%0*, ...) @NSLog(%0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to %0*))
   ret i32 0
 }
 
diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll
index 2dcf319..bb1104d 100644
--- a/test/CodeGen/X86/vararg-callee-cleanup.ll
+++ b/test/CodeGen/X86/vararg-callee-cleanup.ll
@@ -4,8 +4,8 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 
 declare x86_thiscallcc void @thiscall_thunk(i8* %this, ...)
 define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
-  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
-  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+  call x86_thiscallcc void (i8*, ...) @thiscall_thunk(i8* %a, i32 1, i32 2)
+  call x86_thiscallcc void (i8*, ...) @thiscall_thunk(i8* %a, i32 1, i32 2)
   %t1 = add i32 %b, %c
   %r = add i32 %t1, %d
   ret i32 %r
@@ -19,8 +19,8 @@ define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
 
 declare x86_stdcallcc void @stdcall_thunk(i8* %this, ...)
 define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
-  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
-  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+  call x86_stdcallcc void (i8*, ...) @stdcall_thunk(i8* %a, i32 1, i32 2)
+  call x86_stdcallcc void (i8*, ...) @stdcall_thunk(i8* %a, i32 1, i32 2)
   %t1 = add i32 %b, %c
   %r = add i32 %t1, %d
   ret i32 %r
@@ -32,8 +32,8 @@ define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
 
 declare x86_fastcallcc void @fastcall_thunk(i8* %this, ...)
 define i32 @call_varargs_fastcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
-  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
-  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  call x86_fastcallcc void (i8*, ...) @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  call x86_fastcallcc void (i8*, ...) @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
   %t1 = add i32 %b, %c
   %r = add i32 %t1, %d
   ret i32 %r
diff --git a/test/CodeGen/X86/vararg_tailcall.ll b/test/CodeGen/X86/vararg_tailcall.ll
index 9b76bdd..98aa4a8 100644
--- a/test/CodeGen/X86/vararg_tailcall.ll
+++ b/test/CodeGen/X86/vararg_tailcall.ll
@@ -15,7 +15,7 @@
 ; WIN64: callq
 define void @foo(i64 %arg) nounwind optsize ssp noredzone {
 entry:
-  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
   ret void
 }
 
@@ -40,7 +40,7 @@ declare void @bar2(i8*, i64) optsize noredzone
 define i8* @foo2(i8* %arg) nounwind optsize ssp noredzone {
 entry:
   %tmp1 = load i8*, i8** @sel, align 8
-  %call = tail call i8* (i8*, i8*, ...)* @x2(i8* %arg, i8* %tmp1) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, ...) @x2(i8* %arg, i8* %tmp1) nounwind optsize noredzone
   ret i8* %call
 }
 
@@ -56,7 +56,7 @@ entry:
   %tmp3 = load i8*, i8** @sel4, align 8
   %tmp4 = load i8*, i8** @sel5, align 8
   %tmp5 = load i8*, i8** @sel6, align 8
-  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, i8*, ...) @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5) nounwind optsize noredzone
   ret i8* %call
 }
 
@@ -73,7 +73,7 @@ entry:
   %tmp4 = load i8*, i8** @sel5, align 8
   %tmp5 = load i8*, i8** @sel6, align 8
   %tmp6 = load i8*, i8** @sel7, align 8
-  %call = tail call i8* (i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...)* @x7(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i8* %tmp6) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...) @x7(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i8* %tmp6) nounwind optsize noredzone
   ret i8* %call
 }
 
@@ -89,6 +89,6 @@ entry:
   %tmp3 = load i8*, i8** @sel4, align 8
   %tmp4 = load i8*, i8** @sel5, align 8
   %tmp5 = load i8*, i8** @sel6, align 8
-  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i32 48879, i32 48879) nounwind optsize noredzone
+  %call = tail call i8* (i8*, i8*, i8*, ...) @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i32 48879, i32 48879) nounwind optsize noredzone
   ret i8* %call
 }
diff --git a/test/CodeGen/X86/variadic-node-pic.ll b/test/CodeGen/X86/variadic-node-pic.ll
index 1182a30..704459e 100644
--- a/test/CodeGen/X86/variadic-node-pic.ll
+++ b/test/CodeGen/X86/variadic-node-pic.ll
@@ -6,6 +6,6 @@ target triple = "x86_64-apple-darwin8"
 declare void @xscanf(i64) nounwind 
 
 define void @foo() nounwind  {
-	call void (i64)* @xscanf( i64 0 ) nounwind
+	call void (i64) @xscanf( i64 0 ) nounwind
 	unreachable
 }
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 07cd195..e507895 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -100,37 +100,29 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo3_8:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT:    movzbl %dl, %edx
-; CHECK-WIDE-NEXT:    orl %eax, %edx
-; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    vzeroupper
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i8>
@@ -145,21 +137,17 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo3_4:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT:    movzbl %dl, %edx
-; CHECK-WIDE-NEXT:    orl %eax, %edx
-; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
index 4db68bd..f35c4ab 100644
--- a/test/CodeGen/X86/vec_floor.ll
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -180,3 +180,49 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
 declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
+
+;
+; Constant Folding
+;
+
+define <2 x double> @const_floor_v2f64() {
+  ; CHECK: const_floor_v2f64
+  ; CHECK: movaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
+  ret <2 x double> %t
+}
+
+define <4 x float> @const_floor_v4f32() {
+  ; CHECK: const_floor_v4f32
+  ; CHECK: movaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
+  ret <4 x float> %t
+}
+
+define <2 x double> @const_ceil_v2f64() {
+  ; CHECK: const_ceil_v2f64
+  ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+  %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
+  ret <2 x double> %t
+}
+
+define <4 x float> @const_ceil_v4f32() {
+  ; CHECK: const_ceil_v4f32
+  ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+  %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
+  ret <4 x float> %t
+}
+
+define <2 x double> @const_trunc_v2f64() {
+  ; CHECK: const_trunc_v2f64
+  ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+  %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
+  ret <2 x double> %t
+}
+
+define <4 x float> @const_trunc_v4f32() {
+  ; CHECK: const_trunc_v4f32
+  ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+  %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
+  ret <4 x float> %t
+}
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 0f89515..4018a21 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -8,8 +8,8 @@ define void  @t1(i32 %a, x86_mmx* %P) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    shll $12, %ecx
 ; CHECK-NEXT:    movd %ecx, %xmm0
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT:    movq %xmm0, (%eax)
 ; CHECK-NEXT:    retl
  %tmp12 = shl i32 %a, 12
  %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index 447f97a..cbd4208 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -6,8 +6,8 @@ define x86_mmx @t0(i32 %A) nounwind {
 ; X86-32-LABEL: t0:
 ; X86-32:       ## BB#0:
 ; X86-32:    movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
-; X86-32-NEXT:    movlpd %xmm0, (%esp)
+; X86-32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X86-32-NEXT:    movq %xmm0, (%esp)
 ; X86-32-NEXT:    movq (%esp), %mm0
 ; X86-32-NEXT:    addl $12, %esp
 ; X86-32-NEXT:    retl
diff --git a/test/CodeGen/X86/vec_reassociate.ll b/test/CodeGen/X86/vec_reassociate.ll
new file mode 100644
index 0000000..bf2053f
--- /dev/null
+++ b/test/CodeGen/X86/vec_reassociate.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s
+
+define <4 x i32> @add_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @add_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   paddd %xmm1, %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = add <4 x i32> %a0, <i32  1, i32 -2, i32  3, i32 -4>
+  %2 = add <4 x i32> %a1, <i32 -1, i32  2, i32 -3, i32  4>
+  %3 = add <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @add_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @add_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   paddd %xmm1, %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = add <4 x i32> <i32  1, i32 -2, i32  3, i32 -4>, %a0
+  %2 = add <4 x i32> <i32 -1, i32  2, i32 -3, i32  4>, %a1
+  %3 = add <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @mul_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   pmulld %xmm1, %xmm0
+  ;CHECK-NEXT:   pmulld .LCPI2_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %2 = mul <4 x i32> %a1, <i32 4, i32 3, i32 2, i32 1>
+  %3 = mul <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @mul_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   pmulld %xmm1, %xmm0
+  ;CHECK-NEXT:   pmulld .LCPI3_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = mul <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %a0
+  %2 = mul <4 x i32> <i32 4, i32 3, i32 2, i32 1>, %a1
+  %3 = mul <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @and_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @and_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   andps %xmm1, %xmm0
+  ;CHECK-NEXT:   andps .LCPI4_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = and <4 x i32> %a0, <i32 -2, i32 -2, i32  3, i32  3>
+  %2 = and <4 x i32> %a1, <i32 -1, i32 -1, i32  1, i32  1>
+  %3 = and <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @and_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @and_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   andps %xmm1, %xmm0
+  ;CHECK-NEXT:   andps .LCPI5_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = and <4 x i32> <i32 -2, i32 -2, i32  3, i32  3>, %a0
+  %2 = and <4 x i32> <i32 -1, i32 -1, i32  1, i32  1>, %a1
+  %3 = and <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @or_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @or_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   orps %xmm1, %xmm0
+  ;CHECK-NEXT:   orps .LCPI6_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = or <4 x i32> %a0, <i32 -2, i32 -2, i32  3, i32  3>
+  %2 = or <4 x i32> %a1, <i32 -1, i32 -1, i32  1, i32  1>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @or_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @or_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   orps %xmm1, %xmm0
+  ;CHECK-NEXT:   orps .LCPI7_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = or <4 x i32> <i32 -2, i32 -2, i32  3, i32  3>, %a0
+  %2 = or <4 x i32> <i32 -1, i32 -1, i32  1, i32  1>, %a1
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @xor_4i32(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @xor_4i32
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   xorps %xmm1, %xmm0
+  ;CHECK-NEXT:   xorps .LCPI8_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = xor <4 x i32> %a0, <i32 -2, i32 -2, i32  3, i32  3>
+  %2 = xor <4 x i32> %a1, <i32 -1, i32 -1, i32  1, i32  1>
+  %3 = xor <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+
+define <4 x i32> @xor_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL:  @xor_4i32_commute
+  ;CHECK:        # BB#0:
+  ;CHECK-NEXT:   xorps %xmm1, %xmm0
+  ;CHECK-NEXT:   xorps .LCPI9_0(%rip), %xmm0
+  ;CHECK-NEXT:   retq
+  %1 = xor <4 x i32> <i32 -2, i32 -2, i32  3, i32  3>, %a0
+  %2 = xor <4 x i32> <i32 -1, i32 -1, i32  1, i32  1>, %a1
+  %3 = xor <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll
index afde0ed..8ed8083 100644
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@@ -9,7 +9,7 @@
 
 define void @test1() {
 ;CHECK-LABEL: @test1
-;CHECK: xorpd
+;CHECK: xorps
   store <1 x i64> zeroinitializer, <1 x i64>* @M1
   store <2 x i32> zeroinitializer, <2 x i32>* @M2
   ret void
@@ -17,7 +17,7 @@ define void @test1() {
 
 define void @test2() {
 ;CHECK-LABEL: @test2
-;CHECK: pshufd
+;CHECK: pcmpeqd
   store <1 x i64> < i64 -1 >, <1 x i64>* @M1
   store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
   ret void
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 01b8972..53d13c8 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -634,28 +634,16 @@ define <16 x i8> @PR20540(<8 x i8> %a) {
 }
 
 define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzbl %dil, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    movzbl %dil, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -665,27 +653,28 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSE2-NEXT:    shll   $8, %edi
+; SSE2-NEXT:    pxor   %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $2, %edi, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    shll   $8, %edi
+; SSSE3-NEXT:    pxor   %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $2, %edi, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pxor   %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $5, %edi, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $5, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -693,16 +682,30 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 }
 
 define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE:       # BB#0:
-; SSE-NEXT:    movd %edi, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shll   $8, %edi
+; SSE2-NEXT:    pxor   %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $7, %edi, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shll   $8, %edi
+; SSSE3-NEXT:    pxor   %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $7, %edi, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor   %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $15, %edi, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $15, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
@@ -713,29 +716,27 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    movzbl %dil, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT:    pxor   %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pslld $24, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    movzbl %dil, %eax
+; SSSE3-NEXT:    pxor   %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $1, %eax, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pslld $24, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pxor   %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $2, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 3
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index eb77c38..4007f0b 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1384,16 +1384,14 @@ define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
 define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $1, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $1, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
@@ -1403,16 +1401,14 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
 define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $5, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $5, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
@@ -1422,14 +1418,14 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
 define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movd %edi, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $7, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $7, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 0
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
@@ -1439,16 +1435,14 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
 define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
 ; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movzwl %di, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE-NEXT:    pxor   %xmm0, %xmm0
+; SSE-NEXT:    pinsrw $2, %edi, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    vpxor   %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $2, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <8 x i16> undef, i16 %i, i32 3
   %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index aad3702..df4994d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3249,3 +3249,15 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 undef, i32 3, i32 undef, i32 20, i32 20, i32 5, i32 undef, i32 31, i32 undef, i32 11, i32 undef, i32 28, i32 28, i32 13, i32 undef>
   ret <16 x i16> %shuffle
 }
+
+define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
+; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl (%rdi), %eax
+; ALL-NEXT:    vmovd %eax, %xmm0
+; ALL-NEXT:    retq
+  %val = load i16, i16* %ptr
+  %i0 = insertelement <16 x i16> zeroinitializer, i16 %val, i32 0
+  ret <16 x i16> %i0
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index f9f4b96..a0f43de 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -656,8 +656,6 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    movl $15, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd $15, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 8aca67c..1b42a63 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -813,15 +813,11 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
 ; AVX1-LABEL: insert_reg_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq %rdi, %xmm0
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_reg_and_zero_v4i64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
   %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -832,15 +828,11 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
   %a = load i64, i64* %ptr
   %v = insertelement <4 x i64> undef, i64 %a, i64 0
@@ -851,8 +843,9 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    # kill: XMM0<def> XMM0<kill> YMM0<def>
+; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 417423a..bb07077 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,6 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -962,8 +960,6 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -2090,3 +2086,20 @@ entry:
   %res = shufflevector <4 x float> %tmp76, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x float> %res
 }
+
+define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) {
+; AVX1-LABEL: insert_mem_and_zero_v8i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_and_zero_v8i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    retq
+  %a = load i32, i32* %ptr
+  %v = insertelement <8 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-mmx.ll b/test/CodeGen/X86/vector-shuffle-mmx.ll
index 094722d..dbccd26 100644
--- a/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -9,7 +9,7 @@ define void @test0(<1 x i64>* %x) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-NEXT:    movlpd %xmm0, (%eax)
+; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test0:
@@ -38,13 +38,13 @@ define void @test1() {
 ; X32-NEXT:    .cfi_def_cfa_offset 24
 ; X32-NEXT:  Ltmp2:
 ; X32-NEXT:    .cfi_offset %edi, -8
-; X32-NEXT:    xorpd %xmm0, %xmm0
-; X32-NEXT:    movlpd %xmm0, (%esp)
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movlps %xmm0, (%esp)
 ; X32-NEXT:    movq (%esp), %mm0
 ; X32-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
 ; X32-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movlpd %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq {{[0-9]+}}(%esp), %mm1
 ; X32-NEXT:    xorl %edi, %edi
 ; X32-NEXT:    maskmovq %mm1, %mm0
@@ -54,8 +54,8 @@ define void @test1() {
 ;
 ; X64-LABEL: test1:
 ; X64:       ## BB#0: ## %entry
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    movq %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movlps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %mm0
 ; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
 ; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 6b7f489..b0240dd 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -3,12 +3,14 @@
 
 ; CHECK: movl
 ; CHECK: paddw
-; CHECK: movlpd
+; CHECK: movq
+
+; FIXME - if this test cares about scheduling, why isn't it being checked?
 
 ; Scheduler causes produce a different instruction order
 ; ATOM: movl
 ; ATOM: paddw
-; ATOM: movlpd
+; ATOM: movq
 
 ; bitcast a v4i16 to v2i32
 
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index 060dfb1..8ed2785 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -52,7 +52,7 @@ forbody:		; preds = %forcond
 ; CHECK-NEXT: psraw $8
 ; CHECK-NEXT: psraw $2
 ; CHECK-NEXT: pshufb
-; CHECK-NEXT: movlpd
+; CHECK-NEXT: movq
 ;
 ; FIXME: We shouldn't require both a movd and an insert.
 ; CHECK-WIDE: %forbody
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index ccf0bd1..4e9d2df 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: movl
-; CHECK: movlpd
+; CHECK: movq
 
 ; bitcast a i64 to v2i32
 define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index 2aa870f..3028052 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -84,7 +84,7 @@ define void @shuf5(<8 x i8>* %p) nounwind {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
 ; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    movq %xmm0, (%eax)
 ; CHECK-NEXT:    retl
   %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   store <8 x i8> %v, <8 x i8>* %p, align 8
diff --git a/test/CodeGen/X86/x86-64-asm.ll b/test/CodeGen/X86/x86-64-asm.ll
index 2640e59..f103ab7 100644
--- a/test/CodeGen/X86/x86-64-asm.ll
+++ b/test/CodeGen/X86/x86-64-asm.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 define void @frame_dummy() {
 entry:
-        %tmp1 = tail call void (i8*)* (void (i8*)*)* asm "", "=r,0,~{dirflag},~{fpsr},~{flags}"( void (i8*)* null )           ; <void (i8*)*> [#uses=0]
+        %tmp1 = tail call void (i8*)* (void (i8*)*) asm "", "=r,0,~{dirflag},~{fpsr},~{flags}"( void (i8*)* null )           ; <void (i8*)*> [#uses=0]
         ret void
 }
 
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 2879fb4..2c954db 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -3,7 +3,7 @@
 define i64 @z() nounwind {
 ; CHECK:      movq    $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
 ; CHECK-NEXT: addl    %fs:0, %e[[R0]]x
-; CHECK-NEXT: andq    $100, %r[[R0]]x
+; CHECK-NEXT: andl    $100, %e[[R0]]x
 
   ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
 }
diff --git a/test/CodeGen/X86/x86-64-varargs.ll b/test/CodeGen/X86/x86-64-varargs.ll
index f40e02f..ed07bde 100644
--- a/test/CodeGen/X86/x86-64-varargs.ll
+++ b/test/CodeGen/X86/x86-64-varargs.ll
@@ -6,6 +6,6 @@ declare i32 @printf(i8*, ...) nounwind
 
 define i32 @main() nounwind  {
 entry:
-	%tmp10.i = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([26 x i8], [26 x i8]* @.str, i32 0, i64 0), i32 12, double 0x3FF3EB8520000000, i32 120, i64 123456677890, i32 -10, double 4.500000e+15 ) nounwind 		; <i32> [#uses=0]
+	%tmp10.i = tail call i32 (i8*, ...) @printf( i8* getelementptr ([26 x i8], [26 x i8]* @.str, i32 0, i64 0), i32 12, double 0x3FF3EB8520000000, i32 120, i64 123456677890, i32 -10, double 4.500000e+15 ) nounwind 		; <i32> [#uses=0]
 	ret i32 0
 }
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index ebc1907..825efa6 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -17,7 +17,7 @@ define i32 @t1() nounwind {
     %2 = extractvalue {i64, i1} %1, 0
     %3 = extractvalue {i64, i1} %1, 1
     %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
     ret i32 0
 }
 
@@ -31,7 +31,7 @@ define i32 @t2() nounwind {
     %2 = extractvalue {i64, i1} %1, 0
     %3 = extractvalue {i64, i1} %1, 1
     %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
     ret i32 0
 }
 
@@ -45,6 +45,6 @@ define i32 @t3() nounwind {
     %2 = extractvalue {i64, i1} %1, 0
     %3 = extractvalue {i64, i1} %1, 1
     %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
     ret i32 0
 }
diff --git a/test/CodeGen/X86/xor-icmp.ll b/test/CodeGen/X86/xor-icmp.ll
index dd1fcca..397e5bc 100644
--- a/test/CodeGen/X86/xor-icmp.ll
+++ b/test/CodeGen/X86/xor-icmp.ll
@@ -24,11 +24,11 @@ entry:
   br i1 %4, label %bb1, label %bb
 
 bb:                                               ; preds = %entry
-  %5 = tail call i32 (...)* @foo() nounwind       ; <i32> [#uses=1]
+  %5 = tail call i32 (...) @foo() nounwind       ; <i32> [#uses=1]
   ret i32 %5
 
 bb1:                                              ; preds = %entry
-  %6 = tail call i32 (...)* @bar() nounwind       ; <i32> [#uses=1]
+  %6 = tail call i32 (...) @bar() nounwind       ; <i32> [#uses=1]
   ret i32 %6
 }
 
@@ -59,7 +59,7 @@ entry:
   br i1 %2, label %bb, label %return
 
 bb:                                               ; preds = %entry
-  %3 = tail call i32 (...)* @foo() nounwind       ; <i32> [#uses=0]
+  %3 = tail call i32 (...) @foo() nounwind       ; <i32> [#uses=0]
   ret i32 undef
 
 return:                                           ; preds = %entry