36 files changed, 527 insertions, 352 deletions
diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
index 29d74a0..c400c76 100644
--- a/test/Transforms/LoopVectorize/X86/already-vectorized.ll
+++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -21,8 +21,8 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK: LV: We can vectorize this loop!
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %arrayidx = getelementptr inbounds [255 x i32]* @a, i64 0, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %red.05
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 255
@@ -42,5 +42,6 @@ for.end:                                          ; preds = %for.body
 ; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]], [[unroll:![0-9]+]]}
 ; CHECK: [[width]] = !{!"llvm.loop.vectorize.width", i32 1}
 ; CHECK: [[unroll]] = !{!"llvm.loop.interleave.count", i32 1}
-; CHECK: [[scalar]] = distinct !{[[scalar]], [[width]], [[unroll]]}
+; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]], [[unroll]]}
+; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"}
 
diff --git a/test/Transforms/LoopVectorize/X86/assume.ll b/test/Transforms/LoopVectorize/X86/assume.ll
index a94e24d..4fd378d 100644
--- a/test/Transforms/LoopVectorize/X86/assume.ll
+++ b/test/Transforms/LoopVectorize/X86/assume.ll
@@ -22,12 +22,12 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
-  %0 = load float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
   %cmp1 = fcmp ogt float %0, 1.000000e+02
   tail call void @llvm.assume(i1 %cmp1)
   %add = fadd float %0, 1.000000e+00
-  %arrayidx5 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
   store float %add, float* %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv, 1599
@@ -48,13 +48,13 @@ attributes #1 = { nounwind }
 ; Function Attrs: nounwind uwtable
 define void @test2(%struct.data* nocapture readonly %d) #0 {
 entry:
-  %b = getelementptr inbounds %struct.data* %d, i64 0, i32 1
-  %0 = load float** %b, align 8
+  %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
+  %0 = load float*, float** %b, align 8
   %ptrint = ptrtoint float* %0 to i64
   %maskedptr = and i64 %ptrint, 31
   %maskcond = icmp eq i64 %maskedptr, 0
-  %a = getelementptr inbounds %struct.data* %d, i64 0, i32 0
-  %1 = load float** %a, align 8
+  %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
+  %1 = load float*, float** %a, align 8
   %ptrint2 = ptrtoint float* %1 to i64
   %maskedptr3 = and i64 %ptrint2, 31
   %maskcond4 = icmp eq i64 %maskedptr3, 0
@@ -84,11 +84,11 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   tail call void @llvm.assume(i1 %maskcond)
-  %arrayidx = getelementptr inbounds float* %0, i64 %indvars.iv
-  %2 = load float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx, align 4
   %add = fadd float %2, 1.000000e+00
   tail call void @llvm.assume(i1 %maskcond4)
-  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
+  %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
   store float %add, float* %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv, 1599
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll
index 01c9125..37977c4 100644
--- a/test/Transforms/LoopVectorize/X86/avx1.ll
+++ b/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -12,8 +12,8 @@ define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwta
 
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds float* %a, i64 %indvars.iv
-  %3 = load float* %2, align 4
+  %2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %3 = load float, float* %2, align 4
   %4 = fmul float %3, 3.000000e+00
   store float %4, float* %2, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
@@ -35,8 +35,8 @@ define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
 
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds i64* %a, i64 %indvars.iv
-  %3 = load i64* %2, align 4
+  %2 = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 4
   %4 = add i64 %3, 3
   store i64 %4, i64* %2, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
diff --git a/test/Transforms/LoopVectorize/X86/avx512.ll b/test/Transforms/LoopVectorize/X86/avx512.ll
index a220866..754e859 100644
--- a/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -20,7 +20,7 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
   store i32 %n, i32* %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll b/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
index f4c07b4..d75b1d9 100644
--- a/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
+++ b/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
@@ -15,10 +15,10 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [1024 x i32]* @B, i64 0, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
   %shl = ashr i32 %0, 3
-  %arrayidx2 = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
   store i32 %shl, i32* %arrayidx2, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 0af562d..eb2a2a5 100644
--- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -13,7 +13,7 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
   %2 = trunc i64 %indvars.iv to i8
-  %3 = getelementptr inbounds i8* %A, i64 %indvars.iv
+  %3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
   store i8 %2, i8* %3, align 1
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
@@ -35,7 +35,7 @@ define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) noun
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
   %add = add nsw i64 %indvars.iv, 3
   %tofp = sitofp i64 %add to float
-  %gep = getelementptr inbounds float* %B, i64 %indvars.iv
+  %gep = getelementptr inbounds float, float* %B, i64 %indvars.iv
   store float %tofp, float* %gep, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
index 98718e1..0136571 100644
--- a/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -21,15 +21,15 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %0 = shl nsw i64 %indvars.iv, 1
-  %arrayidx = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %0
-  %1 = load i32* %arrayidx, align 8
+  %arrayidx = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %0
+  %1 = load i32, i32* %arrayidx, align 8
   %idxprom1 = sext i32 %1 to i64
-  %arrayidx2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %idxprom1
-  %2 = load i32* %arrayidx2, align 4
-  %arrayidx4 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv
-  %3 = load i32* %arrayidx4, align 4
+  %arrayidx2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %idxprom1
+  %2 = load i32, i32* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @d, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %arrayidx4, align 4
   %idxprom5 = sext i32 %3 to i64
-  %arrayidx6 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %idxprom5
+  %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %idxprom5
   store i32 %2, i32* %arrayidx6, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll b/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
index 529ed88..4a56d6b 100644
--- a/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
@@ -20,10 +20,10 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds [10000 x float]* @float_array, i64 0, i64 %indvars.iv
-  %1 = load float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [10000 x float], [10000 x float]* @float_array, i64 0, i64 %indvars.iv
+  %1 = load float, float* %arrayidx, align 4
   %conv = fptoui float %1 to i32
-  %arrayidx2 = getelementptr inbounds [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
   store i32 %conv, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll b/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
index ef3e3be..c066afc 100644
--- a/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-apple-macosx"
 
 define void @convert() {
 entry:
-  %0 = load i32* @n, align 4
+  %0 = load i32, i32* @n, align 4
   %cmp4 = icmp eq i32 %0, 0
   br i1 %cmp4, label %for.end, label %for.body.preheader
 
@@ -22,10 +22,10 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds [10000 x double]* @double_array, i64 0, i64 %indvars.iv
-  %1 = load double* %arrayidx, align 8
+  %arrayidx = getelementptr inbounds [10000 x double], [10000 x double]* @double_array, i64 0, i64 %indvars.iv
+  %1 = load double, double* %arrayidx, align 8
   %conv = fptoui double %1 to i32
-  %arrayidx2 = getelementptr inbounds [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
   store i32 %conv, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %2 = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll b/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
index 23e6227..b3a0710 100644
--- a/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
@@ -11,10 +11,10 @@ entry:
   br label %for.body
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
-  %tmp = load float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %tmp = load float, float* %arrayidx, align 4
   %conv = fptosi float %tmp to i8
-  %arrayidx2 = getelementptr inbounds i8* %a, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
   store i8 %conv, i8* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 256
diff --git a/test/Transforms/LoopVectorize/X86/gather-cost.ll b/test/Transforms/LoopVectorize/X86/gather-cost.ll
index 09363d6..f0e6c8f 100644
--- a/test/Transforms/LoopVectorize/X86/gather-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/gather-cost.ll
@@ -31,32 +31,32 @@ for.body:
   %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
   %add = add i64 %v.055, %offset
   %mul = mul i64 %add, 3
-  %arrayidx = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %mul
-  %0 = load float* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds [512 x float]* @kernel, i64 0, i64 %v.055
-  %1 = load float* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055
+  %1 = load float, float* %arrayidx2, align 4
   %mul3 = fmul fast float %0, %1
-  %arrayidx4 = getelementptr inbounds [512 x float]* @kernel2, i64 0, i64 %v.055
-  %2 = load float* %arrayidx4, align 4
+  %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055
+  %2 = load float, float* %arrayidx4, align 4
   %mul5 = fmul fast float %mul3, %2
-  %arrayidx6 = getelementptr inbounds [512 x float]* @kernel3, i64 0, i64 %v.055
-  %3 = load float* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055
+  %3 = load float, float* %arrayidx6, align 4
   %mul7 = fmul fast float %mul5, %3
-  %arrayidx8 = getelementptr inbounds [512 x float]* @kernel4, i64 0, i64 %v.055
-  %4 = load float* %arrayidx8, align 4
+  %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055
+  %4 = load float, float* %arrayidx8, align 4
   %mul9 = fmul fast float %mul7, %4
   %add10 = fadd fast float %r.057, %mul9
   %arrayidx.sum = add i64 %mul, 1
-  %arrayidx11 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
-  %5 = load float* %arrayidx11, align 4
+  %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
+  %5 = load float, float* %arrayidx11, align 4
   %mul13 = fmul fast float %1, %5
   %mul15 = fmul fast float %2, %mul13
   %mul17 = fmul fast float %3, %mul15
   %mul19 = fmul fast float %4, %mul17
   %add20 = fadd fast float %g.056, %mul19
   %arrayidx.sum52 = add i64 %mul, 2
-  %arrayidx21 = getelementptr inbounds [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
-  %6 = load float* %arrayidx21, align 4
+  %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
+  %6 = load float, float* %arrayidx21, align 4
   %mul23 = fmul fast float %1, %6
   %mul25 = fmul fast float %2, %mul23
   %mul27 = fmul fast float %3, %mul25
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index 05403cd..c581f4b 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -28,12 +28,12 @@ define void @example1() nounwind uwtable ssp {
 
 ; <label>:1                                       ; preds = %1, %0
   %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
-  %3 = load i32* %2, align 4
-  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
-  %5 = load i32* %4, align 4
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
   %6 = add nsw i32 %5, %3
-  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
   store i32 %6, i32* %7, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
@@ -61,10 +61,10 @@ define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb,
 
 ; <label>:1                                       ; preds = %1, %0
   %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
-  %3 = load i16* %2, align 2
+  %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+  %3 = load i16, i16* %2, align 2
   %4 = sext i16 %3 to i32
-  %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+  %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
   store i32 %4, i32* %5, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index 46efaf0..cbba530 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -14,8 +14,8 @@ entry:
   br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
 
 for.end.us:                                       ; preds = %for.body3.us
-  %arrayidx9.us = getelementptr inbounds i32* %b, i64 %indvars.iv33
-  %0 = load i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
+  %0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
   %add10.us = add nsw i32 %0, 3
   store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
   %indvars.iv.next34 = add i64 %indvars.iv33, 1
@@ -28,8 +28,8 @@ for.body3.us:                                     ; preds = %for.body3.us, %for.
   %1 = trunc i64 %indvars.iv29 to i32
   %add4.us = add i32 %add.us, %1
   %idxprom.us = sext i32 %add4.us to i64
-  %arrayidx.us = getelementptr inbounds i32* %a, i64 %idxprom.us
-  %2 = load i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
+  %2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
   %add5.us = add nsw i32 %2, 1
   store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
   %indvars.iv.next30 = add i64 %indvars.iv29, 1
@@ -41,7 +41,7 @@ for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
   %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
   %3 = trunc i64 %indvars.iv33 to i32
   %add.us = add i32 %3, %k
-  %arrayidx7.us = getelementptr inbounds i32* %a, i64 %indvars.iv33
+  %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
   br label %for.body3.us
 
 for.end15:                                        ; preds = %for.end.us, %entry
diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
index 9e2de80..8c375cc 100644
--- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -46,35 +46,35 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
+  %0 = load i32, i32* %i, align 4
   %cmp = icmp slt i32 %0, 10000
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %1 = load i32* %i, align 4
+  %1 = load i32, i32* %i, align 4
   %idxprom = sext i32 %1 to i64
-  %2 = load i32** %trigger.addr, align 8
-  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
-  %3 = load i32* %arrayidx, align 4
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
   %cmp1 = icmp slt i32 %3, 100
   br i1 %cmp1, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body
-  %4 = load i32* %i, align 4
+  %4 = load i32, i32* %i, align 4
   %idxprom2 = sext i32 %4 to i64
-  %5 = load i32** %B.addr, align 8
-  %arrayidx3 = getelementptr inbounds i32* %5, i64 %idxprom2
-  %6 = load i32* %arrayidx3, align 4
-  %7 = load i32* %i, align 4
+  %5 = load i32*, i32** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+  %6 = load i32, i32* %arrayidx3, align 4
+  %7 = load i32, i32* %i, align 4
   %idxprom4 = sext i32 %7 to i64
-  %8 = load i32** %trigger.addr, align 8
-  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
-  %9 = load i32* %arrayidx5, align 4
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
   %add = add nsw i32 %6, %9
-  %10 = load i32* %i, align 4
+  %10 = load i32, i32* %i, align 4
   %idxprom6 = sext i32 %10 to i64
-  %11 = load i32** %A.addr, align 8
-  %arrayidx7 = getelementptr inbounds i32* %11, i64 %idxprom6
+  %11 = load i32*, i32** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
   store i32 %add, i32* %arrayidx7, align 4
   br label %if.end
 
@@ -82,7 +82,7 @@ if.end:                                           ; preds = %if.then, %for.body
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end
-  %12 = load i32* %i, align 4
+  %12 = load i32, i32* %i, align 4
   %inc = add nsw i32 %12, 1
   store i32 %inc, i32* %i, align 4
   br label %for.cond
@@ -130,36 +130,36 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
+  %0 = load i32, i32* %i, align 4
   %cmp = icmp slt i32 %0, 10000
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %1 = load i32* %i, align 4
+  %1 = load i32, i32* %i, align 4
   %idxprom = sext i32 %1 to i64
-  %2 = load i32** %trigger.addr, align 8
-  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
-  %3 = load i32* %arrayidx, align 4
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
   %cmp1 = icmp slt i32 %3, 100
   br i1 %cmp1, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body
-  %4 = load i32* %i, align 4
+  %4 = load i32, i32* %i, align 4
   %idxprom2 = sext i32 %4 to i64
-  %5 = load float** %B.addr, align 8
-  %arrayidx3 = getelementptr inbounds float* %5, i64 %idxprom2
-  %6 = load float* %arrayidx3, align 4
-  %7 = load i32* %i, align 4
+  %5 = load float*, float** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds float, float* %5, i64 %idxprom2
+  %6 = load float, float* %arrayidx3, align 4
+  %7 = load i32, i32* %i, align 4
   %idxprom4 = sext i32 %7 to i64
-  %8 = load i32** %trigger.addr, align 8
-  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
-  %9 = load i32* %arrayidx5, align 4
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
   %conv = sitofp i32 %9 to float
   %add = fadd float %6, %conv
-  %10 = load i32* %i, align 4
+  %10 = load i32, i32* %i, align 4
   %idxprom6 = sext i32 %10 to i64
-  %11 = load float** %A.addr, align 8
-  %arrayidx7 = getelementptr inbounds float* %11, i64 %idxprom6
+  %11 = load float*, float** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds float, float* %11, i64 %idxprom6
   store float %add, float* %arrayidx7, align 4
   br label %if.end
 
@@ -167,7 +167,7 @@ if.end:                                           ; preds = %if.then, %for.body
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end
-  %12 = load i32* %i, align 4
+  %12 = load i32, i32* %i, align 4
   %inc = add nsw i32 %12, 1
   store i32 %inc, i32* %i, align 4
   br label %for.cond
@@ -218,36 +218,36 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
+  %0 = load i32, i32* %i, align 4
   %cmp = icmp slt i32 %0, 10000
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %1 = load i32* %i, align 4
+  %1 = load i32, i32* %i, align 4
   %idxprom = sext i32 %1 to i64
-  %2 = load i32** %trigger.addr, align 8
-  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
-  %3 = load i32* %arrayidx, align 4
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
   %cmp1 = icmp slt i32 %3, 100
   br i1 %cmp1, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body
-  %4 = load i32* %i, align 4
+  %4 = load i32, i32* %i, align 4
   %idxprom2 = sext i32 %4 to i64
-  %5 = load double** %B.addr, align 8
-  %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
-  %6 = load double* %arrayidx3, align 8
-  %7 = load i32* %i, align 4
+  %5 = load double*, double** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+  %6 = load double, double* %arrayidx3, align 8
+  %7 = load i32, i32* %i, align 4
   %idxprom4 = sext i32 %7 to i64
-  %8 = load i32** %trigger.addr, align 8
-  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
-  %9 = load i32* %arrayidx5, align 4
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
   %conv = sitofp i32 %9 to double
   %add = fadd double %6, %conv
-  %10 = load i32* %i, align 4
+  %10 = load i32, i32* %i, align 4
   %idxprom6 = sext i32 %10 to i64
-  %11 = load double** %A.addr, align 8
-  %arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6
+  %11 = load double*, double** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
   store double %add, double* %arrayidx7, align 8
   br label %if.end
 
@@ -255,7 +255,7 @@ if.end:                                           ; preds = %if.then, %for.body
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end
-  %12 = load i32* %i, align 4
+  %12 = load i32, i32* %i, align 4
   %inc = add nsw i32 %12, 1
   store i32 %inc, i32* %i, align 4
   br label %for.cond
@@ -297,37 +297,37 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
+  %0 = load i32, i32* %i, align 4
   %cmp = icmp slt i32 %0, 10000
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %1 = load i32* %i, align 4
+  %1 = load i32, i32* %i, align 4
   %idxprom = sext i32 %1 to i64
-  %2 = load i32** %trigger.addr, align 8
-  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
-  %3 = load i32* %arrayidx, align 4
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
   %cmp1 = icmp slt i32 %3, 100
   br i1 %cmp1, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body
-  %4 = load i32* %i, align 4
+  %4 = load i32, i32* %i, align 4
   %mul = mul nsw i32 %4, 2
   %idxprom2 = sext i32 %mul to i64
-  %5 = load double** %B.addr, align 8
-  %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
-  %6 = load double* %arrayidx3, align 8
-  %7 = load i32* %i, align 4
+  %5 = load double*, double** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+  %6 = load double, double* %arrayidx3, align 8
+  %7 = load i32, i32* %i, align 4
   %idxprom4 = sext i32 %7 to i64
-  %8 = load i32** %trigger.addr, align 8
-  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
-  %9 = load i32* %arrayidx5, align 4
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
   %conv = sitofp i32 %9 to double
   %add = fadd double %6, %conv
-  %10 = load i32* %i, align 4
+  %10 = load i32, i32* %i, align 4
   %idxprom6 = sext i32 %10 to i64
-  %11 = load double** %A.addr, align 8
-  %arrayidx7 = getelementptr inbounds double* %11, i64 %idxprom6
+  %11 = load double*, double** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
   store double %add, double* %arrayidx7, align 8
   br label %if.end
 
@@ -335,7 +335,7 @@ if.end:                                           ; preds = %if.then, %for.body
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end
-  %12 = load i32* %i, align 4
+  %12 = load i32, i32* %i, align 4
   %inc = add nsw i32 %12, 1
   store i32 %inc, i32* %i, align 4
   br label %for.cond
@@ -373,43 +373,43 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
+  %0 = load i32, i32* %i, align 4
   %cmp = icmp slt i32 %0, 10000
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %1 = load i32* %i, align 4
+  %1 = load i32, i32* %i, align 4
   %idxprom = sext i32 %1 to i64
-  %2 = load i32** %trigger.addr, align 8
-  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
-  %3 = load i32* %arrayidx, align 4
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
   %cmp1 = icmp slt i32 %3, 100
   br i1 %cmp1, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body
-  %4 = load i32* %i, align 4
+  %4 = load i32, i32* %i, align 4
   %idxprom2 = sext i32 %4 to i64
-  %5 = load i32** %B.addr, align 8
-  %arrayidx3 = getelementptr inbounds i32* %5, i64 %idxprom2
-  %6 = load i32* %arrayidx3, align 4
-  %7 = load i32* %i, align 4
+  %5 = load i32*, i32** %B.addr, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+  %6 = load i32, i32* %arrayidx3, align 4
+  %7 = load i32, i32* %i, align 4
   %idxprom4 = sext i32 %7 to i64
-  %8 = load i32** %trigger.addr, align 8
-  %arrayidx5 = getelementptr inbounds i32* %8, i64 %idxprom4
-  %9 = load i32* %arrayidx5, align 4
+  %8 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+  %9 = load i32, i32* %arrayidx5, align 4
   %add = add nsw i32 %6, %9
-  %10 = load i32* %i, align 4
+  %10 = load i32, i32* %i, align 4
   %idxprom6 = sext i32 %10 to i64
-  %11 = load i32** %A.addr, align 8
-  %arrayidx7 = getelementptr inbounds i32* %11, i64 %idxprom6
-  store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4
+  %11 = load i32*, i32** %A.addr, align 8
+  %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
+  store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %for.body
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end
-  %12 = load i32* %i, align 4
+  %12 = load i32, i32* %i, align 4
   %inc = add nsw i32 %12, 1
   store i32 %inc, i32* %i, align 4
   br label %for.cond
@@ -459,30 +459,30 @@ entry:
   br label %for.cond
 
 for.cond:                                         ; preds = %for.inc, %entry
-  %0 = load i32* %i, align 4
+  %0 = load i32, i32* %i, align 4
   %cmp = icmp sge i32 %0, 0
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  %1 = load i32* %i, align 4
+  %1 = load i32, i32* %i, align 4
   %idxprom = sext i32 %1 to i64
-  %2 = load i32** %trigger.addr, align 8
-  %arrayidx = getelementptr inbounds i32* %2, i64 %idxprom
-  %3 = load i32* %arrayidx, align 4
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
   %cmp1 = icmp sgt i32 %3, 0
   br i1 %cmp1, label %if.then, label %if.end
 
 if.then:                                          ; preds = %for.body
-  %4 = load i32* %i, align 4
+  %4 = load i32, i32* %i, align 4
   %idxprom2 = sext i32 %4 to i64
-  %5 = load double** %in.addr, align 8
-  %arrayidx3 = getelementptr inbounds double* %5, i64 %idxprom2
-  %6 = load double* %arrayidx3, align 8
+  %5 = load double*, double** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+  %6 = load double, double* %arrayidx3, align 8
   %add = fadd double %6, 5.000000e-01
-  %7 = load i32* %i, align 4
+  %7 = load i32, i32* %i, align 4
   %idxprom4 = sext i32 %7 to i64
-  %8 = load double** %out.addr, align 8
-  %arrayidx5 = getelementptr inbounds double* %8, i64 %idxprom4
+  %8 = load double*, double** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds double, double* %8, i64 %idxprom4
   store double %add, double* %arrayidx5, align 8
   br label %if.end
 
@@ -490,7 +490,7 @@ if.end:                                           ; preds = %if.then, %for.body
   br label %for.inc
 
 for.inc:                                          ; preds = %if.end
-  %9 = load i32* %i, align 4
+  %9 = load i32, i32* %i, align 4
   %dec = add nsw i32 %9, -1
   store i32 %dec, i32* %i, align 4
   br label %for.cond
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 7feb66c..ba8e11e 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -54,17 +54,17 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %N
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
   store i32 %add, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 32
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
 
 for.end:                                          ; preds = %for.body
-  %1 = load i32* %a, align 4
+  %1 = load i32, i32* %a, align 4
   ret i32 %1
 }
 
@@ -105,17 +105,17 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %N
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
   store i32 %add, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 32
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  %1 = load i32* %a, align 4
+  %1 = load i32, i32* %a, align 4
   ret i32 %1
 }
 
@@ -156,17 +156,17 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %N
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
   store i32 %add, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 32
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
 
 for.end:                                          ; preds = %for.body
-  %1 = load i32* %a, align 4
+  %1 = load i32, i32* %a, align 4
   ret i32 %1
 }
 
diff --git a/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
index fd69dc4..bb972c4 100644
--- a/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
+++ b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
@@ -10,8 +10,8 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float* %a, i64 %indvars.iv
-  %0 = load float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
   %add = fadd float %0, 1.000000e+00
   store float %add, float* %arrayidx, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
diff --git a/test/Transforms/LoopVectorize/X86/no-vector.ll b/test/Transforms/LoopVectorize/X86/no-vector.ll
index 692eec9..4b464b0 100644
--- a/test/Transforms/LoopVectorize/X86/no-vector.ll
+++ b/test/Transforms/LoopVectorize/X86/no-vector.ll
@@ -8,8 +8,8 @@ entry:
 for.body:                                         ; preds = %entry, %for.body
   %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
   %r.05 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i8* %s, i32 %i.06
-  %0 = load i8* %arrayidx, align 1
+  %arrayidx = getelementptr inbounds i8, i8* %s, i32 %i.06
+  %0 = load i8, i8* %arrayidx, align 1
   %conv = sext i8 %0 to i32
   %xor = xor i32 %conv, %r.05
   %inc = add nsw i32 %i.06, 1
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
index ad01044..631361c 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
@@ -17,28 +17,28 @@ entry:
   br label %for.body
 
 for.body:                                         ; preds = %for.body.for.body_crit_edge, %entry
-  %indvars.iv.reload = load i64* %indvars.iv.reg2mem
-  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv.reload
-  %0 = load i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv.reload
-  %1 = load i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.reload = load i64, i64* %indvars.iv.reg2mem
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.reload
+  %0 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.reload
+  %1 = load i32, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %idxprom3 = sext i32 %1 to i64
-  %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
   store i32 %0, i32* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !3
   %indvars.iv.next = add i64 %indvars.iv.reload, 1
   ; A new store without the parallel metadata here:
   store i64 %indvars.iv.next, i64* %indvars.iv.next.reg2mem
-  %indvars.iv.next.reload1 = load i64* %indvars.iv.next.reg2mem
-  %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next.reload1
-  %2 = load i32* %arrayidx6, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next.reload1 = load i64, i64* %indvars.iv.next.reg2mem
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next.reload1
+  %2 = load i32, i32* %arrayidx6, align 4, !llvm.mem.parallel_loop_access !3
   store i32 %2, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
-  %indvars.iv.next.reload = load i64* %indvars.iv.next.reg2mem
+  %indvars.iv.next.reload = load i64, i64* %indvars.iv.next.reg2mem
   %lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 512
   br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop !3
 
 for.body.for.body_crit_edge:                      ; preds = %for.body
-  %indvars.iv.next.reload2 = load i64* %indvars.iv.next.reg2mem
+  %indvars.iv.next.reload2 = load i64, i64* %indvars.iv.next.reg2mem
   store i64 %indvars.iv.next.reload2, i64* %indvars.iv.reg2mem
   br label %for.body
 
diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index 22ab521..53061ed 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -20,16 +20,16 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
-  %1 = load i32* %arrayidx2, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
   %idxprom3 = sext i32 %1 to i64
-  %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
   store i32 %0, i32* %arrayidx4, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
-  %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next
-  %2 = load i32* %arrayidx6, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %2 = load i32, i32* %arrayidx6, align 4
   store i32 %2, i32* %arrayidx2, align 4
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 512
@@ -50,18 +50,18 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
-  %1 = load i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %idxprom3 = sext i32 %1 to i64
-  %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
   ; This store might have originated from inlining a function with a parallel
   ; loop. Refers to a list with the "original loop reference" (!4) also included.
   store i32 %0, i32* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !5
   %indvars.iv.next = add i64 %indvars.iv, 1
-  %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next
-  %2 = load i32* %arrayidx6, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %2 = load i32, i32* %arrayidx6, align 4, !llvm.mem.parallel_loop_access !3
   store i32 %2, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 512
@@ -83,18 +83,18 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !6
-  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
-  %1 = load i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !6
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !6
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !6
   %idxprom3 = sext i32 %1 to i64
-  %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3
+  %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
   ; This refers to the loop marked with !7 which we are not in at the moment.
   ; It should prevent detecting as a parallel loop.
   store i32 %0, i32* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !7
   %indvars.iv.next = add i64 %indvars.iv, 1
-  %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next
-  %2 = load i32* %arrayidx6, align 4, !llvm.mem.parallel_loop_access !6
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %2 = load i32, i32* %arrayidx6, align 4, !llvm.mem.parallel_loop_access !6
   store i32 %2, i32* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !6
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 512
diff --git a/test/Transforms/LoopVectorize/X86/powof2div.ll b/test/Transforms/LoopVectorize/X86/powof2div.ll
index 054da8e..6bc738a 100644
--- a/test/Transforms/LoopVectorize/X86/powof2div.ll
+++ b/test/Transforms/LoopVectorize/X86/powof2div.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
 @Foo = common global %struct.anon zeroinitializer, align 4
 
 ;CHECK-LABEL: @foo(
-;CHECK: load <4 x i32>*
+;CHECK: load <4 x i32>, <4 x i32>*
 ;CHECK: sdiv <4 x i32>
 ;CHECK: store <4 x i32>
 
@@ -17,10 +17,10 @@ entry:
 
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
   %div = sdiv i32 %0, 2
-  %arrayidx2 = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
   store i32 %div, i32* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 100
diff --git a/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
index 3957a55..3741b95 100644
--- a/test/Transforms/LoopVectorize/X86/reduction-crash.ll
+++ b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -14,17 +14,17 @@ bb:
   br label %bb2
 
 bb2:                                              ; preds = %bb
-  %tmp = load double* null, align 8
+  %tmp = load double, double* null, align 8
   br i1 undef, label %bb3, label %bb12
 
 bb3:                                              ; preds = %bb3, %bb2
   %tmp4 = phi double [ %tmp9, %bb3 ], [ %tmp, %bb2 ]
   %tmp5 = phi i32 [ %tmp8, %bb3 ], [ 0, %bb2 ]
-  %tmp6 = getelementptr inbounds [16 x double]* undef, i32 0, i32 %tmp5
-  %tmp7 = load double* %tmp6, align 4
+  %tmp6 = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 %tmp5
+  %tmp7 = load double, double* %tmp6, align 4
   %tmp8 = add nsw i32 %tmp5, 1
   %tmp9 = fadd fast double %tmp4, undef
-  %tmp10 = getelementptr inbounds float* %arg, i32 %tmp5
+  %tmp10 = getelementptr inbounds float, float* %arg, i32 %tmp5
   store float undef, float* %tmp10, align 4
   %tmp11 = icmp eq i32 %tmp8, %arg1
   br i1 %tmp11, label %bb12, label %bb3
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index 8c7a881..47c262b 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -30,12 +30,12 @@ define void @example1() optsize {
 
 ; <label>:1                                       ; preds = %1, %0
   %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
-  %3 = load i32* %2, align 4
-  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
-  %5 = load i32* %4, align 4
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
   %6 = add nsw i32 %5, %3
-  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
   store i32 %6, i32* %7, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
@@ -65,7 +65,7 @@ define void @example2(i32 %n, i32 %x) optsize {
 
 .lr.ph5:                                          ; preds = %0, %.lr.ph5
   %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
-  %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+  %3 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv6
   store i32 %x, i32* %3, align 4
   %indvars.iv.next7 = add i64 %indvars.iv6, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
@@ -76,12 +76,12 @@ define void @example2(i32 %n, i32 %x) optsize {
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
   %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]
   %4 = add nsw i32 %.02, -1
-  %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
-  %6 = load i32* %5, align 4
-  %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
-  %8 = load i32* %7, align 4
+  %5 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %6 = load i32, i32* %5, align 4
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %8 = load i32, i32* %7, align 4
   %9 = and i32 %8, %6
-  %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %10 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
   store i32 %9, i32* %10, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %11 = icmp eq i32 %4, 0
@@ -104,9 +104,9 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
   %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
   %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
   %2 = add nsw i32 %.05, -1
-  %3 = getelementptr inbounds i32* %.023, i64 1
-  %4 = load i32* %.023, align 16
-  %5 = getelementptr inbounds i32* %.014, i64 1
+  %3 = getelementptr inbounds i32, i32* %.023, i64 1
+  %4 = load i32, i32* %.023, align 16
+  %5 = getelementptr inbounds i32, i32* %.014, i64 1
   store i32 %4, i32* %.014, align 16
   %6 = icmp eq i32 %2, 0
   br i1 %6, label %._crit_edge, label %.lr.ph
@@ -128,9 +128,9 @@ define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
   %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
   %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
   %2 = add nsw i32 %.05, -1
-  %3 = getelementptr inbounds i32* %.023, i64 1
-  %4 = load i32* %.023, align 16
-  %5 = getelementptr inbounds i32* %.014, i64 1
+  %3 = getelementptr inbounds i32, i32* %.023, i64 1
+  %4 = load i32, i32* %.023, align 16
+  %5 = getelementptr inbounds i32, i32* %.014, i64 1
   store i32 %4, i32* %.014, align 16
   %6 = icmp eq i32 %2, 0
   br i1 %6, label %._crit_edge, label %.lr.ph
@@ -152,11 +152,11 @@ define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
   %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
   %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
   %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
-  %2 = getelementptr inbounds i16* %.04, i64 1
-  %3 = load i16* %.04, align 2
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
   %4 = zext i16 %3 to i32
   %5 = shl nuw nsw i32 %4, 7
-  %6 = getelementptr inbounds i32* %.013, i64 1
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
   store i32 %5, i32* %.013, align 4
   %7 = add nsw i32 %i.02, 1
   %exitcond = icmp eq i32 %7, 256
@@ -178,11 +178,11 @@ define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst
   %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
   %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
   %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
-  %2 = getelementptr inbounds i16* %.04, i64 1
-  %3 = load i16* %.04, align 2
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
   %4 = zext i16 %3 to i32
   %5 = shl nuw nsw i32 %4, 7
-  %6 = getelementptr inbounds i32* %.013, i64 1
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
   store i32 %5, i32* %.013, align 4
   %7 = add nsw i32 %i.02, 1
   %exitcond = icmp eq i32 %7, 256
diff --git a/test/Transforms/LoopVectorize/X86/struct-store.ll b/test/Transforms/LoopVectorize/X86/struct-store.ll
index a995e43..4ff3b0e 100644
--- a/test/Transforms/LoopVectorize/X86/struct-store.ll
+++ b/test/Transforms/LoopVectorize/X86/struct-store.ll
@@ -15,7 +15,7 @@ entry:
 
 loop:
   %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
-  %tmp = getelementptr inbounds [16 x { i64, i64 }]* @glbl, i64 0, i64 %indvars.iv
+  %tmp = getelementptr inbounds [16 x { i64, i64 }], [16 x { i64, i64 }]* @glbl, i64 0, i64 %indvars.iv
   store { i64, i64 } { i64 ptrtoint (void ()* @fn to i64), i64 0 }, { i64, i64 }* %tmp, align 16
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
diff --git a/test/Transforms/LoopVectorize/X86/tripcount.ll b/test/Transforms/LoopVectorize/X86/tripcount.ll
index a4ec694..c0bbb92 100644
--- a/test/Transforms/LoopVectorize/X86/tripcount.ll
+++ b/test/Transforms/LoopVectorize/X86/tripcount.ll
@@ -22,8 +22,8 @@ for.body.preheader:
 
 for.body:
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds [0 x i32]* @big, i32 0, i32 %i.07
-  %0 = load i32* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @big, i32 0, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
   %neg = xor i32 %0, -1
   store i32 %neg, i32* %arrayidx, align 4
   %inc = add nsw i32 %i.07, 1
diff --git a/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index 86c32b2..38af11c 100644
--- a/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -12,10 +12,10 @@ entry:
   br label %for.body
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i64* %a, i64 %indvars.iv
-  %tmp = load i64* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 4
   %conv = uitofp i64 %tmp to double
-  %arrayidx2 = getelementptr inbounds double* %b, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds double, double* %b, i64 %indvars.iv
   store double %conv, double* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 256
diff --git a/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
index 5064fec..52914b6 100644
--- a/test/Transforms/LoopVectorize/X86/unroll-pm.ll
+++ b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -17,8 +17,8 @@ define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
 
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
-  %3 = load i32* %2, align 4
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
   %4 = add nsw i32 %3, 6
   store i32 %4, i32* %2, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
index 716dc08..4411da3 100644
--- a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -16,8 +16,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; CHECK-VECTOR: ret
 ;
 ; CHECK-SCALAR-LABEL: @foo(
-; CHECK-SCALAR: load i32*
-; CHECK-SCALAR-NOT: load i32*
+; CHECK-SCALAR: load i32, i32*
+; CHECK-SCALAR-NOT: load i32, i32*
 ; CHECK-SCALAR: store i32
 ; CHECK-SCALAR-NOT: store i32
 ; CHECK-SCALAR: ret
@@ -26,8 +26,8 @@ define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
 
 ; <label>:1                                       ; preds = %1, %0
   %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
-  %3 = load i32* %2, align 4
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
   %4 = add nsw i32 %3, 6
   store i32 %4, i32* %2, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
@@ -57,8 +57,8 @@ define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
 
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
-  %3 = load i32* %2, align 4
+  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
   %4 = add nsw i32 %3, 6
   store i32 %4, i32* %2, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
@@ -86,10 +86,10 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
-  %0 = load float* %arrayidx, align 4
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
   %mul = fmul float %0, %N
-  %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
   store float %mul, float* %arrayidx2, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 256
diff --git a/test/Transforms/LoopVectorize/X86/unroll_selection.ll b/test/Transforms/LoopVectorize/X86/unroll_selection.ll
index c684b4e..71b8290 100644
--- a/test/Transforms/LoopVectorize/X86/unroll_selection.ll
+++ b/test/Transforms/LoopVectorize/X86/unroll_selection.ll
@@ -16,8 +16,8 @@ define void @reg_pressure(double* nocapture %A, i32 %n) nounwind uwtable ssp {
 
 ; <label>:2                                       ; preds = %2, %0
   %indvars.iv = phi i64 [ %indvars.iv.next, %2 ], [ %1, %0 ]
-  %3 = getelementptr inbounds double* %A, i64 %indvars.iv
-  %4 = load double* %3, align 8
+  %3 = getelementptr inbounds double, double* %A, i64 %indvars.iv
+  %4 = load double, double* %3, align 8
   %5 = fadd double %4, 3.000000e+00
   %6 = fmul double %4, 2.000000e+00
   %7 = fadd double %5, %6
@@ -58,8 +58,8 @@ define void @small_loop(i16* nocapture %A, i64 %n) nounwind uwtable ssp {
 
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %i.01 = phi i64 [ %5, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds i16* %A, i64 %i.01
-  %3 = load i16* %2, align 2
+  %2 = getelementptr inbounds i16, i16* %A, i64 %i.01
+  %3 = load i16, i16* %2, align 2
   %4 = xor i16 %3, 3
   store i16 %4, i16* %2, align 2
   %5 = add i64 %i.01, 1
diff --git a/test/Transforms/LoopVectorize/X86/veclib-calls.ll b/test/Transforms/LoopVectorize/X86/veclib-calls.ll
new file mode 100644
index 0000000..62e0a44
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/veclib-calls.ll
@@ -0,0 +1,182 @@
+; RUN: opt < %s -vector-library=Accelerate -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;CHECK-LABEL: @sqrt_f32(
+;CHECK: vsqrtf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @sqrtf(float) nounwind readnone
+define void @sqrt_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sqrtf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @exp_f32(
+;CHECK: vexpf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @expf(float) nounwind readnone
+define void @exp_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @expf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+;CHECK-LABEL: @log_f32(
+;CHECK: vlogf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @logf(float) nounwind readnone
+define void @log_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @logf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; For abs instruction we'll generate vector intrinsic, as it's cheaper than a lib call.
+;CHECK-LABEL: @fabs_f32(
+;CHECK: fabs{{.*}}<4 x float>
+;CHECK: ret void
+declare float @fabsf(float) nounwind readnone
+define void @fabs_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @fabsf(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Test that we can vectorize an intrinsic into a vector call.
+;CHECK-LABEL: @exp_f32_intrin(
+;CHECK: vexpf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.exp.f32(float) nounwind readnone
+define void @exp_f32_intrin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @llvm.exp.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Test that we don't vectorize arbitrary functions.
+;CHECK-LABEL: @foo_f32(
+;CHECK-NOT: foo{{.*}}<4 x float>
+;CHECK: ret void
+declare float @foo(float) nounwind readnone
+define void @foo_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @foo(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Test that we don't vectorize calls with nobuiltin attribute.
+;CHECK-LABEL: @sqrt_f32_nobuiltin(
+;CHECK-NOT: vsqrtf{{.*}}<4 x float>
+;CHECK: ret void
+define void @sqrt_f32_nobuiltin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %call = tail call float @sqrtf(float %0) nounwind readnone nobuiltin
+  %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
index a781fbe..c2a0fed 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
@@ -15,9 +15,9 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; The source code for the test:
 ;
 ; #include <math.h>
-; void foo(float* restrict A, float * restrict B, int size)
+; void foo(float* restrict A, float * restrict B)
 ; {
-;   for (int i = 0; i < size; ++i) A[i] = sinf(B[i]);
+;   for (int i = 0; i < 1000; i+=2) A[i] = sinf(B[i]);
 ; }
 ;
 
@@ -25,24 +25,20 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; This loop will be vectorized, although the scalar cost is lower than any of vector costs, but vectorization is explicitly forced in metadata.
 ;
 
-define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) {
+define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B) {
 entry:
-  %cmp6 = icmp sgt i32 %size, 0
-  br i1 %cmp6, label %for.body.preheader, label %for.end
-
-for.body.preheader:
   br label %for.body
 
 for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
-  %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
   %call = tail call float @llvm.sin.f32(float %0)
-  %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
   store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
+  %exitcond = icmp eq i32 %lftr.wideiv, 1000
   br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1
 
 for.end.loopexit:
@@ -59,24 +55,20 @@ for.end:
 ; This method will not be vectorized, as scalar cost is lower than any of vector costs.
 ;
 
-define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) {
+define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B) {
 entry:
-  %cmp6 = icmp sgt i32 %size, 0
-  br i1 %cmp6, label %for.body.preheader, label %for.end
-
-for.body.preheader:
   br label %for.body
 
 for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
-  %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
   %call = tail call float @llvm.sin.f32(float %0)
-  %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
   store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
+  %exitcond = icmp eq i32 %lftr.wideiv, 1000
   br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3
 
 for.end.loopexit:
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index e39e6b5..8d139ac 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -29,10 +29,10 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
-  %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
-  %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
-  %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
   %add = fadd fast float %0, %1
   store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -55,10 +55,10 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
-  %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
-  %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
-  %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %add = fadd fast float %0, %1
   store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
diff --git a/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
index ece9895..5efabe1 100644
--- a/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -15,12 +15,12 @@ define void @scalarselect(i1 %cond) {
 
 ; <label>:1
   %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
-  %3 = load i32* %2, align 4
-  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
-  %5 = load i32* %4, align 4
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
   %6 = add nsw i32 %5, %3
-  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
 
 ; A scalar select has a cost of 1 on core2
 ; CHECK: cost of 1 for VF 2 {{.*}}  select i1 %cond, i32 %6, i32 0
@@ -42,12 +42,12 @@ define void @vectorselect(i1 %cond) {
 
 ; <label>:1
   %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
-  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
-  %3 = load i32* %2, align 4
-  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
-  %5 = load i32* %4, align 4
+  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32, i32* %4, align 4
   %6 = add nsw i32 %5, %3
-  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
   %8 = icmp ult i64 %indvars.iv, 8
 
 ; A vector select has a cost of 1 on core2
diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
index e57cfef..6cd3c9c 100644
--- a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
@@ -19,7 +19,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; CHECK: test_consecutive_store
 ; CHECK: The Widest type: 64 bits
 define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 {
-  %4 = load %0** %2, align 8
+  %4 = load %0*, %0** %2, align 8
   %5 = icmp eq %0** %0, %1
   br i1 %5, label %12, label %6
 
@@ -29,7 +29,7 @@ define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwt
 ; <label>:7                                       ; preds = %7, %6
   %8 = phi %0** [ %0, %6 ], [ %9, %7 ]
   store %0* %4, %0** %8, align 8
-  %9 = getelementptr inbounds %0** %8, i64 1
+  %9 = getelementptr inbounds %0*, %0** %8, i64 1
   %10 = icmp eq %0** %9, %1
   br i1 %10, label %11, label %7
 
@@ -61,12 +61,12 @@ define void @test_nonconsecutive_store() nounwind ssp uwtable {
 
 ; <label>:3                                       ; preds = %3, %1
   %4 = phi i64 [ 0, %1 ], [ %11, %3 ]
-  %5 = getelementptr inbounds [2048 x i16]* @q, i64 0, i64 %4
-  %6 = load i16* %5, align 2
+  %5 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q, i64 0, i64 %4
+  %6 = load i16, i16* %5, align 2
   %7 = sext i16 %6 to i64
   %8 = add i64 %7, 1
   %9 = inttoptr i64 %8 to i32*
-  %10 = getelementptr inbounds [2048 x [8 x i32*]]* @p, i64 0, i64 %4, i64 %2
+  %10 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p, i64 0, i64 %4, i64 %2
   store i32* %9, i32** %10, align 8
   %11 = add i64 %4, 1
   %12 = trunc i64 %11 to i32
@@ -100,8 +100,8 @@ define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
 ; <label>:1                                       ; preds = %1, %0
   %2 = phi i64 [ 0, %0 ], [ %10, %1 ]
   %3 = phi i8 [ 0, %0 ], [ %9, %1 ]
-  %4 = getelementptr inbounds [1024 x i32*]* @ia, i32 0, i64 %2
-  %5 = load i32** %4, align 4
+  %4 = getelementptr inbounds [1024 x i32*], [1024 x i32*]* @ia, i32 0, i64 %2
+  %5 = load i32*, i32** %4, align 4
   %6 = ptrtoint i32* %5 to i64
   %7 = trunc i64 %6 to i8
   %8 = add i8 %3, 1
@@ -127,9 +127,9 @@ define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable {
 
 ; <label>:3                                       ; preds = %3, %1
   %4 = phi i64 [ 0, %1 ], [ %10, %3 ]
-  %5 = getelementptr inbounds [2048 x [8 x i32*]]* @p2, i64 0, i64 %4, i64 %2
-  %6 = getelementptr inbounds [2048 x i16]* @q2, i64 0, i64 %4
-  %7 = load i32** %5, align 2
+  %5 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p2, i64 0, i64 %4, i64 %2
+  %6 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q2, i64 0, i64 %4
+  %7 = load i32*, i32** %5, align 2
   %8 = ptrtoint i32* %7 to i64
   %9 = trunc i64 %8 to i16
   store i16 %9, i16* %6, align 8
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index 011ce8e..60ad3c6 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -52,7 +52,7 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !16
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !16
   %0 = trunc i64 %indvars.iv to i32, !dbg !16
   store i32 %0, i32* %arrayidx, align 4, !dbg !16, !tbaa !18
   %cmp3 = icmp sle i32 %0, %Length, !dbg !22
@@ -74,7 +74,7 @@ entry:
 
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !30
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !30
   %0 = trunc i64 %indvars.iv to i32, !dbg !30
   store i32 %0, i32* %arrayidx, align 4, !dbg !30, !tbaa !18
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !25
@@ -97,12 +97,12 @@ for.body.preheader:                               ; preds = %entry
 
 for.body:                                         ; preds = %for.body.preheader, %for.body
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv, !dbg !35
-  %0 = load i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18
   %idxprom1 = sext i32 %0 to i64, !dbg !35
-  %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1, !dbg !35
-  %1 = load i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
-  %arrayidx4 = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !35
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+  %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !35
   store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
@@ -122,40 +122,40 @@ attributes #0 = { nounwind }
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = !{!"0x11\004\00clang version 3.5.0\001\00\006\00\002", !1, !2, !2, !3, !2, !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
-!1 = !{!"source.cpp", !"."}
+!0 = !MDCompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: 2, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!1 = !MDFile(filename: "source.cpp", directory: ".")
 !2 = !{}
 !3 = !{!4, !7, !8}
-!4 = !{!"0x2e\00test\00test\00\001\000\001\000\006\00256\001\001", !1, !5, !6, null, void (i32*, i32)* @_Z4testPii, null, null, !2} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
-!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./source.cpp]
-!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = !{!"0x2e\00test_disabled\00test_disabled\00\0010\000\001\000\006\00256\001\0010", !1, !5, !6, null, void (i32*, i32)* @_Z13test_disabledPii, null, null, !2} ; [ DW_TAG_subprogram ] [line 10] [def] [test_disabled]
-!8 = !{!"0x2e\00test_array_bounds\00test_array_bounds\00\0016\000\001\000\006\00256\001\0016", !1, !5, !6, null, void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, null, null, !2} ; [ DW_TAG_subprogram ] [line 16] [def] [test_array_bounds]
+!4 = !MDSubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !1, scope: !5, type: !6, function: void (i32*, i32)* @_Z4testPii, variables: !2)
+!5 = !MDFile(filename: "source.cpp", directory: ".")
+!6 = !MDSubroutineType(types: !2)
+!7 = !MDSubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 10, file: !1, scope: !5, type: !6, function: void (i32*, i32)* @_Z13test_disabledPii, variables: !2)
+!8 = !MDSubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 16, file: !1, scope: !5, type: !6, function: void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, variables: !2)
 !9 = !{i32 2, !"Dwarf Version", i32 2}
-!10 = !{i32 2, !"Debug Info Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
 !11 = !{!"clang version 3.5.0"}
 !12 = !MDLocation(line: 3, column: 8, scope: !13)
-!13 = !{!"0xb\003\003\000", !1, !4} ; [ DW_TAG_lexical_block ]
+!13 = distinct !MDLexicalBlock(line: 3, column: 3, file: !1, scope: !4)
 !14 = !{!14, !15, !15}
 !15 = !{!"llvm.loop.vectorize.enable", i1 true}
 !16 = !MDLocation(line: 4, column: 5, scope: !17)
-!17 = !{!"0xb\003\0036\000", !1, !13} ; [ DW_TAG_lexical_block ]
+!17 = distinct !MDLexicalBlock(line: 3, column: 36, file: !1, scope: !13)
 !18 = !{!19, !19, i64 0}
 !19 = !{!"int", !20, i64 0}
 !20 = !{!"omnipotent char", !21, i64 0}
 !21 = !{!"Simple C/C++ TBAA"}
 !22 = !MDLocation(line: 5, column: 9, scope: !23)
-!23 = !{!"0xb\005\009\000", !1, !17} ; [ DW_TAG_lexical_block ]
+!23 = distinct !MDLexicalBlock(line: 5, column: 9, file: !1, scope: !17)
 !24 = !MDLocation(line: 8, column: 1, scope: !4)
 !25 = !MDLocation(line: 12, column: 8, scope: !26)
-!26 = !{!"0xb\0012\003\000", !1, !7} ; [ DW_TAG_lexical_block ]
+!26 = distinct !MDLexicalBlock(line: 12, column: 3, file: !1, scope: !7)
 !27 = !{!27, !28, !29}
 !28 = !{!"llvm.loop.interleave.count", i32 1}
 !29 = !{!"llvm.loop.vectorize.width", i32 1}
 !30 = !MDLocation(line: 13, column: 5, scope: !26)
 !31 = !MDLocation(line: 14, column: 1, scope: !7)
 !32 = !MDLocation(line: 18, column: 8, scope: !33)
-!33 = !{!"0xb\0018\003\000", !1, !8} ; [ DW_TAG_lexical_block ]
+!33 = distinct !MDLexicalBlock(line: 18, column: 3, file: !1, scope: !8)
 !34 = !{!34, !15}
 !35 = !MDLocation(line: 19, column: 5, scope: !33)
 !36 = !MDLocation(line: 20, column: 1, scope: !8)
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index 16fe370..a4e895a 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -26,11 +26,11 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ], !dbg !19
-  %arrayidx = getelementptr inbounds [16 x i8]* %cb, i64 0, i64 %indvars.iv, !dbg !19
-  %0 = load i8* %arrayidx, align 1, !dbg !19, !tbaa !21
+  %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* %cb, i64 0, i64 %indvars.iv, !dbg !19
+  %0 = load i8, i8* %arrayidx, align 1, !dbg !19, !tbaa !21
   %conv = sext i8 %0 to i32, !dbg !19
-  %arrayidx2 = getelementptr inbounds [16 x i8]* %cc, i64 0, i64 %indvars.iv, !dbg !19
-  %1 = load i8* %arrayidx2, align 1, !dbg !19, !tbaa !21
+  %arrayidx2 = getelementptr inbounds [16 x i8], [16 x i8]* %cc, i64 0, i64 %indvars.iv, !dbg !19
+  %1 = load i8, i8* %arrayidx2, align 1, !dbg !19, !tbaa !21
   %conv3 = sext i8 %1 to i32, !dbg !19
   %sub = sub i32 %conv, %conv3, !dbg !19
   %add = add nsw i32 %sub, %add8, !dbg !19
@@ -49,14 +49,14 @@ declare void @ibar(i32*) #1
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!1 = !{!"vectorization-remarks.c", !"."}
+!1 = !MDFile(filename: "vectorization-remarks.c", directory: ".")
 !2 = !{}
 !3 = !{!4}
-!4 = !{!"0x2e\00foo\00foo\00\005\000\001\000\006\00256\001\006", !1, !5, !6, null, i32 (i32)* @foo, null, null, !2} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [foo]
-!5 = !{!"0x29", !1}          ; [ DW_TAG_file_type ] [./vectorization-remarks.c]
-!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !MDSubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !5, type: !6, function: i32 (i32)* @foo, variables: !2)
+!5 = !MDFile(filename: "vectorization-remarks.c", directory: ".")
+!6 = !MDSubroutineType(types: !2)
 !7 = !{i32 2, !"Dwarf Version", i32 4}
-!8 = !{i32 1, !"Debug Info Version", i32 2}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
 !9 = !{!"clang version 3.5.0 "}
 !10 = !MDLocation(line: 8, column: 3, scope: !4)
 !11 = !{!12, !12, i64 0}
@@ -64,11 +64,11 @@ declare void @ibar(i32*) #1
 !13 = !{!"omnipotent char", !14, i64 0}
 !14 = !{!"Simple C/C++ TBAA"}
 !15 = !MDLocation(line: 17, column: 8, scope: !16)
-!16 = !{!"0xb\0017\008\002", !1, !17} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!17 = !{!"0xb\0017\008\001", !1, !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!18 = !{!"0xb\0017\003\000", !1, !4} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!16 = distinct !MDLexicalBlock(line: 17, column: 8, file: !1, scope: !17)
+!17 = distinct !MDLexicalBlock(line: 17, column: 8, file: !1, scope: !18)
+!18 = distinct !MDLexicalBlock(line: 17, column: 3, file: !1, scope: !4)
 !19 = !MDLocation(line: 18, column: 5, scope: !20)
-!20 = !{!"0xb\0017\0027\000", !1, !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!20 = distinct !MDLexicalBlock(line: 17, column: 27, file: !1, scope: !18)
 !21 = !{!13, !13, i64 0}
 !22 = !MDLocation(line: 20, column: 3, scope: !4)
 !23 = !MDLocation(line: 21, column: 3, scope: !4)
diff --git a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
index d8e5403..0debb33 100644
--- a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
+++ b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -17,7 +17,7 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %conv = sitofp i32 1 to x86_fp80
-  %arrayidx = getelementptr inbounds [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
+  %arrayidx = getelementptr inbounds [1024 x x86_fp80], [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
   store x86_fp80 %conv, x86_fp80* %arrayidx, align 16
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32