Diffstat (limited to 'test/Transforms/BBVectorize')
-rw-r--r-- | test/Transforms/BBVectorize/cycle.ll | 112
-rw-r--r-- | test/Transforms/BBVectorize/func-alias.ll | 244
-rw-r--r-- | test/Transforms/BBVectorize/ld1.ll | 41
-rw-r--r-- | test/Transforms/BBVectorize/lit.local.cfg | 1
-rw-r--r-- | test/Transforms/BBVectorize/loop1.ll | 93
-rw-r--r-- | test/Transforms/BBVectorize/mem-op-depth.ll | 22
-rw-r--r-- | test/Transforms/BBVectorize/req-depth.ll | 17
-rw-r--r-- | test/Transforms/BBVectorize/search-limit.ll | 46
-rw-r--r-- | test/Transforms/BBVectorize/simple-int.ll | 59
-rw-r--r-- | test/Transforms/BBVectorize/simple-ldstr.ll | 110
-rw-r--r-- | test/Transforms/BBVectorize/simple.ll | 152
11 files changed, 897 insertions, 0 deletions
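
Each test below is driven by its embedded RUN line. As a minimal sketch (not part of the commit), this is roughly what lit does for one of them, assuming a built LLVM tree of this era with opt and FileCheck on PATH; lit substitutes %s with the test file path:

    # Run the pass pipeline from simple.ll's RUN line and verify the
    # output against that file's CHECK lines.
    opt < test/Transforms/BBVectorize/simple.ll \
        -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S \
      | FileCheck test/Transforms/BBVectorize/simple.ll
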
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll
new file mode 100644
index 0000000..32a91ce
--- /dev/null
+++ b/test/Transforms/BBVectorize/cycle.ll
@@ -0,0 +1,112 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; This test checks the non-trivial pairing-induced cycle avoidance. Without this
+; cycle avoidance, the algorithm would otherwise want to select the pairs:
+; %div77 = fdiv double %sub74, %mul76.v.r1 <-> %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117)
+; %add84 = fadd double %sub83, 2.000000e+00 <-> %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77)
+; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <-> %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84)
+; %mul117 = fmul double %sub39.v.r1, %sub116 <-> %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95)
+; and so a dependency cycle would be created.
+
+declare double @fabs(double) nounwind readnone
+define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) {
+entry:
+  br label %go
+go:
+  %conv = sitofp i32 %n.0 to double
+  %add35 = fadd double %conv, %a
+  %sub36 = fadd double %add35, -1.000000e+00
+  %add38 = fadd double %conv, %b
+  %sub39 = fadd double %add38, -1.000000e+00
+  %add41 = fadd double %conv, %c
+  %sub42 = fadd double %add41, -1.000000e+00
+  %sub45 = fadd double %add35, -2.000000e+00
+  %sub48 = fadd double %add38, -2.000000e+00
+  %sub51 = fadd double %add41, -2.000000e+00
+  %mul52 = shl nsw i32 %n.0, 1
+  %sub53 = add nsw i32 %mul52, -1
+  %conv54 = sitofp i32 %sub53 to double
+  %sub56 = add nsw i32 %mul52, -3
+  %conv57 = sitofp i32 %sub56 to double
+  %sub59 = add nsw i32 %mul52, -5
+  %conv60 = sitofp i32 %sub59 to double
+  %mul61 = mul nsw i32 %n.0, %n.0
+  %conv62 = sitofp i32 %mul61 to double
+  %mul63 = fmul double %conv62, 3.000000e+00
+  %mul67 = fmul double %sub65, %conv
+  %add68 = fadd double %mul63, %mul67
+  %add69 = fadd double %add68, 2.000000e+00
+  %sub71 = fsub double %add69, %mul2.v.r1
+  %sub74 = fsub double %sub71, %mul73
+  %mul75 = fmul double %conv57, 2.000000e+00
+  %mul76 = fmul double %mul75, %sub42
+  %div77 = fdiv double %sub74, %mul76
+  %mul82 = fmul double %add80, %conv
+  %sub83 = fsub double %mul63, %mul82
+  %add84 = fadd double %sub83, 2.000000e+00
+  %sub86 = fsub double %add84, %mul2.v.r1
+  %sub87 = fsub double -0.000000e+00, %sub86
+  %mul88 = fmul double %sub36, %sub87
+  %mul89 = fmul double %mul88, %sub39
+  %mul90 = fmul double %conv54, 4.000000e+00
+  %mul91 = fmul double %mul90, %conv57
+  %mul92 = fmul double %mul91, %sub51
+  %mul93 = fmul double %mul92, %sub42
+  %div94 = fdiv double %mul89, %mul93
+  %mul95 = fmul double %sub45, %sub36
+  %mul96 = fmul double %mul95, %sub48
+  %mul97 = fmul double %mul96, %sub39
+  %sub99 = fsub double %conv, %a
+  %sub100 = fadd double %sub99, -2.000000e+00
+  %mul101 = fmul double %mul97, %sub100
+  %sub103 = fsub double %conv, %b
+  %sub104 = fadd double %sub103, -2.000000e+00
+  %mul105 = fmul double %mul101, %sub104
+  %mul106 = fmul double %conv57, 8.000000e+00
+  %mul107 = fmul double %mul106, %conv57
+  %mul108 = fmul double %mul107, %conv60
+  %sub111 = fadd double %add41, -3.000000e+00
+  %mul112 = fmul double %mul108, %sub111
+  %mul113 = fmul double %mul112, %sub51
+  %mul114 = fmul double %mul113, %sub42
+  %div115 = fdiv double %mul105, %mul114
+  %sub116 = fsub double -0.000000e+00, %sub36
+  %mul117 = fmul double %sub39, %sub116
+  %sub119 = fsub double %conv, %c
+  %sub120 = fadd double %sub119, -1.000000e+00
+  %mul121 = fmul double %mul117, %sub120
+  %mul123 = fmul double %mul75, %sub51
+  %mul124 = fmul double %mul123, %sub42
+  %div125 = fdiv double %mul121, %mul124
+  %mul126 = fmul double %div77, %sub
+  %add127 = fadd double %mul126, 1.000000e+00
+  %mul128 = fmul double %add127, %Anm1.0
+  %mul129 = fmul double %div94, %sub
+  %add130 = fadd double %div125, %mul129
+  %mul131 = fmul double %add130, %sub
+  %mul132 = fmul double %mul131, %Anm2.0
+  %add133 = fadd double %mul128, %mul132
+  %mul134 = fmul double %div115, %mul1
+  %mul135 = fmul double %mul134, %Anm3.0
+  %add136 = fadd double %add133, %mul135
+  %mul139 = fmul double %add127, %Bnm1.0
+  %mul143 = fmul double %mul131, %Bnm2.0
+  %add144 = fadd double %mul139, %mul143
+  %mul146 = fmul double %mul134, %Bnm3.0
+  %add147 = fadd double %add144, %mul146
+  %div148 = fdiv double %add136, %add147
+  %sub149 = fsub double %F.0, %div148
+  %div150 = fdiv double %sub149, %F.0
+  %call = tail call double @fabs(double %div150) nounwind readnone
+  %cmp = fcmp olt double %call, 0x3CB0000000000000
+  %cmp152 = icmp sgt i32 %n.0, 20000
+  %or.cond = or i1 %cmp, %cmp152
+  br i1 %or.cond, label %done, label %go
+done:
+  ret void
+; CHECK: @test1
+; CHECK: go:
+; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
+; FIXME: When tree pruning is deterministic, include the entire output.
+}
diff --git a/test/Transforms/BBVectorize/func-alias.ll b/test/Transforms/BBVectorize/func-alias.ll
new file mode 100644
index 0000000..9d0cc07
--- /dev/null
+++ b/test/Transforms/BBVectorize/func-alias.ll
@@ -0,0 +1,244 @@
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -basicaa -bb-vectorize -bb-vectorize-req-chain-depth=2 -instcombine -gvn -S | FileCheck %s
+; The chain length is set to 2 so that this will do some vectorization; check that the order of the functions is unchanged.
+
+%struct.descriptor_dimension = type { i64, i64, i64 }
+%struct.__st_parameter_common = type { i32, i32, i8*, i32, i32, i8*, i32* }
+%struct.__st_parameter_dt = type { %struct.__st_parameter_common, i64, i64*, i64*, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, [256 x i8], i32*, i64, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, [4 x i8] }
+%"struct.array4_real(kind=4)" = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] }
+%"struct.array4_integer(kind=4).73" = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] }
+%struct.array4_unknown = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] }
+
+@.cst4 = external unnamed_addr constant [11 x i8], align 8
+@.cst823 = external unnamed_addr constant [214 x i8], align 64
+@j.4580 = external global i32
+@j1.4581 = external global i32
+@nty1.4590 = external global [2 x i8]
+@nty2.4591 = external global [2 x i8]
+@xr1.4592 = external global float
+@xr2.4593 = external global float
+@yr1.4594 = external global float
+@yr2.4595 = external global float
+
+@__main1_MOD_iave = external unnamed_addr global i32
+@__main1_MOD_igrp = external global i32
+@__main1_MOD_iounit = external global i32
+@__main1_MOD_ityp = external global i32
+@__main1_MOD_mclmsg = external unnamed_addr global %struct.array4_unknown, align 32
+@__main1_MOD_mxdate = external unnamed_addr global %"struct.array4_integer(kind=4).73", align 32
+@__main1_MOD_rmxval = external unnamed_addr global %"struct.array4_real(kind=4)", align 32
+
+declare void @_gfortran_st_write(%struct.__st_parameter_dt*)
+declare void @_gfortran_st_write_done(%struct.__st_parameter_dt*)
+declare void @_gfortran_transfer_character_write(%struct.__st_parameter_dt*, i8*, i32)
+declare void @_gfortran_transfer_integer_write(%struct.__st_parameter_dt*, i8*, i32)
+declare void @_gfortran_transfer_real_write(%struct.__st_parameter_dt*, i8*, i32)
+
+define i1 @"prtmax__<bb 3>_<bb 34>"(%struct.__st_parameter_dt* %memtmp3, i32 %D.4627_188.reload) nounwind {
+; CHECK: prtmax__
+newFuncRoot:
+  br label %"<bb 34>"
+
+codeRepl80.exitStub:                              ; preds = %"<bb 34>"
+  ret i1 true
+
+"<bb 34>.<bb 25>_crit_edge.exitStub":             ; preds = %"<bb 34>"
+  ret i1 false
+
+"<bb 34>":                                        ; preds = %newFuncRoot
+  %tmp128 = getelementptr inbounds %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
+  %tmp129 = getelementptr inbounds %struct.__st_parameter_common* %tmp128, i32 0, i32 2
+  store i8* getelementptr inbounds ([11 x i8]* @.cst4, i64 0, i64 0), i8** %tmp129, align 8
+  %tmp130 = getelementptr inbounds %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
+  %tmp131 = getelementptr inbounds %struct.__st_parameter_common* %tmp130, i32 0, i32 3
+  store i32 31495, i32* %tmp131, align 4
+  %tmp132 = getelementptr inbounds %struct.__st_parameter_dt* %memtmp3, i32 0, i32 5
+  store i8* getelementptr inbounds ([214 x i8]* @.cst823, i64 0, i64 0), i8** %tmp132, align 8
+  %tmp133 = getelementptr inbounds %struct.__st_parameter_dt* %memtmp3, i32 0, i32 6
+  store i32 214, i32* %tmp133, align 4
+  %tmp134 = getelementptr inbounds %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
+  %tmp135 = getelementptr inbounds %struct.__st_parameter_common* %tmp134, i32 0, i32 0
+  store i32 4096, i32* %tmp135, align 4
+  %iounit.8748_288 = load i32* @__main1_MOD_iounit, align 4
+  %tmp136 = getelementptr inbounds %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
+  %tmp137 = getelementptr inbounds %struct.__st_parameter_common* %tmp136, i32 0, i32 1
+  store i32 %iounit.8748_288, i32* %tmp137, align 4
+  call void @_gfortran_st_write(%struct.__st_parameter_dt* %memtmp3) nounwind
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* @j.4580, i32 4) nounwind
+; CHECK: @_gfortran_transfer_integer_write
+  %D.75807_289 = load i8** getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 0), align 8
+  %j.8758_290 = load i32* @j.4580, align 4
+  %D.75760_291 = sext i32 %j.8758_290 to i64
+  %iave.8736_292 = load i32* @__main1_MOD_iave, align 4
+  %D.75620_293 = sext i32 %iave.8736_292 to i64
+  %D.75808_294 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 2, i32 0), align 8
+  %D.75809_295 = mul nsw i64 %D.75620_293, %D.75808_294
+  %igrp.8737_296 = load i32* @__main1_MOD_igrp, align 4
+  %D.75635_297 = sext i32 %igrp.8737_296 to i64
+  %D.75810_298 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 1, i32 0), align 8
+  %D.75811_299 = mul nsw i64 %D.75635_297, %D.75810_298
+  %D.75812_300 = add nsw i64 %D.75809_295, %D.75811_299
+  %D.75813_301 = add nsw i64 %D.75760_291, %D.75812_300
+  %ityp.8750_302 = load i32* @__main1_MOD_ityp, align 4
+  %D.75704_303 = sext i32 %ityp.8750_302 to i64
+  %D.75814_304 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 3, i32 0), align 8
+  %D.75815_305 = mul nsw i64 %D.75704_303, %D.75814_304
+  %D.75816_306 = add nsw i64 %D.75813_301, %D.75815_305
+  %D.75817_307 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 1), align 8
+  %D.75818_308 = add nsw i64 %D.75816_306, %D.75817_307
+  %tmp138 = bitcast i8* %D.75807_289 to [0 x float]*
+  %tmp139 = bitcast [0 x float]* %tmp138 to float*
+  %D.75819_309 = getelementptr inbounds float* %tmp139, i64 %D.75818_308
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* %D.75819_309, i32 4) nounwind
+; CHECK: @_gfortran_transfer_real_write
+  %D.75820_310 = load i8** getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 0), align 8
+  %j.8758_311 = load i32* @j.4580, align 4
+  %D.75760_312 = sext i32 %j.8758_311 to i64
+  %iave.8736_313 = load i32* @__main1_MOD_iave, align 4
+  %D.75620_314 = sext i32 %iave.8736_313 to i64
+  %D.75821_315 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 2, i32 0), align 8
+  %D.75822_316 = mul nsw i64 %D.75620_314, %D.75821_315
+  %igrp.8737_317 = load i32* @__main1_MOD_igrp, align 4
+  %D.75635_318 = sext i32 %igrp.8737_317 to i64
+  %D.75823_319 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 1, i32 0), align 8
+  %D.75824_320 = mul nsw i64 %D.75635_318, %D.75823_319
+  %D.75825_321 = add nsw i64 %D.75822_316, %D.75824_320
+  %D.75826_322 = add nsw i64 %D.75760_312, %D.75825_321
+  %ityp.8750_323 = load i32* @__main1_MOD_ityp, align 4
+  %D.75704_324 = sext i32 %ityp.8750_323 to i64
+  %D.75827_325 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 3, i32 0), align 8
+  %D.75828_326 = mul nsw i64 %D.75704_324, %D.75827_325
+  %D.75829_327 = add nsw i64 %D.75826_322, %D.75828_326
+  %D.75830_328 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 1), align 8
+  %D.75831_329 = add nsw i64 %D.75829_327, %D.75830_328
+  %tmp140 = bitcast i8* %D.75820_310 to [0 x [1 x i8]]*
+  %tmp141 = bitcast [0 x [1 x i8]]* %tmp140 to [1 x i8]*
+  %D.75832_330 = getelementptr inbounds [1 x i8]* %tmp141, i64 %D.75831_329
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [1 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [1 x i8]* %D.75832_330, i32 1) nounwind
+; CHECK: @_gfortran_transfer_character_write
+  %D.75833_331 = load i8** getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 0), align 8
+  %j.8758_332 = load i32* @j.4580, align 4
+  %D.75760_333 = sext i32 %j.8758_332 to i64
+  %iave.8736_334 = load i32* @__main1_MOD_iave, align 4
+  %D.75620_335 = sext i32 %iave.8736_334 to i64
+  %D.75834_336 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 2, i32 0), align 8
+  %D.75835_337 = mul nsw i64 %D.75620_335, %D.75834_336
+  %igrp.8737_338 = load i32* @__main1_MOD_igrp, align 4
+  %D.75635_339 = sext i32 %igrp.8737_338 to i64
+  %D.75836_340 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 1, i32 0), align 8
+  %D.75837_341 = mul nsw i64 %D.75635_339, %D.75836_340
+  %D.75838_342 = add nsw i64 %D.75835_337, %D.75837_341
+  %D.75839_343 = add nsw i64 %D.75760_333, %D.75838_342
+  %ityp.8750_344 = load i32* @__main1_MOD_ityp, align 4
+  %D.75704_345 = sext i32 %ityp.8750_344 to i64
+  %D.75840_346 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 3, i32 0), align 8
+  %D.75841_347 = mul nsw i64 %D.75704_345, %D.75840_346
+  %D.75842_348 = add nsw i64 %D.75839_343, %D.75841_347
+  %D.75843_349 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 1), align 8
+  %D.75844_350 = add nsw i64 %D.75842_348, %D.75843_349
+  %tmp142 = bitcast i8* %D.75833_331 to [0 x i32]*
+  %tmp143 = bitcast [0 x i32]* %tmp142 to i32*
+  %D.75845_351 = getelementptr inbounds i32* %tmp143, i64 %D.75844_350
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* %D.75845_351, i32 4) nounwind
+; CHECK: @_gfortran_transfer_integer_write
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @xr1.4592, i32 4) nounwind
+; CHECK: @_gfortran_transfer_real_write
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @yr1.4594, i32 4) nounwind
+; CHECK: @_gfortran_transfer_real_write
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [2 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [2 x i8]* @nty1.4590, i32 2) nounwind
+; CHECK: @_gfortran_transfer_character_write
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* @j1.4581, i32 4) nounwind
+; CHECK: @_gfortran_transfer_integer_write
+  %D.75807_352 = load i8** getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 0), align 8
+  %j1.8760_353 = load i32* @j1.4581, align 4
+  %D.75773_354 = sext i32 %j1.8760_353 to i64
+  %iave.8736_355 = load i32* @__main1_MOD_iave, align 4
+  %D.75620_356 = sext i32 %iave.8736_355 to i64
+  %D.75808_357 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 2, i32 0), align 8
+  %D.75809_358 = mul nsw i64 %D.75620_356, %D.75808_357
+  %igrp.8737_359 = load i32* @__main1_MOD_igrp, align 4
+  %D.75635_360 = sext i32 %igrp.8737_359 to i64
+  %D.75810_361 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 1, i32 0), align 8
+  %D.75811_362 = mul nsw i64 %D.75635_360, %D.75810_361
+  %D.75812_363 = add nsw i64 %D.75809_358, %D.75811_362
+  %D.75846_364 = add nsw i64 %D.75773_354, %D.75812_363
+  %ityp.8750_365 = load i32* @__main1_MOD_ityp, align 4
+  %D.75704_366 = sext i32 %ityp.8750_365 to i64
+  %D.75814_367 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 3, i32 0), align 8
+  %D.75815_368 = mul nsw i64 %D.75704_366, %D.75814_367
+  %D.75847_369 = add nsw i64 %D.75846_364, %D.75815_368
+  %D.75817_370 = load i64* getelementptr inbounds (%"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 1), align 8
+  %D.75848_371 = add nsw i64 %D.75847_369, %D.75817_370
+  %tmp144 = bitcast i8* %D.75807_352 to [0 x float]*
+  %tmp145 = bitcast [0 x float]* %tmp144 to float*
+  %D.75849_372 = getelementptr inbounds float* %tmp145, i64 %D.75848_371
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* %D.75849_372, i32 4) nounwind
+; CHECK: @_gfortran_transfer_real_write
+  %D.75820_373 = load i8** getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 0), align 8
+  %j1.8760_374 = load i32* @j1.4581, align 4
+  %D.75773_375 = sext i32 %j1.8760_374 to i64
+  %iave.8736_376 = load i32* @__main1_MOD_iave, align 4
+  %D.75620_377 = sext i32 %iave.8736_376 to i64
+  %D.75821_378 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 2, i32 0), align 8
+  %D.75822_379 = mul nsw i64 %D.75620_377, %D.75821_378
+  %igrp.8737_380 = load i32* @__main1_MOD_igrp, align 4
+  %D.75635_381 = sext i32 %igrp.8737_380 to i64
+  %D.75823_382 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 1, i32 0), align 8
+  %D.75824_383 = mul nsw i64 %D.75635_381, %D.75823_382
+  %D.75825_384 = add nsw i64 %D.75822_379, %D.75824_383
+  %D.75850_385 = add nsw i64 %D.75773_375, %D.75825_384
+  %ityp.8750_386 = load i32* @__main1_MOD_ityp, align 4
+  %D.75704_387 = sext i32 %ityp.8750_386 to i64
+  %D.75827_388 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 3, i32 0), align 8
+  %D.75828_389 = mul nsw i64 %D.75704_387, %D.75827_388
+  %D.75851_390 = add nsw i64 %D.75850_385, %D.75828_389
+  %D.75830_391 = load i64* getelementptr inbounds (%struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 1), align 8
+  %D.75852_392 = add nsw i64 %D.75851_390, %D.75830_391
+  %tmp146 = bitcast i8* %D.75820_373 to [0 x [1 x i8]]*
+  %tmp147 = bitcast [0 x [1 x i8]]* %tmp146 to [1 x i8]*
+  %D.75853_393 = getelementptr inbounds [1 x i8]* %tmp147, i64 %D.75852_392
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [1 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [1 x i8]* %D.75853_393, i32 1) nounwind
+; CHECK: @_gfortran_transfer_character_write
+  %D.75833_394 = load i8** getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 0), align 8
+  %j1.8760_395 = load i32* @j1.4581, align 4
+  %D.75773_396 = sext i32 %j1.8760_395 to i64
+  %iave.8736_397 = load i32* @__main1_MOD_iave, align 4
+  %D.75620_398 = sext i32 %iave.8736_397 to i64
+  %D.75834_399 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 2, i32 0), align 8
+  %D.75835_400 = mul nsw i64 %D.75620_398, %D.75834_399
+  %igrp.8737_401 = load i32* @__main1_MOD_igrp, align 4
+  %D.75635_402 = sext i32 %igrp.8737_401 to i64
+  %D.75836_403 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 1, i32 0), align 8
+  %D.75837_404 = mul nsw i64 %D.75635_402, %D.75836_403
+  %D.75838_405 = add nsw i64 %D.75835_400, %D.75837_404
+  %D.75854_406 = add nsw i64 %D.75773_396, %D.75838_405
+  %ityp.8750_407 = load i32* @__main1_MOD_ityp, align 4
+  %D.75704_408 = sext i32 %ityp.8750_407 to i64
+  %D.75840_409 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 3, i32 0), align 8
+  %D.75841_410 = mul nsw i64 %D.75704_408, %D.75840_409
+  %D.75855_411 = add nsw i64 %D.75854_406, %D.75841_410
+  %D.75843_412 = load i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 1), align 8
+  %D.75856_413 = add nsw i64 %D.75855_411, %D.75843_412
+  %tmp148 = bitcast i8* %D.75833_394 to [0 x i32]*
+  %tmp149 = bitcast [0 x i32]* %tmp148 to i32*
+  %D.75857_414 = getelementptr inbounds i32* %tmp149, i64 %D.75856_413
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* %D.75857_414, i32 4) nounwind
+; CHECK: @_gfortran_transfer_integer_write
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @xr2.4593, i32 4) nounwind
+; CHECK: @_gfortran_transfer_real_write
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @yr2.4595, i32 4) nounwind
+; CHECK: @_gfortran_transfer_real_write
+  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [2 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [2 x i8]* @nty2.4591, i32 2) nounwind
+; CHECK: @_gfortran_transfer_character_write
+  call void @_gfortran_st_write_done(%struct.__st_parameter_dt* %memtmp3) nounwind
+; CHECK: @_gfortran_st_write_done
+  %j.8758_415 = load i32* @j.4580, align 4
+  %D.4634_416 = icmp eq i32 %j.8758_415, %D.4627_188.reload
+  %j.8758_417 = load i32* @j.4580, align 4
+  %j.8770_418 = add nsw i32 %j.8758_417, 1
+  store i32 %j.8770_418, i32* @j.4580, align 4
+  %tmp150 = icmp ne i1 %D.4634_416, false
+  br i1 %tmp150, label %codeRepl80.exitStub, label %"<bb 34>.<bb 25>_crit_edge.exitStub"
+}
+
diff --git a/test/Transforms/BBVectorize/ld1.ll b/test/Transforms/BBVectorize/ld1.ll
new file mode 100644
index 0000000..cea225d
--- /dev/null
+++ b/test/Transforms/BBVectorize/ld1.ll
@@ -0,0 +1,41 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %i2 = load double* %c, align 8
+  %add = fadd double %mul, %i2
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %arrayidx6 = getelementptr inbounds double* %c, i64 1
+  %i5 = load double* %arrayidx6, align 8
+  %add7 = fadd double %mul5, %i5
+  %mul9 = fmul double %add, %i1
+  %add11 = fadd double %mul9, %i2
+  %mul13 = fmul double %add7, %i4
+  %add15 = fadd double %mul13, %i5
+  %mul16 = fmul double %add11, %add15
+  ret double %mul16
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %i2 = load <2 x double>* %i2.v.i0, align 8
+; CHECK: %add = fadd <2 x double> %mul, %i2
+; CHECK: %mul9 = fmul <2 x double> %add, %i1
+; CHECK: %add11 = fadd <2 x double> %mul9, %i2
+; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0
+; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1
+; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2
+; CHECK: ret double %mul16
+}
+
diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg
new file mode 100644
index 0000000..19eebc0
--- /dev/null
+++ b/test/Transforms/BBVectorize/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
new file mode 100644
index 0000000..bebc91a
--- /dev/null
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -0,0 +1,93 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; The second check covers the use of alias analysis (with loop unrolling).
+
+define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
+entry:
+  br label %for.body
+; CHECK: @test1
+; CHECK-UNRL: @test1
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+  %1 = load double* %arrayidx2, align 8
+  %mul = fmul double %0, %0
+  %mul3 = fmul double %0, %1
+  %add = fadd double %mul, %mul3
+  %add4 = fadd double %1, %1
+  %add5 = fadd double %add4, %0
+  %mul6 = fmul double %0, %add5
+  %add7 = fadd double %add, %mul6
+  %mul8 = fmul double %1, %1
+  %add9 = fadd double %0, %0
+  %add10 = fadd double %add9, %0
+  %mul11 = fmul double %mul8, %add10
+  %add12 = fadd double %add7, %mul11
+  %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+  store double %add12, double* %arrayidx14, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10
+  br i1 %exitcond, label %for.end, label %for.body
+; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+; CHECK: %0 = load double* %arrayidx, align 8
+; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+; CHECK: %1 = load double* %arrayidx2, align 8
+; CHECK: %mul = fmul double %0, %0
+; CHECK: %mul3 = fmul double %0, %1
+; CHECK: %add = fadd double %mul, %mul3
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
+; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
+; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
+; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
+; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1
+; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2
+; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1
+; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5
+; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0
+; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1
+; CHECK: %add7 = fadd double %add, %mul6.v.r1
+; CHECK: %add12 = fadd double %add7, %mul6.v.r2
+; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+; CHECK: store double %add12, double* %arrayidx14, align 8
+; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1
+; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
+; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*
+; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*
+; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+; CHECK-UNRL: %2 = load <2 x double>* %0, align 8
+; CHECK-UNRL: %3 = load <2 x double>* %1, align 8
+; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
+; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
+; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
+; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
+; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
+; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
+; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
+; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
+; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
+; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
+; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
+; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
+; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
+; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
+; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2
+; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
+; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
+; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/BBVectorize/mem-op-depth.ll b/test/Transforms/BBVectorize/mem-op-depth.ll
new file mode 100644
index 0000000..84f16bd
--- /dev/null
+++ b/test/Transforms/BBVectorize/mem-op-depth.ll
@@ -0,0 +1,22 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=6 -instcombine -gvn -S | FileCheck %s
+
+@A = common global [1024 x float] zeroinitializer, align 16
+@B = common global [1024 x float] zeroinitializer, align 16
+
+define i32 @test1() nounwind {
+; CHECK: @test1
+  %V1 = load float* getelementptr inbounds ([1024 x float]* @A, i64 0, i64 0), align 16
+  %V2 = load float* getelementptr inbounds ([1024 x float]* @A, i64 0, i64 1), align 4
+  %V3 = load float* getelementptr inbounds ([1024 x float]* @A, i64 0, i64 2), align 8
+  %V4 = load float* getelementptr inbounds ([1024 x float]* @A, i64 0, i64 3), align 4
+; CHECK: %V1 = load <4 x float>* bitcast ([1024 x float]* @A to <4 x float>*), align 16
+  store float %V1, float* getelementptr inbounds ([1024 x float]* @B, i64 0, i64 0), align 16
+  store float %V2, float* getelementptr inbounds ([1024 x float]* @B, i64 0, i64 1), align 4
+  store float %V3, float* getelementptr inbounds ([1024 x float]* @B, i64 0, i64 2), align 8
+  store float %V4, float* getelementptr inbounds ([1024 x float]* @B, i64 0, i64 3), align 4
+; CHECK-NEXT: store <4 x float> %V1, <4 x float>* bitcast ([1024 x float]* @B to <4 x float>*), align 16
+  ret i32 0
+; CHECK-NEXT: ret i32 0
+}
diff --git a/test/Transforms/BBVectorize/req-depth.ll b/test/Transforms/BBVectorize/req-depth.ll
new file mode 100644
index 0000000..8c9cc3c
--- /dev/null
+++ b/test/Transforms/BBVectorize/req-depth.ll
@@ -0,0 +1,17 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2
+
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+  %R = fmul double %Y1, %Y2
+  ret double %R
+; CHECK-RD3: @test1
+; CHECK-RD2: @test1
+; CHECK-RD3-NOT: <2 x double>
+; CHECK-RD2: <2 x double>
+}
+
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll
new file mode 100644
index 0000000..d9945b5
--- /dev/null
+++ b/test/Transforms/BBVectorize/search-limit.ll
@@ -0,0 +1,46 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
+
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test1
+; CHECK-SL4: @test1
+; CHECK-SL4-NOT: <2 x double>
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+  %Z1 = fadd double %Y1, %B1
+  ; Here we have a dependency chain: the short search limit will not
+  ; see past this chain and so will not see the second part of the
+  ; pair to vectorize.
+  %mul41 = fmul double %Z1, %Y2
+  %sub48 = fsub double %Z1, %mul41
+  %mul62 = fmul double %Z1, %sub48
+  %sub69 = fsub double %Z1, %mul62
+  %mul83 = fmul double %Z1, %sub69
+  %sub90 = fsub double %Z1, %mul83
+  %mul104 = fmul double %Z1, %sub90
+  %sub111 = fsub double %Z1, %mul104
+  %mul125 = fmul double %Z1, %sub111
+  %sub132 = fsub double %Z1, %mul125
+  %mul146 = fmul double %Z1, %sub132
+  %sub153 = fsub double %Z1, %mul146
+  ; end of chain.
+  %Z2 = fadd double %Y2, %B2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+  %R1 = fdiv double %Z1, %Z2
+  %R = fmul double %R1, %sub153
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2
+  ret double %R
+; CHECK: ret double %R
+}
+
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
new file mode 100644
index 0000000..b2ef27b
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -0,0 +1,59 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+declare double @llvm.fma.f64(double, double, double)
+declare double @llvm.cos.f64(double)
+
+; Basic depth-3 chain with fma
+define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+  %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
+  %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
+  %Z1 = fadd double %Y1, %B1
+  %Z2 = fadd double %Y2, %B2
+  %R = fmul double %Z1, %Z2
+  ret double %R
+; CHECK: @test1
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
+; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1
+; CHECK: %Y1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> %Y1.v.i2.2)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with cos
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+  %Y1 = call double @llvm.cos.f64(double %X1)
+  %Y2 = call double @llvm.cos.f64(double %X2)
+  %Z1 = fadd double %Y1, %B1
+  %Z2 = fadd double %Y2, %B2
+  %R = fmul double %Z1, %Z2
+  ret double %R
+; CHECK: @test2
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
+; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly
+
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll
new file mode 100644
index 0000000..a5397ee
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@@ -0,0 +1,110 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test1
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple chain with extending loads and stores
+define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0f = load float* %a, align 4
+  %i0 = fpext float %i0f to double
+  %i1f = load float* %b, align 4
+  %i1 = fpext float %i1f to double
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds float* %a, i64 1
+  %i3f = load float* %arrayidx3, align 4
+  %i3 = fpext float %i3f to double
+  %arrayidx4 = getelementptr inbounds float* %b, i64 1
+  %i4f = load float* %arrayidx4, align 4
+  %i4 = fpext float %i4f to double
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test2
+; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>*
+; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>*
+; CHECK: %i0f = load <2 x float>* %i0f.v.i0, align 4
+; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double>
+; CHECK: %i1f = load <2 x float>* %i1f.v.i0, align 4
+; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double>
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test2
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple chain with loads and truncating stores
+define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %mulf = fptrunc double %mul to float
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %mul5f = fptrunc double %mul5 to float
+  store float %mulf, float* %c, align 8
+  %arrayidx5 = getelementptr inbounds float* %c, i64 1
+  store float %mul5f, float* %arrayidx5, align 4
+  ret void
+; CHECK: @test3
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float>
+; CHECK: %0 = bitcast float* %c to <2 x float>*
+; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test3
+; CHECK-AO: %i0 = load double* %a, align 8
+; CHECK-AO: %i1 = load double* %b, align 8
+; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
+; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
+; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
+; CHECK-AO: %i3 = load double* %arrayidx3, align 8
+; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
+; CHECK-AO: %i4 = load double* %arrayidx4, align 8
+; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
+; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
+; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
+; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
+; CHECK-AO: %0 = bitcast float* %c to <2 x float>*
+; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
+; CHECK-AO: ret void
+}
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
new file mode 100644
index 0000000..904d766
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -0,0 +1,152 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test1
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+  %Z1 = fadd double %Y1, %B1
+  %Z2 = fadd double %Y2, %B2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+  %R = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+  ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair permuted)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test2
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+  %Z1 = fadd double %Y2, %B1
+  %Z2 = fadd double %Y1, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+  %R = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+  ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair first splat)
+define double @test3(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test3
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+  %Z1 = fadd double %Y2, %B1
+  %Z2 = fadd double %Y2, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+  %R = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+  ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair second splat)
+define double @test4(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test4
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+  %Z1 = fadd double %Y1, %B1
+  %Z2 = fadd double %Y1, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+  %R = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+  ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain
+define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) {
+; CHECK: @test5
+; CHECK: %X1.v.i1 = shufflevector <2 x float> %B1, <2 x float> %B2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %X1.v.i0 = shufflevector <2 x float> %A1, <2 x float> %A2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %X1 = fsub <2 x float> %A1, %B1
+  %X2 = fsub <2 x float> %A2, %B2
+; CHECK: %X1 = fsub <4 x float> %X1.v.i0, %X1.v.i1
+  %Y1 = fmul <2 x float> %X1, %A1
+  %Y2 = fmul <2 x float> %X2, %A2
+; CHECK: %Y1 = fmul <4 x float> %X1, %X1.v.i0
+  %Z1 = fadd <2 x float> %Y1, %B1
+  %Z2 = fadd <2 x float> %Y2, %B2
+; CHECK: %Z1 = fadd <4 x float> %Y1, %X1.v.i1
+  %R = fmul <2 x float> %Z1, %Z2
+; CHECK: %Z1.v.r1 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK: %Z1.v.r2 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK: %R = fmul <2 x float> %Z1.v.r1, %Z1.v.r2
+  ret <2 x float> %R
+; CHECK: ret <2 x float> %R
+}
+
+; Basic chain with shuffles
+define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
+; CHECK: @test6
+; CHECK: %X1.v.i1 = shufflevector <8 x i8> %B1, <8 x i8> %B2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %X1.v.i0 = shufflevector <8 x i8> %A1, <8 x i8> %A2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %X1 = sub <8 x i8> %A1, %B1
+  %X2 = sub <8 x i8> %A2, %B2
+; CHECK: %X1 = sub <16 x i8> %X1.v.i0, %X1.v.i1
+  %Y1 = mul <8 x i8> %X1, %A1
+  %Y2 = mul <8 x i8> %X2, %A2
+; CHECK: %Y1 = mul <16 x i8> %X1, %X1.v.i0
+  %Z1 = add <8 x i8> %Y1, %B1
+  %Z2 = add <8 x i8> %Y2, %B2
+; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1
+  %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
+  %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
+; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15>
+; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
+  %R = mul <8 x i8> %Q1, %Q2
+; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %Q1.v.r2 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2
+  ret <8 x i8> %R
+; CHECK: ret <8 x i8> %R
+}
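
A minimal sketch of how the -bb-vectorize-req-chain-depth threshold flips the outcome, mirroring the two RUN lines in req-depth.ll above (same opt/FileCheck assumptions as the earlier sketch; not part of the commit itself):

    # At a required chain depth of 3, the two-instruction chain in test1
    # stays scalar (CHECK-RD3-NOT: <2 x double>); lowering the required
    # depth to 2 makes the same chain vectorizable (CHECK-RD2: <2 x double>).
    opt < test/Transforms/BBVectorize/req-depth.ll -bb-vectorize \
        -bb-vectorize-req-chain-depth 3 -S \
      | FileCheck test/Transforms/BBVectorize/req-depth.ll -check-prefix=CHECK-RD3
    opt < test/Transforms/BBVectorize/req-depth.ll -bb-vectorize \
        -bb-vectorize-req-chain-depth 2 -S \
      | FileCheck test/Transforms/BBVectorize/req-depth.ll -check-prefix=CHECK-RD2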