Diffstat (limited to 'test/Transforms/InstCombine')
-rw-r--r--  test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll |  20
-rw-r--r--  test/Transforms/InstCombine/OverlappingInsertvalues.ll    |  36
-rw-r--r--  test/Transforms/InstCombine/alloca.ll                     |  21
-rw-r--r--  test/Transforms/InstCombine/bitcast-alias-function.ll     |  24
-rw-r--r--  test/Transforms/InstCombine/blend_x86.ll                  |  55
-rw-r--r--  test/Transforms/InstCombine/call-cast-target-inalloca.ll  |   2
-rw-r--r--  test/Transforms/InstCombine/div.ll                        |  19
-rw-r--r--  test/Transforms/InstCombine/gep-addrspace.ll              |  17
-rw-r--r--  test/Transforms/InstCombine/icmp.ll                       |   9
-rw-r--r--  test/Transforms/InstCombine/memcpy-from-global.ll         |  67
-rw-r--r--  test/Transforms/InstCombine/overflow-mul.ll               | 164
-rw-r--r--  test/Transforms/InstCombine/pr19420.ll                    |  67
-rw-r--r--  test/Transforms/InstCombine/select.ll                     | 100
-rw-r--r--  test/Transforms/InstCombine/shift.ll                      |  94
-rw-r--r--  test/Transforms/InstCombine/strlen-1.ll                   |  12
-rw-r--r--  test/Transforms/InstCombine/vec_demanded_elts.ll          | 366
-rw-r--r--  test/Transforms/InstCombine/vec_shuffle.ll                | 162
17 files changed, 1198 insertions, 37 deletions
diff --git a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
index 1883a8f..39408a2 100644
--- a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
+++ b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll
@@ -68,7 +68,7 @@ declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind rea
define <4 x i32> @mulByZeroARM64(<4 x i16> %x) nounwind readnone ssp {
entry:
- %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
+ %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
ret <4 x i32> %a
; CHECK: entry:
; CHECK-NEXT: ret <4 x i32> zeroinitializer
@@ -76,7 +76,7 @@ entry:
define <4 x i32> @mulByOneARM64(<4 x i16> %x) nounwind readnone ssp {
entry:
- %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
ret <4 x i32> %a
; CHECK: entry:
; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
@@ -85,7 +85,7 @@ entry:
define <4 x i32> @constantMulARM64() nounwind readnone ssp {
entry:
- %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+ %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
ret <4 x i32> %a
; CHECK: entry:
; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
@@ -93,7 +93,7 @@ entry:
define <4 x i32> @constantMulSARM64() nounwind readnone ssp {
entry:
- %b = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ %b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
ret <4 x i32> %b
; CHECK: entry:
; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -101,7 +101,7 @@ entry:
define <4 x i32> @constantMulUARM64() nounwind readnone ssp {
entry:
- %b = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
+ %b = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
ret <4 x i32> %b
; CHECK: entry:
; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -109,17 +109,17 @@ entry:
define <4 x i32> @complex1ARM64(<4 x i16> %x) nounwind readnone ssp {
entry:
- %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
+ %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
%b = add <4 x i32> zeroinitializer, %a
ret <4 x i32> %b
; CHECK: entry:
-; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
+; CHECK-NEXT: %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
; CHECK-NEXT: ret <4 x i32> %a
}
define <4 x i32> @complex2ARM64(<4 x i32> %x) nounwind readnone ssp {
entry:
- %a = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
+ %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
%b = add <4 x i32> %x, %a
ret <4 x i32> %b
; CHECK: entry:
@@ -127,8 +127,8 @@ entry:
; CHECK-NEXT: ret <4 x i32> %b
}
-declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
; CHECK: attributes #0 = { nounwind readnone ssp }
; CHECK: attributes #1 = { nounwind readnone }
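The change in this file is mechanical: the ARM64 backend was merged into AArch64,
renaming the llvm.arm64.* intrinsics to llvm.aarch64.*; the folds under test are
untouched. For readers unfamiliar with FileCheck, a minimal standalone sketch (not
part of this commit) of the multiply-by-one fold the CHECK lines pin down:

; Widening signed multiply by a splat of ones.
define <4 x i32> @sketch_mul_by_one(<4 x i16> %x) {
  %r = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
  ret <4 x i32> %r
}
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
; After -instcombine the intrinsic call folds to a plain widening cast:
;   %r = sext <4 x i16> %x to <4 x i32>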
diff --git a/test/Transforms/InstCombine/OverlappingInsertvalues.ll b/test/Transforms/InstCombine/OverlappingInsertvalues.ll
new file mode 100644
index 0000000..9248aec
--- /dev/null
+++ b/test/Transforms/InstCombine/OverlappingInsertvalues.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that we can find and remove redundant insertvalues
+; CHECK-LABEL: foo_simple
+; CHECK-NOT: i8* %x, 0
+define { i8*, i64, i32 } @foo_simple(i8* %x, i8* %y) nounwind {
+entry:
+ %0 = insertvalue { i8*, i64, i32 } undef, i8* %x, 0
+ %1 = insertvalue { i8*, i64, i32 } %0, i8* %y, 0
+ ret { i8*, i64, i32 } %1
+}
+; Check that we can find and remove redundant nodes in insertvalues chain
+; CHECK-LABEL: foo_ovwrt_chain
+; CHECK-NOT: i64 %y, 1
+; CHECK-NOT: i32 555, 2
+define { i8*, i64, i32 } @foo_ovwrt_chain(i8* %x, i64 %y, i64 %z) nounwind {
+entry:
+ %0 = insertvalue { i8*, i64, i32 } undef, i8* %x, 0
+ %1 = insertvalue { i8*, i64, i32 } %0, i64 %y, 1
+ %2 = insertvalue { i8*, i64, i32 } %1, i32 555, 2
+ %3 = insertvalue { i8*, i64, i32 } %2, i64 %z, 1
+ %4 = insertvalue { i8*, i64, i32 } %3, i32 777, 2
+ ret { i8*, i64, i32 } %4
+}
+; Check that we propagate insertvalues only if they are used as the first
+; operand (as the initial value of the aggregate)
+; CHECK-LABEL: foo_use_as_second_operand
+; CHECK: i16 %x, 0
+; CHECK: %0, 1
+define { i8, {i16, i32} } @foo_use_as_second_operand(i16 %x) nounwind {
+entry:
+ %0 = insertvalue { i16, i32 } undef, i16 %x, 0
+ %1 = insertvalue { i8, {i16, i32} } undef, { i16, i32 } %0, 1
+ ret { i8, {i16, i32} } %1
+}
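The rule these three tests pin down, sketched on a smaller aggregate (a
hypothetical example, not part of the commit): an insertvalue becomes dead as
soon as a later insertvalue in the same chain stores to the same index.

define { i8*, i64 } @sketch_dead_insert(i8* %p, i64 %a, i64 %b) {
  %s0 = insertvalue { i8*, i64 } undef, i8* %p, 0
  %s1 = insertvalue { i8*, i64 } %s0, i64 %a, 1   ; index 1 is overwritten below
  %s2 = insertvalue { i8*, i64 } %s1, i64 %b, 1
  ret { i8*, i64 } %s2
}
; After -instcombine, %s2 is rebuilt directly on %s0 and %s1 disappears.
; As @foo_use_as_second_operand shows, this only applies when the chain feeds
; the aggregate (first) operand; a value inserted as the element operand stays.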
diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll
index ae1cfa1..6d0c131 100644
--- a/test/Transforms/InstCombine/alloca.ll
+++ b/test/Transforms/InstCombine/alloca.ll
@@ -129,3 +129,24 @@ define void @test8() {
call void (...)* @use(i32* %x)
ret void
}
+
+; PR19569
+%struct_type = type { i32, i32 }
+declare void @test9_aux(<{ %struct_type }>* inalloca)
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
+
+define void @test9(%struct_type* %a) {
+; CHECK-LABEL: @test9(
+entry:
+ %inalloca.save = call i8* @llvm.stacksave()
+ %argmem = alloca inalloca <{ %struct_type }>
+; CHECK: alloca inalloca i64, align 8
+ %0 = getelementptr inbounds <{ %struct_type }>* %argmem, i32 0, i32 0
+ %1 = bitcast %struct_type* %0 to i8*
+ %2 = bitcast %struct_type* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %2, i32 8, i32 4, i1 false)
+ call void @test9_aux(<{ %struct_type }>* inalloca %argmem)
+ call void @llvm.stackrestore(i8* %inalloca.save)
+ ret void
+}
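The CHECK line is the substance of the PR19569 fix: when InstCombine
canonicalizes this 8-byte alloca to an equivalently sized integer, the
inalloca marker has to survive the rewrite, since the later call passes
%argmem as an inalloca argument. Schematically:

;   %argmem = alloca inalloca <{ %struct_type }>   ; 8 bytes
; becomes
;   %argmem = alloca inalloca i64, align 8         ; same size, marker kept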
diff --git a/test/Transforms/InstCombine/bitcast-alias-function.ll b/test/Transforms/InstCombine/bitcast-alias-function.ll
index a6b56f9..284960b 100644
--- a/test/Transforms/InstCombine/bitcast-alias-function.ll
+++ b/test/Transforms/InstCombine/bitcast-alias-function.ll
@@ -6,46 +6,46 @@ target datalayout = "e-p:32:32:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16
; Cases that should be bitcast
; Test cast between scalars with same bit sizes
-@alias_i32_to_f32 = alias bitcast (i32 (i32)* @func_i32 to float (float)*)
+@alias_i32_to_f32 = alias float (float), i32 (i32)* @func_i32
; Test cast between vectors with same number of elements and bit sizes
-@alias_v2i32_to_v2f32 = alias bitcast (<2 x i32> (<2 x i32>)* @func_v2i32 to <2 x float> (<2 x float>)*)
+@alias_v2i32_to_v2f32 = alias <2 x float> (<2 x float>), <2 x i32> (<2 x i32>)* @func_v2i32
; Test cast from vector to scalar with same number of bits
-@alias_v2f32_to_i64 = alias bitcast (i64 (i64)* @func_i64 to <2 x float> (<2 x float>)*)
+@alias_v2f32_to_i64 = alias <2 x float> (<2 x float>), i64 (i64)* @func_i64
; Test cast from scalar to vector with same number of bits
-@alias_i64_to_v2f32 = alias bitcast (<2 x float> (<2 x float>)* @func_v2f32 to i64 (i64)*)
+@alias_i64_to_v2f32 = alias i64 (i64), <2 x float> (<2 x float>)* @func_v2f32
; Test cast between vectors of pointers
-@alias_v2i32p_to_v2i64p = alias bitcast (<2 x i32*> (<2 x i32*>)* @func_v2i32p to <2 x i64*> (<2 x i64*>)*)
+@alias_v2i32p_to_v2i64p = alias <2 x i64*> (<2 x i64*>), <2 x i32*> (<2 x i32*>)* @func_v2i32p
; Cases that should be invalid and unchanged
; Test cast between scalars with different bit sizes
-@alias_i64_to_f32 = alias bitcast (i64 (i64)* @func_i64 to float (float)*)
+@alias_i64_to_f32 = alias float (float), i64 (i64)* @func_i64
; Test cast between vectors with different bit sizes but the
; same number of elements
-@alias_v2i64_to_v2f32 = alias bitcast (<2 x i64> (<2 x i64>)* @func_v2i64 to <2 x float> (<2 x float>)*)
+@alias_v2i64_to_v2f32 = alias <2 x float> (<2 x float>), <2 x i64> (<2 x i64>)* @func_v2i64
; Test cast between vectors with same number of bits and different
; numbers of elements
-@alias_v2i32_to_v4f32 = alias bitcast (<2 x i32> (<2 x i32>)* @func_v2i32 to <4 x float> (<4 x float>)*)
+@alias_v2i32_to_v4f32 = alias <4 x float> (<4 x float>), <2 x i32> (<2 x i32>)* @func_v2i32
; Test cast between scalar and vector with different number of bits
-@alias_i64_to_v4f32 = alias bitcast (<4 x float> (<4 x float>)* @func_v4f32 to i64 (i64)*)
+@alias_i64_to_v4f32 = alias i64 (i64), <4 x float> (<4 x float>)* @func_v4f32
; Test cast between vector and scalar with different number of bits
-@alias_v4f32_to_i64 = alias bitcast (i64 (i64)* @func_i64 to <4 x float> (<4 x float>)*)
+@alias_v4f32_to_i64 = alias <4 x float> (<4 x float>), i64 (i64)* @func_i64
; Test cast from scalar to vector of pointers with same number of bits
; We don't know the pointer size at this point, so this can't be done
-@alias_i64_to_v2i32p = alias bitcast (<2 x i32*> (<2 x i32*>)* @func_v2i32p to i64 (i64)*)
+@alias_i64_to_v2i32p = alias i64 (i64), <2 x i32*> (<2 x i32*>)* @func_v2i32p
; Test cast between vector of pointers and scalar with different number of bits
-@alias_v4i32p_to_i64 = alias bitcast (i64 (i64)* @func_i64 to <4 x i32*> (<4 x i32*>)*)
+@alias_v4i32p_to_i64 = alias <4 x i32*> (<4 x i32*>), i64 (i64)* @func_i64
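No functional change is intended in the rewrites in this file; they track
LLVM's updated alias syntax, which spells the alias's own type explicitly
instead of wrapping the aliasee in a constant bitcast:

; old form:
;   @alias_i32_to_f32 = alias bitcast (i32 (i32)* @func_i32 to float (float)*)
; new form:
;   @alias_i32_to_f32 = alias float (float), i32 (i32)* @func_i32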
diff --git a/test/Transforms/InstCombine/blend_x86.ll b/test/Transforms/InstCombine/blend_x86.ll
new file mode 100644
index 0000000..778d44b
--- /dev/null
+++ b/test/Transforms/InstCombine/blend_x86.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -S | FileCheck %s
+
+define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
+; CHECK-LABEL: @constant_blendvpd
+; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x double> %ab, <2 x double> %xy
+ %1 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %xy, <2 x double> %ab, <2 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00>)
+ ret <2 x double> %1
+}
+
+define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
+; CHECK-LABEL: @constant_blendvps
+; CHECK: select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %abcd, <4 x float> %xyzw
+ %1 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %xyzw, <4 x float> %abcd, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
+ ret <4 x float> %1
+}
+
+define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
+; CHECK-LABEL: @constant_pblendvb
+; CHECK: select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %abcd, <16 x i8> %xyzw
+ %1 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd, <16 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
+ ret <16 x i8> %1
+}
+
+define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
+; CHECK-LABEL: @constant_blendvpd_avx
+; CHECK: select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %ab, <4 x double> %xy
+ %1 = tail call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %xy, <4 x double> %ab, <4 x double> <double 0xFFFFFFFFE0000000, double 0.000000e+00, double 0xFFFFFFFFE0000000, double 0.000000e+00>)
+ ret <4 x double> %1
+}
+
+define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
+; CHECK-LABEL: @constant_blendvps_avx
+; CHECK: select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %abcd, <8 x float> %xyzw
+ %1 = tail call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %xyzw, <8 x float> %abcd, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0xFFFFFFFFE0000000>)
+ ret <8 x float> %1
+}
+
+define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; CHECK-LABEL: @constant_pblendvb_avx2
+; CHECK: select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %abcd, <32 x i8> %xyzw
+ %1 = tail call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %xyzw, <32 x i8> %abcd,
+ <32 x i8> <i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
+ i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
+ i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0,
+ i8 0, i8 0, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0>)
+ ret <32 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
+
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
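The odd-looking mask constants make sense given the hardware semantics: the
blendv family picks each lane from the second source when the top (sign) bit
of the corresponding mask lane is set. 0xFFFFFFFFE0000000 is simply a double
with its sign bit set and i8 255 a byte with its high bit set, so a constant
mask lowers directly to an IR select on constant i1 lanes:

;   blendvpd(%xy, %ab, <sign bit set, sign bit clear>)
; becomes
;   select <2 x i1> <i1 true, i1 false>, <2 x double> %ab, <2 x double> %xy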
diff --git a/test/Transforms/InstCombine/call-cast-target-inalloca.ll b/test/Transforms/InstCombine/call-cast-target-inalloca.ll
index baf97e0..90289e2 100644
--- a/test/Transforms/InstCombine/call-cast-target-inalloca.ll
+++ b/test/Transforms/InstCombine/call-cast-target-inalloca.ll
@@ -8,7 +8,7 @@ declare void @takes_i32_inalloca(i32* inalloca)
define void @f() {
; CHECK-LABEL: define void @f()
- %args = alloca i32
+ %args = alloca inalloca i32
call void bitcast (void (i32)* @takes_i32 to void (i32*)*)(i32* inalloca %args)
; CHECK: call void bitcast
ret void
diff --git a/test/Transforms/InstCombine/div.ll b/test/Transforms/InstCombine/div.ll
index 1bf486f..9c7ba9b 100644
--- a/test/Transforms/InstCombine/div.ll
+++ b/test/Transforms/InstCombine/div.ll
@@ -156,3 +156,22 @@ define <2 x i64> @test18(<2 x i64> %x) nounwind {
; CHECK-NEXT: sub <2 x i64> zeroinitializer, %x
; CHECK-NEXT: ret <2 x i64>
}
+
+define i32 @test19(i32 %x) {
+ %A = udiv i32 1, %x
+ ret i32 %A
+; CHECK-LABEL: @test19(
+; CHECK-NEXT: icmp eq i32 %x, 1
+; CHECK-NEXT: zext i1 %{{.*}} to i32
+; CHECK-NEXT: ret i32
+}
+
+define i32 @test20(i32 %x) {
+ %A = sdiv i32 1, %x
+ ret i32 %A
+; CHECK-LABEL: @test20(
+; CHECK-NEXT: add i32 %x, 1
+; CHECK-NEXT: icmp ult i32 %{{.*}}, 3
+; CHECK-NEXT: select i1 %{{.*}}, i32 %x, i32 {{.*}}
+; CHECK-NEXT: ret i32
+}
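The expected IR for @test20 encodes a three-way case split: for signed %x,
sdiv 1, %x yields %x itself when %x is 1 or -1 and 0 otherwise, and both
nonzero cases fall in the range where %x + 1 <u 3 (that range also admits
%x == 0, which is division by zero and therefore free to return anything):

;   %A = sdiv i32 1, %x
; becomes
;   %t = add i32 %x, 1          ; %t <u 3 iff %x is -1, 0 or 1
;   %c = icmp ult i32 %t, 3
;   %r = select i1 %c, i32 %x, i32 0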
diff --git a/test/Transforms/InstCombine/gep-addrspace.ll b/test/Transforms/InstCombine/gep-addrspace.ll
index 24c355d..29511a3 100644
--- a/test/Transforms/InstCombine/gep-addrspace.ll
+++ b/test/Transforms/InstCombine/gep-addrspace.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S
+; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-win32"
@@ -17,3 +17,18 @@ ST:
ret void
}
+@array = internal addrspace(3) global [256 x float] zeroinitializer, align 4
+@scalar = internal addrspace(3) global float 0.000000e+00, align 4
+
+define void @keep_necessary_addrspacecast(i64 %i, float** %out0, float** %out1) {
+entry:
+; CHECK-LABEL: @keep_necessary_addrspacecast
+ %0 = getelementptr [256 x float]* addrspacecast ([256 x float] addrspace(3)* @array to [256 x float]*), i64 0, i64 %i
+; CHECK: addrspacecast float addrspace(3)* %{{[0-9]+}} to float*
+ %1 = getelementptr [0 x float]* addrspacecast (float addrspace(3)* @scalar to [0 x float]*), i64 0, i64 %i
+; CHECK: addrspacecast float addrspace(3)* %{{[0-9]+}} to float*
+ store float* %0, float** %out0, align 4
+ store float* %1, float** %out1, align 4
+ ret void
+}
+
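Two things to note in this file. The RUN-line fix means the test now actually
verifies its output with FileCheck instead of only checking that opt does not
crash. And what the new CHECK lines assert: InstCombine sinks the GEP through
the constant addrspacecast, so the index arithmetic happens on the original
addrspace(3) pointer and only a single cast of the final element pointer
survives (it cannot be dropped, because the stores need a generic float*):

;   getelementptr (addrspacecast @array to [256 x float]*), i64 0, i64 %i
; becomes
;   %p = getelementptr [256 x float] addrspace(3)* @array, i64 0, i64 %i
;   addrspacecast float addrspace(3)* %p to float*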
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 12a4744..f45897c 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -1356,3 +1356,12 @@ define i1 @icmp_ashr_ashr_ne(i32 %a, i32 %b) nounwind {
%z = icmp ne i32 %x, %y
ret i1 %z
}
+
+; CHECK-LABEL: @icmp_neg_cst_slt
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %a, 10
+; CHECK-NEXT: ret i1 [[CMP]]
+define i1 @icmp_neg_cst_slt(i32 %a) {
+ %1 = sub nsw i32 0, %a
+ %2 = icmp slt i32 %1, -10
+ ret i1 %2
+}
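Why the nsw flag matters for this fold: with nsw, sub i32 0, %a cannot wrap,
so both sides of the comparison may be negated and (-%a <s -10) becomes
(%a >s 10). Without nsw, %a == INT_MIN would negate back to INT_MIN and the
rewrite would be wrong for that input.

;   %1 = sub nsw i32 0, %a
;   %2 = icmp slt i32 %1, -10
; becomes
;   %2 = icmp sgt i32 %a, 10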
diff --git a/test/Transforms/InstCombine/memcpy-from-global.ll b/test/Transforms/InstCombine/memcpy-from-global.ll
index 58793ab..b5a0ab8 100644
--- a/test/Transforms/InstCombine/memcpy-from-global.ll
+++ b/test/Transforms/InstCombine/memcpy-from-global.ll
@@ -7,11 +7,11 @@ entry:
%lookupTable = alloca [128 x float], align 16 ; <[128 x float]*> [#uses=5]
%lookupTable1 = bitcast [128 x float]* %lookupTable to i8* ; <i8*> [#uses=1]
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %lookupTable1, i8* bitcast ([128 x float]* @C.0.1248 to i8*), i64 512, i32 16, i1 false)
-
+
; CHECK-LABEL: @test1(
; CHECK-NOT: alloca
; CHECK-NOT: call{{.*}}@llvm.memcpy
-
+
%tmp3 = shl i32 %hash, 2 ; <i32> [#uses=1]
%tmp5 = and i32 %tmp3, 124 ; <i32> [#uses=4]
%tmp753 = getelementptr [128 x float]* %lookupTable, i32 0, i32 %tmp5 ; <float*> [#uses=1]
@@ -37,6 +37,9 @@ entry:
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
%T = type { i8, [123 x i8] }
%U = type { i32, i32, i32, i32, i32 }
@@ -64,7 +67,30 @@ define void @test2() {
ret void
}
+define void @test2_addrspacecast() {
+ %A = alloca %T
+ %B = alloca %T
+ %a = addrspacecast %T* %A to i8 addrspace(1)*
+ %b = addrspacecast %T* %B to i8 addrspace(1)*
+
+; CHECK-LABEL: @test2_addrspacecast(
+
+; %A alloca is deleted
+; This doesn't exactly match what test2 does, because folding the type
+; cast into the alloca doesn't work for the addrspacecast yet.
+; CHECK-NEXT: alloca %T
+; CHECK-NEXT: addrspacecast
+
+; use @G instead of %A
+; CHECK-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %{{.*}},
+ call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
+ call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %b, i8 addrspace(1)* %a, i64 124, i32 4, i1 false)
+ call void @bar_as1(i8 addrspace(1)* %b)
+ ret void
+}
+
declare void @bar(i8*)
+declare void @bar_as1(i8 addrspace(1)*)
;; Should be able to eliminate the alloca.
@@ -78,11 +104,22 @@ define void @test3() {
ret void
}
+define void @test3_addrspacecast() {
+ %A = alloca %T
+ %a = bitcast %T* %A to i8*
+ call void @llvm.memcpy.p0i8.p1i8.i64(i8* %a, i8 addrspace(1)* addrspacecast (%T* @G to i8 addrspace(1)*), i64 124, i32 4, i1 false)
+ call void @bar(i8* %a) readonly
+; CHECK-LABEL: @test3_addrspacecast(
+; CHECK-NEXT: call void @bar(i8* getelementptr inbounds (%T* @G, i64 0, i32 0))
+ ret void
+}
+
+
define void @test4() {
%A = alloca %T
%a = bitcast %T* %A to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
- call void @baz(i8* byval %a)
+ call void @baz(i8* byval %a)
; CHECK-LABEL: @test4(
; CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0))
ret void
@@ -94,7 +131,7 @@ define void @test5() {
%a = bitcast %T* %A to i8*
call void @llvm.lifetime.start(i64 -1, i8* %a)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
- call void @baz(i8* byval %a)
+ call void @baz(i8* byval %a)
; CHECK-LABEL: @test5(
; CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0))
ret void
@@ -135,6 +172,18 @@ define void @test8() {
ret void
}
+
+define void @test8_addrspacecast() {
+ %A = alloca %U, align 16
+ %a = bitcast %U* %A to i8*
+ call void @llvm.memcpy.p0i8.p1i8.i64(i8* %a, i8 addrspace(1)* addrspacecast (%U* getelementptr ([2 x %U]* @H, i64 0, i32 1) to i8 addrspace(1)*), i64 20, i32 4, i1 false)
+ call void @bar(i8* %a) readonly
+; CHECK-LABEL: @test8_addrspacecast(
+; CHECK: llvm.memcpy
+; CHECK: bar
+ ret void
+}
+
define void @test9() {
%A = alloca %U, align 4
%a = bitcast %U* %A to i8*
@@ -144,3 +193,13 @@ define void @test9() {
; CHECK-NEXT: call void @bar(i8* bitcast (%U* getelementptr inbounds ([2 x %U]* @H, i64 0, i64 1) to i8*))
ret void
}
+
+define void @test9_addrspacecast() {
+ %A = alloca %U, align 4
+ %a = bitcast %U* %A to i8*
+ call void @llvm.memcpy.p0i8.p1i8.i64(i8* %a, i8 addrspace(1)* addrspacecast (%U* getelementptr ([2 x %U]* @H, i64 0, i32 1) to i8 addrspace(1)*), i64 20, i32 4, i1 false)
+ call void @bar(i8* %a) readonly
+; CHECK-LABEL: @test9_addrspacecast(
+; CHECK-NEXT: call void @bar(i8* bitcast (%U* getelementptr inbounds ([2 x %U]* @H, i64 0, i64 1) to i8*))
+ ret void
+}
diff --git a/test/Transforms/InstCombine/overflow-mul.ll b/test/Transforms/InstCombine/overflow-mul.ll
new file mode 100644
index 0000000..04019ae
--- /dev/null
+++ b/test/Transforms/InstCombine/overflow-mul.ll
@@ -0,0 +1,164 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; return mul(zext x, zext y) > MAX
+define i32 @pr4917_1(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4917_1(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+; CHECK-NOT: zext i32
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %overflow = icmp ugt i64 %mul64, 4294967295
+; CHECK: extractvalue { i32, i1 } [[MUL]], 1
+ %retval = zext i1 %overflow to i32
+ ret i32 %retval
+}
+
+; return mul(zext x, zext y) >= MAX+1
+define i32 @pr4917_1a(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4917_1a(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+; CHECK-NOT: zext i32
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %overflow = icmp uge i64 %mul64, 4294967296
+; CHECK: extractvalue { i32, i1 } [[MUL]], 1
+ %retval = zext i1 %overflow to i32
+ ret i32 %retval
+}
+
+; mul(zext x, zext y) > MAX
+; mul(x, y) is used
+define i32 @pr4917_2(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4917_2(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+; CHECK-NOT: zext i32
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %overflow = icmp ugt i64 %mul64, 4294967295
+; CHECK-DAG: [[VAL:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+ %mul32 = trunc i64 %mul64 to i32
+; CHECK-DAG: [[OVFL:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+ %retval = select i1 %overflow, i32 %mul32, i32 111
+; CHECK: select i1 [[OVFL]], i32 [[VAL]]
+ ret i32 %retval
+}
+
+; return mul(zext x, zext y) > MAX
+; mul is used in non-truncate
+define i64 @pr4917_3(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4917_3(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+ %mul64 = mul i64 %l, %r
+; CHECK-NOT: umul.with.overflow.i32
+ %overflow = icmp ugt i64 %mul64, 4294967295
+ %retval = select i1 %overflow, i64 %mul64, i64 111
+ ret i64 %retval
+}
+
+; return mul(zext x, zext y) <= MAX
+define i32 @pr4917_4(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4917_4(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+; CHECK-NOT: zext i32
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %overflow = icmp ule i64 %mul64, 4294967295
+; CHECK: extractvalue { i32, i1 } [[MUL]], 1
+; CHECK: xor
+ %retval = zext i1 %overflow to i32
+ ret i32 %retval
+}
+
+; return mul(zext x, zext y) < MAX+1
+define i32 @pr4917_4a(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4917_4a(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+; CHECK-NOT: zext i32
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %overflow = icmp ult i64 %mul64, 4294967296
+; CHECK: extractvalue { i32, i1 } [[MUL]], 1
+; CHECK: xor
+ %retval = zext i1 %overflow to i32
+ ret i32 %retval
+}
+
+; operands of mul are of different size
+define i32 @pr4917_5(i32 %x, i8 %y) nounwind {
+; CHECK-LABEL: @pr4917_5(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i8 %y to i64
+; CHECK: [[Y:%.*]] = zext i8 %y to i32
+ %mul64 = mul i64 %l, %r
+ %overflow = icmp ugt i64 %mul64, 4294967295
+ %mul32 = trunc i64 %mul64 to i32
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 [[Y]])
+; CHECK-DAG: [[VAL:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-DAG: [[OVFL:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+ %retval = select i1 %overflow, i32 %mul32, i32 111
+; CHECK: select i1 [[OVFL]], i32 [[VAL]]
+ ret i32 %retval
+}
+
+; mul(zext x, zext y) != zext trunc mul
+define i32 @pr4918_1(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4918_1(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %part32 = trunc i64 %mul64 to i32
+ %part64 = zext i32 %part32 to i64
+ %overflow = icmp ne i64 %mul64, %part64
+; CHECK: [[OVFL:%.*]] = extractvalue { i32, i1 } [[MUL:%.*]], 1
+ %retval = zext i1 %overflow to i32
+ ret i32 %retval
+}
+
+; mul(zext x, zext y) == zext trunc mul
+define i32 @pr4918_2(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4918_2(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %part32 = trunc i64 %mul64 to i32
+ %part64 = zext i32 %part32 to i64
+ %overflow = icmp eq i64 %mul64, %part64
+; CHECK: extractvalue { i32, i1 } [[MUL]]
+ %retval = zext i1 %overflow to i32
+; CHECK: xor
+ ret i32 %retval
+}
+
+; zext trunc mul != mul(zext x, zext y)
+define i32 @pr4918_3(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: @pr4918_3(
+entry:
+ %l = zext i32 %x to i64
+ %r = zext i32 %y to i64
+ %mul64 = mul i64 %l, %r
+; CHECK: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
+ %part32 = trunc i64 %mul64 to i32
+ %part64 = zext i32 %part32 to i64
+ %overflow = icmp ne i64 %part64, %mul64
+; CHECK: extractvalue { i32, i1 } [[MUL]], 1
+ %retval = zext i1 %overflow to i32
+ ret i32 %retval
+}
+
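The pattern every pr4917 test revolves around, reduced to its core (a sketch
under the same -instcombine run, not a test from the diff): a 32x32->64
multiply whose result only feeds an overflow check (and possibly a truncate
back to 32 bits) becomes a call to the overflow intrinsic.

define i1 @sketch_overflow_check(i32 %x, i32 %y) {
  %xw = zext i32 %x to i64
  %yw = zext i32 %y to i64
  %m  = mul i64 %xw, %yw               ; full 64-bit product
  %ov = icmp ugt i64 %m, 4294967295    ; does it still fit in 32 bits?
  ret i1 %ov
}
; becomes
;   %m  = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %x, i32 %y)
;   %ov = extractvalue { i32, i1 } %m, 1
; @pr4917_3 checks the guard: when the 64-bit product is live beyond the
; compare and a truncate, narrowing would lose the high half, so no fold.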
diff --git a/test/Transforms/InstCombine/pr19420.ll b/test/Transforms/InstCombine/pr19420.ll
new file mode 100644
index 0000000..23fa0a4
--- /dev/null
+++ b/test/Transforms/InstCombine/pr19420.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL
+; CHECK: mul <4 x i32> %in, <i32 0, i32 -32, i32 0, i32 -32>
+; CHECK-NEXT: ret
+define <4 x i32> @test_FoldShiftByConstant_CreateSHL(<4 x i32> %in) {
+ %mul.i = mul <4 x i32> %in, <i32 0, i32 -1, i32 0, i32 -1>
+ %vshl_n = shl <4 x i32> %mul.i, <i32 5, i32 5, i32 5, i32 5>
+ ret <4 x i32> %vshl_n
+}
+
+; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL2
+; CHECK: mul <8 x i16> %in, <i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32>
+; CHECK-NEXT: ret
+define <8 x i16> @test_FoldShiftByConstant_CreateSHL2(<8 x i16> %in) {
+ %mul.i = mul <8 x i16> %in, <i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1>
+ %vshl_n = shl <8 x i16> %mul.i, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ ret <8 x i16> %vshl_n
+}
+
+; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd
+; CHECK: mul <16 x i8> %in0, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
+; CHECK-NEXT: and <16 x i8> %vsra_n2, <i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32>
+; CHECK-NEXT: ret
+define <16 x i8> @test_FoldShiftByConstant_CreateAnd(<16 x i8> %in0) {
+ %vsra_n = ashr <16 x i8> %in0, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+ %tmp = add <16 x i8> %in0, %vsra_n
+ %vshl_n = shl <16 x i8> %tmp, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+ ret <16 x i8> %vshl_n
+}
+
+
+define i32 @bar(i32 %x, i32 %y) {
+ %a = lshr i32 %x, 4
+ %b = add i32 %a, %y
+ %c = shl i32 %b, 4
+ ret i32 %c
+}
+
+define <2 x i32> @bar_v2i32(<2 x i32> %x, <2 x i32> %y) {
+ %a = lshr <2 x i32> %x, <i32 5, i32 5>
+ %b = add <2 x i32> %a, %y
+ %c = shl <2 x i32> %b, <i32 5, i32 5>
+ ret <2 x i32> %c
+}
+
+
+
+
+define i32 @foo(i32 %x, i32 %y) {
+ %a = lshr i32 %x, 4
+ %b = and i32 %a, 8
+ %c = add i32 %b, %y
+ %d = shl i32 %c, 4
+ ret i32 %d
+}
+
+define <2 x i32> @foo_v2i32(<2 x i32> %x, <2 x i32> %y) {
+ %a = lshr <2 x i32> %x, <i32 4, i32 4>
+ %b = and <2 x i32> %a, <i32 8, i32 8>
+ %c = add <2 x i32> %b, %y
+ %d = shl <2 x i32> %c, <i32 4, i32 4>
+ ret <2 x i32> %d
+}
+
+
+
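The two CreateSHL tests exercise an identity of plain modular arithmetic,
(x * C1) << C2 == x * (C1 << C2), so the shift gets absorbed into the
multiply's constant; for the vectors above, <0,-1,0,-1> << 5 is <0,-32,0,-32>:

;   %m = mul <4 x i32> %in, <i32 0, i32 -1, i32 0, i32 -1>
;   %s = shl <4 x i32> %m, <i32 5, i32 5, i32 5, i32 5>
; becomes
;   %s = mul <4 x i32> %in, <i32 0, i32 -32, i32 0, i32 -32>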
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index 1458bde..2213be1 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -1031,3 +1031,103 @@ define i32 @test67(i16 %x) {
; CHECK: lshr exact i32 %2, 1
; CHECK: xor i32 %3, 42
}
+
+; SMIN(SMIN(X, 11), 92) -> SMIN(X, 11)
+define i32 @test68(i32 %x) {
+entry:
+ %cmp = icmp slt i32 11, %x
+ %cond = select i1 %cmp, i32 11, i32 %x
+ %cmp3 = icmp slt i32 92, %cond
+ %retval = select i1 %cmp3, i32 92, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test68(
+; CHECK: ret i32 %cond
+}
+
+; MIN(MIN(X, 24), 83) -> MIN(X, 24)
+define i32 @test69(i32 %x) {
+entry:
+ %cmp = icmp ult i32 24, %x
+ %cond = select i1 %cmp, i32 24, i32 %x
+ %cmp3 = icmp ult i32 83, %cond
+ %retval = select i1 %cmp3, i32 83, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test69(
+; CHECK: ret i32 %cond
+}
+
+; SMAX(SMAX(X, 75), 36) -> SMAX(X, 75)
+define i32 @test70(i32 %x) {
+entry:
+ %cmp = icmp slt i32 %x, 75
+ %cond = select i1 %cmp, i32 75, i32 %x
+ %cmp3 = icmp slt i32 %cond, 36
+ %retval = select i1 %cmp3, i32 36, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test70(
+; CHECK: ret i32 %cond
+}
+
+; MAX(MAX(X, 68), 47) -> MAX(X, 68)
+define i32 @test71(i32 %x) {
+entry:
+ %cmp = icmp ult i32 %x, 68
+ %cond = select i1 %cmp, i32 68, i32 %x
+ %cmp3 = icmp ult i32 %cond, 47
+ %retval = select i1 %cmp3, i32 47, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test71(
+; CHECK: ret i32 %cond
+}
+
+; SMIN(SMIN(X, 92), 11) -> SMIN(X, 11)
+define i32 @test72(i32 %x) {
+ %cmp = icmp sgt i32 %x, 92
+ %cond = select i1 %cmp, i32 92, i32 %x
+ %cmp3 = icmp sgt i32 %cond, 11
+ %retval = select i1 %cmp3, i32 11, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test72(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp sgt i32 %x, 11
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 11, i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+; MIN(MIN(X, 83), 24) -> MIN(X, 24)
+define i32 @test73(i32 %x) {
+ %cmp = icmp ugt i32 %x, 83
+ %cond = select i1 %cmp, i32 83, i32 %x
+ %cmp3 = icmp ugt i32 %cond, 24
+ %retval = select i1 %cmp3, i32 24, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test73(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ugt i32 %x, 24
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 24, i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+; SMAX(SMAX(X, 36), 75) -> SMAX(X, 75)
+define i32 @test74(i32 %x) {
+ %cmp = icmp slt i32 %x, 36
+ %cond = select i1 %cmp, i32 36, i32 %x
+ %cmp3 = icmp slt i32 %cond, 75
+ %retval = select i1 %cmp3, i32 75, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test74(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp slt i32 %x, 75
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 75, i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+}
+
+; MAX(MAX(X, 47), 68) -> MAX(X, 68)
+define i32 @test75(i32 %x) {
+ %cmp = icmp ult i32 %x, 47
+ %cond = select i1 %cmp, i32 47, i32 %x
+ %cmp3 = icmp ult i32 %cond, 68
+ %retval = select i1 %cmp3, i32 68, i32 %cond
+ ret i32 %retval
+; CHECK-LABEL: @test75(
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ult i32 %x, 68
+; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 68, i32 %x
+; CHECK-NEXT: ret i32 [[SEL]]
+} \ No newline at end of file
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index b1082f0..5586bb6 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -36,17 +36,52 @@ define i32 @test4(i8 %A) {
define i32 @test5(i32 %A) {
; CHECK-LABEL: @test5(
; CHECK: ret i32 undef
- %B = lshr i32 %A, 32 ;; shift all bits out
+ %B = lshr i32 %A, 32 ;; shift all bits out
ret i32 %B
}
+define <4 x i32> @test5_splat_vector(<4 x i32> %A) {
+; CHECK-LABEL: @test5_splat_vector(
+; CHECK: ret <4 x i32> undef
+ %B = lshr <4 x i32> %A, <i32 32, i32 32, i32 32, i32 32> ;; shift all bits out
+ ret <4 x i32> %B
+}
+
+define <4 x i32> @test5_zero_vector(<4 x i32> %A) {
+; CHECK-LABEL: @test5_zero_vector(
+; CHECK-NEXT: ret <4 x i32> %A
+ %B = lshr <4 x i32> %A, zeroinitializer
+ ret <4 x i32> %B
+}
+
+define <4 x i32> @test5_non_splat_vector(<4 x i32> %A) {
+; CHECK-LABEL: @test5_non_splat_vector(
+; CHECK-NOT: ret <4 x i32> undef
+ %B = shl <4 x i32> %A, <i32 32, i32 1, i32 2, i32 3>
+ ret <4 x i32> %B
+}
+
define i32 @test5a(i32 %A) {
; CHECK-LABEL: @test5a(
; CHECK: ret i32 undef
- %B = shl i32 %A, 32 ;; shift all bits out
+ %B = shl i32 %A, 32 ;; shift all bits out
ret i32 %B
}
+define <4 x i32> @test5a_splat_vector(<4 x i32> %A) {
+; CHECK-LABEL: @test5a_splat_vector(
+; CHECK: ret <4 x i32> undef
+ %B = shl <4 x i32> %A, <i32 32, i32 32, i32 32, i32 32> ;; shift all bits out
+ ret <4 x i32> %B
+}
+
+define <4 x i32> @test5a_non_splat_vector(<4 x i32> %A) {
+; CHECK-LABEL: @test5a_non_splat_vector(
+; CHECK-NOT: ret <4 x i32> undef
+ %B = shl <4 x i32> %A, <i32 32, i32 1, i32 2, i32 3>
+ ret <4 x i32> %B
+}
+
define i32 @test5b() {
; CHECK-LABEL: @test5b(
; CHECK: ret i32 -1
@@ -82,7 +117,7 @@ define i32 @test6a(i32 %A) {
define i32 @test7(i8 %A) {
; CHECK-LABEL: @test7(
; CHECK-NEXT: ret i32 -1
- %shift.upgrd.3 = zext i8 %A to i32
+ %shift.upgrd.3 = zext i8 %A to i32
%B = ashr i32 -1, %shift.upgrd.3 ;; Always equal to -1
ret i32 %B
}
@@ -232,7 +267,7 @@ define i1 @test16(i32 %X) {
; CHECK-NEXT: and i32 %X, 16
; CHECK-NEXT: icmp ne i32
; CHECK-NEXT: ret i1
- %tmp.3 = ashr i32 %X, 4
+ %tmp.3 = ashr i32 %X, 4
%tmp.6 = and i32 %tmp.3, 1
%tmp.7 = icmp ne i32 %tmp.6, 0
ret i1 %tmp.7
@@ -344,6 +379,20 @@ define i32 @test25(i32 %tmp.2, i32 %AA) {
ret i32 %tmp.6
}
+define <2 x i32> @test25_vector(<2 x i32> %tmp.2, <2 x i32> %AA) {
+; CHECK-LABEL: @test25_vector(
+; CHECK: %tmp.3 = lshr <2 x i32> %tmp.2, <i32 17, i32 17>
+; CHECK-NEXT: shl <2 x i32> %tmp.3, <i32 17, i32 17>
+; CHECK-NEXT: add <2 x i32> %tmp.51, %AA
+; CHECK-NEXT: and <2 x i32> %x2, <i32 -131072, i32 -131072>
+; CHECK-NEXT: ret <2 x i32>
+ %x = lshr <2 x i32> %AA, <i32 17, i32 17>
+ %tmp.3 = lshr <2 x i32> %tmp.2, <i32 17, i32 17>
+ %tmp.5 = add <2 x i32> %tmp.3, %x
+ %tmp.6 = shl <2 x i32> %tmp.5, <i32 17, i32 17>
+ ret <2 x i32> %tmp.6
+}
+
;; handle casts between shifts.
define i32 @test26(i32 %A) {
; CHECK-LABEL: @test26(
@@ -365,12 +414,12 @@ define i1 @test27(i32 %x) nounwind {
%z = trunc i32 %y to i1
ret i1 %z
}
-
+
define i8 @test28(i8 %x) {
entry:
; CHECK-LABEL: @test28(
; CHECK: icmp slt i8 %x, 0
-; CHECK-NEXT: br i1
+; CHECK-NEXT: br i1
%tmp1 = lshr i8 %x, 7
%cond1 = icmp ne i8 %tmp1, 0
br i1 %cond1, label %bb1, label %bb2
@@ -476,7 +525,7 @@ entry:
%ins = or i128 %tmp23, %tmp27
%tmp45 = lshr i128 %ins, 64
ret i128 %tmp45
-
+
; CHECK-LABEL: @test36(
; CHECK: %tmp231 = or i128 %B, %A
; CHECK: %ins = and i128 %tmp231, 18446744073709551615
@@ -492,7 +541,7 @@ entry:
%tmp45 = lshr i128 %ins, 64
%tmp46 = trunc i128 %tmp45 to i64
ret i64 %tmp46
-
+
; CHECK-LABEL: @test37(
; CHECK: %tmp23 = shl nuw nsw i128 %tmp22, 32
; CHECK: %ins = or i128 %tmp23, %A
@@ -780,3 +829,32 @@ bb11: ; preds = %bb8
bb12: ; preds = %bb11, %bb8, %bb
ret void
}
+
+define i32 @test64(i32 %a) {
+; CHECK-LABEL: @test64(
+; CHECK-NEXT: ret i32 undef
+ %b = ashr i32 %a, 32 ; shift all bits out
+ ret i32 %b
+}
+
+define <4 x i32> @test64_splat_vector(<4 x i32> %a) {
+; CHECK-LABEL: @test64_splat_vector
+; CHECK-NEXT: ret <4 x i32> undef
+ %b = ashr <4 x i32> %a, <i32 32, i32 32, i32 32, i32 32> ; shift all bits out
+ ret <4 x i32> %b
+}
+
+define <4 x i32> @test64_non_splat_vector(<4 x i32> %a) {
+; CHECK-LABEL: @test64_non_splat_vector
+; CHECK-NOT: ret <4 x i32> undef
+ %b = ashr <4 x i32> %a, <i32 32, i32 0, i32 1, i32 2> ; shift all bits out
+ ret <4 x i32> %b
+}
+
+define <2 x i65> @test_65(<2 x i64> %t) {
+; CHECK-LABEL: @test_65
+ %a = zext <2 x i64> %t to <2 x i65>
+ %sext = shl <2 x i65> %a, <i65 33, i65 33>
+ %b = ashr <2 x i65> %sext, <i65 33, i65 33>
+ ret <2 x i65> %b
+}
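The splat/non-splat split in the new shift tests is deliberate: shifting a
lane by an amount >= its bit width is undefined, so a splat-of-32 shift on
<4 x i32> makes every lane undefined and the whole result may fold to undef,
while a mixed amount vector leaves well-defined lanes that must be kept:

;   lshr <4 x i32> %A, <i32 32, i32 32, i32 32, i32 32>  ; all lanes out -> undef
;   shl  <4 x i32> %A, <i32 32, i32 1, i32 2, i32 3>     ; lanes 1-3 defined -> keep
; @test_65 runs the shl/ashr sign-extension idiom at the odd width i65 with
; no CHECK lines beyond the label, so it presumably guards against an assert.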
diff --git a/test/Transforms/InstCombine/strlen-1.ll b/test/Transforms/InstCombine/strlen-1.ll
index 4fa5b4f..4a3caf2 100644
--- a/test/Transforms/InstCombine/strlen-1.ll
+++ b/test/Transforms/InstCombine/strlen-1.ll
@@ -5,6 +5,7 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@hello = constant [6 x i8] c"hello\00"
+@longer = constant [7 x i8] c"longer\00"
@null = constant [1 x i8] zeroinitializer
@null_hello = constant [7 x i8] c"\00hello\00"
@nullstring = constant i8 0
@@ -85,6 +86,17 @@ define i1 @test_simplify8() {
; CHECK-NEXT: ret i1 false
}
+define i32 @test_simplify9(i1 %x) {
+; CHECK-LABEL: @test_simplify9
+ %hello = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %longer = getelementptr [7 x i8]* @longer, i32 0, i32 0
+ %s = select i1 %x, i8* %hello, i8* %longer
+ %l = call i32 @strlen(i8* %s)
+; CHECK-NEXT: select i1 %x, i32 5, i32 6
+ ret i32 %l
+; CHECK-NEXT: ret
+}
+
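@test_simplify9 is the new case here: when both arms of a select are strings
of known length, strlen distributes over the select and the library call
disappears entirely:

;   %s = select i1 %x, i8* @hello, i8* @longer   ; "hello" / "longer"
;   %l = call i32 @strlen(i8* %s)
; becomes
;   %l = select i1 %x, i32 5, i32 6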
; Check cases that shouldn't be simplified.
define i32 @test_no_simplify1() {
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index d12412a..41d2b29 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -1,4 +1,5 @@
; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i16 @test1(float %f) {
entry:
@@ -209,4 +210,369 @@ define <4 x float> @test_select(float %f, float %g) {
ret <4 x float> %ret
}
+; We should optimize these two redundant insertqi into one
+; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+; CHECK-NOT: insertqi
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
+ ret <2 x i64> %2
+}
+
+; The result of this insert is the second arg, since the top 64 bits of
+; the result are undefined, and we copy the bottom 64 bits from the
+; second arg
+; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: ret <2 x i64> %i
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
+ ret <2 x i64> %1
+}
+
+; Test the several types of ranges and ordering that exist for two insertqi
+; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
+ ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+ ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
+ ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+ ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+ ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
+; CHECK: ret <2 x i64> %[[RES]]
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
+ ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+ ret <2 x i64> %2
+}
+
+; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
+define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
+ ret <2 x i64> %2
+}
+
+
+; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
+
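For reading the range tests above: insertqi(%v, %i, len, idx) copies the low
len bits of %i into %v starting at bit idx, with the top 64 bits of the
result undefined (SSE4A INSERTQ). The tests enumerate how two back-to-back
inserts can relate (contained, overlapping, adjacent, or disjoint ranges, in
both orders) and expect every shape except disjoint to collapse into one
insert anchored at bit 0. Schematically, for the adjacent case:

;   %1 = insertqi(%v, %i, 32, 0)   ; writes result bits [0,32)
;   %2 = insertqi(%1, %i, 16, 32)  ; writes result bits [32,48)
; is expected by the test to become
;   insertqi(%v, %i, 48, 0)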
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
+define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps(
+; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ ret <4 x float> %a
+}
+
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
+define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_256(
+; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
+ ret <8 x float> %a
+}
+
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i32>)
+define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd(
+; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> <i32 2, i32 0>)
+ ret <2 x double> %a
+}
+
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i32>)
+define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_256(
+; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> <i32 3, i32 1, i32 2, i32 0>)
+ ret <4 x double> %a
+}
+
+define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_zero(
+; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+ %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
+ ret <4 x float> %a
+}
+
+define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
+; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
+ ret <8 x float> %a
+}
+
+define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_zero(
+; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+ %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> zeroinitializer)
+ ret <2 x double> %a
+}
+
+define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
+; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> zeroinitializer)
+ ret <4 x double> %a
+}
+
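A detail that explains the otherwise surprising mappings above: with a
constant selector these intrinsics become plain shufflevectors, but
vpermilvar.pd reads bit 1 (not bit 0) of each selector element, which is why
<i32 2, i32 0> in @test_vpermilvar_pd yields the mask <i32 1, i32 0>, and the
256-bit variants permute within each 128-bit lane, which is why the zero
selector in @test_vpermilvar_ps_256_zero still yields <0,0,0,0,4,4,4,4>:

;   vpermilvar.pd(%v, <i32 2, i32 0>)   ; bit 1 of the selectors: 1, 0
; becomes
;   shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>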
+define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
+ %S = bitcast i32 1 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
+ %6 = bitcast <8 x i16> %5 to <4 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <4 x i32> %8 to <2 x i64>
+ %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <2 x i64> %10 to <8 x i16>
+ %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
+ %13 = bitcast <8 x i16> %12 to <4 x i32>
+ %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
+ %15 = bitcast <4 x i32> %14 to <2 x i64>
+ %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
+ ret <2 x i64> %16
+; CHECK: test_sse2_1
+; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
+}
+
+define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
+ %S = bitcast i32 1 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
+ %6 = bitcast <16 x i16> %5 to <8 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <8 x i32> %8 to <4 x i64>
+ %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <4 x i64> %10 to <16 x i16>
+ %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
+ %13 = bitcast <16 x i16> %12 to <8 x i32>
+ %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
+ %15 = bitcast <8 x i32> %14 to <4 x i64>
+ %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
+ ret <4 x i64> %16
+; CHECK: test_avx2_1
+; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
+}
+
+define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
+ %S = bitcast i32 128 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
+ %6 = bitcast <8 x i16> %5 to <4 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <4 x i32> %8 to <2 x i64>
+ %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <2 x i64> %10 to <8 x i16>
+ %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
+ %13 = bitcast <8 x i16> %12 to <4 x i32>
+ %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
+ %15 = bitcast <4 x i32> %14 to <2 x i64>
+ %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
+ ret <2 x i64> %16
+; CHECK: test_sse2_0
+; CHECK: ret <2 x i64> zeroinitializer
+}
+
+define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
+ %S = bitcast i32 128 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
+ %6 = bitcast <16 x i16> %5 to <8 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <8 x i32> %8 to <4 x i64>
+ %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <4 x i64> %10 to <16 x i16>
+ %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
+ %13 = bitcast <16 x i16> %12 to <8 x i32>
+ %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
+ %15 = bitcast <8 x i32> %14 to <4 x i64>
+ %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
+ ret <4 x i64> %16
+; CHECK: test_avx2_0
+; CHECK: ret <4 x i64> zeroinitializer
+}
+define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
+ %S = bitcast i32 1 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
+ %6 = bitcast <8 x i16> %5 to <4 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <4 x i32> %8 to <2 x i64>
+ %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <2 x i64> %10 to <8 x i16>
+ %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
+ %13 = bitcast <8 x i16> %12 to <4 x i32>
+ %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
+ %15 = bitcast <4 x i32> %14 to <2 x i64>
+ %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
+ ret <2 x i64> %16
+; CHECK: test_sse2_psrl_1
+; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
+}
+
+define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
+ %S = bitcast i32 1 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+ %6 = bitcast <16 x i16> %5 to <8 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <8 x i32> %8 to <4 x i64>
+ %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <4 x i64> %10 to <16 x i16>
+ %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
+ %13 = bitcast <16 x i16> %12 to <8 x i32>
+ %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
+ %15 = bitcast <8 x i32> %14 to <4 x i64>
+ %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
+ ret <4 x i64> %16
+; CHECK: test_avx2_psrl_1
+; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
+}
+
+define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
+ %S = bitcast i32 128 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
+ %6 = bitcast <8 x i16> %5 to <4 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <4 x i32> %8 to <2 x i64>
+ %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <2 x i64> %10 to <8 x i16>
+ %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
+ %13 = bitcast <8 x i16> %12 to <4 x i32>
+ %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
+ %15 = bitcast <4 x i32> %14 to <2 x i64>
+ %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
+ ret <2 x i64> %16
+; CHECK: test_sse2_psrl_0
+; CHECK: ret <2 x i64> zeroinitializer
+}
+
+define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
+ %S = bitcast i32 128 to i32
+ %1 = zext i32 %S to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ %3 = insertelement <2 x i64> %2, i64 0, i32 1
+ %4 = bitcast <2 x i64> %3 to <8 x i16>
+ %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
+ %6 = bitcast <16 x i16> %5 to <8 x i32>
+ %7 = bitcast <2 x i64> %3 to <4 x i32>
+ %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
+ %9 = bitcast <8 x i32> %8 to <4 x i64>
+ %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
+ %11 = bitcast <4 x i64> %10 to <16 x i16>
+ %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
+ %13 = bitcast <16 x i16> %12 to <8 x i32>
+ %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
+ %15 = bitcast <8 x i32> %14 to <4 x i64>
+ %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
+ ret <4 x i64> %16
+; CHECK: test_avx2_psrl_0
+; CHECK: ret <4 x i64> zeroinitializer
+}
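+
+; Illustrative sketch (not from the original patch; the function name is
+; invented): under the x86 vector shift semantics, a single psrl whose count
+; is >= the element width already yields zero, which is the reasoning behind
+; the *_0 chains above folding to zeroinitializer.
+define <2 x i64> @sketch_psrli_oversized(<2 x i64> %v) nounwind readnone {
+  ; shift count 128 >= the 64-bit element width, so instcombine is expected
+  ; to fold this to: ret <2 x i64> zeroinitializer
+  %r = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %v, i32 128)
+  ret <2 x i64> %r
+}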
+
+declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
+declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
+declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
+declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
+attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index a409a91..fc0f8bd 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -244,4 +244,164 @@ define <4 x i8> @test16b(i8 %ele) {
%tmp1 = shl <8 x i8> %tmp0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i8> %tmp2
-} \ No newline at end of file
+}
+
+; If the composition of the two shuffles is the identity permutation, both shuffles can be removed.
+define <4 x i32> @shuffle_17ident(<4 x i32> %v) nounwind uwtable {
+; CHECK-LABEL: @shuffle_17ident(
+; CHECK-NOT: shufflevector
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %shuffle2 = shufflevector <4 x i32> %shuffle, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+ ret <4 x i32> %shuffle2
+}
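+
+; Worked composition (illustrative): with mask1 = <1,2,3,0> and
+; mask2 = <3,0,1,2>, the combined mask is mask1[mask2[i]] =
+; <mask1[3], mask1[0], mask1[1], mask1[2]> = <0,1,2,3>, i.e. the identity,
+; so both shuffles fold away and the function returns %v directly.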
+
+; A swizzle can be moved to after the operation it feeds.
+define <4 x i32> @shuffle_17and(<4 x i32> %v1, <4 x i32> %v2) nounwind uwtable {
+; CHECK-LABEL: @shuffle_17and(
+; CHECK-NOT: shufflevector
+; CHECK: and <4 x i32> %v1, %v2
+; CHECK: shufflevector
+ %t1 = shufflevector <4 x i32> %v1, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %t2 = shufflevector <4 x i32> %v2, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %r = and <4 x i32> %t1, %t2
+ ret <4 x i32> %r
+}
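+
+; Expected canonical form (sketch, matching the CHECK lines above): both
+; operands are swizzled by the same mask, so the binop can be done first and
+; a single shuffle kept:
+;   %a = and <4 x i32> %v1, %v2
+;   %r = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>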
+
+define <4 x i32> @shuffle_17add(<4 x i32> %v1, <4 x i32> %v2) nounwind uwtable {
+; CHECK-LABEL: @shuffle_17add(
+; CHECK-NOT: shufflevector
+; CHECK: add <4 x i32> %v1, %v2
+; CHECK: shufflevector
+ %t1 = shufflevector <4 x i32> %v1, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %t2 = shufflevector <4 x i32> %v2, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %r = add <4 x i32> %t1, %t2
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_17addnsw(<4 x i32> %v1, <4 x i32> %v2) nounwind uwtable {
+; CHECK-LABEL: @shuffle_17addnsw(
+; CHECK-NOT: shufflevector
+; CHECK: add nsw <4 x i32> %v1, %v2
+; CHECK: shufflevector
+ %t1 = shufflevector <4 x i32> %v1, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %t2 = shufflevector <4 x i32> %v2, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %r = add nsw <4 x i32> %t1, %t2
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_17addnuw(<4 x i32> %v1, <4 x i32> %v2) nounwind uwtable {
+; CHECK-LABEL: @shuffle_17addnuw(
+; CHECK-NOT: shufflevector
+; CHECK: add nuw <4 x i32> %v1, %v2
+; CHECK: shufflevector
+ %t1 = shufflevector <4 x i32> %v1, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %t2 = shufflevector <4 x i32> %v2, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %r = add nuw <4 x i32> %t1, %t2
+ ret <4 x i32> %r
+}
+
+define <4 x float> @shuffle_17fsub(<4 x float> %v1, <4 x float> %v2) nounwind uwtable {
+; CHECK-LABEL: @shuffle_17fsub(
+; CHECK-NOT: shufflevector
+; CHECK: fsub <4 x float> %v1, %v2
+; CHECK: shufflevector
+ %t1 = shufflevector <4 x float> %v1, <4 x float> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %t2 = shufflevector <4 x float> %v2, <4 x float> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %r = fsub <4 x float> %t1, %t2
+ ret <4 x float> %r
+}
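+
+; Note (illustrative): the same rewrite applies to the flag-carrying and
+; floating-point variants above; the nsw/nuw flags and the fsub operand
+; order must survive the transform, e.g.:
+;   %s = fsub <4 x float> %v1, %v2
+;   %r = shufflevector <4 x float> %s, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>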
+
+define <4 x i32> @shuffle_17addconst(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: @shuffle_17addconst(
+; CHECK-NOT: shufflevector
+; CHECK: [[VAR1:%[a-zA-Z0-9.]+]] = add <4 x i32> %v1, <i32 4, i32 1, i32 2, i32 3>
+; CHECK: [[VAR2:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[VAR1]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK: ret <4 x i32> [[VAR2]]
+ %t1 = shufflevector <4 x i32> %v1, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+ %r = add <4 x i32> %t1, <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i32> %r
+}
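+
+; Worked derivation (illustrative) of the <i32 4, i32 1, i32 2, i32 3>
+; constant in the CHECK lines above: sinking the shuffle requires permuting
+; the addend by the inverse of mask <1,2,3,0>, which is <3,0,1,2>; applying
+; that to <1,2,3,4> gives <4,1,2,3>.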
+
+define <4 x i32> @shuffle_17add2(<4 x i32> %v) {
+; CHECK-LABEL: @shuffle_17add2(
+; CHECK-NOT: shufflevector
+; CHECK: [[VAR:%[a-zA-Z0-9.]+]] = shl <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: ret <4 x i32> [[VAR]]
+ %t1 = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %t2 = add <4 x i32> %t1, %t1
+ %r = shufflevector <4 x i32> %t2, <4 x i32> zeroinitializer,
+ <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_17mulsplat(<4 x i32> %v) {
+; CHECK-LABEL: @shuffle_17mulsplat(
+; CHECK-NOT: shufflevector
+; CHECK: [[VAR1:%[a-zA-Z0-9.]+]] = mul <4 x i32> %v, %v
+; CHECK: [[VAR2:%[a-zA-Z0-9.]+]] = shufflevector <4 x i32> [[VAR1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: ret <4 x i32> [[VAR2]]
+ %s1 = shufflevector <4 x i32> %v,
+ <4 x i32> zeroinitializer,
+ <4 x i32> zeroinitializer
+ %m1 = mul <4 x i32> %s1, %s1
+ %s2 = shufflevector <4 x i32> %m1,
+ <4 x i32> zeroinitializer,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %s2
+}
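+
+; Expected form (sketch, per the CHECK lines above): a splat of a product of
+; splats is the splat of the scalar product, so the multiply is done on %v
+; itself and a single splat shuffle of lane 0 remains:
+;   %m = mul <4 x i32> %v, %v
+;   %r = shufflevector <4 x i32> %m, <4 x i32> undef, <4 x i32> zeroinitializer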
+
+; Do not reorder a shuffle and a binop when the shuffles' LHS operands differ in size.
+define <2 x i32> @pr19717(<4 x i32> %in0, <2 x i32> %in1) {
+; CHECK-LABEL: @pr19717(
+; CHECK: shufflevector
+; CHECK: shufflevector
+; CHECK: mul
+ %shuffle = shufflevector <4 x i32> %in0, <4 x i32> %in0, <2 x i32> zeroinitializer
+ %shuffle4 = shufflevector <2 x i32> %in1, <2 x i32> %in1, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %shuffle4
+ ret <2 x i32> %mul
+}
+
+define <4 x i16> @pr19717a(<8 x i16> %in0, <8 x i16> %in1) {
+; CHECK-LABEL: @pr19717a(
+; CHECK: [[VAR1:%[a-zA-Z0-9.]+]] = mul <8 x i16> %in0, %in1
+; CHECK: [[VAR2:%[a-zA-Z0-9.]+]] = shufflevector <8 x i16> [[VAR1]], <8 x i16> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+; CHECK: ret <4 x i16> [[VAR2]]
+ %shuffle = shufflevector <8 x i16> %in0, <8 x i16> %in0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+ %shuffle1 = shufflevector <8 x i16> %in1, <8 x i16> %in1, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+ %mul = mul <4 x i16> %shuffle, %shuffle1
+ ret <4 x i16> %mul
+}
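+
+; Sketch of the folded form (matching the CHECK lines above): both operands
+; splat lane 5 of an <8 x i16> source, so the multiply can be done in the
+; wide type and a single narrowing splat of lane 5 kept:
+;   %m = mul <8 x i16> %in0, %in1
+;   %r = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>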
+
+define <8 x i8> @pr19730(<16 x i8> %in0) {
+; CHECK-LABEL: @pr19730(
+; CHECK: shufflevector
+ %shuffle = shufflevector <16 x i8> %in0, <16 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ %shuffle1 = shufflevector <8 x i8> %shuffle, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i8> %shuffle1
+}
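+
+; Worked composition (illustrative): the first shuffle takes elements 7..0
+; of the 16-wide input and the second reverses them again, so the pair folds
+; to a single narrowing shuffle with mask <0,1,2,3,4,5,6,7>; that remaining
+; shuffle cannot be removed because it changes the vector width.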
+
+define i32 @pr19737(<4 x i32> %in0) {
+; CHECK-LABEL: @pr19737(
+; CHECK: [[VAR:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> %in0, i32 0
+; CHECK: ret i32 [[VAR]]
+ %shuffle.i = shufflevector <4 x i32> zeroinitializer, <4 x i32> %in0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %neg.i = xor <4 x i32> %shuffle.i, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %and.i = and <4 x i32> %in0, %neg.i
+ %rv = extractelement <4 x i32> %and.i, i32 0
+ ret i32 %rv
+}
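+
+; Worked demanded-elements chain (illustrative): only lane 0 of %and.i is
+; used; lane 0 of %shuffle.i comes from zeroinitializer, so the xor yields
+; -1 there and the and leaves lane 0 of %in0 unchanged, reducing the whole
+; function to the extractelement shown in the CHECK lines.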