author    | Hao Liu <Hao.Liu@arm.com> | 2013-11-19 02:17:05 +0000
committer | Hao Liu <Hao.Liu@arm.com> | 2013-11-19 02:17:05 +0000
commit    | 36c7806f4eacd676932ba630246f88e0e37b1cd4 (patch)
tree      | 2c9884d3bdad08211208fbb8e21a6ed8d423d93e /test
parent    | e40e68add7f17f6ad5cd5e85ea44b149f6935147 (diff)
Implement the AArch64 NEON instruction classes SIMD lsone and SIMD lsone-post (load/store one element, with and without post-indexed addressing).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@195078 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test')
-rw-r--r-- | test/CodeGen/AArch64/neon-simd-ldst-one.ll         | 2113
-rw-r--r-- | test/CodeGen/AArch64/neon-simd-post-ldst-one.ll    |  319
-rw-r--r-- | test/MC/AArch64/neon-diagnostics.s                 |  119
-rw-r--r-- | test/MC/AArch64/neon-simd-ldst-one-elem.s          |  325
-rw-r--r-- | test/MC/Disassembler/AArch64/neon-instructions.txt |   84
5 files changed, 2960 insertions, 0 deletions
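For context before the raw diff: the CodeGen tests below check that the arm_neon.h "load one element/structure" intrinsics lower to the new instructions. A minimal C sketch of the kind of source behind these IR patterns (the wrapper names are illustrative, not part of the patch; the expected instructions are taken from the CHECK lines below, with concrete registers picked for readability):

    #include <arm_neon.h>

    /* Load one byte and replicate it to all 16 lanes of a Q register.
       Per the tests below, this should lower to: ld1r {v0.16b}, [x0]. */
    int8x16_t dup_byte(const int8_t *p) {
      return vld1q_dup_s8(p);
    }

    /* Load one pair of i16 values into lane 3 of a two-register group.
       Per the tests below, this should lower to: ld2 {v0.h, v1.h}[3], [x0]. */
    int16x4x2_t load_pair_lane3(const int16_t *p, int16x4x2_t acc) {
      return vld2_lane_s16(p, acc, 3);
    }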
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
new file mode 100644
index 0000000..3f28320
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
@@ -0,0 +1,2113 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
+define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1q_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+  %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %lane
+}
+
+define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1q_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %1 = insertelement <8 x i16> undef, i16 %0, i32 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %lane
+}
+
+define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1q_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %1 = insertelement <4 x i32> undef, i32 %0, i32 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %lane
+}
+
+define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1q_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %1 = insertelement <2 x i64> undef, i64 %0, i32 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %lane
+}
+
+define <4 x float> @test_vld1q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1q_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = load float* %a, align 4
+  %1 = insertelement <4 x float> undef, float %0, i32 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %lane
+}
+
+define <2 x double> @test_vld1q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1q_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = load double* %a, align 8
+  %1 = insertelement <2 x double> undef, double %0, i32 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  ret <2 x double> %lane
+}
+
+define <8 x i8> @test_vld1_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  ret <8 x i8> %lane
+}
+
+define <4 x i16> @test_vld1_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %1 = insertelement <4 x i16> undef, i16 %0, i32 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  ret <4 x i16> %lane
+}
+
+define <2 x i32> @test_vld1_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %1 = insertelement <2 x i32> undef, i32 %0, i32 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  ret <2 x i32> %lane
+}
+
+define <1 x i64> @test_vld1_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %1 = insertelement <1 x i64> undef, i64 %0, i32 0
+  ret <1 x i64> %1
+}
+
+define <2 x float> @test_vld1_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = load float* %a, align 4
+  %1 = insertelement <2 x float> undef, float %0, i32 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  ret <2 x float> %lane
+}
+
+define <1 x double> @test_vld1_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load double* %a, align 8
+  %1 = insertelement <1 x double> undef, double %0, i32 0
+  ret <1 x double> %1
+}
+
+define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2q_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+  %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
+  %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+  ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2q_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2q_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2q_dup_s64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+  ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2q_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
+  %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2q_dup_f64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
+  %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+  ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+  %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
+  %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+  %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+  ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
+  %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+  ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3q_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+  %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+  %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+  ret %struct.int8x16x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3q_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+  ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3q_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+  ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3q_dup_s64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+  ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3q_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+  %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+  %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+  ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3q_dup_f64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+  %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+  %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+  ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+  %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+  %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+  %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+  ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+  ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+  ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+  ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+  %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+  %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+  ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+  ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4q_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+  %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+  %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
+  %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
+  ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4q_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
+  %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
+  ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4q_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
+  %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
+  ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4q_dup_s64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
+  %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
+  ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4q_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+  %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+  %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
+  %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
+  ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4q_dup_f64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+  %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+  %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
+  %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
+  ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+  %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+  %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+  %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
+  %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
+  ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
+  %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
+  ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
+  %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
+  ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+  %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
+  ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+  %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+  %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
+  %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
+  ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+  %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
+  ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
+define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vld1q_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
+  ret <16 x i8> %vld1_lane
+}
+
+define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vld1q_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
+  ret <8 x i16> %vld1_lane
+}
+
+define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vld1q_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
+  ret <4 x i32> %vld1_lane
+}
+
+define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vld1q_lane_s64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
+  ret <2 x i64> %vld1_lane
+}
+
+define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vld1q_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load float* %a, align 4
+  %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
+  ret <4 x float> %vld1_lane
+}
+
+define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vld1q_lane_f64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load double* %a, align 8
+  %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
+  ret <2 x double> %vld1_lane
+}
+
+define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vld1_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
+  ret <8 x i8> %vld1_lane
+}
+
+define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vld1_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
+  ret <4 x i16> %vld1_lane
+}
+
+define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vld1_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
+  ret <2 x i32> %vld1_lane
+}
+
+define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vld1_lane_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
+  ret <1 x i64> %vld1_lane
+}
+
+define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vld1_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load float* %a, align 4
+  %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
+  ret <2 x float> %vld1_lane
+}
+
+define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vld1_lane_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load double* %a, align 8
+  %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
+  ret <1 x double> %vld1_lane
+}
+
+define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+  %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+  %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract
= extractvalue [3 x <8 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2) + %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int16x8x3_t %.fca.0.2.insert +} + +define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_s32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int32x4x3_t %.fca.0.2.insert +} + +define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_s64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1 + 
%.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int64x2x3_t %.fca.0.2.insert +} + +define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_f32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.float32x4x3_t %.fca.0.2.insert +} + +define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { +; CHECK-LABEL: test_vld3q_lane_f64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.float64x2x3_t %.fca.0.2.insert +} + +define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s8 +; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2 + %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1) + 
%vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int8x8x3_t %.fca.0.2.insert +} + +define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s16 +; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2 + %0 = bitcast i16* %a to i8* + %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2) + %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int16x4x3_t %.fca.0.2.insert +} + +define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2 + %0 = bitcast i32* %a to i8* + %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int32x2x3_t %.fca.0.2.insert +} + +define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_s64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 + 
%b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2 + %0 = bitcast i64* %a to i8* + %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.int64x1x3_t %.fca.0.2.insert +} + +define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_f32 +; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2 + %0 = bitcast float* %a to i8* + %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4) + %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.float32x2x3_t %.fca.0.2.insert +} + +define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { +; CHECK-LABEL: test_vld3_lane_f64 +; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0] +entry: + %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 + %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1 + %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2 + %0 = bitcast double* %a to i8* + %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8) + %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue 
+ %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
+ %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+ %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+ %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+ %vld4_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ %vld4_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ %vld4_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
+define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <16 x i8> %b, i32 15
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i16> %b, i32 7
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i32> %b, i32 3
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i64> %b, i32 1
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x float> %b, i32 3
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x double> %b, i32 1
+ store double %0, double* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i8> %b, i32 7
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i16> %b, i32 3
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i32> %b, i32 1
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x i64> %b, i32 0
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x float> %b, i32 1
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x double> %b, i32 0
+ store double %0, double* %a, align 8
+ ret void
+}
+
+define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
@llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8) + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32) +declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, 
<4 x float>, i32, i32) +declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) +declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) +declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 
x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll new file mode 100644 index 0000000..80a9347 --- /dev/null +++ b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll @@ -0,0 +1,319 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) { +; CHECK-LABEL: test_vld2q_dup_fx_update +; CHECK: ld2r {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2 + %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1) + %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 + %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 + %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer + %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0 + %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret { [2 x <16 x i8>] } %7 +} + +define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld2q_dup_reg_update +; CHECK: ld2r {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast i32* %a to i8* + %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4) + %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer + %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1 + %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer + %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0 + %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1 + %tmp1 = getelementptr i32* %a, i32 %inc + store i32* %tmp1, i32** %ptr + ret { [2 x <4 x i32>] } %8 +} + +define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) { +; CHECK-LABEL: test_vld3_dup_fx_update +; CHECK: ld3r {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6 + %1 = bitcast i16* %a to i8* + %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) + %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0 + %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer + %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1 + %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer + %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2 + %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer + %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0 + %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1 + %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 + %tmp1 = getelementptr i16* %a, i32 3 + store i16* %tmp1, i16** %ptr + ret { [3 x <4 x i16>] } %11 +} + +define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld3_dup_reg_update +; CHECK: ld3r {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x 
i8> } %1, 0 + %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1 + %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer + %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2 + %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer + %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0 + %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1 + %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2 + %tmp1 = getelementptr i8* %a, i32 %inc + store i8* %tmp1, i8** %ptr + ret { [3 x <8 x i8>] } %10 +} + +define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) { +; CHECK-LABEL: test_vld4_dup_fx_update +; CHECK: ld4r {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16 + %1 = bitcast i32* %a to i8* + %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4) + %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0 + %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer + %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1 + %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer + %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2 + %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer + %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3 + %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer + %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0 + %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1 + %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2 + %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 + %tmp1 = getelementptr i32* %a, i32 4 + store i32* %tmp1, i32** %ptr + ret { [4 x <2 x i32>] } %14 +} + +define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld4_dup_reg_update +; CHECK: ld4r {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = bitcast double* %a to i8* + %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8) + %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0 + %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer + %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1 + %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer + %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2 + %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer + %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3 + %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer + %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0 + %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1 + %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2 + %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 + %tmp1 = getelementptr double* %a, i32 %inc + store
double* %tmp1, double** %ptr + ret { [4 x <2 x double>] } %14 +} + +define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) { +; CHECK-LABEL: test_vld2_lane_fx_update +; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2 + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) + %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 + %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 + %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 + %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret { [2 x <8 x i8>] } %7 +} + +define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld2_lane_reg_update +; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1) + %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0 + %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1 + %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0 + %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1 + %tmp1 = getelementptr i8* %a, i32 %inc + store i8* %tmp1, i8** %ptr + ret { [2 x <8 x i8>] } %7 +} + +define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) { +; CHECK-LABEL: test_vld3_lane_fx_update +; CHECK: ld3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12 + %1 = extractvalue [3 x <2 x float>] %b, 0 + %2 = extractvalue [3 x <2 x float>] %b, 1 + %3 = extractvalue [3 x <2 x float>] %b, 2 + %4 = bitcast float* %a to i8* + %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4) + %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0 + %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1 + %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2 + %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0 + %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1 + %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2 + %tmp1 = getelementptr float* %a, i32 3 + store float* %tmp1, float** %ptr + ret { [3 x <2 x float>] } %11 +} + +define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld3_lane_reg_update +; CHECK: ld3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [3 x <4 x i16>] %b, 0 + %2 = extractvalue [3 x <4 x i16>] %b, 1 + %3 = extractvalue [3 x <4 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) + %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0 + %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1 + %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2 + %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0 + %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1 + %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2 + %tmp1 = 
getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret { [3 x <4 x i16>] } %11 +} + +define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) { +; CHECK-LABEL: test_vld4_lane_fx_update +; CHECK: ld4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16 + %1 = extractvalue [4 x <2 x i32>] %b, 0 + %2 = extractvalue [4 x <2 x i32>] %b, 1 + %3 = extractvalue [4 x <2 x i32>] %b, 2 + %4 = extractvalue [4 x <2 x i32>] %b, 3 + %5 = bitcast i32* %a to i8* + %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4) + %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0 + %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1 + %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2 + %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3 + %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0 + %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1 + %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2 + %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3 + %tmp1 = getelementptr i32* %a, i32 4 + store i32* %tmp1, i32** %ptr + ret { [4 x <2 x i32>] } %14 +} + +define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) { +; CHECK-LABEL: test_vld4_lane_reg_update +; CHECK: ld4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [4 x <2 x double>] %b, 0 + %2 = extractvalue [4 x <2 x double>] %b, 1 + %3 = extractvalue [4 x <2 x double>] %b, 2 + %4 = extractvalue [4 x <2 x double>] %b, 3 + %5 = bitcast double* %a to i8* + %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) + %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0 + %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1 + %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2 + %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3 + %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0 + %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1 + %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2 + %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3 + %tmp1 = getelementptr double* %a, i32 %inc + store double* %tmp1, double** %ptr + ret { [4 x <2 x double>] } %14 +} + +define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) { +; CHECK-LABEL: test_vst2_lane_fx_update +; CHECK: st2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2 + %1 = extractvalue [2 x <8 x i8>] %b, 0 + %2 = extractvalue [2 x <8 x i8>] %b, 1 + call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1) + %tmp1 = getelementptr i8* %a, i32 2 + store i8* %tmp1, i8** %ptr + ret void +} + +define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst2_lane_reg_update +; CHECK: st2 {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0 + %2 = extractvalue [2 x <2 x 
i32>] %b.coerce, 1 + %3 = bitcast i32* %a to i8* + tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4) + %tmp1 = getelementptr i32* %a, i32 %inc + store i32* %tmp1, i32** %ptr + ret void +} + +define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) { +; CHECK-LABEL: test_vst3_lane_fx_update +; CHECK: st3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12 + %1 = extractvalue [3 x <4 x float>] %b, 0 + %2 = extractvalue [3 x <4 x float>] %b, 1 + %3 = extractvalue [3 x <4 x float>] %b, 2 + %4 = bitcast float* %a to i8* + call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4) + %tmp1 = getelementptr float* %a, i32 3 + store float* %tmp1, float** %ptr + ret void +} + +; Function Attrs: nounwind +define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst3_lane_reg_update +; CHECK: st3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [3 x <4 x i16>] %b, 0 + %2 = extractvalue [3 x <4 x i16>] %b, 1 + %3 = extractvalue [3 x <4 x i16>] %b, 2 + %4 = bitcast i16* %a to i8* + tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2) + %tmp1 = getelementptr i16* %a, i32 %inc + store i16* %tmp1, i16** %ptr + ret void +} + +define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) { +; CHECK-LABEL: test_vst4_lane_fx_update +; CHECK: st4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32 + %1 = extractvalue [4 x <2 x double>] %b.coerce, 0 + %2 = extractvalue [4 x <2 x double>] %b.coerce, 1 + %3 = extractvalue [4 x <2 x double>] %b.coerce, 2 + %4 = extractvalue [4 x <2 x double>] %b.coerce, 3 + %5 = bitcast double* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8) + %tmp1 = getelementptr double* %a, i32 4 + store double* %tmp1, double** %ptr + ret void +} + + +define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) { +; CHECK-LABEL: test_vst4_lane_reg_update +; CHECK: st4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}} + %1 = extractvalue [4 x <2 x float>] %b.coerce, 0 + %2 = extractvalue [4 x <2 x float>] %b.coerce, 1 + %3 = extractvalue [4 x <2 x float>] %b.coerce, 2 + %4 = extractvalue [4 x <2 x float>] %b.coerce, 3 + %5 = bitcast float* %a to i8* + tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4) + %tmp1 = getelementptr float* %a, i32 %inc + store float* %tmp1, float** %ptr + ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) +declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) +declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) +declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare { <2 x 
double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) +declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) +declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s index eaa5c56..74719e0 100644 --- a/test/MC/AArch64/neon-diagnostics.s +++ b/test/MC/AArch64/neon-diagnostics.s @@ -4170,6 +4170,125 @@ // CHECK-ERROR: st4 {v31.2d, v0.2d, v1.2d, v2.1d}, [x3], x1 // CHECK-ERROR: ^ +//------------------------------------------------------------------------------ +// Load single N-element structure to all lanes of N consecutive +// registers (N = 1,2,3,4) +//------------------------------------------------------------------------------ + ld1r {x1}, [x0] + ld2r {v31.4s, v0.2s}, [sp] + ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] + ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp] +// CHECK-ERROR: error: expected vector type register +// CHECK-ERROR: ld1r {x1}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld2r {v31.4s, v0.2s}, [sp] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid space between two vectors +// CHECK-ERROR: ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp] +// CHECK-ERROR: ^ + +//------------------------------------------------------------------------------ +// Load/Store single N-element structure to/from one lane of N consecutive +// registers (N = 1,2,3,4) +//------------------------------------------------------------------------------ + ld1 {v0.b}[16], [x0] + ld2 {v15.h, v16.h}[8], [x15] + ld3 {v31.s, v0.s, v1.s}[-1], [sp] + ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: ld1 {v0.b}[16], [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: ld2 {v15.h, v16.h}[8], [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected lane number +// CHECK-ERROR: ld3 {v31.s, v0.s, v1.s}[-1], [sp] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: ^ + + st1 {v0.d}[16], [x0] + st2 {v31.s, v0.s}[3], [8] + st3 {v15.h, v16.h, v17.h}[-1], [x15] + st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: st1 {v0.d}[16], [x0] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st2 {v31.s, v0.s}[3], [8] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected lane number +// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[-1], [x15] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0] +// CHECK-ERROR: ^ + +//------------------------------------------------------------------------------ +// Post-index of load single N-element structure to all lanes of N consecutive +// registers (N = 1,2,3,4) +//------------------------------------------------------------------------------ + ld1r {v15.8h}, [x15], #5 + ld2r {v0.2d, v1.2d}, [x0], #7 + ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1 + ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld1r {v15.8h}, [x15], #5 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld2r {v0.2d, v1.2d}, [x0], #7 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp +// CHECK-ERROR: ^ + +//------------------------------------------------------------------------------ +// Post-index of Load/Store single N-element structure to/from one lane of N +// consecutive registers (N = 1,2,3,4) +//------------------------------------------------------------------------------ + ld1 {v0.b}[0], [x0], #2 + ld2 {v15.h, v16.h}[0], [x15], #3 + ld3 {v31.s, v0.s, v1.d}[0], [sp], x9 + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld1 {v0.b}[0], [x0], #2 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld2 {v15.h, v16.h}[0], [x15], #3 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected the same vector layout +// CHECK-ERROR: ld3 {v31.s, v0.s, v1.d}[0], [sp], x9 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24 +// CHECK-ERROR: ^ + + st1 {v0.d}[0], [x0], #7 + st2 {v31.s, v0.s}[0], [sp], #6 + st3 {v15.h, v16.h, v17.h}[0], [x15], #8 + st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st1 {v0.d}[0], [x0], #7 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st2 {v31.s, v0.s}[0], [sp], #6 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[0], [x15], #8 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1 +// CHECK-ERROR: ^ + + ins v2.b[16], w1 ins v7.h[8], w14 ins v20.s[5], w30 diff --git a/test/MC/AArch64/neon-simd-ldst-one-elem.s b/test/MC/AArch64/neon-simd-ldst-one-elem.s new file mode 100644 index 0000000..140d752 --- /dev/null +++ b/test/MC/AArch64/neon-simd-ldst-one-elem.s @@ -0,0 +1,325 @@ +// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//------------------------------------------------------------------------------ +// Load single 1-element structure to all lanes of 1 register +//------------------------------------------------------------------------------ + ld1r {v0.16b}, [x0] + ld1r {v15.8h}, [x15] + ld1r {v31.4s}, [sp] + ld1r {v0.2d}, [x0] + ld1r {v0.8b}, [x0] + ld1r {v15.4h}, [x15] + ld1r {v31.2s}, [sp] + ld1r {v0.1d}, [x0] +// CHECK: ld1r {v0.16b}, [x0] // encoding: [0x00,0xc0,0x40,0x4d] +// CHECK: ld1r {v15.8h}, [x15] // encoding: [0xef,0xc5,0x40,0x4d] +// CHECK: ld1r {v31.4s}, [sp] // encoding: 
[0xff,0xcb,0x40,0x4d] +// CHECK: ld1r {v0.2d}, [x0] // encoding: [0x00,0xcc,0x40,0x4d] +// CHECK: ld1r {v0.8b}, [x0] // encoding: [0x00,0xc0,0x40,0x0d] +// CHECK: ld1r {v15.4h}, [x15] // encoding: [0xef,0xc5,0x40,0x0d] +// CHECK: ld1r {v31.2s}, [sp] // encoding: [0xff,0xcb,0x40,0x0d] +// CHECK: ld1r {v0.1d}, [x0] // encoding: [0x00,0xcc,0x40,0x0d] + +//------------------------------------------------------------------------------ +// Load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +//------------------------------------------------------------------------------ + ld2r {v0.16b, v1.16b}, [x0] + ld2r {v15.8h, v16.8h}, [x15] + ld2r {v31.4s, v0.4s}, [sp] + ld2r {v0.2d, v1.2d}, [x0] + ld2r {v0.8b, v1.8b}, [x0] + ld2r {v15.4h, v16.4h}, [x15] + ld2r {v31.2s, v0.2s}, [sp] + ld2r {v31.1d, v0.1d}, [sp] +// CHECK: ld2r {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xc0,0x60,0x4d] +// CHECK: ld2r {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xc5,0x60,0x4d] +// CHECK: ld2r {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xcb,0x60,0x4d] +// CHECK: ld2r {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xcc,0x60,0x4d] +// CHECK: ld2r {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xc0,0x60,0x0d] +// CHECK: ld2r {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xc5,0x60,0x0d] +// CHECK: ld2r {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xcb,0x60,0x0d] +// CHECK: ld2r {v31.1d, v0.1d}, [sp] // encoding: [0xff,0xcf,0x60,0x0d] + + ld3r {v0.16b, v1.16b, v2.16b}, [x0] + ld3r {v15.8h, v16.8h, v17.8h}, [x15] + ld3r {v31.4s, v0.4s, v1.4s}, [sp] + ld3r {v0.2d, v1.2d, v2.2d}, [x0] + ld3r {v0.8b, v1.8b, v2.8b}, [x0] + ld3r {v15.4h, v16.4h, v17.4h}, [x15] + ld3r {v31.2s, v0.2s, v1.2s}, [sp] + ld3r {v31.1d, v0.1d, v1.1d}, [sp] +// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0xe0,0x40,0x4d] +// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0xe5,0x40,0x4d] +// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0xeb,0x40,0x4d] +// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0xec,0x40,0x4d] +// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0xe0,0x40,0x0d] +// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0xe5,0x40,0x0d] +// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0xeb,0x40,0x0d] +// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp] // encoding: [0xff,0xef,0x40,0x0d] + + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] + ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] + ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] + ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] + ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] + ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] + ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp] +// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0xe0,0x60,0x4d] +// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0xe5,0x60,0x4d] +// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0xeb,0x60,0x4d] +// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0xec,0x60,0x4d] +// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0xe0,0x60,0x0d] +// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0xe5,0x60,0x0d] +// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0xeb,0x60,0x0d] +// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp] // encoding: [0xff,0xef,0x60,0x0d] + +//------------------------------------------------------------------------------ +// Load single 1-element structure to 
one lane of 1 register. +//------------------------------------------------------------------------------ + ld1 {v0.b}[9], [x0] + ld1 {v15.h}[7], [x15] + ld1 {v31.s}[3], [sp] + ld1 {v0.d}[1], [x0] +// CHECK: ld1 {v0.b}[9], [x0] // encoding: [0x00,0x04,0x40,0x4d] +// CHECK: ld1 {v15.h}[7], [x15] // encoding: [0xef,0x59,0x40,0x4d] +// CHECK: ld1 {v31.s}[3], [sp] // encoding: [0xff,0x93,0x40,0x4d] +// CHECK: ld1 {v0.d}[1], [x0] // encoding: [0x00,0x84,0x40,0x4d] + +//------------------------------------------------------------------------------ +// Load single N-element structure to one lane of N consecutive registers +// (N = 2,3,4) +//------------------------------------------------------------------------------ + ld2 {v0.b, v1.b}[9], [x0] + ld2 {v15.h, v16.h}[7], [x15] + ld2 {v31.s, v0.s}[3], [sp] + ld2 {v0.d, v1.d}[1], [x0] +// CHECK: ld2 {v0.b, v1.b}[9], [x0] // encoding: [0x00,0x04,0x60,0x4d] +// CHECK: ld2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x60,0x4d] +// CHECK: ld2 {v31.s, v0.s}[3], [sp] // encoding: [0xff,0x93,0x60,0x4d] +// CHECK: ld2 {v0.d, v1.d}[1], [x0] // encoding: [0x00,0x84,0x60,0x4d] + + ld3 {v0.b, v1.b, v2.b}[9], [x0] + ld3 {v15.h, v16.h, v17.h}[7], [x15] + ld3 {v31.s, v0.s, v1.s}[3], [sp] + ld3 {v0.d, v1.d, v2.d}[1], [x0] +// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x40,0x4d] +// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x40,0x4d] +// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x40,0x4d] +// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x40,0x4d] + + ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] + ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] + ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] +// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x60,0x4d] +// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x60,0x4d] +// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x60,0x4d] +// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x60,0x4d] + +//------------------------------------------------------------------------------ +// Store single 1-element structure from one lane of 1 register. 
+//------------------------------------------------------------------------------ + st1 {v0.b}[9], [x0] + st1 {v15.h}[7], [x15] + st1 {v31.s}[3], [sp] + st1 {v0.d}[1], [x0] +// CHECK: st1 {v0.b}[9], [x0] // encoding: [0x00,0x04,0x00,0x4d] +// CHECK: st1 {v15.h}[7], [x15] // encoding: [0xef,0x59,0x00,0x4d] +// CHECK: st1 {v31.s}[3], [sp] // encoding: [0xff,0x93,0x00,0x4d] +// CHECK: st1 {v0.d}[1], [x0] // encoding: [0x00,0x84,0x00,0x4d] + +//------------------------------------------------------------------------------ +// Store single N-element structure from one lane of N consecutive registers +// (N = 2,3,4) +//------------------------------------------------------------------------------ + st2 {v0.b, v1.b}[9], [x0] + st2 {v15.h, v16.h}[7], [x15] + st2 {v31.s, v0.s}[3], [sp] + st2 {v0.d, v1.d}[1], [x0] +// CHECK: st2 {v0.b, v1.b}[9], [x0] // encoding: [0x00,0x04,0x20,0x4d] +// CHECK: st2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x20,0x4d] +// CHECK: st2 {v31.s, v0.s}[3], [sp] // encoding: [0xff,0x93,0x20,0x4d] +// CHECK: st2 {v0.d, v1.d}[1], [x0] // encoding: [0x00,0x84,0x20,0x4d] + + st3 {v0.b, v1.b, v2.b}[9], [x0] + st3 {v15.h, v16.h, v17.h}[7], [x15] + st3 {v31.s, v0.s, v1.s}[3], [sp] + st3 {v0.d, v1.d, v2.d}[1], [x0] +// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x00,0x4d] +// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x00,0x4d] +// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x00,0x4d] +// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x00,0x4d] + + st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] + st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] + st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] +// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x20,0x4d] +// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x20,0x4d] +// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x20,0x4d] +// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x20,0x4d] + +//------------------------------------------------------------------------------ +// Post-index load single 1-element structure to all lanes of 1 register +//------------------------------------------------------------------------------ + ld1r {v0.16b}, [x0], #1 + ld1r {v15.8h}, [x15], #2 + ld1r {v31.4s}, [sp], #4 + ld1r {v0.2d}, [x0], #8 + ld1r {v0.8b}, [x0], x0 + ld1r {v15.4h}, [x15], x1 + ld1r {v31.2s}, [sp], x2 + ld1r {v0.1d}, [x0], x3 +// CHECK: ld1r {v0.16b}, [x0], #1 // encoding: [0x00,0xc0,0xdf,0x4d] +// CHECK: ld1r {v15.8h}, [x15], #2 // encoding: [0xef,0xc5,0xdf,0x4d] +// CHECK: ld1r {v31.4s}, [sp], #4 // encoding: [0xff,0xcb,0xdf,0x4d] +// CHECK: ld1r {v0.2d}, [x0], #8 // encoding: [0x00,0xcc,0xdf,0x4d] +// CHECK: ld1r {v0.8b}, [x0], x0 // encoding: [0x00,0xc0,0xc0,0x0d] +// CHECK: ld1r {v15.4h}, [x15], x1 // encoding: [0xef,0xc5,0xc1,0x0d] +// CHECK: ld1r {v31.2s}, [sp], x2 // encoding: [0xff,0xcb,0xc2,0x0d] +// CHECK: ld1r {v0.1d}, [x0], x3 // encoding: [0x00,0xcc,0xc3,0x0d] + +//------------------------------------------------------------------------------ +// Post-index load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +//------------------------------------------------------------------------------ + ld2r {v0.16b, v1.16b}, [x0], #2 + ld2r {v15.8h, v16.8h}, [x15], #4 + ld2r {v31.4s, v0.4s}, [sp], #8 + ld2r {v0.2d, v1.2d}, [x0], #16 + ld2r {v0.8b, v1.8b}, [x0], x6 + ld2r {v15.4h, v16.4h}, [x15], x7
+ ld2r {v31.2s, v0.2s}, [sp], x9 + ld2r {v31.1d, v0.1d}, [x0], x5 +// CHECK: ld2r {v0.16b, v1.16b}, [x0], #2 // encoding: [0x00,0xc0,0xff,0x4d] +// CHECK: ld2r {v15.8h, v16.8h}, [x15], #4 // encoding: [0xef,0xc5,0xff,0x4d] +// CHECK: ld2r {v31.4s, v0.4s}, [sp], #8 // encoding: [0xff,0xcb,0xff,0x4d] +// CHECK: ld2r {v0.2d, v1.2d}, [x0], #16 // encoding: [0x00,0xcc,0xff,0x4d] +// CHECK: ld2r {v0.8b, v1.8b}, [x0], x6 // encoding: [0x00,0xc0,0xe6,0x0d] +// CHECK: ld2r {v15.4h, v16.4h}, [x15], x7 // encoding: [0xef,0xc5,0xe7,0x0d] +// CHECK: ld2r {v31.2s, v0.2s}, [sp], x9 // encoding: [0xff,0xcb,0xe9,0x0d] +// CHECK: ld2r {v31.1d, v0.1d}, [x0], x5 // encoding: [0x1f,0xcc,0xe5,0x0d] + + ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9 + ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6 + ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7 + ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5 + ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3 + ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6 + ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12 + ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24 +// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9 // encoding: [0x00,0xe0,0xc9,0x4d] +// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6 // encoding: [0xef,0xe5,0xc6,0x4d] +// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7 // encoding: [0xff,0xeb,0xc7,0x4d] +// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5 // encoding: [0x00,0xec,0xc5,0x4d] +// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3 // encoding: [0x00,0xe0,0xdf,0x0d] +// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6 // encoding: [0xef,0xe5,0xdf,0x0d] +// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12 // encoding: [0xff,0xeb,0xdf,0x0d] +// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24 // encoding: [0xff,0xef,0xdf,0x0d] + + ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4 + ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8 + ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16 + ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32 + ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5 + ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9 + ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30 + ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7 +// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4 // encoding: [0x00,0xe0,0xff,0x4d] +// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8 // encoding: [0xef,0xe5,0xff,0x4d] +// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16 // encoding: [0xff,0xeb,0xff,0x4d] +// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32 // encoding: [0x00,0xec,0xff,0x4d] +// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5 // encoding: [0x00,0xe0,0xe5,0x0d] +// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9 // encoding: [0xef,0xe5,0xe9,0x0d] +// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30 // encoding: [0xff,0xeb,0xfe,0x0d] +// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7 // encoding: [0xff,0xef,0xe7,0x0d] + +//------------------------------------------------------------------------------ +// Post-index load single 1-element structure to one lane of 1 register. 
+//------------------------------------------------------------------------------ + ld1 {v0.b}[9], [x0], #1 + ld1 {v15.h}[7], [x15], x9 + ld1 {v31.s}[3], [sp], x6 + ld1 {v0.d}[1], [x0], #8 +// CHECK: ld1 {v0.b}[9], [x0], #1 // encoding: [0x00,0x04,0xdf,0x4d] +// CHECK: ld1 {v15.h}[7], [x15], x9 // encoding: [0xef,0x59,0xc9,0x4d] +// CHECK: ld1 {v31.s}[3], [sp], x6 // encoding: [0xff,0x93,0xc6,0x4d] +// CHECK: ld1 {v0.d}[1], [x0], #8 // encoding: [0x00,0x84,0xdf,0x4d] + +//------------------------------------------------------------------------------ +// Post-index load single N-element structure to one lane of N consecutive +// registers (N = 2,3,4) +//------------------------------------------------------------------------------ + ld2 {v0.b, v1.b}[9], [x0], x3 + ld2 {v15.h, v16.h}[7], [x15], #4 + ld2 {v31.s, v0.s}[3], [sp], #8 + ld2 {v0.d, v1.d}[1], [x0], x0 +// CHECK: ld2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xe3,0x4d] +// CHECK: ld2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xff,0x4d] +// CHECK: ld2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xff,0x4d] +// CHECK: ld2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xe0,0x4d] + + ld3 {v0.b, v1.b, v2.b}[9], [x0], #3 + ld3 {v15.h, v16.h, v17.h}[7], [x15], #6 + ld3 {v31.s, v0.s, v1.s}[3], [sp], x3 + ld3 {v0.d, v1.d, v2.d}[1], [x0], x6 +// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0xdf,0x4d] +// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0xdf,0x4d] +// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0xc3,0x4d] +// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0xc6,0x4d] + + ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 + ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 + ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 + ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 +// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xe5,0x4d] +// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xe7,0x4d] +// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xff,0x4d] +// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xff,0x4d] + +//------------------------------------------------------------------------------ +// Post-index store single 1-element structure from one lane of 1 register. 
+//------------------------------------------------------------------------------ + st1 {v0.b}[9], [x0], #1 + st1 {v15.h}[7], [x15], x9 + st1 {v31.s}[3], [sp], x6 + st1 {v0.d}[1], [x0], #8 +// CHECK: st1 {v0.b}[9], [x0], #1 // encoding: [0x00,0x04,0x9f,0x4d] +// CHECK: st1 {v15.h}[7], [x15], x9 // encoding: [0xef,0x59,0x89,0x4d] +// CHECK: st1 {v31.s}[3], [sp], x6 // encoding: [0xff,0x93,0x86,0x4d] +// CHECK: st1 {v0.d}[1], [x0], #8 // encoding: [0x00,0x84,0x9f,0x4d] + +//------------------------------------------------------------------------------ +// Post-index store single N-element structure from one lane of N consecutive +// registers (N = 2,3,4) +//------------------------------------------------------------------------------ + st2 {v0.b, v1.b}[9], [x0], x3 + st2 {v15.h, v16.h}[7], [x15], #4 + st2 {v31.s, v0.s}[3], [sp], #8 + st2 {v0.d, v1.d}[1], [x0], x0 +// CHECK: st2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xa3,0x4d] +// CHECK: st2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xbf,0x4d] +// CHECK: st2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xbf,0x4d] +// CHECK: st2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xa0,0x4d] + + st3 {v0.b, v1.b, v2.b}[9], [x0], #3 + st3 {v15.h, v16.h, v17.h}[7], [x15], #6 + st3 {v31.s, v0.s, v1.s}[3], [sp], x3 + st3 {v0.d, v1.d, v2.d}[1], [x0], x6 +// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0x9f,0x4d] +// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0x9f,0x4d] +// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0x83,0x4d] +// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0x86,0x4d] + + st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 + st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 + st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 + st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 +// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xa5,0x4d] +// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xa7,0x4d] +// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xbf,0x4d] +// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xbf,0x4d] diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt index c165901..9f9e777 100644 --- a/test/MC/Disassembler/AArch64/neon-instructions.txt +++ b/test/MC/Disassembler/AArch64/neon-instructions.txt @@ -2060,6 +2060,90 @@ 0xff,0x0b,0x9f,0x4c #---------------------------------------------------------------------- +# Vector load single N-element structure to all lanes of N +# consecutive registers (N = 1,2,3,4) +#---------------------------------------------------------------------- +# CHECK: ld1r {v0.16b}, [x0] +# CHECK: ld1r {v15.8h}, [x15] +# CHECK: ld2r {v31.4s, v0.4s}, [sp] +# CHECK: ld2r {v0.2d, v1.2d}, [x0] +# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0] +# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15] +# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] +# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp] +0x00,0xc0,0x40,0x4d +0xef,0xc5,0x40,0x4d +0xff,0xcb,0x60,0x4d +0x00,0xcc,0x60,0x4d +0x00,0xe0,0x40,0x0d +0xef,0xe5,0x40,0x0d +0xff,0xeb,0x60,0x0d +0xff,0xef,0x60,0x0d + +#---------------------------------------------------------------------- +# Vector load/store single N-element structure to/from one lane of N +# consecutive registers (N = 1,2,3,4) +#---------------------------------------------------------------------- +# CHECK: ld1 {v0.b}[9], [x0] +# CHECK: ld2 {v15.h, v16.h}[7], [x15] +# CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp] +# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] +# CHECK: st1 {v0.d}[1], [x0] +# CHECK: st2 {v31.s, v0.s}[3], [sp] +# CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15] +# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] +0x00,0x04,0x40,0x4d +0xef,0x59,0x60,0x4d +0xff,0xb3,0x40,0x4d +0x00,0xa4,0x60,0x4d +0x00,0x84,0x00,0x4d +0xff,0x93,0x20,0x4d +0xef,0x79,0x00,0x4d +0x00,0x24,0x20,0x4d + +#---------------------------------------------------------------------- +# Post-index of vector load single N-element structure to all lanes of N +# consecutive registers (N = 1,2,3,4) +#---------------------------------------------------------------------- +# CHECK: ld1r {v0.16b}, [x0], #1 +# CHECK: ld1r {v15.8h}, [x15], #2 +# CHECK: ld2r {v31.4s, v0.4s}, [sp], #8 +# CHECK: ld2r {v0.2d, v1.2d}, [x0], #16 +# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3 +# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6 +# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30 +# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7 +0x00,0xc0,0xdf,0x4d +0xef,0xc5,0xdf,0x4d +0xff,0xcb,0xff,0x4d +0x00,0xcc,0xff,0x4d +0x00,0xe0,0xdf,0x0d +0xef,0xe5,0xdf,0x0d +0xff,0xeb,0xfe,0x0d +0xff,0xef,0xe7,0x0d + +#---------------------------------------------------------------------- +# Post-index of vector load/store single N-element structure to/from +# one lane of N consecutive registers (N = 1,2,3,4) +#---------------------------------------------------------------------- +# CHECK: ld1 {v0.b}[9], [x0], #1 +# CHECK: ld2 {v15.h, v16.h}[7], [x15], #4 +# CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp], x3 +# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 +# CHECK: st1 {v0.d}[1], [x0], #8 +# CHECK: st2 {v31.s, v0.s}[3], [sp], #8 +# CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15], #6 +# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 +0x00,0x04,0xdf,0x4d +0xef,0x59,0xff,0x4d +0xff,0xb3,0xc3,0x4d +0x00,0xa4,0xff,0x4d +0x00,0x84,0x9f,0x4d +0xff,0x93,0xbf,0x4d +0xef,0x79,0x9f,0x4d +0x00,0x24,0xa5,0x4d + +#---------------------------------------------------------------------- # Bitwise extract #---------------------------------------------------------------------- 0x20,0x18,0x02,0x2e
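The fixed post-increment immediates exercised across these tests are fully determined by the instruction: each register in the list transfers exactly one element, so the immediate must equal the register count times the element size in bytes (ld1r {v0.16b} steps by #1, ld3r of .4h vectors by #6, st4 of .d lanes by #32), which is also why the diagnostics file rejects forms such as ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24. A minimal sketch of that rule; the helper name and table below are illustrative, not part of the test suite:

# Byte step implied by a post-indexed single-structure NEON access:
# one element per register, so step = regs * element_size.
ELEM_BYTES = {"b": 1, "h": 2, "s": 4, "d": 8}

def post_index_step(num_regs: int, elem: str) -> int:
    """Immediate required by ldNr/ldN/stN (dup or lane) post-index forms."""
    return num_regs * ELEM_BYTES[elem]

# Spot-check against immediates used in the tests above.
assert post_index_step(1, "b") == 1   # ld1r {v0.16b}, [x0], #1
assert post_index_step(3, "h") == 6   # ld3r {v15.4h, ...}, [x15], #6
assert post_index_step(3, "s") == 12  # ld3 (lane, .s): 3 * 4 == 12
assert post_index_step(4, "d") == 32  # st4 {...d}[1], [x0], #32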
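The lane numbers in the single-lane forms are likewise recoverable from the encodings in the CHECK lines: the index is packed into the Q, S and size fields, and the element width decides how many of those bits carry index information. A sketch of that packing, assuming the standard ARMv8 single-structure layout (the function name is mine, for illustration only):

def lane_index(q: int, s: int, size: int, elem: str) -> int:
    """Recover the lane number from the Q/S/size bits of a single-structure
    lane load/store, per element width."""
    if elem == "b":           # 16 lanes: index = Q:S:size
        return (q << 3) | (s << 2) | size
    if elem == "h":           # 8 lanes: index = Q:S:size<1>
        return (q << 2) | (s << 1) | (size >> 1)
    if elem == "s":           # 4 lanes: index = Q:S
        return (q << 1) | s
    return q                  # "d": 2 lanes, index = Q

# Matches the lane numbers in the encodings above:
assert lane_index(1, 0, 0b01, "b") == 9  # ld1 {v0.b}[9]
assert lane_index(1, 1, 0b10, "h") == 7  # ld1 {v15.h}[7]
assert lane_index(1, 1, 0b00, "s") == 3  # ld1 {v31.s}[3]
assert lane_index(1, 0, 0b01, "d") == 1  # ld1 {v0.d}[1]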
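For the ld<N>r encodings, the little-endian byte lists in the CHECK/hex pairs split into a handful of fields that these tests vary. A hedged decoder sketch, with bit positions as I read them from the ARMv8 AdvSIMD load/store (single structure) format and field names chosen here for illustration:

def decode_ldnr(enc: bytes) -> dict:
    """Split an ld1r/ld2r/ld3r/ld4r encoding (little-endian byte order,
    as printed in the CHECK lines) into the fields the tests vary."""
    w = int.from_bytes(enc, "little")
    return {
        "Q":    (w >> 30) & 1,     # 0 = 64-bit list (.8b/.4h/...), 1 = 128-bit
        "post": (w >> 23) & 1,     # 1 = post-indexed form
        "L":    (w >> 22) & 1,     # 1 = load
        "R":    (w >> 21) & 1,     # with the opcode field, ld1r/ld2r vs ld3r/ld4r
        "Rm":   (w >> 16) & 0x1f,  # 0b11111 = immediate post-index
        "size": (w >> 10) & 3,     # 0 = b, 1 = h, 2 = s, 3 = d
        "Rn":   (w >> 5) & 0x1f,   # base register (31 = sp)
        "Rt":   w & 0x1f,          # first vector register in the list
    }

# ld1r {v31.4s}, [sp], #4 -> encoding [0xff,0xcb,0xdf,0x4d] above
f = decode_ldnr(bytes([0xff, 0xcb, 0xdf, 0x4d]))
assert (f["Q"], f["post"], f["Rm"], f["size"], f["Rn"], f["Rt"]) == (1, 1, 31, 2, 31, 31)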