From 4c5e43da7792f75567b693105cc53e3f1992ad98 Mon Sep 17 00:00:00 2001
From: Pirama Arumuga Nainar
Date: Wed, 8 Apr 2015 08:55:49 -0700
Subject: Update aosp/master llvm for rebase to r233350

Change-Id: I07d935f8793ee8ec6b7da003f6483046594bca49
---
 test/CodeGen/X86/unaligned-32-byte-memops.ll | 110 +++++++--------------------
 1 file changed, 29 insertions(+), 81 deletions(-)

(limited to 'test/CodeGen/X86/unaligned-32-byte-memops.ll')

diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index 9cec17d..b337a80 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -20,7 +20,7 @@ define <8 x float> @load32bytes(<8 x float>* %Ap) {
   ; HASWELL: vmovups
   ; HASWELL: retq
 
-  %A = load <8 x float>* %Ap, align 16
+  %A = load <8 x float>, <8 x float>* %Ap, align 16
   ret <8 x float> %A
 }
 
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
 ; if it's faster.
 
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads_swap
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
 
@@ -113,10 +61,10 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 3
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 4
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
+  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
+  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
+  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %v3
 }
@@ -136,10 +84,10 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
   ; HASWELL: vmovups
   ; HASWELL-NEXT: retq
 
-  %ptr1 = getelementptr inbounds <4 x float>* %ptr, i64 4
-  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 5
-  %v1 = load <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>* %ptr2, align 1
+  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
+  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
+  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
+  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
   %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
   ret <8 x float> %v3
 }
@@ -168,10 +116,10 @@ define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
   ; HASWELL: vpaddq
   ; HASWELL-NEXT: retq
 
-  %ptr1 = getelementptr inbounds <2 x i64>* %ptr, i64 5
-  %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 6
-  %v1 = load <2 x i64>* %ptr1, align 1
-  %v2 = load <2 x i64>* %ptr2, align 1
+  %ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
+  %ptr2 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 6
+  %v1 = load <2 x i64>, <2 x i64>* %ptr1, align 1
+  %v2 = load <2 x i64>, <2 x i64>* %ptr2, align 1
   %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = add <4 x i64> %v3, %x
   ret <4 x i64> %v4
@@ -196,10 +144,10 @@ define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
   ; HASWELL: vpaddd
   ; HASWELL-NEXT: retq
 
-  %ptr1 = getelementptr inbounds <4 x i32>* %ptr, i64 6
-  %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 7
-  %v1 = load <4 x i32>* %ptr1, align 1
-  %v2 = load <4 x i32>* %ptr2, align 1
+  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
+  %ptr2 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 7
+  %v1 = load <4 x i32>, <4 x i32>* %ptr1, align 1
+  %v2 = load <4 x i32>, <4 x i32>* %ptr2, align 1
   %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = add <8 x i32> %v3, %x
   ret <8 x i32> %v4
@@ -224,10 +172,10 @@ define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
   ; HASWELL: vpaddw
   ; HASWELL-NEXT: retq
 
-  %ptr1 = getelementptr inbounds <8 x i16>* %ptr, i64 7
-  %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 8
-  %v1 = load <8 x i16>* %ptr1, align 1
-  %v2 = load <8 x i16>* %ptr2, align 1
+  %ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
+  %ptr2 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 8
+  %v1 = load <8 x i16>, <8 x i16>* %ptr1, align 1
+  %v2 = load <8 x i16>, <8 x i16>* %ptr2, align 1
   %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %v4 = add <16 x i16> %v3, %x
   ret <16 x i16> %v4
@@ -252,10 +200,10 @@ define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
   ; HASWELL: vpaddb
   ; HASWELL-NEXT: retq
 
-  %ptr1 = getelementptr inbounds <16 x i8>* %ptr, i64 8
-  %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 9
-  %v1 = load <16 x i8>* %ptr1, align 1
-  %v2 = load <16 x i8>* %ptr2, align 1
+  %ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
+  %ptr2 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 9
+  %v1 = load <16 x i8>, <16 x i8>* %ptr1, align 1
+  %v2 = load <16 x i8>, <16 x i8>* %ptr2, align 1
   %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %v4 = add <32 x i8> %v3, %x
   ret <32 x i8> %v4
@@ -277,10 +225,10 @@ define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x doubl
   ; HASWELL: vaddpd
   ; HASWELL-NEXT: retq
 
-  %ptr1 = getelementptr inbounds <2 x double>* %ptr, i64 9
-  %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 10
-  %v1 = load <2 x double>* %ptr1, align 1
-  %v2 = load <2 x double>* %ptr2, align 1
+  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
+  %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 10
+  %v1 = load <2 x double>, <2 x double>* %ptr1, align 1
+  %v2 = load <2 x double>, <2 x double>* %ptr2, align 1
   %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %v4 = fadd <4 x double> %v3, %x
   ret <4 x double> %v4
-- 
cgit v1.1
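
Every hunk in this patch makes the same mechanical update shown in its +/- lines: the test's IR is rewritten so that load and getelementptr carry the pointee type as an explicit first operand instead of leaving it implied by the pointer operand's type. A minimal before/after sketch of that syntax change follows; the names @syntax_example, %p and %v are illustrative only and do not appear in the test.

  ; old form: the loaded/indexed type is implied by the <4 x float>* operand
  ;   %p = getelementptr inbounds <4 x float>* %ptr, i64 1
  ;   %v = load <4 x float>* %p, align 1
  ; new form: the element type is written out explicitly before the pointer
  define <4 x float> @syntax_example(<4 x float>* %ptr) {
    %p = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
    %v = load <4 x float>, <4 x float>* %p, align 1
    ret <4 x float> %v
  }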