diff options
Diffstat (limited to 'renderscript/clang-include/avxintrin.h')
-rw-r--r-- | renderscript/clang-include/avxintrin.h | 93 |
1 files changed, 64 insertions, 29 deletions
diff --git a/renderscript/clang-include/avxintrin.h b/renderscript/clang-include/avxintrin.h index d7c7f46..f30a5ad 100644 --- a/renderscript/clang-include/avxintrin.h +++ b/renderscript/clang-include/avxintrin.h @@ -429,19 +429,6 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) __m128 __b = (b); \ (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); }) -/* Vector extract */ -#define _mm256_extractf128_pd(A, O) __extension__ ({ \ - __m256d __A = (A); \ - (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); }) - -#define _mm256_extractf128_ps(A, O) __extension__ ({ \ - __m256 __A = (A); \ - (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); }) - -#define _mm256_extractf128_si256(A, O) __extension__ ({ \ - __m256i __A = (A); \ - (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); }) - static __inline int __attribute__((__always_inline__, __nodebug__)) _mm256_extract_epi32(__m256i __a, const int __imm) { @@ -472,22 +459,6 @@ _mm256_extract_epi64(__m256i __a, const int __imm) } #endif -/* Vector insert */ -#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \ - __m256d __V1 = (V1); \ - __m128d __V2 = (V2); \ - (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); }) - -#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \ - __m256 __V1 = (V1); \ - __m128 __V2 = (V2); \ - (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); }) - -#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \ - __m256i __V1 = (V1); \ - __m128i __V2 = (V2); \ - (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); }) - static __inline __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_insert_epi32(__m256i __a, int __b, int const __imm) { @@ -1166,6 +1137,70 @@ _mm256_castsi128_si256(__m128i __a) return __builtin_shufflevector(__a, __a, 0, 1, -1, -1); } +/* + Vector insert. + We use macros rather than inlines because we only want to accept + invocations where the immediate M is a constant expression. +*/ +#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ + (__m256)__builtin_shufflevector( \ + (__v8sf)(V1), \ + (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ + (((M) & 1) ? 0 : 8), \ + (((M) & 1) ? 1 : 9), \ + (((M) & 1) ? 2 : 10), \ + (((M) & 1) ? 3 : 11), \ + (((M) & 1) ? 8 : 4), \ + (((M) & 1) ? 9 : 5), \ + (((M) & 1) ? 10 : 6), \ + (((M) & 1) ? 11 : 7) );}) + +#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ + (__m256d)__builtin_shufflevector( \ + (__v4df)(V1), \ + (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ + (((M) & 1) ? 0 : 4), \ + (((M) & 1) ? 1 : 5), \ + (((M) & 1) ? 4 : 2), \ + (((M) & 1) ? 5 : 3) );}) + +#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ + (__m256i)__builtin_shufflevector( \ + (__v4di)(V1), \ + (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ + (((M) & 1) ? 0 : 4), \ + (((M) & 1) ? 1 : 5), \ + (((M) & 1) ? 4 : 2), \ + (((M) & 1) ? 5 : 3) );}) + +/* + Vector extract. + We use macros rather than inlines because we only want to accept + invocations where the immediate M is a constant expression. +*/ +#define _mm256_extractf128_ps(V, M) __extension__ ({ \ + (__m128)__builtin_shufflevector( \ + (__v8sf)(V), \ + (__v8sf)(_mm256_setzero_ps()), \ + (((M) & 1) ? 4 : 0), \ + (((M) & 1) ? 5 : 1), \ + (((M) & 1) ? 6 : 2), \ + (((M) & 1) ? 7 : 3) );}) + +#define _mm256_extractf128_pd(V, M) __extension__ ({ \ + (__m128d)__builtin_shufflevector( \ + (__v4df)(V), \ + (__v4df)(_mm256_setzero_pd()), \ + (((M) & 1) ? 2 : 0), \ + (((M) & 1) ? 3 : 1) );}) + +#define _mm256_extractf128_si256(V, M) __extension__ ({ \ + (__m128i)__builtin_shufflevector( \ + (__v4di)(V), \ + (__v4di)(_mm256_setzero_si256()), \ + (((M) & 1) ? 2 : 0), \ + (((M) & 1) ? 3 : 1) );}) + /* SIMD load ops (unaligned) */ static __inline __m256 __attribute__((__always_inline__, __nodebug__)) _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) |