Diffstat (limited to 'media/libstagefright/codecs/avc/enc/src/sad_inline.h')
-rw-r--r--  media/libstagefright/codecs/avc/enc/src/sad_inline.h | 488
1 file changed, 488 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/avc/enc/src/sad_inline.h b/media/libstagefright/codecs/avc/enc/src/sad_inline.h
new file mode 100644
index 0000000..f39794f
--- /dev/null
+++ b/media/libstagefright/codecs/avc/enc/src/sad_inline.h
@@ -0,0 +1,488 @@
+/* ------------------------------------------------------------------
+ * Copyright (C) 1998-2009 PacketVideo
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ * -------------------------------------------------------------------
+ */
+#ifndef _SAD_INLINE_H_
+#define _SAD_INLINE_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */
+
+    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
+    {
+        tmp = tmp - tmp2;
+        if (tmp > 0) sad += tmp;
+        else sad -= tmp;
+
+        return sad;
+    }
+
+    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
+    {
+        int32 x7;
+
+        x7 = src2 ^ src1;       /* check odd/even combination */
+        if ((uint32)src2 >= (uint32)src1)
+        {
+            src1 = src2 - src1; /* subs */
+        }
+        else
+        {
+            src1 = src1 - src2;
+        }
+        x7 = x7 ^ src1;         /* only odd bytes need to add carry */
+        x7 = mask & ((uint32)x7 >> 1);
+        x7 = (x7 << 8) - x7;
+        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
+        src1 = src1 ^(x7 >> 7);  /* take absolute value of negative byte */
+
+        return src1;
+    }
+
+#define NUMBER 3
+#define SHIFT 24
+
+#include "sad_mb_offset.h"
+
+#undef NUMBER
+#define NUMBER 2
+#undef SHIFT
+#define SHIFT 16
+#include "sad_mb_offset.h"
+
+#undef NUMBER
+#define NUMBER 1
+#undef SHIFT
+#define SHIFT 8
+#include "sad_mb_offset.h"
+
+
+    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
+    {
+        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
+
+        x9 = 0x80808080; /* const. */
+
+        x8 = (uint32)ref & 0x3;
+        if (x8 == 3)
+            goto SadMBOffset3;
+        if (x8 == 2)
+            goto SadMBOffset2;
+        if (x8 == 1)
+            goto SadMBOffset1;
+
+//  x5 = (x4<<8) - x4;  /* x5 = x4*255; */
+        x4 = x5 = 0;
+
+        x6 = 0xFFFF00FF;
+
+        ref -= lx;
+        blk -= 16;
+
+        x8 = 16;
+
+LOOP_SAD0:
+        /****** process 8 pixels ******/
+        x10 = *((uint32*)(ref += lx));
+        x11 = *((uint32*)(ref + 4));
+        x12 = *((uint32*)(blk += 16));
+        x14 = *((uint32*)(blk + 4));
+
+        /* process x11 & x14 */
+        x11 = sad_4pixel(x11, x14, x9);
+
+        /* process x12 & x10 */
+        x10 = sad_4pixel(x10, x12, x9);
+
+        x5 = x5 + x10;  /* accumulate low bytes */
+        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
+        x5 = x5 + x11;  /* accumulate low bytes */
+        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
+
+        /****** process 8 pixels ******/
+        x10 = *((uint32*)(ref + 8));
+        x11 = *((uint32*)(ref + 12));
+        x12 = *((uint32*)(blk + 8));
+        x14 = *((uint32*)(blk + 12));
+
+        /* process x11 & x14 */
+        x11 = sad_4pixel(x11, x14, x9);
+
+        /* process x12 & x10 */
+        x10 = sad_4pixel(x10, x12, x9);
+
+        x5 = x5 + x10;  /* accumulate low bytes */
+        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
+        x5 = x5 + x11;  /* accumulate low bytes */
+        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
+
+        /****************/
+        x10 = x5 - (x4 << 8);    /* extract low bytes */
+        x10 = x10 + x4;          /* add with high bytes */
+        x10 = x10 + (x10 << 16); /* add with lower half word */
+
+        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
+        {
+            if (--x8)
+            {
+                goto LOOP_SAD0;
+            }
+
+        }
+
+        return ((uint32)x10 >> 16);
+
+SadMBOffset3:
+
+        return sad_mb_offset3(ref, blk, lx, dmin);
+
+SadMBOffset2:
+
+        return sad_mb_offset2(ref, blk, lx, dmin);
+
+SadMBOffset1:
+
+        return sad_mb_offset1(ref, blk, lx, dmin);
+
+    }
+
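The sad_4pixel helper above is a SWAR (SIMD-within-a-register) trick: it subtracts four packed bytes in one word-wide operation, then uses the XOR pattern together with the 0x80808080 mask to find bytes whose difference went negative and flips them back to absolute values. A portable byte-at-a-time reference of what it computes can be handy when porting or testing; the name sad_4pixel_ref below is invented for illustration and is not part of this file:

    #include <stdint.h>

    /* Reference model for sad_4pixel: returns |a_i - b_i| for each of the
     * four packed bytes, packed back into one 32-bit word.  Hypothetical
     * helper, for illustration only. */
    static uint32_t sad_4pixel_ref(uint32_t src1, uint32_t src2)
    {
        uint32_t out = 0;
        int i;
        for (i = 0; i < 4; i++)
        {
            int a = (src1 >> (8 * i)) & 0xFF;
            int b = (src2 >> (8 * i)) & 0xFF;
            int d = a - b;
            out |= (uint32_t)(d < 0 ? -d : d) << (8 * i);
        }
        return out;
    }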
+#elif defined(__CC_ARM)  /* only work with arm v5 */
+
+    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
+    {
+        __asm
+        {
+            rsbs    tmp, tmp, tmp2 ;
+            rsbmi   tmp, tmp, #0 ;
+            add     sad, sad, tmp ;
+        }
+
+        return sad;
+    }
+
+    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
+    {
+        int32 x7;
+
+        __asm
+        {
+            EOR     x7, src2, src1;         /* check odd/even combination */
+            SUBS    src1, src2, src1;
+            EOR     x7, x7, src1;
+            AND     x7, mask, x7, lsr #1;
+            ORRCC   x7, x7, #0x80000000;
+            RSB     x7, x7, x7, lsl #8;
+            ADD     src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
+            EOR     src1, src1, x7, asr #7; /* take absolute value of negative byte */
+        }
+
+        return src1;
+    }
+
+    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
+    {
+        int32 x7;
+
+        __asm
+        {
+            EOR     x7, src2, src1;         /* check odd/even combination */
+            ADDS    src1, src2, src1;
+            EOR     x7, x7, src1;           /* only odd bytes need to add carry */
+            ANDS    x7, mask, x7, rrx;
+            RSB     x7, x7, x7, lsl #8;
+            SUB     src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
+            EOR     src1, src1, x7, asr #7; /* take absolute value of negative byte */
+        }
+
+        return src1;
+    }
+
+#define sum_accumulate  __asm{  SBC  x5, x5, x10;           /* accumulate low bytes */  \
+        BIC  x10, x6, x10;          /* x10 & 0xFF00FF00 */      \
+        ADD  x4, x4, x10, lsr #8;   /* accumulate high bytes */ \
+        SBC  x5, x5, x11;           /* accumulate low bytes */  \
+        BIC  x11, x6, x11;          /* x11 & 0xFF00FF00 */      \
+        ADD  x4, x4, x11, lsr #8; } /* accumulate high bytes */
+
+
+#define NUMBER 3
+#define SHIFT 24
+#define INC_X8 0x08000001
+
+#include "sad_mb_offset.h"
+
+#undef NUMBER
+#define NUMBER 2
+#undef SHIFT
+#define SHIFT 16
+#undef INC_X8
+#define INC_X8 0x10000001
+#include "sad_mb_offset.h"
+
+#undef NUMBER
+#define NUMBER 1
+#undef SHIFT
+#define SHIFT 8
+#undef INC_X8
+#define INC_X8 0x08000001
+#include "sad_mb_offset.h"
+
+
+    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
+    {
+        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
+
+        x9 = 0x80808080; /* const. */
+        x4 = x5 = 0;
+
+        __asm
+        {
+            MOVS    x8, ref, lsl #31 ;
+            BHI     SadMBOffset3;
+            BCS     SadMBOffset2;
+            BMI     SadMBOffset1;
+
+            MVN     x6, #0xFF00;
+        }
+LOOP_SAD0:
+        /****** process 8 pixels ******/
+        x11 = *((int32*)(ref + 12));
+        x10 = *((int32*)(ref + 8));
+        x14 = *((int32*)(blk + 12));
+        x12 = *((int32*)(blk + 8));
+
+        /* process x11 & x14 */
+        x11 = sad_4pixel(x11, x14, x9);
+
+        /* process x12 & x10 */
+        x10 = sad_4pixel(x10, x12, x9);
+
+        x5 = x5 + x10;  /* accumulate low bytes */
+        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
+        x5 = x5 + x11;  /* accumulate low bytes */
+        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
+
+        __asm
+        {
+            /****** process 8 pixels ******/
+            LDR     x11, [ref, #4];
+            LDR     x10, [ref], lx ;
+            LDR     x14, [blk, #4];
+            LDR     x12, [blk], #16 ;
+        }
+
+        /* process x11 & x14 */
+        x11 = sad_4pixel(x11, x14, x9);
+
+        /* process x12 & x10 */
+        x10 = sad_4pixel(x10, x12, x9);
+
+        x5 = x5 + x10;  /* accumulate low bytes */
+        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
+        x5 = x5 + x11;  /* accumulate low bytes */
+        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
+
+        /****************/
+        x10 = x5 - (x4 << 8);    /* extract low bytes */
+        x10 = x10 + x4;          /* add with high bytes */
+        x10 = x10 + (x10 << 16); /* add with lower half word */
+
+        __asm
+        {
+            /****************/
+            RSBS    x11, dmin, x10, lsr #16;
+            ADDLSS  x8, x8, #0x10000001;
+            BLS     LOOP_SAD0;
+        }
+
+        return ((uint32)x10 >> 16);
+
+SadMBOffset3:
+
+        return sad_mb_offset3(ref, blk, lx, dmin, x8);
+
+SadMBOffset2:
+
+        return sad_mb_offset2(ref, blk, lx, dmin, x8);
+
+SadMBOffset1:
+
+        return sad_mb_offset1(ref, blk, lx, dmin, x8);
+    }
+
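The RVCT SUB_SAD block relies on ARM conditional execution: RSBS computes tmp2 - tmp and sets the flags, RSBMI negates the result only when it came out negative, and ADD folds it into the running total. In plain C the sequence reads as follows (sub_sad_ref is an illustrative name, not part of the file):

    /* C equivalent of the rsbs/rsbmi/add sequence: sad += |tmp2 - tmp| */
    static int sub_sad_ref(int sad, int tmp, int tmp2)
    {
        int d = tmp2 - tmp;   /* rsbs: reverse subtract, sets flags */
        if (d < 0)            /* mi: result was negative            */
            d = -d;           /* rsbmi: d = 0 - d                   */
        return sad + d;       /* add: accumulate                    */
    }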
+
+#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */
+
+    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
+    {
+__asm__ volatile("rsbs  %1, %1, %2\n\trsbmi %1, %1, #0\n\tadd   %0, %0, %1": "=r"(sad): "r"(tmp), "r"(tmp2));
+        return sad;
+    }
+
+    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
+    {
+        int32 x7;
+
+__asm__ volatile("EOR   %1, %2, %0\n\tSUBS  %0, %2, %0\n\tEOR   %1, %1, %0\n\tAND   %1, %3, %1, lsr #1\n\tORRCC %1, %1, #0x80000000\n\tRSB   %1, %1, %1, lsl #8\n\tADD   %0, %0, %1, asr #7\n\tEOR   %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));
+
+        return src1;
+    }
+
+    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
+    {
+        int32 x7;
+
+__asm__ volatile("EOR  %1, %2, %0\n\tADDS %0, %2, %0\n\tEOR  %1, %1, %0\n\tANDS %1, %3, %1, rrx\n\tRSB  %1, %1, %1, lsl #8\n\tSUB  %0, %0, %1, asr #7\n\tEOR  %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));
+
+        return src1;
+    }
+
+#define sum_accumulate  __asm__ volatile("SBC  %0, %0, %1\n\tBIC  %1, %4, %1\n\tADD  %2, %2, %1, lsr #8\n\tSBC  %0, %0, %3\n\tBIC  %3, %4, %3\n\tADD  %2, %2, %3, lsr #8": "=&r" (x5), "=&r" (x10), "=&r" (x4), "=&r" (x11): "r" (x6));
+
+#define NUMBER 3
+#define SHIFT 24
+#define INC_X8 0x08000001
+
+#include "sad_mb_offset.h"
+
+#undef NUMBER
+#define NUMBER 2
+#undef SHIFT
+#define SHIFT 16
+#undef INC_X8
+#define INC_X8 0x10000001
+#include "sad_mb_offset.h"
+
+#undef NUMBER
+#define NUMBER 1
+#undef SHIFT
+#define SHIFT 8
+#undef INC_X8
+#define INC_X8 0x08000001
+#include "sad_mb_offset.h"
+
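One subtlety of the GCC variants above: "=r" tells the compiler an operand is write-only, yet these templates also read %0 as an input. GCC documents read-write operands via the "+" modifier (or a matching-digit constraint), and instructions such as SUBS and ORRCC touch the condition flags, which a "cc" clobber expresses. A sketch of the same absolute-difference sequence written with explicitly tied operands, as an illustration under those assumptions rather than the file's own code (ARM mode only):

    #include <stdint.h>

    /* Per-byte absolute difference with src1 declared read-write ("+r")
     * and the flags clobber spelled out.  Illustrative sketch only. */
    static inline int32_t sad_4pixel_tied(int32_t src1, int32_t src2, int32_t mask)
    {
        int32_t x7;
        __asm__ volatile(
            "eor   %1, %2, %0\n\t"
            "subs  %0, %2, %0\n\t"
            "eor   %1, %1, %0\n\t"
            "and   %1, %3, %1, lsr #1\n\t"
            "orrcc %1, %1, #0x80000000\n\t"
            "rsb   %1, %1, %1, lsl #8\n\t"
            "add   %0, %0, %1, asr #7\n\t"
            "eor   %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
            : "cc");
        return src1;
    }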
+
+    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
+    {
+        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
+
+        x9 = 0x80808080; /* const. */
+        x4 = x5 = 0;
+
+        x8 = (uint32)ref & 0x3;
+        if (x8 == 3)
+            goto SadMBOffset3;
+        if (x8 == 2)
+            goto SadMBOffset2;
+        if (x8 == 1)
+            goto SadMBOffset1;
+
+        x8 = 16;
+///
+__asm__ volatile("MVN %0, #0xFF00": "=r"(x6));
+
+LOOP_SAD0:
+        /****** process 8 pixels ******/
+        x11 = *((int32*)(ref + 12));
+        x10 = *((int32*)(ref + 8));
+        x14 = *((int32*)(blk + 12));
+        x12 = *((int32*)(blk + 8));
+
+        /* process x11 & x14 */
+        x11 = sad_4pixel(x11, x14, x9);
+
+        /* process x12 & x10 */
+        x10 = sad_4pixel(x10, x12, x9);
+
+        x5 = x5 + x10;  /* accumulate low bytes */
+        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
+        x5 = x5 + x11;  /* accumulate low bytes */
+        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
+
+        /****** process 8 pixels ******/
+        x11 = *((int32*)(ref + 4));
+__asm__ volatile("LDR %0, [%1], %2": "=&r"(x10), "=r"(ref): "r"(lx));
+        //x10 = *((int32*)ref); ref += lx;
+        x14 = *((int32*)(blk + 4));
+__asm__ volatile("LDR %0, [%1], #16": "=&r"(x12), "=r"(blk));
+
+        /* process x11 & x14 */
+        x11 = sad_4pixel(x11, x14, x9);
+
+        /* process x12 & x10 */
+        x10 = sad_4pixel(x10, x12, x9);
+
+        x5 = x5 + x10;  /* accumulate low bytes */
+        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
+        x5 = x5 + x11;  /* accumulate low bytes */
+        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
+        x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */
+
+        /****************/
+        x10 = x5 - (x4 << 8);    /* extract low bytes */
+        x10 = x10 + x4;          /* add with high bytes */
+        x10 = x10 + (x10 << 16); /* add with lower half word */
+
+        /****************/
+
+        if (((uint32)x10 >> 16) <= dmin) /* compare with dmin */
+        {
+            if (--x8)
+            {
+                goto LOOP_SAD0;
+            }
+
+        }
+
+        return ((uint32)x10 >> 16);
+
+SadMBOffset3:
+
+        return sad_mb_offset3(ref, blk, lx, dmin);
+
+SadMBOffset2:
+
+        return sad_mb_offset2(ref, blk, lx, dmin);
+
+SadMBOffset1:
+
+        return sad_mb_offset1(ref, blk, lx, dmin);
+    }
+
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SAD_INLINE_H_
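For orientation, each simd_sad_mb variant walks a 16x16 macroblock, accumulates per-byte absolute differences in split low-byte/high-byte sums (x5 and x4), and stops early once the running SAD exceeds dmin. A plain scalar equivalent without the early exit, as a minimal sketch: sad_mb_ref is an invented name, and it assumes lx is the reference-frame row stride while blk is a packed block with a 16-byte stride:

    #include <stdint.h>

    /* Portable equivalent of simd_sad_mb without early termination.
     * Hypothetical reference implementation, for illustration only. */
    static int sad_mb_ref(const uint8_t *ref, const uint8_t *blk, int lx)
    {
        int sad = 0;
        int x, y;
        for (y = 0; y < 16; y++)
        {
            for (x = 0; x < 16; x++)
            {
                int d = ref[x] - blk[x];
                sad += d < 0 ? -d : d;
            }
            ref += lx;  /* next row of the reference frame */
            blk += 16;  /* next row of the packed block    */
        }
        return sad;
    }

The word-parallel version reaches the same total because 16x16 differences of at most 255 sum to at most 65280, which fits in the 16-bit halfword extracted by the final right shift.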