diff options
author | James Dong <jdong@google.com> | 2011-05-31 18:53:46 -0700 |
---|---|---|
committer | James Dong <jdong@google.com> | 2011-06-02 12:32:46 -0700 |
commit | 0c1bc742181ded4930842b46e9507372f0b1b963 (patch) | |
tree | c952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc | |
parent | 92a746c3b18d035189f596ce32847bf26247aaca (diff) | |
download | frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2 |
Initial-checkin for ON2 Software AVC/H264 decoder
o when neon is present, the performance gain of On2 AVC software decoder
over PV software decoder is more than 30%.
o In addition, it fixes some known PV software decoder issues like missing
output frames
o allow both pv and on2 software avc to be available for easy comparison
o change output frames from 8 to 16
Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc')
90 files changed, 21423 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h new file mode 100755 index 0000000..35b510b --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h @@ -0,0 +1,1153 @@ +/** + * + * File Name: armVC.h + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * File: armVideo.h + * Brief: Declares API's/Basic Data types used across the OpenMAX Video domain + * + */ + + +#ifndef _armVideo_H_ +#define _armVideo_H_ + +#include "omxVC.h" +#include "armCOMM_Bitstream.h" + +/** + * ARM specific state structure to hold Motion Estimation information. + */ + +struct m4p2_MESpec +{ + OMXVCM4P2MEParams MEParams; + OMXVCM4P2MEMode MEMode; +}; + +struct m4p10_MESpec +{ + OMXVCM4P10MEParams MEParams; + OMXVCM4P10MEMode MEMode; +}; + +typedef struct m4p2_MESpec ARMVCM4P2_MESpec; +typedef struct m4p10_MESpec ARMVCM4P10_MESpec; + +/** + * Function: armVCM4P2_CompareMV + * + * Description: + * Performs comparision of motion vectors and SAD's to decide the + * best MV and SAD + * + * Remarks: + * + * Parameters: + * [in] mvX x coordinate of the candidate motion vector + * [in] mvY y coordinate of the candidate motion vector + * [in] candSAD Candidate SAD + * [in] bestMVX x coordinate of the best motion vector + * [in] bestMVY y coordinate of the best motion vector + * [in] bestSAD best SAD + * + * Return Value: + * OMX_INT -- 1 to indicate that the current sad is the best + * 0 to indicate that it is NOT the best SAD + */ + +OMX_INT armVCM4P2_CompareMV ( + OMX_S16 mvX, + OMX_S16 mvY, + OMX_INT candSAD, + OMX_S16 bestMVX, + OMX_S16 bestMVY, + OMX_INT bestSAD); + +/** + * Function: armVCM4P2_ACDCPredict + * + * Description: + * Performs adaptive DC/AC coefficient prediction for an intra block. 
Prior + * to the function call, prediction direction (predDir) should be selected + * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2. + * + * Remarks: + * + * Parameters: + * [in] pSrcDst pointer to the coefficient buffer which contains + * the quantized coefficient residuals (PQF) of the + * current block + * [in] pPredBufRow pointer to the coefficient row buffer + * [in] pPredBufCol pointer to the coefficient column buffer + * [in] curQP quantization parameter of the current block. curQP + * may equal to predQP especially when the current + * block and the predictor block are in the same + * macroblock. + * [in] predQP quantization parameter of the predictor block + * [in] predDir indicates the prediction direction which takes one + * of the following values: + * OMX_VIDEO_HORIZONTAL predict horizontally + * OMX_VIDEO_VERTICAL predict vertically + * [in] ACPredFlag a flag indicating if AC prediction should be + * performed. It is equal to ac_pred_flag in the bit + * stream syntax of MPEG-4 + * [in] videoComp video component type (luminance, chrominance or + * alpha) of the current block + * [in] flag This flag defines the if one wants to use this functions to + * calculate PQF (set 1, prediction) or QF (set 0, reconstruction) + * [out] pPreACPredict pointer to the predicted coefficients buffer. + * Filled ONLY if it is not NULL + * [out] pSrcDst pointer to the coefficient buffer which contains + * the quantized coefficients (QF) of the current + * block + * [out] pPredBufRow pointer to the updated coefficient row buffer + * [out] pPredBufCol pointer to the updated coefficient column buffer + * [out] pSumErr pointer to the updated sum of the difference + * between predicted and unpredicted coefficients + * If this is NULL, do not update + * + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. 
+ * + */ + +OMXResult armVCM4P2_ACDCPredict( + OMX_S16 * pSrcDst, + OMX_S16 * pPreACPredict, + OMX_S16 * pPredBufRow, + OMX_S16 * pPredBufCol, + OMX_INT curQP, + OMX_INT predQP, + OMX_INT predDir, + OMX_INT ACPredFlag, + OMXVCM4P2VideoComponent videoComp, + OMX_U8 flag, + OMX_INT *pSumErr +); + +/** + * Function: armVCM4P2_SetPredDir + * + * Description: + * Performs detecting the prediction direction + * + * Remarks: + * + * Parameters: + * [in] blockIndex block index indicating the component type and + * position as defined in subclause 6.1.3.8, of ISO/IEC + * 14496-2. Furthermore, indexes 6 to 9 indicate the + * alpha blocks spatially corresponding to luminance + * blocks 0 to 3 in the same macroblock. + * [in] pCoefBufRow pointer to the coefficient row buffer + * [in] pQpBuf pointer to the quantization parameter buffer + * [out] predQP quantization parameter of the predictor block + * [out] predDir indicates the prediction direction which takes one + * of the following values: + * OMX_VIDEO_HORIZONTAL predict horizontally + * OMX_VIDEO_VERTICAL predict vertically + * + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. + * + */ + +OMXResult armVCM4P2_SetPredDir( + OMX_INT blockIndex, + OMX_S16 *pCoefBufRow, + OMX_S16 *pCoefBufCol, + OMX_INT *predDir, + OMX_INT *predQP, + const OMX_U8 *pQpBuf +); + +/** + * Function: armVCM4P2_EncodeVLCZigzag_Intra + * + * Description: + * Performs zigzag scanning and VLC encoding for one intra block. + * + * Remarks: + * + * Parameters: + * [in] ppBitStream pointer to the pointer to the current byte in + * the bit stream + * [in] pBitOffset pointer to the bit position in the byte pointed + * by *ppBitStream. Valid within 0 to 7. + * [in] pQDctBlkCoef pointer to the quantized DCT coefficient + * [in] predDir AC prediction direction, which is used to decide + * the zigzag scan pattern. This takes one of the + * following values: + * OMX_VIDEO_NONE AC prediction not used. 
+ * Performs classical zigzag + * scan. + * OMX_VIDEO_HORIZONTAL Horizontal prediction. + * Performs alternate-vertical + * zigzag scan. + * OMX_VIDEO_VERTICAL Vertical prediction. + * Performs alternate-horizontal + * zigzag scan. + * [in] pattern block pattern which is used to decide whether + * this block is encoded + * [in] start start indicates whether the encoding begins with 0th element + * or 1st. + * [out] ppBitStream *ppBitStream is updated after the block is encoded, + * so that it points to the current byte in the bit + * stream buffer. + * [out] pBitOffset *pBitOffset is updated so that it points to the + * current bit position in the byte pointed by + * *ppBitStream. + * + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. + * + */ + +OMXResult armVCM4P2_EncodeVLCZigzag_Intra( + OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + const OMX_S16 *pQDctBlkCoef, + OMX_U8 predDir, + OMX_U8 pattern, + OMX_INT shortVideoHeader, + OMX_U8 start +); + +/** + * Function: armVCM4P2_DecodeVLCZigzag_Intra + * + * Description: + * Performs VLC decoding and inverse zigzag scan for one intra coded block. + * + * Remarks: + * + * Parameters: + * [in] ppBitStream pointer to the pointer to the current byte in + * the bitstream buffer + * [in] pBitOffset pointer to the bit position in the byte pointed + * to by *ppBitStream. *pBitOffset is valid within + * [0-7]. + * [in] predDir AC prediction direction which is used to decide + * the zigzag scan pattern. It takes one of the + * following values: + * OMX_VIDEO_NONE AC prediction not used; + * perform classical zigzag scan; + * OMX_VIDEO_HORIZONTAL Horizontal prediction; + * perform alternate-vertical + * zigzag scan; + * OMX_VIDEO_VERTICAL Vertical prediction; + * thus perform + * alternate-horizontal + * zigzag scan. 
+ * [in] videoComp video component type (luminance, chrominance or + * alpha) of the current block + * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0, + * and escape mode 4 is used when shortVideoHeader==1. + * [in] start start indicates whether the encoding begins with 0th element + * or 1st. + * [out] ppBitStream *ppBitStream is updated after the block is + * decoded, so that it points to the current byte + * in the bit stream buffer + * [out] pBitOffset *pBitOffset is updated so that it points to the + * current bit position in the byte pointed by + * *ppBitStream + * [out] pDst pointer to the coefficient buffer of current + * block. Should be 32-bit aligned + * + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. + * + */ + +OMXResult armVCM4P2_DecodeVLCZigzag_Intra( + const OMX_U8 ** ppBitStream, + OMX_INT * pBitOffset, + OMX_S16 * pDst, + OMX_U8 predDir, + OMX_INT shortVideoHeader, + OMX_U8 start +); + +/** + * Function: armVCM4P2_FillVLDBuffer + * + * Description: + * Performs filling of the coefficient buffer according to the run, level + * and sign, also updates the index + * + * Parameters: + * [in] storeRun Stored Run value (count of zeros) + * [in] storeLevel Stored Level value (non-zero value) + * [in] sign Flag indicating the sign of level + * [in] last status of the last flag + * [in] pIndex pointer to coefficient index in 8x8 matrix + * [out] pIndex pointer to updated coefficient index in 8x8 + * matrix + * [in] pZigzagTable pointer to the zigzag tables + * [out] pDst pointer to the coefficient buffer of current + * block. Should be 32-bit aligned + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. 
+ * + */ + +OMXResult armVCM4P2_FillVLDBuffer( + OMX_U32 storeRun, + OMX_S16 * pDst, + OMX_S16 storeLevel, + OMX_U8 sign, + OMX_U8 last, + OMX_U8 * index, + const OMX_U8 * pZigzagTable +); + +/** + * Function: armVCM4P2_GetVLCBits + * + * Description: + * Performs escape mode decision based on the run, run+, level, level+ and + * last combinations. + * + * Remarks: + * + * Parameters: + * [in] ppBitStream pointer to the pointer to the current byte in + * the bit stream + * [in] pBitOffset pointer to the bit position in the byte pointed + * by *ppBitStream. Valid within 0 to 7 + * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0, + * and escape mode 4 is used when shortVideoHeader==1. + * [in] start start indicates whether the encoding begins with + * 0th element or 1st. + * [in/out] pLast pointer to last status flag + * [in] runBeginSingleLevelEntriesL0 The run value from which level + * will be equal to 1: last == 0 + * [in] IndexBeginSingleLevelEntriesL0 Array index in the VLC table + * pointing to the + * runBeginSingleLevelEntriesL0 + * [in] runBeginSingleLevelEntriesL1 The run value from which level + * will be equal to 1: last == 1 + * [in] IndexBeginSingleLevelEntriesL1 Array index in the VLC table + * pointing to the + * runBeginSingleLevelEntriesL0 + * [in] pRunIndexTableL0 Run Index table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 0 + * [in] pVlcTableL0 VLC table for last == 0 + * [in] pRunIndexTableL1 Run Index table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 1 + * [in] pVlcTableL1 VLC table for last == 1 + * [in] pLMAXTableL0 Level MAX table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 0 + * [in] pLMAXTableL1 Level MAX table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 1 + * [in] pRMAXTableL0 Run MAX table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 0 + * [in] pRMAXTableL1 Run MAX table defined in + * 
armVCM4P2_Huff_Tables_VLC.c for last == 1 + * [out]pDst pointer to the coefficient buffer of current + * block. Should be 32-bit aligned + * + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. + * + */ + +OMXResult armVCM4P2_GetVLCBits ( + const OMX_U8 **ppBitStream, + OMX_INT * pBitOffset, + OMX_S16 * pDst, + OMX_INT shortVideoHeader, + OMX_U8 start, + OMX_U8 * pLast, + OMX_U8 runBeginSingleLevelEntriesL0, + OMX_U8 maxIndexForMultipleEntriesL0, + OMX_U8 maxRunForMultipleEntriesL1, + OMX_U8 maxIndexForMultipleEntriesL1, + const OMX_U8 * pRunIndexTableL0, + const ARM_VLC32 *pVlcTableL0, + const OMX_U8 * pRunIndexTableL1, + const ARM_VLC32 *pVlcTableL1, + const OMX_U8 * pLMAXTableL0, + const OMX_U8 * pLMAXTableL1, + const OMX_U8 * pRMAXTableL0, + const OMX_U8 * pRMAXTableL1, + const OMX_U8 * pZigzagTable +); + +/** + * Function: armVCM4P2_PutVLCBits + * + * Description: + * Checks the type of Escape Mode and put encoded bits for + * quantized DCT coefficients. + * + * Remarks: + * + * Parameters: + * [in] ppBitStream pointer to the pointer to the current byte in + * the bit stream + * [in] pBitOffset pointer to the bit position in the byte pointed + * by *ppBitStream. Valid within 0 to 7 + * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0, + * and escape mode 4 is used when shortVideoHeader==1. + * [in] start start indicates whether the encoding begins with + * 0th element or 1st. 
+ * [in] maxStoreRunL0 Max store possible (considering last and inter/intra) + * for last = 0 + * [in] maxStoreRunL1 Max store possible (considering last and inter/intra) + * for last = 1 + * [in] maxRunForMultipleEntriesL0 + * The run value after which level + * will be equal to 1: + * (considering last and inter/intra status) for last = 0 + * [in] maxRunForMultipleEntriesL1 + * The run value after which level + * will be equal to 1: + * (considering last and inter/intra status) for last = 1 + * [in] pRunIndexTableL0 Run Index table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 0 + * [in] pVlcTableL0 VLC table for last == 0 + * [in] pRunIndexTableL1 Run Index table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 1 + * [in] pVlcTableL1 VLC table for last == 1 + * [in] pLMAXTableL0 Level MAX table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 0 + * [in] pLMAXTableL1 Level MAX table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 1 + * [in] pRMAXTableL0 Run MAX table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 0 + * [in] pRMAXTableL1 Run MAX table defined in + * armVCM4P2_Huff_Tables_VLC.c for last == 1 + * [out] pQDctBlkCoef pointer to the quantized DCT coefficient + * [out] ppBitStream *ppBitStream is updated after the block is encoded + * so that it points to the current byte in the bit + * stream buffer. + * [out] pBitOffset *pBitOffset is updated so that it points to the + * current bit position in the byte pointed by + * *ppBitStream. + * + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. 
+ * + */ + + +OMXResult armVCM4P2_PutVLCBits ( + OMX_U8 **ppBitStream, + OMX_INT * pBitOffset, + const OMX_S16 *pQDctBlkCoef, + OMX_INT shortVideoHeader, + OMX_U8 start, + OMX_U8 maxStoreRunL0, + OMX_U8 maxStoreRunL1, + OMX_U8 maxRunForMultipleEntriesL0, + OMX_U8 maxRunForMultipleEntriesL1, + const OMX_U8 * pRunIndexTableL0, + const ARM_VLC32 *pVlcTableL0, + const OMX_U8 * pRunIndexTableL1, + const ARM_VLC32 *pVlcTableL1, + const OMX_U8 * pLMAXTableL0, + const OMX_U8 * pLMAXTableL1, + const OMX_U8 * pRMAXTableL0, + const OMX_U8 * pRMAXTableL1, + const OMX_U8 * pZigzagTable +); +/** + * Function: armVCM4P2_FillVLCBuffer + * + * Description: + * Performs calculating the VLC bits depending on the escape type and insert + * the same in the bitstream + * + * Remarks: + * + * Parameters: + * [in] ppBitStream pointer to the pointer to the current byte in + * the bit stream + * [in] pBitOffset pointer to the bit position in the byte pointed + * by *ppBitStream. Valid within 0 to 7 + * [in] run Run value (count of zeros) to be encoded + * [in] level Level value (non-zero value) to be encoded + * [in] runPlus Calculated as runPlus = run - (RMAX + 1) + * [in] levelPlus Calculated as + * levelPlus = sign(level)*[abs(level) - LMAX] + * [in] fMode Flag indicating the escape modes + * [in] last status of the last flag + * [in] maxRunForMultipleEntries + * The run value after which level will be equal to 1: + * (considering last and inter/intra status) + * [in] pRunIndexTable Run Index table defined in + * armVCM4P2_Huff_tables_VLC.h + * [in] pVlcTable VLC table defined in armVCM4P2_Huff_tables_VLC.h + * [out] ppBitStream *ppBitStream is updated after the block is encoded + * so that it points to the current byte in the bit + * stream buffer. + * [out] pBitOffset *pBitOffset is updated so that it points to the + * current bit position in the byte pointed by + * *ppBitStream. + * + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. 
+ * + */ + +OMXResult armVCM4P2_FillVLCBuffer ( + OMX_U8 **ppBitStream, + OMX_INT * pBitOffset, + OMX_U32 run, + OMX_S16 level, + OMX_U32 runPlus, + OMX_S16 levelPlus, + OMX_U8 fMode, + OMX_U8 last, + OMX_U8 maxRunForMultipleEntries, + const OMX_U8 *pRunIndexTable, + const ARM_VLC32 *pVlcTable +); + +/** + * Function: armVCM4P2_CheckVLCEscapeMode + * + * Description: + * Performs escape mode decision based on the run, run+, level, level+ and + * last combinations. + * + * Remarks: + * + * Parameters: + * [in] run Run value (count of zeros) to be encoded + * [in] level Level value (non-zero value) to be encoded + * [in] runPlus Calculated as runPlus = run - (RMAX + 1) + * [in] levelPlus Calculated as + * levelPlus = sign(level)*[abs(level) - LMAX] + * [in] maxStoreRun Max store possible (considering last and inter/intra) + * [in] maxRunForMultipleEntries + * The run value after which level + * will be equal to 1: + * (considering last and inter/intra status) + * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0, + * and escape mode 4 is used when shortVideoHeader==1. + * [in] pRunIndexTable Run Index table defined in + * armVCM4P2_Huff_Tables_VLC.c + * (considering last and inter/intra status) + * + * + * Return Value: + * Returns an Escape mode which can take values from 0 to 3 + * 0 --> no escape mode, 1 --> escape type 1, + * 2 --> escape type 2, 3 --> escape type 3, check section 7.4.1.3 + * in the MPEG ISO standard. + * + */ + +OMX_U8 armVCM4P2_CheckVLCEscapeMode( + OMX_U32 run, + OMX_U32 runPlus, + OMX_S16 level, + OMX_S16 levelPlus, + OMX_U8 maxStoreRun, + OMX_U8 maxRunForMultipleEntries, + OMX_INT shortVideoHeader, + const OMX_U8 *pRunIndexTable +); + + +/** + * Function: armVCM4P2_BlockMatch_Integer + * + * Description: + * Performs a 16x16 block search; estimates motion vector and associated minimum SAD. 
+ * Both the input and output motion vectors are represented using half-pixel units, and + * therefore a shift left or right by 1 bit may be required, respectively, to match the + * input or output MVs with other functions that either generate output MVs or expect + * input MVs represented using integer pixel units. + * + * Remarks: + * + * Parameters: + * [in] pSrcRefBuf pointer to the reference Y plane; points to the reference MB that + * corresponds to the location of the current macroblock in the current + * plane. + * [in] refWidth width of the reference plane + * [in] pRefRect pointer to the valid rectangle in reference plane. Relative to image origin. + * It's not limited to the image boundary, but depends on the padding. For example, + * if you pad 4 pixels outside the image border, then the value for left border + * can be -4 + * [in] pSrcCurrBuf pointer to the current macroblock extracted from original plane (linear array, + * 256 entries); must be aligned on an 8-byte boundary. + * [in] pCurrPointPos position of the current macroblock in the current plane + * [in] pSrcPreMV pointer to predicted motion vector; NULL indicates no predicted MV + * [in] pSrcPreSAD pointer to SAD associated with the predicted MV (referenced by pSrcPreMV) + * [in] searchRange search range for 16X16 integer block, in full-pixel units; the search range + * is the same in all directions. It is inclusive of the boundary and specified in + * terms of integer pixel units. + * [in] pMESpec vendor-specific motion estimation specification structure; must have been allocated + * and then initialized using omxVCM4P2_MEInit prior to calling the block matching + * function. + * [in] BlockSize MacroBlock Size i.e. either 16x16 or 8x8. + * [out] pDstMV pointer to estimated MV + * [out] pDstSAD pointer to minimum SAD + * + * Return Value: + * OMX_Sts_NoErr - no error. 
+ * OMX_Sts_BadArgErr - bad arguments + * + */ + +OMXResult armVCM4P2_BlockMatch_Integer( + const OMX_U8 *pSrcRefBuf, + OMX_INT refWidth, + const OMXRect *pRefRect, + const OMX_U8 *pSrcCurrBuf, + const OMXVCM4P2Coordinate *pCurrPointPos, + const OMXVCMotionVector *pSrcPreMV, + const OMX_INT *pSrcPreSAD, + void *pMESpec, + OMXVCMotionVector *pDstMV, + OMX_INT *pDstSAD, + OMX_U8 BlockSize +); + +/** + * Function: armVCM4P2_BlockMatch_Half + * + * Description: + * Performs a 16x16 block match with half-pixel resolution. Returns the estimated + * motion vector and associated minimum SAD. This function estimates the half-pixel + * motion vector by interpolating the integer resolution motion vector referenced + * by the input parameter pSrcDstMV, i.e., the initial integer MV is generated + * externally. The input parameters pSrcRefBuf and pSearchPointRefPos should be + * shifted by the winning MV of 16x16 integer search prior to calling BlockMatch_Half_16x16. + * The function BlockMatch_Integer_16x16 may be used for integer motion estimation. + * + * Remarks: + * + * Parameters: + * [in] pSrcRefBuf pointer to the reference Y plane; points to the reference MB + * that corresponds to the location of the current macroblock in + * the current plane. + * [in] refWidth width of the reference plane + * [in] pRefRect reference plane valid region rectangle + * [in] pSrcCurrBuf pointer to the current macroblock extracted from original plane + * (linear array, 256 entries); must be aligned on an 8-byte boundary. + * [in] pSearchPointRefPos position of the starting point for half pixel search (specified + * in terms of integer pixel units) in the reference plane. + * [in] rndVal rounding control bit for half pixel motion estimation; + * 0=rounding control disabled; 1=rounding control enabled + * [in] pSrcDstMV pointer to the initial MV estimate; typically generated during a prior + * 16X16 integer search and its unit is half pixel. 
+ * [in] BlockSize MacroBlock Size i.e. either 16x16 or 8x8. + * [out]pSrcDstMV pointer to estimated MV + * [out]pDstSAD pointer to minimum SAD + * + * Return Value: + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * + */ + +OMXResult armVCM4P2_BlockMatch_Half( + const OMX_U8 *pSrcRefBuf, + OMX_INT refWidth, + const OMXRect *pRefRect, + const OMX_U8 *pSrcCurrBuf, + const OMXVCM4P2Coordinate *pSearchPointRefPos, + OMX_INT rndVal, + OMXVCMotionVector *pSrcDstMV, + OMX_INT *pDstSAD, + OMX_U8 BlockSize +); +/** + * Function: armVCM4P2_PadMV + * + * Description: + * Performs motion vector padding for a macroblock. + * + * Remarks: + * + * Parameters: + * [in] pSrcDstMV pointer to motion vector buffer of the current + * macroblock + * [in] pTransp pointer to transparent status buffer of the + * current macroblock + * [out] pSrcDstMV pointer to motion vector buffer in which the + * motion vectors have been padded + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. + * + */ + +OMXResult armVCM4P2_PadMV( + OMXVCMotionVector * pSrcDstMV, + OMX_U8 * pTransp +); + +/* + * H.264 Specific Declarations + */ +/* Defines */ +#define ARM_M4P10_Q_OFFSET (15) + + +/* Dequant tables */ + +extern const OMX_U8 armVCM4P10_PosToVCol4x4[16]; +extern const OMX_U8 armVCM4P10_PosToVCol2x2[4]; +extern const OMX_U8 armVCM4P10_VMatrix[6][3]; +extern const OMX_U32 armVCM4P10_MFMatrix[6][3]; + + +/* + * Description: + * This function performs the work required by the OpenMAX + * DecodeCoeffsToPair function and DecodeChromaDCCoeffsToPair. + * Since most of the code is common we share it here. 
+ * + * Parameters: + * [in] ppBitStream Double pointer to current byte in bit stream buffer + * [in] pOffset Pointer to current bit position in the byte pointed + * to by *ppBitStream + * [in] sMaxNumCoeff Maximum number of non-zero coefficients in current + * block (4,15 or 16) + * [in] nTable Table number (0 to 4) according to the five columns + * of Table 9-5 in the H.264 spec + * [out] ppBitStream *ppBitStream is updated after each block is decoded + * [out] pOffset *pOffset is updated after each block is decoded + * [out] pNumCoeff Pointer to the number of nonzero coefficients in + * this block + * [out] ppPosCoefbuf Double pointer to destination residual + * coefficient-position pair buffer + * Return Value: + * Standard OMXResult result. See enumeration for possible result codes. + + */ + +OMXResult armVCM4P10_DecodeCoeffsToPair( + const OMX_U8** ppBitStream, + OMX_S32* pOffset, + OMX_U8* pNumCoeff, + OMX_U8**ppPosCoefbuf, + OMX_INT nTable, + OMX_INT sMaxNumCoeff + ); + +/* + * Description: + * Perform DC style intra prediction, averaging upper and left block + * + * Parameters: + * [in] pSrcLeft Pointer to the buffer of 4 left coefficients: + * p[x, y] (x = -1, y = 0..3) + * [in] pSrcAbove Pointer to the buffer of 4 above coefficients: + * p[x,y] (x = 0..3, y = -1) + * [in] leftStep Step of left coefficient buffer + * [in] dstStep Step of the destination buffer + * [in] availability Neighboring 16x16 MB availability flag + * [out] pDst Pointer to the destination buffer + * + * Return Value: + * None + */ + +void armVCM4P10_PredictIntraDC4x4( + const OMX_U8* pSrcLeft, + const OMX_U8 *pSrcAbove, + OMX_U8* pDst, + OMX_INT leftStep, + OMX_INT dstStep, + OMX_S32 availability +); + +/* + * Description + * Unpack a 4x4 block of coefficient-residual pair values + * + * Parameters: + * [in] ppSrc Double pointer to residual coefficient-position pair + * buffer output by CAVLC decoding + * [out] ppSrc *ppSrc is updated to the start of next non empty block + * [out] 
pDst Pointer to unpacked 4x4 block + */ + +void armVCM4P10_UnpackBlock4x4( + const OMX_U8 **ppSrc, + OMX_S16* pDst +); + +/* + * Description + * Unpack a 2x2 block of coefficient-residual pair values + * + * Parameters: + * [in] ppSrc Double pointer to residual coefficient-position pair + * buffer output by CAVLC decoding + * [out] ppSrc *ppSrc is updated to the start of next non empty block + * [out] pDst Pointer to unpacked 2x2 block + */ + +void armVCM4P10_UnpackBlock2x2( + const OMX_U8 **ppSrc, + OMX_S16* pDst +); + +/* + * Description + * Deblock one boundary pixel + * + * Parameters: + * [in] pQ0 Pointer to pixel q0 + * [in] Step Step between pixels q0 and q1 + * [in] tC0 Edge threshold value + * [in] alpha alpha threshold value + * [in] beta beta threshold value + * [in] bS deblocking strength + * [in] ChromaFlag True for chroma blocks + * [out] pQ0 Deblocked pixels + * + */ + +void armVCM4P10_DeBlockPixel( + OMX_U8 *pQ0, /* pointer to the pixel q0 */ + int Step, /* step between pixels q0 and q1 */ + int tC0, /* edge threshold value */ + int alpha, /* alpha */ + int beta, /* beta */ + int bS, /* deblocking strength */ + int ChromaFlag +); + +/** + * Function: armVCM4P10_InterpolateHalfHor_Luma + * + * Description: + * This function performs interpolation for horizontal 1/2-pel positions + * + * Remarks: + * + * [in] pSrc Pointer to top-left corner of block used to interpolate + * in the reconstructed frame plane + * [in] iSrcStep Step of the source buffer. + * [in] iDstStep Step of the destination(interpolation) buffer. + * [in] iWidth Width of the current block + * [in] iHeight Height of the current block + * [out] pDst Pointer to the interpolation buffer of the 1/2-pel + * + * Return Value: + * Standard OMXResult value. 
+ * + */ + +OMXResult armVCM4P10_InterpolateHalfHor_Luma( + const OMX_U8* pSrc, + OMX_U32 iSrcStep, + OMX_U8* pDst, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight +); + +/** + * Function: armVCM4P10_InterpolateHalfVer_Luma + * + * Description: + * This function performs interpolation for vertical 1/2-pel positions + * around a full-pel position. + * + * Remarks: + * + * [in] pSrc Pointer to top-left corner of block used to interpolate + * in the reconstructed frame plane + * [in] iSrcStep Step of the source buffer. + * [in] iDstStep Step of the destination(interpolation) buffer. + * [in] iWidth Width of the current block + * [in] iHeight Height of the current block + * [out] pDst Pointer to the interpolation buffer of the 1/2-pel + * + * Return Value: + * Standard OMXResult value. + * + */ + +OMXResult armVCM4P10_InterpolateHalfVer_Luma( + const OMX_U8* pSrc, + OMX_U32 iSrcStep, + OMX_U8* pDst, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight +); + +/** + * Function: armVCM4P10_InterpolateHalfDiag_Luma + * + * Description: + * This function performs interpolation for (1/2, 1/2) positions + * around a full-pel position. + * + * Remarks: + * + * [in] pSrc Pointer to top-left corner of block used to interpolate + * in the reconstructed frame plane + * [in] iSrcStep Step of the source buffer. + * [in] iDstStep Step of the destination(interpolation) buffer. + * [in] iWidth Width of the current block + * [in] iHeight Height of the current block + * [out] pDst Pointer to the interpolation buffer of the (1/2,1/2)-pel + * + * Return Value: + * Standard OMXResult value. 
+ * + */ + +OMXResult armVCM4P10_InterpolateHalfDiag_Luma( + const OMX_U8* pSrc, + OMX_U32 iSrcStep, + OMX_U8* pDst, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight +); + +/* + * Description: + * Transform Residual 4x4 Coefficients + * + * Parameters: + * [in] pSrc Source 4x4 block + * [out] pDst Destination 4x4 block + * + */ + +void armVCM4P10_TransformResidual4x4(OMX_S16* pDst, OMX_S16 *pSrc); + +/* + * Description: + * Forward Transform Residual 4x4 Coefficients + * + * Parameters: + * [in] pSrc Source 4x4 block + * [out] pDst Destination 4x4 block + * + */ + +void armVCM4P10_FwdTransformResidual4x4(OMX_S16* pDst, OMX_S16 *pSrc); + +OMX_INT armVCM4P10_CompareMotionCostToMV ( + OMX_S16 mvX, + OMX_S16 mvY, + OMXVCMotionVector diffMV, + OMX_INT candSAD, + OMXVCMotionVector *bestMV, + OMX_U32 nLamda, + OMX_S32 *pBestCost); + +/** + * Function: armVCCOMM_SAD + * + * Description: + * This function calculate the SAD for NxM blocks. + * + * Remarks: + * + * [in] pSrcOrg Pointer to the original block + * [in] iStepOrg Step of the original block buffer + * [in] pSrcRef Pointer to the reference block + * [in] iStepRef Step of the reference block buffer + * [in] iHeight Height of the block + * [in] iWidth Width of the block + * [out] pDstSAD Pointer of result SAD + * + * Return Value: + * Standard OMXResult value. + * + */ +OMXResult armVCCOMM_SAD( + const OMX_U8* pSrcOrg, + OMX_U32 iStepOrg, + const OMX_U8* pSrcRef, + OMX_U32 iStepRef, + OMX_S32* pDstSAD, + OMX_U32 iHeight, + OMX_U32 iWidth); + +/** + * Function: armVCCOMM_Average + * + * Description: + * This function calculates the average of two blocks and stores the result. 
+ * + * Remarks: + * + * [in] pPred0 Pointer to the top-left corner of reference block 0 + * [in] pPred1 Pointer to the top-left corner of reference block 1 + * [in] iPredStep0 Step of reference block 0 + * [in] iPredStep1 Step of reference block 1 + * [in] iDstStep Step of the destination buffer + * [in] iWidth Width of the blocks + * [in] iHeight Height of the blocks + * [out] pDstPred Pointer to the destination buffer + * + * Return Value: + * Standard OMXResult value. + * + */ + OMXResult armVCCOMM_Average ( + const OMX_U8* pPred0, + const OMX_U8* pPred1, + OMX_U32 iPredStep0, + OMX_U32 iPredStep1, + OMX_U8* pDstPred, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight +); + +/** + * Function: armVCM4P10_SADQuar + * + * Description: + * This function calculates the SAD between one block (pSrc) and the + * average of the other two (pSrcRef0 and pSrcRef1) + * + * Remarks: + * + * [in] pSrc Pointer to the original block + * [in] pSrcRef0 Pointer to reference block 0 + * [in] pSrcRef1 Pointer to reference block 1 + * [in] iSrcStep Step of the original block buffer + * [in] iRefStep0 Step of reference block 0 + * [in] iRefStep1 Step of reference block 1 + * [in] iHeight Height of the block + * [in] iWidth Width of the block + * [out] pDstSAD Pointer of result SAD + * + * Return Value: + * Standard OMXResult value. + * + */ +OMXResult armVCM4P10_SADQuar( + const OMX_U8* pSrc, + const OMX_U8* pSrcRef0, + const OMX_U8* pSrcRef1, + OMX_U32 iSrcStep, + OMX_U32 iRefStep0, + OMX_U32 iRefStep1, + OMX_U32* pDstSAD, + OMX_U32 iHeight, + OMX_U32 iWidth +); + +/** + * Function: armVCM4P10_Interpolate_Chroma + * + * Description: + * This function performs interpolation for chroma components. + * + * Remarks: + * + * [in] pSrc Pointer to top-left corner of block used to + * interpolate in the reconstructed frame plane + * [in] iSrcStep Step of the source buffer. + * [in] iDstStep Step of the destination(interpolation) buffer. 
+ * [in] iWidth Width of the current block + * [in] iHeight Height of the current block + * [in] dx Fractional part of horizontal motion vector + * component in 1/8 pixel unit (0~7) + * [in] dy Fractional part of vertical motion vector + * component in 1/8 pixel unit (0~7) + * [out] pDst Pointer to the interpolation buffer + * + * Return Value: + * Standard OMXResult value. + * + */ + OMXResult armVCM4P10_Interpolate_Chroma( + OMX_U8 *pSrc, + OMX_U32 iSrcStep, + OMX_U8 *pDst, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight, + OMX_U32 dx, + OMX_U32 dy +); + +/** + * Function: armVCM4P10_Interpolate_Luma + * + * Description: + * This function performs interpolation for luma components. + * + * Remarks: + * + * [in] pSrc Pointer to top-left corner of block used to + * interpolate in the reconstructed frame plane + * [in] iSrcStep Step of the source buffer. + * [in] iDstStep Step of the destination(interpolation) buffer. + * [in] iWidth Width of the current block + * [in] iHeight Height of the current block + * [in] dx Fractional part of horizontal motion vector + * component in 1/4 pixel unit (0~3) + * [in] dy Fractional part of vertical motion vector + * component in 1/4 pixel unit (0~3) + * [out] pDst Pointer to the interpolation buffer + * + * Return Value: + * Standard OMXResult value. + * + */ + + OMXResult armVCM4P10_Interpolate_Luma( + const OMX_U8 *pSrc, + OMX_U32 iSrcStep, + OMX_U8 *pDst, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight, + OMX_U32 dx, + OMX_U32 dy +); + +/** + * Function: omxVCH264_DequantTransformACFromPair_U8_S16_C1_DLx + * + * Description: + * Reconstruct the 4x4 residual block from coefficient-position pair buffer, + * perform dequantisation and integer inverse transformation for 4x4 block of + * residuals and update the pair buffer pointer to next non-empty block. 
+ * + * Remarks: + * + * Parameters: + * [in] ppSrc Double pointer to residual coefficient-position + * pair buffer output by CAVLC decoding + * [in] pDC Pointer to the DC coefficient of this block, NULL + * if it doesn't exist + * [in] QP Quantization parameter + * [in] AC Flag indicating if at least one non-zero coefficient exists + * [out] pDst pointer to the reconstructed 4x4 block data + * + * Return Value: + * Standard omxError result. See enumeration for possible result codes. + * + */ + +OMXResult armVCM4P10_DequantTransformACFromPair_U8_S16_C1_DLx( + OMX_U8 **ppSrc, + OMX_S16 *pDst, + OMX_INT QP, + OMX_S16* pDC, + int AC +); + +#endif /*_armVideo_H_*/ + +/*End of File*/ + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h new file mode 100755 index 0000000..32a0166 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h @@ -0,0 +1,72 @@ +;// +;// +;// File Name: armVCCOMM_s.h +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// +;// ARM optimized OpenMAX AC header file +;// +;// Formula used: +;// MACRO for calculating median for three values. + + + + IF :LNOT::DEF:ARMVCCOMM_S_H + INCLUDE armCOMM_s.h + M_VARIANTS CortexA8, ARM1136JS + + IF ARM1136JS :LOR: CortexA8 + + ;///* + ;// * Macro: M_MEDIAN3 + ;// * + ;// * Description: Finds the median of three numbers + ;// * + ;// * Remarks: + ;// * + ;// * Parameters: + ;// * [in] x First entry for the list of three numbers. + ;// * [in] y Second entry for the list of three numbers. + ;// * Input value may be corrupted at the end of + ;// * the execution of this macro. + ;// * [in] z Third entry of the list of three numbers. + ;// * Input value corrupted at the end of the + ;// * execution of this macro. + ;// * [in] t Temporary scratch register. 
+ ;// * [out] z Median of the three numbers. + ;// */ + + MACRO + + M_MEDIAN3 $x, $y, $z, $t + + SUBS $t, $y, $z; // if (y < z) + ADDLT $z, $z, $t; // swap y and z + SUBLT $y, $y, $t; + + ;// Now z' <= y', so there are three cases for the + ;// median value, depending on x. + + ;// 1) x <= z' <= y' : median value is z' + ;// 2) z' <= x <= y' : median value is x + ;// 3) z' <= y' <= x : median value is y' + + CMP $z, $x; // if ( x > min(y,z) ) + MOVLT $z, $x; // ans = x + + CMP $x, $y; // if ( x > max(y,z) ) + MOVGT $z, $y; // ans = max(y,z) + + MEND + ENDIF + + + + ENDIF ;// ARMVCCOMM_S_H + + END
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h new file mode 100755 index 0000000..7b3cc72 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h @@ -0,0 +1,4381 @@ +/** + * File: omxVC.h + * Brief: OpenMAX DL v1.0.2 - Video Coding library + * + * Copyright © 2005-2008 The Khronos Group Inc. All Rights Reserved. + * + * These materials are protected by copyright laws and contain material + * proprietary to the Khronos Group, Inc. You may use these materials + * for implementing Khronos specifications, without altering or removing + * any trademark, copyright or other notice from the specification. + * + * Khronos Group makes no, and expressly disclaims any, representations + * or warranties, express or implied, regarding these materials, including, + * without limitation, any implied warranties of merchantability or fitness + * for a particular purpose or non-infringement of any intellectual property. + * Khronos Group makes no, and expressly disclaims any, warranties, express + * or implied, regarding the correctness, accuracy, completeness, timeliness, + * and reliability of these materials. + * + * Under no circumstances will the Khronos Group, or any of its Promoters, + * Contributors or Members or their respective partners, officers, directors, + * employees, agents or representatives be liable for any damages, whether + * direct, indirect, special or consequential damages for lost revenues, + * lost profits, or otherwise, arising from or in connection with these + * materials. + * + * Khronos and OpenMAX are trademarks of the Khronos Group Inc. 
+ * + */ + +/* *****************************************************************************************/ + +#ifndef _OMXVC_H_ +#define _OMXVC_H_ + +#include "omxtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +/* 6.1.1.1 Motion Vectors */ +/* In omxVC, motion vectors are represented as follows: */ + +typedef struct { + OMX_S16 dx; + OMX_S16 dy; +} OMXVCMotionVector; + + + +/** + * Function: omxVCCOMM_Average_8x (6.1.3.1.1) + * + * Description: + * This function calculates the average of two 8x4, 8x8, or 8x16 blocks. The + * result is rounded according to (a+b+1)/2. The block average function can + * be used in conjunction with half-pixel interpolation to obtain quarter + * pixel motion estimates, as described in [ISO14496-10], subclause 8.4.2.2.1. + * + * Input Arguments: + * + * pPred0 - Pointer to the top-left corner of reference block 0 + * pPred1 - Pointer to the top-left corner of reference block 1 + * iPredStep0 - Step of reference block 0 + * iPredStep1 - Step of reference block 1 + * iDstStep - Step of the destination buffer. + * iHeight - Height of the blocks + * + * Output Arguments: + * + * pDstPred - Pointer to the destination buffer. 8-byte aligned. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned under any of the following + * conditions: + * - one or more of the following pointers is NULL: pPred0, pPred1, or + * pDstPred. + * - pDstPred is not aligned on an 8-byte boundary. + * - iPredStep0 <= 0 or iPredStep0 is not a multiple of 8. + * - iPredStep1 <= 0 or iPredStep1 is not a multiple of 8. + * - iDstStep <= 0 or iDstStep is not a multiple of 8. + * - iHeight is not 4, 8, or 16. 
+ * + */ +OMXResult omxVCCOMM_Average_8x ( + const OMX_U8 *pPred0, + const OMX_U8 *pPred1, + OMX_U32 iPredStep0, + OMX_U32 iPredStep1, + OMX_U8 *pDstPred, + OMX_U32 iDstStep, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCCOMM_Average_16x (6.1.3.1.2) + * + * Description: + * This function calculates the average of two 16x16 or 16x8 blocks. The + * result is rounded according to (a+b+1)/2. The block average function can + * be used in conjunction with half-pixel interpolation to obtain quarter + * pixel motion estimates, as described in [ISO14496-10], subclause 8.4.2.2.1. + * + * Input Arguments: + * + * pPred0 - Pointer to the top-left corner of reference block 0 + * pPred1 - Pointer to the top-left corner of reference block 1 + * iPredStep0 - Step of reference block 0 + * iPredStep1 - Step of reference block 1 + * iDstStep - Step of the destination buffer + * iHeight - Height of the blocks + * + * Output Arguments: + * + * pDstPred - Pointer to the destination buffer. 16-byte aligned. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned under any of the following + * conditions: + * - one or more of the following pointers is NULL: pPred0, pPred1, or + * pDstPred. + * - pDstPred is not aligned on a 16-byte boundary. + * - iPredStep0 <= 0 or iPredStep0 is not a multiple of 16. + * - iPredStep1 <= 0 or iPredStep1 is not a multiple of 16. + * - iDstStep <= 0 or iDstStep is not a multiple of 16. + * - iHeight is not 8 or 16. + * + */ +OMXResult omxVCCOMM_Average_16x ( + const OMX_U8 *pPred0, + const OMX_U8 *pPred1, + OMX_U32 iPredStep0, + OMX_U32 iPredStep1, + OMX_U8 *pDstPred, + OMX_U32 iDstStep, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCCOMM_ExpandFrame_I (6.1.3.2.1) + * + * Description: + * This function expands a reconstructed frame in-place. 
The unexpanded + * source frame should be stored in a plane buffer with sufficient space + * pre-allocated for edge expansion, and the input frame should be located in + * the plane buffer center. This function executes the pixel expansion by + * replicating source frame edge pixel intensities in the empty pixel + * locations (expansion region) between the source frame edge and the plane + * buffer edge. The width/height of the expansion regions on the + * horizontal/vertical edges is controlled by the parameter iExpandPels. + * + * Input Arguments: + * + * pSrcDstPlane - pointer to the top-left corner of the frame to be + * expanded; must be aligned on an 8-byte boundary. + * iFrameWidth - frame width; must be a multiple of 8. + * iFrameHeight -frame height; must be a multiple of 8. + * iExpandPels - number of pixels to be expanded in the horizontal and + * vertical directions; must be a multiple of 8. + * iPlaneStep - distance, in bytes, between the start of consecutive lines + * in the plane buffer; must be larger than or equal to + * (iFrameWidth + 2 * iExpandPels). + * + * Output Arguments: + * + * pSrcDstPlane -Pointer to the top-left corner of the frame (NOT the + * top-left corner of the plane); must be aligned on an 8-byte + * boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned under any of the following + * conditions: + * - pSrcDstPlane is NULL. + * - pSrcDstPlane is not aligned on an 8-byte boundary. + * - one of the following parameters is either equal to zero or is a + * non-multiple of 8: iFrameHeight, iFrameWidth, iPlaneStep, or + * iExpandPels. + * - iPlaneStep < (iFrameWidth + 2 * iExpandPels). + * + */ +OMXResult omxVCCOMM_ExpandFrame_I ( + OMX_U8 *pSrcDstPlane, + OMX_U32 iFrameWidth, + OMX_U32 iFrameHeight, + OMX_U32 iExpandPels, + OMX_U32 iPlaneStep +); + + + +/** + * Function: omxVCCOMM_Copy8x8 (6.1.3.3.1) + * + * Description: + * Copies the reference 8x8 block to the current block. 
+ * + * Input Arguments: + * + * pSrc - pointer to the reference block in the source frame; must be + * aligned on an 8-byte boundary. + * step - distance between the starts of consecutive lines in the reference + * frame, in bytes; must be a multiple of 8 and must be larger than + * or equal to 8. + * + * Output Arguments: + * + * pDst - pointer to the destination block; must be aligned on an 8-byte + * boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned under any of the following + * conditions: + * - one or more of the following pointers is NULL: pSrc, pDst + * - one or more of the following pointers is not aligned on an 8-byte + * boundary: pSrc, pDst + * - step <8 or step is not a multiple of 8. + * + */ +OMXResult omxVCCOMM_Copy8x8 ( + const OMX_U8 *pSrc, + OMX_U8 *pDst, + OMX_INT step +); + + + +/** + * Function: omxVCCOMM_Copy16x16 (6.1.3.3.2) + * + * Description: + * Copies the reference 16x16 macroblock to the current macroblock. + * + * Input Arguments: + * + * pSrc - pointer to the reference macroblock in the source frame; must be + * aligned on a 16-byte boundary. + * step - distance between the starts of consecutive lines in the reference + * frame, in bytes; must be a multiple of 16 and must be larger + * than or equal to 16. + * + * Output Arguments: + * + * pDst - pointer to the destination macroblock; must be aligned on a + * 16-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned under any of the following + * conditions: + * - one or more of the following pointers is NULL: pSrc, pDst + * - one or more of the following pointers is not aligned on a 16-byte + * boundary: pSrc, pDst + * - step <16 or step is not a multiple of 16. 
+ * + */ +OMXResult omxVCCOMM_Copy16x16 ( + const OMX_U8 *pSrc, + OMX_U8 *pDst, + OMX_INT step +); + + + +/** + * Function: omxVCCOMM_ComputeTextureErrorBlock_SAD (6.1.4.1.1) + * + * Description: + * Computes texture error of the block; also returns SAD. + * + * Input Arguments: + * + * pSrc - pointer to the source plane; must be aligned on an 8-byte + * boundary. + * srcStep - step of the source plane + * pSrcRef - pointer to the reference buffer, an 8x8 block; must be aligned + * on an 8-byte boundary. + * + * Output Arguments: + * + * pDst - pointer to the destination buffer, an 8x8 block; must be aligned + * on an 8-byte boundary. + * pDstSAD - pointer to the Sum of Absolute Differences (SAD) value + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - At least one of the following + * pointers is NULL: pSrc, pSrcRef, pDst and pDstSAD. + * - pSrc is not 8-byte aligned. + * - SrcStep <= 0 or srcStep is not a multiple of 8. + * - pSrcRef is not 8-byte aligned. + * - pDst is not 8-byte aligned. + * + */ +OMXResult omxVCCOMM_ComputeTextureErrorBlock_SAD ( + const OMX_U8 *pSrc, + OMX_INT srcStep, + const OMX_U8 *pSrcRef, + OMX_S16 *pDst, + OMX_INT *pDstSAD +); + + + +/** + * Function: omxVCCOMM_ComputeTextureErrorBlock (6.1.4.1.2) + * + * Description: + * Computes the texture error of the block. + * + * Input Arguments: + * + * pSrc - pointer to the source plane. This should be aligned on an 8-byte + * boundary. + * srcStep - step of the source plane + * pSrcRef - pointer to the reference buffer, an 8x8 block. This should be + * aligned on an 8-byte boundary. + * + * Output Arguments: + * + * pDst - pointer to the destination buffer, an 8x8 block. This should be + * aligned on an 8-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments: + * - At least one of the following pointers is NULL: + * pSrc, pSrcRef, pDst. + * - pSrc is not 8-byte aligned. 
+ * - srcStep <= 0 or srcStep is not a multiple of 8. + * - pSrcRef is not 8-byte aligned. + * - pDst is not 8-byte aligned. + * + */ +OMXResult omxVCCOMM_ComputeTextureErrorBlock ( + const OMX_U8 *pSrc, + OMX_INT srcStep, + const OMX_U8 *pSrcRef, + OMX_S16 *pDst +); + + + +/** + * Function: omxVCCOMM_LimitMVToRect (6.1.4.1.3) + * + * Description: + * Limits the motion vector associated with the current block/macroblock to + * prevent the motion compensated block/macroblock from moving outside a + * bounding rectangle as shown in Figure 6-1. + * + * Input Arguments: + * + * pSrcMV - pointer to the motion vector associated with the current block + * or macroblock + * pRectVOPRef - pointer to the bounding rectangle + * Xcoord, Ycoord - coordinates of the current block or macroblock + * size - size of the current block or macroblock; must be equal to 8 or + * 16. + * + * Output Arguments: + * + * pDstMV - pointer to the limited motion vector + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the + * following conditions is true: + * - at least one of the following pointers is NULL: + * pSrcMV, pDstMV, or pRectVOPRef. + * - size is not equal to either 8 or 16. + * - the width or height of the bounding rectangle is less than + * twice the block size. + */ +OMXResult omxVCCOMM_LimitMVToRect ( + const OMXVCMotionVector *pSrcMV, + OMXVCMotionVector *pDstMV, + const OMXRect *pRectVOPRef, + OMX_INT Xcoord, + OMX_INT Ycoord, + OMX_INT size +); + + + +/** + * Function: omxVCCOMM_SAD_16x (6.1.4.1.4) + * + * Description: + * This function calculates the SAD for 16x16 and 16x8 blocks. + * + * Input Arguments: + * + * pSrcOrg - Pointer to the original block; must be aligned on a 16-byte + * boundary. 
+ * iStepOrg - Step of the original block buffer + * pSrcRef - Pointer to the reference block + * iStepRef - Step of the reference block buffer + * iHeight - Height of the block + * + * Output Arguments: + * + * pDstSAD - Pointer of result SAD + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the + * following conditions is true: + * - at least one of the following pointers is NULL: + * pSrcOrg, pDstSAD, or pSrcRef + * - pSrcOrg is not 16-byte aligned. + * - iStepOrg <= 0 or iStepOrg is not a multiple of 16 + * - iStepRef <= 0 or iStepRef is not a multiple of 16 + * - iHeight is not 8 or 16 + * + */ +OMXResult omxVCCOMM_SAD_16x ( + const OMX_U8 *pSrcOrg, + OMX_U32 iStepOrg, + const OMX_U8 *pSrcRef, + OMX_U32 iStepRef, + OMX_S32 *pDstSAD, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCCOMM_SAD_8x (6.1.4.1.5) + * + * Description: + * This function calculates the SAD for 8x16, 8x8, 8x4 blocks. + * + * Input Arguments: + * + * pSrcOrg - Pointer to the original block; must be aligned on a 8-byte + * boundary. + * iStepOrg - Step of the original block buffer + * pSrcRef - Pointer to the reference block + * iStepRef - Step of the reference block buffer + * iHeight - Height of the block + * + * Output Arguments: + * + * pDstSAD -Pointer of result SAD + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the + * following conditions is true: + * - at least one of the following pointers is NULL: + * pSrcOrg, pDstSAD, or pSrcRef + * - pSrcOrg is not 8-byte aligned. 
+ * - iStepOrg <= 0 or iStepOrg is not a multiple of 8 + * - iStepRef <= 0 or iStepRef is not a multiple of 8 + * - iHeight is not 4, 8 or 16 + * + */ +OMXResult omxVCCOMM_SAD_8x ( + const OMX_U8 *pSrcOrg, + OMX_U32 iStepOrg, + const OMX_U8 *pSrcRef, + OMX_U32 iStepRef, + OMX_S32*pDstSAD, + OMX_U32 iHeight +); + + + +/* 6.2.1.1 Direction */ +/* The direction enumerator is used with functions that perform AC/DC prediction and zig-zag scan. */ + +enum { + OMX_VC_NONE = 0, + OMX_VC_HORIZONTAL = 1, + OMX_VC_VERTICAL = 2 +}; + + + +/* 6.2.1.2 Bilinear Interpolation */ +/* The bilinear interpolation enumerator is used with motion estimation, motion compensation, and reconstruction functions. */ + +enum { + OMX_VC_INTEGER_PIXEL = 0, /* case a */ + OMX_VC_HALF_PIXEL_X = 1, /* case b */ + OMX_VC_HALF_PIXEL_Y = 2, /* case c */ + OMX_VC_HALF_PIXEL_XY = 3 /* case d */ +}; + + + +/* 6.2.1.3 Neighboring Macroblock Availability */ +/* Neighboring macroblock availability is indicated using the following flags: */ + +enum { + OMX_VC_UPPER = 1, /** above macroblock is available */ + OMX_VC_LEFT = 2, /** left macroblock is available */ + OMX_VC_CENTER = 4, + OMX_VC_RIGHT = 8, + OMX_VC_LOWER = 16, + OMX_VC_UPPER_LEFT = 32, /** above-left macroblock is available */ + OMX_VC_UPPER_RIGHT = 64, /** above-right macroblock is available */ + OMX_VC_LOWER_LEFT = 128, + OMX_VC_LOWER_RIGHT = 256 +}; + + + +/* 6.2.1.4 Video Components */ +/* A data type that enumerates video components is defined as follows: */ + +typedef enum { + OMX_VC_LUMINANCE, /** Luminance component */ + OMX_VC_CHROMINANCE /** chrominance component */ +} OMXVCM4P2VideoComponent; + + + +/* 6.2.1.5 MacroblockTypes */ +/* A data type that enumerates macroblock types is defined as follows: */ + +typedef enum { + OMX_VC_INTER = 0, /** P picture or P-VOP */ + OMX_VC_INTER_Q = 1, /** P picture or P-VOP */ + OMX_VC_INTER4V = 2, /** P picture or P-VOP */ + OMX_VC_INTRA = 3, /** I and P picture, I- and P-VOP */ + OMX_VC_INTRA_Q = 4, 
/** I and P picture, I- and P-VOP */ + OMX_VC_INTER4V_Q = 5 /** P picture or P-VOP (H.263)*/ +} OMXVCM4P2MacroblockType; + + + +/* 6.2.1.6 Coordinates */ +/* Coordinates are represented as follows: */ + +typedef struct { + OMX_INT x; + OMX_INT y; +} OMXVCM4P2Coordinate; + + + +/* 6.2.1.7 Motion Estimation Algorithms */ +/* A data type that enumerates motion estimation search methods is defined as follows: */ + +typedef enum { + OMX_VC_M4P2_FAST_SEARCH = 0, /** Fast motion search */ + OMX_VC_M4P2_FULL_SEARCH = 1 /** Full motion search */ +} OMXVCM4P2MEMode; + + + +/* 6.2.1.8 Motion Estimation Parameters */ +/* A data structure containing control parameters for + * motion estimation functions is defined as follows: + */ + +typedef struct { + OMX_INT searchEnable8x8; /** enables 8x8 search */ + OMX_INT halfPelSearchEnable; /** enables half-pel resolution */ + OMX_INT searchRange; /** search range */ + OMX_INT rndVal; /** rounding control; 0-disabled, 1-enabled*/ +} OMXVCM4P2MEParams; + + + +/* 6.2.1.9 Macroblock Information */ +/* A data structure containing macroblock parameters for + * motion estimation functions is defined as follows: + */ + +typedef struct { + OMX_S32 sliceId; /* slice number */ + OMXVCM4P2MacroblockType mbType; /* MB type: OMX_VC_INTRA, OMX_VC_INTER, or OMX_VC_INTER4V */ + OMX_S32 qp; /* quantization parameter*/ + OMX_U32 cbpy; /* CBP Luma */ + OMX_U32 cbpc; /* CBP Chroma */ + OMXVCMotionVector pMV0[2][2]; /* motion vector, represented using 1/2-pel units, + * pMV0[blocky][blockx] (blocky = 0~1, blockx =0~1) + */ + OMXVCMotionVector pMVPred[2][2]; /* motion vector prediction, represented using 1/2-pel units, + * pMVPred[blocky][blockx] (blocky = 0~1, blockx = 0~1) + */ + OMX_U8 pPredDir[2][2]; /* AC prediction direction: + * OMX_VC_NONE, OMX_VC_VERTICAL, OMX_VC_HORIZONTAL + */ +} OMXVCM4P2MBInfo, *OMXVCM4P2MBInfoPtr; + + + +/** + * Function: omxVCM4P2_FindMVpred (6.2.3.1.1) + * + * Description: + * Predicts a motion vector for the current block 
using the procedure + * specified in [ISO14496-2], subclause 7.6.5. The resulting predicted MV is + * returned in pDstMVPred. If the parameter pDstMVPredME is not NULL then + * the set of three MV candidates used for prediction is also returned, + * otherwise pDstMVPredME is NULL upon return. + * + * Input Arguments: + * + * pSrcMVCurMB - pointer to the MV buffer associated with the current Y + * macroblock; a value of NULL indicates unavailability. + * pSrcCandMV1 - pointer to the MV buffer containing the 4 MVs associated + * with the MB located to the left of the current MB; set to NULL + * if there is no MB to the left. + * pSrcCandMV2 - pointer to the MV buffer containing the 4 MVs associated + * with the MB located above the current MB; set to NULL if there + * is no MB located above the current MB. + * pSrcCandMV3 - pointer to the MV buffer containing the 4 MVs associated + * with the MB located to the right and above the current MB; set + * to NULL if there is no MB located to the above-right. + * iBlk - the index of block in the current macroblock + * pDstMVPredME - MV candidate return buffer; if set to NULL then + * prediction candidate MVs are not returned and pDstMVPredME will + * be NULL upon function return; if pDstMVPredME is non-NULL then it + * must point to a buffer containing sufficient space for three + * return MVs. + * + * Output Arguments: + * + * pDstMVPred - pointer to the predicted motion vector + * pDstMVPredME - if non-NULL upon input then pDstMVPredME points upon + * return to a buffer containing the three motion vector candidates + * used for prediction as specified in [ISO14496-2], subclause + * 7.6.5, otherwise if NULL upon input then pDstMVPredME is NULL + * upon output. 
+ * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned under any of the following + * conditions: + * - the pointer pDstMVPred is NULL + * - the parameter iBlk does not fall into the range 0 <= iBlk<=3 + * + */ +OMXResult omxVCM4P2_FindMVpred ( + const OMXVCMotionVector *pSrcMVCurMB, + const OMXVCMotionVector *pSrcCandMV1, + const OMXVCMotionVector *pSrcCandMV2, + const OMXVCMotionVector *pSrcCandMV3, + OMXVCMotionVector *pDstMVPred, + OMXVCMotionVector *pDstMVPredME, + OMX_INT iBlk +); + + + +/** + * Function: omxVCM4P2_IDCT8x8blk (6.2.3.2.1) + * + * Description: + * Computes a 2D inverse DCT for a single 8x8 block, as defined in + * [ISO14496-2]. + * + * Input Arguments: + * + * pSrc - pointer to the start of the linearly arranged IDCT input buffer; + * must be aligned on a 16-byte boundary. According to + * [ISO14496-2], the input coefficient values should lie within the + * range [-2048, 2047]. + * + * Output Arguments: + * + * pDst - pointer to the start of the linearly arranged IDCT output buffer; + * must be aligned on a 16-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments: + * - pSrc or pDst is NULL. + * - pSrc or pDst is not 16-byte aligned. + * + */ +OMXResult omxVCM4P2_IDCT8x8blk ( + const OMX_S16 *pSrc, + OMX_S16 *pDst +); + + + +/** + * Function: omxVCM4P2_MEGetBufSize (6.2.4.1.1) + * + * Description: + * Computes the size, in bytes, of the vendor-specific specification + * structure for the following motion estimation functions: + * BlockMatch_Integer_8x8, BlockMatch_Integer_16x16, and MotionEstimationMB. 
+ * + * Input Arguments: + * + * MEmode - motion estimation mode; available modes are defined by the + * enumerated type OMXVCM4P2MEMode + * pMEParams - motion estimation parameters + * + * Output Arguments: + * + * pSize - pointer to the number of bytes required for the specification + * structure + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - one or more of the following is true: + * - an invalid value was specified for the parameter MEmode + * - a negative or zero value was specified for the + * parameter pMEParams->searchRange + * + */ +OMXResult omxVCM4P2_MEGetBufSize ( + OMXVCM4P2MEMode MEmode, + const OMXVCM4P2MEParams *pMEParams, + OMX_U32 *pSize +); + + + +/** + * Function: omxVCM4P2_MEInit (6.2.4.1.2) + * + * Description: + * Initializes the vendor-specific specification structure required for the + * following motion estimation functions: BlockMatch_Integer_8x8, + * BlockMatch_Integer_16x16, and MotionEstimationMB. Memory for the + * specification structure *pMESpec must be allocated prior to calling the + * function, and should be aligned on a 4-byte boundary. Following + * initialization by this function, the vendor-specific structure *pMESpec + * should contain an implementation-specific representation of all motion + * estimation parameters received via the structure pMEParams, for example + * rndVal, searchRange, etc. The number of bytes required for the + * specification structure can be determined using the function + * omxVCM4P2_MEGetBufSize. 
+ * + * Input Arguments: + * + * MEmode - motion estimation mode; available modes are defined by the + * enumerated type OMXVCM4P2MEMode + * pMEParams - motion estimation parameters + * pMESpec - pointer to the uninitialized ME specification structure + * + * Output Arguments: + * + * pMESpec - pointer to the initialized ME specification structure + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - one or more of the following is true: + * - an invalid value was specified for the parameter MEmode + * - a negative or zero value was specified for the + * parameter pMEParams->searchRange + * + */ +OMXResult omxVCM4P2_MEInit ( + OMXVCM4P2MEMode MEmode, + const OMXVCM4P2MEParams*pMEParams, + void *pMESpec +); + + + +/** + * Function: omxVCM4P2_BlockMatch_Integer_16x16 (6.2.4.2.1) + * + * Description: + * Performs a 16x16 block search; estimates motion vector and associated + * minimum SAD. Both the input and output motion vectors are represented using + * half-pixel units, and therefore a shift left or right by 1 bit may be + * required, respectively, to match the input or output MVs with other + * functions that either generate output MVs or expect input MVs represented + * using integer pixel units. + * + * Input Arguments: + * + * pSrcRefBuf - pointer to the reference Y plane; points to the reference + * MB that corresponds to the location of the current macroblock in + * the current plane. + * refWidth - width of the reference plane + * pRefRect - pointer to the valid reference plane rectangle; coordinates + * are specified relative to the image origin. Rectangle + * boundaries may extend beyond image boundaries if the image has + * been padded. For example, if padding extends 4 pixels beyond + * frame border, then the value for the left border could be set to + * -4. 
+ * pSrcCurrBuf - pointer to the current block in the current macroblock + * buffer extracted from the original plane (linear array, 256 + * entries); must be aligned on a 16-byte boundary. The number of + * bytes between lines (step) is 16. + * pCurrPointPos - position of the current macroblock in the current plane + * pSrcPreMV - pointer to predicted motion vector; NULL indicates no + * predicted MV + * pSrcPreSAD - pointer to SAD associated with the predicted MV (referenced + * by pSrcPreMV); may be set to NULL if unavailable. + * pMESpec - vendor-specific motion estimation specification structure; + * must have been allocated and then initialized using + * omxVCM4P2_MEInit prior to calling the block matching function. + * + * Output Arguments: + * + * pDstMV - pointer to estimated MV + * pDstSAD - pointer to minimum SAD + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. Returned if one of the following + * conditions is true: + * - at least one of the following pointers is NULL: pSrcRefBuf, + * pRefRect, pSrcCurrBuf, pCurrPointPos, pDstMV, pDstSAD or + * pMESpec, or + * - pSrcCurrBuf is not 16-byte aligned + * + */ +OMXResult omxVCM4P2_BlockMatch_Integer_16x16 ( + const OMX_U8 *pSrcRefBuf, + OMX_INT refWidth, + const OMXRect *pRefRect, + const OMX_U8 *pSrcCurrBuf, + const OMXVCM4P2Coordinate *pCurrPointPos, + const OMXVCMotionVector*pSrcPreMV, + const OMX_INT *pSrcPreSAD, + void *pMESpec, + OMXVCMotionVector*pDstMV, + OMX_INT *pDstSAD +); + + + +/** + * Function: omxVCM4P2_BlockMatch_Integer_8x8 (6.2.4.2.2) + * + * Description: + * Performs an 8x8 block search; estimates motion vector and associated + * minimum SAD. 
Both the input and output motion vectors are represented + * using half-pixel units, and therefore a shift left or right by 1 bit may be + * required, respectively, to match the input or output MVs with other + * functions that either generate output MVs or expect input MVs represented + * using integer pixel units. + * + * Input Arguments: + * + * pSrcRefBuf - pointer to the reference Y plane; points to the reference + * block that corresponds to the location of the current 8x8 block + * in the current plane. + * refWidth - width of the reference plane + * pRefRect - pointer to the valid reference plane rectangle; coordinates + * are specified relative to the image origin. Rectangle + * boundaries may extend beyond image boundaries if the image has + * been padded. + * pSrcCurrBuf - pointer to the current block in the current macroblock + * buffer extracted from the original plane (linear array, 128 + * entries); must be aligned on an 8-byte boundary. The number of + * bytes between lines (step) is 16 bytes. + * pCurrPointPos - position of the current block in the current plane + * pSrcPreMV - pointer to predicted motion vector; NULL indicates no + * predicted MV + * pSrcPreSAD - pointer to SAD associated with the predicted MV (referenced + * by pSrcPreMV); may be set to NULL if unavailable. + * pMESpec - vendor-specific motion estimation specification structure; + * must have been allocated and then initialized using + * omxVCM4P2_MEInit prior to calling the block matching function. + * + * Output Arguments: + * + * pDstMV - pointer to estimated MV + * pDstSAD - pointer to minimum SAD + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. 
Returned if one of the following + * conditions is true: + * - at least one of the following pointers is NULL: pSrcRefBuf, + * pRefRect, pSrcCurrBuff, pCurrPointPos, pDstMV, pDstSAD or + * pMESpec, or + * - pSrcCurrBuf is not 8-byte aligned + * + */ +OMXResult omxVCM4P2_BlockMatch_Integer_8x8 ( + const OMX_U8 *pSrcRefBuf, + OMX_INT refWidth, + const OMXRect *pRefRect, + const OMX_U8 *pSrcCurrBuf, + const OMXVCM4P2Coordinate *pCurrPointPos, + const OMXVCMotionVector *pSrcPreMV, + const OMX_INT *pSrcPreSAD, + void *pMESpec, + OMXVCMotionVector *pDstMV, + OMX_INT *pDstSAD +); + + + +/** + * Function: omxVCM4P2_BlockMatch_Half_16x16 (6.2.4.2.3) + * + * Description: + * Performs a 16x16 block match with half-pixel resolution. Returns the + * estimated motion vector and associated minimum SAD. This function + * estimates the half-pixel motion vector by interpolating the integer + * resolution motion vector referenced by the input parameter pSrcDstMV, i.e., + * the initial integer MV is generated externally. The input parameters + * pSrcRefBuf and pSearchPointRefPos should be shifted by the winning MV of + * 16x16 integer search prior to calling BlockMatch_Half_16x16. The function + * BlockMatch_Integer_16x16 may be used for integer motion estimation. + * + * Input Arguments: + * + * pSrcRefBuf - pointer to the reference Y plane; points to the reference + * macroblock that corresponds to the location of the current + * macroblock in the current plane. + * refWidth - width of the reference plane + * pRefRect - reference plane valid region rectangle + * pSrcCurrBuf - pointer to the current block in the current macroblock + * buffer extracted from the original plane (linear array, 256 + * entries); must be aligned on a 16-byte boundary. The number of + * bytes between lines (step) is 16. 
+ * pSearchPointRefPos - position of the starting point for half pixel + * search (specified in terms of integer pixel units) in the + * reference plane, i.e., the reference position pointed to by the + * predicted motion vector. + * rndVal - rounding control parameter: 0 - disabled; 1 - enabled. + * pSrcDstMV - pointer to the initial MV estimate; typically generated + * during a prior 16x16 integer search; specified in terms of + * half-pixel units. + * + * Output Arguments: + * + * pSrcDstMV - pointer to estimated MV + * pDstSAD - pointer to minimum SAD + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. Returned if one of the following + * conditions is true: + * - at least one of the following pointers is NULL: pSrcRefBuf, + * pRefRect, pSrcCurrBuf, pSearchPointRefPos, pSrcDstMV, or + * - pSrcCurrBuf is not 16-byte aligned + * + */ +OMXResult omxVCM4P2_BlockMatch_Half_16x16 ( + const OMX_U8 *pSrcRefBuf, + OMX_INT refWidth, + const OMXRect *pRefRect, + const OMX_U8 *pSrcCurrBuf, + const OMXVCM4P2Coordinate *pSearchPointRefPos, + OMX_INT rndVal, + OMXVCMotionVector *pSrcDstMV, + OMX_INT *pDstSAD +); + + + +/** + * Function: omxVCM4P2_BlockMatch_Half_8x8 (6.2.4.2.4) + * + * Description: + * Performs an 8x8 block match with half-pixel resolution. Returns the + * estimated motion vector and associated minimum SAD. This function + * estimates the half-pixel motion vector by interpolating the integer + * resolution motion vector referenced by the input parameter pSrcDstMV, i.e., + * the initial integer MV is generated externally. The input parameters + * pSrcRefBuf and pSearchPointRefPos should be shifted by the winning MV of + * 8x8 integer search prior to calling BlockMatch_Half_8x8. The function + * BlockMatch_Integer_8x8 may be used for integer motion estimation. 
+ * + * Input Arguments: + * + * pSrcRefBuf - pointer to the reference Y plane; points to the reference + * block that corresponds to the location of the current 8x8 block + * in the current plane. + * refWidth - width of the reference plane + * pRefRect - reference plane valid region rectangle + * pSrcCurrBuf - pointer to the current block in the current macroblock + * buffer extracted from the original plane (linear array, 128 + * entries); must be aligned on a 8-byte boundary. The number of + * bytes between lines (step) is 16. + * pSearchPointRefPos - position of the starting point for half pixel + * search (specified in terms of integer pixel units) in the + * reference plane. + * rndVal - rounding control parameter: 0 - disabled; 1 - enabled. + * pSrcDstMV - pointer to the initial MV estimate; typically generated + * during a prior 8x8 integer search, specified in terms of + * half-pixel units. + * + * Output Arguments: + * + * pSrcDstMV - pointer to estimated MV + * pDstSAD - pointer to minimum SAD + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. Returned if one of the following + * conditions is true: + * - at least one of the following pointers is NULL: + * pSrcRefBuf, pRefRect, pSrcCurrBuff, pSearchPointRefPos, pSrcDstMV + * - pSrcCurrBuf is not 8-byte aligned + * + */ +OMXResult omxVCM4P2_BlockMatch_Half_8x8 ( + const OMX_U8 *pSrcRefBuf, + OMX_INT refWidth, + const OMXRect *pRefRect, + const OMX_U8 *pSrcCurrBuf, + const OMXVCM4P2Coordinate *pSearchPointRefPos, + OMX_INT rndVal, + OMXVCMotionVector *pSrcDstMV, + OMX_INT *pDstSAD +); + + + +/** + * Function: omxVCM4P2_MotionEstimationMB (6.2.4.3.1) + * + * Description: + * Performs motion search for a 16x16 macroblock. Selects best motion search + * strategy from among inter-1MV, inter-4MV, and intra modes. Supports + * integer and half pixel resolution. 
+ * + * Input Arguments: + * + * pSrcCurrBuf - pointer to the top-left corner of the current MB in the + * original picture plane; must be aligned on a 16-byte boundary. + * The function does not expect source data outside the region + * bounded by the MB to be available; for example it is not + * necessary for the caller to guarantee the availability of + * pSrcCurrBuf[-SrcCurrStep], i.e., the row of pixels above the MB + * to be processed. + * srcCurrStep - width of the original picture plane, in terms of full + * pixels; must be a multiple of 16. + * pSrcRefBuf - pointer to the reference Y plane; points to the reference + * plane location corresponding to the location of the current + * macroblock in the current plane; must be aligned on a 16-byte + * boundary. + * srcRefStep - width of the reference picture plane, in terms of full + * pixels; must be a multiple of 16. + * pRefRect - reference plane valid region rectangle, specified relative to + * the image origin + * pCurrPointPos - position of the current macroblock in the current plane + * pMESpec - pointer to the vendor-specific motion estimation specification + * structure; must be allocated and then initialized using + * omxVCM4P2_MEInit prior to calling this function. + * pMBInfo - array, of dimension four, containing pointers to information + * associated with four nearby MBs: + * - pMBInfo[0] - pointer to left MB information + * - pMBInfo[1] - pointer to top MB information + * - pMBInfo[2] - pointer to top-left MB information + * - pMBInfo[3] - pointer to top-right MB information + * Any pointer in the array may be set equal to NULL if the + * corresponding MB doesn't exist. 
For each MB, the following structure + * members are used: + * - mbType - macroblock type, either OMX_VC_INTRA, OMX_VC_INTER, or + * OMX_VC_INTER4V + * - pMV0[2][2] - estimated motion vectors; represented + * in 1/2 pixel units + * - sliceID - number of the slice to which the MB belongs + * pSrcDstMBCurr - pointer to information structure for the current MB. + * The following entries should be set prior to calling the + * function: sliceID - the number of the slice to which the + * current MB belongs. The structure elements cbpy and cbpc are + * ignored. + * + * Output Arguments: + * + * pSrcDstMBCurr - pointer to updated information structure for the current + * MB after MB-level motion estimation has been completed. The + * following structure members are updated by the ME function: + * - mbType - macroblock type: OMX_VC_INTRA, OMX_VC_INTER, or + * OMX_VC_INTER4V. + * - pMV0[2][2] - estimated motion vectors; represented in + * terms of 1/2 pel units. + * - pMVPred[2][2] - predicted motion vectors; represented + * in terms of 1/2 pel units. + * The structure members cbpy and cbpc are not updated by the function. + * pDstSAD - pointer to the minimum SAD for INTER1V, or sum of minimum SADs + * for INTER4V + * pDstBlockSAD - pointer to an array of SAD values for each of the four + * 8x8 luma blocks in the MB. The block SADs are in scan order for + * each MB. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the + * following conditions is true: + * - at least one of the following pointers is NULL: pSrcCurrBuf, + * pSrcRefBuf, pRefRect, pCurrPointPos, pMBInter, pMBIntra, + * pSrcDstMBCurr, or pDstSAD. 
+ * + */ +OMXResult omxVCM4P2_MotionEstimationMB ( + const OMX_U8 *pSrcCurrBuf, + OMX_S32 srcCurrStep, + const OMX_U8 *pSrcRefBuf, + OMX_S32 srcRefStep, + const OMXRect*pRefRect, + const OMXVCM4P2Coordinate *pCurrPointPos, + void *pMESpec, + const OMXVCM4P2MBInfoPtr *pMBInfo, + OMXVCM4P2MBInfo *pSrcDstMBCurr, + OMX_U16 *pDstSAD, + OMX_U16 *pDstBlockSAD +); + + + +/** + * Function: omxVCM4P2_DCT8x8blk (6.2.4.4.1) + * + * Description: + * Computes a 2D forward DCT for a single 8x8 block, as defined in + * [ISO14496-2]. + * + * Input Arguments: + * + * pSrc - pointer to the start of the linearly arranged input buffer; must + * be aligned on a 16-byte boundary. Input values (pixel + * intensities) are valid in the range [-255,255]. + * + * Output Arguments: + * + * pDst - pointer to the start of the linearly arranged output buffer; must + * be aligned on a 16-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments, returned if: + * - pSrc or pDst is NULL. + * - pSrc or pDst is not 16-byte aligned. + * + */ +OMXResult omxVCM4P2_DCT8x8blk ( + const OMX_S16 *pSrc, + OMX_S16 *pDst +); + + + +/** + * Function: omxVCM4P2_QuantIntra_I (6.2.4.4.2) + * + * Description: + * Performs quantization on intra block coefficients. This function supports + * bits_per_pixel == 8. + * + * Input Arguments: + * + * pSrcDst - pointer to the input intra block coefficients; must be aligned + * on a 16-byte boundary. + * QP - quantization parameter (quantizer_scale). + * blockIndex - block index indicating the component type and position, + * valid in the range 0 to 5, as defined in [ISO14496-2], subclause + * 6.1.3.8. + * shortVideoHeader - binary flag indicating presence of + * short_video_header; shortVideoHeader==1 selects linear intra DC + * mode, and shortVideoHeader==0 selects non linear intra DC mode. + * + * Output Arguments: + * + * pSrcDst - pointer to the output (quantized) interblock coefficients. 
+ * When shortVideoHeader==1, AC coefficients are saturated on the + * interval [-127, 127], and DC coefficients are saturated on the + * interval [1, 254]. When shortVideoHeader==0, AC coefficients + * are saturated on the interval [-2047, 2047]. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments: + * - pSrcDst is NULL. + * - blockIndex < 0 or blockIndex >= 10 + * - QP <= 0 or QP >= 32. + * + */ +OMXResult omxVCM4P2_QuantIntra_I ( + OMX_S16 *pSrcDst, + OMX_U8 QP, + OMX_INT blockIndex, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_QuantInter_I (6.2.4.4.3) + * + * Description: + * Performs quantization on an inter coefficient block; supports + * bits_per_pixel == 8. + * + * Input Arguments: + * + * pSrcDst - pointer to the input inter block coefficients; must be aligned + * on a 16-byte boundary. + * QP - quantization parameter (quantizer_scale) + * shortVideoHeader - binary flag indicating presence of short_video_header; + * shortVideoHeader==1 selects linear intra DC mode, and + * shortVideoHeader==0 selects non linear intra DC mode. + * + * Output Arguments: + * + * pSrcDst - pointer to the output (quantized) interblock coefficients. + * When shortVideoHeader==1, AC coefficients are saturated on the + * interval [-127, 127], and DC coefficients are saturated on the + * interval [1, 254]. When shortVideoHeader==0, AC coefficients + * are saturated on the interval [-2047, 2047]. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments: + * - pSrcDst is NULL. + * - QP <= 0 or QP >= 32. + * + */ +OMXResult omxVCM4P2_QuantInter_I ( + OMX_S16 *pSrcDst, + OMX_U8 QP, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_TransRecBlockCoef_intra (6.2.4.4.4) + * + * Description: + * Quantizes the DCT coefficients, implements intra block AC/DC coefficient + * prediction, and reconstructs the current intra block texture for prediction + * on the next frame. 
Quantized row and column coefficients are returned in + * the updated coefficient buffers. + * + * Input Arguments: + * + * pSrc - pointer to the pixels of current intra block; must be aligned on + * an 8-byte boundary. + * pPredBufRow - pointer to the coefficient row buffer containing + * ((num_mb_per_row * 2 + 1) * 8) elements of type OMX_S16. + * Coefficients are organized into blocks of eight as described + * below (Internal Prediction Coefficient Update Procedures). The + * DC coefficient is first, and the remaining buffer locations + * contain the quantized AC coefficients. Each group of eight row + * buffer elements combined with one element eight elements ahead + * contains the coefficient predictors of the neighboring block + * that is spatially above or to the left of the block currently to + * be decoded. A negative-valued DC coefficient indicates that this + * neighboring block is not INTRA-coded or out of bounds, and + * therefore the AC and DC coefficients are invalid. Pointer must + * be aligned on an 8-byte boundary. + * pPredBufCol - pointer to the prediction coefficient column buffer + * containing 16 elements of type OMX_S16. Coefficients are + * organized as described in section 6.2.2.5. Pointer must be + * aligned on an 8-byte boundary. + * pSumErr - pointer to a flag indicating whether or not AC prediction is + * required; AC prediction is enabled if *pSumErr >=0, but the + * value is not used for coefficient prediction, i.e., the sum of + * absolute differences starts from 0 for each call to this + * function. Otherwise AC prediction is disabled if *pSumErr < 0 . + * blockIndex - block index indicating the component type and position, as + * defined in [ISO14496-2], subclause 6.1.3.8. 
+ * curQp - quantization parameter of the macroblock to which the current + * block belongs + * pQpBuf - pointer to a 2-element quantization parameter buffer; pQpBuf[0] + * contains the quantization parameter associated with the 8x8 + * block left of the current block (QPa), and pQpBuf[1] contains + * the quantization parameter associated with the 8x8 block above + * the current block (QPc). In the event that the corresponding + * block is outside of the VOP bound, the Qp value will not affect + * the intra prediction process, as described in [ISO14496-2], + * sub-clause 7.4.3.3, Adaptive AC Coefficient Prediction. + * srcStep - width of the source buffer; must be a multiple of 8. + * dstStep - width of the reconstructed destination buffer; must be a + * multiple of 16. + * shortVideoHeader - binary flag indicating presence of + * short_video_header; shortVideoHeader==1 selects linear intra DC + * mode, and shortVideoHeader==0 selects non linear intra DC mode. + * + * Output Arguments: + * + * pDst - pointer to the quantized DCT coefficient buffer; pDst[0] contains + * the predicted DC coefficient; the remaining entries contain the + * quantized AC coefficients (without prediction). The pointer + * pDst must be aligned on a 16-byte boundary. + * pRec - pointer to the reconstructed texture; must be aligned on an + * 8-byte boundary. + * pPredBufRow - pointer to the updated coefficient row buffer + * pPredBufCol - pointer to the updated coefficient column buffer + * pPreACPredict - if prediction is enabled, the parameter points to the + * start of the buffer containing the coefficient differences for + * VLC encoding. The entry pPreACPredict[0] indicates prediction + * direction for the current block and takes one of the following + * values: OMX_VC_NONE (prediction disabled), OMX_VC_HORIZONTAL, or + * OMX_VC_VERTICAL. The entries + * pPreACPredict[1]-pPreACPredict[7] contain predicted AC + * coefficients. 
If prediction is disabled (*pSumErr<0) then the + * contents of this buffer are undefined upon return from the + * function + * pSumErr - pointer to the value of the accumulated AC coefficient errors, + * i.e., sum of the absolute differences between predicted and + * unpredicted AC coefficients + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - Bad arguments: + * - At least one of the following pointers is NULL: pSrc, pDst, pRec, + * pPredBufRow, pPredBufCol, pQpBuf, pPreACPredict, pSumErr. + * - blockIndex < 0 or blockIndex >= 10. + * - curQp <= 0 or curQp >= 32. + * - srcStep or dstStep <= 0 or not a multiple of 8. + * - pDst is not 16-byte aligned. + * - At least one of the following pointers is not 8-byte aligned: + * pSrc, pRec. + * + * Note: The coefficient buffers must be updated in accordance with the + * update procedures defined in section 6.2.2. + * + */ +OMXResult omxVCM4P2_TransRecBlockCoef_intra ( + const OMX_U8 *pSrc, + OMX_S16 *pDst, + OMX_U8 *pRec, + OMX_S16 *pPredBufRow, + OMX_S16 *pPredBufCol, + OMX_S16 *pPreACPredict, + OMX_INT *pSumErr, + OMX_INT blockIndex, + OMX_U8 curQp, + const OMX_U8 *pQpBuf, + OMX_INT srcStep, + OMX_INT dstStep, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_TransRecBlockCoef_inter (6.2.4.4.5) + * + * Description: + * Implements DCT, and quantizes the DCT coefficients of the inter block + * while reconstructing the texture residual. There is no boundary check for + * the bit stream buffer. + * + * Input Arguments: + * + * pSrc - pointer to the residuals to be encoded; must be aligned on a + * 16-byte boundary. + * QP - quantization parameter. + * shortVideoHeader - binary flag indicating presence of short_video_header; + * shortVideoHeader==1 selects linear intra DC mode, and + * shortVideoHeader==0 selects non linear intra DC mode. + * + * Output Arguments: + * + * pDst - pointer to the quantized DCT coefficients buffer; must be aligned + * on a 16-byte boundary. 
+ * pRec - pointer to the reconstructed texture residuals; must be aligned + * on a 16-byte boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments: + * - At least one of the following pointers is either NULL or + * not 16-byte aligned: + * - pSrc + * - pDst + * - pRec + * - QP <= 0 or QP >= 32. + * + */ +OMXResult omxVCM4P2_TransRecBlockCoef_inter ( + const OMX_S16 *pSrc, + OMX_S16 *pDst, + OMX_S16 *pRec, + OMX_U8 QP, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_EncodeVLCZigzag_IntraDCVLC (6.2.4.5.2) + * + * Description: + * Performs zigzag scan and VLC encoding of AC and DC coefficients for one + * intra block. Two versions of the function (DCVLC and ACVLC) are provided + * in order to support the two different methods of processing DC + * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, "Intra DC + * Coefficient Decoding for the Case of Switched VLC Encoding". + * + * Input Arguments: + * + * ppBitStream - double pointer to the current byte in the bitstream + * pBitOffset - pointer to the bit position in the byte pointed by + * *ppBitStream. Valid within 0 to 7. + * pQDctBlkCoef - pointer to the quantized DCT coefficient + * predDir - AC prediction direction, which is used to decide the zigzag + * scan pattern; takes one of the following values: + * - OMX_VC_NONE - AC prediction not used. + * Performs classical zigzag scan. + * - OMX_VC_HORIZONTAL - Horizontal prediction. + * Performs alternate-vertical zigzag scan. + * - OMX_VC_VERTICAL - Vertical prediction. + * Performs alternate-horizontal zigzag scan. + * pattern - block pattern which is used to decide whether this block is + * encoded + * shortVideoHeader - binary flag indicating presence of + * short_video_header; escape modes 0-3 are used if + * shortVideoHeader==0, and escape mode 4 is used when + * shortVideoHeader==1. 
+ * videoComp - video component type (luminance, chrominance) of the current + * block + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is encoded, so + * that it points to the current byte in the bit stream buffer. + * pBitOffset - *pBitOffset is updated so that it points to the current bit + * position in the byte pointed by *ppBitStream. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - Bad arguments: + * - At least one of the following pointers is NULL: ppBitStream, + * *ppBitStream, pBitOffset, pQDctBlkCoef. + * - *pBitOffset < 0, or *pBitOffset >7. + * - PredDir is not one of: OMX_VC_NONE, OMX_VC_HORIZONTAL, or + * OMX_VC_VERTICAL. + * - VideoComp is not one component of enum OMXVCM4P2VideoComponent. + * + */ +OMXResult omxVCM4P2_EncodeVLCZigzag_IntraDCVLC ( + OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + const OMX_S16 *pQDctBlkCoef, + OMX_U8 predDir, + OMX_U8 pattern, + OMX_INT shortVideoHeader, + OMXVCM4P2VideoComponent videoComp +); + + + +/** + * Function: omxVCM4P2_EncodeVLCZigzag_IntraACVLC (6.2.4.5.2) + * + * Description: + * Performs zigzag scan and VLC encoding of AC and DC coefficients for one + * intra block. Two versions of the function (DCVLC and ACVLC) are provided + * in order to support the two different methods of processing DC + * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, Intra DC + * Coefficient Decoding for the Case of Switched VLC Encoding. + * + * Input Arguments: + * + * ppBitStream - double pointer to the current byte in the bitstream + * pBitOffset - pointer to the bit position in the byte pointed by + * *ppBitStream. Valid within 0 to 7. + * pQDctBlkCoef - pointer to the quantized DCT coefficient + * predDir - AC prediction direction, which is used to decide the zigzag + * scan pattern; takes one of the following values: + * - OMX_VC_NONE - AC prediction not used. + * Performs classical zigzag scan. 
+ * - OMX_VC_HORIZONTAL - Horizontal prediction. + * Performs alternate-vertical zigzag scan. + * - OMX_VC_VERTICAL - Vertical prediction. + * Performs alternate-horizontal zigzag scan. + * pattern - block pattern which is used to decide whether this block is + * encoded + * shortVideoHeader - binary flag indicating presence of + * short_video_header; escape modes 0-3 are used if + * shortVideoHeader==0, and escape mode 4 is used when + * shortVideoHeader==1. + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is encoded, so + * that it points to the current byte in the bit stream buffer. + * pBitOffset - *pBitOffset is updated so that it points to the current bit + * position in the byte pointed by *ppBitStream. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - Bad arguments: + * - At least one of the following pointers is NULL: ppBitStream, + * *ppBitStream, pBitOffset, pQDctBlkCoef. + * - *pBitOffset < 0, or *pBitOffset >7. + * - PredDir is not one of: OMX_VC_NONE, OMX_VC_HORIZONTAL, or + * OMX_VC_VERTICAL. + * - VideoComp is not one component of enum OMXVCM4P2VideoComponent. + * + */ +OMXResult omxVCM4P2_EncodeVLCZigzag_IntraACVLC ( + OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + const OMX_S16 *pQDctBlkCoef, + OMX_U8 predDir, + OMX_U8 pattern, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_EncodeVLCZigzag_Inter (6.2.4.5.3) + * + * Description: + * Performs classical zigzag scanning and VLC encoding for one inter block. + * + * Input Arguments: + * + * ppBitStream - pointer to the pointer to the current byte in the bit + * stream + * pBitOffset - pointer to the bit position in the byte pointed by + * *ppBitStream. 
Valid within 0 to 7. + * pQDctBlkCoef - pointer to the quantized DCT coefficient + * pattern - block pattern which is used to decide whether this block is + * encoded + * shortVideoHeader - binary flag indicating presence of + * short_video_header; escape modes 0-3 are used if + * shortVideoHeader==0, and escape mode 4 is used when + * shortVideoHeader==1. + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is encoded so that + * it points to the current byte in the bit stream buffer. + * pBitOffset - *pBitOffset is updated so that it points to the current bit + * position in the byte pointed by *ppBitStream. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - Bad arguments: + * - At least one of the following pointers is NULL: ppBitStream, + * *ppBitStream, pBitOffset, pQDctBlkCoef. + * - *pBitOffset < 0, or *pBitOffset >7. + * + */ +OMXResult omxVCM4P2_EncodeVLCZigzag_Inter ( + OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + const OMX_S16 *pQDctBlkCoef, + OMX_U8 pattern, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_EncodeMV (6.2.4.5.4) + * + * Description: + * Predicts a motion vector for the current macroblock, encodes the + * difference, and writes the output to the stream buffer. The input MVs + * pMVCurMB, pSrcMVLeftMB, pSrcMVUpperMB, and pSrcMVUpperRightMB should lie + * within the ranges associated with the input parameter fcodeForward, as + * described in [ISO14496-2], subclause 7.6.3. This function provides a + * superset of the functionality associated with the function + * omxVCM4P2_FindMVpred. + * + * Input Arguments: + * + * ppBitStream - double pointer to the current byte in the bitstream buffer + * pBitOffset - index of the first free (next available) bit in the stream + * buffer referenced by *ppBitStream, valid in the range 0 to 7. + * pMVCurMB - pointer to the current macroblock motion vector; a value of + * NULL indicates unavailability. 
+ * pSrcMVLeftMB - pointer to the source left macroblock motion vector; a + * value of NULL indicates unavailability. + * pSrcMVUpperMB - pointer to source upper macroblock motion vector; a + * value of NULL indicates unavailability. + * pSrcMVUpperRightMB - pointer to source upper right MB motion vector; a + * value of NULL indicates unavailability. + * fcodeForward - an integer with values from 1 to 7; used in encoding + * motion vectors related to search range, as described in + * [ISO14496-2], subclause 7.6.3. + * MBType - macroblock type, valid in the range 0 to 5 + * + * Output Arguments: + * + * ppBitStream - updated pointer to the current byte in the bit stream + * buffer + * pBitOffset - updated index of the next available bit position in stream + * buffer referenced by *ppBitStream + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - At least one of the following pointers is NULL: ppBitStream, + * *ppBitStream, pBitOffset, pMVCurMB + * - *pBitOffset < 0, or *pBitOffset >7. + * - fcodeForward <= 0, or fcodeForward > 7, or MBType < 0. + * + */ +OMXResult omxVCM4P2_EncodeMV ( + OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + const OMXVCMotionVector *pMVCurMB, + const OMXVCMotionVector*pSrcMVLeftMB, + const OMXVCMotionVector *pSrcMVUpperMB, + const OMXVCMotionVector *pSrcMVUpperRightMB, + OMX_INT fcodeForward, + OMXVCM4P2MacroblockType MBType +); + + + +/** + * Function: omxVCM4P2_DecodePadMV_PVOP (6.2.5.1.1) + * + * Description: + * Decodes and pads the four motion vectors associated with a non-intra P-VOP + * macroblock. For macroblocks of type OMX_VC_INTER4V, the output MV is + * padded as specified in [ISO14496-2], subclause 7.6.1.6. Otherwise, for + * macroblocks of types other than OMX_VC_INTER4V, the decoded MV is copied to + * all four output MV buffer entries. 
+ * + * Input Arguments: + * + * ppBitStream - pointer to the pointer to the current byte in the bit + * stream buffer + * pBitOffset - pointer to the bit position in the byte pointed to by + * *ppBitStream. *pBitOffset is valid within [0-7]. + * pSrcMVLeftMB, pSrcMVUpperMB, and pSrcMVUpperRightMB - pointers to the + * motion vector buffers of the macroblocks spatially at the left, + * upper, and upper-right side of the current macroblock, + * respectively; a value of NULL indicates unavailability. Note: + * Any neighborhood macroblock outside the current VOP or video + * packet or outside the current GOB (when short_video_header is + * 1) for which gob_header_empty is 0 is treated as + * transparent, according to [ISO14496-2], subclause 7.6.5. + * fcodeForward - a code equal to vop_fcode_forward in MPEG-4 bit stream + * syntax + * MBType - the type of the current macroblock. If MBType is not equal to + * OMX_VC_INTER4V, the destination motion vector buffer is still + * filled with the same decoded vector. + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is decoded, so + * that it points to the current byte in the bit stream buffer + * pBitOffset - *pBitOffset is updated so that it points to the current bit + * position in the byte pointed to by *ppBitStream + * pDstMVCurMB - pointer to the motion vector buffer for the current + * macroblock; contains four decoded motion vectors + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments: + * - At least one of the following pointers is NULL: + * ppBitStream, *ppBitStream, pBitOffset, pDstMVCurMB + * - *pBitOffset exceeds [0,7] + * - fcodeForward exceeds (0,7] + * - MBType less than zero + * - motion vector buffer is not 4-byte aligned. 
+ * OMX_Sts_Err - status error + * + */ +OMXResult omxVCM4P2_DecodePadMV_PVOP ( + const OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + OMXVCMotionVector *pSrcMVLeftMB, + OMXVCMotionVector*pSrcMVUpperMB, + OMXVCMotionVector *pSrcMVUpperRightMB, + OMXVCMotionVector*pDstMVCurMB, + OMX_INT fcodeForward, + OMXVCM4P2MacroblockType MBType +); + + + +/** + * Function: omxVCM4P2_DecodeVLCZigzag_IntraDCVLC (6.2.5.2.2) + * + * Description: + * Performs VLC decoding and inverse zigzag scan of AC and DC coefficients + * for one intra block. Two versions of the function (DCVLC and ACVLC) are + * provided in order to support the two different methods of processing DC + * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, Intra DC + * Coefficient Decoding for the Case of Switched VLC Encoding. + * + * Input Arguments: + * + * ppBitStream - pointer to the pointer to the current byte in the + * bitstream buffer + * pBitOffset - pointer to the bit position in the current byte referenced + * by *ppBitStream. The parameter *pBitOffset is valid in the + * range [0-7]. + * Bit Position in one byte: |Most Least| + * *pBitOffset |0 1 2 3 4 5 6 7| + * predDir - AC prediction direction; used to select the zigzag scan + * pattern; takes one of the following values: + * - OMX_VC_NONE - AC prediction not used; + * performs classical zigzag scan. + * - OMX_VC_HORIZONTAL - Horizontal prediction; + * performs alternate-vertical zigzag scan; + * - OMX_VC_VERTICAL - Vertical prediction; + * performs alternate-horizontal zigzag scan. + * shortVideoHeader - binary flag indicating presence of + * short_video_header; escape modes 0-3 are used if + * shortVideoHeader==0, and escape mode 4 is used when + * shortVideoHeader==1. 
+ * videoComp - video component type (luminance or chrominance) of the + * current block + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is decoded such + * that it points to the current byte in the bit stream buffer + * pBitOffset - *pBitOffset is updated such that it points to the current + * bit position in the byte pointed to by *ppBitStream + * pDst - pointer to the coefficient buffer of current block; must be + * 4-byte aligned. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments, if: + * - At least one of the following pointers is NULL: + * ppBitStream, *ppBitStream, pBitOffset, pDst + * - *pBitOffset exceeds [0,7] + * - predDir exceeds [0,2] + * - pDst is not 4-byte aligned + * OMX_Sts_Err - if: + * - In DecodeVLCZigzag_IntraDCVLC, dc_size > 12 + * - At least one of the mark bits equals zero + * - Illegal stream encountered; code cannot be located in VLC table + * - Forbidden code encountered in the VLC FLC table. + * - The number of coefficients is greater than 64 + * + */ +OMXResult omxVCM4P2_DecodeVLCZigzag_IntraDCVLC ( + const OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + OMX_S16 *pDst, + OMX_U8 predDir, + OMX_INT shortVideoHeader, + OMXVCM4P2VideoComponent videoComp +); + + + +/** + * Function: omxVCM4P2_DecodeVLCZigzag_IntraACVLC (6.2.5.2.2) + * + * Description: + * Performs VLC decoding and inverse zigzag scan of AC and DC coefficients + * for one intra block. Two versions of the function (DCVLC and ACVLC) are + * provided in order to support the two different methods of processing DC + * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, Intra DC + * Coefficient Decoding for the Case of Switched VLC Encoding. + * + * Input Arguments: + * + * ppBitStream - pointer to the pointer to the current byte in the + * bitstream buffer + * pBitOffset - pointer to the bit position in the current byte referenced + * by *ppBitStream. 
The parameter *pBitOffset is valid in the + * range [0-7]. Bit Position in one byte: |Most Least| *pBitOffset + * |0 1 2 3 4 5 6 7| + * predDir - AC prediction direction; used to select the zigzag scan + * pattern; takes one of the following values: OMX_VC_NONE - AC + * prediction not used; performs classical zigzag scan. + * OMX_VC_HORIZONTAL - Horizontal prediction; performs + * alternate-vertical zigzag scan; OMX_VC_VERTICAL - Vertical + * prediction; performs alternate-horizontal zigzag scan. + * shortVideoHeader - binary flag indicating presence of + * short_video_header; escape modes 0-3 are used if + * shortVideoHeader==0, and escape mode 4 is used when + * shortVideoHeader==1. + * videoComp - video component type (luminance or chrominance) of the + * current block + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is decoded such + * that it points to the current byte in the bit stream buffer + * pBitOffset - *pBitOffset is updated such that it points to the current + * bit position in the byte pointed by *ppBitStream + * pDst - pointer to the coefficient buffer of current block; must be + * 4-byte aligned. 
+ * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments: at least one of the following + * pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst; + * or at least one of the following conditions is true: + * *pBitOffset exceeds [0,7], predDir exceeds [0,2], or pDst is + * not 4-byte aligned + * OMX_Sts_Err - In DecodeVLCZigzag_IntraDCVLC, dc_size > 12; at least one of + * the mark bits equals zero; illegal stream encountered (code cannot + * be located in VLC table); forbidden code encountered in the VLC + * FLC table; the number of coefficients is greater than 64 + * + */ +OMXResult omxVCM4P2_DecodeVLCZigzag_IntraACVLC ( + const OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + OMX_S16 *pDst, + OMX_U8 predDir, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_DecodeVLCZigzag_Inter (6.2.5.2.3) + * + * Description: + * Performs VLC decoding and inverse zigzag scan for one inter-coded block. + * + * Input Arguments: + * + * ppBitStream - double pointer to the current byte in the stream buffer + * pBitOffset - pointer to the next available bit in the current stream + * byte referenced by *ppBitStream. The parameter *pBitOffset is + * valid within the range [0-7]. + * shortVideoHeader - binary flag indicating presence of + * short_video_header; escape modes 0-3 are used if + * shortVideoHeader==0, and escape mode 4 is used when + * shortVideoHeader==1. + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is decoded such + * that it points to the current byte in the stream buffer + * pBitOffset - *pBitOffset is updated after decoding such that it points + * to the next available bit in the stream byte referenced by + * *ppBitStream + * pDst - pointer to the coefficient buffer of current block; must be + * 4-byte aligned. 
+ * + * Return Value: + * + * OMX_Sts_BadArgErr - bad arguments: + * - At least one of the following pointers is NULL: + * ppBitStream, *ppBitStream, pBitOffset, pDst + * - pDst is not 4-byte aligned + * - *pBitOffset exceeds [0,7] + * OMX_Sts_Err - status error, if: + * - At least one mark bit is equal to zero + * - Encountered an illegal stream code that cannot be found in the VLC table + * - Encountered an illegal code in the VLC FLC table + * - The number of coefficients is greater than 64 + * + */ +OMXResult omxVCM4P2_DecodeVLCZigzag_Inter ( + const OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + OMX_S16 *pDst, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_QuantInvIntra_I (6.2.5.3.2) + * + * Description: + * Performs the second inverse quantization mode on an intra/inter coded + * block. Supports bits_per_pixel = 8. The output coefficients are clipped to + * the range [-2048, 2047]. + * + * Input Arguments: + * + * pSrcDst - pointer to the input (quantized) intra/inter block; must be + * aligned on a 16-byte boundary. + * QP - quantization parameter (quantizer_scale) + * videoComp - video component type of the current block. Takes one of the + * following flags: OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE (intra + * version only). + * shortVideoHeader - binary flag indicating presence of short_video_header + * (intra version only). + * + * Output Arguments: + * + * pSrcDst - pointer to the output (dequantized) intra/inter block + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; one or more of the following is + * true: + * - pSrcDst is NULL + * - QP <= 0 or QP >=31 + * - videoComp is neither OMX_VC_LUMINANCE nor OMX_VC_CHROMINANCE. 
+ * + */ +OMXResult omxVCM4P2_QuantInvIntra_I ( + OMX_S16 *pSrcDst, + OMX_INT QP, + OMXVCM4P2VideoComponent videoComp, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_QuantInvInter_I (6.2.5.3.2) + * + * Description: + * Performs the second inverse quantization mode on an intra/inter coded + * block. Supports bits_per_pixel = 8. The output coefficients are clipped to + * the range [-2048, 2047]. + * + * Input Arguments: + * + * pSrcDst - pointer to the input (quantized) intra/inter block; must be + * aligned on a 16-byte boundary. + * QP - quantization parameter (quantizer_scale) + * videoComp - video component type of the current block. Takes one of the + * following flags: OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE (intra + * version only). + * shortVideoHeader - binary flag indicating presence of short_video_header + * (intra version only). + * + * Output Arguments: + * + * pSrcDst - pointer to the output (dequantized) intra/inter block + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; one or more of the following is + * true: + * - pSrcDst is NULL + * - QP <= 0 or QP >=31 + * - videoComp is neither OMX_VC_LUMINANCE nor OMX_VC_CHROMINANCE. + * + */ +OMXResult omxVCM4P2_QuantInvInter_I ( + OMX_S16 *pSrcDst, + OMX_INT QP +); + + + +/** + * Function: omxVCM4P2_DecodeBlockCoef_Intra (6.2.5.4.1) + * + * Description: + * Decodes the INTRA block coefficients. Inverse quantization, inversely + * zigzag positioning, and IDCT, with appropriate clipping on each step, are + * performed on the coefficients. The results are then placed in the output + * frame/plane on a pixel basis. Note: This function will be used only when + * at least one non-zero AC coefficient of current block exists in the bit + * stream. The DC only condition will be handled in another function. + * + * + * Input Arguments: + * + * ppBitStream - pointer to the pointer to the current byte in the bit + * stream buffer. 
There is no boundary check for the bit stream + * buffer. + * pBitOffset - pointer to the bit position in the byte pointed to by + * *ppBitStream. *pBitOffset is valid within [0-7]. + * step - width of the destination plane + * pCoefBufRow - pointer to the coefficient row buffer; must be aligned on + * an 8-byte boundary. + * pCoefBufCol - pointer to the coefficient column buffer; must be aligned + * on an 8-byte boundary. + * curQP - quantization parameter of the macroblock which the current block + * belongs to + * pQPBuf - pointer to the quantization parameter buffer + * blockIndex - block index indicating the component type and position as + * defined in [ISO14496-2], subclause 6.1.3.8, Figure 6-5. + * intraDCVLC - a code determined by intra_dc_vlc_thr and QP. This allows a + * mechanism to switch between two VLC for coding of Intra DC + * coefficients as per [ISO14496-2], Table 6-21. + * ACPredFlag - a flag equal to ac_pred_flag (of luminance) indicating if + * the ac coefficients of the first row or first column are + * differentially coded for intra coded macroblock. + * shortVideoHeader - binary flag indicating presence of + * short_video_header; shortVideoHeader==1 selects linear intra DC + * mode, and shortVideoHeader==0 selects non linear intra DC mode. + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is decoded, so + * that it points to the current byte in the bit stream buffer + * pBitOffset - *pBitOffset is updated so that it points to the current bit + * position in the byte pointed by *ppBitStream + * pDst - pointer to the block in the destination plane; must be aligned on + * an 8-byte boundary. + * pCoefBufRow - pointer to the updated coefficient row buffer. + * pCoefBufCol - pointer to the updated coefficient column buffer Note: + * The coefficient buffers must be updated in accordance with the + * update procedure defined in section 6.2.2. 
+ * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments, if: + * - At least one of the following pointers is NULL: + * ppBitStream, *ppBitStream, pBitOffset, pCoefBufRow, pCoefBufCol, + * pQPBuf, pDst. + * - *pBitOffset exceeds [0,7] + * - curQP exceeds (1, 31) + * - blockIndex exceeds [0,5] + * - step is not the multiple of 8 + * - a pointer alignment requirement was violated. + * OMX_Sts_Err - status error. Refer to OMX_Sts_Err of DecodeVLCZigzag_Intra. + * + */ +OMXResult omxVCM4P2_DecodeBlockCoef_Intra ( + const OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + OMX_U8 *pDst, + OMX_INT step, + OMX_S16 *pCoefBufRow, + OMX_S16 *pCoefBufCol, + OMX_U8 curQP, + const OMX_U8 *pQPBuf, + OMX_INT blockIndex, + OMX_INT intraDCVLC, + OMX_INT ACPredFlag, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_DecodeBlockCoef_Inter (6.2.5.4.2) + * + * Description: + * Decodes the INTER block coefficients. This function performs inverse + * quantization, inverse zigzag positioning, and IDCT (with appropriate + * clipping on each step) on the coefficients. The results (residuals) are + * placed in a contiguous array of 64 elements. For INTER block, the output + * buffer holds the residuals for further reconstruction. + * + * Input Arguments: + * + * ppBitStream - pointer to the pointer to the current byte in the bit + * stream buffer. There is no boundary check for the bit stream + * buffer. + * pBitOffset - pointer to the bit position in the byte pointed to by + * *ppBitStream. *pBitOffset is valid within [0-7] + * QP - quantization parameter + * shortVideoHeader - binary flag indicating presence of + * short_video_header; shortVideoHeader==1 selects linear intra DC + * mode, and shortVideoHeader==0 selects non linear intra DC mode. 
+ * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after the block is decoded, so + * that it points to the current byte in the bit stream buffer + * pBitOffset - *pBitOffset is updated so that it points to the current bit + * position in the byte pointed by *ppBitStream + * pDst - pointer to the decoded residual buffer (a contiguous array of 64 + * elements of OMX_S16 data type); must be aligned on a 16-byte + * boundary. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments, if: + * - At least one of the following pointers is Null: + * ppBitStream, *ppBitStream, pBitOffset , pDst + * - *pBitOffset exceeds [0,7] + * - QP <= 0. + * - pDst is not 16-byte aligned + * OMX_Sts_Err - status error. Refer to OMX_Sts_Err of DecodeVLCZigzag_Inter . + * + */ +OMXResult omxVCM4P2_DecodeBlockCoef_Inter ( + const OMX_U8 **ppBitStream, + OMX_INT *pBitOffset, + OMX_S16 *pDst, + OMX_INT QP, + OMX_INT shortVideoHeader +); + + + +/** + * Function: omxVCM4P2_PredictReconCoefIntra (6.2.5.4.3) + * + * Description: + * Performs adaptive DC/AC coefficient prediction for an intra block. Prior + * to the function call, prediction direction (predDir) should be selected as + * specified in [ISO14496-2], subclause 7.4.3.1. + * + * Input Arguments: + * + * pSrcDst - pointer to the coefficient buffer which contains the quantized + * coefficient residuals (PQF) of the current block; must be + * aligned on a 4-byte boundary. The output coefficients are + * saturated to the range [-2048, 2047]. + * pPredBufRow - pointer to the coefficient row buffer; must be aligned on + * a 4-byte boundary. + * pPredBufCol - pointer to the coefficient column buffer; must be aligned + * on a 4-byte boundary. + * curQP - quantization parameter of the current block. curQP may equal to + * predQP especially when the current block and the predictor block + * are in the same macroblock. 
+ * predQP - quantization parameter of the predictor block + * predDir - indicates the prediction direction which takes one of the + * following values: OMX_VC_HORIZONTAL - predict horizontally + * OMX_VC_VERTICAL - predict vertically + * ACPredFlag - a flag indicating if AC prediction should be performed. It + * is equal to ac_pred_flag in the bit stream syntax of MPEG-4 + * videoComp - video component type (luminance or chrominance) of the + * current block + * + * Output Arguments: + * + * pSrcDst - pointer to the coefficient buffer which contains the quantized + * coefficients (QF) of the current block + * pPredBufRow - pointer to the updated coefficient row buffer + * pPredBufCol - pointer to the updated coefficient column buffer Note: + * Buffer update: Update the AC prediction buffer (both row and + * column buffer). + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments, if: + * - At least one of the pointers is NULL: + * pSrcDst, pPredBufRow, or pPredBufCol. + * - curQP <= 0, + * - predQP <= 0, + * - curQP >31, + * - predQP > 31, + * - predDir exceeds [1,2] + * - pSrcDst, pPredBufRow, or pPredBufCol is not 4-byte aligned. + * + */ +OMXResult omxVCM4P2_PredictReconCoefIntra ( + OMX_S16 *pSrcDst, + OMX_S16 *pPredBufRow, + OMX_S16 *pPredBufCol, + OMX_INT curQP, + OMX_INT predQP, + OMX_INT predDir, + OMX_INT ACPredFlag, + OMXVCM4P2VideoComponent videoComp +); + + + +/** + * Function: omxVCM4P2_MCReconBlock (6.2.5.5.1) + * + * Description: + * Performs motion compensation prediction for an 8x8 block using + * interpolation described in [ISO14496-2], subclause 7.6.2. + * + * Input Arguments: + * + * pSrc - pointer to the block in the reference plane. + * srcStep - distance between the start of consecutive lines in the + * reference plane, in bytes; must be a multiple of 8. + * dstStep - distance between the start of consecutive lines in the + * destination plane, in bytes; must be a multiple of 8. 
+ * pSrcResidue - pointer to a buffer containing the 16-bit prediction + * residuals; must be 16-byte aligned. If the pointer is NULL, then + * no prediction is done, only motion compensation, i.e., the block + * is moved with interpolation. + * predictType - bilinear interpolation type, as defined in section + * 6.2.1.2. + * rndVal - rounding control parameter: 0 - disabled; 1 - enabled. + * + * Output Arguments: + * + * pDst - pointer to the destination buffer; must be 8-byte aligned. If + * prediction residuals are added then output intensities are + * clipped to the range [0,255]. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned under any of the following + * conditions: + * - pDst is not 8-byte aligned. + * - pSrcResidue is not 16-byte aligned. + * - one or more of the following pointers is NULL: pSrc or pDst. + * - either srcStep or dstStep is not a multiple of 8. + * - invalid type specified for the parameter predictType. + * - the parameter rndVal is not equal either to 0 or 1. 
+ * + */ +OMXResult omxVCM4P2_MCReconBlock ( + const OMX_U8 *pSrc, + OMX_INT srcStep, + const OMX_S16 *pSrcResidue, + OMX_U8 *pDst, + OMX_INT dstStep, + OMX_INT predictType, + OMX_INT rndVal +); + + + +/* 6.3.1.1 Intra 16x16 Prediction Modes */ +/* A data type that enumerates intra_16x16 macroblock prediction modes is defined as follows: */ + +typedef enum { + OMX_VC_16X16_VERT = 0, /** Intra_16x16_Vertical */ + OMX_VC_16X16_HOR = 1, /** Intra_16x16_Horizontal */ + OMX_VC_16X16_DC = 2, /** Intra_16x16_DC */ + OMX_VC_16X16_PLANE = 3 /** Intra_16x16_Plane */ +} OMXVCM4P10Intra16x16PredMode; + + + +/* 6.3.1.2 Intra 4x4 Prediction Modes */ +/* A data type that enumerates intra_4x4 macroblock prediction modes is defined as follows: */ + +typedef enum { + OMX_VC_4X4_VERT = 0, /** Intra_4x4_Vertical */ + OMX_VC_4X4_HOR = 1, /** Intra_4x4_Horizontal */ + OMX_VC_4X4_DC = 2, /** Intra_4x4_DC */ + OMX_VC_4X4_DIAG_DL = 3, /** Intra_4x4_Diagonal_Down_Left */ + OMX_VC_4X4_DIAG_DR = 4, /** Intra_4x4_Diagonal_Down_Right */ + OMX_VC_4X4_VR = 5, /** Intra_4x4_Vertical_Right */ + OMX_VC_4X4_HD = 6, /** Intra_4x4_Horizontal_Down */ + OMX_VC_4X4_VL = 7, /** Intra_4x4_Vertical_Left */ + OMX_VC_4X4_HU = 8 /** Intra_4x4_Horizontal_Up */ +} OMXVCM4P10Intra4x4PredMode; + + + +/* 6.3.1.3 Chroma Prediction Modes */ +/* A data type that enumerates intra chroma prediction modes is defined as follows: */ + +typedef enum { + OMX_VC_CHROMA_DC = 0, /** Intra_Chroma_DC */ + OMX_VC_CHROMA_HOR = 1, /** Intra_Chroma_Horizontal */ + OMX_VC_CHROMA_VERT = 2, /** Intra_Chroma_Vertical */ + OMX_VC_CHROMA_PLANE = 3 /** Intra_Chroma_Plane */ +} OMXVCM4P10IntraChromaPredMode; + + + +/* 6.3.1.4 Motion Estimation Modes */ +/* A data type that enumerates H.264 motion estimation modes is defined as follows: */ + +typedef enum { + OMX_VC_M4P10_FAST_SEARCH = 0, /** Fast motion search */ + OMX_VC_M4P10_FULL_SEARCH = 1 /** Full motion search */ +} OMXVCM4P10MEMode; + + + +/* 6.3.1.5 Macroblock Types */ +/* A data type 
that enumerates H.264 macroblock types is defined as follows: */ + +typedef enum { + OMX_VC_P_16x16 = 0, /* defined by [ISO14496-10] */ + OMX_VC_P_16x8 = 1, + OMX_VC_P_8x16 = 2, + OMX_VC_P_8x8 = 3, + OMX_VC_PREF0_8x8 = 4, + OMX_VC_INTER_SKIP = 5, + OMX_VC_INTRA_4x4 = 8, + OMX_VC_INTRA_16x16 = 9, + OMX_VC_INTRA_PCM = 10 +} OMXVCM4P10MacroblockType; + + + +/* 6.3.1.6 Sub-Macroblock Types */ +/* A data type that enumerates H.264 sub-macroblock types is defined as follows: */ + +typedef enum { + OMX_VC_SUB_P_8x8 = 0, /* defined by [ISO14496-10] */ + OMX_VC_SUB_P_8x4 = 1, + OMX_VC_SUB_P_4x8 = 2, + OMX_VC_SUB_P_4x4 = 3 +} OMXVCM4P10SubMacroblockType; + + + +/* 6.3.1.7 Variable Length Coding (VLC) Information */ + +typedef struct { + OMX_U8 uTrailing_Ones; /* Trailing ones; 3 at most */ + OMX_U8 uTrailing_One_Signs; /* Trailing ones signal */ + OMX_U8 uNumCoeffs; /* Total number of non-zero coefs, including trailing ones */ + OMX_U8 uTotalZeros; /* Total number of zero coefs */ + OMX_S16 iLevels[16]; /* Levels of non-zero coefs, in reverse zig-zag order */ + OMX_U8 uRuns[16]; /* Runs for levels and trailing ones, in reverse zig-zag order */ +} OMXVCM4P10VLCInfo; + + + +/* 6.3.1.8 Macroblock Information */ + +typedef struct { + OMX_S32 sliceId; /* slice number */ + OMXVCM4P10MacroblockType mbType; /* MB type */ + OMXVCM4P10SubMacroblockType subMBType[4]; /* sub-block type */ + OMX_S32 qpy; /* qp for luma */ + OMX_S32 qpc; /* qp for chroma */ + OMX_U32 cbpy; /* CBP Luma */ + OMX_U32 cbpc; /* CBP Chroma */ + OMXVCMotionVector pMV0[4][4]; /* motion vector, represented using 1/4-pel units, pMV0[blocky][blockx] (blocky = 0~3, blockx =0~3) */ + OMXVCMotionVector pMVPred[4][4]; /* motion vector prediction, Represented using 1/4-pel units, pMVPred[blocky][blockx] (blocky = 0~3, blockx = 0~3) */ + OMX_U8 pRefL0Idx[4]; /* reference picture indices */ + OMXVCM4P10Intra16x16PredMode Intra16x16PredMode; /* best intra 16x16 prediction mode */ + OMXVCM4P10Intra4x4PredMode 
pIntra4x4PredMode[16]; /* best intra 4x4 prediction mode for each block, pMV0 indexed as above */ +} OMXVCM4P10MBInfo, *OMXVCM4P10MBInfoPtr; + + + +/* 6.3.1.9 Motion Estimation Parameters */ + +typedef struct { + OMX_S32 blockSplitEnable8x8; /* enables 16x8, 8x16, 8x8 */ + OMX_S32 blockSplitEnable4x4; /* enable splitting of 8x4, 4x8, 4x4 blocks */ + OMX_S32 halfSearchEnable; + OMX_S32 quarterSearchEnable; + OMX_S32 intraEnable4x4; /* 1=enable, 0=disable */ + OMX_S32 searchRange16x16; /* integer pixel units */ + OMX_S32 searchRange8x8; + OMX_S32 searchRange4x4; +} OMXVCM4P10MEParams; + + + +/** + * Function: omxVCM4P10_PredictIntra_4x4 (6.3.3.1.1) + * + * Description: + * Perform Intra_4x4 prediction for luma samples. If the upper-right block is + * not available, then duplication work should be handled inside the function. + * Users need not define them outside. + * + * Input Arguments: + * + * pSrcLeft - Pointer to the buffer of 4 left pixels: + * p[x, y] (x = -1, y = 0..3) + * pSrcAbove - Pointer to the buffer of 8 above pixels: + * p[x,y] (x = 0..7, y =-1); + * must be aligned on a 4-byte boundary. + * pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1) + * leftStep - Step of left pixel buffer; must be a multiple of 4. + * dstStep - Step of the destination buffer; must be a multiple of 4. + * predMode - Intra_4x4 prediction mode. + * availability - Neighboring 4x4 block availability flag, refer to + * "Neighboring Macroblock Availability" . + * + * Output Arguments: + * + * pDst - Pointer to the destination buffer; must be aligned on a 4-byte + * boundary. + * + * Return Value: + * If the function runs without error, it returns OMX_Sts_NoErr. + * If one of the following cases occurs, the function returns + * OMX_Sts_BadArgErr: + * pDst is NULL. + * dstStep < 4, or dstStep is not a multiple of 4. + * leftStep is not a multiple of 4. + * predMode is not in the valid range of enumeration + * OMXVCM4P10Intra4x4PredMode. 
+ * predMode is OMX_VC_4X4_VERT, but availability doesn't set OMX_VC_UPPER + * indicating p[x,-1] (x = 0..3) is not available. + * predMode is OMX_VC_4X4_HOR, but availability doesn't set OMX_VC_LEFT + * indicating p[-1,y] (y = 0..3) is not available. + * predMode is OMX_VC_4X4_DIAG_DL, but availability doesn't set + * OMX_VC_UPPER indicating p[x, -1] (x = 0..3) is not available. + * predMode is OMX_VC_4X4_DIAG_DR, but availability doesn't set + * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating + * p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not + * available. + * predMode is OMX_VC_4X4_VR, but availability doesn't set + * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating + * p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not + * available. + * predMode is OMX_VC_4X4_HD, but availability doesn't set + * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating + * p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not + * available. + * predMode is OMX_VC_4X4_VL, but availability doesn't set OMX_VC_UPPER + * indicating p[x,-1] (x = 0..3) is not available. + * predMode is OMX_VC_4X4_HU, but availability doesn't set OMX_VC_LEFT + * indicating p[-1,y] (y = 0..3) is not available. + * availability sets OMX_VC_UPPER, but pSrcAbove is NULL. + * availability sets OMX_VC_LEFT, but pSrcLeft is NULL. + * availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL. + * either pSrcAbove or pDst is not aligned on a 4-byte boundary. + * + * Note: + * pSrcLeft, pSrcAbove, pSrcAboveLeft may be invalid pointers if + * they are not used by intra prediction as implied in predMode. 
+ * + */ +OMXResult omxVCM4P10_PredictIntra_4x4 ( + const OMX_U8 *pSrcLeft, + const OMX_U8 *pSrcAbove, + const OMX_U8 *pSrcAboveLeft, + OMX_U8 *pDst, + OMX_INT leftStep, + OMX_INT dstStep, + OMXVCM4P10Intra4x4PredMode predMode, + OMX_S32 availability +); + + + +/** + * Function: omxVCM4P10_PredictIntra_16x16 (6.3.3.1.2) + * + * Description: + * Perform Intra_16x16 prediction for luma samples. If the upper-right block + * is not available, then duplication work should be handled inside the + * function. Users need not define them outside. + * + * Input Arguments: + * + * pSrcLeft - Pointer to the buffer of 16 left pixels: p[x, y] (x = -1, y = + * 0..15) + * pSrcAbove - Pointer to the buffer of 16 above pixels: p[x,y] (x = 0..15, + * y= -1); must be aligned on a 16-byte boundary. + * pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1) + * leftStep - Step of left pixel buffer; must be a multiple of 16. + * dstStep - Step of the destination buffer; must be a multiple of 16. + * predMode - Intra_16x16 prediction mode, please refer to section 3.4.1. + * availability - Neighboring 16x16 MB availability flag. Refer to + * section 3.4.4. + * + * Output Arguments: + * + * pDst -Pointer to the destination buffer; must be aligned on a 16-byte + * boundary. + * + * Return Value: + * If the function runs without error, it returns OMX_Sts_NoErr. + * If one of the following cases occurs, the function returns + * OMX_Sts_BadArgErr: + * pDst is NULL. + * dstStep < 16. or dstStep is not a multiple of 16. + * leftStep is not a multiple of 16. + * predMode is not in the valid range of enumeration + * OMXVCM4P10Intra16x16PredMode + * predMode is OMX_VC_16X16_VERT, but availability doesn't set + * OMX_VC_UPPER indicating p[x,-1] (x = 0..15) is not available. + * predMode is OMX_VC_16X16_HOR, but availability doesn't set OMX_VC_LEFT + * indicating p[-1,y] (y = 0..15) is not available. 
+ * predMode is OMX_VC_16X16_PLANE, but availability doesn't set + * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating + * p[x,-1] (x = 0..15), or p[-1,y] (y = 0..15), or p[-1,-1] is not + * available. + * availability sets OMX_VC_UPPER, but pSrcAbove is NULL. + * availability sets OMX_VC_LEFT, but pSrcLeft is NULL. + * availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL. + * either pSrcAbove or pDst is not aligned on a 16-byte boundary. + * + * Note: + * pSrcLeft, pSrcAbove, pSrcAboveLeft may be invalid pointers if + * they are not used by intra prediction as implied in predMode. + * Note: + * OMX_VC_UPPER_RIGHT is not used in intra_16x16 luma prediction. + * + */ +OMXResult omxVCM4P10_PredictIntra_16x16 ( + const OMX_U8 *pSrcLeft, + const OMX_U8 *pSrcAbove, + const OMX_U8 *pSrcAboveLeft, + OMX_U8 *pDst, + OMX_INT leftStep, + OMX_INT dstStep, + OMXVCM4P10Intra16x16PredMode predMode, + OMX_S32 availability +); + + + +/** + * Function: omxVCM4P10_PredictIntraChroma_8x8 (6.3.3.1.3) + * + * Description: + * Performs intra prediction for chroma samples. + * + * Input Arguments: + * + * pSrcLeft - Pointer to the buffer of 8 left pixels: p[x, y] (x = -1, y= + * 0..7). + * pSrcAbove - Pointer to the buffer of 8 above pixels: p[x,y] (x = 0..7, y + * = -1); must be aligned on an 8-byte boundary. + * pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1) + * leftStep - Step of left pixel buffer; must be a multiple of 8. + * dstStep - Step of the destination buffer; must be a multiple of 8. + * predMode - Intra chroma prediction mode, please refer to section 3.4.3. + * availability - Neighboring chroma block availability flag, please refer + * to "Neighboring Macroblock Availability". + * + * Output Arguments: + * + * pDst - Pointer to the destination buffer; must be aligned on an 8-byte + * boundary. + * + * Return Value: + * If the function runs without error, it returns OMX_Sts_NoErr. 
+ * If any of the following cases occurs, the function returns + * OMX_Sts_BadArgErr: + * pDst is NULL. + * dstStep < 8 or dstStep is not a multiple of 8. + * leftStep is not a multiple of 8. + * predMode is not in the valid range of enumeration + * OMXVCM4P10IntraChromaPredMode. + * predMode is OMX_VC_CHROMA_VERT, but availability doesn't set + * OMX_VC_UPPER indicating p[x,-1] (x = 0..7) is not available. + * predMode is OMX_VC_CHROMA_HOR, but availability doesn't set OMX_VC_LEFT + * indicating p[-1,y] (y = 0..7) is not available. + * predMode is OMX_VC_CHROMA_PLANE, but availability doesn't set + * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating + * p[x,-1] (x = 0..7), or p[-1,y] (y = 0..7), or p[-1,-1] is not + * available. + * availability sets OMX_VC_UPPER, but pSrcAbove is NULL. + * availability sets OMX_VC_LEFT, but pSrcLeft is NULL. + * availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL. + * either pSrcAbove or pDst is not aligned on an 8-byte boundary. + * + * Note: pSrcLeft, pSrcAbove, pSrcAboveLeft may be invalid pointers if + * they are not used by intra prediction as implied in predMode. + * + * Note: OMX_VC_UPPER_RIGHT is not used in intra chroma prediction. + * + */ +OMXResult omxVCM4P10_PredictIntraChroma_8x8 ( + const OMX_U8 *pSrcLeft, + const OMX_U8 *pSrcAbove, + const OMX_U8 *pSrcAboveLeft, + OMX_U8 *pDst, + OMX_INT leftStep, + OMX_INT dstStep, + OMXVCM4P10IntraChromaPredMode predMode, + OMX_S32 availability +); + + + +/** + * Function: omxVCM4P10_InterpolateLuma (6.3.3.2.1) + * + * Description: + * Performs quarter-pixel interpolation for inter luma MB. It is assumed that + * the frame is already padded when calling this function. 
+ * + * Input Arguments: + * + * pSrc - Pointer to the source reference frame buffer + * srcStep - reference frame step, in bytes; must be a multiple of roi.width + * dstStep - destination frame step, in bytes; must be a multiple of + * roi.width + * dx - Fractional part of horizontal motion vector component in 1/4 pixel + * unit; valid in the range [0,3] + * dy - Fractional part of vertical motion vector y component in 1/4 pixel + * unit; valid in the range [0,3] + * roi - Dimension of the interpolation region; the parameters roi.width and + * roi.height must be equal to either 4, 8, or 16. + * + * Output Arguments: + * + * pDst - Pointer to the destination frame buffer: + * if roi.width==4, 4-byte alignment required + * if roi.width==8, 8-byte alignment required + * if roi.width==16, 16-byte alignment required + * + * Return Value: + * If the function runs without error, it returns OMX_Sts_NoErr. + * If one of the following cases occurs, the function returns + * OMX_Sts_BadArgErr: + * pSrc or pDst is NULL. + * srcStep or dstStep < roi.width. + * dx or dy is out of range [0,3]. + * roi.width or roi.height is out of range {4, 8, 16}. + * roi.width is equal to 4, but pDst is not 4 byte aligned. + * roi.width is equal to 8 or 16, but pDst is not 8 byte aligned. + * srcStep or dstStep is not a multiple of 8. + * + */ +OMXResult omxVCM4P10_InterpolateLuma ( + const OMX_U8 *pSrc, + OMX_S32 srcStep, + OMX_U8 *pDst, + OMX_S32 dstStep, + OMX_S32 dx, + OMX_S32 dy, + OMXSize roi +); + + + +/** + * Function: omxVCM4P10_InterpolateChroma (6.3.3.2.2) + * + * Description: + * Performs 1/8-pixel interpolation for inter chroma MB. + * + * Input Arguments: + * + * pSrc -Pointer to the source reference frame buffer + * srcStep -Reference frame step in bytes + * dstStep -Destination frame step in bytes; must be a multiple of + * roi.width. 
+ * dx -Fractional part of horizontal motion vector component in 1/8 pixel + * unit; valid in the range [0,7] + * dy -Fractional part of vertical motion vector component in 1/8 pixel + * unit; valid in the range [0,7] + * roi -Dimension of the interpolation region; the parameters roi.width and + * roi.height must be equal to either 2, 4, or 8. + * + * Output Arguments: + * + * pDst -Pointer to the destination frame buffer: + * if roi.width==2, 2-byte alignment required + * if roi.width==4, 4-byte alignment required + * if roi.width==8, 8-byte alignment required + * + * Return Value: + * If the function runs without error, it returns OMX_Sts_NoErr. + * If one of the following cases occurs, the function returns + * OMX_Sts_BadArgErr: + * pSrc or pDst is NULL. + * srcStep or dstStep < 8. + * dx or dy is out of range [0-7]. + * roi.width or roi.height is out of range {2,4,8}. + * roi.width is equal to 2, but pDst is not 2-byte aligned. + * roi.width is equal to 4, but pDst is not 4-byte aligned. + * roi.width is equal to 8, but pDst is not 8 byte aligned. + * srcStep or dstStep is not a multiple of 8. + * + */ +OMXResult omxVCM4P10_InterpolateChroma ( + const OMX_U8 *pSrc, + OMX_S32 srcStep, + OMX_U8 *pDst, + OMX_S32 dstStep, + OMX_S32 dx, + OMX_S32 dy, + OMXSize roi +); + + + +/** + * Function: omxVCM4P10_FilterDeblockingLuma_VerEdge_I (6.3.3.3.1) + * + * Description: + * Performs in-place deblock filtering on four vertical edges of the luma + * macroblock (16x16). + * + * Input Arguments: + * + * pSrcDst - Pointer to the input macroblock; must be 16-byte aligned. + * srcdstStep -Step of the arrays; must be a multiple of 16. + * pAlpha -Array of size 2 of alpha thresholds (the first item is the alpha + * threshold for the external vertical edge, and the second item is + * for the internal vertical edge); per [ISO14496-10] alpha values + * must be in the range [0,255]. 
+ * pBeta -Array of size 2 of beta thresholds (the first item is the beta + * threshold for the external vertical edge, and the second item is + * for the internal vertical edge); per [ISO14496-10] beta values + * must be in the range [0,18]. + * pThresholds -Array of size 16 of Thresholds (TC0) (values for the left + * edge of each 4x4 block, arranged in vertical block order); must + * be aligned on a 4-byte boundary. Per [ISO14496-10] values must + * be in the range [0,25]. + * pBS -Array of size 16 of BS parameters (arranged in vertical block + * order); valid in the range [0,4] with the following + * restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) + * pBS[i]== 4 if and only if pBS[i^3]== 4. Must be 4-byte aligned. + * + * Output Arguments: + * + * pSrcDst -Pointer to filtered output macroblock. + * + * Return Value: + * If the function runs without error, it returns OMX_Sts_NoErr. + * If one of the following cases occurs, the function returns + * OMX_Sts_BadArgErr: + * Any of the pointers pSrcDst, pAlpha, pBeta, pThresholds, or pBS + * is NULL. + * Either pThresholds or pBS is not aligned on a 4-byte boundary. + * pSrcDst is not 16-byte aligned. + * srcdstStep is not a multiple of 16. + * pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. + * pBeta[0] and/or pBeta[1] is outside the range [0,18]. + * One or more entries in the table pThresholds[0..15] is outside of the + * range [0,25]. + * pBS is out of range, i.e., one of the following conditions is true: + * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && + * pBS[i^3]!=4) for 0<=i<=3. 
+ * + */ +OMXResult omxVCM4P10_FilterDeblockingLuma_VerEdge_I ( + OMX_U8 *pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8 *pAlpha, + const OMX_U8 *pBeta, + const OMX_U8 *pThresholds, + const OMX_U8 *pBS +); + + + +/** + * Function: omxVCM4P10_FilterDeblockingLuma_HorEdge_I (6.3.3.3.2) + * + * Description: + * Performs in-place deblock filtering on four horizontal edges of the luma + * macroblock (16x16). + * + * Input Arguments: + * + * pSrcDst - pointer to the input macroblock; must be 16-byte aligned. + * srcdstStep - step of the arrays; must be a multiple of 16. + * pAlpha - array of size 2 of alpha thresholds (the first item is the alpha + * threshold for the external horizontal edge, and the second item is + * for the internal horizontal edge); per [ISO14496-10] alpha + * values must be in the range [0,255]. + * pBeta - array of size 2 of beta thresholds (the first item is the beta + * threshold for the external horizontal edge, and the second item + * is for the internal horizontal edge). Per [ISO14496-10] beta + * values must be in the range [0,18]. + * pThresholds - array of size 16 containing thresholds, TC0, for the top + * horizontal edge of each 4x4 block, arranged in horizontal block + * order; must be aligned on a 4-byte boundary. Per [ISO14496-10] + * values must be in the range [0,25]. + * pBS - array of size 16 of BS parameters (arranged in horizontal block + * order); valid in the range [0,4] with the following + * restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) + * pBS[i]== 4 if and only if pBS[i^3]== 4. Must be 4-byte aligned. + * + * Output Arguments: + * + * pSrcDst -Pointer to filtered output macroblock. + * + * Return Value: + * + * OMX_Sts_NoErr, if the function runs without error. + * + * OMX_Sts_BadArgErr, if one of the following cases occurs: + * - one or more of the following pointers is NULL: pSrcDst, pAlpha, + * pBeta, pThresholds, or pBS. + * - either pThresholds or pBS is not aligned on a 4-byte boundary. 
+ * - pSrcDst is not 16-byte aligned. + * - srcdstStep is not a multiple of 16. + * - pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. + * - pBeta[0] and/or pBeta[1] is outside the range [0,18]. + * - One or more entries in the table pThresholds[0..15] is + * outside of the range [0,25]. + * - pBS is out of range, i.e., one of the following conditions is true: + * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or + * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. + * + */ +OMXResult omxVCM4P10_FilterDeblockingLuma_HorEdge_I ( + OMX_U8 *pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8 *pAlpha, + const OMX_U8 *pBeta, + const OMX_U8 *pThresholds, + const OMX_U8 *pBS +); + + + +/** + * Function: omxVCM4P10_FilterDeblockingChroma_VerEdge_I (6.3.3.3.3) + * + * Description: + * Performs in-place deblock filtering on four vertical edges of the chroma + * macroblock (8x8). + * + * Input Arguments: + * + * pSrcDst - Pointer to the input macroblock; must be 8-byte aligned. + * srcdstStep - Step of the arrays; must be a multiple of 8. + * pAlpha - Array of size 2 of alpha thresholds (the first item is alpha + * threshold for external vertical edge, and the second item is for + * internal vertical edge); per [ISO14496-10] alpha values must be + * in the range [0,255]. + * pBeta - Array of size 2 of beta thresholds (the first item is the beta + * threshold for the external vertical edge, and the second item is + * for the internal vertical edge); per [ISO14496-10] beta values + * must be in the range [0,18]. + * pThresholds - Array of size 8 containing thresholds, TC0, for the left + * vertical edge of each 4x2 chroma block, arranged in vertical + * block order; must be aligned on a 4-byte boundary. Per + * [ISO14496-10] values must be in the range [0,25]. + * pBS - Array of size 16 of BS parameters (values for each 2x2 chroma + * block, arranged in vertical block order). 
This parameter is the + * same as the pBS parameter passed into FilterDeblockLuma_VerEdge; + * valid in the range [0,4] with the following restrictions: i) + * pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and + * only if pBS[i^3]== 4. Must be 4 byte aligned. + * + * Output Arguments: + * + * pSrcDst -Pointer to filtered output macroblock. + * + * Return Value: + * + * OMX_Sts_NoErr, if the function runs without error. + * + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - one or more of the following pointers is NULL: pSrcDst, pAlpha, + * pBeta, pThresholds, or pBS. + * - pSrcDst is not 8-byte aligned. + * - srcdstStep is not a multiple of 8. + * - pThresholds is not 4-byte aligned. + * - pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. + * - pBeta[0] and/or pBeta[1] is outside the range [0,18]. + * - One or more entries in the table pThresholds[0..7] is outside + * of the range [0,25]. + * - pBS is out of range, i.e., one of the following conditions is true: + * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or + * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. + * - pBS is not 4-byte aligned. + * + */ +OMXResult omxVCM4P10_FilterDeblockingChroma_VerEdge_I ( + OMX_U8 *pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8 *pAlpha, + const OMX_U8 *pBeta, + const OMX_U8 *pThresholds, + const OMX_U8 *pBS +); + + + +/** + * Function: omxVCM4P10_FilterDeblockingChroma_HorEdge_I (6.3.3.3.4) + * + * Description: + * Performs in-place deblock filtering on the horizontal edges of the chroma + * macroblock (8x8). + * + * Input Arguments: + * + * pSrcDst - pointer to the input macroblock; must be 8-byte aligned. + * srcdstStep - array step; must be a multiple of 8. + * pAlpha - array of size 2 containing alpha thresholds; the first element + * contains the threshold for the external horizontal edge, and the + * second element contains the threshold for internal horizontal + * edge. Per [ISO14496-10] alpha values must be in the range + * [0,255]. 
+ * pBeta - array of size 2 containing beta thresholds; the first element + * contains the threshold for the external horizontal edge, and the + * second element contains the threshold for the internal + * horizontal edge. Per [ISO14496-10] beta values must be in the + * range [0,18]. + * pThresholds - array of size 8 containing thresholds, TC0, for the top + * horizontal edge of each 2x4 chroma block, arranged in horizontal + * block order; must be aligned on a 4-byte boundary. Per + * [ISO14496-10] values must be in the range [0,25]. + * pBS - array of size 16 containing BS parameters for each 2x2 chroma + * block, arranged in horizontal block order; valid in the range + * [0,4] with the following restrictions: i) pBS[i]== 4 may occur + * only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== 4. + * Must be 4-byte aligned. + * + * Output Arguments: + * + * pSrcDst -Pointer to filtered output macroblock. + * + * Return Value: + * + * OMX_Sts_NoErr, if the function runs without error. + * + * OMX_Sts_BadArgErr, if one of the following cases occurs: + * - any of the following pointers is NULL: + * pSrcDst, pAlpha, pBeta, pThresholds, or pBS. + * - pSrcDst is not 8-byte aligned. + * - srcdstStep is not a multiple of 8. + * - pThresholds is not 4-byte aligned. + * - pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. + * - pBeta[0] and/or pBeta[1] is outside the range [0,18]. + * - One or more entries in the table pThresholds[0..7] is outside + * of the range [0,25]. + * - pBS is out of range, i.e., one of the following conditions is true: + * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or + * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. + * - pBS is not 4-byte aligned. 
+ * + */ +OMXResult omxVCM4P10_FilterDeblockingChroma_HorEdge_I ( + OMX_U8 *pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8 *pAlpha, + const OMX_U8 *pBeta, + const OMX_U8 *pThresholds, + const OMX_U8 *pBS +); + + + +/** + * Function: omxVCM4P10_DeblockLuma_I (6.3.3.3.5) + * + * Description: + * This function performs in-place deblock filtering the horizontal and + * vertical edges of a luma macroblock (16x16). + * + * Input Arguments: + * + * pSrcDst - pointer to the input macroblock; must be 16-byte aligned. + * srcdstStep - image width; must be a multiple of 16. + * pAlpha - pointer to a 2x2 table of alpha thresholds, organized as + * follows: {external vertical edge, internal vertical edge, + * external horizontal edge, internal horizontal edge }. Per + * [ISO14496-10] alpha values must be in the range [0,255]. + * pBeta - pointer to a 2x2 table of beta thresholds, organized as follows: + * {external vertical edge, internal vertical edge, external + * horizontal edge, internal horizontal edge }. Per [ISO14496-10] + * beta values must be in the range [0,18]. + * pThresholds - pointer to a 16x2 table of threshold (TC0), organized as + * follows: {values for the left or above edge of each 4x4 block, + * arranged in vertical block order and then in horizontal block + * order}; must be aligned on a 4-byte boundary. Per [ISO14496-10] + * values must be in the range [0,25]. + * pBS - pointer to a 16x2 table of BS parameters arranged in scan block + * order for vertical edges and then horizontal edges; valid in the + * range [0,4] with the following restrictions: i) pBS[i]== 4 may + * occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== + * 4. Must be 4-byte aligned. + * + * Output Arguments: + * + * pSrcDst - pointer to filtered output macroblock. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - one or more of the following pointers is NULL: pSrcDst, pAlpha, + * pBeta, pThresholds or pBS. 
+ * - pSrcDst is not 16-byte aligned. + * - either pThresholds or pBS is not aligned on a 4-byte boundary. + * - one or more entries in the table pAlpha[0..3] is outside the range + * [0,255]. + * - one or more entries in the table pBeta[0..3] is outside the range + * [0,18]. + * - one or more entries in the table pThresholds[0..31] is outside of + * the range [0,25]. + * - pBS is out of range, i.e., one of the following conditions is true: + * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or + * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. + * - srcdstStep is not a multiple of 16. + * + */ +OMXResult omxVCM4P10_DeblockLuma_I ( + OMX_U8 *pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8 *pAlpha, + const OMX_U8 *pBeta, + const OMX_U8 *pThresholds, + const OMX_U8 *pBS +); + + + +/** + * Function: omxVCM4P10_DeblockChroma_I (6.3.3.3.6) + * + * Description: + * Performs in-place deblocking filtering on all edges of the chroma + * macroblock (8x8). + * + * Input Arguments: + * + * pSrcDst - pointer to the input macroblock; must be 8-byte aligned. + * srcdstStep - step of the arrays; must be a multiple of 8. + * pAlpha - pointer to a 2x2 array of alpha thresholds, organized as + * follows: {external vertical edge, internal vertical edge, + * external horizontal edge, internal horizontal edge }. Per + * [ISO14496-10] alpha values must be in the range [0,255]. + * pBeta - pointer to a 2x2 array of Beta Thresholds, organized as follows: + * { external vertical edge, internal vertical edge, external + * horizontal edge, internal horizontal edge }. Per [ISO14496-10] + * beta values must be in the range [0,18]. + * pThresholds - array of size 8x2 of Thresholds (TC0) (values for the left + * or above edge of each 4x2 or 2x4 block, arranged in vertical + * block order and then in horizontal block order); must be aligned + * on a 4-byte boundary. Per [ISO14496-10] values must be in the + * range [0,25]. 
+ * pBS - array of size 16x2 of BS parameters (arranged in scan block order + * for vertical edges and then horizontal edges); valid in the + * range [0,4] with the following restrictions: i) pBS[i]== 4 may + * occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== + * 4. Must be 4-byte aligned. + * + * Output Arguments: + * + * pSrcDst - pointer to filtered output macroblock. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - one or more of the following pointers is NULL: pSrcDst, pAlpha, + * pBeta, pThresholds, or pBS. + * - pSrcDst is not 8-byte aligned. + * - either pThresholds or pBS is not 4-byte aligned. + * - one or more entries in the table pAlpha[0..3] is outside the range + * [0,255]. + * - one or more entries in the table pBeta[0..3] is outside the range + * [0,18]. + * - one or more entries in the table pThresholds[0..15] is outside of + * the range [0,25]. + * - pBS is out of range, i.e., one of the following conditions is true: + * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or + * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. + * - srcdstStep is not a multiple of 8. + * + */ +OMXResult omxVCM4P10_DeblockChroma_I ( + OMX_U8 *pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8 *pAlpha, + const OMX_U8 *pBeta, + const OMX_U8 *pThresholds, + const OMX_U8 *pBS +); + + + +/** + * Function: omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC (6.3.4.1.1) + * + * Description: + * Performs CAVLC decoding and inverse raster scan for a 2x2 block of + * ChromaDCLevel. The decoded coefficients in the packed position-coefficient + * buffer are stored in reverse zig-zag order, i.e., the first buffer element + * contains the last non-zero position-coefficient pair of the block. Within + * each position-coefficient pair, the position entry indicates the + * raster-scan position of the coefficient, while the coefficient entry + * contains the coefficient value. 
+ * + * Input Arguments: + * + * ppBitStream - Double pointer to current byte in bit stream buffer + * pOffset - Pointer to current bit position in the byte pointed to by + * *ppBitStream; valid in the range [0,7]. + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after each block is decoded + * pOffset - *pOffset is updated after each block is decoded + * pNumCoeff - Pointer to the number of nonzero coefficients in this block + * ppPosCoefBuf - Double pointer to destination residual + * coefficient-position pair buffer. Buffer position + * (*ppPosCoefBuf) is updated upon return, unless there are only + * zero coefficients in the currently decoded block. In this case + * the caller is expected to bypass the transform/dequantization of + * the empty blocks. + * + * Return Value: + * + * OMX_Sts_NoErr, if the function runs without error. + * + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - ppBitStream or pOffset is NULL. + * - ppPosCoefBuf or pNumCoeff is NULL. + * OMX_Sts_Err - if one of the following is true: + * - an illegal code is encountered in the bitstream + * + */ +OMXResult omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC ( + const OMX_U8 **ppBitStream, + OMX_S32*pOffset, + OMX_U8 *pNumCoeff, + OMX_U8 **ppPosCoefbuf +); + + + +/** + * Function: omxVCM4P10_DecodeCoeffsToPairCAVLC (6.3.4.1.2) + * + * Description: + * Performs CAVLC decoding and inverse zigzag scan for 4x4 block of + * Intra16x16DCLevel, Intra16x16ACLevel, LumaLevel, and ChromaACLevel. Inverse + * field scan is not supported. The decoded coefficients in the packed + * position-coefficient buffer are stored in reverse zig-zag order, i.e., the + * first buffer element contains the last non-zero position-coefficient pair of + * the block. Within each position-coefficient pair, the position entry + * indicates the raster-scan position of the coefficient, while the + * coefficient entry contains the coefficient value. 
+ * + * Input Arguments: + * + * ppBitStream -Double pointer to current byte in bit stream buffer + * pOffset - Pointer to current bit position in the byte pointed to by + * *ppBitStream; valid in the range [0,7]. + * sMaxNumCoeff - Maximum number of non-zero coefficients in the current + * block + * sVLCSelect - VLC table selector, obtained from the number of non-zero + * coefficients contained in the above and left 4x4 blocks. It is + * equivalent to the variable nC described in H.264 standard table + * 9-5, except its value can't be less than zero. + * + * Output Arguments: + * + * ppBitStream - *ppBitStream is updated after each block is decoded. + * Buffer position (*ppPosCoefBuf) is updated upon return, unless + * there are only zero coefficients in the currently decoded block. + * In this case the caller is expected to bypass the + * transform/dequantization of the empty blocks. + * pOffset - *pOffset is updated after each block is decoded + * pNumCoeff - Pointer to the number of nonzero coefficients in this block + * ppPosCoefBuf - Double pointer to destination residual + * coefficient-position pair buffer + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - ppBitStream or pOffset is NULL. + * - ppPosCoefBuf or pNumCoeff is NULL. + * - sMaxNumCoeff is not equal to either 15 or 16. + * - sVLCSelect is less than 0. 
+ * + * OMX_Sts_Err - if one of the following is true: + * - an illegal code is encountered in the bitstream + * + */ +OMXResult omxVCM4P10_DecodeCoeffsToPairCAVLC ( + const OMX_U8 **ppBitStream, + OMX_S32 *pOffset, + OMX_U8 *pNumCoeff, + OMX_U8 **ppPosCoefbuf, + OMX_INT sVLCSelect, + OMX_INT sMaxNumCoeff +); + + + +/** + * Function: omxVCM4P10_TransformDequantLumaDCFromPair (6.3.4.2.1) + * + * Description: + * Reconstructs the 4x4 LumaDC block from the coefficient-position pair + * buffer, performs integer inverse transformation and dequantization for 4x4 LumaDC + * coefficients, and updates the pair buffer pointer to the next non-empty + * block. + * + * Input Arguments: + * + * ppSrc - Double pointer to residual coefficient-position pair buffer + * output by CAVLC decoding + * QP - Quantization parameter QpY + * + * Output Arguments: + * + * ppSrc - *ppSrc is updated to the start of next non empty block + * pDst - Pointer to the reconstructed 4x4 LumaDC coefficients buffer; must + * be aligned on an 8-byte boundary. + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - ppSrc or pDst is NULL. + * - pDst is not 8-byte aligned. + * - QP is not in the range of [0-51]. + * + */ +OMXResult omxVCM4P10_TransformDequantLumaDCFromPair ( + const OMX_U8 **ppSrc, + OMX_S16 *pDst, + OMX_INT QP +); + + + +/** + * Function: omxVCM4P10_TransformDequantChromaDCFromPair (6.3.4.2.2) + * + * Description: + * Reconstruct the 2x2 ChromaDC block from coefficient-position pair buffer, + * perform integer inverse transformation, and dequantization for 2x2 chroma + * DC coefficients, and update the pair buffer pointer to next non-empty + * block. 
+ * + * Input Arguments: + * + * ppSrc - Double pointer to residual coefficient-position pair buffer + * output by CAVLC decoding + * QP - Quantization parameter QpC + * + * Output Arguments: + * + * ppSrc - *ppSrc is updated to the start of next non empty block + * pDst - Pointer to the reconstructed 2x2 ChromaDC coefficients buffer; + * must be aligned on a 4-byte boundary. + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - ppSrc or pDst is NULL. + * - pDst is not 4-byte aligned. + * - QP is not in the range of [0-51]. + * + */ +OMXResult omxVCM4P10_TransformDequantChromaDCFromPair ( + const OMX_U8 **ppSrc, + OMX_S16 *pDst, + OMX_INT QP +); + + + +/** + * Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd (6.3.4.2.3) + * + * Description: + * Reconstruct the 4x4 residual block from coefficient-position pair buffer, + * perform dequantization and integer inverse transformation for 4x4 block of + * residuals with previous intra prediction or motion compensation data, and + * update the pair buffer pointer to next non-empty block. If pDC == NULL, + * there are 16 non-zero AC coefficients at most in the packed buffer starting + * from 4x4 block position 0; If pDC != NULL, there are 15 non-zero AC + * coefficients at most in the packed buffer starting from 4x4 block position + * 1. + * + * Input Arguments: + * + * ppSrc - Double pointer to residual coefficient-position pair buffer + * output by CAVLC decoding + * pPred - Pointer to the predicted 4x4 block; must be aligned on a 4-byte + * boundary + * predStep - Predicted frame step size in bytes; must be a multiple of 4 + * dstStep - Destination frame step in bytes; must be a multiple of 4 + * pDC - Pointer to the DC coefficient of this block, NULL if it doesn't + * exist + * QP - Quantization parameter. It should be QpC in chroma 4x4 block + * decoding, otherwise it should be QpY. 
+ * AC - Flag indicating if at least one non-zero AC coefficient exists + * + * Output Arguments: + * + * pDst - pointer to the reconstructed 4x4 block data; must be aligned on a + * 4-byte boundary + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - pPred or pDst is NULL. + * - pPred or pDst is not 4-byte aligned. + * - predStep or dstStep is not a multiple of 4. + * - AC !=0 and Qp is not in the range of [0-51] or ppSrc == NULL. + * - AC ==0 && pDC ==NULL. + * + */ +OMXResult omxVCM4P10_DequantTransformResidualFromPairAndAdd ( + const OMX_U8 **ppSrc, + const OMX_U8 *pPred, + const OMX_S16 *pDC, + OMX_U8 *pDst, + OMX_INT predStep, + OMX_INT dstStep, + OMX_INT QP, + OMX_INT AC +); + + + +/** + * Function: omxVCM4P10_MEGetBufSize (6.3.5.1.1) + * + * Description: + * Computes the size, in bytes, of the vendor-specific specification + * structure for the omxVCM4P10 motion estimation functions BlockMatch_Integer + * and MotionEstimationMB. + * + * Input Arguments: + * + * MEmode - motion estimation mode; available modes are defined by the + * enumerated type OMXVCM4P10MEMode + * pMEParams -motion estimation parameters + * + * Output Arguments: + * + * pSize - pointer to the number of bytes required for the motion + * estimation specification structure + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - pMEParams or pSize is NULL. + * - an invalid MEMode is specified. + * + */ +OMXResult omxVCM4P10_MEGetBufSize ( + OMXVCM4P10MEMode MEmode, + const OMXVCM4P10MEParams *pMEParams, + OMX_U32 *pSize +); + + + +/** + * Function: omxVCM4P10_MEInit (6.3.5.1.2) + * + * Description: + * Initializes the vendor-specific specification structure required for the + * omxVCM4P10 motion estimation functions: BlockMatch_Integer and + * MotionEstimationMB. 
Memory for the specification structure *pMESpec must be + * allocated prior to calling the function, and should be aligned on a 4-byte + * boundary. The number of bytes required for the specification structure can + * be determined using the function omxVCM4P10_MEGetBufSize. Following + * initialization by this function, the vendor-specific structure *pMESpec + * should contain an implementation-specific representation of all motion + * estimation parameters received via the structure pMEParams, for example + * searchRange16x16, searchRange8x8, etc. + * + * Input Arguments: + * + * MEmode - motion estimation mode; available modes are defined by the + * enumerated type OMXVCM4P10MEMode + * pMEParams - motion estimation parameters + * pMESpec - pointer to the uninitialized ME specification structure + * + * Output Arguments: + * + * pMESpec - pointer to the initialized ME specification structure + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - pMEParams or pMESpec is NULL. + * - an invalid value was specified for the parameter MEmode + * - a negative or zero value was specified for one of the search ranges + * (e.g., pMEParams->searchRange8x8, pMEParams->searchRange16x16, etc.) + * - either in isolation or in combination, one or more of the enables or + * search ranges in the structure *pMEParams were configured such + * that the requested behavior fails to comply with [ISO14496-10]. + * + */ +OMXResult omxVCM4P10_MEInit ( + OMXVCM4P10MEMode MEmode, + const OMXVCM4P10MEParams *pMEParams, + void *pMESpec +); + + + +/** + * Function: omxVCM4P10_BlockMatch_Integer (6.3.5.2.1) + * + * Description: + * Performs integer block match. Returns best MV and associated cost. + * + * Input Arguments: + * + * pSrcOrgY - Pointer to the top-left corner of the current block: + * If iBlockWidth==4, 4-byte alignment required. + * If iBlockWidth==8, 8-byte alignment required. 
+ * If iBlockWidth==16, 16-byte alignment required. + * pSrcRefY - Pointer to the top-left corner of the co-located block in the + * reference picture: + * If iBlockWidth==4, 4-byte alignment required. + * If iBlockWidth==8, 8-byte alignment required. + * If iBlockWidth==16, 16-byte alignment required. + * nSrcOrgStep - Stride of the original picture plane, expressed in terms + * of integer pixels; must be a multiple of iBlockWidth. + * nSrcRefStep - Stride of the reference picture plane, expressed in terms + * of integer pixels + * pRefRect - pointer to the valid reference rectangle inside the reference + * picture plane + * pCurrPointPos - position of the current block in the current plane + * iBlockWidth - Width of the current block, expressed in terms of integer + * pixels; must be equal to either 4, 8, or 16. + * iBlockHeight - Height of the current block, expressed in terms of + * integer pixels; must be equal to either 4, 8, or 16. + * nLamda - Lamda factor; used to compute motion cost + * pMVPred - Predicted MV; used to compute motion cost, expressed in terms + * of 1/4-pel units + * pMVCandidate - Candidate MV; used to initialize the motion search, + * expressed in terms of integer pixels + * pMESpec - pointer to the ME specification structure + * + * Output Arguments: + * + * pBestMV - Best MV resulting from integer search, expressed in terms + * of 1/4-pel units + * pBestCost - Motion cost associated with the best MV; computed as + * SAD+Lamda*BitsUsedByMV + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - any of the following pointers is NULL: + * pSrcOrgY, pSrcRefY, pRefRect, pMVPred, pMVCandidate, or pMESpec. + * - Either iBlockWidth or iBlockHeight is a value other than 4, 8, or 16. 
+ * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_BlockMatch_Integer ( + const OMX_U8 *pSrcOrgY, + OMX_S32 nSrcOrgStep, + const OMX_U8 *pSrcRefY, + OMX_S32 nSrcRefStep, + const OMXRect *pRefRect, + const OMXVCM4P2Coordinate *pCurrPointPos, + OMX_U8 iBlockWidth, + OMX_U8 iBlockHeight, + OMX_U32 nLamda, + const OMXVCMotionVector *pMVPred, + const OMXVCMotionVector *pMVCandidate, + OMXVCMotionVector *pBestMV, + OMX_S32 *pBestCost, + void *pMESpec +); + + + +/** + * Function: omxVCM4P10_BlockMatch_Half (6.3.5.2.2) + * + * Description: + * Performs a half-pel block match using results from a prior integer search. + * Returns the best MV and associated cost. This function estimates the + * half-pixel motion vector by interpolating the integer resolution motion + * vector referenced by the input parameter pSrcDstBestMV, i.e., the initial + * integer MV is generated externally. The function + * omxVCM4P10_BlockMatch_Integer may be used for integer motion estimation. + * + * Input Arguments: + * + * pSrcOrgY - Pointer to the current position in original picture plane: + * If iBlockWidth==4, 4-byte alignment required. + * If iBlockWidth==8, 8-byte alignment required. + * If iBlockWidth==16, 16-byte alignment required. + * pSrcRefY - Pointer to the top-left corner of the co-located block in the + * reference picture: + * If iBlockWidth==4, 4-byte alignment required. + * If iBlockWidth==8, 8-byte alignment required. + * If iBlockWidth==16, 16-byte alignment required. + * nSrcOrgStep - Stride of the original picture plane in terms of full + * pixels; must be a multiple of iBlockWidth. + * nSrcRefStep - Stride of the reference picture plane in terms of full + * pixels + * iBlockWidth - Width of the current block in terms of full pixels; must + * be equal to either 4, 8, or 16. + * iBlockHeight - Height of the current block in terms of full pixels; must + * be equal to either 4, 8, or 16. 
+ * nLamda - Lamda factor, used to compute motion cost + * pMVPred - Predicted MV, represented in terms of 1/4-pel units; used to + * compute motion cost + * pSrcDstBestMV - The best MV resulting from a prior integer search, + * represented in terms of 1/4-pel units + * + * Output Arguments: + * + * pSrcDstBestMV - Best MV resulting from the half-pel search, expressed in + * terms of 1/4-pel units + * pBestCost - Motion cost associated with the best MV; computed as + * SAD+Lamda*BitsUsedByMV + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - any of the following pointers is NULL: pSrcOrgY, pSrcRefY, + * pSrcDstBestMV, pMVPred, pBestCost + * - iBlockWidth or iBlockHeight are equal to values other than 4, 8, or 16. + * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_BlockMatch_Half ( + const OMX_U8 *pSrcOrgY, + OMX_S32 nSrcOrgStep, + const OMX_U8 *pSrcRefY, + OMX_S32 nSrcRefStep, + OMX_U8 iBlockWidth, + OMX_U8 iBlockHeight, + OMX_U32 nLamda, + const OMXVCMotionVector *pMVPred, + OMXVCMotionVector *pSrcDstBestMV, + OMX_S32 *pBestCost +); + + + +/** + * Function: omxVCM4P10_BlockMatch_Quarter (6.3.5.2.3) + * + * Description: + * Performs a quarter-pel block match using results from a prior half-pel + * search. Returns the best MV and associated cost. This function estimates + * the quarter-pixel motion vector by interpolating the half-pel resolution + * motion vector referenced by the input parameter pSrcDstBestMV, i.e., the + * initial half-pel MV is generated externally. The function + * omxVCM4P10_BlockMatch_Half may be used for half-pel motion estimation. + * + * Input Arguments: + * + * pSrcOrgY - Pointer to the current position in original picture plane: + * If iBlockWidth==4, 4-byte alignment required. + * If iBlockWidth==8, 8-byte alignment required. + * If iBlockWidth==16, 16-byte alignment required. 
+ * pSrcRefY - Pointer to the top-left corner of the co-located block in the + * reference picture: + * If iBlockWidth==4, 4-byte alignment required. + * If iBlockWidth==8, 8-byte alignment required. + * If iBlockWidth==16, 16-byte alignment required. + * nSrcOrgStep - Stride of the original picture plane in terms of full + * pixels; must be a multiple of iBlockWidth. + * nSrcRefStep - Stride of the reference picture plane in terms of full + * pixels + * iBlockWidth - Width of the current block in terms of full pixels; must + * be equal to either 4, 8, or 16. + * iBlockHeight - Height of the current block in terms of full pixels; must + * be equal to either 4, 8, or 16. + * nLamda - Lamda factor, used to compute motion cost + * pMVPred - Predicted MV, represented in terms of 1/4-pel units; used to + * compute motion cost + * pSrcDstBestMV - The best MV resulting from a prior half-pel search, + * represented in terms of 1/4 pel units + * + * Output Arguments: + * + * pSrcDstBestMV - Best MV resulting from the quarter-pel search, expressed + * in terms of 1/4-pel units + * pBestCost - Motion cost associated with the best MV; computed as + * SAD+Lamda*BitsUsedByMV + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - One or more of the following pointers is NULL: + * pSrcOrgY, pSrcRefY, pSrcDstBestMV, pMVPred, pBestCost + * - iBlockWidth or iBlockHeight are equal to values other than 4, 8, or 16. 
+ * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_BlockMatch_Quarter ( + const OMX_U8 *pSrcOrgY, + OMX_S32 nSrcOrgStep, + const OMX_U8 *pSrcRefY, + OMX_S32 nSrcRefStep, + OMX_U8 iBlockWidth, + OMX_U8 iBlockHeight, + OMX_U32 nLamda, + const OMXVCMotionVector *pMVPred, + OMXVCMotionVector *pSrcDstBestMV, + OMX_S32 *pBestCost +); + + + +/** + * Function: omxVCM4P10_MotionEstimationMB (6.3.5.3.1) + * + * Description: + * Performs MB-level motion estimation and selects best motion estimation + * strategy from the set of modes supported in baseline profile [ISO14496-10]. + * + * Input Arguments: + * + * pSrcCurrBuf - Pointer to the current position in original picture plane; + * 16-byte alignment required + * pSrcRefBufList - Pointer to an array with 16 entries. Each entry points + * to the top-left corner of the co-located MB in a reference + * picture. The array is filled from low-to-high with valid + * reference frame pointers; the unused high entries should be set + * to NULL. Ordering of the reference frames should follow + * [ISO14496-10] subclause 8.2.4 Decoding Process for Reference + * Picture Lists. The entries must be 16-byte aligned. + * pSrcRecBuf - Pointer to the top-left corner of the co-located MB in the + * reconstructed picture; must be 16-byte aligned. + * SrcCurrStep - Width of the original picture plane in terms of full + * pixels; must be a multiple of 16. + * SrcRefStep - Width of the reference picture plane in terms of full + * pixels; must be a multiple of 16. + * SrcRecStep - Width of the reconstructed picture plane in terms of full + * pixels; must be a multiple of 16. + * pRefRect - Pointer to the valid reference rectangle; relative to the + * image origin. + * pCurrPointPos - Position of the current macroblock in the current plane. 
+ * Lambda - Lagrange factor for computing the cost function + * pMESpec - Pointer to the motion estimation specification structure; must + * have been allocated and initialized prior to calling this + * function. + * pMBInter - Array, of dimension four, containing pointers to information + * associated with four adjacent type INTER MBs (Left, Top, + * Top-Left, Top-Right). Any pointer in the array may be set equal + * to NULL if the corresponding MB doesn't exist or is not of type + * INTER. + * - pMBInter[0] - Pointer to left MB information + * - pMBInter[1] - Pointer to top MB information + * - pMBInter[2] - Pointer to top-left MB information + * - pMBInter[3] - Pointer to top-right MB information + * pMBIntra - Array, of dimension four, containing pointers to information + * associated with four adjacent type INTRA MBs (Left, Top, + * Top-Left, Top-Right). Any pointer in the array may be set equal + * to NULL if the corresponding MB doesn't exist or is not of type + * INTRA. + * - pMBIntra[0] - Pointer to left MB information + * - pMBIntra[1] - Pointer to top MB information + * - pMBIntra[2] - Pointer to top-left MB information + * - pMBIntra[3] - Pointer to top-right MB information + * pSrcDstMBCurr - Pointer to information structure for the current MB. + * The following entries should be set prior to calling the + * function: sliceID - the number of the slice to which the + * current MB belongs. + * + * Output Arguments: + * + * pDstCost - Pointer to the minimum motion cost for the current MB. + * pDstBlockSAD - Pointer to the array of SADs for each of the sixteen luma + * 4x4 blocks in each MB. The block SADs are in scan order for + * each MB. For implementations that cannot compute the SAD values + * individually, the maximum possible value (0xffff) is returned + * for each of the 16 block SAD entries. + * pSrcDstMBCurr - Pointer to updated information structure for the current + * MB after MB-level motion estimation has been completed. 
The + * following fields are updated by the ME function. The following + * parameter set quantifies the MB-level ME search results: + * - MbType + * - subMBType[4] + * - pMV0[4][4] + * - pMVPred[4][4] + * - pRefL0Idx[4] + * - Intra16x16PredMode + * - pIntra4x4PredMode[4][4] + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - One or more of the following pointers is NULL: pSrcCurrBuf, + * pSrcRefBufList, pSrcRecBuf, pRefRect, pCurrPointPos, pMESpec, + * pMBInter, pMBIntra,pSrcDstMBCurr, pDstCost, pSrcRefBufList[0] + * - SrcRefStep, SrcRecStep are not multiples of 16 + * - iBlockWidth or iBlockHeight are values other than 4, 8, or 16. + * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_MotionEstimationMB ( + const OMX_U8 *pSrcCurrBuf, + OMX_S32 SrcCurrStep, + const OMX_U8 *pSrcRefBufList[15], + OMX_S32 SrcRefStep, + const OMX_U8 *pSrcRecBuf, + OMX_S32 SrcRecStep, + const OMXRect *pRefRect, + const OMXVCM4P2Coordinate *pCurrPointPos, + OMX_U32 Lambda, + void *pMESpec, + const OMXVCM4P10MBInfoPtr *pMBInter, + const OMXVCM4P10MBInfoPtr *pMBIntra, + OMXVCM4P10MBInfoPtr pSrcDstMBCurr, + OMX_INT *pDstCost, + OMX_U16 *pDstBlockSAD +); + + + +/** + * Function: omxVCM4P10_SAD_4x (6.3.5.4.1) + * + * Description: + * This function calculates the SAD for 4x8 and 4x4 blocks. + * + * Input Arguments: + * + * pSrcOrg -Pointer to the original block; must be aligned on a 4-byte + * boundary. + * iStepOrg -Step of the original block buffer; must be a multiple of 4. + * pSrcRef -Pointer to the reference block + * iStepRef -Step of the reference block buffer + * iHeight -Height of the block; must be equal to either 4 or 8. + * + * Output Arguments: + * + * pDstSAD -Pointer of result SAD + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. 
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - One or more of the following pointers is NULL: + * pSrcOrg, pSrcRef, or pDstSAD + * - iHeight is not equal to either 4 or 8. + * - iStepOrg is not a multiple of 4 + * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_SAD_4x ( + const OMX_U8 *pSrcOrg, + OMX_U32 iStepOrg, + const OMX_U8 *pSrcRef, + OMX_U32 iStepRef, + OMX_S32 *pDstSAD, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCM4P10_SADQuar_4x (6.3.5.4.2) + * + * Description: + * This function calculates the SAD between one block (pSrc) and the average + * of the other two (pSrcRef0 and pSrcRef1) for 4x8 or 4x4 blocks. Rounding + * is applied according to the convention (a+b+1)>>1. + * + * Input Arguments: + * + * pSrc - Pointer to the original block; must be aligned on a 4-byte + * boundary. + * pSrcRef0 - Pointer to reference block 0 + * pSrcRef1 - Pointer to reference block 1 + * iSrcStep - Step of the original block buffer; must be a multiple of 4. + * iRefStep0 - Step of reference block 0 + * iRefStep1 - Step of reference block 1 + * iHeight - Height of the block; must be equal to either 4 or 8. + * + * Output Arguments: + * + * pDstSAD - Pointer of result SAD + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - iHeight is not equal to either 4 or 8. + * - One or more of the following pointers is NULL: pSrc, pSrcRef0, + * pSrcRef1, pDstSAD. 
+ * - iSrcStep is not a multiple of 4 + * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_SADQuar_4x ( + const OMX_U8 *pSrc, + const OMX_U8 *pSrcRef0, + const OMX_U8 *pSrcRef1, + OMX_U32 iSrcStep, + OMX_U32 iRefStep0, + OMX_U32 iRefStep1, + OMX_U32 *pDstSAD, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCM4P10_SADQuar_8x (6.3.5.4.3) + * + * Description: + * This function calculates the SAD between one block (pSrc) and the average + * of the other two (pSrcRef0 and pSrcRef1) for 8x16, 8x8, or 8x4 blocks. + * Rounding is applied according to the convention (a+b+1)>>1. + * + * Input Arguments: + * + * pSrc - Pointer to the original block; must be aligned on an 8-byte + * boundary. + * pSrcRef0 - Pointer to reference block 0 + * pSrcRef1 - Pointer to reference block 1 + * iSrcStep - Step of the original block buffer; must be a multiple of 8. + * iRefStep0 - Step of reference block 0 + * iRefStep1 - Step of reference block 1 + * iHeight - Height of the block; must be equal either 4, 8, or 16. + * + * Output Arguments: + * + * pDstSAD - Pointer of result SAD + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - iHeight is not equal to either 4, 8, or 16. + * - One or more of the following pointers is NULL: pSrc, pSrcRef0, + * pSrcRef1, pDstSAD. + * - iSrcStep is not a multiple of 8 + * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_SADQuar_8x ( + const OMX_U8 *pSrc, + const OMX_U8 *pSrcRef0, + const OMX_U8 *pSrcRef1, + OMX_U32 iSrcStep, + OMX_U32 iRefStep0, + OMX_U32 iRefStep1, + OMX_U32 *pDstSAD, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCM4P10_SADQuar_16x (6.3.5.4.4) + * + * Description: + * This function calculates the SAD between one block (pSrc) and the average + * of the other two (pSrcRef0 and pSrcRef1) for 16x16 or 16x8 blocks. + * Rounding is applied according to the convention (a+b+1)>>1. 
+ * + * Input Arguments: + * + * pSrc - Pointer to the original block; must be aligned on a 16-byte + * boundary. + * pSrcRef0 - Pointer to reference block 0 + * pSrcRef1 - Pointer to reference block 1 + * iSrcStep - Step of the original block buffer; must be a multiple of 16 + * iRefStep0 - Step of reference block 0 + * iRefStep1 - Step of reference block 1 + * iHeight - Height of the block; must be equal to either 8 or 16 + * + * Output Arguments: + * + * pDstSAD -Pointer of result SAD + * + * Return Value: + * OMX_Sts_NoErr, if the function runs without error. + * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: + * - iHeight is not equal to either 8 or 16. + * - One or more of the following pointers is NULL: pSrc, pSrcRef0, + * pSrcRef1, pDstSAD. + * - iSrcStep is not a multiple of 16 + * - Any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_SADQuar_16x ( + const OMX_U8 *pSrc, + const OMX_U8 *pSrcRef0, + const OMX_U8 *pSrcRef1, + OMX_U32 iSrcStep, + OMX_U32 iRefStep0, + OMX_U32 iRefStep1, + OMX_U32 *pDstSAD, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCM4P10_SATD_4x4 (6.3.5.4.5) + * + * Description: + * This function calculates the sum of absolute transform differences (SATD) + * for a 4x4 block by applying a Hadamard transform to the difference block + * and then calculating the sum of absolute coefficient values. 
+ * + * Input Arguments: + * + * pSrcOrg - Pointer to the original block; must be aligned on a 4-byte + * boundary + * iStepOrg - Step of the original block buffer; must be a multiple of 4 + * pSrcRef - Pointer to the reference block; must be aligned on a 4-byte + * boundary + * iStepRef - Step of the reference block buffer; must be a multiple of 4 + * + * Output Arguments: + * + * pDstSAD - pointer to the resulting SAD + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pSrcOrg, pSrcRef, or pDstSAD + * - either pSrcOrg or pSrcRef is not aligned on a 4-byte boundary + * - iStepOrg <= 0 or iStepOrg is not a multiple of 4 + * - iStepRef <= 0 or iStepRef is not a multiple of 4 + * + */ +OMXResult omxVCM4P10_SATD_4x4 ( + const OMX_U8 *pSrcOrg, + OMX_U32 iStepOrg, + const OMX_U8 *pSrcRef, + OMX_U32 iStepRef, + OMX_U32 *pDstSAD +); + + + +/** + * Function: omxVCM4P10_InterpolateHalfHor_Luma (6.3.5.5.1) + * + * Description: + * This function performs interpolation for two horizontal 1/2-pel positions + * (-1/2,0) and (1/2, 0) - around a full-pel position. + * + * Input Arguments: + * + * pSrc - Pointer to the top-left corner of the block used to interpolate in + * the reconstruction frame plane. + * iSrcStep - Step of the source buffer. + * iDstStep - Step of the destination(interpolation) buffer; must be a + * multiple of iWidth. + * iWidth - Width of the current block; must be equal to either 4, 8, or 16 + * iHeight - Height of the current block; must be equal to 4, 8, or 16 + * + * Output Arguments: + * + * pDstLeft -Pointer to the interpolation buffer of the left 1/2-pel position + * (-1/2, 0) + * If iWidth==4, 4-byte alignment required. + * If iWidth==8, 8-byte alignment required. + * If iWidth==16, 16-byte alignment required. 
+ * pDstRight -Pointer to the interpolation buffer of the right 1/2-pel + * position (1/2, 0) + * If iWidth==4, 4-byte alignment required. + * If iWidth==8, 8-byte alignment required. + * If iWidth==16, 16-byte alignment required. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pSrc, pDstLeft, or pDstRight + * - iWidth or iHeight have values other than 4, 8, or 16 + * - iWidth==4 but pDstLeft and/or pDstRight is/are not aligned on a 4-byte boundary + * - iWidth==8 but pDstLeft and/or pDstRight is/are not aligned on a 8-byte boundary + * - iWidth==16 but pDstLeft and/or pDstRight is/are not aligned on a 16-byte boundary + * - any alignment restrictions are violated + * + */ +OMXResult omxVCM4P10_InterpolateHalfHor_Luma ( + const OMX_U8 *pSrc, + OMX_U32 iSrcStep, + OMX_U8 *pDstLeft, + OMX_U8 *pDstRight, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCM4P10_InterpolateHalfVer_Luma (6.3.5.5.2) + * + * Description: + * This function performs interpolation for two vertical 1/2-pel positions - + * (0, -1/2) and (0, 1/2) - around a full-pel position. + * + * Input Arguments: + * + * pSrc - Pointer to top-left corner of block used to interpolate in the + * reconstructed frame plane + * iSrcStep - Step of the source buffer. + * iDstStep - Step of the destination (interpolation) buffer; must be a + * multiple of iWidth. + * iWidth - Width of the current block; must be equal to either 4, 8, or 16 + * iHeight - Height of the current block; must be equal to either 4, 8, or 16 + * + * Output Arguments: + * + * pDstUp -Pointer to the interpolation buffer of the 1/2-pel position above + * the current full-pel position (0, -1/2) + * If iWidth==4, 4-byte alignment required. + * If iWidth==8, 8-byte alignment required. + * If iWidth==16, 16-byte alignment required. 
+ * pDstDown -Pointer to the interpolation buffer of the 1/2-pel position below + * the current full-pel position (0, 1/2) + * If iWidth==4, 4-byte alignment required. + * If iWidth==8, 8-byte alignment required. + * If iWidth==16, 16-byte alignment required. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pSrc, pDstUp, or pDstDown + * - iWidth or iHeight have values other than 4, 8, or 16 + * - iWidth==4 but pDstUp and/or pDstDown is/are not aligned on a 4-byte boundary + * - iWidth==8 but pDstUp and/or pDstDown is/are not aligned on a 8-byte boundary + * - iWidth==16 but pDstUp and/or pDstDown is/are not aligned on a 16-byte boundary + * + */ +OMXResult omxVCM4P10_InterpolateHalfVer_Luma ( + const OMX_U8 *pSrc, + OMX_U32 iSrcStep, + OMX_U8 *pDstUp, + OMX_U8 *pDstDown, + OMX_U32 iDstStep, + OMX_U32 iWidth, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCM4P10_Average_4x (6.3.5.5.3) + * + * Description: + * This function calculates the average of two 4x4, 4x8 blocks. The result + * is rounded according to (a+b+1)/2. + * + * Input Arguments: + * + * pPred0 - Pointer to the top-left corner of reference block 0 + * pPred1 - Pointer to the top-left corner of reference block 1 + * iPredStep0 - Step of reference block 0; must be a multiple of 4. + * iPredStep1 - Step of reference block 1; must be a multiple of 4. + * iDstStep - Step of the destination buffer; must be a multiple of 4. + * iHeight - Height of the blocks; must be either 4 or 8. + * + * Output Arguments: + * + * pDstPred - Pointer to the destination buffer. 4-byte alignment required. 
+ * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pPred0, pPred1, or pDstPred + * - pDstPred is not aligned on a 4-byte boundary + * - iPredStep0 <= 0 or iPredStep0 is not a multiple of 4 + * - iPredStep1 <= 0 or iPredStep1 is not a multiple of 4 + * - iDstStep <= 0 or iDstStep is not a multiple of 4 + * - iHeight is not equal to either 4 or 8 + * + */ +OMXResult omxVCM4P10_Average_4x ( + const OMX_U8 *pPred0, + const OMX_U8 *pPred1, + OMX_U32 iPredStep0, + OMX_U32 iPredStep1, + OMX_U8 *pDstPred, + OMX_U32 iDstStep, + OMX_U32 iHeight +); + + + +/** + * Function: omxVCM4P10_TransformQuant_ChromaDC (6.3.5.6.1) + * + * Description: + * This function performs 2x2 Hadamard transform of chroma DC coefficients + * and then quantizes the coefficients. + * + * Input Arguments: + * + * pSrcDst - Pointer to the 2x2 array of chroma DC coefficients. 8-byte + * alignment required. + * iQP - Quantization parameter; must be in the range [0,51]. + * bIntra - Indicate whether this is an INTRA block. 1-INTRA, 0-INTER + * + * Output Arguments: + * + * pSrcDst - Pointer to transformed and quantized coefficients. 8-byte + * alignment required. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pSrcDst + * - pSrcDst is not aligned on an 8-byte boundary + * + */ +OMXResult omxVCM4P10_TransformQuant_ChromaDC ( + OMX_S16 *pSrcDst, + OMX_U32 iQP, + OMX_U8 bIntra +); + + + +/** + * Function: omxVCM4P10_TransformQuant_LumaDC (6.3.5.6.2) + * + * Description: + * This function performs a 4x4 Hadamard transform of luma DC coefficients + * and then quantizes the coefficients. + * + * Input Arguments: + * + * pSrcDst - Pointer to the 4x4 array of luma DC coefficients. 
16-byte + * alignment required. + * iQP - Quantization parameter; must be in the range [0,51]. + * + * Output Arguments: + * + * pSrcDst - Pointer to transformed and quantized coefficients. 16-byte + * alignment required. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: pSrcDst + * - pSrcDst is not aligned on an 16-byte boundary + * + */ +OMXResult omxVCM4P10_TransformQuant_LumaDC ( + OMX_S16 *pSrcDst, + OMX_U32 iQP +); + + + +/** + * Function: omxVCM4P10_InvTransformDequant_LumaDC (6.3.5.6.3) + * + * Description: + * This function performs inverse 4x4 Hadamard transform and then dequantizes + * the coefficients. + * + * Input Arguments: + * + * pSrc - Pointer to the 4x4 array of the 4x4 Hadamard-transformed and + * quantized coefficients. 16 byte alignment required. + * iQP - Quantization parameter; must be in the range [0,51]. + * + * Output Arguments: + * + * pDst - Pointer to inverse-transformed and dequantized coefficients. + * 16-byte alignment required. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: pSrc + * - pSrc or pDst is not aligned on a 16-byte boundary + * + */ +OMXResult omxVCM4P10_InvTransformDequant_LumaDC ( + const OMX_S16 *pSrc, + OMX_S16 *pDst, + OMX_U32 iQP +); + + + +/** + * Function: omxVCM4P10_InvTransformDequant_ChromaDC (6.3.5.6.4) + * + * Description: + * This function performs inverse 2x2 Hadamard transform and then dequantizes + * the coefficients. + * + * Input Arguments: + * + * pSrc - Pointer to the 2x2 array of the 2x2 Hadamard-transformed and + * quantized coefficients. 8 byte alignment required. + * iQP - Quantization parameter; must be in the range [0,51]. 
+ * + * Output Arguments: + * + * pDst - Pointer to inverse-transformed and dequantized coefficients. + * 8-byte alignment required. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: pSrc + * - pSrc or pDst is not aligned on an 8-byte boundary + * + */ +OMXResult omxVCM4P10_InvTransformDequant_ChromaDC ( + const OMX_S16 *pSrc, + OMX_S16 *pDst, + OMX_U32 iQP +); + + + +/** + * Function: omxVCM4P10_InvTransformResidualAndAdd (6.3.5.7.1) + * + * Description: + * This function performs an inverse 4x4 integer transformation to produce + * the difference signal and then adds the difference to the prediction to get + * the reconstructed signal. + * + * Input Arguments: + * + * pSrcPred - Pointer to prediction signal. 4-byte alignment required. + * pDequantCoeff - Pointer to the transformed coefficients. 8-byte + * alignment required. + * iSrcPredStep - Step of the prediction buffer; must be a multiple of 4. + * iDstReconStep - Step of the destination reconstruction buffer; must be a + * multiple of 4. + * bAC - Indicates whether there are AC coefficients in the coefficients + * matrix. + * + * Output Arguments: + * + * pDstRecon -Pointer to the destination reconstruction buffer. 4-byte + * alignment required. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pSrcPred, pDequantCoeff, pDstRecon + * - pSrcPred is not aligned on a 4-byte boundary + * - iSrcPredStep or iDstReconStep is not a multiple of 4. 
+ * - pDequantCoeff is not aligned on an 8-byte boundary + * + */ +OMXResult omxVCM4P10_InvTransformResidualAndAdd ( + const OMX_U8 *pSrcPred, + const OMX_S16 *pDequantCoeff, + OMX_U8 *pDstRecon, + OMX_U32 iSrcPredStep, + OMX_U32 iDstReconStep, + OMX_U8 bAC +); + + + +/** + * Function: omxVCM4P10_SubAndTransformQDQResidual (6.3.5.8.1) + * + * Description: + * This function subtracts the prediction signal from the original signal to + * produce the difference signal and then performs a 4x4 integer transform and + * quantization. The quantized transformed coefficients are stored as + * pDstQuantCoeff. This function can also output dequantized coefficients or + * unquantized DC coefficients optionally by setting the pointers + * pDstDeQuantCoeff, pDCCoeff. + * + * Input Arguments: + * + * pSrcOrg - Pointer to original signal. 4-byte alignment required. + * pSrcPred - Pointer to prediction signal. 4-byte alignment required. + * iSrcOrgStep - Step of the original signal buffer; must be a multiple of + * 4. + * iSrcPredStep - Step of the prediction signal buffer; must be a multiple + * of 4. + * pNumCoeff -Number of non-zero coefficients after quantization. If this + * parameter is not required, it is set to NULL. + * nThreshSAD - Zero-block early detection threshold. If this parameter is + * not required, it is set to 0. + * iQP - Quantization parameter; must be in the range [0,51]. + * bIntra - Indicates whether this is an INTRA block, either 1-INTRA or + * 0-INTER + * + * Output Arguments: + * + * pDstQuantCoeff - Pointer to the quantized transformed coefficients. + * 8-byte alignment required. + * pDstDeQuantCoeff - Pointer to the dequantized transformed coefficients + * if this parameter is not equal to NULL. 8-byte alignment + * required. + * pDCCoeff - Pointer to the unquantized DC coefficient if this parameter + * is not equal to NULL. 
+ * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pSrcOrg, pSrcPred, pNumCoeff, pDstQuantCoeff, + * pDstDeQuantCoeff, pDCCoeff + * - pSrcOrg is not aligned on a 4-byte boundary + * - pSrcPred is not aligned on a 4-byte boundary + * - iSrcOrgStep is not a multiple of 4 + * - iSrcPredStep is not a multiple of 4 + * - pDstQuantCoeff or pDstDeQuantCoeff is not aligned on an 8-byte boundary + * + */ +OMXResult omxVCM4P10_SubAndTransformQDQResidual ( + const OMX_U8 *pSrcOrg, + const OMX_U8 *pSrcPred, + OMX_U32 iSrcOrgStep, + OMX_U32 iSrcPredStep, + OMX_S16 *pDstQuantCoeff, + OMX_S16 *pDstDeQuantCoeff, + OMX_S16 *pDCCoeff, + OMX_S8 *pNumCoeff, + OMX_U32 nThreshSAD, + OMX_U32 iQP, + OMX_U8 bIntra +); + + + +/** + * Function: omxVCM4P10_GetVLCInfo (6.3.5.9.1) + * + * Description: + * This function extracts run-length encoding (RLE) information from the + * coefficient matrix. The results are returned in an OMXVCM4P10VLCInfo + * structure. + * + * Input Arguments: + * + * pSrcCoeff - pointer to the transform coefficient matrix. 8-byte + * alignment required. + * pScanMatrix - pointer to the scan order definition matrix. For a luma + * block the scan matrix should follow [ISO14496-10] section 8.5.4, + * and should contain the values 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, + * 10, 7, 11, 14, 15. For a chroma block, the scan matrix should + * contain the values 0, 1, 2, 3. + * bAC - indicates presence of a DC coefficient; 0 = DC coefficient + * present, 1= DC coefficient absent. + * MaxNumCoef - specifies the number of coefficients contained in the + * transform coefficient matrix, pSrcCoeff. The value should be 16 + * for blocks of type LUMADC, LUMAAC, LUMALEVEL, and CHROMAAC. The + * value should be 4 for blocks of type CHROMADC. 
+ * + * Output Arguments: + * + * pDstVLCInfo - pointer to structure that stores information for + * run-length coding. + * + * Return Value: + * + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments; returned if any of the following + * conditions are true: + * - at least one of the following pointers is NULL: + * pSrcCoeff, pScanMatrix, pDstVLCInfo + * - pSrcCoeff is not aligned on an 8-byte boundary + * + */ +OMXResult omxVCM4P10_GetVLCInfo ( + const OMX_S16 *pSrcCoeff, + const OMX_U8 *pScanMatrix, + OMX_U8 bAC, + OMX_U32 MaxNumCoef, + OMXVCM4P10VLCInfo*pDstVLCInfo +); + + + +#ifdef __cplusplus +} +#endif + +#endif /** end of #define _OMXVC_H_ */ + +/** EOF */ + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h new file mode 100755 index 0000000..89f3040 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h @@ -0,0 +1,129 @@ +;/****************************************************************************** +;// Copyright (c) 1999-2005 The Khronos Group Inc. 
All Rights Reserved +;// +;// +;// +;// +;// +;// +;// +;// +;******************************************************************************/ + +;/** =============== Structure Definition for Sample Generation ============== */ +;/** transparent status */ + +;enum { +OMX_VIDEO_TRANSPARENT EQU 0; /** Wholly transparent */ +OMX_VIDEO_PARTIAL EQU 1; /** Partially transparent */ +OMX_VIDEO_OPAQUE EQU 2; /** Opaque */ +;} + +;/** direction */ +;enum { +OMX_VIDEO_NONE EQU 0; +OMX_VIDEO_HORIZONTAL EQU 1; +OMX_VIDEO_VERTICAL EQU 2; +;} + +;/** bilinear interpolation type */ +;enum { +OMX_VIDEO_INTEGER_PIXEL EQU 0; /** case a */ +OMX_VIDEO_HALF_PIXEL_X EQU 1; /** case b */ +OMX_VIDEO_HALF_PIXEL_Y EQU 2; /** case c */ +OMX_VIDEO_HALF_PIXEL_XY EQU 3; /** case d */ +;} + +;enum { +OMX_UPPER EQU 1; /** set if the above macroblock is available */ +OMX_LEFT EQU 2; /** set if the left macroblock is available */ +OMX_CENTER EQU 4; +OMX_RIGHT EQU 8; +OMX_LOWER EQU 16; +OMX_UPPER_LEFT EQU 32; /** set if the above-left macroblock is available */ +OMX_UPPER_RIGHT EQU 64; /** set if the above-right macroblock is available */ +OMX_LOWER_LEFT EQU 128; +OMX_LOWER_RIGHT EQU 256 +;} + +;enum { +OMX_VIDEO_LUMINANCE EQU 0; /** Luminance component */ +OMX_VIDEO_CHROMINANCE EQU 1; /** chrominance component */ +OMX_VIDEO_ALPHA EQU 2; /** Alpha component */ +;} + +;enum { +OMX_VIDEO_INTER EQU 0; /** P picture or P-VOP */ +OMX_VIDEO_INTER_Q EQU 1; /** P picture or P-VOP */ +OMX_VIDEO_INTER4V EQU 2; /** P picture or P-VOP */ +OMX_VIDEO_INTRA EQU 3; /** I and P picture; I- and P-VOP */ +OMX_VIDEO_INTRA_Q EQU 4; /** I and P picture; I- and P-VOP */ +OMX_VIDEO_INTER4V_Q EQU 5; /** P picture or P-VOP (H.263)*/ +OMX_VIDEO_DIRECT EQU 6; /** B picture or B-VOP (MPEG-4 only) */ +OMX_VIDEO_INTERPOLATE EQU 7; /** B picture or B-VOP */ +OMX_VIDEO_BACKWARD EQU 8; /** B picture or B-VOP */ +OMX_VIDEO_FORWARD EQU 9; /** B picture or B-VOP */ +OMX_VIDEO_NOTCODED EQU 10; /** B picture or B-VOP */ +;} + +;enum { 
+OMX_16X16_VERT EQU 0; /** Intra_16x16_Vertical (prediction mode) */ +OMX_16X16_HOR EQU 1; /** Intra_16x16_Horizontal (prediction mode) */ +OMX_16X16_DC EQU 2; /** Intra_16x16_DC (prediction mode) */ +OMX_16X16_PLANE EQU 3; /** Intra_16x16_Plane (prediction mode) */ +;} + +;enum { +OMX_4x4_VERT EQU 0; /** Intra_4x4_Vertical (prediction mode) */ +OMX_4x4_HOR EQU 1; /** Intra_4x4_Horizontal (prediction mode) */ +OMX_4x4_DC EQU 2; /** Intra_4x4_DC (prediction mode) */ +OMX_4x4_DIAG_DL EQU 3; /** Intra_4x4_Diagonal_Down_Left (prediction mode) */ +OMX_4x4_DIAG_DR EQU 4; /** Intra_4x4_Diagonal_Down_Right (prediction mode) */ +OMX_4x4_VR EQU 5; /** Intra_4x4_Vertical_Right (prediction mode) */ +OMX_4x4_HD EQU 6; /** Intra_4x4_Horizontal_Down (prediction mode) */ +OMX_4x4_VL EQU 7; /** Intra_4x4_Vertical_Left (prediction mode) */ +OMX_4x4_HU EQU 8; /** Intra_4x4_Horizontal_Up (prediction mode) */ +;} + +;enum { +OMX_CHROMA_DC EQU 0; /** Intra_Chroma_DC (prediction mode) */ +OMX_CHROMA_HOR EQU 1; /** Intra_Chroma_Horizontal (prediction mode) */ +OMX_CHROMA_VERT EQU 2; /** Intra_Chroma_Vertical (prediction mode) */ +OMX_CHROMA_PLANE EQU 3; /** Intra_Chroma_Plane (prediction mode) */ +;} + +;typedef struct { +x EQU 0; +y EQU 4; +;}OMXCoordinate; + +;typedef struct { +dx EQU 0; +dy EQU 2; +;}OMXMotionVector; + +;typedef struct { +xx EQU 0; +yy EQU 4; +width EQU 8; +height EQU 12; +;}OMXiRect; + +;typedef enum { +OMX_VC_INTER EQU 0; /** P picture or P-VOP */ +OMX_VC_INTER_Q EQU 1; /** P picture or P-VOP */ +OMX_VC_INTER4V EQU 2; /** P picture or P-VOP */ +OMX_VC_INTRA EQU 3; /** I and P picture, I- and P-VOP */ +OMX_VC_INTRA_Q EQU 4; /** I and P picture, I- and P-VOP */ +OMX_VC_INTER4V_Q EQU 5; /** P picture or P-VOP (H.263)*/ +;} OMXVCM4P2MacroblockType; + +;enum { +OMX_VC_NONE EQU 0 +OMX_VC_HORIZONTAL EQU 1 +OMX_VC_VERTICAL EQU 2 +;}; + + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s 
b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s new file mode 100755 index 0000000..296d59d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s @@ -0,0 +1,95 @@ + ;/** + ; * Function: omxVCCOMM_Copy16x16 + ; * + ; * Description: + ; * Copies the reference 16x16 block to the current block. + ; * Parameters: + ; * [in] pSrc - pointer to the reference block in the source frame; must be aligned on a 16-byte boundary. + ; * [in] step - distance between the starts of consecutive lines in the reference frame, in bytes; + ; * must be a multiple of 16 and must be larger than or equal to 16. + ; * [out] pDst - pointer to the destination block; must be aligned on a 16-byte boundary (the stores use a 128-bit alignment qualifier). + ; * Return Value: + ; * OMX_Sts_NoErr - no error (the only status set by the code below) + ; * OMX_Sts_BadArgErr - bad arguments; specified for the following conditions (no such checks appear in this routine's body): + ; * - one or more of the following pointers is NULL: pSrc, pDst + ; * - one or more of the following pointers is not aligned on a 16-byte boundary: pSrc, pDst + ; * - step <16 or step is not a multiple of 16. + ; */ + + INCLUDE omxtypes_s.h + + + M_VARIANTS CortexA8 + + IF CortexA8 + + + ;//Input Arguments +pSrc RN 0 +pDst RN 1 +step RN 2 + +;//Local Variables +Return RN 0 +;// Neon Registers + +X0 DN D0.S8 +X1 DN D1.S8 +X2 DN D2.S8 +X3 DN D3.S8 +X4 DN D4.S8 +X5 DN D5.S8 +X6 DN D6.S8 +X7 DN D7.S8 + + M_START omxVCCOMM_Copy16x16 + + + VLD1 {X0,X1},[pSrc@128],step ;// Load 16 bytes from 16 byte aligned pSrc and pSrc=pSrc + step after loading + VLD1 {X2,X3},[pSrc@128],step + VLD1 {X4,X5},[pSrc@128],step + VLD1 {X6,X7},[pSrc@128],step + + VST1 {X0,X1,X2,X3},[pDst@128]! ;// Store 32 bytes to 16 byte aligned pDst + VST1 {X4,X5,X6,X7},[pDst@128]! + + + VLD1 {X0,X1},[pSrc@128],step + VLD1 {X2,X3},[pSrc@128],step + VLD1 {X4,X5},[pSrc@128],step + VLD1 {X6,X7},[pSrc@128],step + + VST1 {X0,X1,X2,X3},[pDst@128]! + VST1 {X4,X5,X6,X7},[pDst@128]! 
+ + + VLD1 {X0,X1},[pSrc@128],step + VLD1 {X2,X3},[pSrc@128],step + VLD1 {X4,X5},[pSrc@128],step + VLD1 {X6,X7},[pSrc@128],step + + VST1 {X0,X1,X2,X3},[pDst@128]! + VST1 {X4,X5,X6,X7},[pDst@128]! + + + VLD1 {X0,X1},[pSrc@128],step + VLD1 {X2,X3},[pSrc@128],step + VLD1 {X4,X5},[pSrc@128],step + VLD1 {X6,X7},[pSrc@128],step + + VST1 {X0,X1,X2,X3},[pDst@128]! + VST1 {X4,X5,X6,X7},[pDst@128]! + + + MOV Return,#OMX_Sts_NoErr + + + + M_END + ENDIF + + + + + END +
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s new file mode 100755 index 0000000..db9e5ef --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s @@ -0,0 +1,70 @@ + ;/** + ; * Function: omxVCCOMM_Copy8x8 + ; * + ; * Description: + ; * Copies the reference 8x8 block to the current block. + ; * Parameters: + ; * [in] pSrc - pointer to the reference block in the source frame; must be aligned on an 8-byte boundary. + ; * [in] step - distance between the starts of consecutive lines in the reference frame, in bytes; + ; * must be a multiple of 8 and must be larger than or equal to 8. + ; * [out] pDst - pointer to the destination block; must be aligned on an 8-byte boundary. + ; * Return Value: + ; * OMX_Sts_NoErr - no error (the only status set by the code below) + ; * OMX_Sts_BadArgErr - bad arguments; specified for the following conditions (no such checks appear in this routine's body): + ; * - one or more of the following pointers is NULL: pSrc, pDst + ; * - one or more of the following pointers is not aligned on an 8-byte boundary: pSrc, pDst + ; * - step <8 or step is not a multiple of 8. + ; */ + + INCLUDE omxtypes_s.h + + + M_VARIANTS CortexA8 + + IF CortexA8 + + + ;//Input Arguments +pSrc RN 0 +pDst RN 1 +step RN 2 + +;//Local Variables +Count RN 3 ;// not referenced by the code below +Return RN 0 +;// Neon Registers + +X0 DN D0.S8 +X1 DN D1.S8 +X2 DN D2.S8 +X3 DN D3.S8 + M_START omxVCCOMM_Copy8x8 + + + + VLD1 {X0},[pSrc],step ;// Load 8 bytes from pSrc (8 byte aligned per the contract above; no alignment qualifier is encoded), pSrc=pSrc+step after load + VLD1 {X1},[pSrc],step + VLD1 {X2},[pSrc],step + VLD1 {X3},[pSrc],step + + VST1 {X0,X1},[pDst]! ;// Store 16 bytes (two rows) to pDst; again no alignment qualifier is encoded + VST1 {X2,X3},[pDst]! + + VLD1 {X0},[pSrc],step + VLD1 {X1},[pSrc],step + VLD1 {X2},[pSrc],step + VLD1 {X3},[pSrc],step + + VST1 {X0,X1},[pDst]! + VST1 {X2,X3},[pDst]! 
+ + MOV Return,#OMX_Sts_NoErr + + M_END + ENDIF + + + + + END +
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s new file mode 100755 index 0000000..5c5b7d8 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s @@ -0,0 +1,236 @@ +;// +;// +;// File Name: omxVCCOMM_ExpandFrame_I_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// +;// Description: +;// Replicates the frame's boundary pixels outward into the surrounding plane margin (iExpandPels on every side) +;// +;// + +;// Include standard headers + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + +;// Import symbols required from other files +;// (For example tables) + + +;// Set debugging level +DEBUG_ON SETL {FALSE} + + + + IF CortexA8 + + M_START omxVCCOMM_ExpandFrame_I,r11 + +;//Input registers + +pSrcDstPlane RN 0 +iFrameWidth RN 1 +iFrameHeight RN 2 +iExpandPels RN 3 +iPlaneStep RN 4 +pTop RN 5 +pBot RN 6 +pDstTop RN 7 +pDstBot RN 8 +pLeft RN 5 +pRight RN 6 +pDstLeft RN 9 +pDstRight RN 10 +Offset RN 11 +Temp RN 14 +Counter RN 12 +Tmp RN 7 +;//Output registers + +result RN 0 +;// Neon registers +qData0 QN 0.U8 +qData1 QN 1.U8 +dData0 DN 0.U8 +dData1 DN 1.U8 +dData2 DN 2.U8 +dData3 DN 3.U8 + + ;// Define stack arguments + M_ARG pPlaneStep, 4 + + ;// Load argument from the stack + M_LDR iPlaneStep, pPlaneStep + + SUB pTop, pSrcDstPlane, #0 ;// Top row pointer of the frame + MUL Offset, iExpandPels, iPlaneStep ;// E*Step + SUB Temp, iFrameHeight, #1 ;// H-1 + MUL Temp, iPlaneStep, Temp ;// (H-1)*Step + ADD pBot, Temp, pSrcDstPlane ;// BPtr = TPtr + (H-1)*Step + MOV Temp, iFrameWidth ;// Outer loop counter (columns left; 16 handled per pass) + + ;// Check if pSrcDstPlane and iPlaneStep are 16 byte aligned + TST pSrcDstPlane, #0xf + TSTEQ iPlaneStep, #0xf + BNE Hor8Loop00 + + ;// + 
;// Copy top and bottom region of the plane as follows + ;// top region = top row elements from the frame + ;// bottom region = last row elements from the frame + ;// + + ;// Case for 16 byte alignment +Hor16Loop00 + SUB pDstTop, pTop, Offset + VLD1 qData0, [pTop @128]! + MOV Counter, iExpandPels ;// Inner loop counter (rows left; 8 stored per pass) + ADD pDstBot, pBot, iPlaneStep + VLD1 qData1, [pBot @128]! +Ver16Loop0 + VST1 qData0, [pDstTop @128], iPlaneStep + VST1 qData0, [pDstTop @128], iPlaneStep + VST1 qData0, [pDstTop @128], iPlaneStep + VST1 qData0, [pDstTop @128], iPlaneStep + VST1 qData0, [pDstTop @128], iPlaneStep + VST1 qData0, [pDstTop @128], iPlaneStep + VST1 qData0, [pDstTop @128], iPlaneStep + VST1 qData0, [pDstTop @128], iPlaneStep + SUBS Counter, Counter, #8 + VST1 qData1, [pDstBot @128], iPlaneStep + VST1 qData1, [pDstBot @128], iPlaneStep + VST1 qData1, [pDstBot @128], iPlaneStep + VST1 qData1, [pDstBot @128], iPlaneStep + VST1 qData1, [pDstBot @128], iPlaneStep + VST1 qData1, [pDstBot @128], iPlaneStep + VST1 qData1, [pDstBot @128], iPlaneStep + VST1 qData1, [pDstBot @128], iPlaneStep + BGT Ver16Loop0 + + SUBS Temp, Temp, #16 + BGT Hor16Loop00 + B EndAlignedLoop + + ;// Fallback when the plane pointer or the step is not 16 byte aligned (8 byte alignment qualifiers) +Hor8Loop00 + SUB pDstTop, pTop, Offset + VLD1 qData0, [pTop @64]! + MOV Counter, iExpandPels ;// Inner loop counter (rows left; 8 stored per pass) + ADD pDstBot, pBot, iPlaneStep + VLD1 qData1, [pBot @64]! 
+Ver8Loop0 + VST1 qData0, [pDstTop @64], iPlaneStep + VST1 qData0, [pDstTop @64], iPlaneStep + VST1 qData0, [pDstTop @64], iPlaneStep + VST1 qData0, [pDstTop @64], iPlaneStep + VST1 qData0, [pDstTop @64], iPlaneStep + VST1 qData0, [pDstTop @64], iPlaneStep + VST1 qData0, [pDstTop @64], iPlaneStep + VST1 qData0, [pDstTop @64], iPlaneStep + SUBS Counter, Counter, #8 + VST1 qData1, [pDstBot @64], iPlaneStep + VST1 qData1, [pDstBot @64], iPlaneStep + VST1 qData1, [pDstBot @64], iPlaneStep + VST1 qData1, [pDstBot @64], iPlaneStep + VST1 qData1, [pDstBot @64], iPlaneStep + VST1 qData1, [pDstBot @64], iPlaneStep + VST1 qData1, [pDstBot @64], iPlaneStep + VST1 qData1, [pDstBot @64], iPlaneStep + BGT Ver8Loop0 + + SUBS Temp, Temp, #16 + BGT Hor8Loop00 + +EndAlignedLoop + ADD Temp, pSrcDstPlane, iFrameWidth + SUB pDstRight, Temp, Offset + SUB pRight, Temp, #1 + SUB pDstLeft, pSrcDstPlane, Offset + SUB pDstLeft, pDstLeft, iExpandPels + ADD pLeft, pSrcDstPlane, #0 + + VLD1 {dData0 []}, [pLeft], iPlaneStep ;// Top-Left corner pixel from frame duplicated in dData0 + SUB Offset, iPlaneStep, iExpandPels ;// Offset is now iPlaneStep - iExpandPels (added after each margin row) + VLD1 {dData1 []}, [pRight], iPlaneStep ;// Top-Right corner pixel from frame duplicated in dData1 + MOV Temp, iExpandPels + + ;// + ;// Copy top-left and top-right region of the plane as follows + ;// top-left region = top-left corner pixel from the frame + ;// top-right region = top-right corner pixel from the frame + ;// +HorLoop11 + MOV Counter, iExpandPels +VerLoop1 + VST1 dData0, [pDstLeft], #8 + SUBS Counter, Counter, #8 + VST1 dData1, [pDstRight], #8 + BGT VerLoop1 + + SUBS Temp, Temp, #1 + ADD pDstLeft, pDstLeft, Offset + ADD pDstRight, pDstRight, Offset + BPL HorLoop11 ;// BPL, not BGT: iExpandPels+1 rows, so the margins beside frame row 0 are filled here too + + SUB iFrameHeight, iFrameHeight, #1 + ;// + ;// Copy left and right region of the plane as follows + ;// Left region = copy the row with left start pixel from the frame + ;// Right region = copy the row with right end pixel from the frame + ;// +HorLoop22 + VLD1 {dData0 []}, [pLeft], iPlaneStep + MOV 
Counter, iExpandPels + VLD1 {dData1 []}, [pRight], iPlaneStep +VerLoop2 + VST1 dData0, [pDstLeft], #8 + SUBS Counter, Counter, #8 + VST1 dData1, [pDstRight], #8 + BGT VerLoop2 + + SUBS iFrameHeight, iFrameHeight, #1 + ADD pDstLeft, pDstLeft, Offset + ADD pDstRight, pDstRight, Offset + BGT HorLoop22 + + MOV Temp, iExpandPels + ;// + ;// Copy bottom-left and bottom-right region of the plane as follows + ;// bottom-left region = bottom-left corner pixel from the frame + ;// bottom-right region = bottom-right corner pixel from the frame + ;// +HorLoop33 + MOV Counter, iExpandPels +VerLoop3 + VST1 dData0, [pDstLeft], #8 + SUBS Counter, Counter, #8 + VST1 dData1, [pDstRight], #8 + BGT VerLoop3 + + SUBS Temp, Temp, #1 + ADD pDstLeft, pDstLeft, Offset + ADD pDstRight, pDstRight, Offset + BGT HorLoop33 +End + MOV r0, #OMX_Sts_NoErr + + M_END + + ENDIF + + + + +;// Guarding implementation by the processor name + + + + END
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h new file mode 100755 index 0000000..547a2d9 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------- + * + * + * File Name: armVCM4P10_CAVLCTables.h + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * Header file for optimized H.264 CAVLC tables + * + */ + +#ifndef ARMVCM4P10_CAVLCTABLES_H +#define ARMVCM4P10_CAVLCTABLES_H + +/* CAVLC tables */ + +extern const OMX_U16 *armVCM4P10_CAVLCCoeffTokenTables[18]; +extern const OMX_U16 *armVCM4P10_CAVLCTotalZeroTables[15]; +extern const OMX_U16 *armVCM4P10_CAVLCTotalZeros2x2Tables[3]; +extern const OMX_U16 *armVCM4P10_CAVLCRunBeforeTables[15]; +extern const OMX_U8 armVCM4P10_ZigZag_4x4[16]; +extern const OMX_U8 armVCM4P10_ZigZag_2x2[4]; +extern const OMX_S8 armVCM4P10_SuffixToLevel[7]; + +#endif diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s new file mode 100755 index 0000000..4f0892d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s @@ -0,0 +1,222 @@ +;// +;// +;// File Name: armVCM4P10_Average_4x_Align_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// +;// + + +;// Functions: +;// armVCM4P10_Average_4x4_Align<ALIGNMENT>_unsafe +;// +;// Implements Average of 4x4 with equation c = (a+b+1)>>1. +;// First operand will be at offset ALIGNMENT from aligned address +;// Second operand is at an aligned location, which is also where the result is stored (pPred1 and pDstPred are both r2). +;// destination pointed by (pDst) for vertical interpolation. +;// This function needs to copy 4 bytes in horizontal direction +;// +;// Registers used as input for this function +;// r0,r1,r2,r3 where r2 contains the aligned memory pointer and r3 the step size +;// +;// Registers preserved for top level function +;// r4,r5,r6,r8,r9,r14 +;// +;// Registers modified by the function +;// r7,r10,r11,r12 +;// +;// Output registers +;// r2 - pointer to the aligned location +;// r3 - step size to this aligned location + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS ARM1136JS + + EXPORT armVCM4P10_Average_4x4_Align0_unsafe + EXPORT armVCM4P10_Average_4x4_Align2_unsafe + EXPORT armVCM4P10_Average_4x4_Align3_unsafe + +DEBUG_ON SETL {FALSE} + +;// Declare input registers +pPred0 RN 0 +iPredStep0 RN 1 +pPred1 RN 2 +iPredStep1 RN 3 +pDstPred RN 2 +iDstStep RN 3 + +;// Declare other intermediate registers +iPredA0 RN 10 +iPredA1 RN 11 +iPredB0 RN 12 +iPredB1 RN 14 +Temp1 RN 4 +Temp2 RN 5 +ResultA RN 5 +ResultB RN 4 +r0x80808080 RN 7 + + IF ARM1136JS + + ;// This function calculates average of 4x4 block + ;// pPred0 is at alignment offset 0 and pPred1 is alignment 4 + + ;// Function header + M_START armVCM4P10_Average_4x4_Align0_unsafe, r6 + + ;// Code start + LDR r0x80808080, =0x80808080 + + ;// 1st load + M_LDR iPredB0, [pPred1] + M_LDR iPredA0, [pPred0], iPredStep0 + M_LDR iPredB1, [pPred1, iPredStep1] + M_LDR iPredA1, [pPred0], iPredStep0 + + ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128 + MVN iPredB0, iPredB0 + MVN iPredB1, iPredB1 + UHSUB8 ResultA, iPredA0, iPredB0 + UHSUB8 ResultB, iPredA1, iPredB1 + EOR ResultA, ResultA, r0x80808080 + M_STR ResultA, 
[pDstPred], iDstStep + EOR ResultB, ResultB, r0x80808080 + M_STR ResultB, [pDstPred], iDstStep + + ;// 2nd load + M_LDR iPredA0, [pPred0], iPredStep0 + M_LDR iPredB0, [pPred1] + M_LDR iPredA1, [pPred0], iPredStep0 + M_LDR iPredB1, [pPred1, iPredStep1] + + MVN iPredB0, iPredB0 + UHSUB8 ResultA, iPredA0, iPredB0 + MVN iPredB1, iPredB1 + UHSUB8 ResultB, iPredA1, iPredB1 + EOR ResultA, ResultA, r0x80808080 + M_STR ResultA, [pDstPred], iDstStep + EOR ResultB, ResultB, r0x80808080 + M_STR ResultB, [pDstPred], iDstStep +End0 + M_END + + ;// This function calculates average of 4x4 block + ;// pPred0 is at alignment offset 2 and pPred1 is alignment 4 + + ;// Function header + M_START armVCM4P10_Average_4x4_Align2_unsafe, r6 + + ;// Code start + LDR r0x80808080, =0x80808080 + + ;// 1st load + LDR Temp1, [pPred0, #4] + M_LDR iPredA0, [pPred0], iPredStep0 + M_LDR iPredB0, [pPred1] + M_LDR iPredB1, [pPred1, iPredStep1] + M_LDR Temp2, [pPred0, #4] + M_LDR iPredA1, [pPred0], iPredStep0 + MVN iPredB0, iPredB0 + MVN iPredB1, iPredB1 + MOV iPredA0, iPredA0, LSR #16 ;// upper halfword of the first word moves to the bottom + ORR iPredA0, iPredA0, Temp1, LSL #16 ;// lower halfword of the word at +4 fills the top + MOV iPredA1, iPredA1, LSR #16 + ORR iPredA1, iPredA1, Temp2, LSL #16 + + ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128 + UHSUB8 ResultA, iPredA0, iPredB0 + UHSUB8 ResultB, iPredA1, iPredB1 + EOR ResultA, ResultA, r0x80808080 + M_STR ResultA, [pDstPred], iDstStep + EOR ResultB, ResultB, r0x80808080 + M_STR ResultB, [pDstPred], iDstStep + + ;// 2nd load + LDR Temp1, [pPred0, #4] + M_LDR iPredA0, [pPred0], iPredStep0 + LDR iPredB0, [pPred1] + LDR iPredB1, [pPred1, iPredStep1] + LDR Temp2, [pPred0, #4] + M_LDR iPredA1, [pPred0], iPredStep0 + MVN iPredB0, iPredB0 + MVN iPredB1, iPredB1 + MOV iPredA0, iPredA0, LSR #16 + ORR iPredA0, iPredA0, Temp1, LSL #16 + MOV iPredA1, iPredA1, LSR #16 + ORR iPredA1, iPredA1, Temp2, LSL #16 + + UHSUB8 ResultA, iPredA0, iPredB0 + UHSUB8 ResultB, iPredA1, iPredB1 + EOR ResultA, ResultA, r0x80808080 + M_STR ResultA, [pDstPred], iDstStep + EOR 
ResultB, ResultB, r0x80808080 + M_STR ResultB, [pDstPred], iDstStep +End2 + M_END + + + ;// This function calculates average of 4x4 block + ;// pPred0 is at alignment offset 3 and pPred1 is alignment 4 + + ;// Function header + M_START armVCM4P10_Average_4x4_Align3_unsafe, r6 + + ;// Code start + LDR r0x80808080, =0x80808080 + + ;// 1st load + LDR Temp1, [pPred0, #4] + M_LDR iPredA0, [pPred0], iPredStep0 + LDR iPredB0, [pPred1] + LDR iPredB1, [pPred1, iPredStep1] + LDR Temp2, [pPred0, #4] + M_LDR iPredA1, [pPred0], iPredStep0 + + MVN iPredB0, iPredB0 + MVN iPredB1, iPredB1 + MOV iPredA0, iPredA0, LSR #24 ;// top byte of the first word moves to the bottom + ORR iPredA0, iPredA0, Temp1, LSL #8 ;// low three bytes of the word at +4 fill the rest + MOV iPredA1, iPredA1, LSR #24 + ORR iPredA1, iPredA1, Temp2, LSL #8 + UHSUB8 ResultA, iPredA0, iPredB0 + UHSUB8 ResultB, iPredA1, iPredB1 + EOR ResultA, ResultA, r0x80808080 + M_STR ResultA, [pDstPred], iDstStep + EOR ResultB, ResultB, r0x80808080 + M_STR ResultB, [pDstPred], iDstStep + + ;// 2nd load + LDR Temp1, [pPred0, #4] + M_LDR iPredA0, [pPred0], iPredStep0 + LDR iPredB0, [pPred1] + LDR iPredB1, [pPred1, iPredStep1] + LDR Temp2, [pPred0, #4] + M_LDR iPredA1, [pPred0], iPredStep0 + + MVN iPredB0, iPredB0 + MVN iPredB1, iPredB1 + MOV iPredA0, iPredA0, LSR #24 + ORR iPredA0, iPredA0, Temp1, LSL #8 + MOV iPredA1, iPredA1, LSR #24 + ORR iPredA1, iPredA1, Temp2, LSL #8 + + UHSUB8 ResultA, iPredA0, iPredB0 + UHSUB8 ResultB, iPredA1, iPredB1 + EOR ResultA, ResultA, r0x80808080 + M_STR ResultA, [pDstPred], iDstStep + EOR ResultB, ResultB, r0x80808080 + M_STR ResultB, [pDstPred], iDstStep +End3 + M_END + + ENDIF + + END +
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c new file mode 100755 index 0000000..137495d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c @@ -0,0 +1,327 @@ +/* ---------------------------------------------------------------- + * + * + * File Name: armVCM4P10_CAVLCTables.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * Optimized CAVLC tables for H.264 + * + */ + +#include "omxtypes.h" +#include "armOMX.h" + +#include "armVCM4P10_CAVLCTables.h" + +/* 4x4 DeZigZag table */ + +const OMX_U8 armVCM4P10_ZigZag_4x4[16] = +{ + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 +}; + +/* 2x2 DeZigZag table */ + +const OMX_U8 armVCM4P10_ZigZag_2x2[4] = +{ + 0, 1, 2, 3 +}; + + +/* + * Suffix To Level table + * We increment the suffix length if + * ((LevelCode>>1)+1)>(3<<(SuffixLength-1)) && SuffixLength<6 + * (LevelCode>>1)>=(3<<(SuffixLength-1)) && SuffixLength<6 + * LevelCode >= 3<<SuffixLength && SuffixLength<6 + * (LevelCode+2) >= (3<<SuffixLength)+2 && SuffixLength<6 + */ +const OMX_S8 armVCM4P10_SuffixToLevel[7] = +{ + (3<<1)+2, /* SuffixLength=0 - presumably; same threshold as 1 */ + (3<<1)+2, /* SuffixLength=1 */ + (3<<2)+2, /* SuffixLength=2 */ + (3<<3)+2, /* SuffixLength=3 */ + (3<<4)+2, /* SuffixLength=4 */ + (3<<5)+2, /* SuffixLength=5 */ + -1 /* SuffixLength=6 - never increment */ +}; + +static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_0[132] = { + 0x0020, 0x0100, 0x2015, 0x2015, 0x400b, 0x400b, 0x400b, 0x400b, + 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, + 0x0028, 0x00f0, 0x00f8, 0x0027, 0x0030, 0x00d8, 0x00e0, 0x00e8, + 0x0038, 0x00a0, 0x00c8, 0x00d0, 0x0040, 0x0068, 0x0090, 0x0098, + 0x0048, 0x0050, 0x0058, 
0x0060, 0x27ff, 0x27ff, 0x206b, 0x206b, + 0x0081, 0x0085, 0x0083, 0x0079, 0x0087, 0x007d, 0x007b, 0x0071, + 0x007f, 0x0075, 0x0073, 0x0069, 0x0070, 0x0078, 0x0080, 0x0088, + 0x2077, 0x2077, 0x206d, 0x206d, 0x2063, 0x2063, 0x2061, 0x2061, + 0x206f, 0x206f, 0x2065, 0x2065, 0x205b, 0x205b, 0x2059, 0x2059, + 0x0067, 0x005d, 0x0053, 0x0051, 0x005f, 0x0055, 0x004b, 0x0049, + 0x00a8, 0x00b0, 0x00b8, 0x00c0, 0x2041, 0x2041, 0x204d, 0x204d, + 0x2043, 0x2043, 0x2039, 0x2039, 0x2057, 0x2057, 0x2045, 0x2045, + 0x203b, 0x203b, 0x2031, 0x2031, 0x204f, 0x204f, 0x203d, 0x203d, + 0x2033, 0x2033, 0x2029, 0x2029, 0x0047, 0x0035, 0x002b, 0x0021, + 0x203f, 0x203f, 0x202d, 0x202d, 0x2023, 0x2023, 0x2019, 0x2019, + 0x0037, 0x0025, 0x001b, 0x0011, 0x202f, 0x202f, 0x201d, 0x201d, + 0x0013, 0x0009, 0x201f, 0x201f +}; + +static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_1[128] = { + 0x0020, 0x00e8, 0x00f0, 0x00f8, 0x0027, 0x001f, 0x2015, 0x2015, + 0x400b, 0x400b, 0x400b, 0x400b, 0x4001, 0x4001, 0x4001, 0x4001, + 0x0028, 0x00d0, 0x00d8, 0x00e0, 0x0030, 0x0098, 0x00c0, 0x00c8, + 0x0038, 0x0060, 0x0088, 0x0090, 0x0040, 0x0048, 0x0050, 0x0058, + 0x27ff, 0x27ff, 0x207f, 0x207f, 0x0087, 0x0085, 0x0083, 0x0081, + 0x007b, 0x0079, 0x007d, 0x0073, 0x2075, 0x2075, 0x2071, 0x2071, + 0x0068, 0x0070, 0x0078, 0x0080, 0x2077, 0x2077, 0x206d, 0x206d, + 0x206b, 0x206b, 0x2069, 0x2069, 0x206f, 0x206f, 0x2065, 0x2065, + 0x2063, 0x2063, 0x2061, 0x2061, 0x0059, 0x005d, 0x005b, 0x0051, + 0x0067, 0x0055, 0x0053, 0x0049, 0x00a0, 0x00a8, 0x00b0, 0x00b8, + 0x205f, 0x205f, 0x204d, 0x204d, 0x204b, 0x204b, 0x2041, 0x2041, + 0x2057, 0x2057, 0x2045, 0x2045, 0x2043, 0x2043, 0x2039, 0x2039, + 0x204f, 0x204f, 0x203d, 0x203d, 0x203b, 0x203b, 0x2031, 0x2031, + 0x0029, 0x0035, 0x0033, 0x0021, 0x2047, 0x2047, 0x202d, 0x202d, + 0x202b, 0x202b, 0x2019, 0x2019, 0x003f, 0x0025, 0x0023, 0x0011, + 0x0037, 0x001d, 0x001b, 0x0009, 0x202f, 0x202f, 0x2013, 0x2013 +}; + +static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_2[112] = { + 
0x0020, 0x0088, 0x00b0, 0x00b8, 0x00c0, 0x00c8, 0x00d0, 0x00d8, + 0x003f, 0x0037, 0x002f, 0x0027, 0x001f, 0x0015, 0x000b, 0x0001, + 0x0028, 0x0050, 0x0078, 0x0080, 0x0030, 0x0038, 0x0040, 0x0048, + 0x07ff, 0x0081, 0x0087, 0x0085, 0x0083, 0x0079, 0x007f, 0x007d, + 0x007b, 0x0071, 0x0077, 0x0075, 0x0073, 0x0069, 0x206b, 0x206b, + 0x0058, 0x0060, 0x0068, 0x0070, 0x2061, 0x2061, 0x206d, 0x206d, + 0x2063, 0x2063, 0x2059, 0x2059, 0x206f, 0x206f, 0x2065, 0x2065, + 0x205b, 0x205b, 0x2051, 0x2051, 0x0067, 0x005d, 0x0053, 0x0049, + 0x005f, 0x0055, 0x004b, 0x0041, 0x0090, 0x0098, 0x00a0, 0x00a8, + 0x2039, 0x2039, 0x2031, 0x2031, 0x204d, 0x204d, 0x2029, 0x2029, + 0x2057, 0x2057, 0x2045, 0x2045, 0x2043, 0x2043, 0x2021, 0x2021, + 0x0019, 0x003d, 0x003b, 0x0011, 0x004f, 0x0035, 0x0033, 0x0009, + 0x202b, 0x202b, 0x202d, 0x202d, 0x2023, 0x2023, 0x2025, 0x2025, + 0x201b, 0x201b, 0x2047, 0x2047, 0x201d, 0x201d, 0x2013, 0x2013 +}; + +static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_3[80] = { + 0x0020, 0x0028, 0x0030, 0x0038, 0x0040, 0x0048, 0x0050, 0x0058, + 0x0060, 0x0068, 0x0070, 0x0078, 0x0080, 0x0088, 0x0090, 0x0098, + 0x0009, 0x000b, 0x07ff, 0x0001, 0x0011, 0x0013, 0x0015, 0x07ff, + 0x0019, 0x001b, 0x001d, 0x001f, 0x0021, 0x0023, 0x0025, 0x0027, + 0x0029, 0x002b, 0x002d, 0x002f, 0x0031, 0x0033, 0x0035, 0x0037, + 0x0039, 0x003b, 0x003d, 0x003f, 0x0041, 0x0043, 0x0045, 0x0047, + 0x0049, 0x004b, 0x004d, 0x004f, 0x0051, 0x0053, 0x0055, 0x0057, + 0x0059, 0x005b, 0x005d, 0x005f, 0x0061, 0x0063, 0x0065, 0x0067, + 0x0069, 0x006b, 0x006d, 0x006f, 0x0071, 0x0073, 0x0075, 0x0077, + 0x0079, 0x007b, 0x007d, 0x007f, 0x0081, 0x0083, 0x0085, 0x0087 +}; + +static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_4[32] = { + 0x0020, 0x0038, 0x2015, 0x2015, 0x4001, 0x4001, 0x4001, 0x4001, + 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, + 0x0028, 0x0030, 0x0021, 0x0019, 0x2027, 0x2027, 0x0025, 0x0023, + 0x201d, 0x201d, 0x201b, 0x201b, 0x0011, 0x001f, 0x0013, 0x0009 +}; + +const 
OMX_U16 * armVCM4P10_CAVLCCoeffTokenTables[18] = { + armVCM4P10_CAVLCCoeffTokenTables_0, /* nC=0 */ + armVCM4P10_CAVLCCoeffTokenTables_0, /* nC=1 */ + armVCM4P10_CAVLCCoeffTokenTables_1, /* nC=2 */ + armVCM4P10_CAVLCCoeffTokenTables_1, /* nC=3 */ + armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=4 */ + armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=5 */ + armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=6 */ + armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=7 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=8 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=9 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=10 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=11 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=12 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=13 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=14 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=15 */ + armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=16 */ + armVCM4P10_CAVLCCoeffTokenTables_4 /* nC=-1 */ +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_0[40] = { + 0x0020, 0x0048, 0x0009, 0x0007, 0x2005, 0x2005, 0x2003, 0x2003, + 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, + 0x0028, 0x0040, 0x0011, 0x000f, 0x0030, 0x0038, 0x0019, 0x0017, + 0x27ff, 0x27ff, 0x201f, 0x201f, 0x201d, 0x201d, 0x201b, 0x201b, + 0x2015, 0x2015, 0x2013, 0x2013, 0x200d, 0x200d, 0x200b, 0x200b +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_1[24] = { + 0x0020, 0x0028, 0x0011, 0x000f, 0x000d, 0x000b, 0x2009, 0x2009, + 0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001, + 0x001d, 0x001b, 0x0019, 0x0017, 0x2015, 0x2015, 0x2013, 0x2013 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_2[24] = { + 0x0020, 0x0028, 0x0011, 0x000b, 0x0009, 0x0001, 0x200f, 0x200f, + 0x200d, 0x200d, 0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003, + 0x001b, 0x0017, 0x2019, 0x2019, 0x2015, 0x2015, 0x2013, 0x2013 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_3[24] = { + 0x0020, 0x0028, 0x0013, 0x000f, 0x0007, 0x0005, 
0x2011, 0x2011, + 0x200d, 0x200d, 0x200b, 0x200b, 0x2009, 0x2009, 0x2003, 0x2003, + 0x2019, 0x2019, 0x2017, 0x2017, 0x2015, 0x2015, 0x2001, 0x2001 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_4[20] = { + 0x0020, 0x0015, 0x0011, 0x0005, 0x0003, 0x0001, 0x200f, 0x200f, + 0x200d, 0x200d, 0x200b, 0x200b, 0x2009, 0x2009, 0x2007, 0x2007, + 0x2017, 0x2017, 0x2013, 0x2013 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_5[20] = { + 0x0020, 0x0011, 0x2013, 0x2013, 0x200f, 0x200f, 0x200d, 0x200d, + 0x200b, 0x200b, 0x2009, 0x2009, 0x2007, 0x2007, 0x2005, 0x2005, + 0x0015, 0x0001, 0x2003, 0x2003 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_6[20] = { + 0x0020, 0x000f, 0x2011, 0x2011, 0x200d, 0x200d, 0x2009, 0x2009, + 0x2007, 0x2007, 0x2005, 0x2005, 0x400b, 0x400b, 0x400b, 0x400b, + 0x0013, 0x0001, 0x2003, 0x2003 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_7[20] = { + 0x0020, 0x0003, 0x200f, 0x200f, 0x200d, 0x200d, 0x2007, 0x2007, + 0x400b, 0x400b, 0x400b, 0x400b, 0x4009, 0x4009, 0x4009, 0x4009, + 0x0011, 0x0001, 0x2005, 0x2005 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_8[20] = { + 0x0020, 0x0005, 0x200b, 0x200b, 0x400d, 0x400d, 0x400d, 0x400d, + 0x4009, 0x4009, 0x4009, 0x4009, 0x4007, 0x4007, 0x4007, 0x4007, + 0x0003, 0x0001, 0x200f, 0x200f +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_9[20] = { + 0x0020, 0x000d, 0x2005, 0x2005, 0x400b, 0x400b, 0x400b, 0x400b, + 0x4009, 0x4009, 0x4009, 0x4009, 0x4007, 0x4007, 0x4007, 0x4007, + 0x2003, 0x2003, 0x2001, 0x2001 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_10[16] = { + 0x0001, 0x0003, 0x2005, 0x2005, 0x2007, 0x2007, 0x200b, 0x200b, + 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_11[16] = { + 0x0001, 0x0003, 0x2009, 0x2009, 0x4005, 0x4005, 0x4005, 0x4005, + 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007 +}; + +static const OMX_U16 
armVCM4P10_CAVLCTotalZeroTables_12[16] = { + 0x2001, 0x2001, 0x2003, 0x2003, 0x4007, 0x4007, 0x4007, 0x4007, + 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_13[16] = { + 0x4001, 0x4001, 0x4001, 0x4001, 0x4003, 0x4003, 0x4003, 0x4003, + 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_14[16] = { + 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, + 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003 +}; + +const OMX_U16 * armVCM4P10_CAVLCTotalZeroTables[15] = { + armVCM4P10_CAVLCTotalZeroTables_0, + armVCM4P10_CAVLCTotalZeroTables_1, + armVCM4P10_CAVLCTotalZeroTables_2, + armVCM4P10_CAVLCTotalZeroTables_3, + armVCM4P10_CAVLCTotalZeroTables_4, + armVCM4P10_CAVLCTotalZeroTables_5, + armVCM4P10_CAVLCTotalZeroTables_6, + armVCM4P10_CAVLCTotalZeroTables_7, + armVCM4P10_CAVLCTotalZeroTables_8, + armVCM4P10_CAVLCTotalZeroTables_9, + armVCM4P10_CAVLCTotalZeroTables_10, + armVCM4P10_CAVLCTotalZeroTables_11, + armVCM4P10_CAVLCTotalZeroTables_12, + armVCM4P10_CAVLCTotalZeroTables_13, + armVCM4P10_CAVLCTotalZeroTables_14 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_0[16] = { + 0x2007, 0x2007, 0x2005, 0x2005, 0x4003, 0x4003, 0x4003, 0x4003, + 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_1[16] = { + 0x4005, 0x4005, 0x4005, 0x4005, 0x4003, 0x4003, 0x4003, 0x4003, + 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001 +}; + +static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_2[16] = { + 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, + 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001 +}; + +const OMX_U16 * armVCM4P10_CAVLCTotalZeros2x2Tables[3] = { + armVCM4P10_CAVLCTotalZeros2x2Tables_0, + armVCM4P10_CAVLCTotalZeros2x2Tables_1, + 
armVCM4P10_CAVLCTotalZeros2x2Tables_2 +}; + +static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_0[8] = { + 0x4003, 0x4003, 0x4003, 0x4003, 0x4001, 0x4001, 0x4001, 0x4001 +}; + +static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_1[8] = { + 0x2005, 0x2005, 0x2003, 0x2003, 0x4001, 0x4001, 0x4001, 0x4001 +}; + +static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_2[8] = { + 0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001 +}; + +static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_3[8] = { + 0x0009, 0x0007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001 +}; + +static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_4[8] = { + 0x000b, 0x0009, 0x0007, 0x0005, 0x2003, 0x2003, 0x2001, 0x2001 +}; + +static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_5[8] = { + 0x0003, 0x0005, 0x0009, 0x0007, 0x000d, 0x000b, 0x2001, 0x2001 +}; + +static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_6[24] = { + 0x0010, 0x000d, 0x000b, 0x0009, 0x0007, 0x0005, 0x0003, 0x0001, + 0x0018, 0x0011, 0x200f, 0x200f, 0x0020, 0x0015, 0x2013, 0x2013, + 0x0028, 0x0019, 0x2017, 0x2017, 0x07ff, 0x001d, 0x201b, 0x201b +}; + +/* Tables 7 to 14 are duplicates of table 6 */ + +const OMX_U16 * armVCM4P10_CAVLCRunBeforeTables[15] = { + armVCM4P10_CAVLCRunBeforeTables_0, /* ZerosLeft=1 */ + armVCM4P10_CAVLCRunBeforeTables_1, + armVCM4P10_CAVLCRunBeforeTables_2, + armVCM4P10_CAVLCRunBeforeTables_3, + armVCM4P10_CAVLCRunBeforeTables_4, + armVCM4P10_CAVLCRunBeforeTables_5, /* ZerosLeft=6 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=7 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=8 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=9 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=10 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=11 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=12 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=13 */ + armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=14 */ + armVCM4P10_CAVLCRunBeforeTables_6 /* ZerosLeft=15 */ +}; diff --git 
a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s new file mode 100755 index 0000000..4c3a77c --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s @@ -0,0 +1,198 @@ +;// +;// +;// File Name: armVCM4P10_DeblockingChroma_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + + IF CortexA8 + +pAlpha RN 2 +pBeta RN 3 + +pThresholds RN 5 +pBS RN 4 +bS3210 RN 6 + +;// Pixels +dP_0 DN D4.U8 +dP_1 DN D5.U8 +dP_2 DN D6.U8 +dP_3 DN D7.U8 +dQ_0 DN D8.U8 +dQ_1 DN D9.U8 +dQ_2 DN D10.U8 +dQ_3 DN D11.U8 + + +;// Filtering Decision +dAlpha DN D0.U8 +dBeta DN D2.U8 + +dFilt DN D16.U8 +dAqflg DN D12.U8 +dApflg DN D17.U8 + +dAp0q0 DN D13.U8 + +;// bSLT4 +dTC3210 DN D18.U8 +dTCs DN D31.S8 +dTC DN D31.U8 + +dMask_0 DN D14.U8 +dMask_1 DN D15.U8 +dMask_4 DN D26.U16 + +dTemp DN D28.U8 +dDummy DN D17.U8 + +;// Computing P0,Q0 +qDq0p0 QN Q10.S16 +qDp1q1 QN Q11.S16 +qDelta QN Q10.S16 ; reuse qDq0p0 +dDelta DN D20.S8 + + +;// Computing P1,Q1 +qP_0n QN Q14.S16 +qQ_0n QN Q12.S16 + +dQ_0n DN D24.U8 +dP_0n DN D29.U8 + +;// bSGE4 + +dHSp0q1 DN D13.U8 +dHSq0p1 DN D31.U8 + +dBS3210 DN D28.U16 + +dP_0t DN D13.U8 ;dHSp0q1 +dQ_0t DN D31.U8 ;Temp1 + +dP_0n DN D29.U8 +dQ_0n DN D24.U8 ;Temp2 + +;// Register usage for - armVCM4P10_DeblockingChromabSLT4_unsafe +;// +;// Inputs - Pixels - p0-p1: D4-D5, q0-q1: D8-D9 +;// - Filter masks - filt: D16, dMask_0: D14, dMask_1: D15 +;// - Additional Params - pThresholds: r5, pAlpha: r2, pBeta: r3 +;// +;// Outputs - Pixels - P0: D29, Q0: D24 (this routine writes no P1/Q1) +;// - Additional Params - pThresholds: r5 (post-incremented by the 32-bit load), alpha: D0, beta: D2 + +;// Registers Corrupted - D18-D31 + + + M_START
armVCM4P10_DeblockingChromabSLT4_unsafe + + + ;dTC3210 -18 + ;dTemp-28 + + VLD1 d18.U32[0], [pThresholds]! ;here + + ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3; + ;// dDelta = (qDp1q1 >> 2 + qDq0p0 + 1)>> 1 + + ;// qDp1q1-11 + ;// qDq0p0-10 + VSUBL qDp1q1, dP_1, dQ_1 + VMOV dTemp, dTC3210 + VSUBL qDq0p0, dQ_0, dP_0 + VSHR qDp1q1, qDp1q1, #2 + VZIP.8 dTC3210, dTemp + + ;// qDelta-qDq0p0-10 + + ;// dTC = dTC3210 + dMask_1 (no ap/aq terms are added in the chroma routine) + + ;// dTC3210-18 + ;// dTemp-28 + ;// dTC-31 + VBIF dTC3210, dMask_0, dFilt + VRHADD qDelta, qDp1q1, qDq0p0 + VADD dTC, dTC3210, dMask_1 + VQMOVN dDelta, qDelta + ;// dDelta-d20 + + ;// dDelta = armClip(-dTC, dTC, dDelta), done as VMIN / VNEG / VMAX + VLD1 {dAlpha[]}, [pAlpha] + VMIN dDelta, dDelta, dTCs + VNEG dTCs, dTCs + VLD1 {dBeta[]}, [pBeta] + ;1 + VMAX dDelta, dDelta, dTCs + + ;// dP_0n - 29 + ;// dQ_0n - 24 + + ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 + delta); + ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta); + + ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 + dDelta); + ;// dQ_0n = (OMX_U8)armClip(0, 255, dQ_0 - dDelta); + + ;// qP_0n - 14 + ;// qQ_0n - 12 + + VMOVL qP_0n, dP_0 + VMOVL qQ_0n, dQ_0 + + ;1 + VADDW qP_0n, qP_0n, dDelta + VSUBW qQ_0n, qQ_0n, dDelta + + VQMOVUN dP_0n, qP_0n + VQMOVUN dQ_0n, qQ_0n + + M_END + +;// Register usage for - armVCM4P10_DeblockingChromabSGE4_unsafe() +;// +;// Inputs - Pixels - p0-p1: D4-D5, q0-q1: D8-D9 +;// - Additional Params - pAlpha: r2, pBeta: r3, pThresholds: r5 +;// +;// Outputs - Pixels - P0: D13 (dP_0t), Q0: D31 (dQ_0t) +;// - Additional Params - alpha: D0, beta: D2, pThresholds: r5 (advanced by 4) + +;// Registers written - D0, D2, D13, D31 + + M_START armVCM4P10_DeblockingChromabSGE4_unsafe + + ;dHSq0p1 - 31 + ;dHSp0q1 - 13 + VHADD dHSp0q1, dP_0, dQ_1 + VHADD dHSq0p1, dQ_0, dP_1 + + ;// Prepare the bS mask + + ;// dHSp0q1-13 + ;// dP_0t-dHSp0q1-13 + ;// dHSq0p1-31 + ;// dQ_0t-Temp1-31 + VLD1 {dAlpha[]}, [pAlpha] + ADD pThresholds, pThresholds, #4 + VLD1 {dBeta[]}, [pBeta] + + VRHADD dP_0t, dHSp0q1, dP_1 + VRHADD dQ_0t,
dHSq0p1, dQ_1 + + M_END + + ENDIF + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s new file mode 100755 index 0000000..0afe4fd --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s @@ -0,0 +1,396 @@ +;// +;// +;// File Name: armVCM4P10_DeblockingLuma_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + + IF CortexA8 + +pThresholds RN 5 + +;// Pixels +dP_0 DN D4.U8 +dP_1 DN D5.U8 +dP_2 DN D6.U8 +dP_3 DN D7.U8 +dQ_0 DN D8.U8 +dQ_1 DN D9.U8 +dQ_2 DN D10.U8 +dQ_3 DN D11.U8 + + +;// Filtering Decision +dAlpha DN D0.U8 + +dFilt DN D16.U8 +dAqflg DN D12.U8 +dApflg DN D17.U8 + +dAp0q0 DN D13.U8 + +;// bSLT4 +dTC0 DN D18.U8 +dTC1 DN D19.U8 +dTC01 DN D18.U8 ;// shares D18 with dTC0 + +dTCs DN D31.S8 +dTC DN D31.U8 ;// dTCs/dTC are the signed and unsigned views of D31 + +dMask_0 DN D14.U8 +dMask_1 DN D15.U8 + +dTemp DN D19.U8 ;// shares D19 with dTC1 and dMaxQ + +;// Computing P0,Q0 +qDq0p0 QN Q10.S16 +qDp1q1 QN Q11.S16 +qDelta QN Q10.S16 ; reuse qDq0p0 +dDelta DN D20.S8 + + +;// Computing P1,Q1 +dRp0q0 DN D24.U8 + +dMaxP DN D23.U8 +dMinP DN D22.U8 + +dMaxQ DN D19.U8 +dMinQ DN D21.U8 + +dDeltaP DN D26.U8 +dDeltaQ DN D27.U8 + +qP_0n QN Q14.S16 +qQ_0n QN Q12.S16 + +dQ_0n DN D24.U8 +dQ_1n DN D25.U8 +dP_0n DN D29.U8 +dP_1n DN D30.U8 + +;// bSGE4 + +qSp0q0 QN Q10.U16 + +qSp2q1 QN Q11.U16 +qSp0q0p1 QN Q12.U16 +qSp3p2 QN Q13.U16 +dHSp0q1 DN D28.U8 + +qSq2p1 QN Q11.U16 +qSp0q0q1 QN Q12.U16 +qSq3q2 QN Q13.U16 ;!! +dHSq0p1 DN D28.U8 ;!!
+ +qTemp1 QN Q11.U16 ;!!;qSp2q1 +qTemp2 QN Q12.U16 ;!!;qSp0q0p1 + +dP_0t DN D28.U8 ;!!;dHSp0q1 +dQ_0t DN D22.U8 ;!!;Temp1 + +dP_0n DN D29.U8 +dP_1n DN D30.U8 +dP_2n DN D31.U8 + +dQ_0n DN D24.U8 ;!!;Temp2 +dQ_1n DN D25.U8 ;!!;Temp2 +dQ_2n DN D28.U8 ;!!;dQ_0t + +;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe +;// +;// Inputs - Pixels - p0-p3: D4-D7, q0-q3: D8-D11 +;// - Filter masks - filt: D16, aqflg: D12, apflg: D17 +;// - Additional Params - pThresholds: r5 +;// +;// Outputs - Pixels - P0-P1: D29-D30, Q0-Q1: D24-D25 +;// - Additional Params - pThresholds: r5 + +;// Registers Corrupted - D18-D31 + + + M_START armVCM4P10_DeblockingLumabSLT4_unsafe + + + ;// qDq0p0-10 + VSUBL qDp1q1, dP_1, dQ_1 + VLD1 {dTC0[]}, [pThresholds]! + ;// qDp1q1-11 + VSUBL qDq0p0, dQ_0, dP_0 + VLD1 {dTC1[]}, [pThresholds]! + + ;// dRp0q0-24 + VSHR qDp1q1, qDp1q1, #2 + + ;// dTC01 = lanes 0-3 taken from dTC0, lanes 4-7 taken from dTC1 + ;// dTC01-18 + VEXT dTC01, dTC0, dTC1, #4 + ;// dTemp-19 + VAND dTemp, dApflg, dMask_1 + + VBIF dTC01, dMask_0, dFilt + + + ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3; + ;// dDelta = (qDp1q1 >> 2 + qDq0p0 + 1)>> 1 + + ;// qDelta-qDq0p0-10 + VRHADD qDelta, qDp1q1, qDq0p0 + VRHADD dRp0q0, dP_0, dQ_0 + VADD dTC, dTC01, dTemp + + ;// dTC = dTC01 + (dAplg & 1) + (dAqflg & 1) + + VAND dTemp, dAqflg, dMask_1 + VQADD dMaxP, dP_1, dTC01 + VQMOVN dDelta, qDelta + VADD dTC, dTC, dTemp + + ;// dMaxP = QADD(dP_1, dTC01) + ;// dMinP = QSUB(dP_1, dTC01) + + ;// dMaxP-d23 + ;// dMinP-d22 + VQSUB dMinP, dP_1, dTC01 + + ;// dDelta-d20 + + ;// dMaxQ = QADD(dQ_1, dTC01) + ;// dMinQ = QSUB(dQ_1, dTC01) + + ;// dMaxQ-19 + ;// dMinQ-21 + VQADD dMaxQ, dQ_1, dTC01 + VHADD dDeltaP, dRp0q0, dP_2 + VMIN dDelta, dDelta, dTCs + + ;// dDelta = armClip(-dTC, dTC, dDelta): VMIN above, then VMAX against the negated dTCs + VNEG dTCs, dTCs + + VQSUB dMinQ, dQ_1, dTC01 + + ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1; + ;// delta = armClip(-tC0, tC0, delta); + ;// pQ0[-2*Step] = (OMX_U8)(p1 + delta); + + ;// dDeltaP = (dP_2 + dRp0q0)>>1; + ;//
dP_1n = armClip(dP_1 - dTC01, dP_1 + dTC01, dDeltaP); + ;// dP_1n = armClip(MinP, MaxP, dDeltaP); + + ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1; + ;// delta = armClip(-tC0, tC0, delta); + ;// pQ0[1*Step] = (OMX_U8)(q1 + delta); + + ;// dDeltaQ = (dQ_2 + dRp0q0)>>1; + ;// dQ_1n = armClip(dQ_1 - dTC01, dQ_1 + dTC01, dDeltaQ); + ;// dQ_1n = armClip(MinQ, MaxQ, dDeltaQ); + + ;// dDeltaP-26 + VHADD dDeltaQ, dRp0q0, dQ_2 + + ;// dDeltaQ-27 + + ;// dP_0n - 29 + ;// dP_1n - 30 + ;// dQ_0n - 24 + ;// dQ_1n - 25 + + ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1; + ;// dDeltaQ = (dQ_2 + dRp0q0)>>1; + + VMAX dP_1n, dDeltaP, dMinP + VMAX dDelta, dDelta, dTCs + + ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 + delta); + ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta); + + ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 + dDelta); + ;// dQ_0n = (OMX_U8)armClip(0, 255, dQ_0 - dDelta); + + ;// qP_0n - 14 + ;// qQ_0n - 12 + + VMOVL qP_0n, dP_0 + VMOVL qQ_0n, dQ_0 + + VADDW qP_0n, qP_0n, dDelta + VSUBW qQ_0n, qQ_0n, dDelta + + VQMOVUN dP_0n, qP_0n + VQMOVUN dQ_0n, qQ_0n + + VMAX dQ_1n, dDeltaQ, dMinQ + + VMIN dP_1n, dP_1n, dMaxP + VMIN dQ_1n, dQ_1n, dMaxQ + VBIF dP_0n, dP_0, dFilt + + VBIF dP_1n, dP_1, dApflg + VBIF dQ_0n, dQ_0, dFilt + VBIF dQ_1n, dQ_1, dAqflg + + M_END + +;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe() +;// +;// Inputs - Pixels - p0-p3: D4-D7, q0-q3: D8-D11 +;// - Filter masks - filt: D16, aqflg: D12, apflg: D17 +;// - Additional Params - alpha: D0, dMask_1: D15 +;// +;// Outputs - Pixels - P0-P2: D29-D31, Q0-Q2: D24,D25,D28 + +;// Registers Corrupted - D18-D31 + + M_START armVCM4P10_DeblockingLumabSGE4_unsafe + + + ;// ap<beta && armAbs(p0-q0)<((alpha>>2)+2) + ;// aq<beta && armAbs(p0-q0)<((alpha>>2)+2) + + ;// ( dApflg & dAp0q0 < (dAlpha >> 2 + 2) ) + ;// ( dAqflg & dAp0q0 < (dAlpha >> 2 + 2) ) + + ;// ( dApflg = dApflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) ) + ;// ( dAqflg = dAqflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) ) + + ;// P
Filter + + VSHR dTemp, dAlpha, #2 + VADD dTemp, dTemp, dMask_1 + + ;// qSp0q0-10 + VADDL qSp0q0, dQ_0, dP_0 + VADD dTemp, dTemp, dMask_1 + + ;// qSp2q1-11 + ;// qSp0q0p1-12 + VADDL qSp2q1, dP_2, dQ_1 + VADDW qSp0q0p1, qSp0q0, dP_1 + + VCGT dTemp, dTemp, dAp0q0 + VSHR qSp2q1, #1 + + ;// pQ0[-1*Step] = (OMX_U8)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3); + ;// pQ0[-1*Step] = ( ( (p0 + q0 + p1) + (p2 + q1)>>1 ) >> 1 + 1 ) >> 1 + + ;// dP_0n = ( ( (qSp0q0 + dP_1) + qSp2q1>>1 ) >> 1 + 1 ) >> 1 + ;// dP_0n = ( ( qSp0q0p1 + qSp2q1>>1 ) >> 1 + 1 ) >> 1 + ;// dP_0n = ( qTemp1 + 1 ) >> 1 + + ;// pQ0[-2*Step] = (OMX_U8)((p2 + p1 + p0 + q0 + 2)>>2); + + ;// dP_1n = (OMX_U8)((dP_2 + qSp0q0p1 + 2)>>2); + ;// dP_1n = (OMX_U8)((qTemp2 + 2)>>2); + + ;// pQ0[-3*Step] = (OMX_U8)((2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3); + ;// pQ0[-3*Step] = (OMX_U8)(( (p3 + p2) + (p1 + p0 + q0 + p2) >> 1 + 2)>>2); + + ;// dP_2n = (OMX_U8)(( qSp3p2 + (dP_2 + qSp0q0p1) >> 1 + 2) >> 2); + ;// dP_2n = (OMX_U8)(( qSp3p2 + qTemp2 >> 1 + 2) >> 2); + + ;// qTemp1-qSp2q1-11 + ;// qTemp2-qSp0q0p1-12 + VHADD qTemp1, qSp0q0p1, qSp2q1 + VADDW qTemp2, qSp0q0p1, dP_2 + + ;// qSp3p2-13 + VADDL qSp3p2, dP_3, dP_2 + + VAND dApflg, dApflg, dTemp + VHADD dHSp0q1, dP_0, dQ_1 + VSRA qSp3p2, qTemp2, #1 + ;// dHSp0q1-28 + VAND dAqflg, dAqflg, dTemp + + ;// dP_0n-29 + ;// dP_0t-dHSp0q1-28 + VQRSHRN dP_0n, qTemp1, #1 + VRHADD dP_0t, dHSp0q1, dP_1 + + ;// dP_1n-30 + VQRSHRN dP_1n, qTemp2, #2 + + VADDL qSq2p1, dQ_2, dP_1 + VADDW qSp0q0q1, qSp0q0, dQ_1 + + VBIF dP_0n, dP_0t, dApflg + + ;// Q Filter + + ;// pQ0[0*Step] = (OMX_U8)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3); + ;// pQ0[0*Step] = ( ( (p0 + q0 + q1) + (q2 + p1)>>1 ) >> 1 + 1 ) >> 1 + + ;// dQ_0n = ( ( (qSp0q0 + dQ_1) + qSq2p1>>1 ) >> 1 + 1 ) >> 1 + ;// dQ_0n = ( ( qSp0q0q1 + qSq2p1>>1 ) >> 1 + 1 ) >> 1 + ;// dQ_0n = ( qTemp1 + 1 ) >> 1 + + ;// pQ0[1*Step] = (OMX_U8)((q2 + q1 + q0 + p0 + 2)>>2); + + ;// dQ_1n = (OMX_U8)((dQ_2 + qSp0q0q1 + 2)>>2); + ;// dQ_1n = (OMX_U8)((qTemp2 +
2)>>2); + + ;// pQ0[2*Step] = (OMX_U8)((2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3); + ;// pQ0[2*Step] = (OMX_U8)(( (q3 + q2) + (q1 + p0 + q0 + q2) >> 1 + 2)>>2); + + ;// dQ_2n = (OMX_U8)(( qSq3q2 + (dQ_2 + qSp0q0q1) >> 1 + 2) >> 2); + ;// dQ_2n = (OMX_U8)(( qSq3q2 + qTemp2 >> 1 + 2) >> 2); + + ;// qTemp1-qSp2q1-11 + ;// qTemp2-qSp0q0p1-12 + ;// qSq2p1-11 + ;// qSp0q0q1-12 + + + ;// qTemp2-qSp0q0p1-12 + ;// qTemp1-qSq2p1-11 + ;// qSq3q2-13 + ;// dP_2n-31 + + VQRSHRN dP_2n, qSp3p2, #2 + VADDL qSq3q2, dQ_3, dQ_2 + + VSHR qSq2p1, #1 + + VHADD qTemp1, qSp0q0q1, qSq2p1 + VADDW qTemp2, qSp0q0q1, dQ_2 + + ;// dHSq0p1-28 + VHADD dHSq0p1, dQ_0, dP_1 + + VBIF dP_0n, dP_0, dFilt + VBIF dP_1n, dP_1, dApflg + + VSRA qSq3q2, qTemp2, #1 + + ;// dQ_1-Temp2-25 + ;// dQ_0-Temp2-24 + VQRSHRN dQ_1n, qTemp2, #2 + VQRSHRN dQ_0n, qTemp1, #1 + + ;// dQ_0t-Temp1-22 + VRHADD dQ_0t, dHSq0p1, dQ_1 + VBIF dQ_1n, dQ_1, dAqflg + + ;// Each VBIF below replaces the lanes whose mask bits are clear with its second operand + VBIF dP_2n, dP_2, dApflg + VBIF dQ_0n, dQ_0t, dAqflg + VQRSHRN dQ_2n, qSq3q2, #2 + VBIF dQ_0n, dQ_0, dFilt + VBIF dQ_2n, dQ_2, dAqflg + + M_END + + ENDIF + + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s new file mode 100755 index 0000000..10a89e9 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s @@ -0,0 +1,325 @@ +;// +;// +;// File Name: armVCM4P10_DecodeCoeffsToPair_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE armCOMM_BitDec_s.h + + IMPORT armVCM4P10_CAVLCCoeffTokenTables + IMPORT armVCM4P10_CAVLCTotalZeroTables + IMPORT armVCM4P10_CAVLCTotalZeros2x2Tables + IMPORT armVCM4P10_CAVLCRunBeforeTables + IMPORT armVCM4P10_SuffixToLevel + IMPORT armVCM4P10_ZigZag_4x4 + IMPORT armVCM4P10_ZigZag_2x2 + + M_VARIANTS ARM1136JS + +;//DEBUG_ON SETL {TRUE} + +LAST_COEFF EQU 0x20 ;// End of block flag +TWO_BYTE_COEFF EQU 0x10 ;// Set when the level is written as two bytes + +;// Declare input registers + +ppBitStream RN 0 +pOffset RN 1 +pNumCoeff RN 2 +ppPosCoefbuf RN 3 +nC RN 4 ;// number of coeffs or 17 for chroma +sMaxNumCoeff RN 5 + +;// Declare inner loop registers + +;// Level loop +Count RN 0 +TrailingOnes RN 1 +pLevel RN 2 +LevelSuffix RN 3 +SuffixLength RN 4 +TotalCoeff RN 5 + +pVLDTable RN 6 +Symbol RN 7 +T1 RN 8 +T2 RN 9 +RBitStream RN 10 +RBitBuffer RN 11 +RBitCount RN 12 +lr RN 14 + +;// Run loop +Count RN 0 +ZerosLeft RN 1 +pLevel RN 2 +ppRunTable RN 3 +pRun RN 4 +TotalCoeff RN 5 + +pVLDTable RN 6 +Symbol RN 7 +T1 RN 8 +T2 RN 9 +RBitStream RN 10 +RBitBuffer RN 11 +RBitCount RN 12 +lr RN 14 + +;// Fill in coefficients loop +pPosCoefbuf RN 0 +temp RN 1 +pLevel RN 2 +ppPosCoefbuf RN 3 +pRun RN 4 +TotalCoeff RN 5 +pZigZag RN 6 + +T1 RN 8 +T2 RN 9 +RBitStream RN 10 +RBitBuffer RN 11 +RBitCount RN 12 +CoeffNum RN 14 + + + + IF ARM1136JS + + ;// Allocate stack memory required by the function + M_ALLOC4 pppBitStream, 4 + M_ALLOC4 ppOffset, 4 + M_ALLOC4 pppPosCoefbuf, 4 + M_ALLOC4 ppLevel, 16*2 ;// levels are stored as halfwords + M_ALLOC4 ppRun, 16 ;// runs are stored as bytes + + ;// Write function header + M_START armVCM4P10_DecodeCoeffsToPair, r11 + + ;// Define stack arguments + M_ARG pNC, 4 + M_ARG pSMaxNumCoeff,4 + + ;// Code start + M_BD_INIT0 ppBitStream, pOffset, RBitStream, RBitBuffer, RBitCount + LDR pVLDTable, =armVCM4P10_CAVLCCoeffTokenTables + M_LDR nC, pNC + + M_BD_INIT1 T1, T2, lr + LDR pVLDTable, [pVLDTable, nC, LSL #2] ;// Find VLD table + + M_BD_INIT2 T1, T2, lr + + ;// Decode Symbol = TotalCoeff*4 +
TrailingOnes + M_BD_VLD Symbol, T1, T2, pVLDTable, 4, 2 + + MOVS TotalCoeff, Symbol, LSR #2 + STRB TotalCoeff, [pNumCoeff] + M_PRINTF "TotalCoeff=%d\n", TotalCoeff + BEQ.W EndNoError ;// Finished if no coefficients + + CMP Symbol, #17*4 + BGE.W EndBadSymbol ;// Error if bad symbol + + ;// Save bitstream pointers + M_STR ppBitStream, pppBitStream + M_STR pOffset, ppOffset + M_STR ppPosCoefbuf, pppPosCoefbuf + + ;// Decode Trailing Ones + ANDS TrailingOnes, Symbol, #3 + M_ADR pLevel, ppLevel + M_PRINTF "TrailingOnes=%d\n", TrailingOnes + BEQ TrailingOnesDone + MOV Count, TrailingOnes +TrailingOnesLoop + M_BD_READ8 Symbol, 1, T1 + SUBS Count, Count, #1 + MOV T1, #1 + SUB T1, T1, Symbol, LSL #1 ;// T1 = 1 - 2*Symbol, i.e. +1 or -1 for the one-bit sign + M_PRINTF "Level=%d\n", T1 + STRH T1, [pLevel], #2 + BGT TrailingOnesLoop +TrailingOnesDone + + ;// Decode level values + SUBS Count, TotalCoeff, TrailingOnes ;// Number of levels to read + BEQ DecodeRuns ;// None left + + MOV SuffixLength, #1 + CMP TotalCoeff, #10 + MOVLE SuffixLength, #0 + CMP TrailingOnes, #3 ;// if (TrailingOnes<3) + MOVLT TrailingOnes, #4 ;// then TrailingOnes = +4 + MOVGE TrailingOnes, #2 ;// else TrailingOnes = +2 + MOVGE SuffixLength, #0 ;// SuffixLength = 0 + +LevelLoop + M_BD_CLZ16 Symbol, T1, T2 ;// Symbol=LevelPrefix + CMP Symbol,#16 + BGE EndBadSymbol + + MOVS lr, SuffixLength ;// if LevelSuffixSize==0 + TEQEQ Symbol, #14 ;// and LevelPrefix==14 + MOVEQ lr, #4 ;// then LevelSuffixSize=4 + TEQ Symbol, #15 ;// if LevelPrefix==15 + MOVEQ lr, #12 ;// then LevelSuffixSize=12 + + TEQEQ SuffixLength,#0 + ADDEQ Symbol,Symbol,#15 + + TEQ lr, #0 ;// if LevelSuffixSize==0 + BEQ LevelCodeRead ;// LevelCode = LevelPrefix + + M_BD_VREAD16 LevelSuffix, lr, T1, T2 ;// Read Level Suffix + + MOV Symbol, Symbol, LSL SuffixLength + ADD Symbol, LevelSuffix, Symbol + +LevelCodeRead + ;// Symbol = LevelCode + ADD Symbol, Symbol, TrailingOnes ;// +4 if level cannot be +/-1, +2 o/w + MOV TrailingOnes, #2 + MOVS T1, Symbol, LSR #1 + RSBCS T1, T1, #0 ;// If Symbol odd
then negate + M_PRINTF "Level=%d\n", T1 + STRH T1, [pLevel], #2 ;// Store level. + + LDR T2, =armVCM4P10_SuffixToLevel + LDRSB T1, [T2, SuffixLength] ;// Find increment level + TEQ SuffixLength, #0 + MOVEQ SuffixLength, #1 + CMP Symbol, T1 + ADDCS SuffixLength, SuffixLength, #1 + SUBS Count, Count, #1 + BGT LevelLoop + +DecodeRuns + ;// Find number of zeros + M_LDR T1, pSMaxNumCoeff ;// sMaxNumCoeff + SUB Count, TotalCoeff, #1 ;// Number of runs excluding last + SUBS ZerosLeft, T1, TotalCoeff ;// Maximum number of zeros there could be + M_ADR pRun, ppRun + MOV CoeffNum,TotalCoeff + SUB CoeffNum,CoeffNum,#1 + BEQ NoZerosLeft + + ;// Unpack number of zeros from bitstream + TEQ T1, #4 + LDREQ pVLDTable, =(armVCM4P10_CAVLCTotalZeros2x2Tables-4) + LDRNE pVLDTable, =(armVCM4P10_CAVLCTotalZeroTables-4) + LDR pVLDTable, [pVLDTable, TotalCoeff, LSL #2] ;// base is biased by -4, so this reads entry TotalCoeff-1 + + M_BD_VLD Symbol, T1, T2, pVLDTable, 4, 2 ;// Symbol = ZerosLeft + CMP Symbol,#16 + BGE EndBadSymbol + + LDR ppRunTable, =(armVCM4P10_CAVLCRunBeforeTables-4) + M_ADR pRun, ppRun + MOVS ZerosLeft, Symbol + + ADD CoeffNum,CoeffNum,ZerosLeft + + BEQ NoZerosLeft + + ;// Decode runs while zeros are left and more than one coefficient +RunLoop + SUBS Count, Count, #1 + LDR pVLDTable, [ppRunTable, ZerosLeft, LSL#2] + BLT LastRun + M_BD_VLD Symbol, T1, T2, pVLDTable, 3, 2 ;// Symbol = Run + CMP Symbol,#15 + BGE EndBadSymbol + + SUBS ZerosLeft, ZerosLeft, Symbol + M_PRINTF "Run=%d\n", Symbol + STRB Symbol, [pRun], #1 + BGT RunLoop + + ;// Decode runs while no zeros are left +NoZerosLeft + SUBS Count, Count, #1 + M_PRINTF "Run=%d\n", ZerosLeft + STRGEB ZerosLeft, [pRun], #1 + BGT NoZerosLeft + +LastRun + ;// Final run length is remaining zeros + M_PRINTF "LastRun=%d\n", ZerosLeft + STRB ZerosLeft, [pRun], #1 + + ;// Write coefficients to output array + M_LDR T1, pSMaxNumCoeff ;// sMaxNumCoeff + TEQ T1, #15 + ADDEQ CoeffNum,CoeffNum,#1 + + + SUB pRun,pRun,TotalCoeff ;// step pRun back by TotalCoeff bytes + SUB pLevel,pLevel,TotalCoeff + SUB pLevel,pLevel,TotalCoeff ;// pLevel stepped back by 2*TotalCoeff bytes (halfword levels) + +
M_LDR ppPosCoefbuf, pppPosCoefbuf + LDR pPosCoefbuf, [ppPosCoefbuf] + TEQ T1, #4 + LDREQ pZigZag, =armVCM4P10_ZigZag_2x2 + LDRNE pZigZag, =armVCM4P10_ZigZag_4x4 + + + +OutputLoop + + LDRB T2, [pRun],#1 + LDRB T1, [pZigZag, CoeffNum] + SUB CoeffNum, CoeffNum, #1 ;// Skip Non zero + SUB CoeffNum, CoeffNum, T2 ;// Skip Zero run + + LDRSH T2, [pLevel],#2 + + SUBS TotalCoeff, TotalCoeff, #1 + ORREQ T1, T1, #LAST_COEFF + + ADD temp, T2, #128 + CMP temp, #256 ;// Carry set when the level lies outside [-128, 127] + ORRCS T1, T1, #TWO_BYTE_COEFF + + + TEQ TotalCoeff, #0 ;// Preserves carry + + M_PRINTF "Output=%02x %04x\n", T1, T2 + STRB T1, [pPosCoefbuf], #1 + STRB T2, [pPosCoefbuf], #1 + MOV T2, T2, LSR #8 + STRCSB T2, [pPosCoefbuf], #1 ;// High byte is written only for two-byte levels + BNE OutputLoop + + ;// Finished + STR pPosCoefbuf, [ppPosCoefbuf] + M_LDR ppBitStream, pppBitStream + M_LDR pOffset, ppOffset + B EndNoError + +EndBadSymbol + MOV r0, #OMX_Sts_Err + B End + +EndNoError + ;// Finished reading from the bitstream + M_BD_FINI ppBitStream, pOffset + + ;// Set return value + MOV r0, #OMX_Sts_NoErr +End + M_END + + ENDIF + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s new file mode 100755 index 0000000..2761600 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s @@ -0,0 +1,123 @@ +;// +;// +;// File Name: armVCM4P10_DequantTables_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// +;// +;// + + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + EXPORT armVCM4P10_QPDivTable + EXPORT armVCM4P10_VMatrixQPModTable + EXPORT armVCM4P10_PosToVCol4x4 + EXPORT armVCM4P10_PosToVCol2x2 + EXPORT armVCM4P10_VMatrix + EXPORT armVCM4P10_QPModuloTable + EXPORT armVCM4P10_VMatrixU16 + +;// Define the processor variants supported by this file + + M_VARIANTS CortexA8 + + +;// Guarding implementation by the processor name + + + IF CortexA8 + + + M_TABLE armVCM4P10_PosToVCol4x4 + DCB 0, 2, 0, 2 + DCB 2, 1, 2, 1 + DCB 0, 2, 0, 2 + DCB 2, 1, 2, 1 + + + M_TABLE armVCM4P10_PosToVCol2x2 + DCB 0, 2 + DCB 2, 1 + + + M_TABLE armVCM4P10_VMatrix + DCB 10, 16, 13 + DCB 11, 18, 14 + DCB 13, 20, 16 + DCB 14, 23, 18 + DCB 16, 25, 20 + DCB 18, 29, 23 + +;//------------------------------------------------------- +;// This table evaluates the expression [(INT)(QP/6)], +;// for values of QP from 0 to 51 (inclusive). +;//------------------------------------------------------- + + M_TABLE armVCM4P10_QPDivTable + DCB 0, 0, 0, 0, 0, 0 + DCB 1, 1, 1, 1, 1, 1 + DCB 2, 2, 2, 2, 2, 2 + DCB 3, 3, 3, 3, 3, 3 + DCB 4, 4, 4, 4, 4, 4 + DCB 5, 5, 5, 5, 5, 5 + DCB 6, 6, 6, 6, 6, 6 + DCB 7, 7, 7, 7, 7, 7 + DCB 8, 8, 8, 8, 8, 8 + +;//---------------------------------------------------- +;// This table contains armVCM4P10_VMatrix[QP%6][0] entries, +;// for values of QP from 0 to 51 (inclusive). +;//---------------------------------------------------- + + M_TABLE armVCM4P10_VMatrixQPModTable + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + DCB 10, 11, 13, 14, 16, 18 + +;//------------------------------------------------------- +;// This table evaluates the modulus expression [QP%6]*6, +;// for values of QP from 0 to 51 (inclusive).
+;//------------------------------------------------------- + + M_TABLE armVCM4P10_QPModuloTable + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + DCB 0, 6, 12, 18, 24, 30 + +;//------------------------------------------------------- +;// This table contains the individual byte values of armVCM4P10_VMatrix stored as +;// halfwords. This avoids unpacking inside the function +;//------------------------------------------------------- + + M_TABLE armVCM4P10_VMatrixU16 + DCW 10, 16, 13 + DCW 11, 18, 14 + DCW 13, 20, 16 + DCW 14, 23, 18 + DCW 16, 25, 20 + DCW 18, 29, 23 + + ENDIF ;//CortexA8 + + + + + END
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s new file mode 100755 index 0000000..6e912d7 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s @@ -0,0 +1,236 @@ +;// +;// +;// File Name: armVCM4P10_InterpolateLuma_Align_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS ARM1136JS + + EXPORT armVCM4P10_InterpolateLuma_HorAlign9x_unsafe + EXPORT armVCM4P10_InterpolateLuma_VerAlign4x_unsafe + +DEBUG_ON SETL {FALSE} + + IF ARM1136JS + +;// Declare input registers +pSrc RN 0 +srcStep RN 1 +pDst RN 8 +iHeight RN 9 + +;// Declare inner loop registers +x RN 7 +x0 RN 7 ;// same register as x +x1 RN 10 +x2 RN 11 +Scratch RN 12 + +;// Function: +;// armVCM4P10_InterpolateLuma_HorAlign9x_unsafe +;// +;// Copies from an arbitrarily aligned source memory location (pSrc) to a 4 byte aligned +;// destination pointed to by (pDst) for horizontal interpolation. +;// This function needs to copy 9 bytes in horizontal direction; each row is stored as three words (12 bytes).
+;// +;// Registers used as input for this function +;// r0,r1,r8,r9 where r8 contains the aligned memory pointer and r9 the number of rows to copy +;// +;// Registers preserved for top level function +;// r2,r3,r4,r5,r6 +;// +;// Registers modified by the function +;// r7,r8,r9,r10,r11,r12 +;// +;// Output registers +;// r0 - pointer to the new aligned location which will be used as pSrc +;// r1 - step size to this aligned location + + ;// Function header + M_START armVCM4P10_InterpolateLuma_HorAlign9x_unsafe + + ;// Copy pDst to scratch + MOV Scratch, pDst + +StartAlignedStackCopy + AND x, pSrc, #3 + BIC pSrc, pSrc, #3 + + M_SWITCH x + M_CASE Copy0toAligned + M_CASE Copy1toAligned + M_CASE Copy2toAligned + M_CASE Copy3toAligned + M_ENDSWITCH + +Copy0toAligned + LDM pSrc, {x0, x1, x2} + SUBS iHeight, iHeight, #1 + ADD pSrc, pSrc, srcStep + + ;// One cycle stall + + STM pDst!, {x0, x1, x2} ;// Store aligned output row + BGT Copy0toAligned + B CopyEnd + +Copy1toAligned + LDM pSrc, {x0, x1, x2} + SUBS iHeight, iHeight, #1 + ADD pSrc, pSrc, srcStep + + ;// One cycle stall + + MOV x0, x0, LSR #8 + ORR x0, x0, x1, LSL #24 + MOV x1, x1, LSR #8 + ORR x1, x1, x2, LSL #24 + MOV x2, x2, LSR #8 + STM pDst!, {x0, x1, x2} ;// Store aligned output row + BGT Copy1toAligned + B CopyEnd + +Copy2toAligned + LDM pSrc, {x0, x1, x2} + SUBS iHeight, iHeight, #1 + ADD pSrc, pSrc, srcStep + + ;// One cycle stall + + MOV x0, x0, LSR #16 + ORR x0, x0, x1, LSL #16 + MOV x1, x1, LSR #16 + ORR x1, x1, x2, LSL #16 + MOV x2, x2, LSR #16 + STM pDst!, {x0, x1, x2} ;// Store aligned output row + BGT Copy2toAligned + B CopyEnd + +Copy3toAligned + LDM pSrc, {x0, x1, x2} + SUBS iHeight, iHeight, #1 + ADD pSrc, pSrc, srcStep + + ;// One cycle stall + + MOV x0, x0, LSR #24 + ORR x0, x0, x1, LSL #8 + MOV x1, x1, LSR #24 + ORR x1, x1, x2, LSL #8 + MOV x2, x2, LSR #24 + STM pDst!, {x0, x1, x2} ;// Store aligned output row + BGT Copy3toAligned + +CopyEnd + + MOV pSrc, Scratch ;// Return the start of the aligned copy as the new pSrc + MOV srcStep, #12 ;// Three words were stored per row + + M_END + + +;//
Function: +;// armVCM4P10_InterpolateLuma_VerAlign4x_unsafe +;// +;// Implements copy from an arbitrary aligned source memory location (pSrc) to an aligned +;// destination pointed by (pDst) for vertical interpolation. +;// This function needs to copy 4 bytes in horizontal direction +;// +;// Registers used as input for this function +;// r0,r1,r8,r9 where r8 containings aligned memory pointer and r9 no of rows to copy +;// +;// Registers preserved for top level function +;// r2,r3,r4,r5,r6 +;// +;// Registers modified by the function +;// r7,r8,r9,r10,r11,r12 +;// +;// Output registers +;// r0 - pointer to the new aligned location which will be used as pSrc +;// r1 - step size to this aligned location + + ;// Function header + M_START armVCM4P10_InterpolateLuma_VerAlign4x_unsafe + + ;// Copy pSrc to stack +StartVAlignedStackCopy + AND x, pSrc, #3 + BIC pSrc, pSrc, #3 + + + M_SWITCH x + M_CASE Copy0toVAligned + M_CASE Copy1toVAligned + M_CASE Copy2toVAligned + M_CASE Copy3toVAligned + M_ENDSWITCH + +Copy0toVAligned + M_LDR x0, [pSrc], srcStep + SUBS iHeight, iHeight, #1 + + ;// One cycle stall + + STR x0, [pDst], #4 ;// Store aligned output row + BGT Copy0toVAligned + B CopyVEnd + +Copy1toVAligned + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + SUBS iHeight, iHeight, #1 + + ;// One cycle stall + + MOV x1, x1, LSL #24 + ORR x0, x1, x0, LSR #8 + STR x0, [pDst], #4 ;// Store aligned output row + BGT Copy1toVAligned + B CopyVEnd + +Copy2toVAligned + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + SUBS iHeight, iHeight, #1 + + ;// One cycle stall + + MOV x1, x1, LSL #16 + ORR x0, x1, x0, LSR #16 + STR x0, [pDst], #4 ;// Store aligned output row + BGT Copy2toVAligned + B CopyVEnd + +Copy3toVAligned + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + SUBS iHeight, iHeight, #1 + + ;// One cycle stall + + MOV x1, x1, LSL #8 + ORR x0, x1, x0, LSR #24 + STR x0, [pDst], #4 ;// Store aligned output row + BGT Copy3toVAligned + +CopyVEnd + + SUB pSrc, pDst, #28 + MOV srcStep, 
#4 + + M_END + + + ENDIF + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s new file mode 100755 index 0000000..d275891 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s @@ -0,0 +1,149 @@ +;// +;// +;// File Name: armVCM4P10_InterpolateLuma_Copy_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + +;// Function: +;// armVCM4P10_InterpolateLuma_Copy4x4_unsafe +;// +;// Copies four rows of four bytes from an arbitrarily aligned source (pSrc, row stride srcStep) to an aligned +;// destination pointed to by pDst (row stride dstStep); unaligned rows are rebuilt from two aligned word loads +;// +;// Registers preserved for top level function +;// r1,r3,r4,r5,r6,r7,r10,r11,r14 +;// +;// Registers modified by the function +;// r0,r2,r8,r9,r12 (r4 and r5 are also written as x0/x1; presumably restored by M_START ..., r6 / M_END - confirm in armCOMM_s.h) + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS ARM1136JS + + EXPORT armVCM4P10_InterpolateLuma_Copy4x4_unsafe + +;// Declare input registers +pSrc RN 0 +srcStep RN 1 +pDst RN 2 +dstStep RN 3 + +;// Declare other intermediate registers +x0 RN 4 +x1 RN 5 +x2 RN 8 +x3 RN 9 +Temp RN 12 + + IF ARM1136JS + + M_START armVCM4P10_InterpolateLuma_Copy4x4_unsafe, r6 + +Copy4x4Start + ;// Split pSrc into its byte offset (Temp) and word-aligned base, dispatch on the offset, then finish at Copy4x4End + AND Temp, pSrc, #3 + BIC pSrc, pSrc, #3 + + M_SWITCH Temp + M_CASE Copy4x4Align0 + M_CASE Copy4x4Align1 + M_CASE Copy4x4Align2 + M_CASE Copy4x4Align3 + M_ENDSWITCH + +Copy4x4Align0 + M_LDR x0, [pSrc], srcStep + M_LDR x1, [pSrc], srcStep + M_STR x0, [pDst], dstStep + M_LDR x2, [pSrc], srcStep + M_STR x1, [pDst], dstStep + M_LDR x3, [pSrc], srcStep + M_STR x2, [pDst], dstStep + M_STR x3, [pDst], dstStep + B Copy4x4End + +Copy4x4Align1 + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + LDR x3, [pSrc, #4] + M_LDR 
x2, [pSrc], srcStep + MOV x0, x0, LSR #8 + ORR x0, x0, x1, LSL #24 + M_STR x0, [pDst], dstStep + MOV x2, x2, LSR #8 + ORR x2, x2, x3, LSL #24 + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + M_STR x2, [pDst], dstStep + LDR x3, [pSrc, #4] + M_LDR x2, [pSrc], srcStep + MOV x0, x0, LSR #8 + ORR x0, x0, x1, LSL #24 + M_STR x0, [pDst], dstStep + MOV x2, x2, LSR #8 + ORR x2, x2, x3, LSL #24 + M_STR x2, [pDst], dstStep + B Copy4x4End + +Copy4x4Align2 + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + LDR x3, [pSrc, #4] + M_LDR x2, [pSrc], srcStep + MOV x0, x0, LSR #16 + ORR x0, x0, x1, LSL #16 + M_STR x0, [pDst], dstStep + MOV x2, x2, LSR #16 + ORR x2, x2, x3, LSL #16 + M_STR x2, [pDst], dstStep + + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + LDR x3, [pSrc, #4] + M_LDR x2, [pSrc], srcStep + MOV x0, x0, LSR #16 + ORR x0, x0, x1, LSL #16 + M_STR x0, [pDst], dstStep + MOV x2, x2, LSR #16 + ORR x2, x2, x3, LSL #16 + M_STR x2, [pDst], dstStep + B Copy4x4End + +Copy4x4Align3 + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + LDR x3, [pSrc, #4] + M_LDR x2, [pSrc], srcStep + MOV x0, x0, LSR #24 + ORR x0, x0, x1, LSL #8 + M_STR x0, [pDst], dstStep + MOV x2, x2, LSR #24 + ORR x2, x2, x3, LSL #8 + M_STR x2, [pDst], dstStep + + LDR x1, [pSrc, #4] + M_LDR x0, [pSrc], srcStep + LDR x3, [pSrc, #4] + M_LDR x2, [pSrc], srcStep + MOV x0, x0, LSR #24 + ORR x0, x0, x1, LSL #8 + M_STR x0, [pDst], dstStep + MOV x2, x2, LSR #24 + ORR x2, x2, x3, LSL #8 + M_STR x2, [pDst], dstStep + B Copy4x4End + +Copy4x4End + M_END + + ENDIF + + END +
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s new file mode 100755 index 0000000..4e5a39d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s @@ -0,0 +1,178 @@ +;// +;// +;// File Name: armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS ARM1136JS + + EXPORT armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe + EXPORT armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe + +;// Functions: +;// armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe and +;// armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe +;// +;// Implements re-arrangement of data from temporary buffer to a buffer pointed by pBuf. +;// This converts the data from 16 bit to 8 bit; it also +;// removes the offset (0x0fe0 per halfword, via UQSUB16) and saturates the result. 
+;// +;// Registers used as input for this function +;// r0,r1,r7 where r0 is input pointer and r1 its step size, r7 is output pointer +;// +;// Registers preserved for top level function +;// r4,r5,r6,r8,r9,r14 (r4-r6 and r14 are written below; presumably restored by M_START ..., r6 / M_END - confirm in armCOMM_s.h) +;// +;// Registers modified by the function +;// r7,r10,r11,r12 +;// +;// Output registers +;// r0 - pointer to the destination location (HorDiagCopy: final pBuf - 32; VerDiagCopy: final pBuf - 24) +;// r1 - step size to this destination location (8 for HorDiagCopy, 4 for VerDiagCopy) + + +DEBUG_ON SETL {FALSE} + +MASK EQU 0x80808080 ;// Mask is used to implement (a+b+1)/2 (MASK is not referenced by the code in this file) + +;// Declare input registers + +pSrc0 RN 0 +srcStep0 RN 1 + +;// Declare other intermediate registers +Temp1 RN 4 +Temp2 RN 5 +Temp3 RN 10 +Temp4 RN 11 +pBuf RN 7 +r0x0fe00fe0 RN 6 +r0x00ff00ff RN 12 +Count RN 14 +ValueA0 RN 10 +ValueA1 RN 11 + + IF ARM1136JS + + + ;// Function header + M_START armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe, r6 + + ;// Code start: 4 iterations, each reads 16 bytes at pSrc0 and stores 8 bytes at pBuf + MOV Count, #4 + LDR r0x0fe00fe0, =0x0fe00fe0 + LDR r0x00ff00ff, =0x00ff00ff +LoopStart1 + LDR Temp4, [pSrc0, #12] + LDR Temp3, [pSrc0, #8] + LDR Temp2, [pSrc0, #4] + M_LDR Temp1, [pSrc0], srcStep0 + UQSUB16 Temp4, Temp4, r0x0fe00fe0 + UQSUB16 Temp3, Temp3, r0x0fe00fe0 + UQSUB16 Temp2, Temp2, r0x0fe00fe0 + UQSUB16 Temp1, Temp1, r0x0fe00fe0 + USAT16 Temp4, #13, Temp4 + USAT16 Temp3, #13, Temp3 + USAT16 Temp2, #13, Temp2 + USAT16 Temp1, #13, Temp1 + AND Temp4, r0x00ff00ff, Temp4, LSR #5 + AND Temp3, r0x00ff00ff, Temp3, LSR #5 + AND Temp2, r0x00ff00ff, Temp2, LSR #5 + AND Temp1, r0x00ff00ff, Temp1, LSR #5 + ORR ValueA1, Temp3, Temp4, LSL #8 + ORR ValueA0, Temp1, Temp2, LSL #8 + SUBS Count, Count, #1 + STRD ValueA0, [pBuf], #8 + BGT LoopStart1 +End1 + SUB pSrc0, pBuf, #32 + MOV srcStep0, #8 + + M_END + + + ;// Function header + M_START armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe, r6 + + ;// Code start: 2 iterations, each converts two source rows and stores 16 bytes at pBuf + LDR r0x0fe00fe0, =0x0fe00fe0 + LDR r0x00ff00ff, =0x00ff00ff + MOV Count, #2 + +LoopStart + LDR Temp4, [pSrc0, #12] + LDR Temp3, [pSrc0, #8] + LDR Temp2, [pSrc0, #4] + M_LDR Temp1, [pSrc0], srcStep0 + + UQSUB16 Temp4, Temp4, 
r0x0fe00fe0 + UQSUB16 Temp3, Temp3, r0x0fe00fe0 + UQSUB16 Temp2, Temp2, r0x0fe00fe0 + UQSUB16 Temp1, Temp1, r0x0fe00fe0 + + USAT16 Temp4, #13, Temp4 + USAT16 Temp3, #13, Temp3 + USAT16 Temp2, #13, Temp2 + USAT16 Temp1, #13, Temp1 + + AND Temp4, r0x00ff00ff, Temp4, LSR #5 + AND Temp3, r0x00ff00ff, Temp3, LSR #5 + AND Temp2, r0x00ff00ff, Temp2, LSR #5 + AND Temp1, r0x00ff00ff, Temp1, LSR #5 + ORR ValueA1, Temp3, Temp4, LSL #8 ;// [d2 c2 d0 c0] + ORR ValueA0, Temp1, Temp2, LSL #8 ;// [b2 a2 b0 a0] + + PKHBT Temp1, ValueA0, ValueA1, LSL #16 ;// [d0 c0 b0 a0] + + STR Temp1, [pBuf], #8 + PKHTB Temp2, ValueA1, ValueA0, ASR #16 ;// [d2 c2 b2 a2] + STR Temp2, [pBuf], #-4 + + LDR Temp4, [pSrc0, #12] + LDR Temp3, [pSrc0, #8] + LDR Temp2, [pSrc0, #4] + M_LDR Temp1, [pSrc0], srcStep0 + + UQSUB16 Temp4, Temp4, r0x0fe00fe0 + UQSUB16 Temp3, Temp3, r0x0fe00fe0 + UQSUB16 Temp2, Temp2, r0x0fe00fe0 + UQSUB16 Temp1, Temp1, r0x0fe00fe0 + + USAT16 Temp4, #13, Temp4 + USAT16 Temp3, #13, Temp3 + USAT16 Temp2, #13, Temp2 + USAT16 Temp1, #13, Temp1 + + AND Temp4, r0x00ff00ff, Temp4, LSR #5 + AND Temp3, r0x00ff00ff, Temp3, LSR #5 + AND Temp2, r0x00ff00ff, Temp2, LSR #5 + AND Temp1, r0x00ff00ff, Temp1, LSR #5 + ORR ValueA1, Temp3, Temp4, LSL #8 ;// [d2 c2 d0 c0] + ORR ValueA0, Temp1, Temp2, LSL #8 ;// [b2 a2 b0 a0] + + PKHBT Temp1, ValueA0, ValueA1, LSL #16 ;// [d0 c0 b0 a0] + SUBS Count, Count, #1 + STR Temp1, [pBuf], #8 + PKHTB Temp2, ValueA1, ValueA0, ASR #16 ;// [d2 c2 b2 a2] + STR Temp2, [pBuf], #4 + + BGT LoopStart +End2 + SUB pSrc0, pBuf, #32-8 + MOV srcStep0, #4 + + M_END + + ENDIF + + END +
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s new file mode 100755 index 0000000..d1684cb --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s @@ -0,0 +1,313 @@ +;// +;// +;// File Name: armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// Filters nine source rows horizontally with taps (1,-5,20,20,-5,1), then filters those nine 16-bit rows vertically with the same taps, rounds with a right shift by 10 and narrows to 8 bits; results are left in d0,d2,d4,d6 (dAcc0..dAcc3) and nothing is stored through pDst here +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + + M_VARIANTS CortexA8 + + IF CortexA8 + + M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11 + +;// Declare input registers +pSrc RN 0 +srcStep RN 1 +pDst RN 2 +dstStep RN 3 + +;// Declare Neon registers +dCoeff5 DN 30.S16 +dCoeff20 DN 31.S16 +qCoeff5 QN 14.S32 +qCoeff20 QN 15.S32 + +qSrc01 QN 0.U8 +dSrc0 DN 0.U8 +dSrc1 DN 1.U8 + +dSrcb DN 4.U8 +dSrcc DN 2.U8 +dSrcd DN 3.U8 +dSrce DN 5.U8 +dSrcf DN 1.U8 + +qSrcb QN 2.S16 +qSrcc QN 1.S16 +dSrcB DN 4.S16 +dSrcC DN 2.S16 + +qRes0 QN 5.S16 +qRes1 QN 6.S16 +qRes2 QN 7.S16 +qRes3 QN 8.S16 +qRes4 QN 9.S16 +qRes5 QN 10.S16 +qRes6 QN 11.S16 +qRes7 QN 12.S16 +qRes8 QN 13.S16 + +dRes0 DN 10.S16 +dRes1 DN 12.S16 +dRes2 DN 14.S16 +dRes3 DN 16.S16 +dRes4 DN 18.S16 +dRes5 DN 20.S16 +dRes6 DN 22.S16 +dRes7 DN 24.S16 +dRes8 DN 26.S16 + +qAcc01 QN 5.S32 +qAcc23 QN 6.S32 +qAcc45 QN 2.S32 +qAcc67 QN 3.S32 +qSumBE QN 0.S32 +qSumCD QN 1.S32 + +dTempAcc0 DN 0.U16 +dTempAcc1 DN 2.U16 +dTempAcc2 DN 4.U16 +dTempAcc3 DN 6.U16 + +qTAcc0 QN 0.U16 +qTAcc1 QN 1.U16 +qTAcc2 QN 2.U16 +qTAcc3 QN 3.U16 + +dAcc0 DN 0.U8 +dAcc1 DN 2.U8 +dAcc2 DN 4.U8 +dAcc3 DN 6.U8 + +dTmp0 DN 8.S16 +dTmp1 DN 9.S16 
+qTmp0 QN 4.S32 + + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + VMOV dCoeff20, #20 + VMOV dCoeff5, #5 + + ;// Row0 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes0, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + VMLA dRes0, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e); the VSUB in the next row block applies Acc -= Tmp (same pattern for every row) + + ;// Row1 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes1, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + + VSUB dRes0, dRes0, dTmp0 ;// TeRi + + VMLA dRes1, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes1, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + ;// Row2 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes2, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + + VSUB dRes1, dRes1, dTmp0 + + VMLA dRes2, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes2, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + ;// Row3 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] 
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes3, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + + VSUB dRes2, dRes2, dTmp0 + + VMLA dRes3, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes3, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + ;// Row4 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes4, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + + VSUB dRes3, dRes3, dTmp0 + + VMLA dRes4, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes4, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + ;// Row5 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes5, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + + VSUB dRes4, dRes4, dTmp0 + + VMLA dRes5, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes5, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + ;// Row6 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes6, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] 
+ + VSUB dRes5, dRes5, dTmp0 + + VMLA dRes6, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes6, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + ;// Row7 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes7, dSrc0, dSrcf ;// Acc=a+f + VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..] + + VSUB dRes6, dRes6, dTmp0 + + VMLA dRes7, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes7, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + ;// Row8 + VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrcc, dSrc0, dSrc1, #2 + VEXT dSrcd, dSrc0, dSrc1, #3 + VEXT dSrce, dSrc0, dSrc1, #4 + VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..] + VADDL qSrcc, dSrcc, dSrcd ;// c+d + VADDL qSrcb, dSrcb, dSrce ;// b+e + VADDL qRes8, dSrc0, dSrcf ;// Acc=a+f + + VSUB dRes7, dRes7, dTmp0 + + VMLA dRes8, dSrcC, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes8, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTmp0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e) + + VMOV qCoeff20, #20 + VMOV qCoeff5, #5 + + ;// Col0 + VADDL qAcc01, dRes0, dRes5 ;// Acc = a+f + VADDL qSumCD, dRes2, dRes3 ;// c+d + VADDL qSumBE, dRes1, dRes4 ;// b+e + + VSUB dRes8, dRes8, dTmp0 + + VMLA qAcc01, qSumCD, qCoeff20 ;// Acc += 20*(c+d) +; VMLS qAcc01, qSumBE, qCoeff5 ;// Acc -= 5*(b+e) + VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted by the following VSUB + + ;// Col1 + VADDL qAcc23, dRes1, dRes6 ;// Acc = a+f + VADDL qSumCD, dRes3, dRes4 ;// c+d + VADDL qSumBE, dRes2, dRes5 ;// b+e + VMLA qAcc23, qSumCD, qCoeff20 ;// Acc += 20*(c+d) + + VSUB qAcc01, qAcc01, qTmp0 + +; VMLS qAcc23, qSumBE, qCoeff5 ;// Acc -= 5*(b+e) + VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted by the following VSUB + + ;// Col2 + VADDL qAcc45, dRes2, dRes7 ;// Acc = a+f + VADDL qSumCD, dRes4, dRes5 ;// c+d + 
VADDL qSumBE, dRes3, dRes6 ;// b+e + VMLA qAcc45, qSumCD, qCoeff20 ;// Acc += 20*(c+d) + + VSUB qAcc23, qAcc23, qTmp0 + +; VMLS qAcc45, qSumBE, qCoeff5 ;// Acc -= 5*(b+e) + VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted by the following VSUB + + ;// Col3 + VADDL qAcc67, dRes3, dRes8 ;// Acc = a+f + VADDL qSumCD, dRes5, dRes6 ;// c+d + VADDL qSumBE, dRes4, dRes7 ;// b+e + VMLA qAcc67, qSumCD, qCoeff20 ;// Acc += 20*(c+d) + + VSUB qAcc45, qAcc45, qTmp0 + + VMLS qAcc67, qSumBE, qCoeff5 ;// Acc -= 5*(b+e) + + VQRSHRUN dTempAcc0, qAcc01, #10 + VQRSHRUN dTempAcc1, qAcc23, #10 + VQRSHRUN dTempAcc2, qAcc45, #10 + VQRSHRUN dTempAcc3, qAcc67, #10 + + VQMOVN dAcc0, qTAcc0 + VQMOVN dAcc1, qTAcc1 + VQMOVN dAcc2, qTAcc2 + VQMOVN dAcc3, qTAcc3 + + M_END + + ENDIF + + + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s new file mode 100755 index 0000000..7bc091f --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s @@ -0,0 +1,266 @@ +;// +;// +;// File Name: armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// Filters nine source rows (a..i) vertically with taps (1,-5,20,20,-5,1) into four rows of 16-bit intermediates (P,Q,R,S), then filters those horizontally with the same taps, rounds with a right shift by 10 and narrows to 8 bits; results are left in d0,d2,d4,d6 (dAcc0..dAcc3) +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe + + M_VARIANTS CortexA8 + + IF CortexA8 + M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r11 + +;// Declare input registers +pSrc RN 0 +srcStep RN 1 +pDst RN 2 +dstStep RN 3 + +;// Declare Neon registers +dTCoeff5 DN 30.U8 +dTCoeff20 DN 31.U8 +dCoeff5 DN 30.S16 +dCoeff20 DN 31.S16 + +qSrcA01 QN 0.U8 +qSrcB23 QN 1.U8 +qSrcC45 QN 2.U8 +qSrcD67 QN 3.U8 +qSrcE89 QN 4.U8 +qSrcF1011 QN 5.U8 +qSrcG1213 QN 6.U8 +qSrcH1415 QN 7.U8 +qSrcI1617 QN 8.U8 + +dSrcA0 DN 0.U8 +dSrcB2 DN 2.U8 +dSrcC4 DN 4.U8 +dSrcD6 DN 6.U8 +dSrcE8 DN 8.U8 +dSrcF10 DN 10.U8 +dSrcG12 DN 12.U8 +dSrcH14 DN 14.U8 +dSrcI16 DN 16.U8 + +dSrcA1 DN 1.U8 +dSrcB3 DN 3.U8 +dSrcC5 DN 5.U8 +dSrcD7 DN 7.U8 +dSrcE9 DN 9.U8 +dSrcF11 DN 11.U8 +dSrcG13 DN 13.U8 +dSrcH15 DN 15.U8 +dSrcI17 DN 17.U8 + +qTempP01 QN 9.S16 +qTempQ01 QN 10.S16 +qTempR01 QN 11.S16 +qTempS01 QN 12.S16 + +qTempP23 QN 0.S16 +qTempQ23 QN 1.S16 +qTempR23 QN 2.S16 +qTempS23 QN 3.S16 + +dTempP0 DN 18.S16 +dTempP1 DN 19.S16 +dTempP2 DN 0.S16 + +dTempQ0 DN 20.S16 +dTempQ1 DN 21.S16 +dTempQ2 DN 2.S16 + +dTempR0 DN 22.S16 +dTempR1 DN 23.S16 +dTempR2 DN 4.S16 + +dTempS0 DN 24.S16 +dTempS1 DN 25.S16 +dTempS2 DN 6.S16 + +dTempB0 DN 26.S16 +dTempC0 DN 27.S16 +dTempD0 DN 28.S16 +dTempF0 DN 29.S16 + +dTempAcc0 DN 0.U16 +dTempAcc1 DN 2.U16 +dTempAcc2 DN 4.U16 +dTempAcc3 DN 6.U16 + +dAcc0 DN 0.U8 +dAcc1 DN 2.U8 +dAcc2 DN 4.U8 +dAcc3 DN 6.U8 + +qAcc0 QN 0.S32 +qAcc1 QN 1.S32 +qAcc2 QN 2.S32 +qAcc3 QN 3.S32 + +qTAcc0 QN 0.U16 +qTAcc1 QN 1.U16 +qTAcc2 QN 2.U16 +qTAcc3 QN 3.U16 + +qTmp QN 4.S16 +dTmp DN 8.S16 + + VLD1 qSrcA01, [pSrc], srcStep ;// [a0 a1 a2 a3 .. a15] + ADD r12, pSrc, srcStep, LSL #2 + VMOV dTCoeff5, #5 + VMOV dTCoeff20, #20 + VLD1 qSrcF1011, [r12], srcStep + VLD1 qSrcB23, [pSrc], srcStep ;// [b0 b1 b2 b3 .. 
b15] + + VLD1 qSrcG1213, [r12], srcStep + VADDL qTempP01, dSrcA0, dSrcF10 + VLD1 qSrcC45, [pSrc], srcStep ;// [c0 c1 c2 c3 .. c15] + VADDL qTempP23, dSrcA1, dSrcF11 + VLD1 qSrcD67, [pSrc], srcStep + VADDL qTempQ01, dSrcB2, dSrcG12 + VLD1 qSrcE89, [pSrc], srcStep + + ;//t0 + VMLAL qTempP01, dSrcC4, dTCoeff20 + + VLD1 qSrcH1415, [r12], srcStep + + VMLAL qTempP23, dSrcC5, dTCoeff20 + + VLD1 qSrcI1617, [r12], srcStep ;// [i0 i1 i2 i3 .. ] + + VMLAL qTempP01, dSrcD6, dTCoeff20 + VMLAL qTempQ01, dSrcD6, dTCoeff20 + VMLSL qTempP23, dSrcB3, dTCoeff5 + + VADDL qTempR01, dSrcC4, dSrcH14 + + VMLSL qTempP01, dSrcB2, dTCoeff5 + + VADDL qTempQ23, dSrcB3, dSrcG13 + + VMLAL qTempP23, dSrcD7, dTCoeff20 + VMLAL qTempQ01, dSrcE8, dTCoeff20 + + VMLSL qTempP01, dSrcE8, dTCoeff5 + VMLAL qTempQ23, dSrcD7, dTCoeff20 + + VMLSL qTempP23, dSrcE9, dTCoeff5 + + ;//t1 + + VMLAL qTempR01, dSrcE8, dTCoeff20 + VMLSL qTempQ01, dSrcC4, dTCoeff5 + VMLSL qTempQ23, dSrcC5, dTCoeff5 + VADDL qTempR23, dSrcC5, dSrcH15 + + VMLAL qTempR01, dSrcF10, dTCoeff20 + VMLSL qTempQ01, dSrcF10, dTCoeff5 + VMLAL qTempQ23, dSrcE9, dTCoeff20 + VMLAL qTempR23, dSrcE9, dTCoeff20 + VADDL qTempS01, dSrcD6, dSrcI16 + + + VMLSL qTempR01, dSrcD6, dTCoeff5 + VMLSL qTempQ23, dSrcF11, dTCoeff5 + VMLSL qTempR23, dSrcD7, dTCoeff5 + + ;//t2 + VADDL qTempS23, dSrcD7, dSrcI17 + VMLAL qTempS01, dSrcF10, dTCoeff20 + VMLSL qTempR01, dSrcG12, dTCoeff5 + VMLSL qTempR23, dSrcG13, dTCoeff5 + + VMLAL qTempS23, dSrcF11, dTCoeff20 + VMLAL qTempS01, dSrcG12, dTCoeff20 + VEXT dTempB0, dTempP0, dTempP1, #1 + VMLAL qTempR23, dSrcF11, dTCoeff20 + + + ;//t3 + VMLAL qTempS23, dSrcG13, dTCoeff20 + VMLSL qTempS01, dSrcE8, dTCoeff5 + VEXT dTempC0, dTempP0, dTempP1, #2 + VMOV dCoeff20, #20 + VMLSL qTempS23, dSrcE9, dTCoeff5 + VMLSL qTempS01, dSrcH14, dTCoeff5 + VEXT dTempF0, dTempP1, dTempP2, #1 + VEXT dTempD0, dTempP0, dTempP1, #3 + VMLSL qTempS23, dSrcH15, dTCoeff5 + + VADDL qAcc0, dTempP0, dTempF0 + VADD dTempC0, dTempC0, dTempD0 + ;//h: horizontal pass; d30 is rewritten as 16-bit lanes (dCoeff5) after the last 8-bit use (dTCoeff5) + VMOV dCoeff5, 
#5 + + ;// res0 + VADD dTempB0, dTempB0, dTempP1 + VMLAL qAcc0, dTempC0, dCoeff20 + VEXT dTempC0, dTempQ0, dTempQ1, #2 + VEXT dTempD0, dTempQ0, dTempQ1, #3 + VEXT dTempF0, dTempQ1, dTempQ2, #1 + VMLSL qAcc0, dTempB0, dCoeff5 + + ;// res1 + VEXT dTempB0, dTempQ0, dTempQ1, #1 + VADDL qAcc1, dTempQ0, dTempF0 + VADD dTempC0, dTempC0, dTempD0 + VADD dTempB0, dTempB0, dTempQ1 + VEXT dTempD0, dTempR0, dTempR1, #3 + VMLAL qAcc1, dTempC0, dCoeff20 + VEXT dTempF0, dTempR1, dTempR2, #1 + VEXT dTempC0, dTempR0, dTempR1, #2 + VEXT dTmp, dTempR0, dTempR1, #1 + VADDL qAcc2, dTempR0, dTempF0 + VMLSL qAcc1, dTempB0, dCoeff5 +; VEXT dTempB0, dTempR0, dTempR1, #1 + VADD dTempC0, dTempC0, dTempD0 + + ;// res2 + VADD dTempB0, dTmp, dTempR1 + VEXT dTempD0, dTempS0, dTempS1, #3 + VMLAL qAcc2, dTempC0, dCoeff20 +; VADD dTempB0, dTempB0, dTempR1 + + ;// res3 + VEXT dTempC0, dTempS0, dTempS1, #2 + VEXT dTempF0, dTempS1, dTempS2, #1 + VADD dTempC0, dTempC0, dTempD0 + VEXT dTmp, dTempS0, dTempS1, #1 + VADDL qAcc3, dTempS0, dTempF0 + VMLSL qAcc2, dTempB0, dCoeff5 + VMLAL qAcc3, dTempC0, dCoeff20 + VADD dTmp, dTmp, dTempS1 + VMLSL qAcc3, dTmp, dCoeff5 + + VQRSHRUN dTempAcc0, qAcc0, #10 + VQRSHRUN dTempAcc1, qAcc1, #10 + VQRSHRUN dTempAcc2, qAcc2, #10 + VQRSHRUN dTempAcc3, qAcc3, #10 + + VQMOVN dAcc0, qTAcc0 + VQMOVN dAcc1, qTAcc1 + VQMOVN dAcc2, qTAcc2 + VQMOVN dAcc3, qTAcc3 + + M_END + + ENDIF + + + + + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s new file mode 100755 index 0000000..babe8ad --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s @@ -0,0 +1,228 @@ +;// +;// +;// File Name: armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, 
April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// Horizontal half-pel filter for four rows: each row gets Acc = a+f + 20*(c+d) - 5*(b+e) on its low four lanes, then VQRSHRUN #5 narrows to 8 bits in d22,d24,d26,d28. NOTE(review): d30/d31 (dCoeff5/dCoeff20) are read but never written in this file - presumably the caller preloads 5 and 20; confirm +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + +DEBUG_ON SETL {FALSE} + + IF CortexA8 + + M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11 + +;// Declare input registers +pSrc RN 0 +srcStep RN 1 +pDst RN 2 +dstStep RN 3 + +;// Declare Neon registers +dCoeff5 DN 30.S16 +dCoeff20 DN 31.S16 + +qSrcA01 QN 11.U8 +qSrcB01 QN 12.U8 +qSrcC01 QN 13.U8 +qSrcD01 QN 14.U8 + +dSrcA0 DN 22.U8 +dSrcA1 DN 23.U8 +dSrcB0 DN 24.U8 +dSrcB1 DN 25.U8 +dSrcC0 DN 26.U8 +dSrcC1 DN 27.U8 +dSrcD0 DN 28.U8 +dSrcD1 DN 29.U8 + +dSrcb DN 12.U8 +dSrce DN 13.U8 +dSrcf DN 10.U8 + +dSrc0c DN 14.U8 +dSrc1c DN 16.U8 +dSrc2c DN 18.U8 +dSrc3c DN 20.U8 + +dSrc0d DN 15.U8 +dSrc1d DN 17.U8 +dSrc2d DN 19.U8 +dSrc3d DN 21.U8 + +qTemp01 QN 4.S16 +qTemp23 QN 6.S16 +dTemp0 DN 8.S16 +dTemp2 DN 12.S16 + +qRes01 QN 11.S16 +qRes23 QN 12.S16 +qRes45 QN 13.S16 +qRes67 QN 14.S16 + +dRes0 DN 22.S16 +dRes2 DN 24.S16 +dRes4 DN 26.S16 +dRes6 DN 28.S16 + +dAcc0 DN 22.U8 +dAcc2 DN 24.U8 +dAcc4 DN 26.U8 +dAcc6 DN 28.U8 + +dResult0 DN 22.U32 +dResult2 DN 24.U32 +dResult4 DN 26.U32 +dResult6 DN 28.U32 + + VLD1 qSrcA01, [pSrc], srcStep ;// Load A register [a0 a1 a2 a3 ..] + ;// One cycle stall + VEXT dSrcf, dSrcA0, dSrcA1, #5 ;// [f0 f1 f2 f3 ..] + VEXT dSrcb, dSrcA0, dSrcA1, #1 ;// [b0 b1 b2 b3 ..] +; VLD1 qSrcB01, [pSrc], srcStep ;// Load B register [a0 a1 a2 a3 ..] + VEXT dSrc0c, dSrcA0, dSrcA1, #2 + VEXT dSrc0d, dSrcA0, dSrcA1, #3 + VEXT dSrce, dSrcA0, dSrcA1, #4 + VADDL qRes01, dSrcA0, dSrcf ;// Acc=a+f + VADDL qTemp01, dSrc0c, dSrc0d ;// c+d + VADDL qTemp23, dSrcb, dSrce ;// b+e + + VLD1 qSrcB01, [pSrc], srcStep ;// Load B register [a0 a1 a2 a3 ..] +; VLD1 qSrcC01, [pSrc], srcStep ;// Load C register [a0 a1 a2 a3 ..] 
+ VMLA dRes0, dTemp0, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes0, dTemp2, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTemp0, dTemp2, dCoeff5 ;// TeRi + + VEXT dSrcf, dSrcB0, dSrcB1, #5 ;// [f0 f1 f2 f3 ..] + VEXT dSrcb, dSrcB0, dSrcB1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrc1c, dSrcB0, dSrcB1, #2 + VEXT dSrc1d, dSrcB0, dSrcB1, #3 + VEXT dSrce, dSrcB0, dSrcB1, #4 + VADDL qRes23, dSrcB0, dSrcf ;// Acc=a+f + + VSUB dRes0, dRes0, dTemp0 ;// TeRi + + VADDL qTemp01, dSrc1c, dSrc1d ;// c+d + VADDL qTemp23, dSrcb, dSrce ;// b+e + + VLD1 qSrcC01, [pSrc], srcStep ;// Load C register [a0 a1 a2 a3 ..] +; VLD1 qSrcD01, [pSrc], srcStep ;// Load D register [a0 a1 a2 a3 ..] + + VMLA dRes2, dTemp0, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes2, dTemp2, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTemp0, dTemp2, dCoeff5 ;// TeRi + + VEXT dSrcf, dSrcC0, dSrcC1, #5 ;// [f0 f1 f2 f3 ..] + VEXT dSrcb, dSrcC0, dSrcC1, #1 ;// [b0 b1 b2 b3 ..] + VEXT dSrc2c, dSrcC0, dSrcC1, #2 + VEXT dSrc2d, dSrcC0, dSrcC1, #3 + VEXT dSrce, dSrcC0, dSrcC1, #4 + VADDL qRes45, dSrcC0, dSrcf ;// Acc=a+f + + VSUB dRes2, dRes2, dTemp0 ;// TeRi + + VADDL qTemp01, dSrc2c, dSrc2d ;// c+d + VADDL qTemp23, dSrcb, dSrce ;// b+e + + VLD1 qSrcD01, [pSrc], srcStep ;// Load D register [a0 a1 a2 a3 ..] + + VMLA dRes4, dTemp0, dCoeff20 ;// Acc += 20*(c+d) +; VMLS dRes4, dTemp2, dCoeff5 ;// Acc -= 5*(b+e) + VMUL dTemp0, dTemp2, dCoeff5 ;// Acc -= 5*(b+e) TeRi + + + VEXT dSrcf, dSrcD0, dSrcD1, #5 ;// [f0 f1 f2 f3 ..] + VEXT dSrcb, dSrcD0, dSrcD1, #1 ;// [b0 b1 b2 b3 ..] 
+ VEXT dSrc3c, dSrcD0, dSrcD1, #2 + VEXT dSrc3d, dSrcD0, dSrcD1, #3 + VEXT dSrce, dSrcD0, dSrcD1, #4 + VADDL qRes67, dSrcD0, dSrcf ;// Acc=a+f + + VSUB dRes4, dRes4, dTemp0 ;// TeRi + + VADDL qTemp01, dSrc3c, dSrc3d ;// c+d + VADDL qTemp23, dSrcb, dSrce ;// b+e + VMLA dRes6, dTemp0, dCoeff20 ;// Acc += 20*(c+d) + VMLS dRes6, dTemp2, dCoeff5 ;// Acc -= 5*(b+e) + + VQRSHRUN dAcc0, qRes01, #5 ;// Acc = Sat ((Acc + 16) / 32) + VQRSHRUN dAcc2, qRes23, #5 ;// Acc = Sat ((Acc + 16) / 32) + VQRSHRUN dAcc4, qRes45, #5 ;// Acc = Sat ((Acc + 16) / 32) + VQRSHRUN dAcc6, qRes67, #5 ;// Acc = Sat ((Acc + 16) / 32) + + M_END + + ENDIF + + + END + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s new file mode 100755 index 0000000..89c90aa --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s @@ -0,0 +1,134 @@ +;// +;// +;// File Name: armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// Vertical half-pel filter: loads nine 8-byte rows (dSrc0..dSrc8) and for each of four output rows computes Acc = a+f + 20*(c+d) - 5*(b+e) on the low four lanes, then VQRSHRUN #5 narrows to 8 bits in d0,d2,d4,d6. NOTE(review): d30/d31 (dCoeff5/dCoeff20) are read but never written in this file - presumably the caller preloads 5 and 20; confirm +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + + IF CortexA8 + + M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11 + +;// Declare input registers +pSrc RN 0 +srcStep RN 1 +pDst RN 2 +dstStep RN 3 + +Temp RN 12 + +;// Declare Neon registers +dCoeff5 DN 30.S16 +dCoeff20 DN 31.S16 + +dSrc0 DN 7.U8 +dSrc1 DN 8.U8 +dSrc2 DN 9.U8 +dSrc3 DN 10.U8 +dSrc4 DN 11.U8 +dSrc5 DN 12.U8 +dSrc6 DN 13.U8 +dSrc7 DN 14.U8 +dSrc8 DN 15.U8 + +qSumBE01 QN 8.S16 +qSumCD01 QN 9.S16 +dSumBE0 DN 16.S16 +dSumCD0 DN 18.S16 + +qAcc01 QN 0.S16 +qAcc23 QN 1.S16 +qAcc45 QN 2.S16 +qAcc67 QN 3.S16 + +dRes0 DN 0.S16 +dRes1 DN 2.S16 +dRes2 DN 4.S16 +dRes3 DN 6.S16 + +dAcc0 DN 0.U8 +dAcc1 DN 2.U8 +dAcc2 DN 4.U8 +dAcc3 DN 6.U8 + + +dTmp0 DN 20.S16 +dTmp1 DN 21.S16 +dTmp2 DN 22.S16 +dTmp3 DN 23.S16 + + + VLD1 dSrc0, [pSrc], srcStep ;// [a0 a1 a2 a3 .. ] + ADD Temp, pSrc, srcStep, LSL #2 + VLD1 dSrc1, [pSrc], srcStep ;// [b0 b1 b2 b3 .. ] + ;// One cycle stall + VLD1 dSrc5, [Temp], srcStep + ;// One cycle stall + VLD1 dSrc2, [pSrc], srcStep ;// [c0 c1 c2 c3 .. 
] + VADDL qAcc01, dSrc0, dSrc5 ;// Acc = a+f + VLD1 dSrc3, [pSrc], srcStep + ;// One cycle stall + VLD1 dSrc6, [Temp], srcStep ;// TeRi + + VLD1 dSrc4, [pSrc], srcStep + VLD1 dSrc7, [Temp], srcStep ;// TeRi + VADDL qSumBE01, dSrc1, dSrc4 ;// b+e + VADDL qSumCD01, dSrc2, dSrc3 ;// c+d + VLD1 dSrc8, [Temp], srcStep ;// TeRi + VMLS dRes0, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e) +; VMLA dRes0, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) + VMUL dTmp0, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) + +; VLD1 dSrc6, [Temp], srcStep + VADDL qSumBE01, dSrc2, dSrc5 ;// b+e + VADDL qSumCD01, dSrc3, dSrc4 ;// c+d + VADDL qAcc23, dSrc1, dSrc6 ;// Acc = a+f + VMLS dRes1, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e) +; VMLA dRes1, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) + VMUL dTmp1, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) + +; VLD1 dSrc7, [Temp], srcStep + VADDL qSumBE01, dSrc3, dSrc6 ;// b+e + VADDL qSumCD01, dSrc4, dSrc5 ;// c+d + VADDL qAcc45, dSrc2, dSrc7 ;// Acc = a+f + VMLS dRes2, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e) +; VMLA dRes2, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) + VMUL dTmp2, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) + +; VLD1 dSrc8, [Temp], srcStep ;// [i0 i1 i2 i3 .. 
] + VADDL qSumBE01, dSrc4, dSrc7 ;// b+e + VADDL qAcc67, dSrc3, dSrc8 ;// Acc = a+f + VADDL qSumCD01, dSrc5, dSrc6 ;// c+d + VMLS dRes3, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e) + VADD dRes0, dRes0, dTmp0 + VADD dRes1, dRes1, dTmp1 + VADD dRes2, dRes2, dTmp2 + VMLA dRes3, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) +; VMUL dTmp3, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) +; VADD dRes3, dRes3, dTmp3 + + VQRSHRUN dAcc0, qAcc01, #5 + VQRSHRUN dAcc1, qAcc23, #5 + VQRSHRUN dAcc2, qAcc45, #5 + VQRSHRUN dAcc3, qAcc67, #5 + + M_END + + ENDIF + + + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s new file mode 100755 index 0000000..0f0ec78 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s @@ -0,0 +1,318 @@ +;// +;// +;// File Name: armVCM4P10_Interpolate_Chroma_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 9641 +;// Date: Thursday, February 7, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + + IF CortexA8 + + M_TABLE armVCM4P10_WidthBranchTableMVIsNotZero + + DCD WidthIs2MVIsNotZero, WidthIs2MVIsNotZero + DCD WidthIs4MVIsNotZero, WidthIs4MVIsNotZero + DCD WidthIs8MVIsNotZero + + M_TABLE armVCM4P10_WidthBranchTableMVIsZero + + DCD WidthIs2MVIsZero, WidthIs2MVIsZero + DCD WidthIs4MVIsZero, WidthIs4MVIsZero + DCD WidthIs8MVIsZero + + +;// input registers + +pSrc RN 0 +iSrcStep RN 1 +pDst RN 2 +iDstStep RN 3 +iWidth RN 4 +iHeight RN 5 +dx RN 6 +dy RN 7 + +;// local variable registers +pc RN 15 +return RN 0 +EightMinusdx RN 8 +EightMinusdy RN 9 + +ACoeff RN 12 +BCoeff RN 9 +CCoeff RN 8 +DCoeff RN 6 + +pTable RN 11 + +Step1 RN 10 +SrcStepMinus1 RN 14 + +dACoeff DN D12.U8 +dBCoeff DN D13.U8 +dCCoeff DN D14.U8 +dDCoeff DN D15.U8 + +dRow0a DN D0.U8 +dRow0b DN D1.U8 +dRow1a DN D2.U8 +dRow1b DN D3.U8 + +qRow0a QN Q2.S16 +qRow0b QN Q3.S16 + +;//dIndex DN D16.U8 +qRow1a QN Q11.S16 +qRow1b QN Q12.S16 + +dRow2a DN D16.U8 +dRow2b DN D17.U8 +dRow3a DN D18.U8 +dRow3b DN D19.U8 + +qOutRow2 QN Q11.U16 +qOutRow3 QN Q12.U16 +dOutRow2 DN D20.U8 +dOutRow3 DN D21.U8 +dOutRow2U64 DN D20.U64 +dOutRow3U64 DN D21.U64 + +qOutRow0 QN Q2.U16 +qOutRow1 QN Q3.U16 +dOutRow0 DN D8.U8 +dOutRow1 DN D9.U8 + +dOutRow0U64 DN D8.U64 +dOutRow1U64 DN D9.U64 + +dOutRow0U32 DN D8.U32 +dOutRow1U32 DN D9.U32 + +dOutRow0U16 DN D8.U16 +dOutRow1U16 DN D9.U16 + + +dOut0U64 DN D0.U64 +dOut1U64 DN D1.U64 + +dOut00U32 DN D0.U32 +dOut01U32 DN D1.U32 +dOut10U32 DN D2.U32 +dOut11U32 DN D3.U32 + +dOut0U16 DN D0.U16 +dOut1U16 DN D1.U16 + +;//----------------------------------------------------------------------------------------------- +;// armVCM4P10_Interpolate_Chroma_asm starts +;//----------------------------------------------------------------------------------------------- + + ;// Write function header + M_START armVCM4P10_Interpolate_Chroma, r11, d15 + + ;// Define stack arguments + M_ARG Width, 4 + M_ARG Height, 
4 + M_ARG Dx, 4 + M_ARG Dy, 4 + + ;// Load argument from the stack + ;// M_STALL ARM1136JS=4 + + M_LDRD dx, dy, Dx + M_LDRD iWidth, iHeight, Width + + ;// EightMinusdx = 8 - dx + ;// EightMinusdy = 8 - dy + + ;// ACoeff = EightMinusdx * EightMinusdy + ;// BCoeff = dx * EightMinusdy + ;// CCoeff = EightMinusdx * dy + ;// DCoeff = dx * dy + + RSB EightMinusdx, dx, #8 + RSB EightMinusdy, dy, #8 + CMN dx,dy + MOV Step1, #1 + LDREQ pTable, =armVCM4P10_WidthBranchTableMVIsZero + SUB SrcStepMinus1, iSrcStep, Step1 + LDRNE pTable, =armVCM4P10_WidthBranchTableMVIsNotZero + + VLD1 dRow0a, [pSrc], Step1 ;// 0a + + SMULBB ACoeff, EightMinusdx, EightMinusdy + SMULBB BCoeff, dx, EightMinusdy + VLD1 dRow0b, [pSrc], SrcStepMinus1 ;// 0b + SMULBB CCoeff, EightMinusdx, dy + SMULBB DCoeff, dx, dy + + VDUP dACoeff, ACoeff + VDUP dBCoeff, BCoeff + VDUP dCCoeff, CCoeff + VDUP dDCoeff, DCoeff + + LDR pc, [pTable, iWidth, LSL #1] ;// Branch to the case based on iWidth + +;// Pixel layout: +;// +;// x00 x01 x02 +;// x10 x11 x12 +;// x20 x21 x22 + +;// If fractionl mv is not (0, 0) +WidthIs8MVIsNotZero + + VLD1 dRow1a, [pSrc], Step1 ;// 1a + VMULL qRow0a, dRow0a, dACoeff + VLD1 dRow1b, [pSrc], SrcStepMinus1 ;// 1b + VMULL qRow0b, dRow1a, dACoeff + VLD1 dRow2a, [pSrc], Step1 ;// 2a + VMLAL qRow0a, dRow0b, dBCoeff + VLD1 dRow2b, [pSrc], SrcStepMinus1 ;// 2b + VMULL qRow1a, dRow2a, dACoeff + VMLAL qRow0b, dRow1b, dBCoeff + VLD1 dRow3a, [pSrc], Step1 ;// 3a + VMLAL qRow0a, dRow1a, dCCoeff + VMLAL qRow1a, dRow2b, dBCoeff + VMULL qRow1b, dRow3a, dACoeff + VLD1 dRow3b, [pSrc], SrcStepMinus1 ;// 3b + VMLAL qRow0b, dRow2a, dCCoeff + VLD1 dRow0a, [pSrc], Step1 ;// 0a + VMLAL qRow1b, dRow3b, dBCoeff + VMLAL qRow1a, dRow3a, dCCoeff + VMLAL qRow0a, dRow1b, dDCoeff + VLD1 dRow0b, [pSrc], SrcStepMinus1 ;// 0b + VMLAL qRow1b, dRow0a, dCCoeff + VMLAL qRow0b, dRow2b, dDCoeff + VMLAL qRow1a, dRow3b, dDCoeff + + + SUBS iHeight, iHeight, #4 + VMLAL qRow1b, dRow0b, dDCoeff + + VQRSHRN dOutRow0, qOutRow0, #6 + 
VQRSHRN dOutRow1, qOutRow1, #6 + VQRSHRN dOutRow2, qOutRow2, #6 + VST1 dOutRow0U64, [pDst], iDstStep + VQRSHRN dOutRow3, qOutRow3, #6 + + VST1 dOutRow1U64, [pDst], iDstStep + VST1 dOutRow2U64, [pDst], iDstStep + VST1 dOutRow3U64, [pDst], iDstStep + + + BGT WidthIs8MVIsNotZero + MOV return, #OMX_Sts_NoErr + M_EXIT + +WidthIs4MVIsNotZero + + VLD1 dRow1a, [pSrc], Step1 + VMULL qRow0a, dRow0a, dACoeff + VMULL qRow0b, dRow1a, dACoeff + VLD1 dRow1b, [pSrc], SrcStepMinus1 + VMLAL qRow0a, dRow0b, dBCoeff + VMLAL qRow0b, dRow1b, dBCoeff + VLD1 dRow0a, [pSrc], Step1 + VMLAL qRow0a, dRow1a, dCCoeff + VMLAL qRow0b, dRow0a, dCCoeff + VLD1 dRow0b, [pSrc], SrcStepMinus1 + SUBS iHeight, iHeight, #2 + VMLAL qRow0b, dRow0b, dDCoeff + VMLAL qRow0a, dRow1b, dDCoeff + + VQRSHRN dOutRow1, qOutRow1, #6 + VQRSHRN dOutRow0, qOutRow0, #6 + + VST1 dOutRow0U32[0], [pDst], iDstStep + VST1 dOutRow1U32[0], [pDst], iDstStep + + BGT WidthIs4MVIsNotZero + MOV return, #OMX_Sts_NoErr + M_EXIT + +WidthIs2MVIsNotZero + + VLD1 dRow1a, [pSrc], Step1 + VMULL qRow0a, dRow0a, dACoeff + VMULL qRow0b, dRow1a, dACoeff + VLD1 dRow1b, [pSrc], SrcStepMinus1 + VMLAL qRow0a, dRow0b, dBCoeff + VMLAL qRow0b, dRow1b, dBCoeff + VLD1 dRow0a, [pSrc], Step1 + VMLAL qRow0a, dRow1a, dCCoeff + VMLAL qRow0b, dRow0a, dCCoeff + VLD1 dRow0b, [pSrc], SrcStepMinus1 + SUBS iHeight, iHeight, #2 + VMLAL qRow0b, dRow0b, dDCoeff + VMLAL qRow0a, dRow1b, dDCoeff + + VQRSHRN dOutRow1, qOutRow1, #6 + VQRSHRN dOutRow0, qOutRow0, #6 + + VST1 dOutRow0U16[0], [pDst], iDstStep + VST1 dOutRow1U16[0], [pDst], iDstStep + + BGT WidthIs2MVIsNotZero + MOV return, #OMX_Sts_NoErr + M_EXIT + +;// If fractionl mv is (0, 0) +WidthIs8MVIsZero + SUB pSrc, pSrc, iSrcStep + +WidthIs8LoopMVIsZero + VLD1 dRow0a, [pSrc], iSrcStep + SUBS iHeight, iHeight, #2 + VLD1 dRow0b, [pSrc], iSrcStep + VST1 dOut0U64, [pDst], iDstStep + VST1 dOut1U64, [pDst], iDstStep + BGT WidthIs8LoopMVIsZero + + MOV return, #OMX_Sts_NoErr + M_EXIT + +WidthIs4MVIsZero + VLD1 dRow0b, 
[pSrc], iSrcStep + + SUBS iHeight, iHeight, #2 ;// two output rows are written per iteration + + VST1 dOut00U32[0], [pDst], iDstStep ;// 4 bytes from D0 (same register as dRow0a) + VLD1 dRow0a, [pSrc], iSrcStep + VST1 dOut01U32[0], [pDst], iDstStep ;// 4 bytes from D1 (same register as dRow0b) + + BGT WidthIs4MVIsZero + MOV return, #OMX_Sts_NoErr + M_EXIT + +WidthIs2MVIsZero + VLD1 dRow0b, [pSrc], iSrcStep + SUBS iHeight, iHeight, #2 ;// two output rows are written per iteration + + VST1 dOut0U16[0], [pDst], iDstStep ;// 2 bytes from D0 (same register as dRow0a) + VLD1 dRow0a, [pSrc], iSrcStep + VST1 dOut1U16[0], [pDst], iDstStep ;// 2 bytes from D1 (same register as dRow0b) + + BGT WidthIs2MVIsZero + MOV return, #OMX_Sts_NoErr + M_END + + ENDIF ;// CortexA8 + + END + +;//----------------------------------------------------------------------------------------------- +;// armVCM4P10_Interpolate_Chroma_asm ends +;//----------------------------------------------------------------------------------------------- + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s new file mode 100755 index 0000000..7e2642b --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s @@ -0,0 +1,74 @@ +;// +;// +;// File Name: armVCM4P10_QuantTables_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// Description: +;// This file contains quantization tables +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + + EXPORT armVCM4P10_MFMatrixQPModTable + EXPORT armVCM4P10_QPDivIntraTable + EXPORT armVCM4P10_QPDivPlusOneTable + +;//-------------------------------------------------------------- +;// This table contains armVCM4P10_MFMatrix [iQP % 6][0] entries, +;// for values of iQP from 0 to 51 (inclusive). 
+;//-------------------------------------------------------------- + + M_TABLE armVCM4P10_MFMatrixQPModTable + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + DCW 13107, 11916, 10082, 9362, 8192, 7282 + +;//--------------------------------------------------------------- +;// This table contains ARM_M4P10_Q_OFFSET + 1 + (iQP / 6) values, +;// for values of iQP from 0 to 51 (inclusive). +;//--------------------------------------------------------------- + + M_TABLE armVCM4P10_QPDivPlusOneTable + DCB 16, 16, 16, 16, 16, 16 + DCB 17, 17, 17, 17, 17, 17 + DCB 18, 18, 18, 18, 18, 18 + DCB 19, 19, 19, 19, 19, 19 + DCB 20, 20, 20, 20, 20, 20 + DCB 21, 21, 21, 21, 21, 21 + DCB 22, 22, 22, 22, 22, 22 + DCB 23, 23, 23, 23, 23, 23 + DCB 24, 24, 24, 24, 24, 24 + +;//------------------------------------------------------------------ +;// This table contains (1 << QbitsPlusOne) / 3 Values (Intra case) , +;// for values of iQP from 0 to 51 (inclusive). +;//------------------------------------------------------------------ + + M_TABLE armVCM4P10_QPDivIntraTable, 2 + DCD 21845, 21845, 21845, 21845, 21845, 21845 + DCD 43690, 43690, 43690, 43690, 43690, 43690 + DCD 87381, 87381, 87381, 87381, 87381, 87381 + DCD 174762, 174762, 174762, 174762, 174762, 174762 + DCD 349525, 349525, 349525, 349525, 349525, 349525 + DCD 699050, 699050, 699050, 699050, 699050, 699050 + DCD 1398101, 1398101, 1398101, 1398101, 1398101, 1398101 + DCD 2796202, 2796202, 2796202, 2796202, 2796202, 2796202 + DCD 5592405, 5592405, 5592405, 5592405, 5592405, 5592405 + + + END +
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s new file mode 100755 index 0000000..ee9c339 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s @@ -0,0 +1,186 @@ +;// +;// +;// File Name: armVCM4P10_TransformResidual4x4_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// +;// Description: +;// Transform Residual 4x4 Coefficients +;// +;// + + +;// Include standard headers + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + +;// Import symbols required from other files +;// (For example tables) + + + + +;// Set debugging level +;//DEBUG_ON SETL {TRUE} + + + +;// Guarding implementation by the processor name + + + + + + + + +;// Guarding implementation by the processor name + + IF CortexA8 + +;// ARM Registers + +;//Input Registers +pDst RN 0 +pSrc RN 1 + + +;// Neon Registers + +;// Packed Input pixels +dIn0 DN D0.S16 +dIn1 DN D1.S16 +dIn2 DN D2.S16 +dIn3 DN D3.S16 + +;// Intermediate calculations +dZero DN D4.S16 +de0 DN D5.S16 +de1 DN D6.S16 +de2 DN D7.S16 +de3 DN D8.S16 +dIn1RS DN D7.S16 +dIn3RS DN D8.S16 +df0 DN D0.S16 +df1 DN D1.S16 +df2 DN D2.S16 +df3 DN D3.S16 +qf01 QN Q0.32 +qf23 QN Q1.32 +dg0 DN D5.S16 +dg1 DN D6.S16 +dg2 DN D7.S16 +dg3 DN D8.S16 +df1RS DN D7.S16 +df3RS DN D8.S16 + +;// Output pixels +dh0 DN D0.S16 +dh1 DN D1.S16 +dh2 DN D2.S16 +dh3 DN D3.S16 + + + ;// Allocate stack memory required by the function + + + ;// Write function header + M_START armVCM4P10_TransformResidual4x4, ,d8 + + ;****************************************************************** + ;// The strategy used in implementing the transform is as follows:* + ;// Load the 4x4 
block into 8 registers * + ;// Transpose the 4x4 matrix * + ;// Perform the row operations (on columns) using SIMD * + ;// Transpose the 4x4 result matrix * + ;// Perform the column operations * + ;// Store the 4x4 block at one go * + ;****************************************************************** + + ;// Load all the 4x4 pixels in transposed form + + VLD4 {dIn0,dIn1,dIn2,dIn3},[pSrc] + + VMOV dZero,#0 ;// Used to right shift by 1 (via halving add with zero) + + + ;**************************************** + ;// Row Operations (Performed on columns) + ;**************************************** + + + VADD de0,dIn0,dIn2 ;// e0 = d0 + d2 + VSUB de1,dIn0,dIn2 ;// e1 = d0 - d2 + VHADD dIn1RS,dIn1,dZero ;// (d1>>1): halving add with dZero, a register holding 0 + VHADD dIn3RS,dIn3,dZero ;// (d3>>1) + VSUB de2,dIn1RS,dIn3 ;// e2 = (d1>>1) - d3 + VADD de3,dIn1,dIn3RS ;// e3 = d1 + (d3>>1) + VADD df0,de0,de3 ;// f0 = e0 + e3 + VADD df1,de1,de2 ;// f1 = e1 + e2 + VSUB df2,de1,de2 ;// f2 = e1 - e2 + VSUB df3,de0,de3 ;// f3 = e0 - e3 + + + + ;***************************************************************** + ;// Transpose the resultant matrix + ;***************************************************************** + + VTRN df0,df1 + VTRN df2,df3 + VTRN qf01,qf23 + + + ;******************************* + ;// Column Operations + ;******************************* + + + VADD dg0,df0,df2 ;// g0 = f0 + f2 + VSUB dg1,df0,df2 ;// g1 = f0 - f2 + VHADD df1RS,df1,dZero ;// (f1>>1): halving add with dZero, a register holding 0 + VHADD df3RS,df3,dZero ;// (f3>>1) + VSUB dg2,df1RS,df3 ;// g2 = (f1>>1) - f3 + VADD dg3,df1,df3RS ;// g3 = f1 + (f3>>1) + VADD dh0,dg0,dg3 ;// h0 = g0 + g3 + VADD dh1,dg1,dg2 ;// h1 = g1 + g2 + VSUB dh2,dg1,dg2 ;// h2 = g1 - g2 + VSUB dh3,dg0,dg3 ;// h3 = g0 - g3 + + + ;************************************************ + ;// Calculate final value (colOp[i][j] + 32)>>6 + ;************************************************ + + VRSHR dh0,#6 + VRSHR dh1,#6 + VRSHR dh2,#6 + VRSHR dh3,#6 + + + ;*************************** + ;// Store all the 4x4 
pixels + ;*************************** + + VST1 {dh0,dh1,dh2,dh3},[pDst] + + + ;// Set return value + +End + + + ;// Write function tail + M_END + + ENDIF ;//CortexA8 + + END
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s new file mode 100755 index 0000000..4c52e22 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s @@ -0,0 +1,92 @@ +;// +;// +;// File Name: armVCM4P10_UnpackBlock4x4_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + +;// Define the processor variants supported by this file + + M_VARIANTS ARM1136JS + + + IF ARM1136JS + +;//-------------------------------------- +;// Input Arguments and their scope/usage +;//-------------------------------------- +ppSrc RN 0 ;// Persistent variable +pDst RN 1 ;// Persistent variable + +;//-------------------------------- +;// Variables and their scope/usage +;//-------------------------------- +pSrc RN 2 ;// Persistent variables +Flag RN 3 +Value RN 4 +Value2 RN 5 +strOffset RN 6 +cstOffset RN 7 + + + M_START armVCM4P10_UnpackBlock4x4, r7 + + LDR pSrc, [ppSrc] ;// Load pSrc + MOV cstOffset, #31 ;// To be used in the loop, to compute offset + + ;//----------------------------------------------------------------------- + ; Firstly, fill all the coefficient values on the <pDst> buffer by zero + ;//----------------------------------------------------------------------- + + MOV Value, #0 ;// Initialize the zero value + MOV Value2, #0 ;// Initialize the zero value + LDRB Flag, [pSrc], #1 ;// Preload <Flag> before <unpackLoop> + + STRD Value, [pDst, #0] ;// pDst[0] = pDst[1] = pDst[2] = pDst[3] = 0 + STRD Value, [pDst, #8] ;// pDst[4] = pDst[5] = pDst[6] = pDst[7] = 0 + STRD Value, [pDst, #16] ;// pDst[8] = pDst[9] = pDst[10] = pDst[11] = 0 + STRD Value, [pDst, #24] ;// 
pDst[12] = pDst[13] = pDst[14] = pDst[15] = 0 + + ;//---------------------------------------------------------------------------- + ;// The loop below parses and unpacks the input stream. The C-model has + ;// a somewhat complicated logic for sign extension. But in the v6 version, + ;// that can be easily taken care of by loading the data from <pSrc> stream as + ;// SIGNED byte/halfword. So, based on the first TST instruction, 8-bits or + ;// 16-bits are read. + ;// + ;// Next, to compute the offset, where the unpacked value needs to be stored, + ;// we modify the computation to perform [(Flag & 15) << 1] as [(Flag << 1) & 31] + ;// This results in a saving of one cycle. + ;//---------------------------------------------------------------------------- + +unpackLoop + TST Flag, #0x10 ;// Computing (Flag & 0x10): set => 16-bit value follows, clear => 8-bit + LDRSBNE Value2,[pSrc,#1] ;// Sign-extended high byte; loaded byte wise to avoid unaligned access + LDRBNE Value, [pSrc], #2 ;// Zero-extended low byte; pSrc advances past both bytes + AND strOffset, cstOffset, Flag, LSL #1 ;// strOffset = (Flag << 1) & 31, i.e. (Flag & 15) << 1 + LDRSBEQ Value, [pSrc], #1 ;// Value = sign-extended 8-bit *pSrc++ + ORRNE Value,Value,Value2, LSL #8 ;// Value = low byte | (signed high byte << 8) + + TST Flag, #0x20 ;// Computing (Flag & 0x20) to check, if we're done + LDRBEQ Flag, [pSrc], #1 ;// Flag = (OMX_U8) *pSrc++, for next iteration + STRH Value, [pDst, strOffset] ;// Store low halfword of <Value> at byte offset <strOffset> + BEQ unpackLoop ;// Branch to the loop beginning + + STR pSrc, [ppSrc] ;// Update the bitstream pointer + M_END + + ENDIF + + + + END +
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c new file mode 100755 index 0000000..40d4d5e --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c @@ -0,0 +1,88 @@ +/* ---------------------------------------------------------------- + * + * + * File Name: omxVCM4P10_DeblockChroma_I.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * H.264 intra chroma deblock + * + */ + +#include "omxtypes.h" +#include "armOMX.h" +#include "omxVC.h" + +#include "armCOMM.h" +#include "armVC.h" + +/** + * Function: omxVCM4P10_DeblockChroma_I + * + * Description: + * Performs deblocking filtering on all edges of the chroma macroblock (16x16). + * + * Remarks: + * + * Parameters: + * [in] pSrcDst pointer to the input macroblock. Must be 8-byte aligned. + * [in] srcdstStep Step of the arrays + * [in] pAlpha pointer to a 2x2 array of alpha thresholds, organized as follows: { external + * vertical edge, internal vertical edge, external + * horizontal edge, internal horizontal edge } + * [in] pBeta pointer to a 2x2 array of beta thresholds, organized as follows: { external + * vertical edge, internal vertical edge, external horizontal edge, + * internal horizontal edge } + * [in] pThresholds Array of size 8x2 of Thresholds (TC0) (values for the left or + * above edge of each 4x2 or 2x4 block, arranged in vertical block order + * and then in horizontal block order) + * [in] pBS array of size 16x2 of BS parameters (arranged in scan block order for vertical edges and then horizontal edges); + * valid in the range [0,4] with the following restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^1]== 4. 
Must be 4-byte aligned. + * [out] pSrcDst pointer to filtered output macroblock + * + * Return Value: + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - Any of the pointers pSrcDst, pAlpha, pBeta, pThresholds, or pBS is NULL. + * - pSrcDst is not 8-byte aligned. + * - either pThresholds or pBS is not 4-byte aligned. + * - pBS is out of range, i.e., one of the following conditions is true: pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && pBS[i^1]!=4) for 0<=i<=3. + * - srcdstStep is not a multiple of 8. + * + */ +OMXResult omxVCM4P10_DeblockChroma_I( + OMX_U8* pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8* pAlpha, + const OMX_U8* pBeta, + const OMX_U8* pThresholds, + const OMX_U8 *pBS +) +{ + OMXResult errorCode; + + armRetArgErrIf(pSrcDst == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(armNot8ByteAligned(pSrcDst), OMX_Sts_BadArgErr); + armRetArgErrIf(srcdstStep & 7, OMX_Sts_BadArgErr); + armRetArgErrIf(pAlpha == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(pBeta == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(pThresholds == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(armNot4ByteAligned(pThresholds), OMX_Sts_BadArgErr); + armRetArgErrIf(pBS == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(armNot4ByteAligned(pBS), OMX_Sts_BadArgErr); + + errorCode = omxVCM4P10_FilterDeblockingChroma_VerEdge_I( + pSrcDst, srcdstStep, pAlpha, pBeta, pThresholds, pBS); /* vertical edges are filtered first */ + + armRetArgErrIf(errorCode != OMX_Sts_NoErr, errorCode) + + errorCode = omxVCM4P10_FilterDeblockingChroma_HorEdge_I( + pSrcDst, srcdstStep, pAlpha+2, pBeta+2, pThresholds+8, pBS+16); /* horizontal-edge entries follow the vertical-edge entries in each array */ + + return errorCode; +} diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c new file mode 100755 index 0000000..619365f --- /dev/null +++ 
b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c @@ -0,0 +1,91 @@ +/* 
---------------------------------------------------------------- + * + * + * File Name: omxVCM4P10_DeblockLuma_I.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * H.264 luma deblock + * + */ + +#include "omxtypes.h" +#include "armOMX.h" +#include "omxVC.h" + +#include "armCOMM.h" +#include "armVC.h" + + +/** + * Function: omxVCM4P10_DeblockLuma_I + * + * Description: + * This function performs deblock filtering the horizontal and vertical edges of a luma macroblock + *(16x16). + * + * Remarks: + * + * Parameters: + * [in] pSrcDst pointer to the input macroblock. Must be 8-byte aligned. + * [in] srcdstStep image width + * [in] pAlpha pointer to a 2x2 table of alpha thresholds, organized as follows: { external + * vertical edge, internal vertical edge, external horizontal + * edge, internal horizontal edge } + * [in] pBeta pointer to a 2x2 table of beta thresholds, organized as follows: { external + * vertical edge, internal vertical edge, external horizontal edge, + * internal horizontal edge } + * [in] pThresholds pointer to a 16x2 table of threshold (TC0), organized as follows: { values for + * the left or above edge of each 4x4 block, arranged in vertical block order + * and then in horizontal block order) + * [in] pBS pointer to a 16x2 table of BS parameters arranged in scan block order for vertical edges and then horizontal edges; + * valid in the range [0,4] with the following restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^1]== 4. Must be 4-byte aligned. + * [out] pSrcDst pointer to filtered output macroblock. + * + * Return Value: + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - Either of the pointers in pSrcDst, pAlpha, pBeta, pTresholds or pBS is NULL. + * - pSrcDst is not 8-byte aligned. 
+ * - srcdstStep is not a multiple of 8 + * - pBS is out of range, i.e., one of the following conditions is true: pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && pBS[i^1]!=4) for 0<=i<=3. +. + * + */ + +OMXResult omxVCM4P10_DeblockLuma_I( + OMX_U8* pSrcDst, + OMX_S32 srcdstStep, + const OMX_U8* pAlpha, + const OMX_U8* pBeta, + const OMX_U8* pThresholds, + const OMX_U8 *pBS +) +{ + OMXResult errorCode; + + armRetArgErrIf(pSrcDst == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(armNot8ByteAligned(pSrcDst), OMX_Sts_BadArgErr); + armRetArgErrIf(srcdstStep & 7, OMX_Sts_BadArgErr); + armRetArgErrIf(pAlpha == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(pBeta == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(pThresholds == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(armNot4ByteAligned(pThresholds), OMX_Sts_BadArgErr); + armRetArgErrIf(pBS == NULL, OMX_Sts_BadArgErr); + armRetArgErrIf(armNot4ByteAligned(pBS), OMX_Sts_BadArgErr); + + errorCode = omxVCM4P10_FilterDeblockingLuma_VerEdge_I( + pSrcDst, srcdstStep, pAlpha, pBeta, pThresholds, pBS); + + armRetArgErrIf(errorCode != OMX_Sts_NoErr, errorCode) + + errorCode = omxVCM4P10_FilterDeblockingLuma_HorEdge_I( + pSrcDst, srcdstStep, pAlpha+2, pBeta+2, pThresholds+16, pBS+16); + + return errorCode; +} diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c new file mode 100755 index 0000000..4e871bf --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c @@ -0,0 +1,62 @@ +/* ---------------------------------------------------------------- + * + * + * File Name: omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + * + * + * H.264 decode coefficients module + * + */ + +#include "omxtypes.h" +#include "armOMX.h" +#include "omxVC.h" + +#include "armCOMM.h" +#include "armVC.h" + +/** + * Function: omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC + * + * Description: + * Performs CAVLC decoding and inverse raster scan for 2x2 block of + * ChromaDCLevel. The decoded coefficients in packed position-coefficient + * buffer are stored in increasing raster scan order, namely position order. + * + * Remarks: + * + * Parameters: + * [in] ppBitStream Double pointer to current byte in bit stream + * buffer + * [in] pOffset Pointer to current bit position in the byte + * pointed to by *ppBitStream + * [out] ppBitStream *ppBitStream is updated after each block is decoded + * [out] pOffset *pOffset is updated after each block is decoded + * [out] pNumCoeff Pointer to the number of nonzero coefficients + * in this block + * [out] ppPosCoefbuf Double pointer to destination residual + * coefficient-position pair buffer + * + * Return Value: + * Standard omxError result. See enumeration for possible result codes. 
+ * + */ + +OMXResult omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC ( + const OMX_U8** ppBitStream, + OMX_S32* pOffset, + OMX_U8* pNumCoeff, + OMX_U8** ppPosCoefbuf + ) + +{ + return armVCM4P10_DecodeCoeffsToPair(ppBitStream, pOffset, pNumCoeff, + ppPosCoefbuf, 17, 4); + +} diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c new file mode 100755 index 0000000..b29e576 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c @@ -0,0 +1,68 @@ +/* ---------------------------------------------------------------- + * + * + * File Name: omxVCM4P10_DecodeCoeffsToPairCAVLC.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * H.264 decode coefficients module + * + */ + +#include "omxtypes.h" +#include "armOMX.h" +#include "omxVC.h" + +#include "armCOMM.h" +#include "armVC.h" + +/** + * Function: omxVCM4P10_DecodeCoeffsToPairCAVLC + * + * Description: + * Performs CAVLC decoding and inverse zigzag scan for 4x4 block of + * Intra16x16DCLevel, Intra16x16ACLevel,LumaLevel, and ChromaACLevel. + * Inverse field scan is not supported. The decoded coefficients in packed + * position-coefficient buffer are stored in increasing zigzag order instead + * of position order. + * + * Remarks: + * + * Parameters: + * [in] ppBitStream Double pointer to current byte in bit stream buffer + * [in] pOffset Pointer to current bit position in the byte pointed + * to by *ppBitStream + * [in] sMaxNumCoeff Maximum number of non-zero coefficients in current + * block + * [in] sVLCSelect VLC table selector, obtained from number of non-zero + * AC coefficients of above and left 4x4 blocks. 
It is + * equivalent to the variable nC described in H.264 standard + * table 9-5, except its value can't be less than zero. + * [out] ppBitStream *ppBitStream is updated after each block is decoded + * [out] pOffset *pOffset is updated after each block is decoded + * [out] pNumCoeff Pointer to the number of nonzero coefficients in + * this block + * [out] ppPosCoefbuf Double pointer to destination residual + * coefficient-position pair buffer + * Return Value: + * Standard omxError result. See enumeration for possible result codes. + * + */ + +OMXResult omxVCM4P10_DecodeCoeffsToPairCAVLC( + const OMX_U8** ppBitStream, + OMX_S32* pOffset, + OMX_U8* pNumCoeff, + OMX_U8**ppPosCoefbuf, + OMX_INT sVLCSelect, + OMX_INT sMaxNumCoeff + ) +{ + return armVCM4P10_DecodeCoeffsToPair(ppBitStream, pOffset, pNumCoeff, + ppPosCoefbuf, sVLCSelect, sMaxNumCoeff); +} diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s new file mode 100755 index 0000000..485a488 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s @@ -0,0 +1,396 @@ +;// +;// +;// File Name: omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// +;// +;// Description: +;// H.264 inverse quantize and transform module +;// +;// + + + +;// Include standard headers + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + +;// Import symbols required from other files +;// (For example tables) + + IMPORT armVCM4P10_UnpackBlock4x4 + IMPORT armVCM4P10_TransformResidual4x4 + IMPORT armVCM4P10_QPDivTable + IMPORT armVCM4P10_VMatrixU16 + IMPORT armVCM4P10_QPModuloTable + + M_VARIANTS CortexA8 + +;// Set debugging level +;//DEBUG_ON SETL {TRUE} + + +;// Static Function: armVCM4P10_DequantLumaAC4x4 + +;// Guarding implementation by the processor name + + + +;// Guarding implementation by the processor name + + + + + + +;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd + +;// Guarding implementation by the processor name + + + +;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd + +;// Guarding implementation by the processor name + + IF CortexA8 + + +;// ARM Registers + +;//Input Registers +ppSrc RN 0 +pPred RN 1 +pDC RN 2 +pDst RN 3 + + +;//Output Registers +result RN 0 + +;//Local Scratch Registers + +;//Registers used in armVCM4P10_DequantLumaAC4x4 +pQPdiv RN 10 +pQPmod RN 11 +pVRow RN 2 +QPmod RN 12 +shift RN 14 +index0 RN 1 +index1 RN 10 + +;//Registers used in DequantTransformResidualFromPairAndAdd +pDelta RN 4 +pDeltaTmp RN 6 +AC RN 5 ;//Load from stack +pPredTemp RN 7 +pDCTemp RN 8 +pDstTemp RN 9 +pDeltaArg1 RN 1 +pDeltaArg0 RN 0 +QP RN 1 ;//Load from stack +DCval RN 10 +predstep RN 1 +dstStep RN 10 +PredVal1 RN 3 +PredVal2 RN 5 + + + + +;// Neon Registers + +;// Registers used in armVCM4P10_DequantLumaAC4x4 + +dVmatrix DN D6.8 +dindexRow0 DN D7.32 +dindexRow1 DN D9.32 +dByteIndexRow0 DN D7.8 +dByteIndexRow1 DN D9.8 +dVRow0 DN D8.8 +dVRow1 DN D4.8 +dVRow0U16 DN D8.U16 +dVRow1U16 DN D4.U16 +dVRow2U16 DN D8.U16 +dVRow3U16 DN D4.U16 + +dShift DN D5.U16 +dSrcRow0 DN D0.I16 +dSrcRow1 DN D1.I16 +dSrcRow2 DN D2.I16 +dSrcRow3 DN D3.I16 +dDqntRow0 DN D0.I16 +dDqntRow1 DN D1.I16 +dDqntRow2 DN 
D2.I16 +dDqntRow3 DN D3.I16 + +;// Registers used in TransformResidual4x4 + +;// Packed Input pixels +dIn0 DN D0.S16 +dIn1 DN D1.S16 +dIn2 DN D2.S16 +dIn3 DN D3.S16 +qIn01 QN Q0.32 +qIn23 QN Q1.32 + +;// Intermediate calculations +dZero DN D4.S16 +de0 DN D5.S16 +de1 DN D6.S16 +de2 DN D7.S16 +de3 DN D8.S16 +dIn1RS DN D7.S16 +dIn3RS DN D8.S16 +df0 DN D0.S16 +df1 DN D1.S16 +df2 DN D2.S16 +df3 DN D3.S16 +qf01 QN Q0.32 +qf23 QN Q1.32 +dg0 DN D5.S16 +dg1 DN D6.S16 +dg2 DN D7.S16 +dg3 DN D8.S16 +df1RS DN D7.S16 +df3RS DN D8.S16 + +;// Output pixels +dh0 DN D0.S16 +dh1 DN D1.S16 +dh2 DN D2.S16 +dh3 DN D3.S16 + +;// Registers used in DequantTransformResidualFromPairAndAdd + +dDeltaRow0 DN D0.S16 +dDeltaRow1 DN D1.S16 +dDeltaRow2 DN D2.S16 +dDeltaRow3 DN D3.S16 +qDeltaRow01 QN Q0.S16 +qDeltaRow23 QN Q1.S16 + +dPredValRow01 DN D4.U8 +dPredValRow23 DN D5.U8 + +qSumRow01 QN Q3.S16 +qSumRow23 QN Q4.S16 +dDstRow01 DN D0.U8 +dDstRow23 DN D1.U8 +dDstRow0 DN D0.32[0] +dDstRow1 DN D0.32[1] +dDstRow2 DN D1.32[0] +dDstRow3 DN D1.32[1] + + + ;// Allocate stack memory required by the function + M_ALLOC8 pBuffer, 32 + + + ;// Write function header + M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9 + + ;// Define stack arguments + M_ARG predStepOnStack, 4 + M_ARG dstStepOnStack,4 + M_ARG QPOnStack, 4 + M_ARG ACOnStack,4 + + + M_ADR pDelta,pBuffer + M_LDR AC,ACOnStack + + + ;// Save registers r1,r2,r3 before function call + MOV pPredTemp,pPred + MOV pDCTemp,pDC + MOV pDstTemp,pDst + + CMP AC,#0 + BEQ DCcase + MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4 + + BL armVCM4P10_UnpackBlock4x4 + + ;//-------------------------------------------------------- + ;// armVCM4P10_DequantLumaAC4x4 : static function inlined + ;//-------------------------------------------------------- + + ;//BL armVCM4P10_DequantLumaAC4x4 + M_LDR QP,QPOnStack ;// Set up r1 for armVCM4P10_DequantLumaAC4x4 + + LDR pQPmod,=armVCM4P10_QPModuloTable + LDR pQPdiv,=armVCM4P10_QPDivTable + LDR 
pVRow,=armVCM4P10_VMatrixU16 + + + LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6 + LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6 + + LDR index1,=0x03020504 + LDR index0,=0x05040100 ;// Indexes into dVmatrix + ADD pVRow,pVRow,QPmod + VDUP dindexRow0,index0 + VDUP dindexRow1,index1 + VDUP dShift,shift + + ;// Load all 4x4 pVRow[] values + VLD1 dVmatrix,[pVRow] ;// dVmatrix = [0d|0c|0b|0a] + + + VTBL dVRow0,dVmatrix,dByteIndexRow0 ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]] + VTBL dVRow1,dVmatrix,dByteIndexRow1 ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]] + CMP pDCTemp,#0 + ;// Load all the 4x4 'src' values + VLD1 { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta] + + VSHL dVRow0U16,dVRow0U16,dShift + VSHL dVRow1U16,dVRow1U16,dShift + LDRSHNE DCval,[pDCTemp] + + + ;// Multiply src[] with pVRow[] + VMUL dDqntRow0,dSrcRow0,dVRow0U16 + VMUL dDqntRow1,dSrcRow1,dVRow1U16 + VMUL dDqntRow2,dSrcRow2,dVRow2U16 + VMUL dDqntRow3,dSrcRow3,dVRow3U16 + + + + ;//------------------------------------------------------------- + ;// TransformResidual4x4 : Inlined to avoid Load/Stores + ;//------------------------------------------------------------- + + + ;//BL armVCM4P10_TransformResidual4x4 + ;//STRHNE DCval,[pDelta] + VMOVNE dIn0[0],DCval + + + + ;//***************************************************************** + ;// Transpose the input pixels : perform Row ops as Col ops + ;//***************************************************************** + + VTRN dIn0,dIn1 + VTRN dIn2,dIn3 + VTRN qIn01,qIn23 + + + VMOV dZero,#0 ;// Used to right shift by 1 + + + ;//**************************************** + ;// Row Operations (Performed on columns) + ;//**************************************** + + + VADD de0,dIn0,dIn2 ;// e0 = d0 + d2 + VSUB de1,dIn0,dIn2 ;// e1 = d0 - d2 + VHADD dIn1RS,dIn1,dZero ;// (f1>>1) constZero is a register holding 0 + VHADD dIn3RS,dIn3,dZero + VSUB de2,dIn1RS,dIn3 ;// e2 = (d1>>1) - d3 + VADD de3,dIn1,dIn3RS ;// e3 = d1 + (d3>>1) + VADD 
df0,de0,de3 ;// f0 = e0 + e3 + VADD df1,de1,de2 ;// f1 = e1 + e2 + VSUB df2,de1,de2 ;// f2 = e1 - e2 + VSUB df3,de0,de3 ;// f3 = e0 - e3 + + + + ;//***************************************************************** + ;// Transpose the resultant matrix + ;//***************************************************************** + + VTRN df0,df1 + VTRN df2,df3 + VTRN qf01,qf23 + + + ;//******************************* + ;// Coloumn Operations + ;//******************************* + + + VADD dg0,df0,df2 ;// e0 = d0 + d2 + VSUB dg1,df0,df2 ;// e1 = d0 - d2 + VHADD df1RS,df1,dZero ;// (f1>>1) constZero is a register holding 0 + VHADD df3RS,df3,dZero + VSUB dg2,df1RS,df3 ;// e2 = (d1>>1) - d3 + VADD dg3,df1,df3RS ;// e3 = d1 + (d3>>1) + VADD dh0,dg0,dg3 ;// f0 = e0 + e3 + VADD dh1,dg1,dg2 ;// f1 = e1 + e2 + VSUB dh2,dg1,dg2 ;// f2 = e1 - e2 + VSUB dh3,dg0,dg3 ;// f3 = e0 - e3 + + + ;//************************************************ + ;// Calculate final value (colOp[i][j] + 32)>>6 + ;//************************************************ + + VRSHR dh0,#6 + VRSHR dh1,#6 + VRSHR dh2,#6 + VRSHR dh3,#6 + + + B OutDCcase + + +DCcase + ;// Calculate the Transformed DCvalue : (DCval+32)>>6 + LDRSH DCval,[pDCTemp] + ADD DCval,DCval,#32 + ASR DCval,DCval,#6 + + VDUP dDeltaRow0, DCval ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval + VDUP dDeltaRow1, DCval ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval + VDUP dDeltaRow2, DCval ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval + VDUP dDeltaRow3, DCval + + +OutDCcase + M_LDR predstep,predStepOnStack + M_LDR dstStep,dstStepOnStack + + LDR PredVal1,[pPredTemp],predstep + LDR PredVal2,[pPredTemp],predstep + VMOV dPredValRow01,PredVal1,PredVal2 + + LDR PredVal1,[pPredTemp],predstep + LDR PredVal2,[pPredTemp] + VMOV dPredValRow23,PredVal1,PredVal2 + + + VADDW qSumRow01,qDeltaRow01,dPredValRow01 + VADDW qSumRow23,qDeltaRow23,dPredValRow23 + VQMOVUN dDstRow01,qSumRow01 + VQMOVUN dDstRow23,qSumRow23 + + + VST1 
dDstRow0,[pDstTemp],dstStep + VST1 dDstRow1,[pDstTemp],dstStep + VST1 dDstRow2,[pDstTemp],dstStep + VST1 dDstRow3,[pDstTemp] + + ;// Set return value + MOV result,#OMX_Sts_NoErr + +End + + + ;// Write function tail + + M_END + + ENDIF ;//CORTEXA8 + + + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s new file mode 100644 index 0000000..4606197 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s @@ -0,0 +1,202 @@ +;// +;// +;// File Name: omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + IF CortexA8 + + IMPORT armVCM4P10_DeblockingChromabSGE4_unsafe + IMPORT armVCM4P10_DeblockingChromabSLT4_unsafe + +LOOP_COUNT EQU 0x40000000 ;// XY seed: doubled by ADDS once per LoopY pass, it reaches zero on the second doubling, so LoopY runs twice +MASK_3 EQU 0x03030303 ;// byte-replicated mask TSTed against the four packed bS bytes before the bSLT4 helper call +MASK_4 EQU 0x04040404 ;// byte-replicated mask TSTed against the four packed bS bytes before the bSGE4 helper call + +;// Function arguments + +pSrcDst RN 0 +srcdstStep RN 1 +pAlpha RN 2 +pBeta RN 3 + +pThresholds RN 5 +pBS RN 4 +bS3210 RN 6 ;// four bS bytes loaded as one word + +;// Loop + +XY RN 7 + +;// Pixels +dP_0 DN D4.U8 +dP_1 DN D5.U8 +dP_2 DN D6.U8 +dQ_0 DN D8.U8 +dQ_1 DN D9.U8 +dQ_2 DN D10.U8 + +;// Filtering Decision +dAlpha DN D0.U8 +dBeta DN D2.U8 + +dFilt DN D16.U8 +dAqflg DN D12.U8 +dApflg DN D17.U8 + +dAp0q0 DN D13.U8 +dAp1p0 DN D12.U8 +dAq1q0 DN D18.U8 +dAp2p0 DN D19.U8 +dAq2q0 DN D17.U8 + +qBS3210 QN Q13.U16 +dBS3210 DN D26 +dMask_bs DN D27 +dFilt_bs DN D26.U16 + +;// bSLT4 +dMask_0 DN D14.U8 +dMask_1 DN D15.U8 +dMask_4 DN D1.U16 + +Mask_4 RN 8 +Mask_3 RN 9 + +dTemp DN D19.U8 + +;// Result +dP_0t DN D13.U8 +dQ_0t DN D31.U8 + +dP_0n DN D29.U8 +dQ_0n DN D24.U8 + + + ;// Function header + M_START 
omxVCM4P10_FilterDeblockingChroma_HorEdge_I, r9, d15 + + ;//Arguments on the stack + M_ARG ppThresholds, 4 + M_ARG ppBS, 4 + + ;// d0-dAlpha_0 + ;// d2-dBeta_0 + + ;load alpha1,beta1 somewhere to avoid more loads + VLD1 {dAlpha[]}, [pAlpha]! + SUB pSrcDst, pSrcDst, srcdstStep, LSL #1 ;? + SUB pSrcDst, pSrcDst, srcdstStep ;// with the SUB above: back up three rows so LoopY's first load fetches dP_2 + VLD1 {dBeta[]}, [pBeta]! + + M_LDR pBS, ppBS + M_LDR pThresholds, ppThresholds + + LDR Mask_3, =MASK_3 + LDR Mask_4, =MASK_4 + + VMOV dMask_0, #0 + VMOV dMask_1, #1 + VMOV dMask_4, #4 + + LDR XY, =LOOP_COUNT + + ;// p0-p3 - d4-d7 + ;// q0-q3 - d8-d11 +LoopY + LDR bS3210, [pBS], #8 ;// four packed bS bytes; pBS then advances by 8 + + VLD1 dP_2, [pSrcDst], srcdstStep + ;1 + VLD1 dP_1, [pSrcDst], srcdstStep + CMP bS3210, #0 + VLD1 dP_0, [pSrcDst], srcdstStep + ;1 + VLD1 dQ_0, [pSrcDst], srcdstStep + VABD dAp2p0, dP_2, dP_0 + VLD1 dQ_1, [pSrcDst], srcdstStep + VABD dAp0q0, dP_0, dQ_0 + VLD1 dQ_2, [pSrcDst], srcdstStep + BEQ NoFilterBS0 ;// flags from CMP bS3210, #0: all four bS bytes are zero + + VABD dAp1p0, dP_1, dP_0 + VABD dAq1q0, dQ_1, dQ_0 + + VCGT dFilt, dAlpha, dAp0q0 + VMOV.U32 dBS3210[0], bS3210 + VMAX dAp1p0, dAq1q0, dAp1p0 + VMOVL qBS3210, dBS3210.U8 ;// widen bS bytes to 16-bit lanes: each bS value then spans two pixel lanes + VABD dAq2q0, dQ_2, dQ_0 + VCGT dMask_bs.S16, dBS3210.S16, #0 + + VCGT dAp1p0, dBeta, dAp1p0 + VCGT dAp2p0, dBeta, dAp2p0 + + VAND dFilt, dMask_bs.U8 + + TST bS3210, Mask_3 + + VCGT dAq2q0, dBeta, dAq2q0 + VAND dFilt, dFilt, dAp1p0 + + VAND dAqflg, dFilt, dAq2q0 + VAND dApflg, dFilt, dAp2p0 + + ;// bS < 4 Filtering + BLNE armVCM4P10_DeblockingChromabSLT4_unsafe + + TST bS3210, Mask_4 + + SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 ;// six post-incremented loads, four rows back: the dP_0 row, where the stores begin + VTST dFilt_bs, dFilt_bs, dMask_4 + + ;// bS == 4 Filtering + BLNE armVCM4P10_DeblockingChromabSGE4_unsafe + + VBIT dP_0n, dP_0t, dFilt_bs + VBIT dQ_0n, dQ_0t, dFilt_bs + + VBIF dP_0n, dP_0, dFilt + VBIF dQ_0n, dQ_0, dFilt + + ;// Result Storage + VST1 dP_0n, [pSrcDst], srcdstStep + ADDS XY, XY, XY + VST1 dQ_0n, [pSrcDst], srcdstStep + + BNE LoopY ;// flags from ADDS XY above + + MOV r0, #OMX_Sts_NoErr + + M_EXIT + +NoFilterBS0 ;// no filtering: reload alpha/beta via the already-incremented pointers, rewind pSrcDst two rows, skip four threshold bytes + + VLD1 {dAlpha[]}, [pAlpha] + SUB pSrcDst, pSrcDst, srcdstStep, LSL #1 
+ ADDS XY, XY, XY + VLD1 {dBeta[]}, [pBeta] + ADD pThresholds, pThresholds, #4 + BNE LoopY ;// flags from ADDS XY above + + MOV r0, #OMX_Sts_NoErr + M_END + + ENDIF + + + END + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s new file mode 100644 index 0000000..18e6c1d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s @@ -0,0 +1,282 @@ +;// +;// +;// File Name: omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + IF CortexA8 + + IMPORT armVCM4P10_DeblockingChromabSGE4_unsafe + IMPORT armVCM4P10_DeblockingChromabSLT4_unsafe + +LOOP_COUNT EQU 0x40000000 +MASK_3 EQU 0x03030303 +MASK_4 EQU 0x04040404 + +;// Function arguments + +pSrcDst RN 0 +srcdstStep RN 1 +pAlpha RN 2 +pBeta RN 3 + +pThresholds RN 5 +pBS RN 4 +bS3210 RN 6 +pSrcDst_P RN 10 +pSrcDst_Q RN 12 + +pTmp RN 10 +pTmp2 RN 12 +step RN 14 + +;// Loop + +XY RN 7 + +;// Rows input +dRow0 DN D7.U8 +dRow1 DN D8.U8 +dRow2 DN D5.U8 +dRow3 DN D10.U8 +dRow4 DN D6.U8 +dRow5 DN D9.U8 +dRow6 DN D4.U8 +dRow7 DN D11.U8 + + +;// Pixels +dP_0 DN D4.U8 +dP_1 DN D5.U8 +dP_2 DN D6.U8 +dQ_0 DN D8.U8 +dQ_1 DN D9.U8 +dQ_2 DN D10.U8 + +;// Filtering Decision +dAlpha DN D0.U8 +dBeta DN D2.U8 + +dFilt DN D16.U8 +dAqflg DN D12.U8 +dApflg DN D17.U8 + +dAp0q0 DN D13.U8 +dAp1p0 DN D12.U8 +dAq1q0 DN D18.U8 +dAp2p0 DN D19.U8 +dAq2q0 DN D17.U8 + +qBS3210 QN Q13.U16 +dBS3210 DN D26 +dMask_bs DN D27 +dFilt_bs DN D26.U16 + +;// bSLT4 +dMask_0 DN D14.U8 +dMask_1 DN D15.U8 +dMask_4 DN D1.U16 + +Mask_4 RN 8 +Mask_3 RN 9 + +dTemp DN D19.U8 + +;// Result +dP_0t DN 
D13.U8 +dQ_0t DN D31.U8 + +dP_0n DN D29.U8 +dQ_0n DN D24.U8 + + + ;// Function header + M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15 + + ;//Arguments on the stack + M_ARG ppThresholds, 4 + M_ARG ppBS, 4 + + ;// d0-dAlpha_0 + ;// d2-dBeta_0 + + ;load alpha1,beta1 somewhere to avoid more loads + VLD1 {dAlpha[]}, [pAlpha]! + SUB pSrcDst, pSrcDst, #4 ;// start four bytes left of the edge so each 8-byte row load straddles it + VLD1 {dBeta[]}, [pBeta]! + + M_LDR pBS, ppBS + M_LDR pThresholds, ppThresholds + + LDR Mask_4, =MASK_4 + LDR Mask_3, =MASK_3 + + ;dMask_0-14 + ;dMask_1-15 + ;dMask_4-1 + + VMOV dMask_0, #0 + VMOV dMask_1, #1 + VMOV dMask_4, #4 + + LDR XY, =LOOP_COUNT + + ;// p0-p3 - d4-d7 + ;// q0-q3 - d8-d11 + + +LoopY + LDR bS3210, [pBS], #8 ;// four packed bS bytes; pBS then advances by 8 + ADD pTmp, pSrcDst, srcdstStep ;// pTmp walks the odd rows, pSrcDst the even rows + ADD step, srcdstStep, srcdstStep ;// two rows per load + + ;1 + VLD1 dRow0, [pSrcDst], step + ;1 + VLD1 dRow1, [pTmp], step + VLD1 dRow2, [pSrcDst], step + VLD1 dRow3, [pTmp], step + VLD1 dRow4, [pSrcDst], step + VLD1 dRow5, [pTmp], step + VLD1 dRow6, [pSrcDst], step + VLD1 dRow7, [pTmp], step + + + ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0] + ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1] + ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2] + ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3] + ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4] + ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5] + ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6] + ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7] + + ;// 8x8 Transpose + VZIP.8 dRow0, dRow1 + VZIP.8 dRow2, dRow3 + VZIP.8 dRow4, dRow5 + VZIP.8 dRow6, dRow7 + + VZIP.16 dRow0, dRow2 + VZIP.16 dRow1, dRow3 + VZIP.16 dRow4, dRow6 + VZIP.16 dRow5, dRow7 + + VZIP.32 dRow0, dRow4 + VZIP.32 dRow2, dRow6 + VZIP.32 dRow3, dRow7 + VZIP.32 dRow1, dRow5 + + + ;Realign the pointers + + CMP bS3210, #0 + VABD dAp2p0, dP_2, dP_0 + VABD dAp0q0, dP_0, dQ_0 + BEQ NoFilterBS0 ;// flags from CMP bS3210, #0: all four bS bytes are zero + + VABD dAp1p0, dP_1, dP_0 + VABD dAq1q0, dQ_1, dQ_0 + + VMOV.U32 dBS3210[0], bS3210 + VCGT dFilt, dAlpha, dAp0q0 + VMAX 
dAp1p0, dAq1q0, dAp1p0 + VMOVL qBS3210, dBS3210.U8 + VABD dAq2q0, dQ_2, dQ_0 + VCGT dMask_bs.S16, dBS3210.S16, #0 + + VCGT dAp1p0, dBeta, dAp1p0 + VCGT dAp2p0, dBeta, dAp2p0 + VAND dFilt, dMask_bs.U8 + + TST bS3210, Mask_3 + + VCGT dAq2q0, dBeta, dAq2q0 + VAND dFilt, dFilt, dAp1p0 + + VAND dAqflg, dFilt, dAq2q0 + VAND dApflg, dFilt, dAp2p0 + + ;// bS < 4 Filtering + BLNE armVCM4P10_DeblockingChromabSLT4_unsafe + + TST bS3210, Mask_4 + + SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// undo the eight rows advanced by the loads + VTST dFilt_bs, dFilt_bs, dMask_4 + + ;// bS == 4 Filtering + BLNE armVCM4P10_DeblockingChromabSGE4_unsafe + + VBIT dP_0n, dP_0t, dFilt_bs + VBIT dQ_0n, dQ_0t, dFilt_bs + + ;// Result Storage + ADD pSrcDst_P, pSrcDst, #3 ;// byte 3 of each row is the p0 column + VBIF dP_0n, dP_0, dFilt + + ADD pTmp2, pSrcDst_P, srcdstStep + ADD step, srcdstStep, srcdstStep ;// step is r14 (lr), which the BL calls above overwrite, so rebuild it + VBIF dQ_0n, dQ_0, dFilt + + ADDS XY, XY, XY + + VST1 {dP_0n[0]}, [pSrcDst_P], step + VST1 {dP_0n[1]}, [pTmp2], step + VST1 {dP_0n[2]}, [pSrcDst_P], step + VST1 {dP_0n[3]}, [pTmp2], step + VST1 {dP_0n[4]}, [pSrcDst_P], step + VST1 {dP_0n[5]}, [pTmp2], step + VST1 {dP_0n[6]}, [pSrcDst_P], step + VST1 {dP_0n[7]}, [pTmp2], step + + ADD pSrcDst_Q, pSrcDst, #4 ;// byte 4 of each row is the q0 column + ADD pTmp, pSrcDst_Q, srcdstStep + + VST1 {dQ_0n[0]}, [pSrcDst_Q], step + VST1 {dQ_0n[1]}, [pTmp], step + VST1 {dQ_0n[2]}, [pSrcDst_Q], step + VST1 {dQ_0n[3]}, [pTmp], step + VST1 {dQ_0n[4]}, [pSrcDst_Q], step + VST1 {dQ_0n[5]}, [pTmp], step + VST1 {dQ_0n[6]}, [pSrcDst_Q], step + VST1 {dQ_0n[7]}, [pTmp], step + + ADD pSrcDst, pSrcDst, #4 ;// next pass starts four bytes further right + + BNE LoopY + + MOV r0, #OMX_Sts_NoErr + + M_EXIT + +NoFilterBS0 ;// no filtering: same net pointer updates as the filtered path, plus reloading alpha/beta and skipping four threshold bytes + VLD1 {dAlpha[]}, [pAlpha] + ADD pSrcDst, pSrcDst, #4 + SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 + ADDS XY, XY, XY + VLD1 {dBeta[]}, [pBeta] + ADD pThresholds, pThresholds, #4 + BNE LoopY + + MOV r0, #OMX_Sts_NoErr + + M_END + + ENDIF + + + END + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s 
b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s new file mode 100755 index 0000000..0c3f4f2 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s @@ -0,0 +1,288 @@ +;// +;// +;// File Name: omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + IMPORT armVCM4P10_DeblockingLumabSLT4_unsafe + IMPORT armVCM4P10_DeblockingLumabSGE4_unsafe + + IF CortexA8 + +LOOP_COUNT EQU 0x55000000 ;// XY seed, doubled by ADDS after every LoopX pass: carry set leaves LoopX after two passes, a zero result leaves LoopY after four + + +;// Function arguments + +pSrcDst RN 0 +srcdstStep RN 1 +pAlpha RN 2 +pBeta RN 3 + +pThresholds RN 5 +pBS RN 4 +bS10 RN 12 + +pAlpha_0 RN 2 +pBeta_0 RN 3 + +pAlpha_1 RN 7 +pBeta_1 RN 8 + + + +;// Loop + +XY RN 9 + +pTmp RN 6 +step RN 10 + +;// Pixels +dP_0 DN D4.U8 +dP_1 DN D5.U8 +dP_2 DN D6.U8 +dP_3 DN D7.U8 +dQ_0 DN D8.U8 +dQ_1 DN D9.U8 +dQ_2 DN D10.U8 +dQ_3 DN D11.U8 + + +;// Filtering Decision +dAlpha DN D0.U8 +dBeta DN D2.U8 + +dFilt DN D16.U8 +dAqflg DN D12.U8 +dApflg DN D17.U8 + +dAp0q0 DN D13.U8 +dAp1p0 DN D12.U8 +dAq1q0 DN D18.U8 +dAp2p0 DN D19.U8 +dAq2q0 DN D17.U8 + +;// bSLT4 +dTC0 DN D18.U8 +dTC1 DN D19.U8 +dTC01 DN D18.U8 + +dTCs DN D31.S8 +dTC DN D31.U8 + +dMask_0 DN D14.U8 +dMask_1 DN D15.U8 + +Mask_0 RN 11 + +dTemp DN D19.U8 + +;// Computing P0,Q0 +qDq0p0 QN Q10.S16 +qDp1q1 QN Q11.S16 +qDelta QN Q10.S16 ; reuse qDq0p0 +dDelta DN D20.S8 + + +;// Computing P1,Q1 +dRp0q0 DN D24.U8 + +dMaxP DN D23.U8 +dMinP DN D22.U8 + +dMaxQ DN D19.U8 +dMinQ DN D21.U8 + +dDeltaP DN D26.U8 +dDeltaQ DN D27.U8 + +qP_0n QN Q14.S16 +qQ_0n QN Q12.S16 + +dQ_0n DN D24.U8 +dQ_1n DN D25.U8 +dP_0n DN D29.U8 +dP_1n DN D30.U8 + +;// bSGE4 + +qSp0q0 QN Q10.U16 + +qSp2q1 QN Q11.U16 +qSp0q0p1 QN Q12.U16 +qSp3p2 QN Q13.U16 +dHSp0q1 DN 
D28.U8 + +qSq2p1 QN Q11.U16 +qSp0q0q1 QN Q12.U16 +qSq3q2 QN Q13.U16 ;!! +dHSq0p1 DN D28.U8 ;!! + +qTemp1 QN Q11.U16 ;!!;qSp2q1 +qTemp2 QN Q12.U16 ;!!;qSp0q0p1 + +dP_0t DN D28.U8 ;!!;dHSp0q1 +dQ_0t DN D22.U8 ;!!;Temp1 + +dP_0n DN D29.U8 +dP_1n DN D30.U8 +dP_2n DN D31.U8 + +dQ_0n DN D24.U8 ;!!;Temp2 +dQ_1n DN D25.U8 ;!!;Temp2 +dQ_2n DN D28.U8 ;!!;dQ_0t + + + ;// Function header + M_START omxVCM4P10_FilterDeblockingLuma_HorEdge_I, r11, d15 + + ;//Arguments on the stack + M_ARG ppThresholds, 4 + M_ARG ppBS, 4 + + ;// d0-dAlpha_0 + ;// d2-dBeta_0 + + ADD pAlpha_1, pAlpha_0, #1 ;// second alpha byte, loaded at ExitLoopY + ADD pBeta_1, pBeta_0, #1 ;// second beta byte, loaded at ExitLoopY + + VLD1 {dAlpha[]}, [pAlpha_0] + SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 ;// back up four rows to the dP_3 row + VLD1 {dBeta[]}, [pBeta_0] + + M_LDR pBS, ppBS + M_LDR pThresholds, ppThresholds + + MOV Mask_0,#0 + + ;dMask_0-14 + ;dMask_1-15 + + VMOV dMask_0, #0 + VMOV dMask_1, #1 + + ADD step, srcdstStep, srcdstStep + + LDR XY,=LOOP_COUNT + + ;// p0-p3 - d4-d7 + ;// q0-q3 - d8-d11 +LoopY +LoopX + LDRH bS10, [pBS], #2 ;// two bS bytes for this 8-pixel segment + ADD pTmp, pSrcDst, srcdstStep + CMP bS10, #0 + BEQ NoFilterBS0 + + VLD1 dP_3, [pSrcDst], step + VLD1 dP_2, [pTmp], step + VLD1 dP_1, [pSrcDst], step + VLD1 dP_0, [pTmp], step + VLD1 dQ_0, [pSrcDst], step + VABD dAp1p0, dP_0, dP_1 + VLD1 dQ_1, [pTmp] + VABD dAp0q0, dQ_0, dP_0 + VLD1 dQ_2, [pSrcDst], srcdstStep + + VABD dAq1q0, dQ_1, dQ_0 + VABD dAp2p0, dP_2, dP_0 + VCGT dFilt, dAlpha, dAp0q0 + + TST bS10, #0xff + VMAX dAp1p0, dAq1q0, dAp1p0 + VABD dAq2q0, dQ_2, dQ_0 + + VMOVEQ.U32 dFilt[0], Mask_0 ;// low bS byte zero: clear the filter mask for pixels 0-3 + TST bS10, #0xff00 + + VCGT dAp2p0, dBeta, dAp2p0 + VCGT dAp1p0, dBeta, dAp1p0 + + VMOVEQ.U32 dFilt[1], Mask_0 ;// high bS byte zero: clear the filter mask for pixels 4-7 + + VCGT dAq2q0, dBeta, dAq2q0 + VLD1 dQ_3, [pSrcDst] + VAND dFilt, dFilt, dAp1p0 + TST bS10, #4 ;// bit 2 of the low bS byte picks the bSGE4 path (BNE below) + + VAND dAqflg, dFilt, dAq2q0 + VAND dApflg, dFilt, dAp2p0 + + BNE bSGE4 +bSLT4 + ;// bS < 4 Filtering + SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 + SUB pSrcDst, pSrcDst, srcdstStep ;// 5 rows back in total: from the dQ_3 row to the dP_1 row + + BL armVCM4P10_DeblockingLumabSLT4_unsafe + + ;// Result Storage + VST1 dP_1n, [pSrcDst], srcdstStep + VST1 
dP_0n, [pSrcDst], srcdstStep + SUB pTmp, pSrcDst, srcdstStep, LSL #2 ;// pTmp = pSrcDst - 4 rows, the base for the next segment + VST1 dQ_0n, [pSrcDst], srcdstStep + ADDS XY, XY, XY + VST1 dQ_1n, [pSrcDst] + ADD pSrcDst, pTmp, #8 ;// next segment: 8 bytes right of pTmp + + BCC LoopX + B ExitLoopY + +NoFilterBS0 ;// nothing was loaded: step 8 bytes right and skip two threshold bytes + ADD pSrcDst, pSrcDst, #8 + ADDS XY, XY, XY + ADD pThresholds, pThresholds, #2 + BCC LoopX + B ExitLoopY +bSGE4 + ;// bS >= 4 Filtering + SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 + SUB pSrcDst, pSrcDst, srcdstStep, LSL #1 ;// 6 rows back in total: from the dQ_3 row to the dP_2 row + BL armVCM4P10_DeblockingLumabSGE4_unsafe + + ;// Result Storage + VST1 dP_2n, [pSrcDst], srcdstStep + VST1 dP_1n, [pSrcDst], srcdstStep + VST1 dP_0n, [pSrcDst], srcdstStep + SUB pTmp, pSrcDst, srcdstStep, LSL #2 + VST1 dQ_0n, [pSrcDst], srcdstStep + ADDS XY,XY,XY + VST1 dQ_1n, [pSrcDst], srcdstStep + ADD pThresholds, pThresholds, #2 + VST1 dQ_2n, [pSrcDst] + + ADD pSrcDst, pTmp, #8 + BCC LoopX + +ExitLoopY ;// after two segments: 16 bytes back left, four rows down, second alpha/beta; BNE tests the last ADDS XY + + SUB pSrcDst, pSrcDst, #16 + VLD1 {dAlpha[]}, [pAlpha_1] + ADD pSrcDst, pSrcDst, srcdstStep, LSL #2 + VLD1 {dBeta[]}, [pBeta_1] + BNE LoopY + + MOV r0, #OMX_Sts_NoErr + + M_END + + ENDIF + + + + + END + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s new file mode 100755 index 0000000..e6fbb34 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s @@ -0,0 +1,436 @@ +;// +;// +;// File Name: omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + IMPORT armVCM4P10_DeblockingLumabSLT4_unsafe + IMPORT armVCM4P10_DeblockingLumabSGE4_unsafe + + IF CortexA8 + +LOOP_COUNT EQU 0x11000000 ;// XY seed, doubled by ADDS after every LoopX pass: carry set leaves LoopX after four passes, a zero result leaves LoopY after two + + +;// Function arguments + +pSrcDst RN 0 +srcdstStep RN 1 +pAlpha RN 2 +pBeta RN 3 + +pThresholds RN 5 +pBS RN 4 +bS10 RN 12 + +pAlpha_0 RN 2 +pBeta_0 RN 3 + +pAlpha_1 RN 7 +pBeta_1 RN 8 + +pTmp RN 10 +pTmpStep RN 11 + +;// Loop + +XY RN 9 + +;// Rows input +dRow0 DN D7.U8 +dRow1 DN D8.U8 +dRow2 DN D5.U8 +dRow3 DN D10.U8 +dRow4 DN D6.U8 +dRow5 DN D9.U8 +dRow6 DN D4.U8 +dRow7 DN D11.U8 + +;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2 +;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3 + +;// Rows output +dRown0 DN D7.U8 +dRown1 DN D24.U8 +dRown2 DN D30.U8 +dRown3 DN D10.U8 +dRown4 DN D6.U8 +dRown5 DN D25.U8 +dRown6 DN D29.U8 +dRown7 DN D11.U8 + +;// dP_0n DN D29.U8 +;// dP_1n DN D30.U8 +;// dP_2n DN D31.U8 +;// +;// dQ_0n DN D24.U8 ;!!;Temp2 +;// dQ_1n DN D25.U8 ;!!;Temp2 +;// dQ_2n DN D28.U8 ;!!;dQ_0t +;// +;// dRown0 - dP_3, dRown1 - dQ_0n +;// dRown2 - dP_1n, dRown3 - dQ_2 +;// dRown4 - dP_2, dRown5 - dQ_1n +;// dRown6 - dP_0n, dRown7 - dQ_3 + +dRow0n DN D7.U8 +dRow1n DN D24.U8 +dRow2n DN D30.U8 +dRow3n DN D28.U8 +dRow4n DN D31.U8 +dRow5n DN D25.U8 +dRow6n DN D29.U8 +dRow7n DN D11.U8 + +;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n +;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3 + +;// Pixels +dP_0 DN D4.U8 +dP_1 DN D5.U8 +dP_2 DN D6.U8 +dP_3 DN D7.U8 +dQ_0 DN D8.U8 +dQ_1 DN D9.U8 +dQ_2 DN D10.U8 +dQ_3 DN D11.U8 + + +;// Filtering Decision +dAlpha DN D0.U8 +dBeta DN D2.U8 + +dFilt DN D16.U8 +dAqflg DN D12.U8 +dApflg DN D17.U8 + +dAp0q0 DN D13.U8 +dAp1p0 DN D12.U8 +dAq1q0 DN D18.U8 +dAp2p0 DN D19.U8 +dAq2q0 DN D17.U8 + +;// bSLT4 +dTC0 DN D18.U8 +dTC1 DN D19.U8 +dTC01 DN D18.U8 + +dTCs DN D31.S8 +dTC DN D31.U8 + +dMask_0 DN D14.U8 +dMask_1 DN D15.U8 + +Mask_0 RN 6 + +dTemp DN 
D19.U8 + +;// Computing P0,Q0 +qDq0p0 QN Q10.S16 +qDp1q1 QN Q11.S16 +qDelta QN Q10.S16 ; reuse qDq0p0 +dDelta DN D20.S8 + + +;// Computing P1,Q1 +dRp0q0 DN D24.U8 + +dMaxP DN D23.U8 +dMinP DN D22.U8 + +dMaxQ DN D19.U8 +dMinQ DN D21.U8 + +dDeltaP DN D26.U8 +dDeltaQ DN D27.U8 + +qP_0n QN Q14.S16 +qQ_0n QN Q12.S16 + +dQ_0n DN D24.U8 +dQ_1n DN D25.U8 +dP_0n DN D29.U8 +dP_1n DN D30.U8 + +;// bSGE4 + +qSp0q0 QN Q10.U16 + +qSp2q1 QN Q11.U16 +qSp0q0p1 QN Q12.U16 +qSp3p2 QN Q13.U16 +dHSp0q1 DN D28.U8 + +qSq2p1 QN Q11.U16 +qSp0q0q1 QN Q12.U16 +qSq3q2 QN Q13.U16 ;!! +dHSq0p1 DN D28.U8 ;!! + +qTemp1 QN Q11.U16 ;!!;qSp2q1 +qTemp2 QN Q12.U16 ;!!;qSp0q0p1 + +dP_0t DN D28.U8 ;!!;dHSp0q1 +dQ_0t DN D22.U8 ;!!;Temp1 + +dP_0n DN D29.U8 +dP_1n DN D30.U8 +dP_2n DN D31.U8 + +dQ_0n DN D24.U8 ;!!;Temp2 +dQ_1n DN D25.U8 ;!!;Temp2 +dQ_2n DN D28.U8 ;!!;dQ_0t + + + ;// Function header + M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15 + + ;//Arguments on the stack + M_ARG ppThresholds, 4 + M_ARG ppBS, 4 + + ;// d0-dAlpha_0 + ;// d2-dBeta_0 + + ADD pAlpha_1, pAlpha_0, #1 ;// second alpha byte, loaded after each LoopX pass + ADD pBeta_1, pBeta_0, #1 ;// second beta byte, loaded after each LoopX pass + + VLD1 {dAlpha[]}, [pAlpha_0] + SUB pSrcDst, pSrcDst, #4 ;// start four bytes left of the edge so each 8-byte row load straddles it + VLD1 {dBeta[]}, [pBeta_0] + + M_LDR pBS, ppBS + M_LDR pThresholds, ppThresholds + + MOV Mask_0,#0 + + ;dMask_0-14 + ;dMask_1-15 + + VMOV dMask_0, #0 + VMOV dMask_1, #1 + + LDR XY,=LOOP_COUNT + + ADD pTmpStep, srcdstStep, srcdstStep ;// two rows per load/store + + ;// p0-p3 - d4-d7 + ;// q0-q3 - d8-d11 +LoopY +LoopX + LDRH bS10, [pBS], #4 ;// two bS bytes; pBS advances 4 per LoopX pass + + CMP bS10, #0 + BEQ NoFilterBS0 + + ;// Load 8 rows of data + ADD pTmp, pSrcDst, srcdstStep + VLD1 dRow0, [pSrcDst], pTmpStep + VLD1 dRow1, [pTmp], pTmpStep + VLD1 dRow2, [pSrcDst], pTmpStep + VZIP.8 dRow0, dRow1 + VLD1 dRow3, [pTmp], pTmpStep + VLD1 dRow4, [pSrcDst], pTmpStep + VZIP.8 dRow2, dRow3 + VLD1 dRow5, [pTmp], pTmpStep + VLD1 dRow6, [pSrcDst], pTmpStep + VLD1 dRow7, [pTmp], pTmpStep + VZIP.8 dRow4, dRow5 + VZIP.16 dRow1, dRow3 + + + ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0] + ;// dRow1 = [q3r1 q2r1 
q1r1 q0r1 p0r1 p1r1 p2r1 p3r1] + ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2] + ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3] + ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4] + ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5] + ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6] + ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7] + + ;// 8x8 Transpose + + VZIP.8 dRow6, dRow7 + + SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// undo the eight rows advanced by the loads + VZIP.16 dRow0, dRow2 + VZIP.16 dRow5, dRow7 + + + VZIP.16 dRow4, dRow6 + VZIP.32 dRow1, dRow5 + VZIP.32 dRow2, dRow6 + VZIP.32 dRow3, dRow7 + VZIP.32 dRow0, dRow4 + + + ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2 + ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3 + + ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0] + ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0] + ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0] + ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0] + + ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0] + ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0] + ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0] + ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0] + + VABD dAp0q0, dP_0, dQ_0 + VABD dAp1p0, dP_1, dP_0 + + VABD dAq1q0, dQ_1, dQ_0 + VABD dAp2p0, dP_2, dP_0 + + TST bS10, #0xff + VCGT dFilt, dAlpha, dAp0q0 + + VMAX dAp1p0, dAq1q0, dAp1p0 + VABD dAq2q0, dQ_2, dQ_0 + + VMOVEQ.U32 dFilt[0], Mask_0 ;// low bS byte zero: clear the filter mask for rows 0-3 + TST bS10, #0xff00 + + VCGT dAp2p0, dBeta, dAp2p0 + VCGT dAp1p0, dBeta, dAp1p0 + + VMOVEQ.U32 dFilt[1], Mask_0 ;// high bS byte zero: clear the filter mask for rows 4-7 + + VCGT dAq2q0, dBeta, dAq2q0 + VAND dFilt, dFilt, dAp1p0 + TST bS10, #4 ;// bit 2 of the low bS byte picks the bSGE4 path (BNE below) + + VAND dAqflg, dFilt, dAq2q0 + VAND dApflg, dFilt, dAp2p0 + + BNE bSGE4 +bSLT4 + ;// bS < 4 Filtering + + BL armVCM4P10_DeblockingLumabSLT4_unsafe + + ;// Transpose + + VZIP.8 dP_3, dP_2 + VZIP.8 dP_1n, dP_0n + VZIP.8 dQ_0n, dQ_1n + VZIP.8 dQ_2, dQ_3 + + + VZIP.16 dP_3, dP_1n + ADD pTmp, pSrcDst, srcdstStep + VZIP.16 dQ_0n, dQ_2 + VZIP.16 dQ_1n, dQ_3 + VZIP.16 dP_2, 
dP_0n + + VZIP.32 dP_3, dQ_0n + VZIP.32 dP_1n, dQ_2 + VZIP.32 dP_2, dQ_1n + VZIP.32 dP_0n, dQ_3 + + ;// dRown0 - dP_3, dRown1 - dQ_0n + ;// dRown2 - dP_1n, dRown3 - dQ_2 + ;// dRown4 - dP_2, dRown5 - dQ_1n + ;// dRown6 - dP_0n, dRown7 - dQ_3 + + VST1 dRown0, [pSrcDst], pTmpStep + VST1 dRown1, [pTmp], pTmpStep + VST1 dRown2, [pSrcDst], pTmpStep + VST1 dRown3, [pTmp], pTmpStep + ;1 + VST1 dRown4, [pSrcDst], pTmpStep + VST1 dRown5, [pTmp], pTmpStep + ADDS XY, XY, XY + VST1 dRown6, [pSrcDst], pTmpStep + ADD pThresholds, pThresholds, #2 + VST1 dRown7, [pTmp], srcdstStep + + SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// undo the eight rows advanced by the stores + VLD1 {dAlpha[]}, [pAlpha_1] + ADD pSrcDst, pSrcDst, #4 + VLD1 {dBeta[]}, [pBeta_1] + + BCC LoopX + B ExitLoopY + +NoFilterBS0 ;// nothing was loaded: move four bytes right, skip four threshold bytes, use the second alpha/beta + ADD pSrcDst, pSrcDst, #4 + ADDS XY, XY, XY + VLD1 {dAlpha[]}, [pAlpha_1] + ADD pThresholds, pThresholds, #4 + VLD1 {dBeta[]}, [pBeta_1] + BCC LoopX + B ExitLoopY +bSGE4 + ;// bS >= 4 Filtering + + BL armVCM4P10_DeblockingLumabSGE4_unsafe + + ;// Transpose + + VZIP.8 dP_3, dP_2n + VZIP.8 dP_1n, dP_0n + VZIP.8 dQ_0n, dQ_1n + VZIP.8 dQ_2n, dQ_3 + + VZIP.16 dP_3, dP_1n + ADD pTmp, pSrcDst, srcdstStep + VZIP.16 dQ_0n, dQ_2n + VZIP.16 dQ_1n, dQ_3 + VZIP.16 dP_2n, dP_0n + + VZIP.32 dP_3, dQ_0n + VZIP.32 dP_1n, dQ_2n + VZIP.32 dP_2n, dQ_1n + VZIP.32 dP_0n, dQ_3 + + ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n + ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3 + + VST1 dRow0n, [pSrcDst], pTmpStep + VST1 dRow1n, [pTmp], pTmpStep + VST1 dRow2n, [pSrcDst], pTmpStep + VST1 dRow3n, [pTmp], pTmpStep + VST1 dRow4n, [pSrcDst], pTmpStep + VST1 dRow5n, [pTmp], pTmpStep + ADDS XY,XY,XY + VST1 dRow6n, [pSrcDst], pTmpStep + ADD pThresholds, pThresholds, #4 + VST1 dRow7n, [pTmp], pTmpStep + + SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 + VLD1 {dAlpha[]}, [pAlpha_1] + ADD pSrcDst, pSrcDst, #4 + VLD1 {dBeta[]}, [pBeta_1] + + BCC LoopX + +ExitLoopY ;// after four passes pBS and pSrcDst are 16 bytes ahead: net pBS +2, pSrcDst back left and eight rows down, first alpha/beta again + SUB pBS, pBS, #14 + SUB pThresholds, pThresholds, #14 + SUB pSrcDst, 
pSrcDst, #16 + VLD1 {dAlpha[]}, [pAlpha_0] + ADD pSrcDst, pSrcDst, srcdstStep, LSL #3 + VLD1 {dBeta[]}, [pBeta_0] + BNE LoopY ;// flags from the last ADDS XY + + MOV r0, #OMX_Sts_NoErr + + M_END + + ENDIF + + + END + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c new file mode 100755 index 0000000..3ce41be --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c @@ -0,0 +1,79 @@ +/** + * + * File Name: omxVCM4P10_InterpolateChroma.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * Description: + * This function will calculate 1/8 Pixel interpolation for Chroma Block + * + */ + +#include "omxtypes.h" +#include "armOMX.h" +#include "omxVC.h" + +#include "armVC.h" +#include "armCOMM.h" + + +/** + * Function: omxVCM4P10_InterpolateChroma, + * + * Description: + * Performs 1/8-pixel interpolation for inter chroma MB. + * + * Remarks: + * + * Parameters: + * [in] pSrc Pointer to the source reference frame buffer + * [in] srcStep Reference frame step in byte + * [in] dstStep Destination frame step in byte. Must be multiple of roi.width. + * [in] dx Fractional part of horizontal motion vector component + * in 1/8 pixel unit;valid in the range [0,7] + * [in] dy Fractional part of vertical motion vector component + * in 1/8 pixel unit;valid in the range [0,7] + * [in] roi Dimension of the interpolation region;the parameters roi.width and roi.height must + * be equal to either 2, 4, or 8. + * [out] pDst Pointer to the destination frame buffer. + * if roi.width==2, 2-byte alignment required + * if roi.width==4, 4-byte alignment required + * if roi.width==8, 8-byte alignment required + * + * Return Value: + * If the function runs without error, it returns OMX_Sts_NoErr. 
+ * If one of the following cases occurs, the function returns OMX_Sts_BadArgErr: + * pSrc or pDst is NULL. + * srcStep or dstStep < 8. + * dx or dy is out of range [0-7]. + * roi.width or roi.height is out of range {2,4,8}. + * roi.width is equal to 2, but pDst is not 2-byte aligned. + * roi.width is equal to 4, but pDst is not 4-byte aligned. + * roi.width is equal to 8, but pDst is not 8 byte aligned. + * srcStep or dstStep is not a multiple of 8. + * NOTE(review): this wrapper performs no argument checks itself; it forwards straight to armVCM4P10_Interpolate_Chroma and returns its result, so whether the cases above are detected depends on that helper - confirm. + */ + +OMXResult omxVCM4P10_InterpolateChroma ( + const OMX_U8* pSrc, + OMX_S32 srcStep, + OMX_U8* pDst, + OMX_S32 dstStep, + OMX_S32 dx, + OMX_S32 dy, + OMXSize roi + ) +{ + return armVCM4P10_Interpolate_Chroma + ((OMX_U8*)pSrc, srcStep, pDst, dstStep, roi.width, roi.height, dx, dy); /* pSrc's const is cast away; roi is unpacked into width, height and passed ahead of dx, dy */ +} + + +/***************************************************************************** + * END OF FILE + *****************************************************************************/ + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s new file mode 100755 index 0000000..942ebc6 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s @@ -0,0 +1,553 @@ +;// +;// +;// File Name: omxVCM4P10_InterpolateLuma_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + +;// Function: +;// omxVCM4P10_InterpolateLuma +;// +;// This function implements omxVCM4P10_InterpolateLuma in v6 assembly. +;// Performs quarter pel interpolation of inter luma MB. +;// It's assumed that the frame is already padded when calling this function. +;// Parameters: +;// [in] pSrc Pointer to the source reference frame buffer +;// [in] srcStep Reference frame step in byte +;// [in] dstStep Destination frame step in byte. 
Must be multiple of roi.width +;// [in] dx Fractional part of horizontal motion vector +;// component in 1/4 pixel unit; valid in the range [0,3] +;// [in] dy Fractional part of vertical motion vector +;// component in 1/4 pixel unit; valid in the range [0,3] +;// [in] roi Dimension of the interpolation region;the parameters roi.width and roi.height must +;// be equal to either 4, 8, or 16. +;// [out] pDst Pointer to the destination frame buffer. +;// if roi.width==4, 4-byte alignment required +;// if roi.width==8, 8-byte alignment required +;// if roi.width==16, 16-byte alignment required +;// +;// Return Value: +;// If the function runs without error, it returns OMX_Sts_NoErr. +;// It is assued that following cases are satisfied before calling this function: +;// pSrc or pDst is not NULL. +;// srcStep or dstStep >= roi.width. +;// dx or dy is in the range [0-3]. +;// roi.width or roi.height is not out of range {4, 8, 16}. +;// If roi.width is equal to 4, Dst is 4 byte aligned. +;// If roi.width is equal to 8, pDst is 8 byte aligned. +;// If roi.width is equal to 16, pDst is 16 byte aligned. +;// srcStep and dstStep is multiple of 8. +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + EXPORT omxVCM4P10_InterpolateLuma + + + IF CortexA8 + IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe + IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + ENDIF + + + +;// Declare input registers +pSrc RN 0 +srcStep RN 1 +pDst RN 2 +dstStep RN 3 +iHeight RN 4 +iWidth RN 5 + +;// Declare other intermediate registers +idx RN 6 +idy RN 7 +index RN 6 +Temp RN 12 +pArgs RN 11 + + + IF CortexA8 + + ;// + ;// Interpolation of luma is implemented by processing block of pixels, size 4x4 at a time. 
+ ;// + M_ALLOC4 ppArgs, 16 + + ;// Function header + M_START omxVCM4P10_InterpolateLuma, r11, d15 + +pSrcBK RN 8 + +;// Declare Neon registers +dCoeff5 DN 30.S16 +dCoeff20 DN 31.S16 + +;// Registers used for implementing Horizontal interpolation +dSrc0c DN 14.U8 +dSrc1c DN 16.U8 +dSrc2c DN 18.U8 +dSrc3c DN 20.U8 +dSrc0d DN 15.U8 +dSrc1d DN 17.U8 +dSrc2d DN 19.U8 +dSrc3d DN 21.U8 +dAccH0 DN 22.U8 +dAccH1 DN 24.U8 +dAccH2 DN 26.U8 +dAccH3 DN 28.U8 +dResultH0 DN 22.U32 +dResultH1 DN 24.U32 +dResultH2 DN 26.U32 +dResultH3 DN 28.U32 + +;// Registers used for implementing Vertical interpolation +dSrc0 DN 9.U8 +dSrc1 DN 10.U8 +dSrc2 DN 11.U8 +dSrc3 DN 12.U8 +dSrc4 DN 13.U8 +dAccV0 DN 0.U8 +dAccV1 DN 2.U8 +dAccV2 DN 4.U8 +dAccV3 DN 6.U8 +dResultV0 DN 0.U32 +dResultV1 DN 2.U32 +dResultV2 DN 4.U32 +dResultV3 DN 6.U32 + +;// Registers used for implementing Diagonal interpolation +dTAcc0 DN 0.U8 +dTAcc1 DN 2.U8 +dTAcc2 DN 4.U8 +dTAcc3 DN 6.U8 +dTRes0 DN 0.32 +dTRes1 DN 2.32 +dTRes2 DN 4.32 +dTRes3 DN 6.32 +dTResult0 DN 14.U8 +dTResult1 DN 16.U8 +dTResult2 DN 18.U8 +dTResult3 DN 20.U8 +dTempP0 DN 18.S16 +dTempP1 DN 19.S16 +dTempQ0 DN 20.S16 +dTempQ1 DN 21.S16 +dTempR0 DN 22.S16 +dTempR1 DN 23.S16 +dTempS0 DN 24.S16 +dTempS1 DN 25.S16 +qTempP01 QN 9.S16 +qTempQ01 QN 10.S16 +qTempR01 QN 11.S16 +qTempS01 QN 12.S16 + +;// Intermediate values for averaging +qRes2 QN 7.S16 +qRes3 QN 8.S16 +qRes4 QN 9.S16 +qRes5 QN 10.S16 +qRes6 QN 11.S16 + +;// For implementing copy +dDst0 DN 9.32 +dDst1 DN 10.32 +dDst2 DN 11.32 +dDst3 DN 12.32 + + ;// Define stack arguments + M_ARG ptridx, 4 + M_ARG ptridy, 4 + M_ARG ptrWidth, 4 + M_ARG ptrHeight, 4 + + ;// Load structure elements of roi + M_LDR idx, ptridx + M_LDR idy, ptridy + M_LDR iWidth, ptrWidth + M_LDR iHeight, ptrHeight + + ADD index, idx, idy, LSL #2 ;// [index] = [idy][idx] + M_ADR pArgs, ppArgs + + ;// Move coefficients Neon registers + VMOV dCoeff20, #20 + VMOV dCoeff5, #5 + +Block4x4WidthLoop +Block4x4HeightLoop + + STM pArgs, 
{pSrc,srcStep,pDst,dstStep} + + ;// switch table using motion vector as index + ADD pc, pc, index, LSL #2 + B Case_f + B Case_0 + B Case_1 + B Case_2 + B Case_3 + B Case_4 + B Case_5 + B Case_6 + B Case_7 + B Case_8 + B Case_9 + B Case_a + B Case_b + B Case_c + B Case_d + B Case_e + B Case_f + +Case_0 + ;// Case G + M_PRINTF "Case 0 \n" + + ;// Loads a 4x4 block of .8 and stores as .32 + ADD Temp, pSrc, srcStep, LSL #1 + VLD1 dSrc0, [pSrc], srcStep + VLD1 dSrc2, [Temp], srcStep + VLD1 dSrc1, [pSrc] + VLD1 dSrc3, [Temp] + + ADD Temp, pDst, dstStep, LSL #1 + VST1 dDst0[0], [pDst], dstStep + VST1 dDst2[0], [Temp], dstStep + VST1 dDst1[0], [pDst] + VST1 dDst3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_1 + ;// Case a + M_PRINTF "Case 1 \n" + + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD dAccH0, dAccH0, dSrc0c + VRHADD dAccH2, dAccH2, dSrc2c + VRHADD dAccH1, dAccH1, dSrc1c + VRHADD dAccH3, dAccH3, dSrc3c + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultH0[0], [pDst], dstStep + VST1 dResultH2[0], [Temp], dstStep + VST1 dResultH1[0], [pDst] + VST1 dResultH3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_2 + ;// Case b + M_PRINTF "Case 2 \n" + + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultH0[0], [pDst], dstStep + VST1 dResultH2[0], [Temp], dstStep + VST1 dResultH1[0], [pDst] + VST1 dResultH3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_3 + ;// Case c + M_PRINTF "Case 3 \n" + + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD dAccH0, dAccH0, dSrc0d + VRHADD dAccH2, dAccH2, dSrc2d + VRHADD dAccH1, dAccH1, dSrc1d + VRHADD dAccH3, dAccH3, dSrc3d + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultH0[0], [pDst], dstStep + VST1 dResultH2[0], [Temp], dstStep + VST1 dResultH1[0], [pDst] + VST1 dResultH3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_4 + ;// Case d + M_PRINTF "Case 4 \n" + + SUB pSrc, 
pSrc, srcStep, LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + VRHADD dAccV0, dAccV0, dSrc0 + VRHADD dAccV2, dAccV2, dSrc2 + VRHADD dAccV1, dAccV1, dSrc1 + VRHADD dAccV3, dAccV3, dSrc3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultV0[0], [pDst], dstStep + VST1 dResultV2[0], [Temp], dstStep + VST1 dResultV1[0], [pDst] + VST1 dResultV3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_5 + ;// Case e + M_PRINTF "Case 5 \n" + + MOV pSrcBK, pSrc + SUB pSrc, pSrc, srcStep, LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + SUB pSrc, pSrcBK, #2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD dAccH0, dAccH0, dAccV0 + VRHADD dAccH2, dAccH2, dAccV2 + VRHADD dAccH1, dAccH1, dAccV1 + VRHADD dAccH3, dAccH3, dAccV3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultH0[0], [pDst], dstStep + VST1 dResultH2[0], [Temp], dstStep + VST1 dResultH1[0], [pDst] + VST1 dResultH3[0], [Temp] + + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_6 + ;// Case f + M_PRINTF "Case 6 \n" + + SUB pSrc, pSrc, srcStep, LSL #1 + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + VQRSHRUN dTResult0, qRes2, #5 + VQRSHRUN dTResult1, qRes3, #5 + VQRSHRUN dTResult2, qRes4, #5 + VQRSHRUN dTResult3, qRes5, #5 + VRHADD dTAcc0, dTAcc0, dTResult0 + VRHADD dTAcc2, dTAcc2, dTResult2 + VRHADD dTAcc1, dTAcc1, dTResult1 + VRHADD dTAcc3, dTAcc3, dTResult3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dTRes0[0], [pDst], dstStep + VST1 dTRes2[0], [Temp], dstStep + VST1 dTRes1[0], [pDst] + VST1 dTRes3[0], [Temp] + + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_7 + ;// Case g + M_PRINTF "Case 7 \n" + MOV pSrcBK, pSrc + ADD pSrc, pSrc, #1 + SUB pSrc, pSrc, srcStep, LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + SUB pSrc, pSrcBK, #2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD dAccH0, dAccH0, dAccV0 + VRHADD dAccH2, dAccH2, dAccV2 + VRHADD dAccH1, dAccH1, dAccV1 + VRHADD dAccH3, dAccH3, dAccV3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 
dResultH0[0], [pDst], dstStep + VST1 dResultH2[0], [Temp], dstStep + VST1 dResultH1[0], [pDst] + VST1 dResultH3[0], [Temp] + + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_8 + ;// Case h + M_PRINTF "Case 8 \n" + + SUB pSrc, pSrc, srcStep, LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultV0[0], [pDst], dstStep + VST1 dResultV2[0], [Temp], dstStep + VST1 dResultV1[0], [pDst] + VST1 dResultV3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_9 + ;// Case i + M_PRINTF "Case 9 \n" + SUB pSrc, pSrc, srcStep, LSL #1 + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe + VEXT dTempP0, dTempP0, dTempP1, #2 + VEXT dTempQ0, dTempQ0, dTempQ1, #2 + VEXT dTempR0, dTempR0, dTempR1, #2 + VEXT dTempS0, dTempS0, dTempS1, #2 + + VQRSHRUN dTResult0, qTempP01, #5 + VQRSHRUN dTResult1, qTempQ01, #5 + VQRSHRUN dTResult2, qTempR01, #5 + VQRSHRUN dTResult3, qTempS01, #5 + + VRHADD dTAcc0, dTAcc0, dTResult0 + VRHADD dTAcc2, dTAcc2, dTResult2 + VRHADD dTAcc1, dTAcc1, dTResult1 + VRHADD dTAcc3, dTAcc3, dTResult3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dTRes0[0], [pDst], dstStep + VST1 dTRes2[0], [Temp], dstStep + VST1 dTRes1[0], [pDst] + VST1 dTRes3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_a + ;// Case j + M_PRINTF "Case a \n" + + SUB pSrc, pSrc, srcStep, LSL #1 + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + ADD Temp, pDst, dstStep, LSL #1 + VST1 dTRes0[0], [pDst], dstStep + VST1 dTRes2[0], [Temp], dstStep + VST1 dTRes1[0], [pDst] + VST1 dTRes3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_b + ;// Case k + M_PRINTF "Case b \n" + SUB pSrc, pSrc, srcStep, LSL #1 + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe + VEXT dTempP0, dTempP0, dTempP1, #3 + VEXT dTempQ0, dTempQ0, dTempQ1, #3 + VEXT dTempR0, dTempR0, dTempR1, #3 + VEXT dTempS0, dTempS0, dTempS1, #3 + + VQRSHRUN dTResult0, qTempP01, #5 + VQRSHRUN dTResult1, 
qTempQ01, #5 + VQRSHRUN dTResult2, qTempR01, #5 + VQRSHRUN dTResult3, qTempS01, #5 + + VRHADD dTAcc0, dTAcc0, dTResult0 + VRHADD dTAcc2, dTAcc2, dTResult2 + VRHADD dTAcc1, dTAcc1, dTResult1 + VRHADD dTAcc3, dTAcc3, dTResult3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dTRes0[0], [pDst], dstStep + VST1 dTRes2[0], [Temp], dstStep + VST1 dTRes1[0], [pDst] + VST1 dTRes3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_c + ;// Case n + M_PRINTF "Case c \n" + + SUB pSrc, pSrc, srcStep, LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + VRHADD dAccV0, dAccV0, dSrc1 + VRHADD dAccV2, dAccV2, dSrc3 + VRHADD dAccV1, dAccV1, dSrc2 + VRHADD dAccV3, dAccV3, dSrc4 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultV0[0], [pDst], dstStep + VST1 dResultV2[0], [Temp], dstStep + VST1 dResultV1[0], [pDst] + VST1 dResultV3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_d + ;// Case p + M_PRINTF "Case d \n" + + MOV pSrcBK, pSrc + SUB pSrc, pSrc, srcStep, LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + ADD pSrc, pSrcBK, srcStep + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD dAccH0, dAccH0, dAccV0 + VRHADD dAccH2, dAccH2, dAccV2 + VRHADD dAccH1, dAccH1, dAccV1 + VRHADD dAccH3, dAccH3, dAccV3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultH0[0], [pDst], dstStep + VST1 dResultH2[0], [Temp], dstStep + VST1 dResultH1[0], [pDst] + VST1 dResultH3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_e + ;// Case q + M_PRINTF "Case e \n" + + SUB pSrc, pSrc, srcStep, LSL #1 + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + VQRSHRUN dTResult0, qRes3, #5 + VQRSHRUN dTResult1, qRes4, #5 + VQRSHRUN dTResult2, qRes5, #5 + VQRSHRUN dTResult3, qRes6, #5 + + VRHADD dTAcc0, dTAcc0, dTResult0 + VRHADD dTAcc2, dTAcc2, dTResult2 + VRHADD dTAcc1, dTAcc1, dTResult1 + VRHADD dTAcc3, dTAcc3, dTResult3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dTRes0[0], [pDst], dstStep + VST1 dTRes2[0], [Temp], dstStep + VST1 
dTRes1[0], [pDst] + VST1 dTRes3[0], [Temp] + M_ADR pArgs, ppArgs + B Block4x4LoopEnd +Case_f + ;// Case r + M_PRINTF "Case f \n" + MOV pSrcBK, pSrc + ADD pSrc, pSrc, #1 + SUB pSrc, pSrc, srcStep, LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + ADD pSrc, pSrcBK, srcStep + SUB pSrc, pSrc, #2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD dAccH0, dAccH0, dAccV0 + VRHADD dAccH2, dAccH2, dAccV2 + VRHADD dAccH1, dAccH1, dAccV1 + VRHADD dAccH3, dAccH3, dAccV3 + ADD Temp, pDst, dstStep, LSL #1 + VST1 dResultH0[0], [pDst], dstStep + VST1 dResultH2[0], [Temp], dstStep + VST1 dResultH1[0], [pDst] + VST1 dResultH3[0], [Temp] + M_ADR pArgs, ppArgs + + +Block4x4LoopEnd + + ;// Width Loop + ;//M_ADR pArgs, ppArgs + LDM pArgs, {pSrc,srcStep,pDst,dstStep} ;// Load arguments + SUBS iWidth, iWidth, #4 + ADD pSrc, pSrc, #4 + ADD pDst, pDst, #4 + BGT Block4x4WidthLoop + + ;// Height Loop + SUBS iHeight, iHeight, #4 + M_LDR iWidth, ptrWidth + M_ADR pArgs, ppArgs + ADD pSrc, pSrc, srcStep, LSL #2 + ADD pDst, pDst, dstStep, LSL #2 + SUB pSrc, pSrc, iWidth + SUB pDst, pDst, iWidth + BGT Block4x4HeightLoop + +EndOfInterpolation + MOV r0, #0 + M_END + + ENDIF + ;// End of CortexA8 + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s new file mode 100755 index 0000000..3a60705 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s @@ -0,0 +1,436 @@ +;// +;// +;// File Name: omxVCM4P10_PredictIntraChroma_8x8_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + EXPORT armVCM4P10_pIndexTable8x8 + +;// Define the processor variants supported by this file + + M_VARIANTS CortexA8 + + AREA table, DATA +;//------------------------------------------------------- +;// This table implements a C-style switch statement in asm by +;// the method of two levels of indexing. +;//------------------------------------------------------- + + M_TABLE armVCM4P10_pIndexTable8x8 + DCD OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR + DCD OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE + +;// Multipliers loaded by the OMX_VC_CHROMA_PLANE case + M_TABLE armVCM4P10_MultiplierTableChroma8x8,1 + DCW 3, 2, 1,4 + DCW -3,-2,-1,0 + DCW 1, 2, 3,4 + + + + IF CortexA8 + +;//-------------------------------------------- +;// Scratch variable +;//-------------------------------------------- + +pc RN 15 +return RN 0 +pTable RN 8 + +;//-------------------------------------------- +;// Input Arguments +;//-------------------------------------------- +pSrcLeft RN 0 ;// input pointer +pSrcAbove RN 1 ;// input pointer +pSrcAboveLeft RN 2 ;// input pointer +pDst RN 3 ;// output pointer +leftStep RN 4 ;// input variable +dstStep RN 5 ;// input variable +predMode RN 6 ;// input variable +availability RN 7 ;// input variable +pMultiplierTable RN 2 ;// shares r2 with pSrcAboveLeft + +pTmp RN 9 +step RN 10 + +;//--------------------- +;// Neon Registers +;//--------------------- + +;// OMX_VC_CHROMA_HOR + +dLeftVal0 DN D0.8 +dLeftVal1 DN D1.8 +dLeftVal2 DN D2.8 +dLeftVal3 DN D3.8 +dLeftVal4 DN D4.8 +dLeftVal5 DN D5.8 +dLeftVal6 DN D6.8 +dLeftVal7 DN D7.8 + +;// OMX_VC_CHROMA_VERT + +dAboveVal DN D0.U8 + +;// OMX_VC_CHROMA_DC + +dLeftVal DN D1.U8 +dSumAboveValU16 DN D2.U16 +dSumAboveValU32 DN D3.U32 +dSumAboveValU8 DN D3.U8 +dSumLeftValU16 DN D2.U16 +dSumLeftValU32 DN D1.U32 +dSumLeftValU8 DN D1.U8 +dSumAboveLeft DN D2.U32 +dSumAboveLeftU8 DN D2.U8 +dIndexRow0U8 DN D5.U8 +dIndexRow0 DN D5.U64 +dIndexRow4U8 DN D6.U8 +dIndexRow4 DN D6.U64 +dDstRow0 DN D0.U8 +dDstRow4 DN D4.U8 +dConst128U8 DN D0.U8 + +;// 
OMX_VC_CHROMA_PLANE + +dRevAboveVal DN D3.U8 +dRevAboveValU64 DN D3.U64 +dAboveLeftVal DN D2.U8 +qAbove7minus0 QN Q3.S16 +qAboveDiff QN Q2.S16 +dIndex DN D8.U8 +dDiffAboveU8 DN D9.U8 +dDiffAboveS16 DN D9.S16 +dAboveDiff0U8 DN D4.U8 +dAboveDiff0U64 DN D4.U64 +dAbove7minus0U8 DN D6.U8 +dMultiplier DN D10.S16 +dHorPred DN D11.S16 +dRevLeftVal DN D3.U8 +dRevLeftValU64 DN D3.U64 +qLeft7minus0 QN Q7.S16 +qLeftDiff QN Q6.S16 +dDiffLeftU8 DN D16.U8 +dDiffLeftS16 DN D16.S16 +dLeftDiff0U8 DN D12.U8 +dLeftDiff0U64 DN D12.U64 +dLeft7minus0U8 DN D14.U8 +dVerPred DN D3.S16 +dHVValS16 DN D3.S16 +dHVValS32 DN D3.S32 +dHVTempS32 DN D2.S32 +qA QN Q0.S16 +qB QN Q2.S16 +qC QN Q3.S16 +qMultiplier QN Q5.S16 +dMultiplier0 DN D10.S16 +dMultiplier1 DN D11.S16 +qC0 QN Q0.S16 +qC1 QN Q1.S16 +qC2 QN Q4.S16 +qC3 QN Q5.S16 +qC4 QN Q6.S16 +qC5 QN Q7.S16 +qC6 QN Q8.S16 +qC7 QN Q9.S16 +qSum0 QN Q0.S16 +qSum1 QN Q1.S16 +qSum2 QN Q4.S16 +qSum3 QN Q5.S16 +qSum4 QN Q6.S16 +qSum5 QN Q7.S16 +qSum6 QN Q8.S16 +qSum7 QN Q9.S16 +dSum0 DN D0.U8 +dSum1 DN D1.U8 +dSum2 DN D2.U8 +dSum3 DN D3.U8 +dSum4 DN D4.U8 +dSum5 DN D5.U8 +dSum6 DN D6.U8 +dSum7 DN D7.U8 + +;//----------------------------------------------------------------------------------------------- +;// omxVCM4P10_PredictIntraChroma_8x8 starts +;// Loads leftStep, dstStep, predMode and availability from the stack, then +;// jumps to the handler selected by predMode via armVCM4P10_pIndexTable8x8. +;// Every handler stores eight 8-byte rows to pDst and sets r0 to OMX_Sts_NoErr. +;//----------------------------------------------------------------------------------------------- + + ;// Write function header + M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15 + + ;// Define stack arguments + M_ARG LeftStep, 4 + M_ARG DstStep, 4 + M_ARG PredMode, 4 + M_ARG Availability, 4 + + LDR pTable,=armVCM4P10_pIndexTable8x8 ;// Load index table for switch case + + ;// Load argument from the stack + M_LDR predMode, PredMode ;// Arg predMode loaded from stack to reg + M_LDR leftStep, LeftStep ;// Arg leftStep loaded from stack to reg + M_LDR dstStep, DstStep ;// Arg dstStep loaded from stack to reg + M_LDR availability, Availability ;// Arg availability loaded from stack to reg + + + LDR pc, [pTable, 
predMode, LSL #2] ;// Branch to the case based on predMode + +OMX_VC_CHROMA_DC + + TST availability, #OMX_VC_LEFT + BEQ DCChroma8x8LeftNotAvailable + + ADD pTmp, pSrcLeft, leftStep + ADD step, leftStep, leftStep + + ;// Load Left Edge + VLD1 {dLeftVal[0]},[pSrcLeft],step ;// pSrcLeft[0*leftStep] + VLD1 {dLeftVal[1]},[pTmp],step ;// pSrcLeft[1*leftStep] + VLD1 {dLeftVal[2]},[pSrcLeft],step ;// pSrcLeft[2*leftStep] + VLD1 {dLeftVal[3]},[pTmp],step ;// pSrcLeft[3*leftStep] + VLD1 {dLeftVal[4]},[pSrcLeft],step ;// pSrcLeft[4*leftStep] + VLD1 {dLeftVal[5]},[pTmp],step ;// pSrcLeft[5*leftStep] + VLD1 {dLeftVal[6]},[pSrcLeft],step ;// pSrcLeft[6*leftStep] + VLD1 {dLeftVal[7]},[pTmp] ;// pSrcLeft[7*leftStep] + + TST availability, #OMX_VC_UPPER + BEQ DCChroma8x8LeftOnlyAvailable + + ;// Load Upper Edge also + VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[0 to 7] + + MOV return, #OMX_Sts_NoErr ;// returnNoError + + VPADDL dSumAboveValU16, dAboveVal ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ] + VPADDL dSumAboveValU32, dSumAboveValU16 ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ] + + VPADDL dSumLeftValU16, dLeftVal ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ] + VPADDL dSumLeftValU32, dSumLeftValU16 ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ] + + VADD dSumAboveLeft,dSumAboveValU32,dSumLeftValU32 + VRSHR dSumAboveLeft,dSumAboveLeft,#3 ;// Sum = (Sum + 4) >> 3 + VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2 + VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2 + + VMOV dIndexRow0U8,#0x0c + VMOV dIndexRow4U8,#0x04 + VSHL dIndexRow0,dIndexRow0,#32 ;// index0 = 0x0c0c0c0c00000000 + VSHR dIndexRow4,dIndexRow4,#32 ;// index4 = 0x0000000004040404 + VADD dIndexRow4U8,dIndexRow4U8,dIndexRow0U8 ;// index4 = 0x0c0c0c0c04040404 + VTBL dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8 + VTBL dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8 + +DCChroma8x8LeftStore + ADD pTmp, pDst, dstStep + ADD step, dstStep, dstStep + + VST1 dDstRow0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7 + VST1 
dDstRow0,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7 + VST1 dDstRow4,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7 + VST1 dDstRow4,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7 + VST1 dDstRow4,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7 + VST1 dDstRow4,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7 + + M_EXIT + + +DCChroma8x8LeftOnlyAvailable + + MOV return, #OMX_Sts_NoErr + + VPADDL dSumLeftValU16, dLeftVal ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ] + VPADDL dSumLeftValU32, dSumLeftValU16 ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ] + VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2 + + VDUP dDstRow0,dSumLeftValU8[0] + VDUP dDstRow4,dSumLeftValU8[4] + + B DCChroma8x8LeftStore + + +DCChroma8x8LeftNotAvailable + + TST availability, #OMX_VC_UPPER + BEQ DCChroma8x8NoneAvailable + + ;// Load Upper Edge + VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[0 to 7] + MOV return, #OMX_Sts_NoErr ;// returnNoError + + VPADDL dSumAboveValU16, dAboveVal ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ] + VPADDL dSumAboveValU32, dSumAboveValU16 ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ] + VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2 + VMOV dIndexRow0U8,#0x04 + VSHL dIndexRow0,dIndexRow0,#32 ;// index = 0x0404040400000000 + VTBL dDstRow0,{dSumAboveValU8},dIndexRow0U8 + + B DCChroma8x8UpperStore + + +DCChroma8x8NoneAvailable + + VMOV dConst128U8,#0x80 ;// 0x8080808080808080 if(count == 0); dConst128U8 and dDstRow0 are both D0 + MOV return, #OMX_Sts_NoErr ;// returnNoError + +DCChroma8x8UpperStore + + ADD pTmp, pDst, dstStep + ADD step, dstStep, dstStep + + VST1 dDstRow0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pTmp],step ;// 
pDst[5*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7 + VST1 dDstRow0,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7 + + M_EXIT + + +OMX_VC_CHROMA_VERT + + VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[x] :0<= x <= 7 + MOV return, #OMX_Sts_NoErr + + B DCChroma8x8UpperStore ;// dAboveVal and dDstRow0 are both D0 + + +OMX_VC_CHROMA_HOR + + ADD pTmp, pSrcLeft, leftStep + ADD step, leftStep, leftStep + + VLD1 {dLeftVal0[]},[pSrcLeft],step ;// pSrcLeft[0*leftStep] + VLD1 {dLeftVal1[]},[pTmp],step ;// pSrcLeft[1*leftStep] + VLD1 {dLeftVal2[]},[pSrcLeft],step ;// pSrcLeft[2*leftStep] + VLD1 {dLeftVal3[]},[pTmp],step ;// pSrcLeft[3*leftStep] + VLD1 {dLeftVal4[]},[pSrcLeft],step ;// pSrcLeft[4*leftStep] + VLD1 {dLeftVal5[]},[pTmp],step ;// pSrcLeft[5*leftStep] + VLD1 {dLeftVal6[]},[pSrcLeft],step ;// pSrcLeft[6*leftStep] + VLD1 {dLeftVal7[]},[pTmp] ;// pSrcLeft[7*leftStep] + + B DCChroma8x8PlaneStore ;// dLeftVal0-7 and dSum0-7 are both D0-D7 + + +OMX_VC_CHROMA_PLANE + ADD pTmp, pSrcLeft, leftStep + ADD step, leftStep, leftStep + + VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[x] :0<= x <= 7 + VLD1 dAboveLeftVal[0],[pSrcAboveLeft] + + VLD1 {dLeftVal[0]},[pSrcLeft],step ;// pSrcLeft[0*leftStep] + VLD1 {dLeftVal[1]},[pTmp],step ;// pSrcLeft[1*leftStep] + VLD1 {dLeftVal[2]},[pSrcLeft],step ;// pSrcLeft[2*leftStep] + VLD1 {dLeftVal[3]},[pTmp],step ;// pSrcLeft[3*leftStep] + VLD1 {dLeftVal[4]},[pSrcLeft],step ;// pSrcLeft[4*leftStep] + VLD1 {dLeftVal[5]},[pTmp],step ;// pSrcLeft[5*leftStep] + VLD1 {dLeftVal[6]},[pSrcLeft],step ;// pSrcLeft[6*leftStep] + VLD1 {dLeftVal[7]},[pTmp] ;// pSrcLeft[7*leftStep] + + + VREV64 dRevAboveVal,dAboveVal ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7] + VSUBL qAbove7minus0,dRevAboveVal,dAboveLeftVal ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0] + VSHR dRevAboveValU64,dRevAboveValU64,#8 ;// pSrcAbove[X:0:1:2:3:4:5:6] + VSUBL qAboveDiff,dRevAboveVal,dAboveVal ;// pSrcAbove[6] - pSrcAbove[0] + ;// pSrcAbove[5] - pSrcAbove[1] + ;// pSrcAbove[4] - pSrcAbove[2] + + VREV64 dRevLeftVal,dLeftVal 
;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7] + VSUBL qLeft7minus0,dRevLeftVal,dAboveLeftVal ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0] + VSHR dRevLeftValU64,dRevLeftValU64,#8 ;// pSrcLeft[X:0:1:2:3:4:5:6] + VSUBL qLeftDiff,dRevLeftVal,dLeftVal ;// pSrcLeft[6] - pSrcLeft[0] + ;// pSrcLeft[5] - pSrcLeft[1] + ;// pSrcLeft[4] - pSrcLeft[2] + + LDR pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8 ;// Used to calculate Hval & Vval + VSHL dAboveDiff0U64,dAboveDiff0U64,#16 + VEXT dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2 ;// pSrcAbove[ 7-0 | 4-2 | 5-1 | 6-0 ] + VLD1 dMultiplier,[pMultiplierTable]! + VSHL dLeftDiff0U64,dLeftDiff0U64,#16 + VEXT dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2 ;// pSrcLeft[ 7-0 | 4-2 | 5-1 | 6-0 ] + + + VMUL dHorPred,dDiffAboveS16,dMultiplier ;// pSrcAbove[ 4*(7-0) | 1*(4-2) | 2*(5-1) | 3*(6-0) ] + VMUL dVerPred,dDiffLeftS16,dMultiplier + VPADD dHVValS16,dHorPred,dVerPred + + + VPADDL dHVValS32,dHVValS16 ;// [V|H] in 32 bits each + VSHL dHVTempS32,dHVValS32,#4 ;// 17*H = 16*H + H = (H<<4)+H + VADD dHVValS32,dHVValS32,dHVTempS32 ;// [ 17*V | 17*H ]in 32 bits each + VLD1 {dMultiplier0,dMultiplier1},[pMultiplierTable] ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ] + VRSHR dHVValS32,dHVValS32,#5 ;// [c|b] in 16bits each + VADDL qA,dAboveVal,dLeftVal + VDUP qA,qA[7] + VSHL qA,qA,#4 ;// [a|a|a|a|a|a|a|a] + VDUP qB,dHVValS16[0] ;// [b|b|b|b|b|b|b|b] + VDUP qC,dHVValS16[2] ;// [c|c|c|c|c|c|c|c] + + + VMUL qB,qB,qMultiplier + VMUL qC,qC,qMultiplier + VADD qB,qB,qA + + VDUP qC0,qC[0] + VDUP qC1,qC[1] + VDUP qC2,qC[2] + VDUP qC3,qC[3] + VDUP qC4,qC[4] + VDUP qC5,qC[5] + VDUP qC6,qC[6] + VDUP qC7,qC[7] + + VADD qSum0,qB,qC0 + VADD qSum1,qB,qC1 + VADD qSum2,qB,qC2 + VADD qSum3,qB,qC3 + VADD qSum4,qB,qC4 + VADD qSum5,qB,qC5 + VADD qSum6,qB,qC6 + VADD qSum7,qB,qC7 + + VQRSHRUN dSum0,qSum0,#5 ;// (OMX_U8)armClip(0,255,(Sum+16)>>5) + VQRSHRUN dSum1,qSum1,#5 + VQRSHRUN dSum2,qSum2,#5 + VQRSHRUN dSum3,qSum3,#5 + VQRSHRUN dSum4,qSum4,#5 + 
VQRSHRUN dSum5,qSum5,#5 + VQRSHRUN dSum6,qSum6,#5 + VQRSHRUN dSum7,qSum7,#5 + +DCChroma8x8PlaneStore + ADD pTmp, pDst, dstStep + ADD step, dstStep, dstStep + + VST1 dSum0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7 + VST1 dSum1,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7 + VST1 dSum2,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7 + VST1 dSum3,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7 + VST1 dSum4,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7 + VST1 dSum5,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7 + VST1 dSum6,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7 + VST1 dSum7,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7 + + MOV return, #OMX_Sts_NoErr + M_END + + ENDIF ;// CortexA8 + + END +;//----------------------------------------------------------------------------------------------- +;// omxVCM4P10_PredictIntraChroma_8x8 ends +;//----------------------------------------------------------------------------------------------- diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s new file mode 100755 index 0000000..e9c0eee --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s @@ -0,0 +1,424 @@ +;// +;// +;// File Name: omxVCM4P10_PredictIntra_16x16_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + +;//------------------------------------------------------- +;// This table implements a C-style switch statement in asm by +;// the method of two levels of indexing. 
+;//------------------------------------------------------- + + M_TABLE armVCM4P10_pIndexTable16x16 + DCD OMX_VC_16X16_VERT, OMX_VC_16X16_HOR + DCD OMX_VC_16X16_DC, OMX_VC_16X16_PLANE + + + IF CortexA8 + +;// Multipliers loaded by the OMX_VC_16X16_PLANE case + M_TABLE armVCM4P10_MultiplierTable16x16,1 + DCW 7, 6, 5, 4, 3, 2, 1, 8 + DCW 0, 1, 2, 3, 4, 5, 6, 7 + DCW 8, 9, 10, 11, 12, 13, 14, 15 + +;//-------------------------------------------- +;// Constants +;//-------------------------------------------- +BLK_SIZE EQU 0x10 +MUL_CONST0 EQU 0x01010101 +MUL_CONST1 EQU 0x00060004 +MUL_CONST2 EQU 0x00070005 +MUL_CONST3 EQU 0x00030001 +MASK_CONST EQU 0x00FF00FF + +;//-------------------------------------------- +;// Scratch variable +;//-------------------------------------------- +y RN 12 +pc RN 15 + +return RN 0 +pTable RN 9 +count RN 11 +pMultTable RN 9 +; ---------------------------------------------- +; Neon registers +; ---------------------------------------------- +qAbove QN Q0.U8 +qLeft QN Q1.U8 +qSum8 QN Q0.U16 +dSum80 DN D0.U16 +dSum81 DN D1.U16 +dSum4 DN D0.U16 +dSum2 DN D0.U32 +dSum1 DN D0.U64 +qOut QN Q3.U8 +dSumLeft DN D6.U64 +dSumAbove DN D7.U64 +dSum DN D8.U64 +dSum0 DN D8.U8[0] + +qH QN Q11.S32 +qV QN Q12.S32 +qA QN Q11.S16 +qB QN Q6.S16 +qC QN Q7.S16 + +qB0 QN Q5.S16 +qB1 QN Q6.S16 +dA1 DN D23.S16 + +dH0 DN D22.S32 +dH1 DN D23.S32 +dV0 DN D24.S32 +dV1 DN D25.S32 + +qHV QN Q11.S64 +qHV0 QN Q11.S32 +qHV1 QN Q12.S64 + +dHV00 DN D22.S32 +dHV01 DN D23.S32 + +dHV0 DN D22.S16[0] +dHV1 DN D23.S16[0] +dHV10 DN D24.S64 +dHV11 DN D25.S64 + +qSum0 QN Q0.S16 +qSum1 QN Q1.S16 + +dOut0 DN D6.U8 +dOut1 DN D7.U8 + +dLeft0 DN D2.U8 +dLeft1 DN D3.U8 +qConst QN Q13.S16 + +dAbove0 DN D0.U8 +dAbove1 DN D1.U8 + +dRevLeft64 DN D12.U64 +dRevLeft DN D12.U8 +dRevAbove64 DN D5.U64 +dRevAbove DN D5.U8 +qLeftDiff QN Q8.S16 +dLeftDiff1 DN D17.S16 +dLeftDiff64 DN D17.S64 +qDiffLeft QN Q8.S16 +qDiffAbove QN Q4.S16 +dAboveDiff1 DN D9.S16 +dAboveDiff64 DN D9.S64 +qAboveDiff QN Q4.S16 + +dAboveLeft DN D4.U8 + +dDiffLeft0 DN D16.S16 
+dDiffLeft1 DN D17.S16 +dDiffAbove0 DN D8.S16 +dDiffAbove1 DN D9.S16 + +qLeft15minus0 QN Q7.S16 +dLeft15minus0 DN D14.S16 +qAbove15minus0 QN Q3.S16 +dAbove15minus0 DN D6.S16 + +qMultiplier QN Q10.S16 +qMultiplier0 QN Q10.S16 +qMultiplier1 QN Q12.S16 +dMultiplier0 DN D20.S16 +dMultiplier1 DN D21.S16 + +dBPlusCMult7 DN D1.S64 +dBPlusCMult7S16 DN D1.S16 + +qTmp QN Q0.U8 + +;//-------------------------------------------- +;// Declare input registers +;//-------------------------------------------- +pSrcLeft RN 0 ;// input pointer +pSrcAbove RN 1 ;// input pointer +pSrcAboveLeft RN 2 ;// input pointer +pDst RN 3 ;// output pointer +leftStep RN 4 ;// input variable +dstStep RN 5 ;// input variable +predMode RN 6 ;// input variable +availability RN 7 ;// input variable + +pTmp RN 8 +step RN 10 +pTmp2 RN 11 + +;//----------------------------------------------------------------------------------------------- +;// omxVCM4P10_PredictIntra_16x16 starts +;// Loads leftStep, dstStep, predMode and availability from the stack, then +;// jumps to the handler selected by predMode via armVCM4P10_pIndexTable16x16. +;// Every handler sets r0 to OMX_Sts_NoErr before leaving. +;//----------------------------------------------------------------------------------------------- + + ;// Write function header + M_START omxVCM4P10_PredictIntra_16x16, r11, d15 + + ;// Define stack arguments + M_ARG LeftStep, 4 + M_ARG DstStep, 4 + M_ARG PredMode, 4 + M_ARG Availability, 4 + + ;// M_STALL ARM1136JS=4 + + LDR pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case + + ;// Load argument from the stack + M_LDR predMode, PredMode ;// Arg predMode loaded from stack to reg + M_LDR leftStep, LeftStep ;// Arg leftStep loaded from stack to reg + M_LDR dstStep, DstStep ;// Arg dstStep loaded from stack to reg + M_LDR availability, Availability ;// Arg availability loaded from stack to reg + + MOV y, #BLK_SIZE ;// Outer Loop Count + LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case based on predMode + +OMX_VC_16X16_VERT + VLD1 qAbove, [pSrcAbove] + ADD pTmp, pDst, dstStep + ADD step, dstStep, dstStep + VST1 qAbove, [pDst], step + VST1 qAbove, [pTmp], step + VST1 qAbove, [pDst], step + VST1 
qAbove, [pTmp], step + VST1 qAbove, [pDst], step + VST1 qAbove, [pTmp], step + VST1 qAbove, [pDst], step + VST1 qAbove, [pTmp], step + VST1 qAbove, [pDst], step + VST1 qAbove, [pTmp], step + VST1 qAbove, [pDst], step + VST1 qAbove, [pTmp], step + VST1 qAbove, [pDst], step + VST1 qAbove, [pTmp], step + VST1 qAbove, [pDst] + VST1 qAbove, [pTmp] + MOV return, #OMX_Sts_NoErr ;// returnNoError + M_EXIT + +OMX_VC_16X16_HOR + ADD pTmp, pSrcLeft, leftStep + ADD leftStep, leftStep, leftStep + ADD pTmp2, pDst, dstStep + ADD dstStep, dstStep, dstStep +LoopHor + VLD1 {qLeft[]}, [pSrcLeft], leftStep + VLD1 {qTmp[]}, [pTmp], leftStep + SUBS y, y, #8 + VST1 qLeft, [pDst], dstStep + VST1 qTmp, [pTmp2], dstStep + VLD1 {qLeft[]}, [pSrcLeft], leftStep + VLD1 {qTmp[]}, [pTmp], leftStep + VST1 qLeft, [pDst], dstStep + VST1 qTmp, [pTmp2], dstStep + VLD1 {qLeft[]}, [pSrcLeft], leftStep + VLD1 {qTmp[]}, [pTmp], leftStep + VST1 qLeft, [pDst], dstStep + VST1 qTmp, [pTmp2], dstStep + VLD1 {qLeft[]}, [pSrcLeft], leftStep + VLD1 {qTmp[]}, [pTmp], leftStep + VST1 qLeft, [pDst], dstStep + VST1 qTmp, [pTmp2], dstStep + + BNE LoopHor ;// Two passes of 8 rows each (y: 16 -> 8 -> 0) + MOV return, #OMX_Sts_NoErr + M_EXIT + +OMX_VC_16X16_DC + MOV count, #0 ;// count = 0 + TST availability, #OMX_VC_LEFT + BEQ UpperOrNoneAvailable ;// Jump to Upper if not left + + ADD pTmp, pSrcLeft, leftStep + ADD step, leftStep, leftStep + + VLD1 {qLeft[0]}, [pSrcLeft],step + VLD1 {qLeft[1]}, [pTmp],step + VLD1 {qLeft[2]}, [pSrcLeft],step + VLD1 {qLeft[3]}, [pTmp],step + VLD1 {qLeft[4]}, [pSrcLeft],step + VLD1 {qLeft[5]}, [pTmp],step + VLD1 {qLeft[6]}, [pSrcLeft],step + VLD1 {qLeft[7]}, [pTmp],step + VLD1 {qLeft[8]}, [pSrcLeft],step + VLD1 {qLeft[9]}, [pTmp],step + VLD1 {qLeft[10]},[pSrcLeft],step + VLD1 {qLeft[11]},[pTmp],step + VLD1 {qLeft[12]},[pSrcLeft],step + VLD1 {qLeft[13]},[pTmp],step + VLD1 {qLeft[14]},[pSrcLeft],step + VLD1 {qLeft[15]},[pTmp] + + VPADDL qSum8, qLeft + ADD count, count, #1 + VPADD dSum4, dSum80, dSum81 + VPADDL dSum2, 
dSum4 + VPADDL dSumLeft, dSum2 + VRSHR dSum, dSumLeft, #4 + +UpperOrNoneAvailable + TST availability, #OMX_VC_UPPER ;// if(availability & #OMX_VC_UPPER) + BEQ BothOrNoneAvailable ;// Skip the upper-edge sum if not upper + VLD1 qAbove, [pSrcAbove] + ADD count, count, #1 ;// if upper inc count by 1 + VPADDL qSum8, qAbove + VPADD dSum4, dSum80, dSum81 + VPADDL dSum2, dSum4 + VPADDL dSumAbove, dSum2 + VRSHR dSum, dSumAbove, #4 + +BothOrNoneAvailable + CMP count, #2 ;// check if both available + BNE NoneAvailable + VADD dSum, dSumAbove, dSumLeft + VRSHR dSum, dSum, #5 + + +NoneAvailable + VDUP qOut, dSum0 + CMP count, #0 ;// check if none available + ADD pTmp, pDst, dstStep + ADD step, dstStep, dstStep + BNE LoopDC + VMOV qOut, #128 +LoopDC + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + VST1 qOut, [pDst], step + VST1 qOut, [pTmp], step + MOV return, #OMX_Sts_NoErr + M_EXIT + +OMX_VC_16X16_PLANE + LDR pMultTable, =armVCM4P10_MultiplierTable16x16 + VLD1 qAbove, [pSrcAbove] ;// pSrcAbove[x] :0<= x <= 15 + VLD1 dAboveLeft[0],[pSrcAboveLeft] + ADD pTmp, pSrcLeft, leftStep + ADD step, leftStep, leftStep + VLD1 {qLeft[0]}, [pSrcLeft],step + VLD1 {qLeft[1]}, [pTmp],step + VLD1 {qLeft[2]}, [pSrcLeft],step + VLD1 {qLeft[3]}, [pTmp],step + VLD1 {qLeft[4]}, [pSrcLeft],step + VLD1 {qLeft[5]}, [pTmp],step + VLD1 {qLeft[6]}, [pSrcLeft],step + VLD1 {qLeft[7]}, [pTmp],step + VLD1 {qLeft[8]}, [pSrcLeft],step + VLD1 {qLeft[9]}, [pTmp],step + VLD1 {qLeft[10]}, [pSrcLeft],step + VLD1 {qLeft[11]}, [pTmp],step + VLD1 {qLeft[12]}, [pSrcLeft],step + VLD1 {qLeft[13]}, [pTmp],step + VLD1 {qLeft[14]}, [pSrcLeft],step + VLD1 {qLeft[15]}, [pTmp] + + VREV64 dRevAbove, dAbove1 ;// 
pSrcAbove[15:14:13:12:11:10:9:8] + VSUBL qAbove15minus0, dRevAbove, dAboveLeft ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0] + VSHR dRevAbove64, dRevAbove64, #8 ;// pSrcAbove[14:13:12:11:10:9:8:X] + VSUBL qAboveDiff, dRevAbove, dAbove0 + + VSHL dAboveDiff64, dAboveDiff64, #16 + VEXT dDiffAbove1, dAboveDiff1, dAbove15minus0, #1 + + VREV64 dRevLeft,dLeft1 ;// pSrcLeft[15:14:13:12:11:10:9:8] + VSUBL qLeft15minus0,dRevLeft, dAboveLeft ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0] + VSHR dRevLeft64, dRevLeft64, #8 ;// pSrcLeft[14:13:12:11:10:9:8:X] + VSUBL qLeftDiff,dRevLeft, dLeft0 + + ;// Multiplier = [8|1|2|...|6|7] + VLD1 qMultiplier, [pMultTable]! + + VSHL dLeftDiff64, dLeftDiff64, #16 + VEXT dDiffLeft1, dLeftDiff1, dLeft15minus0, #1 + + VMULL qH,dDiffAbove0, dMultiplier0 + VMULL qV,dDiffLeft0, dMultiplier0 + VMLAL qH,dDiffAbove1, dMultiplier1 + VMLAL qV,dDiffLeft1, dMultiplier1 + + VPADD dHV00,dH1,dH0 + VPADD dHV01,dV1,dV0 + VPADDL qHV, qHV0 + VSHL qHV1,qHV,#2 + VADD qHV,qHV,qHV1 + + ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)] + VRSHR qHV,qHV,#6 + + ;// HV1 = [c*7|b*7] + VSHL qHV1,qHV,#3 + VSUB qHV1,qHV1,qHV + + ;// Multiplier0 = [0|1|2|...|7] + VLD1 qMultiplier0, [pMultTable]! 
+ VDUP qB, dHV0 + VDUP qC, dHV1 + + VADDL qA,dAbove1,dLeft1 + VSHL qA,qA, #4 + VDUP qA,dA1[3] + VADD dBPlusCMult7, dHV10, dHV11 + + ;// Multiplier1 = [8|9|10|...|15] + VLD1 qMultiplier1, [pMultTable] + ;// Const = a - 7*(b+c) + VDUP qConst, dBPlusCMult7S16[0] + VSUB qConst, qA, qConst + + ;// B0 = [0*b|1*b|2*b|3*b|......|7*b] + VMUL qB0,qB,qMultiplier0 + + ;// B1 = [8*b|9*b|10*b|11*b|....|15*b] + VMUL qB1,qB,qMultiplier1 + + VADD qSum0, qB0, qConst + VADD qSum1, qB1, qConst + + ;// Loops for 16 times +LoopPlane + ;// (b*x + c*y + C)>>5 + VQRSHRUN dOut0, qSum0,#5 + VQRSHRUN dOut1, qSum1,#5 + SUBS y, y, #1 + VST1 qOut,[pDst],dstStep + VADD qSum0,qSum0,qC + VADD qSum1,qSum1,qC + BNE LoopPlane + + MOV return, #OMX_Sts_NoErr + + M_END + + ENDIF ;// CortexA8 + + END +;----------------------------------------------------------------------------------------------- +; omxVCM4P10_PredictIntra_16x16 ends +;----------------------------------------------------------------------------------------------- diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s new file mode 100755 index 0000000..39eb8a4 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s @@ -0,0 +1,531 @@ +;// +;// +;// File Name: omxVCM4P10_PredictIntra_4x4_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + +;// Define the processor variants supported by this file + + M_VARIANTS CortexA8 + +;//------------------------------------------------------- +;// This table implements a C-style switch statement in asm by +;// the method of two levels of indexing. 
+;//------------------------------------------------------- + + M_TABLE armVCM4P10_pSwitchTable4x4 + DCD OMX_VC_4x4_VERT, OMX_VC_4x4_HOR + DCD OMX_VC_4x4_DC, OMX_VC_4x4_DIAG_DL + DCD OMX_VC_4x4_DIAG_DR, OMX_VC_4x4_VR + DCD OMX_VC_4x4_HD, OMX_VC_4x4_VL + DCD OMX_VC_4x4_HU + + + IF CortexA8 + +;//-------------------------------------------- +;// Scratch variable +;//-------------------------------------------- +return RN 0 +pTable RN 8 +pc RN 15 + +;//-------------------------------------------- +;// Declare input registers +;//-------------------------------------------- +pSrcLeft RN 0 ;// input pointer +pSrcAbove RN 1 ;// input pointer +pSrcAboveLeft RN 2 ;// input pointer +pDst RN 3 ;// output pointer +leftStep RN 4 ;// input variable +dstStep RN 5 ;// input variable +predMode RN 6 ;// input variable +availability RN 7 ;// input variable +pDst1 RN 1 +pDst2 RN 4 +pDst3 RN 6 + +pSrcTmp RN 9 +srcStep RN 10 +pDstTmp RN 11 +dstep RN 12 + +;//------------------- +;// Neon registers +;//------------------- + +;// OMX_VC_CHROMA_VERT +dAboveU32 DN D0.U32 + +;// OMX_VC_CHROMA_HOR +dLeftVal0 DN D0.8 +dLeftVal1 DN D1.8 +dLeftVal2 DN D2.8 +dLeftVal3 DN D3.8 +dLeftVal0U32 DN D0.U32 +dLeftVal1U32 DN D1.U32 +dLeftVal2U32 DN D2.U32 +dLeftVal3U32 DN D3.U32 + +;// OMX_VC_4x4_DC +dLeftVal DN D0.U8 +dLeftValU32 DN D0.U32 +dSumAboveLeftU16 DN D1.U16 +dSumAboveLeftU32 DN D1.U32 +dSumAboveLeftU64 DN D1.U64 +dSumAboveLeftU8 DN D1.U8 +dSum DN D0.U8 + +dSumLeftValU16 DN D1.U16 +dSumLeftValU32 DN D1.U32 +dSumLeftValU64 DN D1.U64 +dSumLeftValU8 DN D1.U8 + +dAboveVal DN D0.U8 +dSumAboveValU16 DN D1.U16 +dSumAboveValU32 DN D1.U32 +dSumAboveValU64 DN D1.U64 +dSumAboveValU8 DN D1.U8 +dConst128U8 DN D0.U8 + + +;//OMX_VC_4x4_DIAG_DL + +dAbove DN D0.U8 +dU7 DN D2.U8 +dU3 DN D2.U8 +dAbove0 DN D3.U8 +dAbove1 DN D4.U8 +dAbove2 DN D5.U8 +dTmp DN D6.U8 +dTmp0 DN D7.U8 +dTmp1 DN D8.U8 +dTmp2 DN D9.U8 +dTmp3 DN D10.U8 +dTmpU32 DN D6.U32 + + +;//OMX_VC_4x4_DIAG_DR +dLeft DN D1.U8 +dUL DN D2.U8 + 
+;//OMX_VC_4x4_VR +dLeft0 DN D1.U8 +dLeft1 DN D2.U8 +dEven0 DN D3.U8 +dEven1 DN D4.U8 +dEven2 DN D5.U8 +dOdd0 DN D6.U8 +dOdd1 DN D11.U8 +dOdd2 DN D12.U8 +dTmp3U32 DN D10.U32 +dTmp2U32 DN D9.U32 + + +;//OMX_VC_4x4_HD +dTmp1U64 DN D8.U64 +dTmp0U64 DN D7.U64 +dTmpU64 DN D6.U64 +dTmpU32 DN D6.U32 +dTmp1U32 DN D8.U32 + +;//OMX_VC_4x4_HU +dL3 DN D2.U8 +dLeftHU0 DN D3.U8 +dLeftHU1 DN D4.U8 +dLeftHU2 DN D5.U8 +dTmp0U32 DN D7.U32 + + + + +;//----------------------------------------------------------------------------------------------- +;// omxVCM4P10_PredictIntra_4x4 starts +;//----------------------------------------------------------------------------------------------- + + ;// Write function header + M_START omxVCM4P10_PredictIntra_4x4, r12,d12 + + ;// Define stack arguments + M_ARG LeftStep, 4 + M_ARG DstStep, 4 + M_ARG PredMode, 4 + M_ARG Availability, 4 + + + LDR pTable,=armVCM4P10_pSwitchTable4x4 ;// Load index table for switch case + + ;// Load argument from the stack + M_LDRD predMode,availability,PredMode ;// Arg predMode & availability loaded from stack to reg + M_LDRD leftStep,dstStep,LeftStep ;// Arg leftStep & dstStep loaded from stack to reg + + + LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case based on preMode + + +OMX_VC_4x4_HOR + + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + ;// Load Left Edge + VLD1 {dLeftVal0[]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeftVal1[]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeftVal2[]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeftVal3[]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + + VST1 dLeftVal0U32[0],[pDst],dstep ;// pDst[0*dstStep+x] :0<= x <= 7 + VST1 dLeftVal1U32[0],[pDstTmp],dstep ;// pDst[1*dstStep+x] :0<= x <= 7 + VST1 dLeftVal2U32[0],[pDst] ;// pDst[2*dstStep+x] :0<= x <= 7 + VST1 dLeftVal3U32[0],[pDstTmp] ;// pDst[3*dstStep+x] :0<= x <= 7 + + B ExitPredict4x4 ;// Branch to exit code + 
+OMX_VC_4x4_VERT + + ;// Load Upper Edge + VLD1 dAboveU32[0],[pSrcAbove] + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + +DCPredict4x4VertStore + + VST1 dAboveU32[0],[pDst],dstep + VST1 dAboveU32[0],[pDstTmp],dstep + VST1 dAboveU32[0],[pDst] + VST1 dAboveU32[0],[pDstTmp] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_DC + + + TST availability, #OMX_VC_LEFT + BEQ DCPredict4x4LeftNotAvailable + + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + ;// Load Left Edge + VLD1 {dLeftVal[0]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeftVal[1]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeftVal[2]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeftVal[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + TST availability, #OMX_VC_UPPER + BEQ DCPredict4x4LeftOnlyAvailable + + ;// Load Upper Edge also + VLD1 dLeftValU32[1],[pSrcAbove] ;// pSrcAbove[0 to 3] + MOV return, #OMX_Sts_NoErr + + VPADDL dSumAboveLeftU16, dLeftVal ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]] + VPADDL dSumAboveLeftU32, dSumAboveLeftU16 ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]] + VPADDL dSumAboveLeftU64, dSumAboveLeftU32 ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]] + VRSHR dSumAboveLeftU64,dSumAboveLeftU64,#3 ;// Sum = (Sum + 4) >> 3 + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VDUP dSum,dSumAboveLeftU8[0] + + B DCPredict4x4VertStore + +DCPredict4x4LeftOnlyAvailable + + MOV return, #OMX_Sts_NoErr ;// returnNoError + + VPADDL dSumLeftValU16, dLeftVal ;// [ XX | pSrcLeft[2+3 | 0+1]] + VPADDL dSumLeftValU32, dSumLeftValU16 ;// [ XXXX | pSrcLeft[2+3+0+1]] + + VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2 + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VDUP dSum,dSumLeftValU8[0] + + B DCPredict4x4VertStore + +DCPredict4x4LeftNotAvailable + + TST availability, #OMX_VC_UPPER + BEQ DCPredict4x4NoneAvailable + + ;// Load Upper Edge + VLD1 dAboveU32[0],[pSrcAbove] ;// pSrcAbove[0 to 3] + MOV return, 
#OMX_Sts_NoErr + + VPADDL dSumAboveValU16, dAboveVal ;// [ XX | pSrcAbove[2+3 | 0+1]] + VPADDL dSumAboveValU32, dSumAboveValU16 ;// [ XXXX | pSrcAbove[2+3+0+1]] + + VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2 + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VDUP dSum,dSumAboveValU8[0] + + B DCPredict4x4VertStore + +DCPredict4x4NoneAvailable + + VMOV dConst128U8,#0x80 ;// 0x8080808080808080 if(count == 0) + MOV return, #OMX_Sts_NoErr + + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + B DCPredict4x4VertStore + + + +OMX_VC_4x4_DIAG_DL + + TST availability, #OMX_VC_UPPER_RIGHT + BEQ DiagDLUpperRightNotAvailable + + VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0] + VDUP dU7, dAbove0[7] ;// [U7|U7|U7|U7|U7|U7|U7|U7] + VEXT dAbove1, dAbove0, dU7, #1 ;// [U7|U7|U6|U5|U4|U3|U2|U1] + VEXT dAbove2, dAbove0, dU7, #2 ;// [U7|U7|U7|U6|U5|U4|U3|U2] + B DiagDLPredict4x4Store + +DiagDLUpperRightNotAvailable + VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-] + VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3] + + VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0] + VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1] + VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2] + +DiagDLPredict4x4Store + + VHADD dTmp, dAbove0, dAbove2 + VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2 + + + VST1 dTmpU32[0],[pDst],dstStep + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst],dstStep + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst],dstStep + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst] + + B ExitPredict4x4 ;// Branch to exit code + + +OMX_VC_4x4_DIAG_DR + + + ;// Load U0,U1,U2,U3 + + VLD1 dAboveU32[0],[pSrcAbove] ;// [X|X|X|X|U3|U2|U1|U0] + + ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X] + VLD1 {dLeft[7]},[pSrcAboveLeft] + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + ADD pDst1,pDst,dstStep + + VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 
{dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + + VEXT dAbove0,dLeft,dAbove,#3 ;// [U2|U1|U0|UL|L0|L1|L2|L3] + ADD pDst2,pDst1,dstStep + VEXT dAbove1,dLeft,dAbove,#4 ;// [U3|U2|U1|U0|UL|L0|L1|L2] + ADD pDst3,pDst2,dstStep + VEXT dAbove2,dLeft,dAbove,#5 ;// [ X|U3|U2|U1|U0|UL|L0|L1] + + VHADD dTmp, dAbove0, dAbove2 + VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2 + + + VST1 dTmpU32[0],[pDst3] ;// Store pTmp[0],[1],[2],[3] @ pDst3 + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst2] ;// Store pTmp[1],[2],[3],[4] @ pDst2 + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst1] ;// Store pTmp[2],[3],[4],[5] @ pDst1 + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst] ;// Store pTmp[3],[4],[5],[6] @ pDst + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_VR + + + ;// Load UL,U0,U1,U2,U3 + VLD1 dAboveU32[0],[pSrcAbove] + VLD1 dAbove[7],[pSrcAboveLeft] ;// [UL|X|X|X|U3|U2|U1|U0] + + ;// Load L0,L1,L2 ;// dLeft0 = [L0|L2|X|X|X|X|X|X] + ;// dLeft1 = [L1| X|X|X|X|X|X|X] + VLD1 {dLeft0[7]},[pSrcLeft],leftStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeft1[7]},[pSrcLeft],leftStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft0[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + + + VEXT dOdd2,dAbove,dAbove,#7 ;// [ x x x U3 U2 U1 U0 UL ] + VEXT dEven0,dLeft0,dOdd2,#6 ;// [ x x x U1 U0 UL L0 L2 ] + VEXT dEven1,dLeft1,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L1 ] + VEXT dEven2,dLeft0,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L0 ] + VEXT dOdd0,dLeft1,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L1 ] + VEXT dOdd1,dLeft0,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L0 ] + + VHADD dTmp1, dOdd0, dOdd2 + VRHADD dTmp1, dTmp1, dOdd1 ;// Tmp[ x x x 9 7 5 3 1 ] + + VHADD dTmp0, dEven0, dEven2 + VRHADD dTmp0, dTmp0, dEven1 ;// Tmp[ x x x 8 6 4 2 0 ] + + + VEXT dTmp3,dTmp1,dTmp1,#1 ;// Tmp[ x x x x 9 7 5 3 ] + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VEXT dTmp2,dTmp0,dTmp0,#1 ;// Tmp[ x x x x 8 6 4 2 ] + + + VST1 
dTmp3U32[0],[pDst],dstep ;// Tmp[9],[7],[5],[3] + VST1 dTmp2U32[0],[pDstTmp],dstep ;// Tmp[8],[6],[4],[2] + VST1 dTmp1U32[0],[pDst],dstep ;// Tmp[7],[5],[3],[1] + VST1 dTmp0U32[0],[pDstTmp] ;// Tmp[6],[4],[2],[0] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_HD + + + ;// Load U0,U1,U2,U3 + VLD1 dAbove,[pSrcAbove] ;//dAboveLeftVal = [U7|U6|U5|U4|U3|U2|U1|U0] + + ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X] + VLD1 {dLeft[7]},[pSrcAboveLeft] + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + + VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + VEXT dAbove0,dLeft,dAbove,#3 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ] + VEXT dAbove1,dLeft,dAbove,#2 ;// [ U1|U0|UL|L0|L1|L2|L3|X ] + VEXT dAbove2,dLeft,dAbove,#1 ;// [ U0|UL|L0|L1|L2|L3|X|X ] + + VHADD dTmp0, dAbove0, dAbove2 + VRHADD dTmp0, dTmp0, dAbove1 ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ] + + + VRHADD dTmp1, dAbove1, dAbove0 ;// (a+b+1)>>1 + VSHL dTmp1U64,dTmp1U64,#24 ;// Tmp[ 3|5| 7 |9 | X | X | X | X ] + + + VSHL dTmpU64,dTmp0U64,#16 ;// Tmp[ 2|4|6|8| X | X | X | X ] + VZIP dTmp1,dTmp ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ] + VEXT dTmp0,dTmp0,dTmp0,#6 ;// Tmp[ X| X| X| X| X| X| 0 | 1 ] + VEXT dTmp1,dTmp,dTmp0,#2 ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ] + + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + + VST1 dTmp1U32[1],[pDst],dstep ;// Store pTmp[0|1|2|3] + VST1 dTmpU32[1],[pDstTmp],dstep ;// Store pTmp[2|3|4|5] + VST1 dTmp1U32[0],[pDst] ;// Store pTmp[4|5|6|7] + VST1 dTmpU32[0],[pDstTmp] ;// Store pTmp[6|7|8|9] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_VL + + + TST availability, #OMX_VC_UPPER_RIGHT + BEQ DiagVLUpperRightNotAvailable + + VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0] + VEXT dAbove1,dAbove0,dAbove0,#1 ;// [ X|U7|U6|U5|U4|U3|U2|U1] + VEXT 
dAbove2,dAbove1,dAbove1,#1 ;// [ X| X|U7|U6|U5|U4|U3|U2] + + B DiagVLPredict4x4Store + +DiagVLUpperRightNotAvailable + VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-] + VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3] + + VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0] + VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1] + VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2] + +DiagVLPredict4x4Store + + VRHADD dTmp0, dAbove1, dAbove0 ;// (a+b+1)>>1 + ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ] + + VHADD dTmp3, dAbove0, dAbove2 + VRHADD dTmp3, dTmp3, dAbove1 ;// (a+2*b+c+2)>>2 + ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ] + + VEXT dTmp1,dTmp0,dTmp0,#1 ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ] + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VEXT dTmp2,dTmp3,dTmp1,#1 ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ] + + VST1 dTmp0U32[0],[pDst],dstep ;// Tmp[6],[4],[2],[0] + VST1 dTmp3U32[0],[pDstTmp],dstep ;// Tmp[7],[5],[3],[1] + VST1 dTmp1U32[0],[pDst] ;// Tmp[8],[6],[4],[2] + VST1 dTmp2U32[0],[pDstTmp] ;// Tmp[9],[7],[5],[3] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_HU + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + + ;// Load Left Edge ;// [L3|L2|L1|L0|X|X|X|X] + VLD1 {dLeft[4]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeft[7]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + VDUP dL3,dLeft[7] ;// [L3|L3|L3|L3|L3|L3|L3|L3] + + VEXT dLeftHU0,dLeft,dL3,#4 ;// [L3|L3|L3|L3|L3|L2|L1|L0] + VEXT dLeftHU1,dLeft,dL3,#5 ;// [L3|L3|L3|L3|L3|L3|L2|L1] + VEXT dLeftHU2,dLeft,dL3,#6 ;// [L3|L3|L3|L3|L3|L3|L3|L2] + + VHADD dTmp0, dLeftHU0, dLeftHU2 + VRHADD dTmp0, dTmp0, dLeftHU1 ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ] + + VRHADD dTmp1, dLeftHU1, dLeftHU0 ;// (a+b+1)>>1 + ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ] + + VZIP dTmp1,dTmp0 ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0] + ;// dTmp0 = 
[L3|L3|L3|L3|L3|L3|L3|L3] + + + VST1 dTmp1U32[0],[pDst],dstStep ;// [3|2|1|0] + VEXT dTmp1,dTmp1,dTmp1,#2 + VST1 dTmp1U32[0],[pDst],dstStep ;// [5|4|3|2] + VEXT dTmp1,dTmp1,dTmp1,#2 + VST1 dTmp1U32[0],[pDst],dstStep ;// [7|6|5|4] + VST1 dTmp0U32[0],[pDst] ;// [9|8|7|6] + + +ExitPredict4x4 + + MOV return, #OMX_Sts_NoErr + M_END + + ENDIF ;// CortexA8 + + END +;//----------------------------------------------------------------------------------------------- +;// omxVCM4P10_PredictIntra_4x4 ends +;//----------------------------------------------------------------------------------------------- diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s new file mode 100755 index 0000000..e394339 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s @@ -0,0 +1,140 @@ +;// +;// +;// File Name: omxVCM4P10_TransformDequantChromaDCFromPair_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + IMPORT armVCM4P10_QPDivTable + IMPORT armVCM4P10_VMatrixQPModTable + + M_VARIANTS CortexA8 + + + + + IF CortexA8 + +;// ARM Registers +;//-------------------------------------- +;// Declare input registers +;//-------------------------------------- +ppSrc RN 0 +pDst RN 1 +QP RN 2 + +;//-------------------------------- +;// Scratch variable for Unpack2x2 +;//-------------------------------- +pSrc RN 9 +Value RN 4 +Value2 RN 5 +Flag RN 6 +strOffset RN 7 +cstOffset RN 8 + +;//-------------------------------- +;// Scratch variable +;//-------------------------------- +r0w0 RN 3 +r0w1 RN 4 + +c0w0 RN 5 +c1w0 RN 6 + +return RN 0 +pQPDivTable RN 5 +pQPModTable RN 6 +Shift RN 9 +Scale RN 2 + + + +;// Neon Registers + +dZero DN D0.U16 +dInvTrCoeff DN D0.S16 +dScale DN D1.S16 +qDqntCoeff QN Q1.S32 +dDqntCoeff DN D2.S16 + + + ;// Write function header + M_START omxVCM4P10_TransformDequantChromaDCFromPair, r9 + + LDR pSrc, [ppSrc] ;// Load pSrc + VMOV dZero, #0 + MOV cstOffset, #31 ;// To be used in the loop, to compute offset + + ;//----------------------------------------------------------------------- + ;// Firstly, fill all the coefficient values on the <pDst> buffer by zero + ;//----------------------------------------------------------------------- + + VST1 dZero,[pDst] ;// pDst[0] = pDst[1] = pDst[2] = pDst[3] = 0 + LDRB Flag, [pSrc], #1 ;// Preload <Flag> before <unpackLoop> + + +unpackLoop + TST Flag, #0x10 ;// Computing (Flag & 0x10) + LDRSBNE Value2,[pSrc,#1] + LDRBNE Value, [pSrc], #2 ;// Load byte wise to avoid unaligned access + AND strOffset, cstOffset, Flag, LSL #1 ;// strOffset = (Flag & 15) < 1; + LDRSBEQ Value, [pSrc], #1 ;// Value = (OMX_U8) *pSrc++ + ORRNE Value,Value,Value2, LSL #8 ;// Value = (OMX_U16) *pSrc++ + + TST Flag, #0x20 ;// Computing (Flag & 0x20) to check, if we're done + LDRBEQ Flag, [pSrc], #1 ;// Flag = (OMX_U8) *pSrc++, for next iteration + STRH Value, [pDst, strOffset] 
;// Store <Value> at offset <strOffset> + BEQ unpackLoop ;// Branch to the loop beginning + + ;//-------------------------------------------------- + ;//InvTransformDC2x2: Inlined (Implemented in ARM V6) + ;//-------------------------------------------------- + + LDMIA pDst, {r0w0, r0w1} ;// r0w0 = |c1|c0| & r0w1 = |c3|c2| + + STR pSrc, [ppSrc] ;// Update the bitstream pointer + + LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer + LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer + + SADDSUBX r0w0, r0w0, r0w0 ;// [ c00+c01, c00-c01 ] + SADDSUBX r0w1, r0w1, r0w1 ;// [ c10+c11, c10-c11 ] + + LDRSB Shift, [pQPDivTable, QP] ;// Shift = pQPDivTable[QP] + LDRSB Scale, [pQPModTable, QP] ;// Scale = pQPModTable[QP] + + SADD16 c0w0, r0w0, r0w1 ;// [ d00+d10, d01+d11 ] + SSUB16 c1w0, r0w0, r0w1 ;// [ d00-d10, d01-d11 ] + + ;//------------------------------------------------- + ;//DequantChromaDC2x2: Inlined (Neon Implementation) + ;//------------------------------------------------- + + LSL Scale, Scale, Shift ;// Scale = Scale << Shift + VMOV dInvTrCoeff, c0w0, c1w0 + VREV32 dInvTrCoeff,dInvTrCoeff + VDUP dScale,Scale + + VMULL qDqntCoeff,dInvTrCoeff,dScale + VSHRN dDqntCoeff,qDqntCoeff,#1 + + + VST1 dDqntCoeff,[pDst] ;// Storing all the coefficients at once + + MOV return, #OMX_Sts_NoErr + M_END + + ENDIF ;// CortexA8 + + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s new file mode 100755 index 0000000..2529959 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s @@ -0,0 +1,264 @@ +;// +;// +;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 
9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// +;// Description: +;// H.264 inverse quantize and transform module +;// +;// + +;// Include standard headers + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + +;// Import/Export symbols required from/to other files +;// (For example tables) + + IMPORT armVCM4P10_UnpackBlock4x4 + IMPORT armVCM4P10_QPDivTable + IMPORT armVCM4P10_VMatrixQPModTable + + M_VARIANTS CortexA8 + +;// Set debugging level +;//DEBUG_ON SETL {TRUE} + + +;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4 + + +;// Guarding implementation by the processor name + + + +;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4 + +;// Guarding implementation by the processor name + + IF CortexA8 + +;//Input Registers +pData RN 0 +QP RN 1 + + +;//Local Scratch Registers + +;// ARM Registers + +pQPDivTable RN 2 +pQPModTable RN 3 +Shift RN 4 +Scale RN 5 + +;// NEON Registers + +;// Packed Input pixels +dIn0 DN D0.S16 +dIn1 DN D1.S16 +dIn2 DN D2.S16 +dIn3 DN D3.S16 + +;// Intermediate calculations +dRowSum1 DN D4.S16 +dRowSum2 DN D5.S16 +dRowDiff1 DN D6.S16 +dRowDiff2 DN D7.S16 + +;// Row operated pixels +dRowOp0 DN D0.S16 +dRowOp1 DN D1.S16 +dRowOp2 DN D2.S16 +dRowOp3 DN D3.S16 +qRowOp01 QN Q0.32 +qRowOp23 QN Q1.32 + +;// Intermediate calculations +dColSum1 DN D4.S16 +dColSum2 DN D5.S16 +dColDiff1 DN D6.S16 +dColDiff2 DN D7.S16 + +;// Coloumn operated pixels +dColOp0 DN D0.S16 +dColOp1 DN D1.S16 +dColOp2 DN D2.S16 +dColOp3 DN D3.S16 + +;// Temporary scratch varaibles + +dScale DN D5.S16 +qRound0 QN Q3.S32 +qRound1 QN Q4.S32 +qRound2 QN Q5.S32 +qRound3 QN Q6.S32 + +;// InvTransformed and Dequantized pixels +dOut0 DN D0.S16 +dOut1 DN D1.S16 +dOut2 DN D2.S16 +dOut3 DN D3.S16 + + + ;// Allocate stack memory required by the function + + + ;// Write function header + M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13 + + ;****************************************************************** + ;// The 
strategy used in implementing the transform is as follows:* + ;// Load the 4x4 block into 4 D-registers * + ;// Transpose the 4x4 matrix * + ;// Perform the row operations (on columns) using SIMD * + ;// Transpose the 4x4 result matrix * + ;// Perform the coloumn operations * + ;****************************************************************** + + ;// Load all the 4x4 pixels in Transposed form + + VLD4 {dIn0,dIn1,dIn2,dIn3},[pData] + LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer + LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer + + ;**************************************** + ;// Row Operations (Performed on columns) + ;**************************************** + ;// Scale factor calculation is done using ARM instructions + ;// Interleaved with NEON instructions inorder to Dual issue + + VADD dRowSum1,dIn0,dIn1 + VADD dRowSum2,dIn2,dIn3 + VSUB dRowDiff1,dIn0,dIn1 + LDRSB Shift, [pQPDivTable, QP] ;// ARM CODE: Shift = pQPDivTable[QP] + VSUB dRowDiff2,dIn2,dIn3 + LDRSB Scale, [pQPModTable, QP] ;// ARM CODE: Scale = pQPModTable[QP] + VADD dRowOp0,dRowSum1,dRowSum2 + VSUB dRowOp1,dRowSum1,dRowSum2 + VSUB dRowOp2,dRowDiff1,dRowDiff2 + LSL Scale, Scale, Shift ;// ARM CODE: Scale = Scale << Shift + VADD dRowOp3,dRowDiff1,dRowDiff2 + + ;**************************************** + ;// Transpose the resultant matrix + ;**************************************** + + VTRN dRowOp0,dRowOp1 + VTRN dRowOp2,dRowOp3 + VTRN qRowOp01,qRowOp23 + + ;**************************************** + ;// Coloumn Operations + ;**************************************** + + VADD dColSum1,dRowOp0,dRowOp1 + VADD dColSum2,dRowOp2,dRowOp3 + VSUB dColDiff1,dRowOp0,dRowOp1 + VSUB dColDiff2,dRowOp2,dRowOp3 + VADD dColOp0,dColSum1,dColSum2 + VSUB dColOp1,dColSum1,dColSum2 + VSUB dColOp2,dColDiff1,dColDiff2 + VADD dColOp3,dColDiff1,dColDiff2 + + ;//---------------------------------------------------------------------- + ;// + ;// 
<Dequantize> improves on the c-reference code + ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together + ;// We do not subtract 2 from Shift as in C reference, instead perform a + ;// Scale << Shift once in the beginning and do a right shift by a + ;// constant 2 after the Multiplication. The value of Round would be 2 + ;// + ;// By doing this we aviod the Branches required and also + ;// reduce the code size substantially + ;// + ;//---------------------------------------------------------------------- + + + VDUP dScale, Scale ;// ARM -> NEON copy 'scale' to vector + + + VMOV qRound0,#2 ;// Set the Round Value + VMOV qRound1,#2 + VMOV qRound2,#2 + VMOV qRound3,#2 + + VMLAL qRound0,dColOp0,dScale ;// pDst[i] * Scale + Round + VMLAL qRound1,dColOp1,dScale + VMLAL qRound2,dColOp2,dScale + VMLAL qRound3,dColOp3,dScale + + VSHRN dOut0,qRound0,#2 ;// Right shift by 2 & (OMX_S16)Value + VSHRN dOut1,qRound1,#2 + VSHRN dOut2,qRound2,#2 + VSHRN dOut3,qRound3,#2 + + ;*************************** + ;// Store all the 4x4 pixels + ;*************************** + + VST1 {dOut0,dOut1,dOut2,dOut3}, [pData] + + + ;// Set return value + + ;// Write function tail + M_END + + ENDIF ;//CORTEXA8 + + + +;// Function: omxVCM4P10_TransformDequantLumaDCFromPair + +;//Input Registers +ppSrc RN 0 +pDst RN 1 +QPR2 RN 2 + +;//Output Registers +result RN 0 + +;//Local Scratch Registers +pDstR4 RN 4 +pDstR0 RN 0 +QPR1 RN 1 +QPR5 RN 5 + +;// Guarding implementation by the processor name + + IF CortexA8 + + ;// Allocate stack memory required by the function + + + ;// Write function header + M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5 + + MOV pDstR4,pDst ;// Saving register r1 + MOV QPR5,QPR2 ;// Saving register r2 + BL armVCM4P10_UnpackBlock4x4 + + MOV pDstR0,pDstR4 ;// Setting up register r0 + MOV QPR1,QPR5 ;// Setting up register r1 + BL armVCM4P10_InvTransformDequantLumaDC4x4 + + + ;// Set return value + MOV result,#OMX_Sts_NoErr + + ;// Write function tail + M_END + + + 
ENDIF ;//CortexA8 + + + END
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S new file mode 100644 index 0000000..aca2df4 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S @@ -0,0 +1,134 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_Average_4x4_Align0_unsafe + .func armVCM4P10_Average_4x4_Align0_unsafe +armVCM4P10_Average_4x4_Align0_unsafe: + PUSH {r4-r6,lr} + LDR r7, =0x80808080 + LDR r12,[r2,#0] + LDR r10,[r0],r1 + LDR lr,[r2,r3] + LDR r11,[r0],r1 + MVN r12,r12 + MVN lr,lr + UHSUB8 r5,r10,r12 + UHSUB8 r4,r11,lr + EOR r5,r5,r7 + STR r5,[r2],r3 + EOR r4,r4,r7 + STR r4,[r2],r3 + LDR r10,[r0],r1 + LDR r12,[r2,#0] + LDR r11,[r0],r1 + LDR lr,[r2,r3] + MVN r12,r12 + UHSUB8 r5,r10,r12 + MVN lr,lr + UHSUB8 r4,r11,lr + EOR r5,r5,r7 + STR r5,[r2],r3 + EOR r4,r4,r7 + STR r4,[r2],r3 + POP {r4-r6,pc} + .endfunc + + .global armVCM4P10_Average_4x4_Align2_unsafe + .func armVCM4P10_Average_4x4_Align2_unsafe +armVCM4P10_Average_4x4_Align2_unsafe: + PUSH {r4-r6,lr} + LDR r7, =0x80808080 + LDR r4,[r0,#4] + LDR r10,[r0],r1 + LDR r12,[r2,#0] + LDR lr,[r2,r3] + LDR r5,[r0,#4] + LDR r11,[r0],r1 + MVN r12,r12 + MVN lr,lr + LSR r10,r10,#16 + ORR r10,r10,r4,LSL #16 + LSR r11,r11,#16 + ORR r11,r11,r5,LSL #16 + UHSUB8 r5,r10,r12 + UHSUB8 r4,r11,lr + EOR r5,r5,r7 + STR r5,[r2],r3 + EOR r4,r4,r7 + STR r4,[r2],r3 + LDR r4,[r0,#4] + LDR r10,[r0],r1 + LDR r12,[r2,#0] + LDR lr,[r2,r3] + LDR r5,[r0,#4] + LDR r11,[r0],r1 + MVN r12,r12 + MVN lr,lr + LSR r10,r10,#16 + ORR r10,r10,r4,LSL #16 + LSR r11,r11,#16 + ORR r11,r11,r5,LSL #16 + UHSUB8 r5,r10,r12 + UHSUB8 r4,r11,lr + EOR r5,r5,r7 + STR r5,[r2],r3 + EOR 
r4,r4,r7 + STR r4,[r2],r3 + POP {r4-r6,pc} + .endfunc + + .global armVCM4P10_Average_4x4_Align3_unsafe + .func armVCM4P10_Average_4x4_Align3_unsafe +armVCM4P10_Average_4x4_Align3_unsafe: + PUSH {r4-r6,lr} + LDR r7, =0x80808080 + LDR r4,[r0,#4] + LDR r10,[r0],r1 + LDR r12,[r2,#0] + LDR lr,[r2,r3] + LDR r5,[r0,#4] + LDR r11,[r0],r1 + MVN r12,r12 + MVN lr,lr + LSR r10,r10,#24 + ORR r10,r10,r4,LSL #8 + LSR r11,r11,#24 + ORR r11,r11,r5,LSL #8 + UHSUB8 r5,r10,r12 + UHSUB8 r4,r11,lr + EOR r5,r5,r7 + STR r5,[r2],r3 + EOR r4,r4,r7 + STR r4,[r2],r3 + LDR r4,[r0,#4] + LDR r10,[r0],r1 + LDR r12,[r2,#0] + LDR lr,[r2,r3] + LDR r5,[r0,#4] + LDR r11,[r0],r1 + MVN r12,r12 + MVN lr,lr + LSR r10,r10,#24 + ORR r10,r10,r4,LSL #8 + LSR r11,r11,#24 + ORR r11,r11,r5,LSL #8 + UHSUB8 r5,r10,r12 + UHSUB8 r4,r11,lr + EOR r5,r5,r7 + STR r5,[r2],r3 + EOR r4,r4,r7 + STR r4,[r2],r3 + POP {r4-r6,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S new file mode 100644 index 0000000..b9ee221 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S @@ -0,0 +1,54 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_DeblockingChromabSLT4_unsafe + .func armVCM4P10_DeblockingChromabSLT4_unsafe +armVCM4P10_DeblockingChromabSLT4_unsafe: + VLD1.32 {d18[0]},[r5]! 
+ VSUBL.U8 q11,d5,d9 + VMOV d28,d18 + VSUBL.U8 q10,d8,d4 + VSHR.S16 q11,q11,#2 + VZIP.8 d18,d28 + VBIF d18,d14,d16 + VRHADD.S16 q10,q11,q10 + VADD.I8 d31,d18,d15 + VQMOVN.S16 d20,q10 + VLD1.8 {d0[]},[r2] + VMIN.S8 d20,d20,d31 + VNEG.S8 d31,d31 + VLD1.8 {d2[]},[r3] + VMAX.S8 d20,d20,d31 + VMOVL.U8 q14,d4 + VMOVL.U8 q12,d8 + VADDW.S8 q14,q14,d20 + VSUBW.S8 q12,q12,d20 + VQMOVUN.S16 d29,q14 + VQMOVUN.S16 d24,q12 + BX lr + .endfunc + + .global armVCM4P10_DeblockingChromabSGE4_unsafe + .func armVCM4P10_DeblockingChromabSGE4_unsafe +armVCM4P10_DeblockingChromabSGE4_unsafe: + VHADD.U8 d13,d4,d9 + VHADD.U8 d31,d8,d5 + VLD1.8 {d0[]},[r2] + ADD r5,r5,#4 + VLD1.8 {d2[]},[r3] + VRHADD.U8 d13,d13,d5 + VRHADD.U8 d31,d31,d9 + BX lr + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S new file mode 100644 index 0000000..47f3d44 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S @@ -0,0 +1,102 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_DeblockingLumabSLT4_unsafe + .func armVCM4P10_DeblockingLumabSLT4_unsafe +armVCM4P10_DeblockingLumabSLT4_unsafe: + VSUBL.U8 q11,d5,d9 + VLD1.8 {d18[]},[r5]! + VSUBL.U8 q10,d8,d4 + VLD1.8 {d19[]},[r5]! 
+ VSHR.S16 q11,q11,#2 + VEXT.8 d18,d18,d19,#4 + VAND d19,d17,d15 + VBIF d18,d14,d16 + VRHADD.S16 q10,q11,q10 + VRHADD.U8 d24,d4,d8 + VADD.I8 d31,d18,d19 + VAND d19,d12,d15 + VQADD.U8 d23,d5,d18 + VQMOVN.S16 d20,q10 + VADD.I8 d31,d31,d19 + VQSUB.U8 d22,d5,d18 + VQADD.U8 d19,d9,d18 + VHADD.U8 d26,d24,d6 + VMIN.S8 d20,d20,d31 + VNEG.S8 d31,d31 + VQSUB.U8 d21,d9,d18 + VHADD.U8 d27,d24,d10 + VMAX.U8 d30,d26,d22 + VMAX.S8 d20,d20,d31 + VMOVL.U8 q14,d4 + VMOVL.U8 q12,d8 + VADDW.S8 q14,q14,d20 + VSUBW.S8 q12,q12,d20 + VQMOVUN.S16 d29,q14 + VQMOVUN.S16 d24,q12 + VMAX.U8 d25,d27,d21 + VMIN.U8 d30,d30,d23 + VMIN.U8 d25,d25,d19 + VBIF d29,d4,d16 + VBIF d30,d5,d17 + VBIF d24,d8,d16 + VBIF d25,d9,d12 + BX lr + .endfunc + + .global armVCM4P10_DeblockingLumabSGE4_unsafe + .func armVCM4P10_DeblockingLumabSGE4_unsafe +armVCM4P10_DeblockingLumabSGE4_unsafe: + VSHR.U8 d19,d0,#2 + VADD.I8 d19,d19,d15 + VADDL.U8 q10,d8,d4 + VADD.I8 d19,d19,d15 + VADDL.U8 q11,d6,d9 + VADDW.U8 q12,q10,d5 + VCGT.U8 d19,d19,d13 + VSHR.U16 q11,q11,#1 + VHADD.U16 q11,q12,q11 + VADDW.U8 q12,q12,d6 + VADDL.U8 q13,d7,d6 + VAND d17,d17,d19 + VHADD.U8 d28,d4,d9 + VSRA.U16 q13,q12,#1 + VAND d12,d12,d19 + VQRSHRN.U16 d29,q11,#1 + VRHADD.U8 d28,d28,d5 + VQRSHRN.U16 d30,q12,#2 + VADDL.U8 q11,d10,d5 + VADDW.U8 q12,q10,d9 + VBIF d29,d28,d17 + VQRSHRN.U16 d31,q13,#2 + VADDL.U8 q13,d11,d10 + VSHR.U16 q11,q11,#1 + VHADD.U16 q11,q12,q11 + VADDW.U8 q12,q12,d10 + VHADD.U8 d28,d8,d5 + VBIF d29,d4,d16 + VBIF d30,d5,d17 + VSRA.U16 q13,q12,#1 + VQRSHRN.U16 d25,q12,#2 + VQRSHRN.U16 d24,q11,#1 + VRHADD.U8 d22,d28,d9 + VBIF d25,d9,d12 + VBIF d31,d6,d17 + VBIF d24,d22,d12 + VQRSHRN.U16 d28,q13,#2 + VBIF d24,d8,d16 + VBIF d28,d10,d12 + BX lr + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S new file mode 100644 index 0000000..e68bd8e --- 
/dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S @@ -0,0 +1,272 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_DecodeCoeffsToPair + .func armVCM4P10_DecodeCoeffsToPair +armVCM4P10_DecodeCoeffsToPair: + PUSH {r4-r12,lr} + SUB sp,sp,#0x40 + LDR r10,[r0,#0] + LDR r12,[r1,#0] + LDR r6, =armVCM4P10_CAVLCCoeffTokenTables + LDR r4,[sp,#0x68] + LDRB r9,[r10,#2] + LDRB r8,[r10,#1] + LDRB r11,[r10],#3 + ADD r12,r12,#8 + LDR r6,[r6,r4,LSL #2] + ORR r9,r9,r8,LSL #8 + ORR r11,r9,r11,LSL #16 + LSLS r8,r11,r12 + MOVS r7,#0x1e + AND r7,r7,r8,LSR #27 + SUBS r12,r12,#8 +L0x44: + BCC L1 + LDRB r8,[r10],#1 +L1: + LDRH r7,[r6,r7] + ADDCC r12,r12,#8 + ADD r12,r12,#4 + ORRCS r11,r8,r11,LSL #8 + LSRS r8,r7,#1 + BCS L0x74 + LSLS r8,r11,r12 + SUBS r12,r12,#0xa + ADD r7,r7,r8,LSR #29 + BIC r7,r7,#1 + B L0x44 +L0x74: + SUB r12,r12,r7,LSR #13 + BIC r7,r8,#0xf000 + LSRS r5,r7,#2 + STRB r5,[r2,#0] + BEQ L0x344 + CMP r7,#0x44 + BGE L0x33c + STR r0,[sp,#0] + STR r1,[sp,#4] + STR r3,[sp,#8] + ANDS r1,r7,#3 + ADD r2,sp,#0xc + BEQ L0xd8 + MOV r0,r1 +L0xac: + LSLS r7,r11,r12 + SUBS r12,r12,#7 + BCC L2 + LDRB r8,[r10],#1 +L2: + ADDCC r12,r12,#8 + LSR r7,r7,#31 + ORRCS r11,r8,r11,LSL #8 + SUBS r0,r0,#1 + MOV r8,#1 + SUB r8,r8,r7,LSL #1 + STRH r8,[r2],#2 + BGT L0xac +L0xd8: + SUBS r0,r5,r1 + BEQ L0x1b8 + MOV r4,#1 + CMP r5,#0xa + MOVLE r4,#0 + CMP r1,#3 + MOVLT r1,#4 + MOVGE r1,#2 + MOVGE r4,#0 +L0xfc: + LSLS r7,r11,r12 + CLZ r7,r7 + ADD r12,r12,r7 + SUBS r12,r12,#7 + BCC L3 + LDRB r8,[r10],#1 + ORR r11,r8,r11,LSL #8 + SUBS r12,r12,#8 + BCC L3 + LDRB r8,[r10],#1 +L3: + ADDCC r12,r12,#8 + ORRCS r11,r8,r11,LSL #8 + CMP r7,#0x10 + BGE L0x33c + MOVS lr,r4 + TEQEQ r7,#0xe + MOVEQ lr,#4 + TEQ r7,#0xf + MOVEQ lr,#0xc + TEQEQ r4,#0 + ADDEQ r7,r7,#0xf + TEQ lr,#0 + BEQ L0x184 + LSL r3,r11,r12 + ADD r12,r12,lr + 
SUBS r12,r12,#8 + RSB r9,lr,#0x20 + BCC L4 + LDRB r8,[r10],#1 + ORR r11,r8,r11,LSL #8 + SUBS r12,r12,#8 + BCC L4 + LDRB r8,[r10],#1 +L4: + ADDCC r12,r12,#8 + LSR r3,r3,r9 + ORRCS r11,r8,r11,LSL #8 + LSL r7,r7,r4 + ADD r7,r3,r7 +L0x184: + ADD r7,r7,r1 + MOV r1,#2 + LSRS r8,r7,#1 + RSBCS r8,r8,#0 + STRH r8,[r2],#2 + LDR r9, =armVCM4P10_SuffixToLevel + LDRSB r8,[r9,r4] + TEQ r4,#0 + MOVEQ r4,#1 + CMP r7,r8 + ADDCS r4,r4,#1 + SUBS r0,r0,#1 + BGT L0xfc +L0x1b8: + LDR r8,[sp,#0x6c] + SUB r0,r5,#1 + SUBS r1,r8,r5 + ADD r4,sp,#0x2c + MOV lr,r5 + SUB lr,lr,#1 + BEQ L0x2b0 + TEQ r8,#4 + LDREQ r6, =(armVCM4P10_CAVLCTotalZeros2x2Tables - 4) + LDRNE r6, =(armVCM4P10_CAVLCTotalZeroTables - 4) + LDR r6,[r6,r5,LSL #2] + LSLS r8,r11,r12 + MOVS r7,#0x1e + AND r7,r7,r8,LSR #27 + SUBS r12,r12,#8 +L0x1f4: + BCC L5 + LDRB r8,[r10],#1 +L5: + LDRH r7,[r6,r7] + ADDCC r12,r12,#8 + ADD r12,r12,#4 + ORRCS r11,r8,r11,LSL #8 + LSRS r8,r7,#1 + BCS L0x224 + LSLS r8,r11,r12 + SUBS r12,r12,#0xa + ADD r7,r7,r8,LSR #29 + BIC r7,r7,#1 + B L0x1f4 +L0x224: + SUB r12,r12,r7,LSR #13 + BIC r7,r8,#0xf000 + CMP r7,#0x10 + BGE L0x33c + LDR r3, =(armVCM4P10_CAVLCRunBeforeTables - 4) + ADD r4,sp,#0x2c + MOVS r1,r7 + ADD lr,lr,r1 + BEQ L0x2b0 +L0x248: + SUBS r0,r0,#1 + LDR r6,[r3,r1,LSL #2] + BLT L0x2bc + LSLS r8,r11,r12 + MOVS r7,#0xe + AND r7,r7,r8,LSR #28 + SUBS r12,r12,#8 +L0x264: + BCC L6 + LDRB r8,[r10],#1 +L6: + LDRH r7,[r6,r7] + ADDCC r12,r12,#8 + ADD r12,r12,#3 + ORRCS r11,r8,r11,LSL #8 + LSRS r8,r7,#1 + BCS L0x294 + LSLS r8,r11,r12 + SUBS r12,r12,#9 + ADD r7,r7,r8,LSR #29 + BIC r7,r7,#1 + B L0x264 +L0x294: + SUB r12,r12,r7,LSR #13 + BIC r7,r8,#0xf000 + CMP r7,#0xf + BGE L0x33c + SUBS r1,r1,r7 + STRB r7,[r4],#1 + BGT L0x248 +L0x2b0: + SUBS r0,r0,#1 + BLT L7 + STRB r1,[r4],#1 +L7: + BGT L0x2b0 +L0x2bc: + STRB r1,[r4],#1 + LDR r8,[sp,#0x6c] + TEQ r8,#0xf + ADDEQ lr,lr,#1 + SUB r4,r4,r5 + SUB r2,r2,r5 + SUB r2,r2,r5 + LDR r3,[sp,#8] + LDR r0,[r3,#0] + TEQ r8,#4 + LDREQ r6, =armVCM4P10_ZigZag_2x2 + LDRNE 
r6, =armVCM4P10_ZigZag_4x4 +L0x2ec: + LDRB r9,[r4],#1 + LDRB r8,[r6,lr] + SUB lr,lr,#1 + SUB lr,lr,r9 + LDRSH r9,[r2],#2 + SUBS r5,r5,#1 + ORREQ r8,r8,#0x20 + ADD r1,r9,#0x80 + CMP r1,#0x100 + ORRCS r8,r8,#0x10 + TEQ r5,#0 + STRB r8,[r0],#1 + STRB r9,[r0],#1 + LSR r9,r9,#8 + BCC L8 + STRB r9,[r0],#1 +L8: + BNE L0x2ec + STR r0,[r3,#0] + LDR r0,[sp,#0] + LDR r1,[sp,#4] + B L0x344 +L0x33c: + MVN r0,#1 + B L0x35c +L0x344: + ADD r10,r10,r12,LSR #3 + AND r12,r12,#7 + SUB r10,r10,#4 + STR r12,[r1,#0] + STR r10,[r0,#0] + MOV r0,#0 +L0x35c: + ADD sp,sp,#0x40 + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S new file mode 100644 index 0000000..44eb428 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S @@ -0,0 +1,103 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .section .rodata + .align 4 + + + .global armVCM4P10_QPDivTable + .global armVCM4P10_VMatrixQPModTable + .global armVCM4P10_PosToVCol4x4 + .global armVCM4P10_PosToVCol2x2 + .global armVCM4P10_VMatrix + .global armVCM4P10_QPModuloTable + .global armVCM4P10_VMatrixU16 + +armVCM4P10_PosToVCol4x4: + .byte 0, 2, 0, 2 + .byte 2, 1, 2, 1 + .byte 0, 2, 0, 2 + .byte 2, 1, 2, 1 + +armVCM4P10_PosToVCol2x2: + .byte 0, 2 + .byte 2, 1 + +armVCM4P10_VMatrix: + .byte 10, 16, 13 + .byte 11, 18, 14 + .byte 13, 20, 16 + .byte 14, 23, 18 + .byte 16, 25, 20 + .byte 18, 29, 23 + +;//------------------------------------------------------- +;// This table evaluates the expression [(INT)(QP/6)], +;// for values of QP from 0 to 51 (inclusive). 
+;//------------------------------------------------------- + +armVCM4P10_QPDivTable: + .byte 0, 0, 0, 0, 0, 0 + .byte 1, 1, 1, 1, 1, 1 + .byte 2, 2, 2, 2, 2, 2 + .byte 3, 3, 3, 3, 3, 3 + .byte 4, 4, 4, 4, 4, 4 + .byte 5, 5, 5, 5, 5, 5 + .byte 6, 6, 6, 6, 6, 6 + .byte 7, 7, 7, 7, 7, 7 + .byte 8, 8, 8, 8, 8, 8 + +;//---------------------------------------------------- +;// This table contains armVCM4P10_VMatrix[QP%6][0] entries, +;// for values of QP from 0 to 51 (inclusive). +;//---------------------------------------------------- + +armVCM4P10_VMatrixQPModTable: + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + .byte 10, 11, 13, 14, 16, 18 + +;//------------------------------------------------------- +;// This table evaluates the modulus expression [QP%6]*6, +;// for values of QP from 0 to 51 (inclusive). +;//------------------------------------------------------- + +armVCM4P10_QPModuloTable: + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + .byte 0, 6, 12, 18, 24, 30 + +;//------------------------------------------------------- +;// This table contains the individual byte values stored as +;// halfwords. 
This avoids unpacking inside the function +;//------------------------------------------------------- + +armVCM4P10_VMatrixU16: + .hword 10, 16, 13 + .hword 11, 18, 14 + .hword 13, 20, 16 + .hword 14, 23, 18 + .hword 16, 25, 20 + .hword 18, 29, 23 + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S new file mode 100644 index 0000000..37bc69b --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S @@ -0,0 +1,123 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InterpolateLuma_HorAlign9x_unsafe + .func armVCM4P10_InterpolateLuma_HorAlign9x_unsafe +armVCM4P10_InterpolateLuma_HorAlign9x_unsafe: + MOV r12,r8 + AND r7,r0,#3 + BIC r0,r0,#3 + ADD pc,pc,r7,LSL #2 + NOP + B Copy0toAligned + B Copy1toAligned + B Copy2toAligned + B Copy3toAligned +Copy0toAligned: + LDM r0,{r7,r10,r11} + SUBS r9,r9,#1 + ADD r0,r0,r1 + STM r8!,{r7,r10,r11} + BGT Copy0toAligned + B CopyEnd +Copy1toAligned: + LDM r0,{r7,r10,r11} + SUBS r9,r9,#1 + ADD r0,r0,r1 + LSR r7,r7,#8 + ORR r7,r7,r10,LSL #24 + LSR r10,r10,#8 + ORR r10,r10,r11,LSL #24 + LSR r11,r11,#8 + STM r8!,{r7,r10,r11} + BGT Copy1toAligned + B CopyEnd +Copy2toAligned: + LDM r0,{r7,r10,r11} + SUBS r9,r9,#1 + ADD r0,r0,r1 + LSR r7,r7,#16 + ORR r7,r7,r10,LSL #16 + LSR r10,r10,#16 + ORR r10,r10,r11,LSL #16 + LSR r11,r11,#16 + STM r8!,{r7,r10,r11} + BGT Copy2toAligned + B CopyEnd +Copy3toAligned: + LDM r0,{r7,r10,r11} + SUBS r9,r9,#1 + ADD r0,r0,r1 + LSR r7,r7,#24 + ORR r7,r7,r10,LSL #8 + LSR r10,r10,#24 + ORR r10,r10,r11,LSL #8 + LSR r11,r11,#24 + STM r8!,{r7,r10,r11} + BGT Copy3toAligned +CopyEnd: + MOV r0,r12 + MOV r1,#0xc + BX lr 
+ .endfunc + + .global armVCM4P10_InterpolateLuma_VerAlign4x_unsafe + .func armVCM4P10_InterpolateLuma_VerAlign4x_unsafe +armVCM4P10_InterpolateLuma_VerAlign4x_unsafe: + AND r7,r0,#3 + BIC r0,r0,#3 + ADD pc,pc,r7,LSL #2 + NOP + B Copy0toVAligned + B Copy1toVAligned + B Copy2toVAligned + B Copy3toVAligned +Copy0toVAligned: + LDR r7,[r0],r1 + SUBS r9,r9,#1 + STR r7,[r8],#4 + BGT Copy0toVAligned + B CopyVEnd +Copy1toVAligned: + LDR r10,[r0,#4] + LDR r7,[r0],r1 + SUBS r9,r9,#1 + LSL r10,r10,#24 + ORR r7,r10,r7,LSR #8 + STR r7,[r8],#4 + BGT Copy1toVAligned + B CopyVEnd +Copy2toVAligned: + LDR r10,[r0,#4] + LDR r7,[r0],r1 + SUBS r9,r9,#1 + LSL r10,r10,#16 + ORR r7,r10,r7,LSR #16 + STR r7,[r8],#4 + BGT Copy2toVAligned + B CopyVEnd +Copy3toVAligned: + LDR r10,[r0,#4] + LDR r7,[r0],r1 + SUBS r9,r9,#1 + LSL r10,r10,#8 + ORR r7,r10,r7,LSR #24 + STR r7,[r8],#4 + BGT Copy3toVAligned +CopyVEnd: + SUB r0,r8,#0x1c + MOV r1,#4 + BX lr + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S new file mode 100644 index 0000000..fe92201 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S @@ -0,0 +1,105 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InterpolateLuma_Copy4x4_unsafe + .func armVCM4P10_InterpolateLuma_Copy4x4_unsafe +armVCM4P10_InterpolateLuma_Copy4x4_unsafe: + PUSH {r4-r6,lr} + AND r12,r0,#3 + BIC r0,r0,#3 + ADD pc,pc,r12,LSL #2 + NOP + B Copy4x4Align0 + B Copy4x4Align1 + B Copy4x4Align2 + B Copy4x4Align3 +Copy4x4Align0: + LDR r4,[r0],r1 + LDR r5,[r0],r1 + STR r4,[r2],r3 + LDR r8,[r0],r1 + STR r5,[r2],r3 + LDR r9,[r0],r1 + STR r8,[r2],r3 + STR r9,[r2],r3 + B Copy4x4End +Copy4x4Align1: + LDR r5,[r0,#4] + LDR r4,[r0],r1 + LDR r9,[r0,#4] + LDR r8,[r0],r1 + LSR r4,r4,#8 + ORR r4,r4,r5,LSL #24 + STR r4,[r2],r3 + LSR r8,r8,#8 + ORR r8,r8,r9,LSL #24 + LDR r5,[r0,#4] + LDR r4,[r0],r1 + STR r8,[r2],r3 + LDR r9,[r0,#4] + LDR r8,[r0],r1 + LSR r4,r4,#8 + ORR r4,r4,r5,LSL #24 + STR r4,[r2],r3 + LSR r8,r8,#8 + ORR r8,r8,r9,LSL #24 + STR r8,[r2],r3 + B Copy4x4End +Copy4x4Align2: + LDR r5,[r0,#4] + LDR r4,[r0],r1 + LDR r9,[r0,#4] + LDR r8,[r0],r1 + LSR r4,r4,#16 + ORR r4,r4,r5,LSL #16 + STR r4,[r2],r3 + LSR r8,r8,#16 + ORR r8,r8,r9,LSL #16 + STR r8,[r2],r3 + LDR r5,[r0,#4] + LDR r4,[r0],r1 + LDR r9,[r0,#4] + LDR r8,[r0],r1 + LSR r4,r4,#16 + ORR r4,r4,r5,LSL #16 + STR r4,[r2],r3 + LSR r8,r8,#16 + ORR r8,r8,r9,LSL #16 + STR r8,[r2],r3 + B Copy4x4End +Copy4x4Align3: + LDR r5,[r0,#4] + LDR r4,[r0],r1 + LDR r9,[r0,#4] + LDR r8,[r0],r1 + LSR r4,r4,#24 + ORR r4,r4,r5,LSL #8 + STR r4,[r2],r3 + LSR r8,r8,#24 + ORR r8,r8,r9,LSL #8 + STR r8,[r2],r3 + LDR r5,[r0,#4] + LDR r4,[r0],r1 + LDR r9,[r0,#4] + LDR r8,[r0],r1 + LSR r4,r4,#24 + ORR r4,r4,r5,LSL #8 + STR r4,[r2],r3 + LSR r8,r8,#24 + ORR r8,r8,r9,LSL #8 + STR r8,[r2],r3 +Copy4x4End: + POP {r4-r6,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S 
b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S new file mode 100644 index 0000000..544abe8 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S @@ -0,0 +1,107 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe + .func armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe +armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe: + PUSH {r4-r6,lr} + MOV lr,#4 + LDR r6, =0xfe00fe0 + LDR r12, =0xff00ff +LoopStart1: + LDR r11,[r0,#0xc] + LDR r10,[r0,#8] + LDR r5,[r0,#4] + LDR r4,[r0],r1 + UQSUB16 r11,r11,r6 + UQSUB16 r10,r10,r6 + UQSUB16 r5,r5,r6 + UQSUB16 r4,r4,r6 + USAT16 r11,#13,r11 + USAT16 r10,#13,r10 + USAT16 r5,#13,r5 + USAT16 r4,#13,r4 + AND r11,r12,r11,LSR #5 + AND r10,r12,r10,LSR #5 + AND r5,r12,r5,LSR #5 + AND r4,r12,r4,LSR #5 + ORR r11,r10,r11,LSL #8 + ORR r10,r4,r5,LSL #8 + SUBS lr,lr,#1 + STRD r10,r11,[r7],#8 + BGT LoopStart1 + SUB r0,r7,#0x20 + MOV r1,#8 + POP {r4-r6,pc} + .endfunc + + .global armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe + .func armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe +armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe: + PUSH {r4-r6,lr} + LDR r6, =0xfe00fe0 + LDR r12, =0xff00ff + MOV lr,#2 +LoopStart: + LDR r11,[r0,#0xc] + LDR r10,[r0,#8] + LDR r5,[r0,#4] + LDR r4,[r0],r1 + UQSUB16 r11,r11,r6 + UQSUB16 r10,r10,r6 + UQSUB16 r5,r5,r6 + UQSUB16 r4,r4,r6 + USAT16 r11,#13,r11 + USAT16 r10,#13,r10 + USAT16 r5,#13,r5 + USAT16 r4,#13,r4 + AND r11,r12,r11,LSR #5 + AND r10,r12,r10,LSR #5 + AND r5,r12,r5,LSR #5 + AND r4,r12,r4,LSR #5 + ORR r11,r10,r11,LSL #8 + ORR r10,r4,r5,LSL #8 + PKHBT r4,r10,r11,LSL #16 + STR r4,[r7],#8 + PKHTB r5,r11,r10,ASR #16 + STR r5,[r7],#-4 + LDR r11,[r0,#0xc] + LDR r10,[r0,#8] + LDR r5,[r0,#4] + LDR r4,[r0],r1 + 
UQSUB16 r11,r11,r6 + UQSUB16 r10,r10,r6 + UQSUB16 r5,r5,r6 + UQSUB16 r4,r4,r6 + USAT16 r11,#13,r11 + USAT16 r10,#13,r10 + USAT16 r5,#13,r5 + USAT16 r4,#13,r4 + AND r11,r12,r11,LSR #5 + AND r10,r12,r10,LSR #5 + AND r5,r12,r5,LSR #5 + AND r4,r12,r4,LSR #5 + ORR r11,r10,r11,LSL #8 + ORR r10,r4,r5,LSL #8 + PKHBT r4,r10,r11,LSL #16 + SUBS lr,lr,#1 + STR r4,[r7],#8 + PKHTB r5,r11,r10,ASR #16 + STR r5,[r7],#4 + BGT LoopStart + SUB r0,r7,#0x18 + MOV r1,#4 + POP {r4-r6,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S new file mode 100644 index 0000000..a330972 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S @@ -0,0 +1,164 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + .func armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe +armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe: + PUSH {r4-r12,lr} + VLD1.8 {d0,d1},[r0],r1 + VMOV.I16 d31,#0x14 + VMOV.I16 d30,#0x5 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q5,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VMLA.I16 d10,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q6,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VSUB.I16 d10,d10,d8 + VMLA.I16 d12,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q7,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VSUB.I16 d12,d12,d8 + VMLA.I16 d14,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q8,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VSUB.I16 d14,d14,d8 + VMLA.I16 d16,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q9,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VSUB.I16 d16,d16,d8 + VMLA.I16 d18,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q10,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VSUB.I16 d18,d18,d8 + VMLA.I16 d20,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 
q2,d4,d5 + VADDL.U8 q11,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VSUB.I16 d20,d20,d8 + VMLA.I16 d22,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q12,d0,d1 + VLD1.8 {d0,d1},[r0],r1 + VSUB.I16 d22,d22,d8 + VMLA.I16 d24,d2,d31 + VMUL.I16 d8,d4,d30 + VEXT.8 d4,d0,d1,#1 + VEXT.8 d2,d0,d1,#2 + VEXT.8 d3,d0,d1,#3 + VEXT.8 d5,d0,d1,#4 + VEXT.8 d1,d0,d1,#5 + VADDL.U8 q1,d2,d3 + VADDL.U8 q2,d4,d5 + VADDL.U8 q13,d0,d1 + VSUB.I16 d24,d24,d8 + VMLA.I16 d26,d2,d31 + VMUL.I16 d8,d4,d30 + VMOV.I32 q15,#0x14 + VMOV.I32 q14,#0x5 + VADDL.S16 q5,d10,d20 + VADDL.S16 q1,d14,d16 + VADDL.S16 q0,d12,d18 + VSUB.I16 d26,d26,d8 + VMLA.I32 q5,q1,q15 + VMUL.I32 q4,q0,q14 + VADDL.S16 q6,d12,d22 + VADDL.S16 q1,d16,d18 + VADDL.S16 q0,d14,d20 + VMLA.I32 q6,q1,q15 + VSUB.I32 q5,q5,q4 + VMUL.I32 q4,q0,q14 + VADDL.S16 q2,d14,d24 + VADDL.S16 q1,d18,d20 + VADDL.S16 q0,d16,d22 + VMLA.I32 q2,q1,q15 + VSUB.I32 q6,q6,q4 + VMUL.I32 q4,q0,q14 + VADDL.S16 q3,d16,d26 + VADDL.S16 q1,d20,d22 + VADDL.S16 q0,d18,d24 + VMLA.I32 q3,q1,q15 + VSUB.I32 q2,q2,q4 + VMLS.I32 q3,q0,q14 + VQRSHRUN.S32 d0,q5,#10 + VQRSHRUN.S32 d2,q6,#10 + VQRSHRUN.S32 d4,q2,#10 + VQRSHRUN.S32 d6,q3,#10 + VQMOVN.U16 d0,q0 + VQMOVN.U16 d2,q1 + VQMOVN.U16 d4,q2 + VQMOVN.U16 d6,q3 + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S new file mode 100644 index 0000000..991c33f --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S @@ -0,0 +1,119 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe + .func armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe +armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe: + PUSH {r4-r12,lr} + VLD1.8 {d0,d1},[r0],r1 + ADD r12,r0,r1,LSL #2 + VMOV.I8 d30,#0x5 + VMOV.I8 d31,#0x14 + VLD1.8 {d10,d11},[r12],r1 + VLD1.8 {d2,d3},[r0],r1 + VLD1.8 {d12,d13},[r12],r1 + VADDL.U8 q9,d0,d10 + VLD1.8 {d4,d5},[r0],r1 + VADDL.U8 q0,d1,d11 + VLD1.8 {d6,d7},[r0],r1 + VADDL.U8 q10,d2,d12 + VLD1.8 {d8,d9},[r0],r1 + VMLAL.U8 q9,d4,d31 + VLD1.8 {d14,d15},[r12],r1 + VMLAL.U8 q0,d5,d31 + VLD1.8 {d16,d17},[r12],r1 + VMLAL.U8 q9,d6,d31 + VMLAL.U8 q10,d6,d31 + VMLSL.U8 q0,d3,d30 + VADDL.U8 q11,d4,d14 + VMLSL.U8 q9,d2,d30 + VADDL.U8 q1,d3,d13 + VMLAL.U8 q0,d7,d31 + VMLAL.U8 q10,d8,d31 + VMLSL.U8 q9,d8,d30 + VMLAL.U8 q1,d7,d31 + VMLSL.U8 q0,d9,d30 + VMLAL.U8 q11,d8,d31 + VMLSL.U8 q10,d4,d30 + VMLSL.U8 q1,d5,d30 + VADDL.U8 q2,d5,d15 + VMLAL.U8 q11,d10,d31 + VMLSL.U8 q10,d10,d30 + VMLAL.U8 q1,d9,d31 + VMLAL.U8 q2,d9,d31 + VADDL.U8 q12,d6,d16 + VMLSL.U8 q11,d6,d30 + VMLSL.U8 q1,d11,d30 + VMLSL.U8 q2,d7,d30 + VADDL.U8 q3,d7,d17 + VMLAL.U8 q12,d10,d31 + VMLSL.U8 q11,d12,d30 + VMLSL.U8 q2,d13,d30 + VMLAL.U8 q3,d11,d31 + VMLAL.U8 q12,d12,d31 + VEXT.8 d26,d18,d19,#2 + VMLAL.U8 q2,d11,d31 + VMLAL.U8 q3,d13,d31 + VMLSL.U8 q12,d8,d30 + VEXT.8 d27,d18,d19,#4 + VMOV.I16 d31,#0x14 + VMLSL.U8 q3,d9,d30 + VMLSL.U8 q12,d14,d30 + VEXT.8 d29,d19,d0,#2 + VEXT.8 d28,d18,d19,#6 + VMLSL.U8 q3,d15,d30 + VADDL.S16 q0,d18,d29 + VADD.I16 d27,d27,d28 + VMOV.I16 d30,#0x5 + VADD.I16 d26,d26,d19 + VMLAL.S16 q0,d27,d31 + VEXT.8 d27,d20,d21,#4 + VEXT.8 d28,d20,d21,#6 + VEXT.8 d29,d21,d2,#2 + VMLSL.S16 q0,d26,d30 + VEXT.8 d26,d20,d21,#2 + VADDL.S16 q1,d20,d29 + VADD.I16 d27,d27,d28 + VADD.I16 d26,d26,d21 + VEXT.8 d28,d22,d23,#6 + VMLAL.S16 q1,d27,d31 + VEXT.8 d29,d23,d4,#2 + VEXT.8 d27,d22,d23,#4 + VEXT.8 d8,d22,d23,#2 + VADDL.S16 
q2,d22,d29 + VMLSL.S16 q1,d26,d30 + VADD.I16 d27,d27,d28 + VADD.I16 d26,d8,d23 + VEXT.8 d28,d24,d25,#6 + VMLAL.S16 q2,d27,d31 + VEXT.8 d27,d24,d25,#4 + VEXT.8 d29,d25,d6,#2 + VADD.I16 d27,d27,d28 + VEXT.8 d8,d24,d25,#2 + VADDL.S16 q3,d24,d29 + VMLSL.S16 q2,d26,d30 + VMLAL.S16 q3,d27,d31 + VADD.I16 d8,d8,d25 + VMLSL.S16 q3,d8,d30 + VQRSHRUN.S32 d0,q0,#10 + VQRSHRUN.S32 d2,q1,#10 + VQRSHRUN.S32 d4,q2,#10 + VQRSHRUN.S32 d6,q3,#10 + VQMOVN.U16 d0,q0 + VQMOVN.U16 d2,q1 + VQMOVN.U16 d4,q2 + VQMOVN.U16 d6,q3 + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S new file mode 100644 index 0000000..40e141b --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S @@ -0,0 +1,72 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + .func armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe +armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe: + PUSH {r4-r12,lr} + VLD1.8 {d22,d23},[r0],r1 + VEXT.8 d10,d22,d23,#5 + VEXT.8 d12,d22,d23,#1 + VEXT.8 d14,d22,d23,#2 + VEXT.8 d15,d22,d23,#3 + VEXT.8 d13,d22,d23,#4 + VADDL.U8 q11,d22,d10 + VADDL.U8 q4,d14,d15 + VADDL.U8 q6,d12,d13 + VLD1.8 {d24,d25},[r0],r1 + VMLA.I16 d22,d8,d31 + VMUL.I16 d8,d12,d30 + VEXT.8 d10,d24,d25,#5 + VEXT.8 d12,d24,d25,#1 + VEXT.8 d16,d24,d25,#2 + VEXT.8 d17,d24,d25,#3 + VEXT.8 d13,d24,d25,#4 + VADDL.U8 q12,d24,d10 + VSUB.I16 d22,d22,d8 + VADDL.U8 q4,d16,d17 + VADDL.U8 q6,d12,d13 + VLD1.8 {d26,d27},[r0],r1 + VMLA.I16 d24,d8,d31 + VMUL.I16 d8,d12,d30 + VEXT.8 d10,d26,d27,#5 + VEXT.8 d12,d26,d27,#1 + VEXT.8 d18,d26,d27,#2 + VEXT.8 d19,d26,d27,#3 + VEXT.8 d13,d26,d27,#4 + VADDL.U8 q13,d26,d10 + VSUB.I16 d24,d24,d8 + VADDL.U8 q4,d18,d19 + VADDL.U8 q6,d12,d13 + VLD1.8 {d28,d29},[r0],r1 + VMLA.I16 d26,d8,d31 + VMUL.I16 d8,d12,d30 + VEXT.8 d10,d28,d29,#5 + VEXT.8 d12,d28,d29,#1 + VEXT.8 d20,d28,d29,#2 + VEXT.8 d21,d28,d29,#3 + VEXT.8 d13,d28,d29,#4 + VADDL.U8 q14,d28,d10 + VSUB.I16 d26,d26,d8 + VADDL.U8 q4,d20,d21 + VADDL.U8 q6,d12,d13 + VMLA.I16 d28,d8,d31 + VMLS.I16 d28,d12,d30 + VQRSHRUN.S16 d22,q11,#5 + VQRSHRUN.S16 d24,q12,#5 + VQRSHRUN.S16 d26,q13,#5 + VQRSHRUN.S16 d28,q14,#5 + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S new file mode 100644 index 0000000..955846f --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S @@ -0,0 +1,58 @@ +/* + * (c) Copyright 2007-2008 
ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + .func armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe +armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe: + PUSH {r4-r12,lr} + VLD1.8 {d7},[r0],r1 + ADD r12,r0,r1,LSL #2 + VLD1.8 {d8},[r0],r1 + VLD1.8 {d12},[r12],r1 + VLD1.8 {d9},[r0],r1 + VADDL.U8 q0,d7,d12 + VLD1.8 {d10},[r0],r1 + VLD1.8 {d13},[r12],r1 + VLD1.8 {d11},[r0],r1 + VLD1.8 {d14},[r12],r1 + VADDL.U8 q8,d8,d11 + VADDL.U8 q9,d9,d10 + VLD1.8 {d15},[r12],r1 + VMLS.I16 d0,d16,d30 + VMUL.I16 d20,d18,d31 + VADDL.U8 q8,d9,d12 + VADDL.U8 q9,d10,d11 + VADDL.U8 q1,d8,d13 + VMLS.I16 d2,d16,d30 + VMUL.I16 d21,d18,d31 + VADDL.U8 q8,d10,d13 + VADDL.U8 q9,d11,d12 + VADDL.U8 q2,d9,d14 + VMLS.I16 d4,d16,d30 + VMUL.I16 d22,d18,d31 + VADDL.U8 q8,d11,d14 + VADDL.U8 q3,d10,d15 + VADDL.U8 q9,d12,d13 + VMLS.I16 d6,d16,d30 + VADD.I16 d0,d0,d20 + VADD.I16 d2,d2,d21 + VADD.I16 d4,d4,d22 + VMLA.I16 d6,d18,d31 + VQRSHRUN.S16 d0,q0,#5 + VQRSHRUN.S16 d2,q1,#5 + VQRSHRUN.S16 d4,q2,#5 + VQRSHRUN.S16 d6,q3,#5 + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S new file mode 100644 index 0000000..66520da --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S @@ -0,0 +1,175 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + + .section .rodata + .align 4 + +armVCM4P10_WidthBranchTableMVIsNotZero: + .word WidthIs2MVIsNotZero, WidthIs2MVIsNotZero + .word WidthIs4MVIsNotZero, WidthIs4MVIsNotZero + .word WidthIs8MVIsNotZero + +armVCM4P10_WidthBranchTableMVIsZero: + .word WidthIs2MVIsZero, WidthIs2MVIsZero + .word WidthIs4MVIsZero, WidthIs4MVIsZero + .word WidthIs8MVIsZero + + .text + + .global armVCM4P10_Interpolate_Chroma + .func armVCM4P10_Interpolate_Chroma +armVCM4P10_Interpolate_Chroma: + PUSH {r4-r12,lr} + VPUSH {d8-d15} + LDRD r6,r7,[sp,#0x70] + LDRD r4,r5,[sp,#0x68] + RSB r8,r6,#8 + RSB r9,r7,#8 + CMN r6,r7 + MOV r10,#1 + LDREQ r11, =armVCM4P10_WidthBranchTableMVIsZero + SUB lr,r1,r10 + LDRNE r11, =armVCM4P10_WidthBranchTableMVIsNotZero + VLD1.8 {d0},[r0],r10 + SMULBB r12,r8,r9 + SMULBB r9,r6,r9 + VLD1.8 {d1},[r0],lr + SMULBB r8,r8,r7 + SMULBB r6,r6,r7 + VDUP.8 d12,r12 + VDUP.8 d13,r9 + VDUP.8 d14,r8 + VDUP.8 d15,r6 + LDR pc,[r11,r4,LSL #1] + +WidthIs8MVIsNotZero: + VLD1.8 {d2},[r0],r10 + VMULL.U8 q2,d0,d12 + VLD1.8 {d3},[r0],lr + VMULL.U8 q3,d2,d12 + VLD1.8 {d16},[r0],r10 + VMLAL.U8 q2,d1,d13 + VLD1.8 {d17},[r0],lr + VMULL.U8 q11,d16,d12 + VMLAL.U8 q3,d3,d13 + VLD1.8 {d18},[r0],r10 + VMLAL.U8 q2,d2,d14 + VMLAL.U8 q11,d17,d13 + VMULL.U8 q12,d18,d12 + VLD1.8 {d19},[r0],lr + VMLAL.U8 q3,d16,d14 + VLD1.8 {d0},[r0],r10 + VMLAL.U8 q12,d19,d13 + VMLAL.U8 q11,d18,d14 + VMLAL.U8 q2,d3,d15 + VLD1.8 {d1},[r0],lr + VMLAL.U8 q12,d0,d14 + VMLAL.U8 q3,d17,d15 + VMLAL.U8 q11,d19,d15 + SUBS r5,r5,#4 + VMLAL.U8 q12,d1,d15 + VQRSHRN.U16 d8,q2,#6 + VQRSHRN.U16 d9,q3,#6 + VQRSHRN.U16 d20,q11,#6 + VST1.64 {d8},[r2],r3 + VQRSHRN.U16 d21,q12,#6 + VST1.64 {d9},[r2],r3 + VST1.64 {d20},[r2],r3 + VST1.64 {d21},[r2],r3 + BGT WidthIs8MVIsNotZero + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + +WidthIs4MVIsNotZero: + VLD1.8 {d2},[r0],r10 + VMULL.U8 q2,d0,d12 + VMULL.U8 q3,d2,d12 + VLD1.8 {d3},[r0],lr + VMLAL.U8 q2,d1,d13 + 
VMLAL.U8 q3,d3,d13 + VLD1.8 {d0},[r0],r10 + VMLAL.U8 q2,d2,d14 + VMLAL.U8 q3,d0,d14 + VLD1.8 {d1},[r0],lr + SUBS r5,r5,#2 + VMLAL.U8 q3,d1,d15 + VMLAL.U8 q2,d3,d15 + VQRSHRN.U16 d9,q3,#6 + VQRSHRN.U16 d8,q2,#6 + VST1.32 {d8[0]},[r2],r3 + VST1.32 {d9[0]},[r2],r3 + BGT WidthIs4MVIsNotZero + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + +WidthIs2MVIsNotZero: + VLD1.8 {d2},[r0],r10 + VMULL.U8 q2,d0,d12 + VMULL.U8 q3,d2,d12 + VLD1.8 {d3},[r0],lr + VMLAL.U8 q2,d1,d13 + VMLAL.U8 q3,d3,d13 + VLD1.8 {d0},[r0],r10 + VMLAL.U8 q2,d2,d14 + VMLAL.U8 q3,d0,d14 + VLD1.8 {d1},[r0],lr + SUBS r5,r5,#2 + VMLAL.U8 q3,d1,d15 + VMLAL.U8 q2,d3,d15 + VQRSHRN.U16 d9,q3,#6 + VQRSHRN.U16 d8,q2,#6 + VST1.16 {d8[0]},[r2],r3 + VST1.16 {d9[0]},[r2],r3 + BGT WidthIs2MVIsNotZero + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + +WidthIs8MVIsZero: + SUB r0,r0,r1 +WidthIs8LoopMVIsZero: + VLD1.8 {d0},[r0],r1 + SUBS r5,r5,#2 + VLD1.8 {d1},[r0],r1 + VST1.64 {d0},[r2],r3 + VST1.64 {d1},[r2],r3 + BGT WidthIs8LoopMVIsZero + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + +WidthIs4MVIsZero: + VLD1.8 {d1},[r0],r1 + SUBS r5,r5,#2 + VST1.32 {d0[0]},[r2],r3 + VLD1.8 {d0},[r0],r1 + VST1.32 {d1[0]},[r2],r3 + BGT WidthIs4MVIsZero + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + +WidthIs2MVIsZero: + VLD1.8 {d1},[r0],r1 + SUBS r5,r5,#2 + VST1.16 {d0[0]},[r2],r3 + VLD1.8 {d0},[r0],r1 + VST1.16 {d1[0]},[r2],r3 + BGT WidthIs2MVIsZero + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S new file mode 100644 index 0000000..f5d6d1f --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S @@ -0,0 +1,68 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .section .rodata + .align 4 + + .global armVCM4P10_MFMatrixQPModTable + .global armVCM4P10_QPDivIntraTable + .global armVCM4P10_QPDivPlusOneTable + +;//------------------------------------------------------------------ +;// This table contains (1 << QbitsPlusOne) / 3 values (Intra case), +;// for values of iQP from 0 to 51 (inclusive). NOTE(review): only 48 words (iQP 0..47) are listed below; confirm iQP 48..51 is never used as an index. +;//------------------------------------------------------------------ + + +armVCM4P10_QPDivIntraTable: + .word 21845, 21845, 21845, 21845, 21845, 21845 + .word 43690, 43690, 43690, 43690, 43690, 43690 + .word 87381, 87381, 87381, 87381, 87381, 87381 + .word 174762, 174762, 174762, 174762, 174762, 174762 + .word 349525, 349525, 349525, 349525, 349525, 349525 + .word 699050, 699050, 699050, 699050, 699050, 699050 + .word 1398101, 1398101, 1398101, 1398101, 1398101, 1398101 + .word 2796202, 2796202, 2796202, 2796202, 2796202, 2796202 + + +;//-------------------------------------------------------------- +;// This table contains armVCM4P10_MFMatrix [iQP % 6][0] entries, +;// for values of iQP from 0 to 51 (inclusive). +;//-------------------------------------------------------------- + +armVCM4P10_MFMatrixQPModTable: + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + .hword 13107, 11916, 10082, 9362, 8192, 7282 + +;//--------------------------------------------------------------- +;// This table contains ARM_M4P10_Q_OFFSET + 1 + (iQP / 6) values, +;// for values of iQP from 0 to 51 (inclusive). 
+;//--------------------------------------------------------------- + +armVCM4P10_QPDivPlusOneTable: + .byte 16, 16, 16, 16, 16, 16 + .byte 17, 17, 17, 17, 17, 17 + .byte 18, 18, 18, 18, 18, 18 + .byte 19, 19, 19, 19, 19, 19 + .byte 20, 20, 20, 20, 20, 20 + .byte 21, 21, 21, 21, 21, 21 + .byte 22, 22, 22, 22, 22, 22 + .byte 23, 23, 23, 23, 23, 23 + .byte 24, 24, 24, 24, 24, 24 + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S new file mode 100644 index 0000000..c24d717 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S @@ -0,0 +1,52 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_TransformResidual4x4 + .func armVCM4P10_TransformResidual4x4 +armVCM4P10_TransformResidual4x4: + VPUSH {d8} + VLD4.16 {d0,d1,d2,d3},[r1] + VMOV.I16 d4,#0 + VADD.I16 d5,d0,d2 + VSUB.I16 d6,d0,d2 + VHADD.S16 d7,d1,d4 + VHADD.S16 d8,d3,d4 + VSUB.I16 d7,d7,d3 + VADD.I16 d8,d1,d8 + VADD.I16 d0,d5,d8 + VADD.I16 d1,d6,d7 + VSUB.I16 d2,d6,d7 + VSUB.I16 d3,d5,d8 + VTRN.16 d0,d1 + VTRN.16 d2,d3 + VTRN.32 q0,q1 + VADD.I16 d5,d0,d2 + VSUB.I16 d6,d0,d2 + VHADD.S16 d7,d1,d4 + VHADD.S16 d8,d3,d4 + VSUB.I16 d7,d7,d3 + VADD.I16 d8,d1,d8 + VADD.I16 d0,d5,d8 + VADD.I16 d1,d6,d7 + VSUB.I16 d2,d6,d7 + VSUB.I16 d3,d5,d8 + VRSHR.S16 d0,d0,#6 + VRSHR.S16 d1,d1,#6 + VRSHR.S16 d2,d2,#6 + VRSHR.S16 d3,d3,#6 + VST1.16 {d0,d1,d2,d3},[r0] + VPOP {d8} + BX lr + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S new file mode 100644 index 
0000000..c552f8d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S @@ -0,0 +1,40 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_UnpackBlock4x4 + .func armVCM4P10_UnpackBlock4x4 +armVCM4P10_UnpackBlock4x4: + PUSH {r4-r8,lr} + LDR r2,[r0,#0] + MOV r7,#0x1f + MOV r4,#0 + MOV r5,#0 + LDRB r3,[r2],#1 + STRD r4,r5,[r1,#0] + STRD r4,r5,[r1,#8] + STRD r4,r5,[r1,#0x10] + STRD r4,r5,[r1,#0x18] +unpackLoop: + TST r3,#0x10 + LDRNESB r5,[r2,#1] + LDRNEB r4,[r2],#2 + AND r6,r7,r3,LSL #1 + LDREQSB r4,[r2],#1 + ORRNE r4,r4,r5,LSL #8 + TST r3,#0x20 + LDREQB r3,[r2],#1 + STRH r4,[r1,r6] + BEQ unpackLoop + STR r2,[r0,#0] + POP {r4-r8,pc} + .endfunc + .end diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S new file mode 100644 index 0000000..ba61059 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S @@ -0,0 +1,67 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_DeblockLuma_I + .func omxVCM4P10_DeblockLuma_I +omxVCM4P10_DeblockLuma_I: + PUSH {r4-r9,lr} + MOVS r6,r0 + SUB sp,sp,#0xc + MOV r9,r1 + MOV r7,r2 + MOV r8,r3 + LDR r4,[sp,#0x28] + LDR r5,[sp,#0x2c] + BEQ L0x58 + TST r6,#7 + TSTEQ r9,#7 + BNE L0x58 + CMP r7,#0 + CMPNE r8,#0 + CMPNE r4,#0 + BEQ L0x58 + TST r4,#3 + BNE L0x58 + CMP r5,#0 + BEQ L0x58 + TST r5,#3 + BEQ L0x64 +L0x58: + MVN r0,#4 +L0x5c: + ADD sp,sp,#0xc + POP {r4-r9,pc} +L0x64: + STR r4,[sp,#0] + MOV r3,r8 + STR r5,[sp,#4] + MOV r2,r7 + MOV r1,r9 + MOV r0,r6 + BL omxVCM4P10_FilterDeblockingLuma_VerEdge_I + CMP r0,#0 + BNE L0x5c + ADD r3,r5,#0x10 + ADD r2,r4,#0x10 + STR r3,[sp,#4] + STR r2,[sp,#0] + ADD r3,r8,#2 + ADD r2,r7,#2 + MOV r1,r9 + MOV r0,r6 + BL omxVCM4P10_FilterDeblockingLuma_HorEdge_I + ADD sp,sp,#0xc + POP {r4-r9,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S new file mode 100644 index 0000000..be21ee7 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S @@ -0,0 +1,119 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_DequantTransformResidualFromPairAndAdd + .func omxVCM4P10_DequantTransformResidualFromPairAndAdd +omxVCM4P10_DequantTransformResidualFromPairAndAdd: + PUSH {r4-r12,lr} + VPUSH {d8-d9} + SUB sp,sp,#0x20 + ADD r4,sp,#0 + LDR r5,[sp,#0x64] + MOV r7,r1 + MOV r8,r2 + MOV r9,r3 + CMP r5,#0 + BEQ L0x114 + MOV r1,r4 + BL armVCM4P10_UnpackBlock4x4 ;// + LDR r1,[sp,#0x60] + LDR r11, =armVCM4P10_QPModuloTable + LDR r10, =armVCM4P10_QPDivTable + LDR r2, =armVCM4P10_VMatrixU16 + LDRSB r12,[r11,r1] + LDRSB lr,[r10,r1] + LDR r10, =0x3020504 + LDR r1, =0x5040100 + ADD r2,r2,r12 + VDUP.32 d7,r1 + VDUP.32 d9,r10 + VDUP.16 d5,lr + VLD1.8 {d6},[r2] + VTBL.8 d8,{d6},d7 + VTBL.8 d4,{d6},d9 + CMP r8,#0 + VLD1.16 {d0,d1,d2,d3},[r4] + VSHL.U16 d8,d8,d5 + VSHL.U16 d4,d4,d5 + BEQ L1 + LDRSH r10,[r8,#0] +L1: + VMUL.I16 d0,d0,d8 + VMUL.I16 d1,d1,d4 + VMUL.I16 d2,d2,d8 + VMUL.I16 d3,d3,d4 + VMOVNE.16 d0[0],r10 + VTRN.16 d0,d1 + VTRN.16 d2,d3 + VTRN.32 q0,q1 + VMOV.I16 d4,#0 + VADD.I16 d5,d0,d2 + VSUB.I16 d6,d0,d2 + VHADD.S16 d7,d1,d4 + VHADD.S16 d8,d3,d4 + VSUB.I16 d7,d7,d3 + VADD.I16 d8,d1,d8 + VADD.I16 d0,d5,d8 + VADD.I16 d1,d6,d7 + VSUB.I16 d2,d6,d7 + VSUB.I16 d3,d5,d8 + VTRN.16 d0,d1 + VTRN.16 d2,d3 + VTRN.32 q0,q1 + VADD.I16 d5,d0,d2 + VSUB.I16 d6,d0,d2 + VHADD.S16 d7,d1,d4 + VHADD.S16 d8,d3,d4 + VSUB.I16 d7,d7,d3 + VADD.I16 d8,d1,d8 + VADD.I16 d0,d5,d8 + VADD.I16 d1,d6,d7 + VSUB.I16 d2,d6,d7 + VSUB.I16 d3,d5,d8 + VRSHR.S16 d0,d0,#6 + VRSHR.S16 d1,d1,#6 + VRSHR.S16 d2,d2,#6 + VRSHR.S16 d3,d3,#6 + B L0x130 +L0x114: + LDRSH r10,[r8,#0] + ADD r10,r10,#0x20 + ASR r10,r10,#6 + VDUP.16 d0,r10 + VDUP.16 d1,r10 + VDUP.16 d2,r10 + VDUP.16 d3,r10 +L0x130: + LDR r1,[sp,#0x58] + LDR r10,[sp,#0x5c] + LDR r3,[r7],r1 + LDR r5,[r7],r1 + VMOV d4,r3,r5 + LDR r3,[r7],r1 + LDR r5,[r7,#0] + VMOV d5,r3,r5 + VADDW.U8 q3,q0,d4 + VADDW.U8 q4,q1,d5 + VQMOVUN.S16 d0,q3 + VQMOVUN.S16 d1,q4 + VST1.32 
{d0[0]},[r9],r10 + VST1.32 {d0[1]},[r9],r10 + VST1.32 {d1[0]},[r9],r10 + VST1.32 {d1[1]},[r9] + MOV r0,#0 + ADD sp,sp,#0x20 + VPOP {d8-d9} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S new file mode 100644 index 0000000..79ba538 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S @@ -0,0 +1,87 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_FilterDeblockingChroma_HorEdge_I + .func omxVCM4P10_FilterDeblockingChroma_HorEdge_I +omxVCM4P10_FilterDeblockingChroma_HorEdge_I: + PUSH {r4-r10,lr} + VPUSH {d8-d15} + VLD1.8 {d0[]},[r2]! + SUB r0,r0,r1,LSL #1 + SUB r0,r0,r1 + VLD1.8 {d2[]},[r3]! 
+ LDR r4,[sp,#0x64] + LDR r5,[sp,#0x60] + LDR r9, =0x3030303 + LDR r8, =0x4040404 + VMOV.I8 d14,#0 + VMOV.I8 d15,#0x1 + VMOV.I16 d1,#0x4 + MOV r7,#0x40000000 +L0x38: + LDR r6,[r4],#8 + VLD1.8 {d6},[r0],r1 + VLD1.8 {d5},[r0],r1 + CMP r6,#0 + VLD1.8 {d4},[r0],r1 + VLD1.8 {d8},[r0],r1 + VABD.U8 d19,d6,d4 + VLD1.8 {d9},[r0],r1 + VABD.U8 d13,d4,d8 + VLD1.8 {d10},[r0],r1 + BEQ L0xe4 + VABD.U8 d12,d5,d4 + VABD.U8 d18,d9,d8 + VCGT.U8 d16,d0,d13 + VMOV.32 d26[0],r6 + VMAX.U8 d12,d18,d12 + VMOVL.U8 q13,d26 + VABD.U8 d17,d10,d8 + VCGT.S16 d27,d26,#0 + VCGT.U8 d12,d2,d12 + VCGT.U8 d19,d2,d19 + VAND d16,d16,d27 + TST r6,r9 + VCGT.U8 d17,d2,d17 + VAND d16,d16,d12 + VAND d12,d16,d17 + VAND d17,d16,d19 + BLNE armVCM4P10_DeblockingChromabSLT4_unsafe + TST r6,r8 + SUB r0,r0,r1,LSL #2 + VTST.16 d26,d26,d1 + BLNE armVCM4P10_DeblockingChromabSGE4_unsafe + VBIT d29,d13,d26 + VBIT d24,d31,d26 + VBIF d29,d4,d16 + VBIF d24,d8,d16 + VST1.8 {d29},[r0],r1 + ADDS r7,r7,r7 + VST1.8 {d24},[r0],r1 + BNE L0x38 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r10,pc} +L0xe4: + VLD1.8 {d0[]},[r2] + SUB r0,r0,r1,LSL #1 + ADDS r7,r7,r7 + VLD1.8 {d2[]},[r3] + ADD r5,r5,#4 + BNE L0x38 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r10,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S new file mode 100644 index 0000000..dcdddbe --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S @@ -0,0 +1,123 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_FilterDeblockingChroma_VerEdge_I + .func omxVCM4P10_FilterDeblockingChroma_VerEdge_I +omxVCM4P10_FilterDeblockingChroma_VerEdge_I: + PUSH {r4-r12,lr} + VPUSH {d8-d15} + VLD1.8 {d0[]},[r2]! + SUB r0,r0,#4 + VLD1.8 {d2[]},[r3]! + LDR r4,[sp,#0x6c] + LDR r5,[sp,#0x68] + LDR r8, =0x4040404 + LDR r9, =0x3030303 + VMOV.I8 d14,#0 + VMOV.I8 d15,#0x1 + VMOV.I16 d1,#0x4 + MOV r7,#0x40000000 +L0x34: + LDR r6,[r4],#8 + ADD r10,r0,r1 + ADD lr,r1,r1 + VLD1.8 {d7},[r0],lr + VLD1.8 {d8},[r10],lr + VLD1.8 {d5},[r0],lr + VLD1.8 {d10},[r10],lr + VLD1.8 {d6},[r0],lr + VLD1.8 {d9},[r10],lr + VLD1.8 {d4},[r0],lr + VLD1.8 {d11},[r10],lr + VZIP.8 d7,d8 + VZIP.8 d5,d10 + VZIP.8 d6,d9 + VZIP.8 d4,d11 + VZIP.16 d7,d5 + VZIP.16 d8,d10 + VZIP.16 d6,d4 + VZIP.16 d9,d11 + VTRN.32 d7,d6 + VTRN.32 d5,d4 + VTRN.32 d10,d11 + VTRN.32 d8,d9 + CMP r6,#0 + VABD.U8 d19,d6,d4 + VABD.U8 d13,d4,d8 + BEQ L0x170 + VABD.U8 d12,d5,d4 + VABD.U8 d18,d9,d8 + VMOV.32 d26[0],r6 + VCGT.U8 d16,d0,d13 + VMAX.U8 d12,d18,d12 + VMOVL.U8 q13,d26 + VABD.U8 d17,d10,d8 + VCGT.S16 d27,d26,#0 + VCGT.U8 d12,d2,d12 + VCGT.U8 d19,d2,d19 + VAND d16,d16,d27 + TST r6,r9 + VCGT.U8 d17,d2,d17 + VAND d16,d16,d12 + VAND d12,d16,d17 + VAND d17,d16,d19 + BLNE armVCM4P10_DeblockingChromabSLT4_unsafe + TST r6,r8 + SUB r0,r0,r1,LSL #3 + VTST.16 d26,d26,d1 + BLNE armVCM4P10_DeblockingChromabSGE4_unsafe + VBIT d29,d13,d26 + VBIT d24,d31,d26 + ADD r10,r0,#3 + VBIF d29,d4,d16 + ADD r12,r10,r1 + ADD lr,r1,r1 + VBIF d24,d8,d16 + ADDS r7,r7,r7 + VST1.8 {d29[0]},[r10],lr + VST1.8 {d29[1]},[r12],lr + VST1.8 {d29[2]},[r10],lr + VST1.8 {d29[3]},[r12],lr + VST1.8 {d29[4]},[r10],lr + VST1.8 {d29[5]},[r12],lr + VST1.8 {d29[6]},[r10],lr + VST1.8 {d29[7]},[r12],lr + ADD r12,r0,#4 + ADD r10,r12,r1 + VST1.8 {d24[0]},[r12],lr + VST1.8 {d24[1]},[r10],lr + VST1.8 {d24[2]},[r12],lr + VST1.8 {d24[3]},[r10],lr + VST1.8 {d24[4]},[r12],lr + VST1.8 
{d24[5]},[r10],lr + VST1.8 {d24[6]},[r12],lr + VST1.8 {d24[7]},[r10],lr + ADD r0,r0,#4 + BNE L0x34 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} +L0x170: + VLD1.8 {d0[]},[r2] + ADD r0,r0,#4 + SUB r0,r0,r1,LSL #3 + ADDS r7,r7,r7 + VLD1.8 {d2[]},[r3] + ADD r5,r5,#4 + BNE L0x34 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S new file mode 100644 index 0000000..9755899 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S @@ -0,0 +1,107 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_FilterDeblockingLuma_HorEdge_I + .func omxVCM4P10_FilterDeblockingLuma_HorEdge_I +omxVCM4P10_FilterDeblockingLuma_HorEdge_I: + PUSH {r4-r12,lr} + VPUSH {d8-d15} + ADD r7,r2,#1 + ADD r8,r3,#1 + VLD1.8 {d0[]},[r2] + SUB r0,r0,r1,LSL #2 + VLD1.8 {d2[]},[r3] + LDR r4,[sp,#0x6c] + LDR r5,[sp,#0x68] + MOV r11,#0 + VMOV.I8 d14,#0 + VMOV.I8 d15,#0x1 + ADD r10,r1,r1 + MOV r9,#0x55000000 +L0x38: + LDRH r12,[r4],#2 + ADD r6,r0,r1 + CMP r12,#0 + BEQ L0xe4 + VLD1.8 {d7},[r0],r10 + VLD1.8 {d6},[r6],r10 + VLD1.8 {d5},[r0],r10 + VLD1.8 {d4},[r6],r10 + VLD1.8 {d8},[r0],r10 + VABD.U8 d12,d4,d5 + VLD1.8 {d9},[r6] + VABD.U8 d13,d8,d4 + VLD1.8 {d10},[r0],r1 + VABD.U8 d18,d9,d8 + VABD.U8 d19,d6,d4 + VCGT.U8 d16,d0,d13 + TST r12,#0xff + VMAX.U8 d12,d18,d12 + VABD.U8 d17,d10,d8 + VMOVEQ.32 d16[0],r11 + TST r12,#0xff00 + VCGT.U8 d19,d2,d19 + VCGT.U8 d12,d2,d12 + VMOVEQ.32 d16[1],r11 + VCGT.U8 d17,d2,d17 + VLD1.8 {d11},[r0] + VAND d16,d16,d12 + TST r12,#4 + VAND d12,d16,d17 + VAND d17,d16,d19 + BNE L0xf8 + SUB r0,r0,r1,LSL #2 + SUB 
r0,r0,r1 + BL armVCM4P10_DeblockingLumabSLT4_unsafe + VST1.8 {d30},[r0],r1 + VST1.8 {d29},[r0],r1 + SUB r6,r0,r1,LSL #2 + VST1.8 {d24},[r0],r1 + ADDS r9,r9,r9 + VST1.8 {d25},[r0] + ADD r0,r6,#8 + BCC L0x38 + B L0x130 +L0xe4: + ADD r0,r0,#8 + ADDS r9,r9,r9 + ADD r5,r5,#2 + BCC L0x38 + B L0x130 +L0xf8: + SUB r0,r0,r1,LSL #2 + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_DeblockingLumabSGE4_unsafe + VST1.8 {d31},[r0],r1 + VST1.8 {d30},[r0],r1 + VST1.8 {d29},[r0],r1 + SUB r6,r0,r1,LSL #2 + VST1.8 {d24},[r0],r1 + ADDS r9,r9,r9 + VST1.8 {d25},[r0],r1 + ADD r5,r5,#2 + VST1.8 {d28},[r0] + ADD r0,r6,#8 + BCC L0x38 +L0x130: + SUB r0,r0,#0x10 + VLD1.8 {d0[]},[r7] + ADD r0,r0,r1,LSL #2 + VLD1.8 {d2[]},[r8] + BNE L0x38 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S new file mode 100644 index 0000000..66cc32e --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S @@ -0,0 +1,157 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_FilterDeblockingLuma_VerEdge_I + .func omxVCM4P10_FilterDeblockingLuma_VerEdge_I +omxVCM4P10_FilterDeblockingLuma_VerEdge_I: + PUSH {r4-r12,lr} + VPUSH {d8-d15} + ADD r7,r2,#1 + ADD r8,r3,#1 + VLD1.8 {d0[]},[r2] + SUB r0,r0,#4 + VLD1.8 {d2[]},[r3] + LDR r4,[sp,#0x6c] + LDR r5,[sp,#0x68] + MOV r6,#0 + VMOV.I8 d14,#0 + VMOV.I8 d15,#0x1 + MOV r9,#0x11000000 + ADD r11,r1,r1 +L0x38: + LDRH r12,[r4],#4 + CMP r12,#0 + BEQ L0x160 + ADD r10,r0,r1 + VLD1.8 {d7},[r0],r11 + VLD1.8 {d8},[r10],r11 + VLD1.8 {d5},[r0],r11 + VZIP.8 d7,d8 + VLD1.8 {d10},[r10],r11 + VLD1.8 {d6},[r0],r11 + VZIP.8 d5,d10 + VLD1.8 {d9},[r10],r11 + VLD1.8 {d4},[r0],r11 + VLD1.8 {d11},[r10],r11 + VZIP.8 d6,d9 + VZIP.16 d8,d10 + VZIP.8 d4,d11 + SUB r0,r0,r1,LSL #3 + VZIP.16 d7,d5 + VZIP.16 d9,d11 + VZIP.16 d6,d4 + VTRN.32 d8,d9 + VTRN.32 d5,d4 + VTRN.32 d10,d11 + VTRN.32 d7,d6 + VABD.U8 d13,d4,d8 + VABD.U8 d12,d5,d4 + VABD.U8 d18,d9,d8 + VABD.U8 d19,d6,d4 + TST r12,#0xff + VCGT.U8 d16,d0,d13 + VMAX.U8 d12,d18,d12 + VABD.U8 d17,d10,d8 + VMOVEQ.32 d16[0],r6 + TST r12,#0xff00 + VCGT.U8 d19,d2,d19 + VCGT.U8 d12,d2,d12 + VMOVEQ.32 d16[1],r6 + VCGT.U8 d17,d2,d17 + VAND d16,d16,d12 + TST r12,#4 + VAND d12,d16,d17 + VAND d17,d16,d19 + BNE L0x17c + BL armVCM4P10_DeblockingLumabSLT4_unsafe + VZIP.8 d7,d6 + VZIP.8 d30,d29 + VZIP.8 d24,d25 + VZIP.8 d10,d11 + VZIP.16 d7,d30 + ADD r10,r0,r1 + VZIP.16 d24,d10 + VZIP.16 d25,d11 + VZIP.16 d6,d29 + VTRN.32 d7,d24 + VTRN.32 d30,d10 + VTRN.32 d6,d25 + VTRN.32 d29,d11 + VST1.8 {d7},[r0],r11 + VST1.8 {d24},[r10],r11 + VST1.8 {d30},[r0],r11 + VST1.8 {d10},[r10],r11 + VST1.8 {d6},[r0],r11 + VST1.8 {d25},[r10],r11 + ADDS r9,r9,r9 + VST1.8 {d29},[r0],r11 + ADD r5,r5,#2 + VST1.8 {d11},[r10],r1 + SUB r0,r0,r1,LSL #3 + VLD1.8 {d0[]},[r7] + ADD r0,r0,#4 + VLD1.8 {d2[]},[r8] + BCC L0x38 + B L0x1f0 +L0x160: + ADD r0,r0,#4 + ADDS r9,r9,r9 + VLD1.8 {d0[]},[r7] + ADD r5,r5,#4 + 
VLD1.8 {d2[]},[r8] + BCC L0x38 + B L0x1f0 +L0x17c: + BL armVCM4P10_DeblockingLumabSGE4_unsafe + VZIP.8 d7,d31 + VZIP.8 d30,d29 + VZIP.8 d24,d25 + VZIP.8 d28,d11 + VZIP.16 d7,d30 + ADD r10,r0,r1 + VZIP.16 d24,d28 + VZIP.16 d25,d11 + VZIP.16 d31,d29 + VTRN.32 d7,d24 + VTRN.32 d30,d28 + VTRN.32 d31,d25 + VTRN.32 d29,d11 + VST1.8 {d7},[r0],r11 + VST1.8 {d24},[r10],r11 + VST1.8 {d30},[r0],r11 + VST1.8 {d28},[r10],r11 + VST1.8 {d31},[r0],r11 + VST1.8 {d25},[r10],r11 + ADDS r9,r9,r9 + VST1.8 {d29},[r0],r11 + ADD r5,r5,#4 + VST1.8 {d11},[r10],r11 + SUB r0,r0,r1,LSL #3 + VLD1.8 {d0[]},[r7] + ADD r0,r0,#4 + VLD1.8 {d2[]},[r8] + BCC L0x38 +L0x1f0: + SUB r4,r4,#0xe + SUB r5,r5,#0xe + SUB r0,r0,#0x10 + VLD1.8 {d0[]},[r2] + ADD r0,r0,r1,LSL #3 + VLD1.8 {d2[]},[r3] + BNE L0x38 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S new file mode 100644 index 0000000..76c3d7d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S @@ -0,0 +1,323 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_InterpolateLuma + .func omxVCM4P10_InterpolateLuma +omxVCM4P10_InterpolateLuma: + PUSH {r4-r12,lr} + VPUSH {d8-d15} + SUB sp,sp,#0x10 + LDR r6,[sp,#0x78] + LDR r7,[sp,#0x7c] + LDR r5,[sp,#0x80] + LDR r4,[sp,#0x84] + ADD r6,r6,r7,LSL #2 + ADD r11,sp,#0 + VMOV.I16 d31,#0x14 + VMOV.I16 d30,#0x5 +L0x2c: + STM r11,{r0-r3} + ADD pc,pc,r6,LSL #2 + B L0x3f0 + B L0x78 + B L0xa8 + B L0xdc + B L0x100 + B L0x134 + B L0x168 + B L0x1a8 + B L0x1f0 + B L0x234 + B L0x258 + B L0x2b0 + B L0x2d8 + B L0x330 + B L0x364 + B L0x3a8 + B L0x3f0 +L0x78: + ADD r12,r0,r1,LSL #1 + VLD1.8 {d9},[r0],r1 + VLD1.8 {d11},[r12],r1 + VLD1.8 {d10},[r0] + VLD1.8 {d12},[r12] + ADD r12,r2,r3,LSL #1 + VST1.32 {d9[0]},[r2],r3 + VST1.32 {d11[0]},[r12],r3 + VST1.32 {d10[0]},[r2] + VST1.32 {d12[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0xa8: + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD.U8 d22,d22,d14 + VRHADD.U8 d26,d26,d18 + VRHADD.U8 d24,d24,d16 + VRHADD.U8 d28,d28,d20 + ADD r12,r2,r3,LSL #1 + VST1.32 {d22[0]},[r2],r3 + VST1.32 {d26[0]},[r12],r3 + VST1.32 {d24[0]},[r2] + VST1.32 {d28[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0xdc: + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + ADD r12,r2,r3,LSL #1 + VST1.32 {d22[0]},[r2],r3 + VST1.32 {d26[0]},[r12],r3 + VST1.32 {d24[0]},[r2] + VST1.32 {d28[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x100: + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD.U8 d22,d22,d15 + VRHADD.U8 d26,d26,d19 + VRHADD.U8 d24,d24,d17 + VRHADD.U8 d28,d28,d21 + ADD r12,r2,r3,LSL #1 + VST1.32 {d22[0]},[r2],r3 + VST1.32 {d26[0]},[r12],r3 + VST1.32 {d24[0]},[r2] + VST1.32 {d28[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x134: + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + VRHADD.U8 d0,d0,d9 + VRHADD.U8 d4,d4,d11 + VRHADD.U8 d2,d2,d10 + VRHADD.U8 d6,d6,d12 + ADD r12,r2,r3,LSL #1 + VST1.32 {d0[0]},[r2],r3 + 
VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x168: + MOV r8,r0 + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + SUB r0,r8,#2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD.U8 d22,d22,d0 + VRHADD.U8 d26,d26,d4 + VRHADD.U8 d24,d24,d2 + VRHADD.U8 d28,d28,d6 + ADD r12,r2,r3,LSL #1 + VST1.32 {d22[0]},[r2],r3 + VST1.32 {d26[0]},[r12],r3 + VST1.32 {d24[0]},[r2] + VST1.32 {d28[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x1a8: + SUB r0,r0,r1,LSL #1 + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + VQRSHRUN.S16 d14,q7,#5 + VQRSHRUN.S16 d16,q8,#5 + VQRSHRUN.S16 d18,q9,#5 + VQRSHRUN.S16 d20,q10,#5 + VRHADD.U8 d0,d0,d14 + VRHADD.U8 d4,d4,d18 + VRHADD.U8 d2,d2,d16 + VRHADD.U8 d6,d6,d20 + ADD r12,r2,r3,LSL #1 + VST1.32 {d0[0]},[r2],r3 + VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x1f0: + MOV r8,r0 + ADD r0,r0,#1 + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + SUB r0,r8,#2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD.U8 d22,d22,d0 + VRHADD.U8 d26,d26,d4 + VRHADD.U8 d24,d24,d2 + VRHADD.U8 d28,d28,d6 + ADD r12,r2,r3,LSL #1 + VST1.32 {d22[0]},[r2],r3 + VST1.32 {d26[0]},[r12],r3 + VST1.32 {d24[0]},[r2] + VST1.32 {d28[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x234: + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + ADD r12,r2,r3,LSL #1 + VST1.32 {d0[0]},[r2],r3 + VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x258: + SUB r0,r0,r1,LSL #1 + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe + VEXT.8 d18,d18,d19,#4 + VEXT.8 d20,d20,d21,#4 + VEXT.8 d22,d22,d23,#4 + VEXT.8 d24,d24,d25,#4 + VQRSHRUN.S16 d14,q9,#5 + VQRSHRUN.S16 d16,q10,#5 + VQRSHRUN.S16 d18,q11,#5 + VQRSHRUN.S16 d20,q12,#5 + VRHADD.U8 d0,d0,d14 + VRHADD.U8 d4,d4,d18 + VRHADD.U8 d2,d2,d16 + VRHADD.U8 d6,d6,d20 + ADD 
r12,r2,r3,LSL #1 + VST1.32 {d0[0]},[r2],r3 + VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x2b0: + SUB r0,r0,r1,LSL #1 + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + ADD r12,r2,r3,LSL #1 + VST1.32 {d0[0]},[r2],r3 + VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x2d8: + SUB r0,r0,r1,LSL #1 + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe + VEXT.8 d18,d18,d19,#6 + VEXT.8 d20,d20,d21,#6 + VEXT.8 d22,d22,d23,#6 + VEXT.8 d24,d24,d25,#6 + VQRSHRUN.S16 d14,q9,#5 + VQRSHRUN.S16 d16,q10,#5 + VQRSHRUN.S16 d18,q11,#5 + VQRSHRUN.S16 d20,q12,#5 + VRHADD.U8 d0,d0,d14 + VRHADD.U8 d4,d4,d18 + VRHADD.U8 d2,d2,d16 + VRHADD.U8 d6,d6,d20 + ADD r12,r2,r3,LSL #1 + VST1.32 {d0[0]},[r2],r3 + VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x330: + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + VRHADD.U8 d0,d0,d10 + VRHADD.U8 d4,d4,d12 + VRHADD.U8 d2,d2,d11 + VRHADD.U8 d6,d6,d13 + ADD r12,r2,r3,LSL #1 + VST1.32 {d0[0]},[r2],r3 + VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x364: + MOV r8,r0 + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + ADD r0,r8,r1 + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD.U8 d22,d22,d0 + VRHADD.U8 d26,d26,d4 + VRHADD.U8 d24,d24,d2 + VRHADD.U8 d28,d28,d6 + ADD r12,r2,r3,LSL #1 + VST1.32 {d22[0]},[r2],r3 + VST1.32 {d26[0]},[r12],r3 + VST1.32 {d24[0]},[r2] + VST1.32 {d28[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x3a8: + SUB r0,r0,r1,LSL #1 + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe + VQRSHRUN.S16 d14,q8,#5 + VQRSHRUN.S16 d16,q9,#5 + VQRSHRUN.S16 d18,q10,#5 + VQRSHRUN.S16 d20,q11,#5 + VRHADD.U8 d0,d0,d14 + VRHADD.U8 d4,d4,d18 + VRHADD.U8 d2,d2,d16 + VRHADD.U8 d6,d6,d20 + ADD r12,r2,r3,LSL #1 + 
VST1.32 {d0[0]},[r2],r3 + VST1.32 {d4[0]},[r12],r3 + VST1.32 {d2[0]},[r2] + VST1.32 {d6[0]},[r12] + ADD r11,sp,#0 + B L0x434 +L0x3f0: + MOV r8,r0 + ADD r0,r0,#1 + SUB r0,r0,r1,LSL #1 + BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe + ADD r0,r8,r1 + SUB r0,r0,#2 + BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe + VRHADD.U8 d22,d22,d0 + VRHADD.U8 d26,d26,d4 + VRHADD.U8 d24,d24,d2 + VRHADD.U8 d28,d28,d6 + ADD r12,r2,r3,LSL #1 + VST1.32 {d22[0]},[r2],r3 + VST1.32 {d26[0]},[r12],r3 + VST1.32 {d24[0]},[r2] + VST1.32 {d28[0]},[r12] + ADD r11,sp,#0 +L0x434: + LDM r11,{r0-r3} + SUBS r5,r5,#4 + ADD r0,r0,#4 + ADD r2,r2,#4 + BGT L0x2c + SUBS r4,r4,#4 + LDR r5,[sp,#0x80] + ADD r11,sp,#0 + ADD r0,r0,r1,LSL #2 + ADD r2,r2,r3,LSL #2 + SUB r0,r0,r5 + SUB r2,r2,r5 + BGT L0x2c + MOV r0,#0 + ADD sp,sp,#0x10 + VPOP {d8-d15} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S new file mode 100644 index 0000000..0d49e4b --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S @@ -0,0 +1,217 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .section .rodata + .align 4 + +armVCM4P10_pIndexTable8x8: + .word OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR + .word OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE + +armVCM4P10_MultiplierTableChroma8x8: + .hword 3, 2, 1,4 + .hword -3,-2,-1,0 + .hword 1, 2, 3,4 + + + .text + .global omxVCM4P10_PredictIntraChroma_8x8 + .func omxVCM4P10_PredictIntraChroma_8x8 +omxVCM4P10_PredictIntraChroma_8x8: + PUSH {r4-r10,lr} + VPUSH {d8-d15} + LDR r8, =armVCM4P10_pIndexTable8x8 + LDR r6,[sp,#0x68] + LDR r4,[sp,#0x60] + LDR r5,[sp,#0x64] + LDR r7,[sp,#0x6c] + LDR pc,[r8,r6,LSL #2] +OMX_VC_CHROMA_DC: + TST r7,#2 + BEQ L0xe8 + ADD r9,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d1[0]},[r0],r10 + VLD1.8 {d1[1]},[r9],r10 + VLD1.8 {d1[2]},[r0],r10 + VLD1.8 {d1[3]},[r9],r10 + VLD1.8 {d1[4]},[r0],r10 + VLD1.8 {d1[5]},[r9],r10 + VLD1.8 {d1[6]},[r0],r10 + VLD1.8 {d1[7]},[r9] + TST r7,#1 + BEQ L0xcc + VLD1.8 {d0},[r1] + MOV r0,#0 + VPADDL.U8 d2,d0 + VPADDL.U16 d3,d2 + VPADDL.U8 d2,d1 + VPADDL.U16 d1,d2 + VADD.I32 d2,d3,d1 + VRSHR.U32 d2,d2,#3 + VRSHR.U32 d3,d3,#2 + VRSHR.U32 d1,d1,#2 + VMOV.I8 d5,#0xc + VMOV.I8 d6,#0x4 + VSHL.I64 d5,d5,#32 + VSHR.U64 d6,d6,#32 + VADD.I8 d6,d6,d5 + VTBL.8 d0,{d2-d3},d5 + VTBL.8 d4,{d1-d2},d6 +L0x9c: + ADD r9,r3,r5 + ADD r10,r5,r5 + VST1.8 {d0},[r3],r10 + VST1.8 {d0},[r9],r10 + VST1.8 {d0},[r3],r10 + VST1.8 {d0},[r9],r10 + VST1.8 {d4},[r3],r10 + VST1.8 {d4},[r9],r10 + VST1.8 {d4},[r3],r10 + VST1.8 {d4},[r9] + VPOP {d8-d15} + POP {r4-r10,pc} +L0xcc: + MOV r0,#0 + VPADDL.U8 d2,d1 + VPADDL.U16 d1,d2 + VRSHR.U32 d1,d1,#2 + VDUP.8 d0,d1[0] + VDUP.8 d4,d1[4] + B L0x9c +L0xe8: + TST r7,#1 + BEQ L0x114 + VLD1.8 {d0},[r1] + MOV r0,#0 + VPADDL.U8 d2,d0 + VPADDL.U16 d3,d2 + VRSHR.U32 d3,d3,#2 + VMOV.I8 d5,#0x4 + VSHL.I64 d5,d5,#32 + VTBL.8 d0,{d3},d5 + B L0x11c +L0x114: + VMOV.I8 d0,#0x80 + MOV r0,#0 +L0x11c: + ADD r9,r3,r5 + ADD r10,r5,r5 + VST1.8 {d0},[r3],r10 + VST1.8 {d0},[r9],r10 + VST1.8 {d0},[r3],r10 + 
VST1.8 {d0},[r9],r10 + VST1.8 {d0},[r3],r10 + VST1.8 {d0},[r9],r10 + VST1.8 {d0},[r3],r10 + VST1.8 {d0},[r9] + VPOP {d8-d15} + POP {r4-r10,pc} +OMX_VC_CHROMA_VERT: + VLD1.8 {d0},[r1] + MOV r0,#0 + B L0x11c +OMX_VC_CHROMA_HOR: + ADD r9,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d0[]},[r0],r10 + VLD1.8 {d1[]},[r9],r10 + VLD1.8 {d2[]},[r0],r10 + VLD1.8 {d3[]},[r9],r10 + VLD1.8 {d4[]},[r0],r10 + VLD1.8 {d5[]},[r9],r10 + VLD1.8 {d6[]},[r0],r10 + VLD1.8 {d7[]},[r9] + B L0x28c +OMX_VC_CHROMA_PLANE: + ADD r9,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d0},[r1] + VLD1.8 {d2[0]},[r2] + VLD1.8 {d1[0]},[r0],r10 + VLD1.8 {d1[1]},[r9],r10 + VLD1.8 {d1[2]},[r0],r10 + VLD1.8 {d1[3]},[r9],r10 + VLD1.8 {d1[4]},[r0],r10 + VLD1.8 {d1[5]},[r9],r10 + VLD1.8 {d1[6]},[r0],r10 + VLD1.8 {d1[7]},[r9] + VREV64.8 d3,d0 + VSUBL.U8 q3,d3,d2 + VSHR.U64 d3,d3,#8 + VSUBL.U8 q2,d3,d0 + VREV64.8 d3,d1 + VSUBL.U8 q7,d3,d2 + VSHR.U64 d3,d3,#8 + VSUBL.U8 q6,d3,d1 + LDR r2, =armVCM4P10_MultiplierTableChroma8x8 + VSHL.I64 d4,d4,#16 + VEXT.8 d9,d4,d6,#2 + VLD1.16 {d10},[r2]! 
+ VSHL.I64 d12,d12,#16 + VEXT.8 d16,d12,d14,#2 + VMUL.I16 d11,d9,d10 + VMUL.I16 d3,d16,d10 + VPADD.I16 d3,d11,d3 + VPADDL.S16 d3,d3 + VSHL.I32 d2,d3,#4 + VADD.I32 d3,d3,d2 + VLD1.16 {d10,d11},[r2] + VRSHR.S32 d3,d3,#5 + VADDL.U8 q0,d0,d1 + VDUP.16 q0,d1[3] + VSHL.I16 q0,q0,#4 + VDUP.16 q2,d3[0] + VDUP.16 q3,d3[2] + VMUL.I16 q2,q2,q5 + VMUL.I16 q3,q3,q5 + VADD.I16 q2,q2,q0 + VDUP.16 q0,d6[0] + VDUP.16 q1,d6[1] + VDUP.16 q4,d6[2] + VDUP.16 q5,d6[3] + VDUP.16 q6,d7[0] + VDUP.16 q7,d7[1] + VDUP.16 q8,d7[2] + VDUP.16 q9,d7[3] + VADD.I16 q0,q2,q0 + VADD.I16 q1,q2,q1 + VADD.I16 q4,q2,q4 + VADD.I16 q5,q2,q5 + VADD.I16 q6,q2,q6 + VADD.I16 q7,q2,q7 + VADD.I16 q8,q2,q8 + VADD.I16 q9,q2,q9 + VQRSHRUN.S16 d0,q0,#5 + VQRSHRUN.S16 d1,q1,#5 + VQRSHRUN.S16 d2,q4,#5 + VQRSHRUN.S16 d3,q5,#5 + VQRSHRUN.S16 d4,q6,#5 + VQRSHRUN.S16 d5,q7,#5 + VQRSHRUN.S16 d6,q8,#5 + VQRSHRUN.S16 d7,q9,#5 +L0x28c: + ADD r9,r3,r5 + ADD r10,r5,r5 + VST1.8 {d0},[r3],r10 + VST1.8 {d1},[r9],r10 + VST1.8 {d2},[r3],r10 + VST1.8 {d3},[r9],r10 + VST1.8 {d4},[r3],r10 + VST1.8 {d5},[r9],r10 + VST1.8 {d6},[r3],r10 + VST1.8 {d7},[r9] + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r10,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S new file mode 100644 index 0000000..53268f6 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S @@ -0,0 +1,239 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + + .section .rodata + .align 4 +;//------------------------------------------------------- +;// This table implements a C-style switch/case in asm by +;// the method of two levels of indexing. 
+;//------------------------------------------------------- + +armVCM4P10_pIndexTable16x16: + .word OMX_VC_16X16_VERT, OMX_VC_16X16_HOR + .word OMX_VC_16X16_DC, OMX_VC_16X16_PLANE + + + +armVCM4P10_MultiplierTable16x16: + .hword 7, 6, 5, 4, 3, 2, 1, 8 + .hword 0, 1, 2, 3, 4, 5, 6, 7 + .hword 8, 9, 10, 11, 12, 13, 14, 15 + + .text + + .global omxVCM4P10_PredictIntra_16x16 + .func omxVCM4P10_PredictIntra_16x16 +omxVCM4P10_PredictIntra_16x16: + PUSH {r4-r12,lr} + VPUSH {d8-d15} + LDR r9, =armVCM4P10_pIndexTable16x16 + LDR r6,[sp,#0x70] + LDR r4,[sp,#0x68] + LDR r5,[sp,#0x6c] + LDR r7,[sp,#0x74] + MOV r12,#0x10 + LDR pc,[r9,r6,LSL #2] +OMX_VC_16X16_VERT: + VLD1.8 {d0,d1},[r1] + ADD r8,r3,r5 + ADD r10,r5,r5 + VST1.8 {d0,d1},[r3],r10 + VST1.8 {d0,d1},[r8],r10 + VST1.8 {d0,d1},[r3],r10 + VST1.8 {d0,d1},[r8],r10 + VST1.8 {d0,d1},[r3],r10 + VST1.8 {d0,d1},[r8],r10 + VST1.8 {d0,d1},[r3],r10 + VST1.8 {d0,d1},[r8],r10 + VST1.8 {d0,d1},[r3],r10 + VST1.8 {d0,d1},[r8],r10 + VST1.8 {d0,d1},[r3],r10 + VST1.8 {d0,d1},[r8],r10 + VST1.8 {d0,d1},[r3],r10 + VST1.8 {d0,d1},[r8],r10 + VST1.8 {d0,d1},[r3] + VST1.8 {d0,d1},[r8] + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} +OMX_VC_16X16_HOR: + ADD r8,r0,r4 + ADD r4,r4,r4 + ADD r11,r3,r5 + ADD r5,r5,r5 +L0x8c: + VLD1.8 {d2[],d3[]},[r0],r4 + VLD1.8 {d0[],d1[]},[r8],r4 + SUBS r12,r12,#8 + VST1.8 {d2,d3},[r3],r5 + VST1.8 {d0,d1},[r11],r5 + VLD1.8 {d2[],d3[]},[r0],r4 + VLD1.8 {d0[],d1[]},[r8],r4 + VST1.8 {d2,d3},[r3],r5 + VST1.8 {d0,d1},[r11],r5 + VLD1.8 {d2[],d3[]},[r0],r4 + VLD1.8 {d0[],d1[]},[r8],r4 + VST1.8 {d2,d3},[r3],r5 + VST1.8 {d0,d1},[r11],r5 + VLD1.8 {d2[],d3[]},[r0],r4 + VLD1.8 {d0[],d1[]},[r8],r4 + VST1.8 {d2,d3},[r3],r5 + VST1.8 {d0,d1},[r11],r5 + BNE L0x8c + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} +OMX_VC_16X16_DC: + MOV r11,#0 + TST r7,#2 + BEQ L0x14c + ADD r8,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d2[0]},[r0],r10 + VLD1.8 {d2[1]},[r8],r10 + VLD1.8 {d2[2]},[r0],r10 + VLD1.8 {d2[3]},[r8],r10 + VLD1.8 {d2[4]},[r0],r10 + VLD1.8 
{d2[5]},[r8],r10 + VLD1.8 {d2[6]},[r0],r10 + VLD1.8 {d2[7]},[r8],r10 + VLD1.8 {d3[0]},[r0],r10 + VLD1.8 {d3[1]},[r8],r10 + VLD1.8 {d3[2]},[r0],r10 + VLD1.8 {d3[3]},[r8],r10 + VLD1.8 {d3[4]},[r0],r10 + VLD1.8 {d3[5]},[r8],r10 + VLD1.8 {d3[6]},[r0],r10 + VLD1.8 {d3[7]},[r8] + VPADDL.U8 q0,q1 + ADD r11,r11,#1 + VPADD.I16 d0,d0,d1 + VPADDL.U16 d0,d0 + VPADDL.U32 d6,d0 + VRSHR.U64 d8,d6,#4 +L0x14c: + TST r7,#1 + BEQ L0x170 + VLD1.8 {d0,d1},[r1] + ADD r11,r11,#1 + VPADDL.U8 q0,q0 + VPADD.I16 d0,d0,d1 + VPADDL.U16 d0,d0 + VPADDL.U32 d7,d0 + VRSHR.U64 d8,d7,#4 +L0x170: + CMP r11,#2 + BNE L0x180 + VADD.I64 d8,d7,d6 + VRSHR.U64 d8,d8,#5 +L0x180: + VDUP.8 q3,d8[0] + CMP r11,#0 + ADD r8,r3,r5 + ADD r10,r5,r5 + BNE L0x198 + VMOV.I8 q3,#0x80 +L0x198: + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + VST1.8 {d6,d7},[r3],r10 + VST1.8 {d6,d7},[r8],r10 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} +OMX_VC_16X16_PLANE: + LDR r9, =armVCM4P10_MultiplierTable16x16 + VLD1.8 {d0,d1},[r1] + VLD1.8 {d4[0]},[r2] + ADD r8,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d2[0]},[r0],r10 + VLD1.8 {d2[1]},[r8],r10 + VLD1.8 {d2[2]},[r0],r10 + VLD1.8 {d2[3]},[r8],r10 + VLD1.8 {d2[4]},[r0],r10 + VLD1.8 {d2[5]},[r8],r10 + VLD1.8 {d2[6]},[r0],r10 + VLD1.8 {d2[7]},[r8],r10 + VLD1.8 {d3[0]},[r0],r10 + VLD1.8 {d3[1]},[r8],r10 + VLD1.8 {d3[2]},[r0],r10 + VLD1.8 {d3[3]},[r8],r10 + VLD1.8 {d3[4]},[r0],r10 + VLD1.8 {d3[5]},[r8],r10 + VLD1.8 {d3[6]},[r0],r10 + VLD1.8 {d3[7]},[r8] + VREV64.8 d5,d1 + VSUBL.U8 q3,d5,d4 + VSHR.U64 d5,d5,#8 + VSUBL.U8 q4,d5,d0 + VSHL.I64 d9,d9,#16 + VEXT.8 d9,d9,d6,#2 + VREV64.8 d12,d3 + VSUBL.U8 q7,d12,d4 + VSHR.U64 d12,d12,#8 + VSUBL.U8 q8,d12,d2 + VLD1.16 {d20,d21},[r9]! 
+ VSHL.I64 d17,d17,#16 + VEXT.8 d17,d17,d14,#2 + VMULL.S16 q11,d8,d20 + VMULL.S16 q12,d16,d20 + VMLAL.S16 q11,d9,d21 + VMLAL.S16 q12,d17,d21 + VPADD.I32 d22,d23,d22 + VPADD.I32 d23,d25,d24 + VPADDL.S32 q11,q11 + VSHL.I64 q12,q11,#2 + VADD.I64 q11,q11,q12 + VRSHR.S64 q11,q11,#6 + VSHL.I64 q12,q11,#3 + VSUB.I64 q12,q12,q11 + VLD1.16 {d20,d21},[r9]! + VDUP.16 q6,d22[0] + VDUP.16 q7,d23[0] + VADDL.U8 q11,d1,d3 + VSHL.I16 q11,q11,#4 + VDUP.16 q11,d23[3] + VADD.I64 d1,d24,d25 + VLD1.16 {d24,d25},[r9] + VDUP.16 q13,d1[0] + VSUB.I16 q13,q11,q13 + VMUL.I16 q5,q6,q10 + VMUL.I16 q6,q6,q12 + VADD.I16 q0,q5,q13 + VADD.I16 q1,q6,q13 +L0x2d4: + VQRSHRUN.S16 d6,q0,#5 + VQRSHRUN.S16 d7,q1,#5 + SUBS r12,r12,#1 + VST1.8 {d6,d7},[r3],r5 + VADD.I16 q0,q0,q7 + VADD.I16 q1,q1,q7 + BNE L0x2d4 + MOV r0,#0 + VPOP {d8-d15} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S new file mode 100644 index 0000000..aa6d7ef --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S @@ -0,0 +1,261 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + + .section .rodata + .align 4 + +armVCM4P10_pSwitchTable4x4: + .word OMX_VC_4x4_VERT, OMX_VC_4x4_HOR + .word OMX_VC_4x4_DC, OMX_VC_4x4_DIAG_DL + .word OMX_VC_4x4_DIAG_DR, OMX_VC_4x4_VR + .word OMX_VC_4x4_HD, OMX_VC_4x4_VL + .word OMX_VC_4x4_HU + + .text + + .global omxVCM4P10_PredictIntra_4x4 + .func omxVCM4P10_PredictIntra_4x4 +omxVCM4P10_PredictIntra_4x4: + PUSH {r4-r12,lr} + VPUSH {d8-d12} + LDR r8, =armVCM4P10_pSwitchTable4x4 + LDRD r6,r7,[sp,#0x58] + LDRD r4,r5,[sp,#0x50] + LDR pc,[r8,r6,LSL #2] +OMX_VC_4x4_HOR: + ADD r9,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d0[]},[r0],r10 + VLD1.8 {d1[]},[r9],r10 + VLD1.8 {d2[]},[r0] + VLD1.8 {d3[]},[r9] + ADD r11,r3,r5 + ADD r12,r5,r5 + VST1.32 {d0[0]},[r3],r12 + VST1.32 {d1[0]},[r11],r12 + VST1.32 {d2[0]},[r3] + VST1.32 {d3[0]},[r11] + B L0x348 +OMX_VC_4x4_VERT: + VLD1.32 {d0[0]},[r1] + ADD r11,r3,r5 + ADD r12,r5,r5 +L0x58: + VST1.32 {d0[0]},[r3],r12 + VST1.32 {d0[0]},[r11],r12 + VST1.32 {d0[0]},[r3] + VST1.32 {d0[0]},[r11] + B L0x348 +OMX_VC_4x4_DC: + TST r7,#2 + BEQ L0xdc + ADD r9,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d0[0]},[r0],r10 + VLD1.8 {d0[1]},[r9],r10 + VLD1.8 {d0[2]},[r0] + VLD1.8 {d0[3]},[r9] + TST r7,#1 + BEQ L0xbc + VLD1.32 {d0[1]},[r1] + MOV r0,#0 + VPADDL.U8 d1,d0 + VPADDL.U16 d1,d1 + VPADDL.U32 d1,d1 + VRSHR.U64 d1,d1,#3 + ADD r11,r3,r5 + ADD r12,r5,r5 + VDUP.8 d0,d1[0] + B L0x58 +L0xbc: + MOV r0,#0 + VPADDL.U8 d1,d0 + VPADDL.U16 d1,d1 + VRSHR.U32 d1,d1,#2 + ADD r11,r3,r5 + ADD r12,r5,r5 + VDUP.8 d0,d1[0] + B L0x58 +L0xdc: + TST r7,#1 + BEQ L0x108 + VLD1.32 {d0[0]},[r1] + MOV r0,#0 + VPADDL.U8 d1,d0 + VPADDL.U16 d1,d1 + VRSHR.U32 d1,d1,#2 + ADD r11,r3,r5 + ADD r12,r5,r5 + VDUP.8 d0,d1[0] + B L0x58 +L0x108: + VMOV.I8 d0,#0x80 + MOV r0,#0 + ADD r11,r3,r5 + ADD r12,r5,r5 + B L0x58 +OMX_VC_4x4_DIAG_DL: + TST r7,#0x40 + BEQ L0x138 + VLD1.8 {d3},[r1] + VDUP.8 d2,d3[7] + VEXT.8 d4,d3,d2,#1 + VEXT.8 d5,d3,d2,#2 + B L0x14c +L0x138: + VLD1.32 
{d0[1]},[r1] + VDUP.8 d2,d0[7] + VEXT.8 d3,d0,d2,#4 + VEXT.8 d4,d0,d2,#5 + VEXT.8 d5,d0,d2,#6 +L0x14c: + VHADD.U8 d6,d3,d5 + VRHADD.U8 d6,d6,d4 + VST1.32 {d6[0]},[r3],r5 + VEXT.8 d6,d6,d6,#1 + VST1.32 {d6[0]},[r3],r5 + VEXT.8 d6,d6,d6,#1 + VST1.32 {d6[0]},[r3],r5 + VEXT.8 d6,d6,d6,#1 + VST1.32 {d6[0]},[r3] + B L0x348 +OMX_VC_4x4_DIAG_DR: + VLD1.32 {d0[0]},[r1] + VLD1.8 {d1[7]},[r2] + ADD r9,r0,r4 + ADD r10,r4,r4 + ADD r1,r3,r5 + VLD1.8 {d1[6]},[r0],r10 + VLD1.8 {d1[5]},[r9],r10 + VLD1.8 {d1[4]},[r0] + VLD1.8 {d1[3]},[r9] + VEXT.8 d3,d1,d0,#3 + ADD r4,r1,r5 + VEXT.8 d4,d1,d0,#4 + ADD r6,r4,r5 + VEXT.8 d5,d1,d0,#5 + VHADD.U8 d6,d3,d5 + VRHADD.U8 d6,d6,d4 + VST1.32 {d6[0]},[r6] + VEXT.8 d6,d6,d6,#1 + VST1.32 {d6[0]},[r4] + VEXT.8 d6,d6,d6,#1 + VST1.32 {d6[0]},[r1] + VEXT.8 d6,d6,d6,#1 + VST1.32 {d6[0]},[r3] + B L0x348 +OMX_VC_4x4_VR: + VLD1.32 {d0[0]},[r1] + VLD1.8 {d0[7]},[r2] + VLD1.8 {d1[7]},[r0],r4 + VLD1.8 {d2[7]},[r0],r4 + VLD1.8 {d1[6]},[r0] + VEXT.8 d12,d0,d0,#7 + VEXT.8 d3,d1,d12,#6 + VEXT.8 d4,d2,d12,#7 + VEXT.8 d5,d1,d0,#7 + VEXT.8 d6,d2,d0,#7 + VEXT.8 d11,d1,d12,#7 + VHADD.U8 d8,d6,d12 + VRHADD.U8 d8,d8,d11 + VHADD.U8 d7,d3,d5 + VRHADD.U8 d7,d7,d4 + VEXT.8 d10,d8,d8,#1 + ADD r11,r3,r5 + ADD r12,r5,r5 + VEXT.8 d9,d7,d7,#1 + VST1.32 {d10[0]},[r3],r12 + VST1.32 {d9[0]},[r11],r12 + VST1.32 {d8[0]},[r3],r12 + VST1.32 {d7[0]},[r11] + B L0x348 +OMX_VC_4x4_HD: + VLD1.8 {d0},[r1] + VLD1.8 {d1[7]},[r2] + ADD r9,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d1[6]},[r0],r10 + VLD1.8 {d1[5]},[r9],r10 + VLD1.8 {d1[4]},[r0] + VLD1.8 {d1[3]},[r9] + VEXT.8 d3,d1,d0,#3 + VEXT.8 d4,d1,d0,#2 + VEXT.8 d5,d1,d0,#1 + VHADD.U8 d7,d3,d5 + VRHADD.U8 d7,d7,d4 + VRHADD.U8 d8,d4,d3 + VSHL.I64 d8,d8,#24 + VSHL.I64 d6,d7,#16 + VZIP.8 d8,d6 + VEXT.8 d7,d7,d7,#6 + VEXT.8 d8,d6,d7,#2 + ADD r11,r3,r5 + ADD r12,r5,r5 + VST1.32 {d8[1]},[r3],r12 + VST1.32 {d6[1]},[r11],r12 + VST1.32 {d8[0]},[r3] + VST1.32 {d6[0]},[r11] + B L0x348 +OMX_VC_4x4_VL: + TST r7,#0x40 + BEQ L0x2b4 + VLD1.8 {d3},[r1] + VEXT.8 
d4,d3,d3,#1 + VEXT.8 d5,d4,d4,#1 + B L0x2c8 +L0x2b4: + VLD1.32 {d0[1]},[r1] + VDUP.8 d2,d0[7] + VEXT.8 d3,d0,d2,#4 + VEXT.8 d4,d0,d2,#5 + VEXT.8 d5,d0,d2,#6 +L0x2c8: + VRHADD.U8 d7,d4,d3 + VHADD.U8 d10,d3,d5 + VRHADD.U8 d10,d10,d4 + VEXT.8 d8,d7,d7,#1 + ADD r11,r3,r5 + ADD r12,r5,r5 + VEXT.8 d9,d10,d8,#1 + VST1.32 {d7[0]},[r3],r12 + VST1.32 {d10[0]},[r11],r12 + VST1.32 {d8[0]},[r3] + VST1.32 {d9[0]},[r11] + B L0x348 +OMX_VC_4x4_HU: + ADD r9,r0,r4 + ADD r10,r4,r4 + VLD1.8 {d1[4]},[r0],r10 + VLD1.8 {d1[5]},[r9],r10 + VLD1.8 {d1[6]},[r0] + VLD1.8 {d1[7]},[r9] + VDUP.8 d2,d1[7] + VEXT.8 d3,d1,d2,#4 + VEXT.8 d4,d1,d2,#5 + VEXT.8 d5,d1,d2,#6 + VHADD.U8 d7,d3,d5 + VRHADD.U8 d7,d7,d4 + VRHADD.U8 d8,d4,d3 + VZIP.8 d8,d7 + VST1.32 {d8[0]},[r3],r5 + VEXT.8 d8,d8,d8,#2 + VST1.32 {d8[0]},[r3],r5 + VEXT.8 d8,d8,d8,#2 + VST1.32 {d8[0]},[r3],r5 + VST1.32 {d7[0]},[r3] +L0x348: + MOV r0,#0 + VPOP {d8-d12} + POP {r4-r12,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S new file mode 100644 index 0000000..28a89cb --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S @@ -0,0 +1,54 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global omxVCM4P10_TransformDequantChromaDCFromPair + .func omxVCM4P10_TransformDequantChromaDCFromPair +omxVCM4P10_TransformDequantChromaDCFromPair: + push {r4-r10, lr} + ldr r9, [r0,#0] + vmov.i16 d0, #0 + mov r8, #0x1f + vst1.16 {d0}, [r1] + ldrb r6, [r9], #1 +unpackLoop: + tst r6, #0x10 + ldrnesb r5, [r9, #1] + ldrneb r4, [r9], #2 + and r7, r8, r6, lsl #1 + ldreqsb r4, [r9], #1 + orrne r4, r4, r5, lsl #8 + tst r6, #0x20 + ldreqb r6, [r9], #1 + strh r4, [r1, r7] + beq unpackLoop + ldmia r1, {r3, r4} + str r9, [r0, #0] + ldr r5, =armVCM4P10_QPDivTable + ldr r6, =armVCM4P10_VMatrixQPModTable + saddsubx r3, r3, r3 + saddsubx r4, r4, r4 + ldrsb r9, [r5, r2] + ldrsb r2, [r6, r2] + sadd16 r5, r3, r4 + ssub16 r6, r3, r4 + lsl r2, r2, r9 + vmov d0, r5, r6 + vrev32.16 d0, d0 + vdup.16 d1, r2 + vmull.s16 q1, d0, d1 + vshrn.i32 d2, q1, #1 + vst1.16 {d2}, [r1] + mov r0, #0 + pop {r4-r10, pc} + .endfunc + + .end diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S new file mode 100644 index 0000000..a3a0715 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S @@ -0,0 +1,76 @@ +/* + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + */ + + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + + .arm + .fpu neon + .text + + .global armVCM4P10_InvTransformDequantLumaDC4x4 + .func armVCM4P10_InvTransformDequantLumaDC4x4 +armVCM4P10_InvTransformDequantLumaDC4x4: + PUSH {r4-r6,lr} + VPUSH {d8-d13} + VLD4.16 {d0,d1,d2,d3},[r0] + LDR r2, =armVCM4P10_QPDivTable + LDR r3, =armVCM4P10_VMatrixQPModTable + VADD.I16 d4,d0,d1 + VADD.I16 d5,d2,d3 + VSUB.I16 d6,d0,d1 + LDRSB r4,[r2,r1] + VSUB.I16 d7,d2,d3 + LDRSB r5,[r3,r1] + VADD.I16 d0,d4,d5 + VSUB.I16 d1,d4,d5 + VSUB.I16 d2,d6,d7 + LSL r5,r5,r4 + VADD.I16 d3,d6,d7 + VTRN.16 d0,d1 + VTRN.16 d2,d3 + VTRN.32 q0,q1 + VADD.I16 d4,d0,d1 + VADD.I16 d5,d2,d3 + VSUB.I16 d6,d0,d1 + VSUB.I16 d7,d2,d3 + VADD.I16 d0,d4,d5 + VSUB.I16 d1,d4,d5 + VSUB.I16 d2,d6,d7 + VADD.I16 d3,d6,d7 + VDUP.16 d5,r5 + VMOV.I32 q3,#0x2 + VMOV.I32 q4,#0x2 + VMOV.I32 q5,#0x2 + VMOV.I32 q6,#0x2 + VMLAL.S16 q3,d0,d5 + VMLAL.S16 q4,d1,d5 + VMLAL.S16 q5,d2,d5 + VMLAL.S16 q6,d3,d5 + VSHRN.I32 d0,q3,#2 + VSHRN.I32 d1,q4,#2 + VSHRN.I32 d2,q5,#2 + VSHRN.I32 d3,q6,#2 + VST1.16 {d0,d1,d2,d3},[r0] + VPOP {d8-d13} + POP {r4-r6,pc} + .endfunc + +.global omxVCM4P10_TransformDequantLumaDCFromPair +.func omxVCM4P10_TransformDequantLumaDCFromPair +omxVCM4P10_TransformDequantLumaDCFromPair: + PUSH {r4-r6,lr} + MOV r4,r1 + MOV r5,r2 + BL armVCM4P10_UnpackBlock4x4 + MOV r0,r4 + MOV r1,r5 + BL armVCM4P10_InvTransformDequantLumaDC4x4 + MOV r0,#0 + POP {r4-r6,pc} + .endfunc + + .end + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h new file mode 100755 index 0000000..74b5505 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h @@ -0,0 +1,37 @@ +/** + * + * File Name: armVCM4P2_Huff_Tables_VLC.h + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 
2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * + * File: armVCM4P2_Huff_Tables_VLC.h + * Description: Declares Tables used for Huffman coding and decoding + * in MP4P2 codec. + * + */ + +#ifndef _OMXHUFFTAB_H_ +#define _OMXHUFFTAB_H_ + +/* Every array length declared below must match its definition in armVCM4P2_Huff_Tables_VLC.c. */ +extern const OMX_U16 armVCM4P2_IntraVlcL0L1[200]; + + +extern const OMX_U16 armVCM4P2_InterVlcL0L1[200]; + +extern const OMX_U16 armVCM4P2_aIntraDCLumaChromaIndex[64]; +//extern const OMX_U16 armVCM4P2_aIntraDCChromaIndex[32]; +extern const OMX_U16 armVCM4P2_aVlcMVD[124]; + +extern const OMX_U8 armVCM4P2_InterL0L1LMAX[73]; +extern const OMX_U8 armVCM4P2_InterL0L1RMAX[35]; +extern const OMX_U8 armVCM4P2_IntraL0L1LMAX[53]; +extern const OMX_U8 armVCM4P2_IntraL0L1RMAX[40]; + +#endif /* _OMXHUFFTAB_H_ */ diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h new file mode 100755 index 0000000..e95203a --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h @@ -0,0 +1,25 @@ +/** + * + * File Name: armVCM4P2_ZigZag_Tables.h + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. + * + * + * + * + * File: armVCM4P2_Zigzag_Tables.h + * Description: Declares Tables used for Zigzag scan in MP4P2 codec. 
+ * + */ + +#ifndef _OMXZIGZAGTAB_H +#define _OMXZIGZAGTAB_H + +extern const OMX_U8 armVCM4P2_aClassicalZigzagScan [192]; +//extern const OMX_U8 armVCM4P2_aHorizontalZigzagScan [64]; +//extern const OMX_U8 armVCM4P2_aVerticalZigzagScan [64]; + +#endif /* _OMXZIGZAGTAB_H_ */ diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s new file mode 100755 index 0000000..95fe6d2 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s @@ -0,0 +1,82 @@ +; /** +; * +; * File Name: armVCM4P2_Clip8_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; * Description: +; * Contains module for Clipping 16 bit value to [0,255] Range +; */ + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + + M_VARIANTS CortexA8 + + IF CortexA8 +;//Input Arguments + +pSrc RN 0 +pDst RN 1 +step RN 2 + +;// Neon Registers + +qx0 QN Q0.S16 +dx00 DN D0.S16 +dx01 DN D1.S16 +qx1 QN Q1.S16 +dx10 DN D2.S16 +dx11 DN D3.S16 + +qx2 QN Q2.S16 +dx20 DN D4.S16 +dx21 DN D5.S16 +qx3 QN Q3.S16 +dx30 DN D6.S16 +dx31 DN D7.S16 + + +dclip0 DN D0.U8 +dclip1 DN D2.U8 +dclip2 DN D4.U8 +dclip3 DN D6.U8 + + M_START armVCM4P2_Clip8 + + VLD1 {dx00,dx01,dx10,dx11},[pSrc]! ;// Load 16 entries from pSrc + VLD1 {dx20,dx21,dx30,dx31},[pSrc]! ;// Load next 16 entries from pSrc + VQSHRUN dclip0,qx0,#0 ;// dclip0[i]=clip qx0[i] to [0,255] + VQSHRUN dclip1,qx1,#0 ;// dclip1[i]=clip qx1[i] to [0,255] + VST1 {dclip0},[pDst],step ;// store 8 bytes and pDst=pDst+step + VST1 {dclip1},[pDst],step ;// store 8 bytes and pDst=pDst+step + VQSHRUN dclip2,qx2,#0 + VQSHRUN dclip3,qx3,#0 + VST1 {dclip2},[pDst],step + VST1 {dclip3},[pDst],step + + VLD1 {dx00,dx01,dx10,dx11},[pSrc]! ;// Load 16 entries from pSrc + VLD1 {dx20,dx21,dx30,dx31},[pSrc]! 
;// Load next 16 entries from pSrc + VQSHRUN dclip0,qx0,#0 ;// dclip0[i]=clip qx0[i] to [0,255] + VQSHRUN dclip1,qx1,#0 ;// dclip1[i]=clip qx1[i] to [0,255] + VST1 {dclip0},[pDst],step ;// store 8 bytes and pDst=pDst+step + VST1 {dclip1},[pDst],step ;// store 8 bytes and pDst=pDst+step + VQSHRUN dclip2,qx2,#0 + VQSHRUN dclip3,qx3,#0 + VST1 {dclip2},[pDst],step + VST1 {dclip3},[pDst],step + + + + M_END + ENDIF + + + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s new file mode 100755 index 0000000..e4a7f33 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s @@ -0,0 +1,398 @@ +;/** +; * +; * File Name: armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; * Description: +; * Contains modules for zigzag scanning and VLC decoding +; * for inter, intra block. 
+; * +; * +; * +; * Function: omxVCM4P2_DecodeVLCZigzag_AC_unsafe +; * +; * Description: +; * Performs VLC decoding and inverse zigzag scan +; * +; * +; * +; * +; */ + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE armCOMM_BitDec_s.h + + + M_VARIANTS ARM1136JS + + + + + + IF ARM1136JS + + + + + +;//Input Arguments + +ppBitStream RN 0 +pBitOffset RN 1 +pDst RN 2 +shortVideoHeader RN 3 + + +;//Local Variables + +Return RN 0 + +pVlcTableL0L1 RN 4 +pLMAXTableL0L1 RN 4 +pRMAXTableL0L1 RN 4 +pZigzagTable RN 4 + +ftype RN 0 +temp3 RN 4 +temp RN 5 +Count RN 6 +Escape RN 5 + +;// armVCM4P2_FillVLDBuffer +zigzag RN 0 +storeLevel RN 1 +temp2 RN 4 +temp1 RN 5 +sign RN 5 +Last RN 7 +storeRun RN 14 + + +packRetIndex RN 5 + + +markerbit RN 5 + +;// Scratch Registers + +RBitStream RN 8 +RBitBuffer RN 9 +RBitCount RN 10 + +T1 RN 11 +T2 RN 12 +LR RN 14 + + + + M_ALLOC4 pppBitStream,4 + M_ALLOC4 ppOffset,4 + M_ALLOC4 pLinkRegister,4 + + M_START armVCM4P2_DecodeVLCZigzag_AC_unsafe + + ;// get the table addresses from stack + M_ARG ppVlcTableL0L1,4 + M_ARG ppLMAXTableL0L1,4 + M_ARG ppRMAXTableL0L1,4 + M_ARG ppZigzagTable,4 + + ;// Store ALL zeros at pDst + + MOV temp1,#0 ;// Initialize Count to zero + MOV Last,#0 + M_STR LR,pLinkRegister ;// Store Link Register on Stack + MOV temp2,#0 + MOV LR,#0 + + ;// Initialize the Macro and Store all zeros to pDst + + STM pDst!,{temp2,temp1,Last,LR} + M_BD_INIT0 ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount + STM pDst!,{temp2,temp1,Last,LR} + M_BD_INIT1 T1, T2, T2 + STM pDst!,{temp2,temp1,Last,LR} + M_BD_INIT2 T1, T2, T2 + STM pDst!,{temp2,temp1,Last,LR} + M_STR ppBitStream,pppBitStream ;// Store ppBitstream on stack + STM pDst!,{temp2,temp1,Last,LR} + M_STR pBitOffset,ppOffset ;// Store pBitOffset on stack + STM pDst!,{temp2,temp1,Last,LR} + + STM pDst!,{temp2,temp1,Last,LR} + STM pDst!,{temp2,temp1,Last,LR} + + + SUB pDst,pDst,#128 ;// Restore pDst + + ;// The armVCM4P2_GetVLCBits begins + +getVLCbits + + M_BD_LOOK8 
Escape,7 ;// Load Escape Value + LSR Escape,Escape,#25 + CMP Escape,#3 ;// check for escape mode + MOVNE ftype,#0 + BNE notEscapemode ;// Branch if not in Escape mode 3 + + M_BD_VSKIP8 #7,T1 + CMP shortVideoHeader,#0 ;// Check shortVideoHeader flag to know the type of Escape mode + BEQ endFillVLD + + ;// Escape Mode 4 + + M_BD_READ8 Last,1,T1 + M_BD_READ8 storeRun,6,T1 + M_BD_READ8 storeLevel,8,T1 + + + ;// Check whether the Reserved values for Level are used and Exit with an Error Message if it is so + + TEQ storeLevel,#0 + TEQNE storeLevel,#128 + BEQ ExitError + + ADD temp2,storeRun,Count + CMP temp2,#64 + BGE ExitError ;// error if Count+storeRun >= 64 + + + ;// Load address of zigzagTable + + M_LDR pZigzagTable,ppZigzagTable ;// Loading the Address of Zigzag table + + + ;// armVCM4P2_FillVLDBuffer + + SXTB storeLevel,storeLevel ;// Sign Extend storeLevel to 32 bits + + + ;// To Reflect Runlength + + ADD Count,Count,storeRun + LDRB zigzag,[pZigzagTable,Count] + ADD Count,Count,#1 + STRH storeLevel,[pDst,zigzag] ;// store Level + + B ExitOk + + + +endFillVLD + + + ;// Load Ftype( Escape Mode) value based on the two successive bits in the bitstream + + M_BD_READ8 temp1,1,T1 + CMP temp1,#0 + MOVEQ ftype,#1 + BEQ notEscapemode + M_BD_READ8 temp1,1,T1 + CMP temp1,#1 + MOVEQ ftype,#3 + MOVNE ftype,#2 + + +notEscapemode + + ;// Load optimized packed VLC table with last=0 and Last=1 + + M_LDR pVlcTableL0L1,ppVlcTableL0L1 ;// Load Combined VLC Table + + + CMP ftype,#3 ;// If ftype >=3 get perform Fixed Length Decoding (Escape Mode 3) + BGE EscapeMode3 ;// Else continue normal VLC Decoding + + ;// Variable lengh decoding, "armUnPackVLC32" + + + M_BD_VLD packRetIndex,T1,T2,pVlcTableL0L1,4,2 + + + LDR temp3,=0xFFF + + CMP packRetIndex,temp3 ;// Check for invalid symbol + BEQ ExitError ;// if invalid symbol occurs exit with an error message + + AND Last,packRetIndex,#2 ;// Get Last from packed Index + + + + + LSR storeRun,packRetIndex,#7 ;// Get Run Value from Packed index + 
AND storeLevel,packRetIndex,#0x7c ;// storeLevel=packRetIndex[2-6],storeLevel[0-1]=0 + + + M_LDR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Load LMAX table + + + LSR storeLevel,storeLevel,#2 ;// Level value + + CMP ftype,#1 + BNE ftype2 + + ;// ftype==1; Escape mode =1 + + + ADD temp1, pLMAXTableL0L1, Last, LSL#4 ;// If the Last=1 add 32 to table address + LDRB temp1,[temp1,storeRun] + + + ADD storeLevel,temp1,storeLevel + +ftype2 + + ;// ftype =2; Escape mode =2 + + M_LDR pRMAXTableL0L1,ppRMAXTableL0L1 ;// Load RMAX Table + + CMP ftype,#2 + BNE FillVLDL1 + + ADD temp1, pRMAXTableL0L1, Last, LSL#4 ;// If Last=1 add 32 to table address + SUB temp2,storeLevel,#1 + LDRB temp1,[temp1,temp2] + + + ADD storeRun,storeRun,#1 + ADD storeRun,temp1 + +FillVLDL1 + + + ;// armVCM4P2_FillVLDBuffer + + M_LDR pZigzagTable,ppZigzagTable ;// Load address of zigzagTable + + M_BD_READ8 sign,1,T1 + + CMP sign,#1 + RSBEQ storeLevel,storeLevel,#0 + + ADD temp1,storeRun,Count ;// Exit with an error message if Run + Count exceeds 63 + CMP temp1,#64 + BGE ExitError + + + + + + + ;// To Reflect Runlenght + + ADD Count,Count,storeRun + +storeLevelL1 + + LDRB zigzag,[pZigzagTable,Count] + CMP Last,#2 ;// Check if the Level val is Last non zero val + ADD Count,Count,#1 + LSR Last,Last,#1 + STRH storeLevel,[pDst,zigzag] + + BNE end + + B ExitOk + + + + ;// Fixed Lengh Decoding Escape Mode 3 + +EscapeMode3 + + M_BD_READ8 Last,1,T1 + M_BD_READ8 storeRun,6,T1 + + ADD temp2,storeRun,Count ;// Exit with an error message if Run + Count exceeds 63 + CMP temp2,#64 + BGE ExitError + + M_BD_READ8 markerbit,1,T1 + TEQ markerbit,#0 ;// Exit with an error message if marker bit is zero + BEQ ExitError + + M_BD_READ16 storeLevel,12,T1 + + TST storeLevel,#0x800 ;// test if the level is negative + SUBNE storeLevel,storeLevel,#4096 + CMP storeLevel,#0 + CMPNE storeLevel,#-2048 + BEQ ExitError ;// Exit with an error message if Level==0 or -2048 + + M_LDR pZigzagTable,ppZigzagTable ;// Load address of zigzagTable + + 
M_BD_READ8 markerbit,1,T1 + + + ;// armVCM4P2_FillVLDBuffer ( Sign not used as storeLevel is preprocessed) + + + + ;// To Reflect Run Length + + ADD Count,Count,storeRun + + + +storeLevelLast + + LDRB zigzag,[pZigzagTable,Count] + CMP Last,#1 + ADD Count,Count,#1 + STRH storeLevel,[pDst,zigzag] + + BNE end + + B ExitOk + +end + + CMP Count,#64 ;//Run the Loop untill Count reaches 64 + + BLT getVLCbits + + +ExitOk + ;// Exit When VLC Decoding is done Successfully + + ;// Loading ppBitStream and pBitOffset from stack + + CMP Last,#1 + M_LDR ppBitStream,pppBitStream + M_LDR pBitOffset,ppOffset + + ;//Ending the macro + + M_BD_FINI ppBitStream,pBitOffset + + MOVEQ Return,#OMX_Sts_NoErr + MOVNE Return,#OMX_Sts_Err + M_LDR LR,pLinkRegister ;// Load the Link Register Back + B exit2 + +ExitError + ;// Exit When an Error occurs + + M_LDR ppBitStream,pppBitStream + M_LDR pBitOffset,ppOffset + ;//Ending the macro + + M_BD_FINI ppBitStream,pBitOffset + M_LDR LR,pLinkRegister + MOV Return,#OMX_Sts_Err + +exit2 + + + M_END + ENDIF + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c new file mode 100755 index 0000000..38af975 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c @@ -0,0 +1,211 @@ + /** + * + * File Name: armVCM4P2_Huff_Tables_VLC.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + * + * + * File: armVCM4P2_Huff_Tables_VLC.c + * Description: Contains all the Huffman tables used in MPEG4 codec + * + */ + +#include "omxtypes.h" +#include "armOMX.h" + +#include "armCOMM_Bitstream.h" + + + + +// Contains optimized and Packed VLC tables with Last=0 and Last=1 + +// optimized Packed VLC table Entry Format +// --------------------------------------- +// +// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 +// +------------------------------------------------+ +// | Len | Run | Level |L | 1 | +// +------------------------------------------------+ +// | Offset | 0 | +// +------------------------------------------------+ +// If the table entry is a leaf entry then bit 0 is set: +// Len = Number of bits overread (0 to 7) 3 bits +// Run = RunLength of the Symbol (0 to 63) 6 bits +// Level = Level of the Symbol (0 to 31) 5 bits +// L = Last Value of the Symbol (0 or 1) 1 bit +// +// If the table entry is an internal node then bit 0 is clear: +// Offset = Number of (16-bit) half words from the table +// start to the next table node +// +// The table is accessed by successive lookups on the +// next Step bits of the input bitstream until a leaf node +// is obtained. The Step sizes are supplied to the VLD macro. + +// The VLC tables used for Intra and non-Intra coefficients in non Escape mode +// contain symbols with both Last=0 and Last=1. 
+// If a symbol is not found in the table it will be coded as 0xFFF + + +const OMX_U16 armVCM4P2_InterVlcL0L1[200] = { + 0x0020, 0x0108, 0x0148, 0x0170, 0x0178, 0x0180, 0x0188, 0x1b09, + 0x4009, 0x4009, 0x4009, 0x4009, 0x2109, 0x2109, 0x0209, 0x0011, + 0x0028, 0x0060, 0x00b8, 0x00e0, 0x0030, 0x0048, 0x0050, 0x0058, + 0x3fff, 0x3fff, 0x0038, 0x0040, 0x2115, 0x2115, 0x201d, 0x201d, + 0x2059, 0x2059, 0x2051, 0x2051, 0x1c0d, 0x1b0d, 0x1a0d, 0x190d, + 0x0911, 0x0811, 0x0711, 0x0611, 0x0511, 0x0319, 0x0219, 0x0121, + 0x0068, 0x0090, 0x3fff, 0x3fff, 0x0070, 0x0078, 0x0080, 0x0088, + 0x2061, 0x2061, 0x2129, 0x2129, 0x3709, 0x3709, 0x3809, 0x3809, + 0x3d0d, 0x3d0d, 0x3e0d, 0x3e0d, 0x3f0d, 0x3f0d, 0x200d, 0x200d, + 0x0098, 0x00a0, 0x00a8, 0x00b0, 0x0131, 0x0221, 0x0419, 0x0519, + 0x0619, 0x0a11, 0x1909, 0x1a09, 0x210d, 0x220d, 0x230d, 0x240d, + 0x250d, 0x260d, 0x270d, 0x280d, 0x00c0, 0x00c8, 0x00d0, 0x00d8, + 0x0049, 0x0041, 0x380d, 0x380d, 0x370d, 0x370d, 0x360d, 0x360d, + 0x350d, 0x350d, 0x340d, 0x340d, 0x330d, 0x330d, 0x320d, 0x320d, + 0x00e8, 0x00f0, 0x00f8, 0x0100, 0x310d, 0x310d, 0x2015, 0x2015, + 0x3609, 0x3609, 0x3509, 0x3509, 0x3409, 0x3409, 0x3309, 0x3309, + 0x3209, 0x3209, 0x3109, 0x3109, 0x0110, 0x0130, 0x0138, 0x0140, + 0x0118, 0x0120, 0x0128, 0x100d, 0x3009, 0x3009, 0x2f09, 0x2f09, + 0x2411, 0x2411, 0x2311, 0x2311, 0x2039, 0x2039, 0x2031, 0x2031, + 0x0f0d, 0x0e0d, 0x0d0d, 0x0c0d, 0x0b0d, 0x0a0d, 0x090d, 0x0e09, + 0x0d09, 0x0211, 0x0119, 0x0029, 0x0150, 0x0158, 0x0160, 0x0168, + 0x280d, 0x280d, 0x270d, 0x270d, 0x260d, 0x260d, 0x250d, 0x250d, + 0x2c09, 0x2c09, 0xb759, 0xb759, 0x2a09, 0x2a09, 0x2021, 0x2021, + 0x040d, 0x030d, 0x0b35, 0x010d, 0x0909, 0x0809, 0x0709, 0x0609, + 0x0111, 0x0019, 0x2509, 0x2509, 0x2409, 0x2409, 0x2309, 0x2309 +}; + + +const OMX_U16 armVCM4P2_IntraVlcL0L1[200] = { + 0x0020, 0x0108, 0x0148, 0x0170, 0x0178, 0x0180, 0x0188, 0x0f09, + 0x4009, 0x4009, 0x4009, 0x4009, 0x2011, 0x2011, 0x0109, 0x0019, + 0x0028, 0x0060, 0x00b8, 0x00e0, 0x0030, 
0x0048, 0x0050, 0x0058, + 0x3fff, 0x3fff, 0x0038, 0x0040, 0x203d, 0x203d, 0x2035, 0x2035, + 0x20b1, 0x20b1, 0x20a9, 0x20a9, 0x0215, 0x011d, 0x002d, 0x0d09, + 0x0519, 0x0811, 0x0419, 0x0321, 0x0221, 0x0139, 0x00a1, 0x0099, + 0x0068, 0x0090, 0x3fff, 0x3fff, 0x0070, 0x0078, 0x0080, 0x0088, + 0x20b9, 0x20b9, 0x20c1, 0x20c1, 0x2141, 0x2141, 0x2911, 0x2911, + 0x2315, 0x2315, 0x2415, 0x2415, 0x2f0d, 0x2f0d, 0x300d, 0x300d, + 0x0098, 0x00a0, 0x00a8, 0x00b0, 0x00c9, 0x00d1, 0x00d9, 0x0149, + 0x0619, 0x0151, 0x0229, 0x0719, 0x0e09, 0x0045, 0x0515, 0x0615, + 0x110d, 0x120d, 0x130d, 0x140d, 0x00c0, 0x00c8, 0x00d0, 0x00d8, + 0x0091, 0x0089, 0x2e0d, 0x2e0d, 0x2d0d, 0x2d0d, 0x2c0d, 0x2c0d, + 0x2b0d, 0x2b0d, 0x2a0d, 0x2a0d, 0x2115, 0x2115, 0x2025, 0x2025, + 0x00e8, 0x00f0, 0x00f8, 0x0100, 0x2c09, 0x2c09, 0x2b09, 0x2b09, + 0x2711, 0x2711, 0x2611, 0x2611, 0x2511, 0x2511, 0x2319, 0x2319, + 0x2219, 0x2219, 0x2131, 0x2131, 0x0110, 0x0130, 0x0138, 0x0140, + 0x0118, 0x0120, 0x0128, 0x080d, 0x2129, 0x2129, 0x2081, 0x2081, + 0x2411, 0x2411, 0x2079, 0x2079, 0x2071, 0x2071, 0x2069, 0x2069, + 0x1bb5, 0x060d, 0x001d, 0xd3f9, 0x0909, 0x0809, 0x090d, 0x0311, + 0x0121, 0x0061, 0x0059, 0x0051, 0x0150, 0x0158, 0x0160, 0x0168, + 0x240d, 0x240d, 0x230d, 0x230d, 0x2609, 0x2609, 0x250d, 0x250d, + 0x2709, 0x2709, 0x2211, 0x2211, 0x2119, 0x2119, 0x2049, 0x2049, + 0x0015, 0x0509, 0x020d, 0x010d, 0x0409, 0x0309, 0x0041, 0x0039, + 0x0111, 0x0031, 0x2209, 0x2209, 0x2029, 0x2029, 0x2021, 0x2021 +}; + +const OMX_U16 armVCM4P2_aIntraDCLumaChromaIndex[64] = { + 0x0020, 0x000b, 0x2009, 0x2009, 0x2007, 0x2007, 0x2001, 0x2001, + 0x4005, 0x4005, 0x4005, 0x4005, 0x4003, 0x4003, 0x4003, 0x4003, + 0x0028, 0x000f, 0x200d, 0x200d, 0x0030, 0x0013, 0x2011, 0x2011, + 0x0038, 0x0017, 0x2015, 0x2015, 0x3fff, 0x3fff, 0x2019, 0x2019, + + 0x0020, 0x0009, 0x2007, 0x2007, 0x4005, 0x4005, 0x4005, 0x4005, + 0x4003, 0x4003, 0x4003, 0x4003, 0x4001, 0x4001, 0x4001, 0x4001, + 0x0028, 0x000d, 0x200b, 0x200b, 0x0030, 0x0011, 0x200f, 
0x200f, + 0x0038, 0x0015, 0x2013, 0x2013, 0x1fff, 0x0019, 0x2017, 0x2017 +}; + + +const OMX_U16 armVCM4P2_aVlcMVD[124] = { + 0x0010, 0x00f0, 0x0043, 0x003f, 0x4041, 0x4041, 0x4041, 0x4041, + 0x0018, 0x00d8, 0x0047, 0x003b, 0x0020, 0x0080, 0x00a8, 0x00d0, + 0x0028, 0x0048, 0x0070, 0x0078, 0x1fff, 0x0030, 0x0038, 0x0040, + 0x0081, 0x0001, 0x007f, 0x0003, 0x207d, 0x207d, 0x2005, 0x2005, + 0x207b, 0x207b, 0x2007, 0x2007, 0x0050, 0x0058, 0x0060, 0x0068, + 0x2079, 0x2079, 0x2009, 0x2009, 0x2077, 0x2077, 0x200b, 0x200b, + 0x2075, 0x2075, 0x200d, 0x200d, 0x2073, 0x2073, 0x200f, 0x200f, + 0x0071, 0x0011, 0x006f, 0x0013, 0x006d, 0x0015, 0x006b, 0x0017, + 0x0088, 0x0090, 0x0098, 0x00a0, 0x0069, 0x0019, 0x0067, 0x001b, + 0x0065, 0x001d, 0x0063, 0x001f, 0x0061, 0x0021, 0x005f, 0x0023, + 0x005d, 0x0025, 0x005b, 0x0027, 0x00b0, 0x00b8, 0x00c0, 0x00c8, + 0x0059, 0x0029, 0x0057, 0x002b, 0x2055, 0x2055, 0x202d, 0x202d, + 0x2053, 0x2053, 0x202f, 0x202f, 0x2051, 0x2051, 0x2031, 0x2031, + 0x204f, 0x204f, 0x2033, 0x2033, 0x00e0, 0x00e8, 0x0049, 0x0039, + 0x204d, 0x204d, 0x2035, 0x2035, 0x204b, 0x204b, 0x2037, 0x2037, + 0x2045, 0x2045, 0x203d, 0x203d +}; + +/* LMAX table for Inter (Last == 0 and Last=1) + Run Indexed + padded armVCM4P2_InterL0L1LMAX[27-31] with zeros to access entries for Last=1 effectively + +*/ +const OMX_U8 armVCM4P2_InterL0L1LMAX[73] = +{ + 12, 6, 4, 3, 3, 3, 3, 2, + 2, 2, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, + 3, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1 +}; + +/* RMAX table for Inter (Last == 0 and Last=1) + Level - 1 Indexed + padded armVCM4P2_InterL0L1RMAX[12-31] with zeros to access entries for Last=1 table effectively */ + + +const OMX_U8 armVCM4P2_InterL0L1RMAX[35] = +{ + 26, 10, 6, 2, 1, 1, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, 40, 1, 0 +}; + +/* LMAX table for Intra (Last == 
0 and Last=1) + Level - 1 Indexed + padded armVCM4P2_IntraL0L1LMAX[15-31] with zeros to acess entries for Last=1 effectively + +*/ +const OMX_U8 armVCM4P2_IntraL0L1LMAX[53] = +{ + 27, 10, 5, 4, 3, 3, 3, + 3, 2, 2, 1, 1, 1, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 8, 3, 2, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 +}; + + +/* RMAX table for non Inter (Last == 0 and Last=1) + Level - 1 Indexed + padded armVCM4P2_IntraL0L1RMAX[27-31] with zeros to access entries for Last=1 table effectively */ + + +const OMX_U8 armVCM4P2_IntraL0L1RMAX[40] = +{ + 14, 9, 7, 3, 2, 1, 1, + 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + + 20, 6, 1, 0, 0, 0, 0, 0 + +}; diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c new file mode 100755 index 0000000..6948f80 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c @@ -0,0 +1,75 @@ + /** + * + * File Name: armVCM4P2_Lookup_Tables.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + * + * + * File: armVCM4P2_Lookup_Tables.c + * Description: Contains all the Lookup tables used in MPEG4 codec + * + */ + +#include "omxtypes.h" +#include "armOMX.h" + /* * Table Entries contain Dc Scaler values, stored as two 32-entry halves + * (NOTE(review): presumably luminance at i=0..31 and chrominance at i=32..63, indexed by QP - confirm against callers) + * armVCM4P2_DCScaler[i]= 8 for i=1 to 4 and i=33 to 36 + * = 2*i for i=5 to 8 + * = i+8 for i=9 to 24 + * = 2*i-16 for i=25 to 31 + * = (i-32+13)/2 for i=37 to 56 + * = i-6-32 for i=57 to 63 + * = 255 for i=0 and i=32 + */ + +const OMX_U8 armVCM4P2_DCScaler[64]={ + 0xff, 0x8, 0x8, 0x8, 0x8, 0xa, 0xc, 0xe, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, + 0xff, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, + 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, + 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + +}; + + + /* Table Entries Contain reciprocal of 1 to 63, scaled by 32767 + * armVCM4P2_Reciprocal_QP_S16[i]=round(32767/i) + * armVCM4P2_Reciprocal_QP_S16[0]= 0 + */ + +const OMX_S16 armVCM4P2_Reciprocal_QP_S16[64]={ + 0x0000,0x7fff,0x4000,0x2aaa,0x2000,0x1999,0x1555,0x1249, + 0x1000,0x0e39,0x0ccd,0x0ba3,0x0aab,0x09d9,0x0925,0x0888, + 0x0800,0x0787,0x071c,0x06bd,0x0666,0x0618,0x05d1,0x0591, + 0x0555,0x051f,0x04ec,0x04be,0x0492,0x046a,0x0444,0x0421, + 0x0400,0x03e1,0x03c4,0x03a8,0x038e,0x0376,0x035e,0x0348, + 0x0333,0x031f,0x030c,0x02fa,0x02e9,0x02d8,0x02c8,0x02b9, + 0x02ab,0x029d,0x028f,0x0282,0x0276,0x026a,0x025f,0x0254, + 0x0249,0x023f,0x0235,0x022b,0x0222,0x0219,0x0211,0x0208 + +}; + + /* Table Entries Contain reciprocal of 1 to 63, scaled by 131071 + * armVCM4P2_Reciprocal_QP_S32[i]=round(131071/i) + * armVCM4P2_Reciprocal_QP_S32[0]= 0 + */ + +const OMX_S32 armVCM4P2_Reciprocal_QP_S32[64]={ + 0x00000000,0x0001ffff,0x00010000,0x0000aaaa, 0x00008000, 0x00006666, 0x00005555, 0x00004924, + 0x00004000,0x000038e3,0x00003333,0x00002e8c, 0x00002aab, 0x00002762, 0x00002492, 0x00002222, + 0x00002000,0x00001e1e,0x00001c72,0x00001af2, 0x0000199a, 0x00001861, 
0x00001746, 0x00001643, + 0x00001555,0x0000147b,0x000013b1,0x000012f6, 0x00001249, 0x000011a8, 0x00001111, 0x00001084, + 0x00001000,0x00000f84,0x00000f0f,0x00000ea1, 0x00000e39, 0x00000dd6, 0x00000d79, 0x00000d21, + 0x00000ccd,0x00000c7d,0x00000c31,0x00000be8, 0x00000ba3, 0x00000b61, 0x00000b21, 0x00000ae5, + 0x00000aab,0x00000a73,0x00000a3d,0x00000a0a, 0x000009d9, 0x000009a9, 0x0000097b, 0x0000094f, + 0x00000925,0x000008fb,0x000008d4,0x000008ae, 0x00000889, 0x00000865, 0x00000842, 0x00000820 + +}; diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s new file mode 100755 index 0000000..44f2460 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s @@ -0,0 +1,104 @@ +;// +;// +;// File Name: armVCM4P2_SetPredDir_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + +; ** +; * Function: armVCM4P2_SetPredDir +; * +; * Description: +; * Detects the prediction direction by comparing the DC values of the +; * left, top-left and top neighbouring blocks +; * +; * Remarks: +; * +; * Parameters: +; * [in] blockIndex block index indicating the component type and +; * position as defined in subclause 6.1.3.8, of ISO/IEC +; * 14496-2. Furthermore, indexes 6 to 9 indicate the +; * alpha blocks spatially corresponding to luminance +; * blocks 0 to 3 in the same macroblock. +; * [in] pCoefBufRow pointer to the coefficient row buffer +; * [in] pCoefBufCol pointer to the coefficient column buffer +; * [in] pQpBuf pointer to the quantization parameter buffer +; * [out]predQP quantization parameter of the predictor block +; * [out]predDir indicates the prediction direction which takes one +; * of the following values: +; * OMX_VC_HORIZONTAL predict horizontally +; * OMX_VC_VERTICAL predict vertically +; * +; * Return Value: +; * Standard OMXResult result. See enumeration for possible result codes. 
+; * This routine always returns OMX_Sts_NoErr. +; */ + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE omxVC_s.h + + + M_VARIANTS ARM1136JS + + + IF ARM1136JS + +;// Input Arguments (predQP and pQpBuf are loaded from the stack via M_ARG/M_LDR) +BlockIndex RN 0 +pCoefBufRow RN 1 +pCoefBufCol RN 2 +predDir RN 3 +predQP RN 4 +pQpBuf RN 5 + +;// Local Variables + +Return RN 0 +blockDCLeft RN 6 +blockDCTop RN 7 +blockDCTopLeft RN 8 +temp1 RN 9 +temp2 RN 14 + + M_START armVCM4P2_SetPredDir,r9 + + M_ARG ppredQP,4 + M_ARG ppQpBuf,4 + + LDRH blockDCTopLeft,[pCoefBufRow,#-16] + LDRH blockDCLeft,[pCoefBufCol] + + TEQ BlockIndex,#3 ;// block 3 takes its top DC from the column buffer instead of the row buffer + LDREQH blockDCTop,[pCoefBufCol,#-16] + LDRNEH blockDCTop,[pCoefBufRow] + + SUBS temp1,blockDCLeft,blockDCTopLeft + RSBLT temp1,temp1,#0 ;// temp1 = abs(blockDCLeft - blockDCTopLeft) + SUBS temp2,blockDCTopLeft,blockDCTop + RSBLT temp2,temp2,#0 ;// temp2 = abs(blockDCTopLeft - blockDCTop) + + M_LDR pQpBuf,ppQpBuf + M_LDR predQP,ppredQP + CMP temp1,temp2 ;// LT selects vertical prediction, GE selects horizontal + MOV temp2,#OMX_VC_VERTICAL + LDRLTB temp1,[pQpBuf,#1] ;// vertical: predictor QP is pQpBuf[1] + STRLT temp2,[predDir] + STRLT temp1,[predQP] + MOV temp2,#OMX_VC_HORIZONTAL + LDRGEB temp1,[pQpBuf] ;// horizontal: predictor QP is pQpBuf[0] + STRGE temp2,[predDir] + MOV Return,#OMX_Sts_NoErr + STRGE temp1,[predQP] + + + + M_END + + ENDIF + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c new file mode 100755 index 0000000..21fa715 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c @@ -0,0 +1,61 @@ +/** + * + * File Name: armVCM4P2_Zigzag_Tables.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + * + * + * File: armVCM4P2_ZigZag_Tables.c + * Description: Contains the zigzag tables + * + */ + +#include "omxtypes.h" + +/* Contains double the values in the reference Zigzag Table + * (each entry is 2 * coefficient position, so every even value 0..126 must appear exactly once per scan) + * Contains Classical, Vertical and Horizontal Zigzag scan tables in one array, + * 64 entries each: classical at [0..63], vertical at [64..127], horizontal at [128..191] + * Entry 59 of the classical scan is 108 (2*54); it was previously 104, which duplicated entry 50 (2*52) + */ + +const OMX_U8 armVCM4P2_aClassicalZigzagScan [192] = +{ + 0, 2, 16, 32, 18, 4, 6, 20, + 34, 48, 64, 50, 36, 22, 8, 10, + 24, 38, 52, 66, 80, 96, 82, 68, + 54, 40, 26, 12, 14, 28, 42, 56, + 70, 84, 98, 112, 114, 100, 86, 72, + 58, 44, 30, 46, 60, 74, 88, 102, + 116, 118, 104, 90, 76, 62, 78, 92, + 106, 120, 122, 108, 94, 110, 124, 126, + + 0, 16, 32, 48, 2, 18, 4, 20, + 34, 50, 64, 80, 96, 112, 114, 98, + 82, 66, 52, 36, 6, 22, 8, 24, + 38, 54, 68, 84, 100, 116, 70, 86, + 102, 118, 40, 56, 10, 26, 12, 28, + 42, 58, 72, 88, 104, 120, 74, 90, + 106, 122, 44, 60, 14, 30, 46, 62, + 76, 92, 108, 124, 78, 94, 110, 126, + + 0, 2, 4, 6, 16, 18, 32, 34, + 20, 22, 8, 10, 12, 14, 30, 28, + 26, 24, 38, 36, 48, 50, 64, 66, + 52, 54, 40, 42, 44, 46, 56, 58, + 60, 62, 68, 70, 80, 82, 96, 98, + 84, 86, 72, 74, 76, 78, 88, 90, + 92, 94, 100, 102, 112, 114, 116, 118, + 104, 106, 108, 110, 120, 122, 124, 126 + + +}; + + + + + +/* End of file */ + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c new file mode 100755 index 0000000..796ad6e --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c @@ -0,0 +1,102 @@ +/** + * + * File Name: omxVCM4P2_DecodeBlockCoef_Inter.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + * + * + * Description: + * Contains modules for inter reconstruction + * + */ + + +#include "omxtypes.h" +#include "armOMX.h" +#include "omxVC.h" + +#include "armCOMM.h" + + +/** + * Function: omxVCM4P2_DecodeBlockCoef_Inter + * + * Description: + * Decodes the INTER block coefficients. Inverse quantization, inversely zigzag + * positioning and IDCT, with appropriate clipping on each step, are performed + * on the coefficients. The results (residuals) are placed in a contiguous array + * of 64 elements. For INTER block, the output buffer holds the residuals for + * further reconstruction. + * + * Remarks: + * This implementation chains three stages on an aligned local buffer: + * omxVCM4P2_DecodeVLCZigzag_Inter, omxVCM4P2_QuantInvInter_I and + * omxVCM4P2_IDCT8x8blk. It performs no argument checks of its own; the + * status of each stage is passed to armRetDataErrIf. + * + * Parameters: + * [in] ppBitStream pointer to the pointer to the current byte in + * the bit stream buffer. There is no boundary + * check for the bit stream buffer. + * [in] pBitOffset pointer to the bit position in the byte pointed + * to by *ppBitStream. *pBitOffset is valid within + * [0-7] + * [in] QP quantization parameter + * [in] shortVideoHeader a flag indicating presence of short_video_header; + * shortVideoHeader==1 indicates using quantization method defined in short + * video header mode, and shortVideoHeader==0 indicates normal quantization method. + * (In this body the flag is forwarded only to omxVCM4P2_DecodeVLCZigzag_Inter.) + * [out] ppBitStream *ppBitStream is updated after the block is decoded, so that it points to the + * current byte in the bit stream buffer. + * [out] pBitOffset *pBitOffset is updated so that it points to the current bit position in the + * byte pointed by *ppBitStream + * [out] pDst pointer to the decoded residual buffer (a contiguous array of 64 elements of + * OMX_S16 data type). Must be 16-byte aligned. 
+ * + * Return Value: + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - At least one of the following pointers is Null: ppBitStream, *ppBitStream, pBitOffset , pDst + * - At least one of the below case: + * - *pBitOffset exceeds [0,7], QP <= 0; + * - pDst not 16-byte aligned + * OMX_Sts_Err - status error + * + */ +OMXResult omxVCM4P2_DecodeBlockCoef_Inter( + const OMX_U8 ** ppBitStream, + OMX_INT * pBitOffset, + OMX_S16 * pDst, + OMX_INT QP, + OMX_INT shortVideoHeader +) +{ + /* 64 elements are needed; the buffer is over-allocated by + 15 elements so that armAlignTo16Bytes can round the start up to a 16-byte boundary */ + OMX_S16 tempBuf[79]; + OMX_S16 *pTempBuf1; + OMXResult errorCode; + /* Aligning the local buffers */ + pTempBuf1 = armAlignTo16Bytes(tempBuf); + + + /* VLD and zigzag */ + errorCode = omxVCM4P2_DecodeVLCZigzag_Inter(ppBitStream, pBitOffset, + pTempBuf1,shortVideoHeader); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + + /* Dequantization (in place on the aligned buffer) */ + errorCode = omxVCM4P2_QuantInvInter_I( + pTempBuf1, + QP); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + + /* Inverse transform from the aligned buffer into pDst */ + errorCode = omxVCM4P2_IDCT8x8blk(pTempBuf1, pDst); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + + return OMX_Sts_NoErr; +} + +/* End of file */ + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c new file mode 100755 index 0000000..b28657c --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c @@ -0,0 +1,214 @@ +/** + * + * File Name: omxVCM4P2_DecodeBlockCoef_Intra.c + * OpenMAX DL: v1.0.2 + * Revision: 12290 + * Date: Wednesday, April 9, 2008 + * + * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+ * + * + * + * Description: + * Contains modules for intra reconstruction + * + */ + +#include "omxtypes.h" +#include "armOMX.h" +#include "omxVC.h" + +#include "armCOMM.h" +#include "armVC.h" + +/* Function for saturating 16 bit values to the [0,255] range and */ +/* writing out as 8 bit values. Does 64 entries */ +void armVCM4P2_Clip8(OMX_S16 *pSrc, OMX_U8 *pDst, OMX_INT dstStep ); + + + +/** + * Function: omxVCM4P2_DecodeBlockCoef_Intra + * + * Description: + * Decodes the INTRA block coefficients. Inverse quantization, inversely zigzag + * positioning, and IDCT, with appropriate clipping on each step, are performed + * on the coefficients. The results are then placed in the output frame/plane on + * a pixel basis. For INTRA block, the output values are clipped to [0, 255] and + * written to corresponding block buffer within the destination plane. + * + * Remarks: + * This implementation performs no argument checks of its own; the status of + * each decoding stage is passed to armRetDataErrIf. + * + * Parameters: + * [in] ppBitStream pointer to the pointer to the current byte in + * the bit stream buffer. There is no boundary + * check for the bit stream buffer. + * [in] pBitOffset pointer to the bit position in the byte pointed + * to by *ppBitStream. *pBitOffset is valid within + * [0-7]. + * [in] step width of the destination plane + * [in/out] pCoefBufRow [in] pointer to the coefficient row buffer + * [out] updated coefficient row buffer + * [in/out] pCoefBufCol [in] pointer to the coefficient column buffer + * [out] updated coefficient column buffer + * [in] curQP quantization parameter of the macroblock which + * the current block belongs to + * [in] pQPBuf Pointer to a 2-element QP array. pQPBuf[0] holds the QP of the 8x8 block left to + * the current block(QPa). pQPBuf[1] holds the QP of the 8x8 block just above the + * current block(QPc). + * Note, in case the corresponding block is out of VOP bound, the QP value will have + * no effect to the intra-prediction process. 
Refer to subclause "7.4.3.3 Adaptive + * ac coefficient prediction" of ISO/IEC 14496-2(MPEG4 Part2) for accurate description. + * [in] blockIndex block index indicating the component type and + * position as defined in subclause 6.1.3.8, + * Figure 6-5 of ISO/IEC 14496-2. + * (Indexes 0 to 3 are treated as luminance here, all others as chrominance.) + * [in] intraDCVLC a code determined by intra_dc_vlc_thr and QP. + * This allows a mechanism to switch between two VLC + * for coding of Intra DC coefficients as per Table + * 6-21 of ISO/IEC 14496-2. + * (A value of 1 selects omxVCM4P2_DecodeVLCZigzag_IntraDCVLC, + * any other value selects omxVCM4P2_DecodeVLCZigzag_IntraACVLC.) + * [in] ACPredFlag a flag equal to ac_pred_flag (of luminance) indicating + * if the ac coefficients of the first row or first + * column are differentially coded for intra coded + * macroblock. + * [in] shortVideoHeader a flag indicating presence of short_video_header; + * shortVideoHeader==1 selects linear intra DC mode, + * and shortVideoHeader==0 selects nonlinear intra DC mode. + * [out] ppBitStream *ppBitStream is updated after the block is + * decoded, so that it points to the current byte + * in the bit stream buffer + * [out] pBitOffset *pBitOffset is updated so that it points to the + * current bit position in the byte pointed by + * *ppBitStream + * [out] pDst pointer to the block in the destination plane. + * pDst should be 16-byte aligned. + * [out] pCoefBufRow pointer to the updated coefficient row buffer. + * + * Return Value: + * OMX_Sts_NoErr - no error + * OMX_Sts_BadArgErr - bad arguments + * - At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, + * pCoefBufRow, pCoefBufCol, pQPBuf, pDst. + * or + * - At least one of the below case: *pBitOffset exceeds [0,7], curQP exceeds (1, 31), + * blockIndex exceeds [0,9], step is not the multiple of 8, intraDCVLC is zero while + * blockIndex greater than 5. 
+ * or + * - pDst is not 16-byte aligned + * OMX_Sts_Err - status error + * + */ + +OMXResult omxVCM4P2_DecodeBlockCoef_Intra( + const OMX_U8 ** ppBitStream, + OMX_INT *pBitOffset, + OMX_U8 *pDst, + OMX_INT step, + OMX_S16 *pCoefBufRow, + OMX_S16 *pCoefBufCol, + OMX_U8 curQP, + const OMX_U8 *pQPBuf, + OMX_INT blockIndex, + OMX_INT intraDCVLC, + OMX_INT ACPredFlag, + OMX_INT shortVideoHeader + ) +{ + OMX_S16 tempBuf1[79], tempBuf2[79]; /* 64 elements each plus slack for 16-byte alignment */ + OMX_S16 *pTempBuf1, *pTempBuf2; + OMX_INT predDir, predACDir; + OMX_INT predQP; + OMXVCM4P2VideoComponent videoComp; + OMXResult errorCode; + + + /* Aligning the local buffers */ + pTempBuf1 = armAlignTo16Bytes(tempBuf1); + pTempBuf2 = armAlignTo16Bytes(tempBuf2); + + /* Setting the AC prediction direction and prediction direction + (the return value of armVCM4P2_SetPredDir is not checked) */ + armVCM4P2_SetPredDir( + blockIndex, + pCoefBufRow, + pCoefBufCol, + &predDir, + &predQP, + pQPBuf); + + predACDir = predDir; /* AC prediction follows the DC direction unless disabled below */ + + + if (ACPredFlag == 0) + { + predACDir = OMX_VC_NONE; + } + + /* Setting the videoComp */ + if (blockIndex <= 3) + { + videoComp = OMX_VC_LUMINANCE; + } + else + { + videoComp = OMX_VC_CHROMINANCE; + } + + + /* VLD and zigzag */ + if (intraDCVLC == 1) + { + errorCode = omxVCM4P2_DecodeVLCZigzag_IntraDCVLC( + ppBitStream, + pBitOffset, + pTempBuf1, + predACDir, + shortVideoHeader, + videoComp); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + } + else + { + errorCode = omxVCM4P2_DecodeVLCZigzag_IntraACVLC( + ppBitStream, + pBitOffset, + pTempBuf1, + predACDir, + shortVideoHeader); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + } + + /* AC DC prediction (uses predDir, not predACDir, together with ACPredFlag) */ + errorCode = omxVCM4P2_PredictReconCoefIntra( + pTempBuf1, + pCoefBufRow, + pCoefBufCol, + curQP, + predQP, + predDir, + ACPredFlag, + videoComp); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + + /* Dequantization (in place on the first aligned buffer) */ + errorCode = omxVCM4P2_QuantInvIntra_I( + pTempBuf1, + curQP, + videoComp, + shortVideoHeader); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + + /* 
Inverse transform from the first aligned buffer into the second */ + errorCode = omxVCM4P2_IDCT8x8blk (pTempBuf1, pTempBuf2); + armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode); + + /* Placing the linear array into the destination plane and clipping + it to 0 to 255 */ + + armVCM4P2_Clip8(pTempBuf2,pDst,step); + + + return OMX_Sts_NoErr; +} + +/* End of file */ + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s new file mode 100755 index 0000000..cc16f5a --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s @@ -0,0 +1,364 @@ +; ********** +; * +; * File Name: omxVCM4P2_DecodePadMV_PVOP_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; ** +; * Function: omxVCM4P2_DecodePadMV_PVOP +; * +; * Description: +; * Decodes and pads four motion vectors of the non-intra macroblock in P-VOP. +; * The motion vector padding process is specified in subclause 7.6.1.6 of +; * ISO/IEC 14496-2. +; * +; * Remarks: +; * +; * +; * Parameters: +; * [in] ppBitStream pointer to the pointer to the current byte in +; * the bit stream buffer +; * [in] pBitOffset pointer to the bit position in the byte pointed +; * to by *ppBitStream. *pBitOffset is valid within +; * [0-7]. +; * [in] pSrcMVLeftMB pointers to the motion vector buffers of the +; * macroblocks specially at the left side of the current macroblock +; * respectively. +; * [in] pSrcMVUpperMB pointers to the motion vector buffers of the +; * macroblocks specially at the upper side of the current macroblock +; * respectively. +; * [in] pSrcMVUpperRightMB pointers to the motion vector buffers of the +; * macroblocks specially at the upper-right side of the current macroblock +; * respectively. 
+; * [in] fcodeForward a code equal to vop_fcode_forward in MPEG-4 +; * bit stream syntax +; * [in] MBType the type of the current macroblock. If MBType +; * is not equal to OMX_VC_INTER4V, the destination +; * motion vector buffer is still filled with the +; * same decoded vector. +; * [out] ppBitStream *ppBitStream is updated after the block is decoded, +; * so that it points to the current byte in the bit +; * stream buffer +; * [out] pBitOffset *pBitOffset is updated so that it points to the +; * current bit position in the byte pointed by +; * *ppBitStream +; * [out] pDstMVCurMB pointer to the motion vector buffer of the current +; * macroblock which contains four decoded motion vectors +; * (all four are set to zero for OMX_VC_INTRA / OMX_VC_INTRA_Q) +; * +; * Return Value: +; * OMX_Sts_NoErr -no error +; * +; * +; * OMX_Sts_Err - status error (an invalid MVD VLC symbol was decoded) +; * +; * + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE armCOMM_BitDec_s.h + INCLUDE omxVC_s.h + + M_VARIANTS ARM1136JS + + + + + IF ARM1136JS + +;//Input Arguments + +ppBitStream RN 0 +pBitOffset RN 1 +pSrcMVLeftMB RN 2 +pSrcMVUpperMB RN 3 +pSrcMVUpperRightMB RN 4 +pDstMVCurMB RN 5 +fcodeForward RN 6 +MBType RN 7 + +;//Local Variables + +zero RN 4 +one RN 4 +scaleFactor RN 1 + + +Return RN 0 + +VlcMVD RN 0 +index RN 4 +Count RN 7 + +mvHorData RN 4 +mvHorResidual RN 0 + +mvVerData RN 4 +mvVerResidual RN 0 + +temp RN 1 + +temp1 RN 3 +High RN 4 +Low RN 2 +Range RN 1 + +BlkCount RN 14 + +diffMVdx RN 0 +diffMVdy RN 1 + +;// Scratch Registers + +RBitStream RN 8 +RBitCount RN 9 +RBitBuffer RN 10 + +T1 RN 11 +T2 RN 12 +LR RN 14 + + IMPORT armVCM4P2_aVlcMVD + IMPORT omxVCM4P2_FindMVpred + + ;// Allocate stack memory + + M_ALLOC4 ppDstMVCurMB,4 + M_ALLOC4 pDstMVPredME,4 + M_ALLOC4 pBlkCount,4 + + M_ALLOC4 pppBitStream,4 + M_ALLOC4 ppBitOffset,4 + M_ALLOC4 ppSrcMVLeftMB,4 + M_ALLOC4 ppSrcMVUpperMB,4 + + M_ALLOC4 pdiffMVdx,4 + M_ALLOC4 pdiffMVdy,4 + M_ALLOC4 pHigh,4 + + + + + M_START omxVCM4P2_DecodePadMV_PVOP,r11 + + M_ARG pSrcMVUpperRightMBonStack,4 ;// pointer to pSrcMVUpperRightMB 
on stack + M_ARG pDstMVCurMBonStack,4 ;// pointer to pDstMVCurMB on stack + M_ARG fcodeForwardonStack,4 ;// pointer to fcodeForward on stack + M_ARG MBTypeonStack,4 ;// pointer to MBType on stack + + + + + + ;// Initializing the BitStream Macro + + M_BD_INIT0 ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount + M_LDR MBType,MBTypeonStack ;// Load MBType from stack + M_LDR pDstMVCurMB,pDstMVCurMBonStack ;// Load pDstMVCurMB from stack + MOV zero,#0 + + TEQ MBType,#OMX_VC_INTRA ;// Check if MBType=OMX_VC_INTRA + TEQNE MBType,#OMX_VC_INTRA_Q ;// check if MBType=OMX_VC_INTRA_Q + STREQ zero,[pDstMVCurMB] ;// Intra MB: pDstMVCurMB[0]=0 (stores are interleaved with the bitstream init macros) + M_BD_INIT1 T1, T2, T2 + STREQ zero,[pDstMVCurMB,#4] ;// Intra MB: pDstMVCurMB[1]=0 + M_BD_INIT2 T1, T2, T2 + STREQ zero,[pDstMVCurMB,#8] ;// Intra MB: pDstMVCurMB[2]=0 (was #4, leaving this vector unwritten) + MOVEQ Return,#OMX_Sts_NoErr + MOV BlkCount,#0 + STREQ zero,[pDstMVCurMB,#12] ;// Intra MB: pDstMVCurMB[3]=0 (was #4, leaving this vector unwritten) + + BEQ ExitOK + + TEQ MBType,#OMX_VC_INTER4V ;// Check if MBType=OMX_VC_INTER4V + TEQNE MBType,#OMX_VC_INTER4V_Q ;// Check if MBType=OMX_VC_INTER4V_Q + MOVEQ Count,#4 ;// Count shares r7 with MBType, so MBType is reloaded from the stack after the loop + + TEQ MBType,#OMX_VC_INTER ;// Check if MBType=OMX_VC_INTER + TEQNE MBType,#OMX_VC_INTER_Q ;// Check if MBType=OMX_VC_INTER_Q + MOVEQ Count,#1 + + M_LDR fcodeForward,fcodeForwardonStack ;// Load fcodeForward from stack + + ;// Storing the values temporarily on stack + + M_STR ppBitStream,pppBitStream + M_STR pBitOffset,ppBitOffset + + + SUB temp,fcodeForward,#1 ;// temp=fcodeForward-1 + MOV one,#1 + M_STR pSrcMVLeftMB,ppSrcMVLeftMB + LSL scaleFactor,one,temp ;// scaleFactor=1<<(fcodeForward-1) + M_STR pSrcMVUpperMB,ppSrcMVUpperMB + LSL scaleFactor,scaleFactor,#5 + M_STR scaleFactor,pHigh ;// [pHigh]=32*scaleFactor + + ;// VLD Decoding + + +Loop + + LDR VlcMVD, =armVCM4P2_aVlcMVD ;// Load the optimized MVD VLC table + + ;// Horizontal Data and Residual calculation + + LDR temp,=0xFFF + M_BD_VLD index,T1,T2,VlcMVD,3,2 ;// variable length decoding using the macro + + TEQ index,temp + BEQ ExitError ;// Exit with an Error Message if the decoded symbol is an invalid symbol + + SUB mvHorData,index,#32 ;// 
mvHorData=index-32 + MOV mvHorResidual,#1 ;// mvHorResidual=1 + CMP fcodeForward,#1 + TEQNE mvHorData,#0 + MOVEQ diffMVdx,mvHorData ;// if scaleFactor=1(fcodeForward=1) or mvHorData=0 diffMVdx=mvHorData + BEQ VerticalData + + SUB temp,fcodeForward,#1 + M_BD_VREAD8 mvHorResidual,temp,T1,T2 ;// get mvHorResidual from bitstream if fcodeForward>1 and mvHorData!=0 + + CMP mvHorData,#0 + RSBLT mvHorData,mvHorData,#0 ;// mvHorData=abs(mvHorData) + SUB mvHorResidual,mvHorResidual,fcodeForward + SMLABB diffMVdx,mvHorData,fcodeForward,mvHorResidual ;// diffMVdx=abs(mvHorData)*fcodeForward+mvHorResidual-fcodeForward; NOTE(review): multiplies by fcodeForward, not 1<<(fcodeForward-1); the two differ for fcodeForward>2 - confirm against ISO/IEC 14496-2 MV decoding + ADD diffMVdx,diffMVdx,#1 + RSBLT diffMVdx,diffMVdx,#0 ;// restore the sign of mvHorData (flags still come from the CMP above) + + ;// Vertical Data and Residual calculation + +VerticalData + + M_STR diffMVdx,pdiffMVdx ;// Store the diffMVdx on stack + LDR VlcMVD, =armVCM4P2_aVlcMVD ;// Loading the address of optimized VLC tables + + LDR temp,=0xFFF + M_BD_VLD index,T1,T2,VlcMVD,3,2 ;// VLC decoding using the macro + + TEQ index,temp + BEQ ExitError ;// Exit with an Error Message if an Invalid Symbol occurs + + SUB mvVerData,index,#32 ;// mvVerData=index-32 + MOV mvVerResidual,#1 + CMP fcodeForward,#1 + TEQNE mvVerData,#0 + MOVEQ diffMVdy,mvVerData ;// diffMVdy = mvVerData if scaleFactor=1(fcodeForward=1) or mvVerData=0 + BEQ FindMVPred + + SUB temp,fcodeForward,#1 + M_BD_VREAD8 mvVerResidual,temp,T1,T2 ;// Get mvVerResidual from bit stream if fcodeForward>1 and mvVerData!=0 + + + CMP mvVerData,#0 + RSBLT mvVerData,mvVerData,#0 + SUB mvVerResidual,mvVerResidual,fcodeForward + SMLABB diffMVdy,mvVerData,fcodeForward,mvVerResidual ;// diffMVdy=abs(mvVerData)*fcodeForward+mvVerResidual-fcodeForward; NOTE(review): same fcodeForward vs scale-factor question as for diffMVdx + ADD diffMVdy,diffMVdy,#1 + RSBLT diffMVdy,diffMVdy,#0 + + ;//Calling the Function omxVCM4P2_FindMVpred + +FindMVPred + + M_STR diffMVdy,pdiffMVdy + ADD temp,pDstMVCurMB,BlkCount,LSL #2 ;// temp=&pDstMVCurMB[BlkCount] + M_STR temp,ppDstMVCurMB ;// store temp on stack for passing as an argument to FindMVPred + + MOV temp,#0 + M_STR 
temp,pDstMVPredME ;// Pass pDstMVPredME=NULL as an argument + M_STR BlkCount,pBlkCount ;// Pass BlkCount as Argument through stack (BlkCount lives in LR, which BL overwrites) + + MOV temp,pSrcMVLeftMB ;// temp (RN 1)=pSrcMVLeftMB + M_LDR pSrcMVUpperRightMB,pSrcMVUpperRightMBonStack + MOV pSrcMVLeftMB,pSrcMVUpperMB ;// pSrcMVLeftMB ( RN 2) = pSrcMVUpperMB + MOV ppBitStream,pDstMVCurMB ;// ppBitStream ( RN 0) = pDstMVCurMB + MOV pSrcMVUpperMB,pSrcMVUpperRightMB ;// pSrcMVUpperMB( RN 3) = pSrcMVUpperRightMB + BL omxVCM4P2_FindMVpred ;// Branch to subroutine omxVCM4P2_FindMVpred + + ;// Store Horizontal Motion Vector + + M_LDR BlkCount,pBlkCount ;// Load BlkCount from stack + M_LDR High,pHigh ;// High=32*scaleFactor + LSL temp1,BlkCount,#2 ;// temp1=BlkCount*4 + M_LDR diffMVdx,pdiffMVdx ;// Load diffMVdx + + LDRSH temp,[pDstMVCurMB,temp1] ;// temp=pDstMVCurMB[BlkCount] + + + RSB Low,High,#0 ;// Low = -32*scaleFactor + ADD diffMVdx,temp,diffMVdx ;// diffMVdx=pDstMVCurMB[BlkCount]+diffMVdx + ADD Range,High,High ;// Range=64*ScaleFactor + SUB High,High,#1 ;// High= 32*scaleFactor-1 + + CMP diffMVdx,Low ;// If diffMVdx<Low + ADDLT diffMVdx,diffMVdx,Range ;// diffMVdx+=Range + + CMP diffMVdx,High + SUBGT diffMVdx,diffMVdx,Range ;// If diffMVdx > High diffMVdx-=Range + STRH diffMVdx,[pDstMVCurMB,temp1] + + ;// Store Vertical + + ADD temp1,temp1,#2 ;// temp1=4*BlkCount+2 + M_LDR diffMVdx,pdiffMVdy ;// Load diffMVdy (the diffMVdx register is reused for the vertical component) + LDRSH temp,[pDstMVCurMB,temp1] ;// temp=pDstMVCurMB[BlkCount].diffMVdy + ADD BlkCount,BlkCount,#1 ;// BlkCount=BlkCount+1 + ADD diffMVdx,temp,diffMVdx + CMP diffMVdx,Low + ADDLT diffMVdx,diffMVdx,Range ;// If diffMVdy<Low diffMVdy+=Range + CMP diffMVdx,High + SUBGT diffMVdx,diffMVdx,Range ;// If diffMVdy > High diffMVdy-=Range + STRH diffMVdx,[pDstMVCurMB,temp1] + + CMP BlkCount,Count + M_LDR pSrcMVLeftMB,ppSrcMVLeftMB + M_LDR pSrcMVUpperMB,ppSrcMVUpperMB + + BLT Loop ;// If BlkCount<Count Continue the Loop + + + ;// If MBType=OMX_VC_INTER or MBtype=OMX_VC_INTER_Q copy pDstMVCurMB[0] to + ;// 
pDstMVCurMB[1], pDstMVCurMB[2], pDstMVCurMB[3] + + M_LDR MBType,MBTypeonStack + + TEQ MBType,#OMX_VC_INTER + TEQNE MBType,#OMX_VC_INTER_Q + LDREQ temp,[pDstMVCurMB] + M_LDR ppBitStream,pppBitStream + STREQ temp,[pDstMVCurMB,#4] + + STREQ temp,[pDstMVCurMB,#8] + STREQ temp,[pDstMVCurMB,#12] + + + M_LDR pBitOffset,ppBitOffset + ;//Ending the macro + M_BD_FINI ppBitStream,pBitOffset ;// Finishing the Macro + + + MOV Return,#OMX_Sts_NoErr + B ExitOK + +ExitError + + M_LDR ppBitStream,pppBitStream + M_LDR pBitOffset,ppBitOffset + ;//Ending the macro + M_BD_FINI ppBitStream,pBitOffset + + MOV Return,#OMX_Sts_Err + +ExitOK + + M_END + ENDIF + END + + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s new file mode 100755 index 0000000..7208c21 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s @@ -0,0 +1,132 @@ +;/** +; * +; * File Name: omxVCM4P2_DecodeVLCZigzag_Inter_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; * Description: +; * Contains modules for zigzag scanning and VLC decoding +; * for inter block. +; * +; * +; * +; * Function: omxVCM4P2_DecodeVLCZigzag_Inter +; * +; * Description: +; * Performs VLC decoding and inverse zigzag scan for one inter coded block. +; * +; * Remarks: +; * +; * Parameters: +; * [in] ppBitStream pointer to the pointer to the current byte in +; * the bitstream buffer +; * [in] pBitOffset pointer to the bit position in the byte pointed +; * to by *ppBitStream. *pBitOffset is valid within [0-7]. 
+; * [in] shortVideoHeader binary flag indicating presence of short_video_header; +; * escape modes 0-3 are used if shortVideoHeader==0, +; * and escape mode 4 is used when shortVideoHeader==1. +; * [out] ppBitStream *ppBitStream is updated after the block is +; * decoded, so that it points to the current byte +; * in the bit stream buffer +; * [out] pBitOffset *pBitOffset is updated so that it points to the +; * current bit position in the byte pointed by +; * *ppBitStream +; * [out] pDst pointer to the coefficient buffer of current +; * block. Must be 16-byte aligned +; * +; * Return Value: +; * OMX_Sts_BadArgErr - bad arguments +; * -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or +; * -pDst is not 16-byte aligned, or +; * -*pBitOffset exceeds [0,7]. +; * OMX_Sts_Err - status error +; * -At least one mark bit is equal to zero +; * -Encountered an illegal stream code that cannot be found in the VLC table +; * -Encountered an illegal code in the VLC FLC table +; * -The number of coefficients is greater than 64 +; * +; */ + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE armCOMM_BitDec_s.h + + + M_VARIANTS ARM1136JS + + + + + + IF ARM1136JS + + ;// Import various tables needed for the function + + + IMPORT armVCM4P2_InterVlcL0L1 ;// Contains optimized and packed VLC Tables for both Last =1 and last=0 + ;// Packed in Run:Level:Last format + IMPORT armVCM4P2_InterL0L1LMAX ;// Contains LMAX table entries with both Last=0 and Last=1 + IMPORT armVCM4P2_InterL0L1RMAX ;// Contains RMAX table entries with both Last=0 and Last=1 + IMPORT armVCM4P2_aClassicalZigzagScan ;// contains classical Zigzag table entries with double the original values + IMPORT armVCM4P2_DecodeVLCZigzag_AC_unsafe + + + +;//Input Arguments + +ppBitStream RN 0 +pBitOffset RN 1 +pDst RN 2 +shortVideoHeader RN 3 + +;//Local Variables + +Return RN 0 + +pVlcTableL0L1 RN 4 +pLMAXTableL0L1 RN 4 +pRMAXTableL0L1 RN 4 +pZigzagTable RN 4 +Count RN 6 + + + 
+ ;// Allocate stack memory to store the VLC,Zigzag,LMAX and RMAX tables + ;// (all four table pointers alias r4, so each address is stored to its stack slot before r4 is reused) + + M_ALLOC4 ppVlcTableL0L1,4 + M_ALLOC4 ppLMAXTableL0L1,4 + M_ALLOC4 ppRMAXTableL0L1,4 + M_ALLOC4 ppZigzagTable,4 + + + M_START omxVCM4P2_DecodeVLCZigzag_Inter,r12 + + + + + LDR pZigzagTable, =armVCM4P2_aClassicalZigzagScan ;// Load zigzag table (inter blocks always use the first, classical, scan) + M_STR pZigzagTable,ppZigzagTable ;// Store zigzag table on stack to pass as argument to unsafe function + LDR pVlcTableL0L1, =armVCM4P2_InterVlcL0L1 ;// Load optimized VLC table with both L=0 and L=1 entries + M_STR pVlcTableL0L1,ppVlcTableL0L1 ;// Store optimized VLC table address on stack + LDR pLMAXTableL0L1, =armVCM4P2_InterL0L1LMAX ;// Load combined L=0 and L=1 LMAX Tables + M_STR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Store LMAX table address on stack + LDR pRMAXTableL0L1, =armVCM4P2_InterL0L1RMAX ;// Load combined L=0 and L=1 RMAX Tables + MOV Count,#0 ;// set start=0 + M_STR pRMAXTableL0L1,ppRMAXTableL0L1 ;// store RMAX table address on stack + + + BL armVCM4P2_DecodeVLCZigzag_AC_unsafe ;// call Unsafe Function for VLC Zigzag Decoding; r0 is not modified afterwards, so presumably its result is returned as-is + + + + M_END + ENDIF + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s new file mode 100755 index 0000000..9a37ec9 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s @@ -0,0 +1,136 @@ +;/** +; * +; * File Name: omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; * Description: +; * Contains modules for zigzag scanning and VLC decoding +; * for intra block. 
+; * +; * +; * +; * Function: omxVCM4P2_DecodeVLCZigzag_IntraACVLC +; * +; * Description: +; * Performs VLC decoding and inverse zigzag scan for one intra coded block. +; * +; * Remarks: +; * +; * Parameters: +; * [in] ppBitStream pointer to the pointer to the current byte in +; * the bitstream buffer +; * [in] pBitOffset pointer to the bit position in the byte pointed +; * to by *ppBitStream. *pBitOffset is valid within [0-7]. +; * [in] shortVideoHeader binary flag indicating presence of short_video_header; +; * escape modes 0-3 are used if shortVideoHeader==0, +; * and escape mode 4 is used when shortVideoHeader==1. +; * [out] ppBitStream *ppBitStream is updated after the block is +; * decoded, so that it points to the current byte +; * in the bit stream buffer +; * [out] pBitOffset *pBitOffset is updated so that it points to the +; * current bit position in the byte pointed to by +; * *ppBitStream +; * [out] pDst pointer to the coefficient buffer of current +; * block. Must be 16-byte aligned +; * +; * Return Value: +; * OMX_Sts_BadArgErr - bad arguments +; * -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or +; * -pDst is not 16-byte aligned, or +; * -*pBitOffset exceeds [0,7]. 
+; * OMX_Sts_Err - status error +; * -At least one mark bit is equal to zero +; * -Encountered an illegal stream code that cannot be found in the VLC table +; * -Encountered an illegal code in the VLC FLC table +; * -The number of coefficients is greater than 64 +; * +; */ + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE armCOMM_BitDec_s.h + + + M_VARIANTS ARM1136JS + + + + + + IF ARM1136JS + + ;// Import various tables needed for the function + + + IMPORT armVCM4P2_IntraVlcL0L1 ;// Contains optimized and packed VLC Tables for both Last =1 and last=0 + ;// Packed in Run:Level:Last format + IMPORT armVCM4P2_IntraL0L1LMAX ;// Contains LMAX table entries with both Last=0 and Last=1 + IMPORT armVCM4P2_IntraL0L1RMAX ;// Contains RMAX table entries with both Last=0 and Last=1 + IMPORT armVCM4P2_aClassicalZigzagScan ;// contains classical Zigzag table entries with double the original values + IMPORT armVCM4P2_DecodeVLCZigzag_AC_unsafe + +;//Input Arguments + +ppBitStream RN 0 +pBitOffset RN 1 +pDst RN 2 +PredDir RN 3 +shortVideoHeader RN 3 + +;//Local Variables + +Return RN 0 + +pVlcTableL0L1 RN 4 +pLMAXTableL0L1 RN 4 +pRMAXTableL0L1 RN 4 +pZigzagTable RN 4 +Count RN 6 + + + + ;// Allocate stack memory to store optimized VLC,Zigzag, RMAX, LMAX Table Addresses + + M_ALLOC4 ppVlcTableL0L1,4 + M_ALLOC4 ppLMAXTableL0L1,4 + M_ALLOC4 ppRMAXTableL0L1,4 + M_ALLOC4 ppZigzagTable,4 + + + M_START omxVCM4P2_DecodeVLCZigzag_IntraACVLC,r12 + + M_ARG shortVideoHeaderonStack,4 ;// pointer to Input Argument on stack + + LDR pZigzagTable, =armVCM4P2_aClassicalZigzagScan ;// Load Address of the Zigzag table + ADD pZigzagTable, pZigzagTable, PredDir, LSL #6 ;// Offset the zigzag table base by PredDir*64 bytes to select the scan for PredDir + + M_STR pZigzagTable,ppZigzagTable ;// Store Zigzag table address on stack + LDR pVlcTableL0L1, =armVCM4P2_IntraVlcL0L1 ;// Load optimized packed VLC Table with both L=0 and L=1 entries + M_STR pVlcTableL0L1,ppVlcTableL0L1 ;// Store VLC Table address on stack + LDR 
pLMAXTableL0L1, =armVCM4P2_IntraL0L1LMAX ;// Load LMAX Table + M_STR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Store LMAX Table address on Stack + LDR pRMAXTableL0L1, =armVCM4P2_IntraL0L1RMAX ;// Load RMAX Table + MOV Count,#0 ;// Set Start=0 + + M_STR pRMAXTableL0L1,ppRMAXTableL0L1 ;// Store RMAX Table address on stack + + + + M_LDR shortVideoHeader,shortVideoHeaderonStack ;// get the Input Argument from stack + + BL armVCM4P2_DecodeVLCZigzag_AC_unsafe ;// Call Unsafe Function + + + + + M_END + ENDIF + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s new file mode 100755 index 0000000..778aaf2 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s @@ -0,0 +1,224 @@ +;/** +; * +; * File Name: omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; * Description: +; * Contains modules for zigzag scanning and VLC decoding +; * for intra block. +; * +; * +; * +; * Function: omxVCM4P2_DecodeVLCZigzag_IntraDCVLC +; * +; * Description: +; * Performs VLC decoding and inverse zigzag scan for one intra coded block. +; * +; * Remarks: +; * +; * Parameters: +; * [in] ppBitStream pointer to the pointer to the current byte in +; * the bitstream buffer +; * [in] pBitOffset pointer to the bit position in the byte pointed +; * to by *ppBitStream. *pBitOffset is valid within [0-7]. +; * [in] shortVideoHeader binary flag indicating presence of short_video_header; +; * escape modes 0-3 are used if shortVideoHeader==0, +; * and escape mode 4 is used when shortVideoHeader==1. 
+; * [out] ppBitStream *ppBitStream is updated after the block is +; * decoded, so that it points to the current byte +; * in the bit stream buffer +; * [out] pBitOffset *pBitOffset is updated so that it points to the +; * current bit position in the byte pointed to by +; * *ppBitStream +; * [out] pDst pointer to the coefficient buffer of current +; * block. Must be 16-byte aligned +; * +; * Return Value: +; * OMX_Sts_BadArgErr - bad arguments +; * -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or +; * -pDst is not 16-byte aligned, or +; * -*pBitOffset exceeds [0,7]. +; * OMX_Sts_Err - status error +; * -At least one mark bit is equal to zero +; * -Encountered an illegal stream code that cannot be found in the VLC table +; * -Encountered an illegal code in the VLC FLC table +; * -The number of coefficients is greater than 64 +; * +; */ + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE armCOMM_BitDec_s.h + + + M_VARIANTS CortexA8 + + + + + + IF CortexA8 + + + ;// Import various tables needed for the function + + + IMPORT armVCM4P2_IntraVlcL0L1 ;// Contains optimized and packed VLC Tables for both Last =1 and last=0 + ;// Packed in Run:Level:Last format + IMPORT armVCM4P2_IntraL0L1LMAX ;// Contains LMAX table entries with both Last=0 and Last=1 + IMPORT armVCM4P2_IntraL0L1RMAX ;// Contains RMAX table entries with both Last=0 and Last=1 + IMPORT armVCM4P2_aClassicalZigzagScan ;// contains Classical, Horizontal, Vertical Zigzag table entries with double the original values + IMPORT armVCM4P2_aIntraDCLumaChromaIndex ;// Contains Optimized DCLuma and DCChroma Index table Entries + + + IMPORT armVCM4P2_DecodeVLCZigzag_AC_unsafe + +;//Input Arguments + +ppBitStream RN 0 +pBitOffset RN 1 +pDst RN 2 +PredDir RN 3 +shortVideoHeader RN 3 +videoComp RN 5 +;//Local Variables + +Return RN 0 + +pDCLumaChromaIndex RN 4 +pDCChromaIndex RN 7 +pVlcTableL0L1 RN 4 +pLMAXTableL0L1 RN 4 +pRMAXTableL0L1 RN 4 +pZigzagTable RN 4 +Count RN 
6 +DCValueSize RN 6 +powOfSize RN 7 +temp1 RN 5 + + +;// Scratch Registers + +RBitStream RN 8 +RBitBuffer RN 9 +RBitCount RN 10 + +T1 RN 11 +T2 RN 12 +DCVal RN 14 + + + ;// Allocate stack memory to store optimized VLC,Zigzag, RMAX, LMAX Table Addresses + + M_ALLOC4 ppVlcTableL0L1,4 + M_ALLOC4 ppLMAXTableL0L1,4 + M_ALLOC4 ppRMAXTableL0L1,4 + M_ALLOC4 ppZigzagTable,4 + M_ALLOC4 pDCCoeff,4 + + + + M_START omxVCM4P2_DecodeVLCZigzag_IntraDCVLC,r12 + + M_ARG shortVideoHeaderonStack,4 ;// Pointer to argument on stack + M_ARG videoComponstack,4 ;// Pointer to argument on stack + + + ;// Decode DC Coefficient + + + LDR pDCLumaChromaIndex, =armVCM4P2_aIntraDCLumaChromaIndex ;// Load Optimized VLC Table for Luminance and Chrominance + + ;// Initializing the Bitstream Macro + + M_BD_INIT0 ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount + M_LDR videoComp,videoComponstack ;// Load the videoComp argument from stack + M_BD_INIT1 T1, T2, T2 + ADD pDCLumaChromaIndex,pDCLumaChromaIndex,videoComp, LSL #6 ;// Offset the table base by videoComp*64 bytes + M_BD_INIT2 T1, T2, T2 + + + M_BD_VLD DCValueSize,T1,T2,pDCLumaChromaIndex,4,2 ;// VLC Decode using optimized Luminance and Chrominance VLC Table + + + + +DecodeDC + + CMP DCValueSize,#12 ;// A DC size above 12 is treated as an error + BGT ExitError + + CMP DCValueSize,#0 + MOVEQ DCVal,#0 ;// If DCValueSize is zero then DC coeff =0 + BEQ ACDecode ;// Branch to perform AC Coeff Decoding + + M_BD_VREAD16 DCVal,DCValueSize,T1,T2 ;// Get DC Value From Bit stream + + + MOV powOfSize,#1 + LSL powOfSize,DCValueSize ;// powOfSize=pow(2,DCValueSize) + CMP DCVal,powOfSize,LSR #1 ;// Compare DCVal with powOfSize/2 + ADDLT DCVal,DCVal,#1 + SUBLT DCVal,DCVal,powOfSize ;// If less than powOfSize/2 DCVal=DCVal-powOfSize+1 + ;// Else DCVal= fetchbits from bit stream + +CheckDCValueSize + + CMP DCValueSize,#8 ;// If DCValueSize greater than 8 check marker bit + + BLE ACDecode + + M_BD_READ8 temp1,1,T1 ;// Read the marker bit + TEQ temp1,#0 ;// If Marker bit is zero Exit with an Error Message + BEQ ExitError + + + + ;// Decode AC Coefficient + +ACDecode + + M_STR DCVal,pDCCoeff ;// Store Decoded DC 
Coeff on Stack + M_BD_FINI ppBitStream,pBitOffset ;// Terminating the Bit stream Macro + + LDR pZigzagTable, =armVCM4P2_aClassicalZigzagScan ;// Load Zigzag table address + ADD pZigzagTable, pZigzagTable, PredDir, LSL #6 ;// Modify the Zigzag table address based on PredDir + + M_STR pZigzagTable,ppZigzagTable ;// Store zigzag table on stack + LDR pVlcTableL0L1, =armVCM4P2_IntraVlcL0L1 ;// Load Optimized VLC Table With both Last=0 and Last=1 Entries + M_STR pVlcTableL0L1,ppVlcTableL0L1 ;// Store Optimized VLC Table on stack + LDR pLMAXTableL0L1, =armVCM4P2_IntraL0L1LMAX ;// Load LMAX Table + M_STR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Store LMAX table on stack + LDR pRMAXTableL0L1, =armVCM4P2_IntraL0L1RMAX ;// Load RMAX Table + MOV Count,#1 ;// Set Start =1 + + M_STR pRMAXTableL0L1,ppRMAXTableL0L1 ;// Store RMAX Table on Stack + + + M_LDR shortVideoHeader,shortVideoHeaderonStack ;// Load the Input Argument From Stack + + BL armVCM4P2_DecodeVLCZigzag_AC_unsafe ;// Call the Unsafe Function + + M_LDR DCVal,pDCCoeff ;// Get the Decoded DC Value From Stack + STRH DCVal,[pDst] ;// Store the DC Value + B ExitOK + + + +ExitError + + M_BD_FINI ppBitStream,pBitOffset ;// Terminating the Bit Stream Macro in case of an Error + MOV Return,#OMX_Sts_Err ;// Exit with an Error Message +ExitOK + + M_END + ENDIF + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s new file mode 100755 index 0000000..caf7121 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s @@ -0,0 +1,194 @@ +;// +;// +;// File Name: omxVCM4P2_FindMVpred_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+;// +;// +;// + +;// Function: +;// omxVCM4P2_FindMVpred +;// + ;// Include headers + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + INCLUDE armVCCOMM_s.h + + ;// Define cpu variants + M_VARIANTS CortexA8 + + + IF CortexA8 + + M_TABLE armVCM4P2_pBlkIndexTable + DCD OMXVCBlk0, OMXVCBlk1 + DCD OMXVCBlk2, OMXVCBlk3 + +;//-------------------------------------------- +;// Declare input registers +;//-------------------------------------------- + +pSrcMVCurMB RN 0 +pSrcCandMV1 RN 1 +pSrcCandMV2 RN 2 +pSrcCandMV3 RN 3 +pDstMVPred RN 4 +pDstMVPredME RN 5 +iBlk RN 6 + +pTable RN 4 +CandMV RN 12 + +pCandMV1 RN 7 +pCandMV2 RN 8 +pCandMV3 RN 9 + +CandMV1dx RN 0 +CandMV1dy RN 1 +CandMV2dx RN 2 +CandMV2dy RN 3 +CandMV3dx RN 10 +CandMV3dy RN 11 + +temp RN 14 + +zero RN 14 +return RN 0 + +; ---------------------------------------------- +; Main routine +; ---------------------------------------------- + + M_ALLOC4 MV, 4 + + ;// Function header + M_START omxVCM4P2_FindMVpred, r11 + + ;// Define stack arguments + M_ARG ppDstMVPred, 4 + M_ARG ppDstMVPredME, 4 + M_ARG Blk, 4 + + M_ADR CandMV, MV + MOV zero, #0 + M_LDR iBlk, Blk + + ;// Set the default value for these + ;// to be used if pSrcCandMV[1|2|3] == NULL + MOV pCandMV1, CandMV + MOV pCandMV2, CandMV + MOV pCandMV3, CandMV + + STR zero, [CandMV] + + ;// Branch to the case based on blk number + M_SWITCH iBlk + M_CASE OMXVCBlk0 ;// iBlk=0 + M_CASE OMXVCBlk1 ;// iBlk=1 + M_CASE OMXVCBlk2 ;// iBlk=2 + M_CASE OMXVCBlk3 ;// iBlk=3 + M_ENDSWITCH + +OMXVCBlk0 + CMP pSrcCandMV1, #0 + ADDNE pCandMV1, pSrcCandMV1, #4 + + CMP pSrcCandMV2, #0 + ADDNE pCandMV2, pSrcCandMV2, #8 + + CMP pSrcCandMV3, #0 + ADDNE pCandMV3, pSrcCandMV3, #8 + CMPEQ pSrcCandMV1, #0 + + MOVEQ pCandMV3, pCandMV2 ;// Candidates 3 and 1 both NULL: use candidate 2 for all three + MOVEQ pCandMV1, pCandMV2 + + CMP pSrcCandMV1, #0 + CMPEQ pSrcCandMV2, #0 + + MOVEQ pCandMV1, pCandMV3 ;// Candidates 1 and 2 both NULL: use candidate 3 for all three + MOVEQ pCandMV2, pCandMV3 + + CMP pSrcCandMV2, #0 + CMPEQ pSrcCandMV3, #0 + + MOVEQ pCandMV2, pCandMV1 ;// Candidates 2 and 3 both NULL: use candidate 1 for all three + MOVEQ pCandMV3, pCandMV1 + + B BlkEnd + +OMXVCBlk1 
+ MOV pCandMV1, pSrcMVCurMB + CMP pSrcCandMV3, #0 + ADDNE pCandMV3, pSrcCandMV3, #8 + + CMP pSrcCandMV2, #0 + ADDNE pCandMV2, pSrcCandMV2, #12 + + CMPEQ pSrcCandMV3, #0 + + MOVEQ pCandMV2, pCandMV1 ;// Candidates 2 and 3 both NULL: use candidate 1 (from pSrcMVCurMB) for all three + MOVEQ pCandMV3, pCandMV1 + + B BlkEnd + +OMXVCBlk2 + CMP pSrcCandMV1, #0 + MOV pCandMV2, pSrcMVCurMB + ADD pCandMV3, pSrcMVCurMB, #4 + ADDNE pCandMV1, pSrcCandMV1, #12 + B BlkEnd + +OMXVCBlk3 + ADD pCandMV1, pSrcMVCurMB, #8 + MOV pCandMV2, pSrcMVCurMB + ADD pCandMV3, pSrcMVCurMB, #4 + +BlkEnd + + ;// Load the dx/dy components of the three selected candidate MVs; + ;// a candidate still pointing at the default slot reads the zeroed MV + LDRSH CandMV1dx, [pCandMV1], #2 + LDRSH CandMV2dx, [pCandMV2], #2 + LDRSH CandMV3dx, [pCandMV3], #2 + + ;// Load argument from the stack + M_LDR pDstMVPredME, ppDstMVPredME + + LDRSH CandMV1dy, [pCandMV1] + LDRSH CandMV2dy, [pCandMV2] + LDRSH CandMV3dy, [pCandMV3] + + CMP pDstMVPredME, #0 + + ;// Store the candidate MV's into the pDstMVPredME, + ;// these can be used in the fast algorithm if implemented + + STRHNE CandMV1dx, [pDstMVPredME], #2 + STRHNE CandMV1dy, [pDstMVPredME], #2 + STRHNE CandMV2dx, [pDstMVPredME], #2 + STRHNE CandMV2dy, [pDstMVPredME], #2 + STRHNE CandMV3dx, [pDstMVPredME], #2 + STRHNE CandMV3dy, [pDstMVPredME] + + ; Find the median of the 3 candidate MV's + M_MEDIAN3 CandMV1dx, CandMV2dx, CandMV3dx, temp + + ;// Load argument from the stack + M_LDR pDstMVPred, ppDstMVPred + + M_MEDIAN3 CandMV1dy, CandMV2dy, CandMV3dy, temp + + STRH CandMV3dx, [pDstMVPred], #2 + STRH CandMV3dy, [pDstMVPred] + + MOV return, #OMX_Sts_NoErr + + M_END + ENDIF ;// CortexA8 + + END
\ No newline at end of file diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s new file mode 100755 index 0000000..b5e3d0d --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s @@ -0,0 +1,73 @@ +;// +;// +;// File Name: omxVCM4P2_IDCT8x8blk_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + +;// Function: +;// omxVCM4P2_IDCT8x8blk +;// + ;// Include headers + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + ;// Define cpu variants + M_VARIANTS CortexA8 + + INCLUDE armCOMM_IDCT_s.h + + IMPORT armCOMM_IDCTPreScale + ;// + ;// Function prototype + ;// + ;// OMXResult + ;// omxVCM4P2_IDCT8x8blk(const OMX_S16* pSrc, + ;// OMX_S16* pDst) + ;// + + IF CortexA8 + M_ALLOC4 ppDest, 4 + M_ALLOC4 pStride, 4 + M_ALLOC8 pBlk, 2*8*8 + ENDIF + + + IF CortexA8 + M_START omxVCM4P2_IDCT8x8blk, r11, d15 + ENDIF + + IF CortexA8 + +;// Declare input registers +pSrc RN 0 +pDst RN 1 + +;// Declare other intermediate registers +Result RN 0 + +;// Prototype for macro M_IDCT +;// pSrc RN 0 ;// source data buffer +;// Stride RN 1 ;// destination stride in bytes +;// pDest RN 2 ;// destination data buffer +;// pScale RN 3 ;// pointer to scaling table + +pSrc RN 0 +Stride RN 1 +pDest RN 2 +pScale RN 3 + + MOV pDest, pDst ;// M_IDCT takes the destination pointer in pDest (r2) + LDR pScale, =armCOMM_IDCTPreScale ;// pScale = address of the pre-scale table + M_IDCT s9, s16, 16 + MOV Result, #OMX_Sts_NoErr + M_END + ENDIF + ;// CortexA8 + + END diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s new file mode 100755 index 0000000..dd00df5 --- /dev/null +++ 
b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s @@ -0,0 +1,444 @@ +;// +;// +;// File Name: omxVCM4P2_MCReconBlock_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// +;// Description: +;// +;// + +;// Include standard headers + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + +;// Import symbols required from other files + + M_VARIANTS CortexA8 + +;// *************************************************************************** +;// ARM1136JS implementation +;// *************************************************************************** + +;// *************************************************************************** +;// CortexA8 implementation +;// *************************************************************************** + IF CortexA8 +;// *************************************************************************** +;// MACRO DEFINITIONS +;// *************************************************************************** + ;// Description: + ;// Does interpolation for the case of "IntegerPixel" predictType. Both + ;// rounding cases are handled. 
Just copies a block from pSrc to pDst + ;// + ;// Syntax: + ;// M_MCRECONBLOCK_IntegerPixel + ;// + ;// Inputs: None + ;// Outputs: None + + MACRO + M_MCRECONBLOCK_IntegerPixel +CaseIntegerPixel_Rnd0 +CaseIntegerPixel_Rnd1 + + VLD1 dRow0, [pSrc], srcStep + VLD1 dRow1, [pSrc], srcStep + VLD1 dRow2, [pSrc], srcStep + VLD1 dRow3, [pSrc], srcStep + VLD1 dRow4, [pSrc], srcStep + VLD1 dRow5, [pSrc], srcStep + VLD1 dRow6, [pSrc], srcStep + VLD1 dRow7, [pSrc], srcStep + + VST1 dRow0, [pDst@64], dstStep + VST1 dRow1, [pDst@64], dstStep + VST1 dRow2, [pDst@64], dstStep + VST1 dRow3, [pDst@64], dstStep + VST1 dRow4, [pDst@64], dstStep + VST1 dRow5, [pDst@64], dstStep + VST1 dRow6, [pDst@64], dstStep + VST1 dRow7, [pDst@64], dstStep + + B SwitchPredictTypeEnd + MEND +;// *************************************************************************** + ;// Description: + ;// Does interpolation for the case of "HalfPixelX" predictType. The two + ;// rounding cases are handled by the parameter "$rndVal". Averages between + ;// a pixel and pixel right to it, rounding it based on $rndVal. The + ;// rounding is implemented by using opCode switching between "VRHADD" and + ;// "VHADD" instructions. 
+ ;// + ;// Syntax: + ;// M_MCRECONBLOCK_HalfPixelX $rndVal + ;// + ;// Inputs: + ;// $rndVal: 0 for rounding and 1 for no rounding + ;// Outputs: None + + MACRO + M_MCRECONBLOCK_HalfPixelX $rndVal + + LCLS M_VHADDR + IF $rndVal = 0 +M_VHADDR SETS "VRHADD" + ELSE +M_VHADDR SETS "VHADD" + ENDIF + +CaseHalfPixelX_Rnd$rndVal + + VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep + VEXT dRow0Shft, dRow0, dRow0Shft, #1 + VLD1 {dRow1, dRow1Shft}, [pSrc], srcStep + VEXT dRow1Shft, dRow1, dRow1Shft, #1 + VLD1 {dRow2, dRow2Shft}, [pSrc], srcStep + VEXT dRow2Shft, dRow2, dRow2Shft, #1 + VLD1 {dRow3, dRow3Shft}, [pSrc], srcStep + VEXT dRow3Shft, dRow3, dRow3Shft, #1 + VLD1 {dRow4, dRow4Shft}, [pSrc], srcStep + VEXT dRow4Shft, dRow4, dRow4Shft, #1 + VLD1 {dRow5, dRow5Shft}, [pSrc], srcStep + VEXT dRow5Shft, dRow5, dRow5Shft, #1 + VLD1 {dRow6, dRow6Shft}, [pSrc], srcStep + VEXT dRow6Shft, dRow6, dRow6Shft, #1 + VLD1 {dRow7, dRow7Shft}, [pSrc], srcStep + VEXT dRow7Shft, dRow7, dRow7Shft, #1 + $M_VHADDR dRow0, dRow0, dRow0Shft + $M_VHADDR dRow1, dRow1, dRow1Shft + VST1 dRow0, [pDst@64], dstStep + $M_VHADDR dRow2, dRow2, dRow2Shft + VST1 dRow1, [pDst@64], dstStep + $M_VHADDR dRow3, dRow3, dRow3Shft + VST1 dRow2, [pDst@64], dstStep + $M_VHADDR dRow4, dRow4, dRow4Shft + VST1 dRow3, [pDst@64], dstStep + $M_VHADDR dRow5, dRow5, dRow5Shft + VST1 dRow4, [pDst@64], dstStep + $M_VHADDR dRow6, dRow6, dRow6Shft + VST1 dRow5, [pDst@64], dstStep + $M_VHADDR dRow7, dRow7, dRow7Shft + VST1 dRow6, [pDst@64], dstStep + VST1 dRow7, [pDst@64], dstStep + + B SwitchPredictTypeEnd + MEND +;// *************************************************************************** + ;// Description: + ;// Does interpolation for the case of "HalfPixelY" predictType. The two + ;// rounding cases are handled by the parameter "$rndVal". Averages between + ;// a pixel and pixel below it, rounding it based on $rndVal. The + ;// rounding is implemented by using opCode switching between "VRHADD" and + ;// "VHADD" instructions. 
+ ;// + ;// Syntax: + ;// M_MCRECONBLOCK_HalfPixelY $rndVal + ;// + ;// Inputs: + ;// $rndVal: 0 for rounding and 1 for no rounding + ;// Outputs: None + + MACRO + M_MCRECONBLOCK_HalfPixelY $rndVal + + LCLS M_VHADDR + IF $rndVal = 0 +M_VHADDR SETS "VRHADD" + ELSE +M_VHADDR SETS "VHADD" + ENDIF + +CaseHalfPixelY_Rnd$rndVal + VLD1 dRow0, [pSrc], srcStep + VLD1 dRow1, [pSrc], srcStep + VLD1 dRow2, [pSrc], srcStep + VLD1 dRow3, [pSrc], srcStep + VLD1 dRow4, [pSrc], srcStep + VLD1 dRow5, [pSrc], srcStep + VLD1 dRow6, [pSrc], srcStep + VLD1 dRow7, [pSrc], srcStep + $M_VHADDR dRow0, dRow0, dRow1 + VLD1 dRow8, [pSrc], srcStep ;// Ninth row, averaged with row 7 below + $M_VHADDR dRow1, dRow1, dRow2 + VST1 dRow0, [pDst@64], dstStep + $M_VHADDR dRow2, dRow2, dRow3 + VST1 dRow1, [pDst@64], dstStep + $M_VHADDR dRow3, dRow3, dRow4 + VST1 dRow2, [pDst@64], dstStep + $M_VHADDR dRow4, dRow4, dRow5 + VST1 dRow3, [pDst@64], dstStep + $M_VHADDR dRow5, dRow5, dRow6 + VST1 dRow4, [pDst@64], dstStep + $M_VHADDR dRow6, dRow6, dRow7 + VST1 dRow5, [pDst@64], dstStep + $M_VHADDR dRow7, dRow7, dRow8 + VST1 dRow6, [pDst@64], dstStep + VST1 dRow7, [pDst@64], dstStep + + B SwitchPredictTypeEnd + MEND +;// *************************************************************************** + ;// Description: + ;// Does interpolation for the case of "HalfPixelXY" predictType. Both + ;// rounding cases are handled. + ;// Typical computation for a row goes like this + ;// 1. VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes + ;// 2. VEXT dRow0Shft, dRow0, dRow0Shft, #1 ;// Generate the shifted row + ;// 3. VADDL qSum0, dRow0, dRow0Shft ;// Generate the sum of row and shifted row + ;// 5. VADD qSum0, qSum0, qSum1 ;// Add to the sum of next row (odd row sum has rounding value added to it) + ;// 6. VSHRN dRow0, qSum0, #2 ;// Divide by 4 + ;// 7. VST1 dRow0, [pDst@64], dstStep ;// Store + ;// Odd rows undergo following computation after step 3 + ;// 4. 
VADD qSum1, qSum1, qRound + ;// This saves for adding rounding value to each final sum (overall saves 4 + ;// instructions). + ;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes + ;// care of this and also minimizes stalls. Rounding value was modified in + ;// ARM register rndVal (originally used for rounding flag) before the switch. + ;// It is then populated into all lanes in this macro. No branching out to + ;// label "SwitchPredictTypeEnd" is required in the end of the macro as these + ;// are the last of switch cases. + ;// + ;// Syntax: + ;// M_MCRECONBLOCK_HalfPixelXY + ;// + ;// Inputs: None + ;// Outputs: None + + MACRO + M_MCRECONBLOCK_HalfPixelXY + +CaseHalfPixelXY_Rnd0 +CaseHalfPixelXY_Rnd1 + VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep + VDUP qRound, rndVal + VLD1 {dRow1, dRow1Shft}, [pSrc], srcStep + VEXT dRow0Shft, dRow0, dRow0Shft, #1 + VLD1 {dRow2, dRow2Shft}, [pSrc], srcStep + VEXT dRow1Shft, dRow1, dRow1Shft, #1 + VLD1 {dRow3, dRow3Shft}, [pSrc], srcStep + VEXT dRow2Shft, dRow2, dRow2Shft, #1 + VLD1 {dRow4, dRow4Shft}, [pSrc], srcStep + VADDL qSum0, dRow0, dRow0Shft + VLD1 {dRow5, dRow5Shft}, [pSrc], srcStep + VADDL qSum1, dRow1, dRow1Shft + VLD1 {dRow6, dRow6Shft}, [pSrc], srcStep + VEXT dRow3Shft, dRow3, dRow3Shft, #1 + VLD1 {dRow7, dRow7Shft}, [pSrc], srcStep + VEXT dRow4Shft, dRow4, dRow4Shft, #1 + VLD1 {dRow8, dRow8Shft}, [pSrc], srcStep + VADD qSum1, qSum1, qRound + VADDL qSum2, dRow2, dRow2Shft + VEXT dRow5Shft, dRow5, dRow5Shft, #1 + VADD qSum0, qSum0, qSum1 + VADDL qSum3, dRow3, dRow3Shft + VEXT dRow6Shft, dRow6, dRow6Shft, #1 + VADD qSum1, qSum1, qSum2 + VSHRN dRow0, qSum0, #2 + VADDL qSum4, dRow4, dRow4Shft + VSHRN dRow1, qSum1, #2 + VADD qSum3, qSum3, qRound + VADDL qSum5, dRow5, dRow5Shft + VST1 dRow0, [pDst@64], dstStep + VEXT dRow7Shft, dRow7, dRow7Shft, #1 + VST1 dRow1, [pDst@64], dstStep + VEXT dRow8Shft, dRow8, dRow8Shft, #1 + VADD qSum5, qSum5, qRound + VADD qSum2, qSum2, qSum3 + VADD qSum3, qSum3, 
qSum4 + VADD qSum4, qSum4, qSum5 + VSHRN dRow2, qSum2, #2 + VSHRN dRow3, qSum3, #2 + VSHRN dRow4, qSum4, #2 + VADDL qSum6, dRow6, dRow6Shft + VADDL qSum7, dRow7, dRow7Shft + VST1 dRow2, [pDst@64], dstStep + VADDL qSum8, dRow8, dRow8Shft + VADD qSum7, qSum7, qRound + VST1 dRow3, [pDst@64], dstStep + VST1 dRow4, [pDst@64], dstStep + VADD qSum5, qSum5, qSum6 + VADD qSum6, qSum6, qSum7 + VADD qSum7, qSum7, qSum8 + VSHRN dRow5, qSum5, #2 + VSHRN dRow6, qSum6, #2 + VSHRN dRow7, qSum7, #2 + VST1 dRow5, [pDst@64], dstStep + VST1 dRow6, [pDst@64], dstStep + VST1 dRow7, [pDst@64], dstStep + + MEND +;// *************************************************************************** + +;// Input/Output Registers +pSrc RN 0 +srcStep RN 1 +pSrcResidue RN 2 +pDst RN 3 +dstStep RN 4 +predictType RN 5 +rndVal RN 6 + +;// Local Scratch Registers +pDstCopy RN 0 +return RN 0 + +;// Neon Registers +dRow0 DN D0.U8 +dRow0Shft DN D1.U8 +dRow1 DN D2.U8 +dRow1Shft DN D3.U8 +dRow2 DN D4.U8 +dRow2Shft DN D5.U8 +dRow3 DN D6.U8 +dRow3Shft DN D7.U8 +dRow4 DN D8.U8 +dRow4Shft DN D9.U8 +dRow5 DN D10.U8 +dRow5Shft DN D11.U8 +dRow6 DN D12.U8 +dRow6Shft DN D13.U8 +dRow7 DN D14.U8 +dRow7Shft DN D15.U8 +dRow8 DN D16.U8 +dRow8Shft DN D17.U8 + + +qSum0 QN Q9.U16 +qSum1 QN Q10.U16 +qSum2 QN Q11.U16 +qSum3 QN Q12.U16 +qSum4 QN Q13.U16 +qSum5 QN Q14.U16 +qSum6 QN Q0.U16 +qSum7 QN Q1.U16 +qSum8 QN Q2.U16 + +qRound QN Q15.U16 + +dDst0 DN D0.U8 +dDst1 DN D1.U8 +dDst2 DN D2.U8 +dDst3 DN D3.U8 +dDst4 DN D4.U8 +dDst5 DN D5.U8 +dDst6 DN D6.U8 +dDst7 DN D7.U8 + +qRes0 QN Q4.S16 +qRes1 QN Q5.S16 +qRes2 QN Q6.S16 +qRes3 QN Q7.S16 +qRes4 QN Q8.S16 +qRes5 QN Q9.S16 +qRes6 QN Q10.S16 +qRes7 QN Q11.S16 + + ;// Function header + M_START omxVCM4P2_MCReconBlock, r6, d15 + ;// Define stack arguments + M_ARG Arg_dstStep, 4 + M_ARG Arg_predictType, 4 + M_ARG Arg_rndVal, 4 + ;// Load argument from the stack + M_LDR dstStep, Arg_dstStep + M_LDR predictType, Arg_predictType + M_LDR rndVal, Arg_rndVal + ADD predictType, rndVal, 
predictType, LSL #1 + RSB rndVal, rndVal, #2 ;// preparing rndVal for HalfPixelXY + + ;// The following is implementation of switching to different code segments + ;// based on different predictType and rndVal flags. The corresponding + ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following + ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). While "M_MCRECONBLOCK_IntegerPixel" + ;// and "M_MCRECONBLOCK_HalfPixelXY" handle for both rounding cases; + ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" macros handle + ;// the two rounding cases in separate code bases. + ;// All these together implement the interpolation functionality + + M_SWITCH predictType + M_CASE CaseIntegerPixel_Rnd0 + M_CASE CaseIntegerPixel_Rnd1 + M_CASE CaseHalfPixelX_Rnd0 + M_CASE CaseHalfPixelX_Rnd1 + M_CASE CaseHalfPixelY_Rnd0 + M_CASE CaseHalfPixelY_Rnd1 + M_CASE CaseHalfPixelXY_Rnd0 + M_CASE CaseHalfPixelXY_Rnd1 + M_ENDSWITCH + + M_MCRECONBLOCK_IntegerPixel + M_MCRECONBLOCK_HalfPixelX 0 + M_MCRECONBLOCK_HalfPixelX 1 + M_MCRECONBLOCK_HalfPixelY 0 + M_MCRECONBLOCK_HalfPixelY 1 + M_MCRECONBLOCK_HalfPixelXY +SwitchPredictTypeEnd + + ;// After interpolation is done, residue needs to be added. This is done + ;// only in case "pSrcResidue" parameter to the function is not NULL. + ;// Following is a completely unrolled code to do so. Each row and + ;// corresponding residue is loaded and residue is added and value + ;// stored + + CMP pSrcResidue, #0 + SUBNE pDst, pDst, dstStep, LSL #3 ;// Restoring pDst + MOVNE pDstCopy, pDst + BEQ pSrcResidueConditionEnd +pSrcResidueNotNull + VLD1 dDst0, [pDst@64], dstStep + VLD1 qRes0, [pSrcResidue@128]! + VLD1 dDst1, [pDst@64], dstStep + VLD1 qRes1, [pSrcResidue@128]! + VLD1 dDst2, [pDst@64], dstStep + VLD1 qRes2, [pSrcResidue@128]! + VADDW qRes0, qRes0, dDst0 + VLD1 dDst3, [pDst@64], dstStep + VADDW qRes1, qRes1, dDst1 + VLD1 qRes3, [pSrcResidue@128]! 
+ VADDW qRes2, qRes2, dDst2 + VLD1 dDst4, [pDst@64], dstStep + VQMOVUN dDst0, qRes0 + VLD1 qRes4, [pSrcResidue@128]! + VADDW qRes3, qRes3, dDst3 + VLD1 dDst5, [pDst@64], dstStep + VQMOVUN dDst1, qRes1 + VLD1 qRes5, [pSrcResidue@128]! + VADDW qRes4, qRes4, dDst4 + VLD1 dDst6, [pDst@64], dstStep + VQMOVUN dDst2, qRes2 + VLD1 qRes6, [pSrcResidue@128]! + VADDW qRes5, qRes5, dDst5 + VLD1 dDst7, [pDst@64], dstStep + VQMOVUN dDst3, qRes3 + VLD1 qRes7, [pSrcResidue@128]! + VADDW qRes6, qRes6, dDst6 + VST1 dDst0, [pDstCopy@64], dstStep + VQMOVUN dDst4, qRes4 + VST1 dDst1, [pDstCopy@64], dstStep + VADDW qRes7, qRes7, dDst7 + VST1 dDst2, [pDstCopy@64], dstStep + VQMOVUN dDst5, qRes5 + VST1 dDst3, [pDstCopy@64], dstStep + VQMOVUN dDst6, qRes6 + VST1 dDst4, [pDstCopy@64], dstStep + VQMOVUN dDst7, qRes7 + VST1 dDst5, [pDstCopy@64], dstStep + VST1 dDst6, [pDstCopy@64], dstStep + VST1 dDst7, [pDstCopy@64], dstStep + +pSrcResidueConditionEnd + MOV return, #OMX_Sts_NoErr + + M_END + ENDIF ;// CortexA8 + END +;// *************************************************************************** +;// omxVCM4P2_MCReconBlock ends +;// *************************************************************************** diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s new file mode 100755 index 0000000..a73f64a --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s @@ -0,0 +1,320 @@ +; ********** +; * +; * File Name: omxVCM4P2_PredictReconCoefIntra_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 
+; * +; * +; * +; * Description: +; * Contains module for DC/AC coefficient prediction +; * +; * +; * Function: omxVCM4P2_PredictReconCoefIntra +; * +; * Description: +; * Performs adaptive DC/AC coefficient prediction for an intra block. Prior +; * to the function call, prediction direction (predDir) should be selected +; * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2. +; * +; * Remarks: +; * +; * Parameters: +; * [in] pSrcDst pointer to the coefficient buffer which contains the +; * quantized coefficient residuals (PQF) of the current +; * block; must be aligned on a 4-byte boundary. The +; * output coefficients are saturated to the range +; * [-2048, 2047]. +; * [in] pPredBufRow pointer to the coefficient row buffer; must be aligned +; * on a 4-byte boundary. +; * [in] pPredBufCol pointer to the coefficient column buffer; must be +; * aligned on a 4-byte boundary. +; * [in] curQP quantization parameter of the current block. curQP may +; * be equal to predQP, especially when the current block and +; * the predictor block are in the same macroblock. +; * [in] predQP quantization parameter of the predictor block +; * [in] predDir indicates the prediction direction which takes one +; * of the following values: +; * OMX_VIDEO_HORIZONTAL predict horizontally +; * OMX_VIDEO_VERTICAL predict vertically +; * [in] ACPredFlag a flag indicating if AC prediction should be +; * performed. 
It is equal to ac_pred_flag in the bit +; * stream syntax of MPEG-4 +; * [in] videoComp video component type (luminance, chrominance or +; * alpha) of the current block +; * [out] pSrcDst pointer to the coefficient buffer which contains +; * the quantized coefficients (QF) of the current +; * block +; * [out] pPredBufRow pointer to the updated coefficient row buffer +; * [out] pPredBufCol pointer to the updated coefficient column buffer +; * Return Value: +; * OMX_Sts_NoErr - no error +; * OMX_Sts_BadArgErr - Bad arguments +; * - At least one of the pointers is NULL: pSrcDst, pPredBufRow, or pPredBufCol. +; * - At least one of the following cases: curQP <= 0, predQP <= 0, curQP > 31, +; * predQP > 31, predDir is outside [1,2]. +; * - At least one of the pointers pSrcDst, pPredBufRow, or pPredBufCol is not +; * 4-byte aligned. +; * +; ********* + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + + + IMPORT armVCM4P2_Reciprocal_QP_S32 + IMPORT armVCM4P2_Reciprocal_QP_S16 + IMPORT armVCM4P2_DCScaler + + IF CortexA8 +;// Input Arguments + +pSrcDst RN 0 +pPredBufRow RN 1 +pPredBufCol RN 2 +curQP RN 3 +QP RN 3 +predQP RN 4 +predDir RN 5 +ACPredFlag RN 6 +videoComp RN 7 + +;// Local Variables + +shortVideoHeader RN 4 +dcScaler RN 4 +index RN 6 +predCoeffTable RN 7 +temp1 RN 6 +temp2 RN 9 +temp RN 14 +Const RN 8 +temppPredColBuf RN 8 +tempPred RN 9 + +absCoeffDC RN 8 +negdcScaler RN 10 +Rem RN 11 +temp3 RN 12 + +dcRowbufCoeff RN 10 +dcColBuffCoeff RN 11 +Return RN 0 + +;//NEON Registers + +qPredRowBuf QN Q0.S16 +dPredRowBuf0 DN D0.S16 +dPredRowBuf1 DN D1.S16 + + + + +qCoeffTab QN Q1.S32 + +qPredQP QN Q2.S16 +dPredQP0 DN D4.S16 +dPredQP1 DN D5.S16 + + +qtemp1 QN Q3.S32 +qtemp QN Q3.S16 + +dtemp0 DN D6.S16 +dtemp1 DN D7.S16 + +dtemp2 DN D8.S16 +dtemp3 DN D9.S16 + +dtemp4 DN D2.S16 +dtemp5 DN D3.S16 +dtemp6 DN D4.S16 +dtemp7 DN D5.S16 + +qtempPred1 QN Q5.S32 +qtempPred QN Q5.S16 + +dtempPred0 DN D10.S16 +dtempPred1 DN D11.S16 + + + + M_START 
omxVCM4P2_PredictReconCoefIntra,r11,d11 + + ;// Assigning pointers to Input arguments on Stack + + M_ARG predQPonStack,4 + M_ARG predDironStack,4 + M_ARG ACPredFlagonStack,4 + M_ARG videoComponStack,4 + + ;// DC Prediction + + M_LDR videoComp,videoComponStack ;// Load videoComp From Stack + + M_LDR predDir,predDironStack ;// Load Prediction direction + ;// DC Scaler calculation + LDR index, =armVCM4P2_DCScaler + ADD index,index,videoComp,LSL #5 ;// index=table base+32*videoComp + LDRB dcScaler,[index,QP] ;// dcScaler=byte at offset QP in that 32-byte row + + + LDR predCoeffTable, =armVCM4P2_Reciprocal_QP_S16 ;// Loading the table with entries 32767/(1 to 63) + CMP predDir,#2 ;// Check if the Prediction direction is vertical + + ;// Calculate tempPred + + LDREQSH absCoeffDC,[pPredBufRow] ;// If vertical load the coeff from Row Prediction Buffer + LDRNESH absCoeffDC,[pPredBufCol] ;// If horizontal load the coeff from column Prediction Buffer + + RSB negdcScaler,dcScaler,#0 ;// negdcScaler=-dcScaler + MOV temp1,absCoeffDC ;// Keep the signed prediction coeff in temp1 for the sign comparisons + CMP temp1,#0 + RSBLT absCoeffDC,temp1,#0 ;// calculate absolute val of prediction coeff + + ADD temp,dcScaler,dcScaler ;// temp=2*dcScaler (byte offset into the halfword table) + LDRH temp,[predCoeffTable,temp] ;// Load value from coeff table for performing division using multiplication + SMULBB tempPred,temp,absCoeffDC ;// tempPred=abs(pPredBufRow(Col)[0])*32767/dcScaler + ADD temp3,dcScaler,#1 + LSR tempPred,tempPred,#15 ;// tempPred=abs(pPredBufRow(Col)[0])/dcScaler + LSR temp3,temp3,#1 ;// temp3=(dcScaler+1)>>1 + MLA Rem,negdcScaler,tempPred,absCoeffDC ;// Remainder Rem=abs(pPredBufRow(Col)[0])-tempPred*dcScaler + + LDRH dcRowbufCoeff,[pPredBufCol] ;// Load the current DC entry of the column buffer + + CMP Rem,temp3 ;// compare Rem with (dcScaler/2) + ADDGE tempPred,#1 ;// tempPred=tempPred+1 if Rem>=(dcScaler/2) + CMP temp1,#0 + RSBLT tempPred,tempPred,#0 ;// tempPred=-tempPred if the prediction coeff was negative + + STRH dcRowbufCoeff,[pPredBufRow,#-16] ;// and store it 16 bytes before pPredBufRow + + + LDRH temp,[pSrcDst] ;// temp=pSrcDst[0] + ADD temp,temp,tempPred ;// temp=pSrcDst[0]+tempPred + SSAT16 temp,#12,temp ;// clip temp to [-2048,2047] + SMULBB 
dcColBuffCoeff,temp,dcScaler ;// dcColBuffCoeff=clipped(pSrcDst[0]+tempPred)*dcScaler + M_LDR ACPredFlag,ACPredFlagonStack + STRH dcColBuffCoeff,[pPredBufCol] ;// pPredBufCol[0]=dcColBuffCoeff + + + ;// AC Prediction + + M_LDR predQP,predQPonStack + + CMP ACPredFlag,#1 ;// Check if the AC prediction flag is set or not + BNE Exit ;// If not set Exit + CMP predDir,#2 ;// Check the Prediction direction + LDR predCoeffTable, =armVCM4P2_Reciprocal_QP_S32 ;// Loading the table with entries 0x1ffff/(1 to 63) + MOV Const,#4 + MUL curQP,curQP,Const ;// curQP=4*curQP + VDUP dPredQP0,predQP ;// Broadcast predQP to all 16-bit lanes + LDR temp2,[predCoeffTable,curQP] ;// temp2=0x1ffff/curQP + VDUP qCoeffTab,temp2 ;// Broadcast the reciprocal to all 32-bit lanes + BNE Horizontal ;// If the Prediction direction is horizontal branch to Horizontal + + + + ;// Vertical + ;//Calculating tempPred + + VLD1 {dPredRowBuf0,dPredRowBuf1},[pPredBufRow] ;// Loading pPredBufRow[i]:i=0 to 7 + + VMULL qtemp1,dPredRowBuf0,dPredQP0 ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i]: i=0 to 3 + VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=0 to 3 + + VMULL qtemp1,dPredRowBuf1,dPredQP0 ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i] : i=4 to 7 + + VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP) : i=0 to 3 + VSHRN dPredQP1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits + + + VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=4 to 7 + VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP) : i=4 to 7 + VLD1 {dtemp0,dtemp1},[pSrcDst] ;//Loading pSrcDst[i] : i=0 to 7 + VSHRN dtempPred1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits + VMOV dtempPred0,dPredQP1 ;// qtempPred now holds tempPred[i]: i=0 to 7 + + ;//updating source and row prediction buffer contents + VADD qtemp,qtemp,qtempPred ;//pSrcDst[i]=pSrcDst[i]+qtempPred[i]: i=0 to 7 + VQSHL qtemp,qtemp,#4 ;//Clip to [-2048,2047] + LDRH dcRowbufCoeff,[pPredBufRow] ;//Loading Dc Value of Row Prediction buffer + VSHR qtemp,qtemp,#4 + + VST1 {dtemp0,dtemp1},[pSrcDst] ;//storing back the 
updated values + VST1 {dtemp0,dtemp1},[pPredBufRow] ;//storing back the updated row prediction values + STRH dcRowbufCoeff,[pPredBufRow] ;// restoring the DC Row Prediction coeff loaded before the VST1 overwrote it + + B Exit + +Horizontal + + ;// Calculating tempPred + + + + VLD1 {dPredRowBuf0,dPredRowBuf1},[pPredBufCol] ;// Loading pPredBufCol[i]:i=0 to 7 + VMULL qtemp1,dPredRowBuf0,dPredQP0 ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i]: i=0 to 3 + VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=0 to 3 + + VMULL qtemp1,dPredRowBuf1,dPredQP0 ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i] : i=4 to 7 + + VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP) : i=0 to 3 + VSHRN dPredQP1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits + + + VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=4 to 7 + + MOV temppPredColBuf,pPredBufCol ;// Keep the original column buffer pointer (pPredBufCol is post-incremented below) + VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP) : i=4 to 7 + VLD4 {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst] ;// Loading coefficients Interleaving by 4 + VSHRN dtempPred1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits + VMOV dtempPred0,dPredQP1 + + ;// Updating source and column prediction buffer contents + ADD temp2,pSrcDst,#32 + VLD4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2] ;// Loading next 16 coefficients Interleaving by 4 + VUZP dtemp0,dtemp4 ;// De-interleave so dtemp0 holds every 8th coefficient + VADD dtemp0,dtemp0,dtempPred0 ;// Adding tempPred to coeffs + VQSHL dtemp0,dtemp0,#4 ;// Clip to [-2048,2047] + VSHR dtemp0,dtemp0,#4 + VST1 {dtemp0},[pPredBufCol]! ;// Updating Prediction column buffer + VZIP dtemp0,dtemp4 ;// Re-interleave back to the VLD4 layout + VST4 {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst] ;// Updating source coeffs + VST4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]! + + MOV temp1,temp2 ;// temp1=pSrcDst+64 (coefficients 32 onwards) + VLD4 {dtemp0,dtemp1,dtemp2,dtemp3},[temp2]! 
;// Loading coefficients Interleaving by 4 + + VLD4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2] + VUZP dtemp0,dtemp4 ;// De-interleave so dtemp0 holds every 8th coefficient + VADD dtemp0,dtemp0,dtempPred1 ;// Adding tempPred to coeffs + VQSHL dtemp0,dtemp0,#4 ;// Clip to [-2048,2047] + VSHR dtemp0,dtemp0,#4 + VST1 {dtemp0},[pPredBufCol]! ;// Updating Prediction column buffer + VZIP dtemp0,dtemp4 ;// Re-interleave back to the VLD4 layout + VST4 {dtemp0,dtemp1,dtemp2,dtemp3},[temp1] + STRH dcColBuffCoeff,[temppPredColBuf] ;// Restore the DC entry of the column buffer + VST4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2] + +Exit + + STRH temp,[pSrcDst] ;// pSrcDst[0]=clipped(pSrcDst[0]+tempPred) computed in DC prediction + + + MOV Return,#OMX_Sts_NoErr + + M_END + ENDIF + + + END + + + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s new file mode 100755 index 0000000..bd0ad1f --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s @@ -0,0 +1,162 @@ +;/** +; * +; * File Name: omxVCM4P2_QuantInvInter_I_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; * Description: +; * Contains modules for inter reconstruction +; * +; * +; * +; * +; * +; * Function: omxVCM4P2_QuantInvInter_I +; * +; * Description: +; * Performs inverse quantization on intra/inter coded block. +; * This function supports bits_per_pixel = 8. Mismatch control +; * is performed for the first MPEG-4 mode inverse quantization method. +; * The output coefficients are clipped to the range: [-2048, 2047]. +; * Mismatch control is performed for the first inverse quantization method. +; * +; * Remarks: +; * +; * Parameters: +; * [in] pSrcDst pointer to the input (quantized) intra/inter block. Must be 16-byte aligned. +; * [in] QP quantization parameter (quantiser_scale) +; * [in] videoComp (Intra version only.) Video component type of the +; * current block. 
Takes one of the following flags: +; * OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE, +; * OMX_VC_ALPHA. +; * [in] shortVideoHeader a flag indicating presence of short_video_header; +; * shortVideoHeader==1 selects linear intra DC mode, +; * and shortVideoHeader==0 selects nonlinear intra DC mode. +; * [out] pSrcDst pointer to the output (dequantized) intra/inter block. Must be 16-byte aligned. +; * +; * Return Value: +; * OMX_Sts_NoErr - no error +; * OMX_Sts_BadArgErr - bad arguments +; * - If pSrcDst is NULL or is not 16-byte aligned. +; * or +; * - If QP <= 0. +; * or +; * - videoComp is none of OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE and OMX_VC_ALPHA. +; * +; */ + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + IF CortexA8 + + +;//Input Arguments +pSrcDst RN 0 +QP RN 1 + + +;//Local Variables +Count RN 3 +doubleQP RN 4 +Return RN 0 +;// Neon registers + + +dQP10 DN D0.S32[0] +qQP1 QN Q0.S32 + +dQP1 DN D0.S16 +dMinusQP1 DN D1.S16 + +dCoeff0 DN D2.S16 +dCoeff1 DN D3.S16 + +qResult0 QN Q3.S32 +dResult0 DN D7.S16 +qSign0 QN Q3.S32 +dSign0 DN D6.S16 + +qResult1 QN Q4.S32 +dResult1 DN D8.S16 +qSign1 QN Q4.S32 +dSign1 DN D8.S16 + +d2QP0 DN D10.S32[0] +q2QP0 QN Q5.S32 +d2QP DN D10.S16 + +dZero0 DN D11.S16 +dZero1 DN D12.S16 +dConst0 DN D13.S16 + + + M_START omxVCM4P2_QuantInvInter_I,r4,d13 + + + + ADD doubleQP,QP,QP ;// doubleQP= 2*QP + VMOV d2QP0,doubleQP ;// Move doubleQP in to a scalar + VDUP q2QP0,d2QP0 ;// Broadcast doubleQP to every 32-bit lane + TST QP,#1 + VLD1 {dCoeff0,dCoeff1},[pSrcDst] ;// Load first 8 values to Coeff0,Coeff1 + SUBEQ QP,QP,#1 + VMOV dQP10,QP ;// If QP is even then QP1=QP-1 else QP1=QP + MOV Count,#64 ;// 64 coefficients to process, 8 per pass + VDUP qQP1,dQP10 ;// Broadcast QP1 to every 32-bit lane + VSHRN d2QP,q2QP0,#0 ;// doubleQP truncated to 16 bits + VEOR dConst0,dConst0,dConst0 ;// dConst0=0 + VSHRN dQP1,qQP1,#0 ;// QP1 truncated to 16 bits + VSUB dMinusQP1,dConst0,dQP1 ;// dMinusQP1=-QP1 + +Loop + + ;//Performing Inverse Quantization + + VCLT dSign0,dCoeff0, #0 ;// Compare Coefficient 0 against 0 + VCLT dSign1,dCoeff1, #0 ;// Compare Coefficient 1 against 0 
+ VCEQ dZero0,dCoeff0,#0 ;// Compare Coefficient 0 against zero + VBSL dSign0,dMinusQP1,dQP1 ;// dSign0 = -QP1 if Coeff0< 0 else QP1 + VCEQ dZero1,dCoeff1,#0 ;// Compare Coefficient 1 against zero + VBSL dSign1,dMinusQP1,dQP1 ;// dSign1 = -QP1 if Coeff1< 0 else QP1 + VMOVL qSign0,dSign0 ;// Sign extend dSign0 to 32 bits in qSign0 + VMOVL qSign1,dSign1 + VMLAL qResult0,dCoeff0,d2QP ;// qResult0[i]= 2*QP*Coeff0[i]-QP1 if Coeff <0 + ;// qResult0[i]= 2*QP*Coeff0[i]+QP1 if Coeff >=0 + VMLAL qResult1,dCoeff1,d2QP ;// qResult1[i]= 2*QP*Coeff1[i]-QP1 if Coeff <0 + ;// qResult1[i]= 2*QP*Coeff1[i]+QP1 if Coeff >=0 + ;// Clip Result to [-2048,2047] + + VQSHL qResult0,qResult0,#20 ;// clip to [-2048,2047] + VQSHL qResult1,qResult1,#20 + + VSHR qResult0,qResult0,#4 + VSHR qResult1,qResult1,#4 + VSHRN dResult0,qResult0,#16 ;// Narrow the clipped Value to Halfword + VSHRN dResult1,qResult1,#16 + VBIT dResult0,dConst0,dZero0 ;// Force the result to 0 where Coeff0 was 0 + VBIT dResult1,dConst0,dZero1 ;// Force the result to 0 where Coeff1 was 0 + + VST1 {dResult0,dResult1},[pSrcDst]! ;// Store the result + SUBS Count,Count,#8 + VLD1 {dCoeff0,dCoeff1},[pSrcDst] ;// Preload next 8 coefficients; NOTE(review): on the final pass this reads 16 bytes past the 64-coefficient block + + + BGT Loop + + MOV Return,#OMX_Sts_NoErr + + + M_END + ENDIF + + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s new file mode 100755 index 0000000..e00591f --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s @@ -0,0 +1,210 @@ +;/** +; * +; * File Name: omxVCM4P2_QuantInvIntra_I_s.s +; * OpenMAX DL: v1.0.2 +; * Revision: 12290 +; * Date: Wednesday, April 9, 2008 +; * +; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +; * +; * +; * +; * Description: +; * Contains modules for intra reconstruction +; * +; * +; * +; * +; * +; * +; * Function: omxVCM4P2_QuantInvIntra_I +; * +; * Description: +; * Performs inverse quantization on intra/inter coded block. 
+; * This function supports bits_per_pixel = 8. Mismatch control +; * is performed for the first MPEG-4 mode inverse quantization method. +; * The output coefficients are clipped to the range: [-2048, 2047]. +; * Mismatch control is performed for the first inverse quantization method. +; * +; * Remarks: +; * +; * Parameters: +; * [in] pSrcDst pointer to the input (quantized) intra/inter block. Must be 16-byte aligned. +; * [in] QP quantization parameter (quantiser_scale) +; * [in] videoComp (Intra version only.) Video component type of the +; * current block. Takes one of the following flags: +; * OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE, +; * OMX_VC_ALPHA. +; * [in] shortVideoHeader a flag indicating presence of short_video_header; +; * shortVideoHeader==1 selects linear intra DC mode, +; * and shortVideoHeader==0 selects nonlinear intra DC mode. +; * [out] pSrcDst pointer to the output (dequantized) intra/inter block. Must be 16-byte aligned. +; * +; * Return Value: +; * OMX_Sts_NoErr - no error +; * OMX_Sts_BadArgErr - bad arguments +; * - If pSrcDst is NULL or is not 16-byte aligned. +; * or +; * - If QP <= 0. +; * or +; * - videoComp is none of OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE and OMX_VC_ALPHA. 
+; * + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + + IMPORT armVCM4P2_DCScaler + + IF CortexA8 + + +;//Input Arguments +pSrcDst RN 0 +QP RN 1 +videoComp RN 2 +shortVideoHeader RN 3 + + +;//Local Variables + +dcScaler RN 4 +temp RN 14 +index RN 5 + + +Count RN 5 +doubleQP RN 4 +Return RN 0 + + +;// Neon registers + + +dQP10 DN D0.S32[0] +qQP1 QN Q0.S32 + +dQP1 DN D0.S16 +dMinusQP1 DN D1.S16 + +dCoeff0 DN D2.S16 +dCoeff1 DN D3.S16 + +qResult0 QN Q3.S32 +dResult0 DN D7.S16 +qSign0 QN Q3.S32 +dSign0 DN D6.S16 + +qResult1 QN Q4.S32 +dResult1 DN D8.S16 +qSign1 QN Q4.S32 +dSign1 DN D8.S16 + +d2QP0 DN D10.S32[0] +q2QP0 QN Q5.S32 +d2QP DN D10.S16 + +dZero0 DN D11.S16 +dZero1 DN D4.S16 +dConst0 DN D5.S16 + + + + + + + M_START omxVCM4P2_QuantInvIntra_I,r5,d11 + + + ;// Perform Inverse Quantization for DC coefficient + + TEQ shortVideoHeader,#0 ;// Test if short Video Header flag =0 + MOVNE dcScaler,#8 ;// if shortVideoHeader is non zero dcScaler=8 + BNE calDCVal + + LDR index, =armVCM4P2_DCScaler + ADD index,index,videoComp,LSL #5 ;// index=table base+32*videoComp + LDRB dcScaler,[index,QP] ;// dcScaler=byte at offset QP in that 32-byte row + + ;//M_CalDCScalar shortVideoHeader,videoComp, QP + +calDCVal + + LDRH temp,[pSrcDst] ;// temp=quantized DC coefficient pSrcDst[0] + SMULBB temp,temp,dcScaler ;// dcCoeff = dcScaler * Quantized DC coefficient(from memory) + SSAT temp,#12,temp ;// Saturating to 12 bits + + + + ;// Perform Inverse Quantization for Ac Coefficients + + + + ADD doubleQP,QP,QP ;// doubleQP= 2*QP + VMOV d2QP0,doubleQP ;// Move doubleQP in to a scalar + VDUP q2QP0,d2QP0 ;// Broadcast doubleQP to every 32-bit lane + TST QP,#1 + VLD1 {dCoeff0,dCoeff1},[pSrcDst] ;// Load first 8 values to Coeff0,Coeff1 + SUBEQ QP,QP,#1 + VMOV dQP10,QP ;// If QP is even then QP1=QP-1 else QP1=QP + MOV Count,#64 ;// 64 coefficients to process, 8 per pass + VDUP qQP1,dQP10 ;// Broadcast QP1 to every 32-bit lane + VSHRN d2QP,q2QP0,#0 ;// doubleQP truncated to 16 bits + VEOR dConst0,dConst0,dConst0 ;// dConst0=0 + VSHRN dQP1,qQP1,#0 ;// QP1 truncated to 16 bits + VSUB dMinusQP1,dConst0,dQP1 ;// dMinusQP1=-QP1 + +Loop + + ;//Performing Inverse Quantization + + VCLT dSign0,dCoeff0, #0 ;// Compare Coefficient 0 against 0 + 
VCLT dSign1,dCoeff1, #0 ;// Compare Coefficient 1 against 0 + VCEQ dZero0,dCoeff0,#0 ;// Compare Coefficient 0 against zero + VBSL dSign0,dMinusQP1,dQP1 ;// dSign0 = -QP1 if Coeff0< 0 else QP1 + VCEQ dZero1,dCoeff1,#0 ;// Compare Coefficient 1 against zero + VBSL dSign1,dMinusQP1,dQP1 ;// dSign1 = -QP1 if Coeff1< 0 else QP1 + VMOVL qSign0,dSign0 ;// Sign extend dSign0 to 32 bits in qSign0 + VMOVL qSign1,dSign1 + VMLAL qResult0,dCoeff0,d2QP ;// qResult0[i]= 2*QP*Coeff0[i]-QP1 if Coeff <0 + ;// qResult0[i]= 2*QP*Coeff0[i]+QP1 if Coeff >=0 + VMLAL qResult1,dCoeff1,d2QP ;// qResult1[i]= 2*QP*Coeff1[i]-QP1 if Coeff <0 + ;// qResult1[i]= 2*QP*Coeff1[i]+QP1 if Coeff >=0 + ;// Clip Result to [-2048,2047] + + VQSHL qResult0,qResult0,#20 ;// clip to [-2048,2047] + VQSHL qResult1,qResult1,#20 + + VSHR qResult0,qResult0,#4 + VSHR qResult1,qResult1,#4 + VSHRN dResult0,qResult0,#16 ;// Narrow the clipped Value to Halfword + VSHRN dResult1,qResult1,#16 + VBIT dResult0,dConst0,dZero0 ;// Force the result to 0 where Coeff0 was 0 + VBIT dResult1,dConst0,dZero1 ;// Force the result to 0 where Coeff1 was 0 + + VST1 {dResult0,dResult1},[pSrcDst]! ;// Store the result + SUBS Count,Count,#8 + VLD1 {dCoeff0,dCoeff1},[pSrcDst] ;// Preload next 8 coefficients; NOTE(review): on the final pass this reads 16 bytes past the 64-coefficient block + + + BGT Loop + + SUB pSrcDst,pSrcDst,#128 ;// Rewind pSrcDst by the 128 bytes written in the loop + + ;// Store the Inverse quantized Dc coefficient + + STRH temp,[pSrcDst],#2 + + MOV Return,#OMX_Sts_NoErr + + + + M_END + ENDIF + + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c new file mode 100755 index 0000000..5d93681 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c @@ -0,0 +1,6 @@ +#include "omxtypes.h" +#include "armCOMM_Version.h" + +#ifdef ARM_INCLUDE_VERSION_DESCRIPTIONS +const char * const omxVC_VersionDescription = "ARM OpenMAX DL v" ARM_VERSION_STRING " Rel=" OMX_ARM_RELEASE_TAG " Arch=" OMX_ARM_BUILD_ARCHITECTURE " Tools=" OMX_ARM_BUILD_TOOLCHAIN ; +#endif /* ARM_INCLUDE_VERSION_DESCRIPTIONS */ |