summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api
diff options
context:
space:
mode:
authorJames Dong <jdong@google.com>2011-05-31 18:53:46 -0700
committerJames Dong <jdong@google.com>2011-06-02 12:32:46 -0700
commit0c1bc742181ded4930842b46e9507372f0b1b963 (patch)
treec952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api
parent92a746c3b18d035189f596ce32847bf26247aaca (diff)
downloadframeworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2
Initial-checkin for ON2 Software AVC/H264 decoder
o when neon is present, the performance gain of On2 AVC software decoder over PV software decoder is more than 30%. o In addition, it fixes some known PV software decoder issues like missing output frames o allow both pv and on2 software avc to be available for easy comparision o change output frames from 8 to 16 Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api')
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h785
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h670
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h212
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h40
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h1451
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h27
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h43
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h1157
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h274
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h252
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h77
11 files changed, 4988 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
new file mode 100755
index 0000000..64c1958
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
@@ -0,0 +1,785 @@
+/**
+ *
+ * File Name: armCOMM.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armCOMM.h
+ * Brief: Declares Common APIs/Data Types used across OpenMAX API's
+ *
+ */
+
+
+#ifndef _armCommon_H_
+#define _armCommon_H_
+
+#include "omxtypes.h"
+
+typedef struct
+{
+ OMX_F32 Re; /** Real part */
+ OMX_F32 Im; /** Imaginary part */
+
+} OMX_FC32; /** Complex number built from two OMX_F32 components */
+
+typedef struct
+{
+ OMX_F64 Re; /** Real part */
+ OMX_F64 Im; /** Imaginary part */
+
+} OMX_FC64; /** Complex number built from two OMX_F64 components */
+
+
+/* Array of 64 OMX_S16 entries (one 8x8 block). Used by both IP and IC domains for 8x8 JPEG blocks. */
+typedef OMX_S16 ARM_BLOCK8x8[64];
+
+
+#include "armOMX.h"
+
+#define armPI ((OMX_F64)(3.1415926535897932384626433832795)) /* pi as OMX_F64; fully parenthesized so it is safe in any expression */
+
+/***********************************************************************/
+
+/* Compiler extensions */
+#ifdef ARM_DEBUG
+/* debug version */
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#define armError(str) {printf((str)); printf("\n"); exit(-1);}
+#define armWarn(str) {printf((str)); printf("\n");}
+#define armIgnore(a) ((void)a)
+#define armAssert(a) assert(a)
+#else
+/* release version */
+#define armError(str) ((void) (str))
+#define armWarn(str) ((void) (str))
+#define armIgnore(a) ((void) (a))
+#define armAssert(a) ((void) (a))
+#endif /* ARM_DEBUG */
+
+/* Arithmetic operations (plain macros: an argument may be evaluated more than once, so pass no side effects) */
+
+#define armMin(a,b) ( (a) > (b) ? (b):(a) )
+#define armMax(a,b) ( (a) > (b) ? (a):(b) )
+#define armAbs(a) ( (a) < 0 ? -(a):(a) )
+
+/* Alignment operation: yields Ptr advanced, in whole elements, by the byte distance to the next N-byte boundary (N a power of two) */
+
+#define armAlignToBytes(Ptr,N) ((Ptr) + ( (((N)-(int)(Ptr))&((N)-1)) / sizeof(*(Ptr)) ))
+#define armAlignTo2Bytes(Ptr) armAlignToBytes(Ptr,2)
+#define armAlignTo4Bytes(Ptr) armAlignToBytes(Ptr,4)
+#define armAlignTo8Bytes(Ptr) armAlignToBytes(Ptr,8)
+#define armAlignTo16Bytes(Ptr) armAlignToBytes(Ptr,16)
+
+/* Error and Alignment check: both macros return <code> from the enclosing function when <condition> is true */
+
+#define armRetArgErrIf(condition, code) if(condition) { return (code); }
+#define armRetDataErrIf(condition, code) if(condition) { return (code); }
+
+#ifndef ALIGNMENT_DOESNT_MATTER
+#define armIsByteAligned(Ptr,N) ((((int)(Ptr)) % (N))==0) /* true when the address held in Ptr is a multiple of N */
+#define armNotByteAligned(Ptr,N) ((((int)(Ptr)) % (N))!=0) /* true when it is not */
+#else
+#define armIsByteAligned(Ptr,N) (1) /* checks compiled out: everything counts as aligned */
+#define armNotByteAligned(Ptr,N) (0)
+#endif
+
+#define armIs2ByteAligned(Ptr) armIsByteAligned(Ptr,2)
+#define armIs4ByteAligned(Ptr) armIsByteAligned(Ptr,4)
+#define armIs8ByteAligned(Ptr) armIsByteAligned(Ptr,8)
+#define armIs16ByteAligned(Ptr) armIsByteAligned(Ptr,16)
+
+#define armNot2ByteAligned(Ptr) armNotByteAligned(Ptr,2)
+#define armNot4ByteAligned(Ptr) armNotByteAligned(Ptr,4)
+#define armNot8ByteAligned(Ptr) armNotByteAligned(Ptr,8)
+#define armNot16ByteAligned(Ptr) armNotByteAligned(Ptr,16)
+#define armNot32ByteAligned(Ptr) armNotByteAligned(Ptr,32)
+
+/**
+ * Function: armRoundFloatToS16/armRoundFloatToS32/armRoundFloatToS64
+ *
+ * Description:
+ * Converts a double precision value into an OMX_S16/OMX_S32/OMX_S64 integer after rounding
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32/OMX_S64 format
+ *
+ */
+
+OMX_S16 armRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armRoundFloatToS32 (OMX_F64 Value);
+OMX_S64 armRoundFloatToS64 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToS16/armSatRoundFloatToS32
+ *
+ * Description:
+ * Converts a double precision value into an OMX_S16/OMX_S32 integer after rounding and saturation
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32 format
+ *
+ */
+
+OMX_S16 armSatRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armSatRoundFloatToS32 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToU16/armSatRoundFloatToU32
+ *
+ * Description:
+ * Converts a double precision value into an unsigned OMX_U16/OMX_U32 integer after rounding and saturation
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U16/OMX_U32 format
+ *
+ */
+
+OMX_U16 armSatRoundFloatToU16 (OMX_F64 Value);
+OMX_U32 armSatRoundFloatToU32 (OMX_F64 Value);
+
+/**
+ * Function: armSignCheck
+ *
+ * Description:
+ * Checks the sign of a variable:
+ * returns 1 if it is Positive
+ * returns 0 if it is 0
+ * returns -1 if it is Negative
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] var Variable to be checked (OMX_S16)
+ *
+ * Return Value:
+ * OMX_INT -- returns 1 if it is Positive
+ * returns 0 if it is 0
+ * returns -1 if it is Negative
+ */
+
+OMX_INT armSignCheck (OMX_S16 var);
+
+/**
+ * Function: armClip
+ *
+ * Description: Clips the input between the min and max values
+ *
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] min lower bound
+ * [in] max upper bound
+ * [in] src variable to be clipped
+ *
+ * Return Value:
+ * OMX_S32 -- returns clipped value
+ */
+
+OMX_S32 armClip (
+ OMX_INT min,
+ OMX_INT max,
+ OMX_S32 src
+ );
+
+/**
+ * Function: armClip_F32
+ *
+ * Description: Clips the input between the min and max values
+ *
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] min lower bound
+ * [in] max upper bound
+ * [in] src variable to be clipped
+ *
+ * Return Value:
+ * OMX_F32 -- returns clipped value
+ */
+
+OMX_F32 armClip_F32 (
+ OMX_F32 min,
+ OMX_F32 max,
+ OMX_F32 src
+ );
+
+/**
+ * Function: armShiftSat_F32
+ *
+ * Description: Divides a float value by 2^shift and
+ * saturates it for unsigned value range for satBits.
+ * Second parameter is like "shifting" the corresponding
+ * integer value. Takes care of rounding while clipping the final
+ * value.
+ *
+ * Parameters:
+ * [in] v Number to be operated upon
+ * [in] shift Divides the input "v" by "2^shift"
+ * [in] satBits Final range is [0, 2^satBits)
+ *
+ * Return Value:
+ * OMX_U32 -- returns "shifted" saturated value
+ */
+
+OMX_U32 armShiftSat_F32(
+ OMX_F32 v,
+ OMX_INT shift,
+ OMX_INT satBits
+ );
+
+/**
+ * Function: armSwapElem
+ *
+ * Description:
+ * This function swaps the two elements located at pBuf1 and pBuf2.
+ * The size of each element is given by <elemSize> (NOTE(review): presumably in bytes, as the buffers are OMX_U8 * -- confirm)
+ *
+ * Return Value:
+ * OMXResult -- Error status from the function
+ */
+OMXResult armSwapElem(OMX_U8 *pBuf1, OMX_U8 *pBuf2, OMX_INT elemSize);
+
+
+/**
+ * Function: armMedianOf3
+ *
+ * Description: Finds the median of three numbers
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] fEntry First entry
+ * [in] sEntry Second entry
+ * [in] tEntry Third entry
+ *
+ * Return Value:
+ * OMX_S32 -- returns the median value
+ */
+
+OMX_S32 armMedianOf3 (
+ OMX_S32 fEntry,
+ OMX_S32 sEntry,
+ OMX_S32 tEntry
+ );
+
+/**
+ * Function: armLogSize
+ *
+ * Description: Finds the size of a positive value and returns the same
+ *
+ * Remarks:
+ * NOTE(review): "size" presumably means the number of significant bits -- confirm against the implementation
+ * Parameters:
+ * [in] value Positive value (OMX_U16)
+ *
+ * Return Value:
+ * OMX_U8 -- returns the size of the positive value
+ */
+
+OMX_U8 armLogSize (
+ OMX_U16 value
+ );
+
+/***********************************************************************/
+ /* Saturating Arithmetic operations */
+
+/**
+ * Function :armSatAdd_S32()
+ *
+ * Description :
+ * Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ *
+ **/
+
+OMX_S32 armSatAdd_S32(
+ OMX_S32 Value1,
+ OMX_S32 Value2
+ );
+
+/**
+ * Function :armSatAdd_S64()
+ *
+ * Description :
+ * Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ *
+ **/
+
+OMX_S64 armSatAdd_S64(
+ OMX_S64 Value1,
+ OMX_S64 Value2
+ );
+
+/** Function :armSatSub_S32()
+ *
+ * Description :
+ * Returns the result of saturated subtraction of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatSub_S32(
+ OMX_S32 Value1,
+ OMX_S32 Value2
+ );
+
+/**
+ * Function :armSatMac_S32()
+ *
+ * Description :
+ * Returns the result of Multiplication of Value1 and Value2 and subsequent saturated
+ * accumulation with Mac
+ *
+ * Parameters:
+ * [in] Mac Accumulator
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ **/
+
+OMX_S32 armSatMac_S32(
+ OMX_S32 Mac,
+ OMX_S16 Value1,
+ OMX_S16 Value2
+ );
+
+/**
+ * Function :armSatMac_S16S32_S32
+ *
+ * Description :
+ * Returns the result of saturated MAC operation of the three inputs mac, delayElem, filTap
+ *
+ * mac = mac + Saturate_in_32Bits(delayElem * filTap)
+ *
+ * Parameters:
+ * [in] mac Accumulator the product is added to
+ * [in] delayElem First 32 bit Operand
+ * [in] filTap Second 16 bit Operand
+ *
+ * Return:
+ * [out] mac Result of operation
+ *
+ **/
+
+OMX_S32 armSatMac_S16S32_S32(
+ OMX_S32 mac,
+ OMX_S32 delayElem,
+ OMX_S16 filTap );
+
+/**
+ * Function :armSatRoundRightShift_S32_S16
+ *
+ * Description :
+ * Returns the result of rounded right shift operation of input by the scalefactor
+ *
+ * output = Saturate_in_16Bits( RightShift( Round(input), scaleFactor ) )
+ *
+ * Parameters:
+ * [in] input The input to be operated on
+ * [in] scaleFactor The shift number
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+
+OMX_S16 armSatRoundRightShift_S32_S16(
+ OMX_S32 input,
+ OMX_INT scaleFactor);
+
+/**
+ * Function :armSatRoundLeftShift_S32()
+ *
+ * Description :
+ * Returns the result of saturating left-shift operation on input
+ * Or rounded Right shift if the input Shift is negative.
+ *
+ * Parameters:
+ * [in] Value Operand
+ * [in] shift Operand for shift operation
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatRoundLeftShift_S32(
+ OMX_S32 Value,
+ OMX_INT shift
+ );
+
+/**
+ * Function :armSatRoundLeftShift_S64()
+ *
+ * Description :
+ * Returns the result of saturating left-shift operation on input
+ * Or rounded Right shift if the input Shift is negative.
+ *
+ * Parameters:
+ * [in] Value Operand
+ * [in] shift Operand for shift operation
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S64 armSatRoundLeftShift_S64(
+ OMX_S64 Value,
+ OMX_INT shift
+ );
+
+/**
+ * Function :armSatMulS16S32_S32()
+ *
+ * Description :
+ * Returns the result of a S16 data type multiplied with an S32 data type
+ * in a S32 container
+ *
+ * Parameters:
+ * [in] input1 Operand 1 (OMX_S16)
+ * [in] input2 Operand 2 (OMX_S32)
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+
+OMX_S32 armSatMulS16S32_S32(
+ OMX_S16 input1,
+ OMX_S32 input2);
+
+/**
+ * Function :armSatMulS32S32_S32()
+ *
+ * Description :
+ * Returns the result of a S32 data type multiplied with an S32 data type
+ * in a S32 container
+ *
+ * Parameters:
+ * [in] input1 Operand 1
+ * [in] input2 Operand 2
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatMulS32S32_S32(
+ OMX_S32 input1,
+ OMX_S32 input2);
+
+
+/**
+ * Function :armIntDivAwayFromZero()
+ *
+ * Description : Integer division with rounding to the nearest integer.
+ * Half-integer values are rounded away from zero
+ * unless otherwise specified. For example 3//2 is rounded
+ * to 2, and -3//2 is rounded to -2.
+ *
+ * Parameters:
+ * [in] Num Operand 1 (numerator)
+ * [in] Deno Operand 2 (denominator)
+ *
+ * Return:
+ * [out] Result of operation Num//Deno
+ *
+ **/
+
+OMX_S32 armIntDivAwayFromZero (OMX_S32 Num, OMX_S32 Deno);
+
+
+/***********************************************************************/
+/*
+ * Debugging macros
+ *
+ */
+
+
+/*
+ * Definition of output stream used by the DEBUG_PRINTF_n macros - change to stderr if necessary
+ */
+#define DEBUG_STREAM stdout
+
+/*
+ * Debug printf macros, one for each argument count (a is the format string).
+ * With DEBUG_ON defined they call fprintf on DEBUG_STREAM; otherwise they expand to nothing. Add more if needed.
+ */
+#ifdef DEBUG_ON
+#include <stdio.h>
+
+#define DEBUG_PRINTF_0(a) fprintf(DEBUG_STREAM, a)
+#define DEBUG_PRINTF_1(a, b) fprintf(DEBUG_STREAM, a, b)
+#define DEBUG_PRINTF_2(a, b, c) fprintf(DEBUG_STREAM, a, b, c)
+#define DEBUG_PRINTF_3(a, b, c, d) fprintf(DEBUG_STREAM, a, b, c, d)
+#define DEBUG_PRINTF_4(a, b, c, d, e) fprintf(DEBUG_STREAM, a, b, c, d, e)
+#define DEBUG_PRINTF_5(a, b, c, d, e, f) fprintf(DEBUG_STREAM, a, b, c, d, e, f)
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g)
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h)
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i)
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j)
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k)
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l)
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m)
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n)
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)
+#else /* DEBUG_ON */
+#define DEBUG_PRINTF_0(a)
+#define DEBUG_PRINTF_1(a, b)
+#define DEBUG_PRINTF_2(a, b, c)
+#define DEBUG_PRINTF_3(a, b, c, d)
+#define DEBUG_PRINTF_4(a, b, c, d, e)
+#define DEBUG_PRINTF_5(a, b, c, d, e, f)
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g)
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h)
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i)
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j)
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k)
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l)
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m)
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n)
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)
+#endif /* DEBUG_ON */
+
+
+/*
+ * Domain and sub domain definitions
+ *
+ * In order to turn on debug for an entire domain or sub-domain
+ * at compile time, one of the DEBUG_DOMAIN_* below may be defined,
+ * which will activate debug in all of the defines it contains.
+ */
+
+#ifdef DEBUG_DOMAIN_AC
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT_MPEG4
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT
+#define DEBUG_OMXACAAC_DECODEDATSTRELT
+#define DEBUG_OMXACAAC_DECODEFILLELT
+#define DEBUG_OMXACAAC_DECODEISSTEREO_S32
+#define DEBUG_OMXACAAC_DECODEMSPNS_S32
+#define DEBUG_OMXACAAC_DECODEMSSTEREO_S32_I
+#define DEBUG_OMXACAAC_DECODEPRGCFGELT
+#define DEBUG_OMXACAAC_DECODETNS_S32_I
+#define DEBUG_OMXACAAC_DEINTERLEAVESPECTRUM_S32
+#define DEBUG_OMXACAAC_ENCODETNS_S32_I
+#define DEBUG_OMXACAAC_LONGTERMPREDICT_S32
+#define DEBUG_OMXACAAC_LONGTERMRECONSTRUCT_S32
+#define DEBUG_OMXACAAC_MDCTFWD_S32
+#define DEBUG_OMXACAAC_MDCTINV_S32_S16
+#define DEBUG_OMXACAAC_NOISELESSDECODE
+#define DEBUG_OMXACAAC_QUANTINV_S32_I
+#define DEBUG_OMXACAAC_UNPACKADIFHEADER
+#define DEBUG_OMXACAAC_UNPACKADTSFRAMEHEADER
+#define DEBUG_OMXACMP3_HUFFMANDECODESFBMBP_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODESFB_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODE_S32
+#define DEBUG_OMXACMP3_MDCTINV_S32
+#define DEBUG_OMXACMP3_REQUANTIZESFB_S32_I
+#define DEBUG_OMXACMP3_REQUANTIZE_S32_I
+#define DEBUG_OMXACMP3_SYNTHPQMF_S32_S16
+#define DEBUG_OMXACMP3_UNPACKFRAMEHEADER
+#define DEBUG_OMXACMP3_UNPACKSCALEFACTORS_S8
+#define DEBUG_OMXACMP3_UNPACKSIDEINFO
+#endif /* DEBUG_DOMAIN_AC */
+
+
+#ifdef DEBUG_DOMAIN_VC
+#define DEBUG_OMXVCM4P10_AVERAGE_16X
+#define DEBUG_OMXVCM4P10_AVERAGE_4X
+#define DEBUG_OMXVCM4P10_AVERAGE_8X
+#define DEBUG_OMXVCM4P10_DEBLOCKCHROMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DEBLOCKLUMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DECODECHROMADCCOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DECODECOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DEQUANTTRANSFORMACFROMPAIR_U8_S16_C1_DLX
+#define DEBUG_OMXVCM4P10_EXPANDFRAME
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_PREDICTINTRACHROMA8X8_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_16X16_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_4X4_U8_C1R
+#define DEBUG_OMXVCM4P10_SADQUAR_16X
+#define DEBUG_OMXVCM4P10_SADQUAR_4X
+#define DEBUG_OMXVCM4P10_SADQUAR_8X
+#define DEBUG_OMXVCM4P10_SAD_16X
+#define DEBUG_OMXVCM4P10_SAD_4X
+#define DEBUG_OMXVCM4P10_SAD_8X
+#define DEBUG_OMXVCM4P10_SATD_4X4
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTCHROMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTLUMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_CHROMADC
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_LUMADC
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_8X8
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_8X8
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_SAD_U8_S16
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_U8_S16
+#define DEBUG_OMXVCM4P2_DCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_DECODEBLOCKCOEF_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEPADMV_PVOP
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEMV_U8_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_FINDMVPRED
+#define DEBUG_OMXVCM4P2_IDCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_LIMITMVTORECT
+#define DEBUG_OMXVCM4P2_MOTIONESTIMATIONMB
+#define DEBUG_OMXVCM4P2_PADMBGRAY_U8
+#define DEBUG_OMXVCM4P2_PADMBHORIZONTAL_U8
+#define DEBUG_OMXVCM4P2_PADMBVERTICAL_U8
+#define DEBUG_OMXVCM4P2_PADMV
+#define DEBUG_OMXVCM4P2_QUANTINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINTRA_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTRA_S16_I
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTER
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTRA
+#endif /* DEBUG_DOMAIN_VC */
+
+
+#ifdef DEBUG_DOMAIN_IC
+/* To be filled in */
+#endif /* DEBUG_DOMAIN_IC */
+
+
+#ifdef DEBUG_DOMAIN_SP
+#define DEBUG_OMXACSP_DOTPROD_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S32
+#define DEBUG_OMXACSP_COPY_S16
+#define DEBUG_OMXACSP_DOTPROD_S16 /* NOTE(review): identical to the first define of this list; redundant but harmless */
+#define DEBUG_OMXACSP_DOTPROD_S16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S16S32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S32_SFS
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC16
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S16_S32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S32
+#define DEBUG_OMXACSP_FFTINIT_C_SC16
+#define DEBUG_OMXACSP_FFTINIT_C_SC32
+#define DEBUG_OMXACSP_FFTINIT_R_S16_S32
+#define DEBUG_OMXACSP_FFTINIT_R_S32
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32S16_SFS
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32_I
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIR_DIRECT_S16
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_DIRECT_S16
+#endif /* DEBUG_DOMAIN_SP */
+
+
+#ifdef DEBUG_DOMAIN_IP
+#define DEBUG_OMXIPBM_ADDC_U8_C1R_SFS
+#define DEBUG_OMXIPBM_COPY_U8_C1R
+#define DEBUG_OMXIPBM_COPY_U8_C3R
+#define DEBUG_OMXIPBM_MIRROR_U8_C1R
+#define DEBUG_OMXIPBM_MULC_U8_C1R_SFS
+#define DEBUG_OMXIPCS_COLORTWISTQ14_U8_C3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR420LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR422LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR444LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR420LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR422LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR444LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_YCBCR420RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422RSZCSCROTRGB_U8_C2R
+#define DEBUG_OMXIPCS_YCBCR422RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565_U8_U16_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB_U8_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_C2P3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR444TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR444TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB_U8_C3R
+#define DEBUG_OMXIPPP_GETCENTRALMOMENT_S64
+#define DEBUG_OMXIPPP_GETSPATIALMOMENT_S64
+#define DEBUG_OMXIPPP_MOMENTGETSTATESIZE_S64
+#define DEBUG_OMXIPPP_MOMENTINIT_S64
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C1R
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C3R
+#endif /* DEBUG_DOMAIN_IP */
+
+
+#endif /* _armCommon_H_ */
+
+/*End of File*/
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
new file mode 100755
index 0000000..c738f72
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
@@ -0,0 +1,670 @@
+;//
+;//
+;// File Name: armCOMM_BitDec_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// OpenMAX optimized bitstream decode module
+;//
+;// You must include armCOMM_s.h before including this file
+;//
+;// This module provides macros to perform assembly optimized fixed and
+;// variable length decoding from a read-only bitstream. The variable
+;// length decode modules take as input a pointer to a table of 16-bit
+;// entries of the following format.
+;//
+;// VLD Table Entry format
+;//
+;// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+;// +------------------------------------------------+
+;// | Len | Symbol | 1 |
+;// +------------------------------------------------+
+;// | Offset | 0 |
+;// +------------------------------------------------+
+;//
+;// If the table entry is a leaf entry then bit 0 is set:
+;// Len = Number of bits overread (0 to 7)
+;// Symbol = Symbol payload (unsigned 12 bits)
+;//
+;// If the table entry is an internal node then bit 0 is clear:
+;// Offset = Number of (16-bit) half words from the table
+;// start to the next table node
+;//
+;// The table is accessed by successive lookups on the
+;// next Step bits of the input bitstream until a leaf node
+;// is obtained. The Step sizes are supplied to the VLD macro.
+;//
+;// USAGE:
+;//
+;// To use any of the macros in this package, first call:
+;//
+;// M_BD_INIT ppBitStream, pBitOffset, pBitStream, RBitBuffer, RBitCount, Tmp
+;//
+;// This caches the current bitstream position and next available
+;// bits in registers pBitStream, RBitBuffer, RBitCount. These registers
+;// are reserved for use by the bitstream decode package until you
+;// call M_BD_FINI.
+;//
+;// Next call the following macro(s) as many times as you need:
+;//
+;// M_BD_LOOK8 - Look ahead constant 1<=N<=8 bits into the bitstream
+;// M_BD_LOOK16 - Look ahead constant 1<=N<=16 bits into the bitstream
+;// M_BD_READ8 - Read constant 1<=N<=8 bits from the bitstream
+;// M_BD_READ16 - Read constant 1<=N<=16 bits from the bitstream
+;// M_BD_VREAD8 - Read variable 1<=N<=8 bits from the bitstream
+;// M_BD_VREAD16 - Read variable 1<=N<=16 bits from the bitstream
+;// M_BD_VLD - Perform variable length decode using lookup table
+;//
+;// Finally call the macro:
+;//
+;// M_BD_FINI ppBitStream, pBitOffset
+;//
+;// This writes the bitstream state back to memory.
+;//
+;// The three bitstream cache register names are assigned to the following global
+;// variables:
+;//
+
+ GBLS pBitStream ;// Register name for pBitStream
+ GBLS BitBuffer ;// Register name for BitBuffer
+ GBLS BitCount ;// Register name for BitCount
+
+;//
+;// These register variables must have a certain defined state on entry to every bitstream
+;// macro (except M_BD_INIT) and on exit from every bitstream macro (except M_BD_FINI).
+;// The state may depend on implementation.
+;//
+;// For the default (ARM11) implementation the following hold:
+;// pBitStream - points to the first byte not held in the BitBuffer
+;// BitBuffer - is a cache of (4 bytes) 32 bits, bit 31 the first bit
+;// BitCount - is offset (from the top bit) to the next unused bitstream bit
+;// 0<=BitCount<=15 (so BitBuffer holds at least 17 unused bits)
+;//
+;//
+
+ ;// Bitstream Decode initialise
+ ;//
+ ;// Initialises the bitstream decode global registers from
+ ;// bitstream pointers. This macro is split into 3 parts
+ ;// (M_BD_INIT0, M_BD_INIT1, M_BD_INIT2) to enable scheduling.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $ppBitStream - pointer to pointer to the next bitstream byte
+ ;// $pBitOffset - pointer to the number of bits used in the current byte (0..7)
+ ;// $RBitStream - register to use for pBitStream (can be $ppBitStream)
+ ;// $RBitBuffer - register to use for BitBuffer
+ ;// $RBitCount - register to use for BitCount (can be $pBitOffset)
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1,$T2,$T3 - registers that must be preserved between calls to
+ ;// M_BD_INIT1 and M_BD_INIT2 (not touched by M_BD_INIT0)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_INIT0 $ppBitStream, $pBitOffset, $RBitStream, $RBitBuffer, $RBitCount
+
+pBitStream SETS "$RBitStream" ;// record the register chosen for the stream pointer
+BitBuffer SETS "$RBitBuffer" ;// record the register chosen for the bit cache
+BitCount SETS "$RBitCount" ;// record the register chosen for the bit offset
+
+ ;// load inputs: the current byte pointer and the bit offset within that byte
+ LDR $pBitStream, [$ppBitStream]
+ LDR $BitCount, [$pBitOffset]
+ MEND
+
+ MACRO
+ M_BD_INIT1 $T1, $T2, $T3
+ LDRB $T2, [$pBitStream, #2] ;// T2 = stream byte at offset 2
+ LDRB $T1, [$pBitStream, #1] ;// T1 = stream byte at offset 1
+ LDRB $BitBuffer, [$pBitStream], #3 ;// BitBuffer = stream byte at offset 0, then advance the pointer by 3 (post-indexed)
+ ADD $BitCount, $BitCount, #8 ;// bit offset + 8: only three bytes are cached, so the top byte of the buffer holds no stream data
+ MEND
+
+ MACRO
+ M_BD_INIT2 $T1, $T2, $T3
+ ORR $T2, $T2, $T1, LSL #8 ;// T2 = (T1 << 8) | T2
+ ORR $BitBuffer, $T2, $BitBuffer, LSL #16 ;// BitBuffer = (BitBuffer << 16) | T2: the three loaded bytes, first byte highest, in bits 23..0
+ MEND
+
+ ;//
+ ;// Look ahead fixed 1<=N<=8 bits without consuming any bits
+ ;// The next bits will be placed at bit 31..24 of destination register
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to look (only range-checked; the shift below does not depend on it)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// (this macro takes no temp/scratch register)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_LOOK8 $Symbol, $N
+ ASSERT ($N>=1):LAND:($N<=8)
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// shift out the used bits so the next unread bit lands in bit 31
+ MEND
+
+ ;//
+ ;// Look ahead fixed 1<=N<=16 bits without consuming any bits
+ ;// The next bits will be placed at bit 31..16 of destination register
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to look (only range-checked; the shift below does not depend on it)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - temp/scratch register (accepted but not referenced by this implementation)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_LOOK16 $Symbol, $N, $T1
+ ASSERT ($N >= 1):LAND:($N <= 16)
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// shift out the used bits so the next unread bit lands in bit 31
+ MEND
+
+ ;//
+ ;// Skips fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1 - corrupted temp/scratch register (only written when a new byte is fetched)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_SKIP8 $N, $T1
+ ASSERT ($N>=1):LAND:($N<=8)
+ SUBS $BitCount, $BitCount, #(8-$N) ;// BitCount = BitCount + N - 8; carry set (CS) when the subtraction did not borrow
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: a whole byte has been used up, fetch the next stream byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: no refill, undo the -8 so BitCount ends up advanced by N
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: shift the fetched byte into the bottom of the buffer
+ MEND
+
+
+ ;//
+ ;// Read fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream (right-aligned, zero-extended)
+ ;// $T1 - corrupted temp/scratch register (only written when a new byte is fetched)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_READ8 $Symbol, $N, $T1
+ ASSERT ($N>=1):LAND:($N<=8)
+ MOVS $Symbol, $BitBuffer, LSL $BitCount ;// next unread bit to bit 31 (the flags set here are replaced by the SUBS below)
+ SUBS $BitCount, $BitCount, #(8-$N) ;// BitCount = BitCount + N - 8; carry set (CS) when the subtraction did not borrow
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: a whole byte has been used up, fetch the next stream byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: no refill, undo the -8 so BitCount ends up advanced by N
+ MOV $Symbol, $Symbol, LSR #(32-$N) ;// keep the top N bits, right-aligned; plain MOV leaves the flags untouched
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: shift the fetched byte into the bottom of the buffer
+ MEND
+
+ ;//
+ ;// Read fixed 1<=N<=16 bits from the bitstream, advancing the bitstream pointer
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_READ16 $Symbol, $N, $T1, $T2
+ ASSERT ($N>=1):LAND:($N<=16) ;// N is range-checked at assembly time
+ ASSERT $Symbol<>$T1 ;// Symbol is written while T1 still holds a refill byte
+ IF ($N<=8)
+ M_BD_READ8 $Symbol, $N, $T1
+ ELSE
+ ;// N>8 so we will be able to refill at least one byte
+ LDRB $T1, [$pBitStream], #1 ;// unconditional first refill byte
+ MOVS $Symbol, $BitBuffer, LSL $BitCount ;// left align next bits, taken before the buffer is shifted
+ ORR $BitBuffer, $T1, $BitBuffer, LSL #8 ;// merge in first refill byte
+ SUBS $BitCount, $BitCount, #(16-$N) ;// BitCount += N-16; CS if a second byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load second refill byte
+ MOV $Symbol, $Symbol, LSR #(32-$N) ;// keep only the top N bits, right aligned
+ ADDCC $BitCount, $BitCount, #8 ;// CC: only one byte was refilled (BitCount += N-8 overall)
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in second refill byte
+ ENDIF
+ MEND
+
+ ;//
+ ;// Skip variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits. 1<=N<=8
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1 - corrupted temp/scratch register
+ ;// (this macro takes no second scratch register; only $T1 is used)
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VSKIP8 $N, $T1
+ ADD $BitCount, $BitCount, $N ;// account for the N skipped bits (N is used directly as the ADD operand)
+ SUBS $BitCount, $BitCount, #8 ;// CS if a whole byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: refill not possible, undo the -8
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in refill byte
+ MEND
+
+ ;//
+ ;// Skip variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits. 1<=N<=16
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VSKIP16 $N, $T1, $T2
+ ADD $BitCount, $BitCount, $N ;// account for the N skipped bits (T2 is never referenced below)
+ SUBS $BitCount, $BitCount, #8 ;// CS if a first byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load first refill byte
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: CS again if a second byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo the last -8, for which no refill happened
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in second refill byte
+ MEND
+
+ ;//
+ ;// Read variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read. 1<=N<=8
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VREAD8 $Symbol, $N, $T1, $T2
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left align next bits
+ ADD $BitCount, $BitCount, $N ;// account for the N bits read
+ SUBS $BitCount, $BitCount, #8 ;// CS if a whole byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load refill byte
+ RSB $T2, $N, #32 ;// T2 = 32-N, the right shift that isolates N bits
+ ADDCC $BitCount, $BitCount, #8 ;// CC: refill not possible, undo the -8
+ MOV $Symbol, $Symbol, LSR $T2 ;// keep only the top N bits, right aligned
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in refill byte
+ MEND
+
+
+ ;//
+ ;// Read variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read. 1<=N<=16
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VREAD16 $Symbol, $N, $T1, $T2
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left align next bits
+ ADD $BitCount, $BitCount, $N ;// account for the N bits read
+ SUBS $BitCount, $BitCount, #8 ;// CS if a first byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load first refill byte
+ RSB $T2, $N, #32 ;// T2 = 32-N, the right shift that isolates N bits (flags untouched)
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: CS again if a second byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo the last -8, for which no refill happened
+ MOV $Symbol, $Symbol, LSR $T2 ;// keep only the top N bits, right aligned
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in second refill byte
+ MEND
+
+
+ ;//
+ ;// Decode a code of the form 0000...001 where there
+ ;// are N zeros before the 1 and N<=15 (code length<=16)
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the number of zeros before the next 1
+ ;// >=16 is an illegal code
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_CLZ16 $Symbol, $T1, $T2
+ MOVS $Symbol, $BitBuffer, LSL $BitCount ;// left align next bits (flags are overwritten by the SUBS below)
+ CLZ $Symbol, $Symbol ;// Symbol = number of leading zero bits
+ ADD $BitCount, $BitCount, $Symbol ;// account for the zeros (T2 is never referenced in this macro)
+ SUBS $BitCount, $BitCount, #7 ;// length is Symbol+1, so +1-8; CS if a first byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load first refill byte
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: CS again if a second byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo the last -8, for which no refill happened
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in second refill byte
+ MEND
+
+ ;//
+ ;// Decode a code of the form 1111...110 where there
+ ;// are N ones before the 0 and N<=15 (code length<=16)
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the number of ones before the next 0
+ ;// >=16 is an illegal code
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_CLO16 $Symbol, $T1, $T2
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left align next bits
+ MVN $Symbol, $Symbol ;// invert so the leading run of ones becomes a run of zeros
+ CLZ $Symbol, $Symbol ;// Symbol = number of leading one bits in the stream
+ ADD $BitCount, $BitCount, $Symbol ;// account for the ones (T2 is never referenced in this macro)
+ SUBS $BitCount, $BitCount, #7 ;// length is Symbol+1, so +1-8; CS if a first byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load first refill byte
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: CS again if a second byte can be refilled
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: load second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo the last -8, for which no refill happened
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge in second refill byte
+ MEND
+
+
+ ;//
+ ;// Variable Length Decode module
+ ;//
+ ;// Decodes one VLD Symbol from a bitstream and refill the bitstream
+ ;// buffer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pVLDTable - pointer to VLD decode table of 16-bit entries.
+ ;// The format is described above at the start of
+ ;// this file.
+ ;// $S0 - The number of bits to look up for the first step
+ ;// 1<=$S0<=8
+ ;// $S1 - The number of bits to look up for each subsequent
+ ;// step 1<=$S1<=$S0.
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - decoded VLD symbol value
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VLD $Symbol, $T1, $T2, $pVLDTable, $S0, $S1
+ ASSERT (1<=$S0):LAND:($S0<=8)
+ ASSERT (1<=$S1):LAND:($S1<=$S0)
+
+ ;// Note 0<=BitCount<=15 on entry and exit
+
+ MOVS $T1, $BitBuffer, LSL $BitCount ;// left align next bits
+ MOVS $Symbol, #(2<<$S0)-2 ;// create mask
+ AND $Symbol, $Symbol, $T1, LSR #(31-$S0) ;// 2*(next $S0 bits)
+ SUBS $BitCount, $BitCount, #8 ;// CS if buffer can be filled
+01
+ LDRCSB $T1, [$pBitStream], #1 ;// load refill byte
+ LDRH $Symbol, [$pVLDTable, $Symbol] ;// load table entry
+ ADDCC $BitCount, $BitCount, #8 ;// refill not possible
+ ADD $BitCount, $BitCount, #$S0 ;// assume $S0 bits used
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// merge in refill byte
+ MOVS $T1, $Symbol, LSR #1 ;// CS=leaf entry
+ BCS %FT02 ;// leaf entry: lookup finished
+
+ MOVS $T1, $BitBuffer, LSL $BitCount ;// left align next bit
+ IF (2*$S0-$S1<=8)
+ ;// Can combine refill check and -S0+S1 and keep $BitCount<=15
+ SUBS $BitCount, $BitCount, #8+($S0-$S1)
+ ELSE
+ ;// Separate refill check and -S0+S1 offset
+ SUBS $BitCount, $BitCount, #8
+ SUB $BitCount, $BitCount, #($S0-$S1) ;// plain SUB: keeps the refill flags set by the SUBS above
+ ENDIF
+ ADD $Symbol, $Symbol, $T1, LSR #(31-$S1) ;// add 2*(next $S1 bits) to
+ BIC $Symbol, $Symbol, #1 ;// table offset
+ B %BT01 ;// load next table entry
+02
+ ;// BitCount range now depend on the route here
+ ;// if (first step) S0 <= BitCount <= 7+S0 <=15
+ ;// else if (2*S0-S1<=8) S0 <= BitCount <= 7+(2*S0-S1) <=15
+ ;// else S1 <= BitCount <= 7+S1 <=15
+
+ SUB $BitCount, $BitCount, $Symbol, LSR#13 ;// give back entry[15:13] bits (presumably looked-up bits the code did not use -- confirm against the table format)
+ BIC $Symbol, $T1, #0xF000 ;// Symbol = entry[12:1] (T1 holds entry>>1)
+ MEND
+
+
+ ;// Add an offset number of bits
+ ;//
+ ;// Outputs destination byte and bit index values which correspond to an offset number of bits
+ ;// from the current location. This is used to compare bitstream positions using M_BD_CMP.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $Offset - Offset to be added in bits.
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $ByteIndex - Destination pBitStream pointer after adding the Offset.
+ ;// This value will be 4 byte ahead and needs to subtract by 4 to get exact
+ ;// pointer (as in M_BD_FINI). But for using with M_BD_CMP subtract is not needed.
+ ;// $BitIndex - Destination BitCount after the addition of Offset number of bits
+ ;//
+ MACRO
+ M_BD_ADD $ByteIndex, $BitIndex, $Offset
+
+ ;// ($ByteIndex,$BitIndex) = Current position + $Offset bits
+ ADD $Offset, $Offset, $BitCount ;// NOTE: Offset itself is overwritten with Offset+BitCount
+ AND $BitIndex, $Offset, #7 ;// bit position within the byte
+ ADD $ByteIndex, $pBitStream, $Offset, ASR #3 ;// whole bytes; ASR is sign-preserving, so a negative total moves the pointer backwards
+ MEND
+
+ ;// Move bitstream pointers to the location given
+ ;//
+ ;// Outputs destination byte and bit index values which corresponds to
+ ;// the current location given (calculated using M_BD_ADD).
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;// $ByteIndex - Destination pBitStream pointer after move.
+ ;// This value will be 4 byte ahead and needs to subtract by 4 to get exact
+ ;// pointer (as in M_BD_FINI).
+ ;// $BitIndex - Destination BitCount after the move
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $pBitStream \
+ ;// } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_MOV $ByteIndex, $BitIndex
+
+ ;// ($pBitStream, $BitCount) = ($ByteIndex,$BitIndex)
+ MOV $BitCount, $BitIndex ;// only the two position registers are written; BitBuffer is not reloaded here
+ MOV $pBitStream, $ByteIndex
+ MEND
+
+ ;// Bitstream Compare
+ ;//
+ ;// Compares bitstream position with that of a destination position. Destination position
+ ;// is held in two input registers which are calculated using M_BD_ADD macro
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $ByteIndex - Destination pBitStream pointer, (4 byte ahead as described in M_BD_ADD)
+ ;// $BitIndex - Destination BitCount
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// FLAGS - GE if destination is reached, LT if destination is ahead
+ ;// $T1 - corrupted temp/scratch register
+ ;//
+ MACRO
+ M_BD_CMP $ByteIndex, $BitIndex, $T1
+
+ ;// Return flags set by (current position)-($ByteIndex,$BitIndex)
+ ;// so GE means that we have reached the indicated position
+
+ ADD $T1, $pBitStream, $BitCount, LSR #3 ;// current byte position (no -4 applied, matching ByteIndex)
+ CMP $T1, $ByteIndex ;// compare byte positions first
+ AND $T1, $BitCount, #7 ;// current bit position within the byte (flags untouched)
+ CMPEQ $T1, $BitIndex ;// same byte: let the bit positions decide
+ MEND
+
+
+ ;// Bitstream Decode finalise
+ ;//
+ ;// Writes back the bitstream state to the bitstream pointers
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $ppBitStream - pointer to pointer to the next bitstream byte
+ ;// $pBitOffset - pointer to the number of bits used in the current byte (0..7)
+ ;// $pBitStream \
+ ;// $BitBuffer } these register are corrupted
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_FINI $ppBitStream, $pBitOffset
+
+ ;// Advance pointer by the number of free bits in the buffer
+ ADD $pBitStream, $pBitStream, $BitCount, LSR#3 ;// add the whole bytes counted in BitCount
+ AND $BitCount, $BitCount, #7 ;// remaining 0..7 bit offset
+
+ ;// Now move back 32 bits to reach the first unused bit
+ SUB $pBitStream, $pBitStream, #4
+
+ ;// Store out bitstream state
+ STR $BitCount, [$pBitOffset] ;// *pBitOffset = bit offset
+ STR $pBitStream, [$ppBitStream] ;// *ppBitStream = byte pointer
+ MEND
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
new file mode 100755
index 0000000..b699034
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
@@ -0,0 +1,212 @@
+/**
+ *
+ * File Name: armCOMM_Bitstream.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armCOMM_Bitstream.h
+ * Brief: Declares common API's/Data types used across the OpenMax Encoders/Decoders.
+ *
+ */
+
+#ifndef _armCodec_H_
+#define _armCodec_H_
+
+#include "omxtypes.h"
+
+typedef struct { /* Code length / code word pair: element type of the armUnPackVLC32 codebook and argument type of armPackVLC32 */
+ OMX_U8 codeLen; /* presumably the length of codeWord in bits -- TODO confirm */
+ OMX_U32 codeWord; /* presumably the code bits themselves -- TODO confirm alignment */
+} ARM_VLC32;
+
+/* The above should be renamed as "ARM_VLC32" */
+
+/**
+ * Function: armLookAheadBits()
+ *
+ * Description:
+ * Get the next N bits from the bitstream without advancing the bitstream pointer
+ *
+ * Parameters:
+ * [in] **ppBitStream
+ * [in] *pOffset
+ * [in] N=1...32
+ *
+ * Returns Value
+ */
+
+OMX_U32 armLookAheadBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armGetBits()
+ *
+ * Description:
+ * Read N bits from the bitstream
+ *
+ * Parameters:
+ * [in] *ppBitStream
+ * [in] *pOffset
+ * [in] N=1..32
+ *
+ * [out] *ppBitStream
+ * [out] *pOffset
+ * Returns Value
+ */
+
+OMX_U32 armGetBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armByteAlign()
+ *
+ * Description:
+ * Align the pointer *ppBitStream to the next byte boundary
+ *
+ * Parameters:
+ * [in] *ppBitStream
+ * [in] *pOffset
+ *
+ * [out] *ppBitStream
+ * [out] *pOffset
+ *
+ **/
+
+OMXVoid armByteAlign(const OMX_U8 **ppBitStream,OMX_INT *pOffset);
+
+/**
+ * Function: armSkipBits()
+ *
+ * Description:
+ * Skip N bits from the value at *ppBitStream
+ *
+ * Parameters:
+ * [in] *ppBitStream
+ * [in] *pOffset
+ * [in] N
+ *
+ * [out] *ppBitStream
+ * [out] *pOffset
+ *
+ **/
+
+OMXVoid armSkipBits(const OMX_U8 **ppBitStream,OMX_INT *pOffset,OMX_INT N);
+
+/***************************************
+ * Variable bit length Decode
+ ***************************************/
+
+/**
+ * Function: armUnPackVLC32()
+ *
+ * Description:
+ * Variable length decode of variable length symbol (max size 32 bits) read from
+ * the bit stream pointed by *ppBitStream at *pOffset by using the table
+ * pointed by pCodeBook
+ *
+ * Parameters:
+ * [in] **ppBitStream
+ * [in] *pOffset
+ * [in] pCodeBook
+ *
+ * [out] **ppBitStream
+ * [out] *pOffset
+ *
+ * Returns : Code Book Index if successful.
+ * : "ARM_NO_CODEBOOK_INDEX = 0xFFFF" if search fails.
+ **/
+
+#define ARM_NO_CODEBOOK_INDEX (OMX_U16)(0xFFFF)
+
+OMX_U16 armUnPackVLC32(
+ const OMX_U8 **ppBitStream,
+ OMX_INT *pOffset,
+ const ARM_VLC32 *pCodeBook
+);
+
+/***************************************
+ * Fixed bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackBits
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte
+ * in the bit stream.
+ * [in] pOffset pointer to the bit position in the byte
+ * pointed by *ppBitStream. Valid within 0
+ * to 7.
+ * [in] codeWord Code word that need to be inserted in to the
+ * bitstream
+ * [in] codeLength Length of the code word valid range 1...32
+ *
+ * [out] ppBitStream *ppBitStream is updated after the block is encoded,
+ * so that it points to the current byte in the bit
+ * stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armPackBits (
+ OMX_U8 **ppBitStream,
+ OMX_INT *pOffset,
+ OMX_U32 codeWord,
+ OMX_INT codeLength
+);
+
+/***************************************
+ * Variable bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackVLC32
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte
+ * in the bit stream.
+ * [in] pBitOffset pointer to the bit position in the byte
+ * pointed by *ppBitStream. Valid within 0
+ * to 7.
+ * [in] code VLC code word that need to be inserted in to the
+ * bitstream
+ *
+ * [out] ppBitStream *ppBitStream is updated after the block is encoded,
+ * so that it points to the current byte in the bit
+ * stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armPackVLC32 (
+ OMX_U8 **ppBitStream,
+ OMX_INT *pBitOffset,
+ ARM_VLC32 code
+);
+
+#endif /*_armCodec_H_*/
+
+/*End of File*/
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
new file mode 100755
index 0000000..e0cfdaa
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
@@ -0,0 +1,40 @@
+/**
+ *
+ *
+ * File Name: armCOMM_IDCTTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File : armCOMM_IDCTTable.h
+ * Description : Contains declarations of tables for IDCT calculation.
+ *
+ */
+
+#ifndef _armCOMM_IDCTTable_H_
+#define _armCOMM_IDCTTable_H_
+
+#include "omxtypes.h"
+
+ /* Table of s(u)*A(u)*A(v)/16 at Q15
+ * s(u)=1.0 0 <= u <= 5
+ * s(6)=2.0
+ * s(7)=4.0
+ * A(0) = 2*sqrt(2)
+ * A(u) = 4*cos(u*pi/16) for (u!=0)
+ */
+extern const OMX_U16 armCOMM_IDCTPreScale [64];
+extern const OMX_U16 armCOMM_IDCTCoef [4];
+
+#endif /* _armCOMM_IDCTTable_H_ */
+
+
+/* End of File */
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
new file mode 100755
index 0000000..0baa087
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
@@ -0,0 +1,1451 @@
+;//
+;// This confidential and proprietary software may be used only as
+;// authorised by a licensing agreement from ARM Limited
+;// (C) COPYRIGHT 2004 ARM Limited
+;// ALL RIGHTS RESERVED
+;// The entire notice above must be reproduced on all authorised
+;// copies and copies may only be made to the extent permitted
+;// by a licensing agreement from ARM Limited.
+;//
+;// IDCT_s.s
+;//
+;// Inverse DCT module
+;//
+;//
+;// ALGORITHM DESCRIPTION
+;//
+;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
+;// column and then a 1D IDCT for each row.
+;//
+;// The 8-point 1D IDCT is defined by
+;// f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
+;//
+;// C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
+;// c(u,x) = cos( (2x+1)*u*pi/16 )
+;//
+;// We compute the 8-point 1D IDCT using the reverse of
+;// the Arai-Agui-Nakajima flow graph which we split into
+;// 5 stages named in reverse order to identify with the
+;// forward DCT. Direct inversion of the forward formulae
+;// in file FDCT_s.s gives:
+;//
+;// IStage 5: j(u) = T(u)*A(u) [ A(u)=4*C(u)*c(u,0) ]
+;// [ A(0) = 2*sqrt(2)
+;// A(u) = 4*cos(u*pi/16) for (u!=0) ]
+;//
+;// IStage 4: i0 = j0 i1 = j4
+;// i3 = (j2+j6)/2 i2 = (j2-j6)/2
+;// i7 = (j5+j3)/2 i4 = (j5-j3)/2
+;// i5 = (j1+j7)/2 i6 = (j1-j7)/2
+;//
+;// IStage 3: h0 = (i0+i1)/2 h1 = (i0-i1)/2
+;// h2 = (i2*sqrt2)-i3 h3 = i3
+;// h4 = cos(pi/8)*i4 + sin(pi/8)*i6
+;// h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
+;// [ The above two lines rotate by -(pi/8) ]
+;// h5 = (i5-i7)/sqrt2 h7 = (i5+i7)/2
+;//
+;// IStage 2: g0 = (h0+h3)/2 g3 = (h0-h3)/2
+;// g1 = (h1+h2)/2 g2 = (h1-h2)/2
+;// g7 = h7 g6 = h6 - h7
+;// g5 = h5 - g6 g4 = h4 - g5
+;//
+;// IStage 1: f0 = (g0+g7)/2 f7 = (g0-g7)/2
+;// f1 = (g1+g6)/2 f6 = (g1-g6)/2
+;// f2 = (g2+g5)/2 f5 = (g2-g5)/2
+;// f3 = (g3+g4)/2 f4 = (g3-g4)/2
+;//
+;// Note that most coefficients are halved 3 times during the
+;// above calculation. We can rescale the algorithm dividing
+;// the input by 8 to remove the halvings.
+;//
+;// IStage 5: j(u) = T(u)*A(u)/8
+;//
+;// IStage 4: i0 = j0 i1 = j4
+;// i3 = j2 + j6 i2 = j2 - j6
+;// i7 = j5 + j3 i4 = j5 - j3
+;// i5 = j1 + j7 i6 = j1 - j7
+;//
+;// IStage 3: h0 = i0 + i1 h1 = i0 - i1
+;// h2 = (i2*sqrt2)-i3 h3 = i3
+;// h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
+;// h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
+;// h5 = (i5-i7)*sqrt2 h7 = i5 + i7
+;//
+;// IStage 2: g0 = h0 + h3 g3 = h0 - h3
+;// g1 = h1 + h2 g2 = h1 - h2
+;// g7 = h7 g6 = h6 - h7
+;// g5 = h5 - g6 g4 = h4 - g5
+;//
+;// IStage 1: f0 = g0 + g7 f7 = g0 - g7
+;// f1 = g1 + g6 f6 = g1 - g6
+;// f2 = g2 + g5 f5 = g2 - g5
+;// f3 = g3 + g4 f4 = g3 - g4
+;//
+;// Note:
+;// 1. The scaling by A(u)/8 can often be combined with inverse
+;// quantization. The column and row scalings can be combined.
+;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
+;// to the above code but is otherwise identical.
+;// 3. The rotation by -pi/8 can be performed using three multiplies
+;// Eg c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
+;// -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
+;// 4. If |T(u)|<=1 then from the IDCT definition,
+;// |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
+;// = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
+;// = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
+;// = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
+;// = (approx)2.64
+;// So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
+;// The table below shows input patterns generating the maximum
+;// value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
+;// InputPattern Max |f(x)|
+;// PPPPPPPP |f0| = 2.64
+;// PPPMMMMM |f1| = 2.64
+;// PPMMMPPP |f2| = 2.64
+;// PPMMPPMM |f3| = 2.64
+;// PMMPPMMP |f4| = 2.64
+;// PMMPMMPM |f5| = 2.64
+;// PMPPMPMP |f6| = 2.64
+;// PMPMPMPM |f7| = 2.64
+;// Note that this input pattern is the transpose of the
+;// corresponding max input pattern for the FDCT.
+
+;// Arguments
+
+pSrc RN 0 ;// source data buffer
+Stride RN 1 ;// destination stride in bytes
+pDest RN 2 ;// destination data buffer
+pScale RN 3 ;// pointer to scaling table
+
+
+ ;// DCT Inverse Macro
+ ;// The DCT code should be parametrized according
+ ;// to the following inputs:
+ ;// $outsize = "u8" : 8-bit unsigned data saturated (0 to +255)
+ ;// "s9" : 16-bit signed data saturated to 9-bit (-256 to +255)
+ ;// "s16" : 16-bit signed data not saturated (max size ~+/-14273)
+ ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
+ ;// "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
+ ;//
+ ;// Inputs:
+ ;// pSrc = r0 = Pointer to input data
+ ;// Range is -256 to +255 (9-bit)
+ ;// Stride = r1 = Stride between input lines
+ ;// pDest = r2 = Pointer to output data
+ ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
+
+
+
+ MACRO
+ M_IDCT $outsize, $inscale, $stride
+ LCLA SHIFT
+
+
+ IF ARM1136JS
+
+;// REGISTER ALLOCATION
+;// This is hard since we have 8 values, 9 free registers and each
+;// butterfly requires a temporary register. We also want to
+;// maintain register order so we can use LDM/STM. The table below
+;// summarises the register allocation that meets all these criteria.
+;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
+;//
+;// r1 a01 g0 h0
+;// r4 b01 f0 g1 h1 i0
+;// r5 a23 f1 g2 i1
+;// r6 b23 f2 g3 h2 i2
+;// r7 a45 f3 h3 i3
+;// r8 b45 f4 g4 h4 i4
+;// r9 a67 f5 g5 h5 i5
+;// r10 b67 f6 g6 h6 i6
+;// r11 f7 g7 h7 i7
+;//
+ra01 RN 1
+rb01 RN 4
+ra23 RN 5
+rb23 RN 6
+ra45 RN 7
+rb45 RN 8
+ra67 RN 9
+rb67 RN 10
+rtmp RN 11
+csPiBy8 RN 12 ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
+LoopRR2 RN 14 ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
+;// Transpose allocation
+xft RN ra01
+xf0 RN rb01
+xf1 RN ra23
+xf2 RN rb23
+xf3 RN ra45
+xf4 RN rb45
+xf5 RN ra67
+xf6 RN rb67
+xf7 RN rtmp
+;// IStage 1 allocation
+xg0 RN xft
+xg1 RN xf0
+xg2 RN xf1
+xg3 RN xf2
+xgt RN xf3
+xg4 RN xf4
+xg5 RN xf5
+xg6 RN xf6
+xg7 RN xf7
+;// IStage 2 allocation
+xh0 RN xg0
+xh1 RN xg1
+xht RN xg2
+xh2 RN xg3
+xh3 RN xgt
+xh4 RN xg4
+xh5 RN xg5
+xh6 RN xg6
+xh7 RN xg7
+;// IStage 3,4 allocation
+xit RN xh0
+xi0 RN xh1
+xi1 RN xht
+xi2 RN xh2
+xi3 RN xh3
+xi4 RN xh4
+xi5 RN xh5
+xi6 RN xh6
+xi7 RN xh7
+
+ M_STR pDest, ppDest
+ IF "$stride"="s"
+ M_STR Stride, pStride
+ ENDIF
+ M_ADR pDest, pBlk
+ LDR csPiBy8, =0x30fc7642
+ LDR LoopRR2, =0x00005a82
+
+v6_idct_col$_F
+ ;// Load even values
+ LDR xi4, [pSrc], #4 ;// j0
+ LDR xi5, [pSrc, #4*16-4] ;// j4
+ LDR xi6, [pSrc, #2*16-4] ;// j2
+ LDR xi7, [pSrc, #6*16-4] ;// j6
+
+ ;// Scale Even Values
+ IF "$inscale"="s16" ;// 16x16 mul
+SHIFT SETA 12
+ LDR xi0, [pScale], #4
+ LDR xi1, [pScale, #4*16-4]
+ LDR xi2, [pScale, #2*16-4]
+ MOV xit, #1<<(SHIFT-1)
+ SMLABB xi3, xi0, xi4, xit
+ SMLATT xi4, xi0, xi4, xit
+ SMLABB xi0, xi1, xi5, xit
+ SMLATT xi5, xi1, xi5, xit
+ MOV xi3, xi3, ASR #SHIFT
+ PKHBT xi4, xi3, xi4, LSL #(16-SHIFT)
+ LDR xi3, [pScale, #6*16-4]
+ SMLABB xi1, xi2, xi6, xit
+ SMLATT xi6, xi2, xi6, xit
+ MOV xi0, xi0, ASR #SHIFT
+ PKHBT xi5, xi0, xi5, LSL #(16-SHIFT)
+ SMLABB xi2, xi3, xi7, xit
+ SMLATT xi7, xi3, xi7, xit
+ MOV xi1, xi1, ASR #SHIFT
+ PKHBT xi6, xi1, xi6, LSL #(16-SHIFT)
+ MOV xi2, xi2, ASR #SHIFT
+ PKHBT xi7, xi2, xi7, LSL #(16-SHIFT)
+ ENDIF
+ IF "$inscale"="s32" ;// 32x16 mul
+SHIFT SETA (12+8-16)
+ MOV xit, #1<<(SHIFT-1)
+ LDR xi0, [pScale], #8
+ LDR xi1, [pScale, #0*32+4-8]
+ LDR xi2, [pScale, #4*32-8]
+ LDR xi3, [pScale, #4*32+4-8]
+ SMLAWB xi0, xi0, xi4, xit
+ SMLAWT xi1, xi1, xi4, xit
+ SMLAWB xi2, xi2, xi5, xit
+ SMLAWT xi3, xi3, xi5, xit
+ MOV xi0, xi0, ASR #SHIFT
+ PKHBT xi4, xi0, xi1, LSL #(16-SHIFT)
+ MOV xi2, xi2, ASR #SHIFT
+ PKHBT xi5, xi2, xi3, LSL #(16-SHIFT)
+ LDR xi0, [pScale, #2*32-8]
+ LDR xi1, [pScale, #2*32+4-8]
+ LDR xi2, [pScale, #6*32-8]
+ LDR xi3, [pScale, #6*32+4-8]
+ SMLAWB xi0, xi0, xi6, xit
+ SMLAWT xi1, xi1, xi6, xit
+ SMLAWB xi2, xi2, xi7, xit
+ SMLAWT xi3, xi3, xi7, xit
+ MOV xi0, xi0, ASR #SHIFT
+ PKHBT xi6, xi0, xi1, LSL #(16-SHIFT)
+ MOV xi2, xi2, ASR #SHIFT
+ PKHBT xi7, xi2, xi3, LSL #(16-SHIFT)
+ ENDIF
+
+ ;// Load odd values
+ LDR xi0, [pSrc, #1*16-4] ;// j1
+ LDR xi1, [pSrc, #7*16-4] ;// j7
+ LDR xi2, [pSrc, #5*16-4] ;// j5
+ LDR xi3, [pSrc, #3*16-4] ;// j3
+
+ IF {TRUE}
+ ;// shortcut if odd values 0
+ TEQ xi0, #0
+ TEQEQ xi1, #0
+ TEQEQ xi2, #0
+ TEQEQ xi3, #0
+ BEQ v6OddZero$_F
+ ENDIF
+
+ ;// Store scaled even values
+ STMIA pDest, {xi4, xi5, xi6, xi7}
+
+ ;// Scale odd values
+ IF "$inscale"="s16"
+ ;// Perform AAN Scale
+ LDR xi4, [pScale, #1*16-4]
+ LDR xi5, [pScale, #7*16-4]
+ LDR xi6, [pScale, #5*16-4]
+ SMLABB xi7, xi0, xi4, xit
+ SMLATT xi0, xi0, xi4, xit
+ SMLABB xi4, xi1, xi5, xit
+ SMLATT xi1, xi1, xi5, xit
+ MOV xi7, xi7, ASR #SHIFT
+ PKHBT xi0, xi7, xi0, LSL #(16-SHIFT)
+ LDR xi7, [pScale, #3*16-4]
+ SMLABB xi5, xi2, xi6, xit
+ SMLATT xi2, xi2, xi6, xit
+ MOV xi4, xi4, ASR #SHIFT
+ PKHBT xi1, xi4, xi1, LSL #(16-SHIFT)
+ SMLABB xi6, xi3, xi7, xit
+ SMLATT xi3, xi3, xi7, xit
+ MOV xi5, xi5, ASR #SHIFT
+ PKHBT xi2, xi5, xi2, LSL #(16-SHIFT)
+ MOV xi6, xi6, ASR #SHIFT
+ PKHBT xi3, xi6, xi3, LSL #(16-SHIFT)
+ ENDIF
+ IF "$inscale"="s32" ;// 32x16 mul
+ LDR xi4, [pScale, #1*32-8]
+ LDR xi5, [pScale, #1*32+4-8]
+ LDR xi6, [pScale, #7*32-8]
+ LDR xi7, [pScale, #7*32+4-8]
+ SMLAWB xi4, xi4, xi0, xit
+ SMLAWT xi5, xi5, xi0, xit
+ SMLAWB xi6, xi6, xi1, xit
+ SMLAWT xi7, xi7, xi1, xit
+ MOV xi4, xi4, ASR #SHIFT
+ PKHBT xi0, xi4, xi5, LSL #(16-SHIFT)
+ MOV xi6, xi6, ASR #SHIFT
+ PKHBT xi1, xi6, xi7, LSL #(16-SHIFT)
+ LDR xi4, [pScale, #5*32-8]
+ LDR xi5, [pScale, #5*32+4-8]
+ LDR xi6, [pScale, #3*32-8]
+ LDR xi7, [pScale, #3*32+4-8]
+ SMLAWB xi4, xi4, xi2, xit
+ SMLAWT xi5, xi5, xi2, xit
+ SMLAWB xi6, xi6, xi3, xit
+ SMLAWT xi7, xi7, xi3, xit
+ MOV xi4, xi4, ASR #SHIFT
+ PKHBT xi2, xi4, xi5, LSL #(16-SHIFT)
+ MOV xi6, xi6, ASR #SHIFT
+ PKHBT xi3, xi6, xi7, LSL #(16-SHIFT)
+ ENDIF
+
+ LDR xit, =0x00010001 ;// rounding constant
+ SADD16 xi5, xi0, xi1 ;// (j1+j7)/2
+ SHADD16 xi5, xi5, xit
+
+ SSUB16 xi6, xi0, xi1 ;// j1-j7
+ SADD16 xi7, xi2, xi3 ;// (j5+j3)/2
+ SHADD16 xi7, xi7, xit
+
+ SSUB16 xi4, xi2, xi3 ;// j5-j3
+
+ SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
+
+ PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
+ PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
+
+ SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
+ SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
+ SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
+ SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
+
+ SMULBB xi1, xi3, LoopRR2
+ SMULTB xi3, xi3, LoopRR2
+
+ PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
+ PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
+ SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
+
+ ;// xi0,xi1,xi2,xi3 now free
+ ;// IStage 4,3, rows 2to3 x1/2
+
+ MOV xi3, xi3, LSL #1
+ PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
+ LDRD xi0, [pDest, #8] ;// j2,j6 scaled
+
+ ;// IStage 2, rows4to7
+ SSUB16 xg6, xh6, xh7
+ SSUB16 xg5, xh5, xg6
+ SSUB16 xg4, xh4, xg5
+
+ SSUB16 xi2, xi0, xi1 ;// (j2-j6)
+
+ SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
+
+ SMULBB xi0, xi2, LoopRR2
+ SMULTB xi2, xi2, LoopRR2
+
+ MOV xi2, xi2, LSL #1
+ PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
+
+ ;// xi0, xi1 now free
+ ;// IStage 4,3 rows 0to1 x 1/2
+ LDRD xi0, [pDest] ;// j0, j4 scaled
+ SSUB16 xh2, xh2, xi3
+ ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
+
+ SHADD16 xh0, xi0, xi1
+ SHSUB16 xh1, xi0, xi1
+
+ ;// IStage 2 rows 0to3 x 1/2
+ SHSUB16 xg2, xh1, xh2
+ SHADD16 xg1, xh1, xh2
+ SHSUB16 xg3, xh0, xh3
+ SHADD16 xg0, xh0, xh3
+
+ ;// IStage 1 all rows
+ SADD16 xf3, xg3, xg4
+ SSUB16 xf4, xg3, xg4
+ SADD16 xf2, xg2, xg5
+ SSUB16 xf5, xg2, xg5
+ SADD16 xf1, xg1, xg6
+ SSUB16 xf6, xg1, xg6
+ SADD16 xf0, xg0, xg7
+ SSUB16 xf7, xg0, xg7
+
+ ;// Transpose, store and loop
+ PKHBT ra01, xf0, xf1, LSL #16
+ PKHTB rb01, xf1, xf0, ASR #16
+
+ PKHBT ra23, xf2, xf3, LSL #16
+ PKHTB rb23, xf3, xf2, ASR #16
+
+ PKHBT ra45, xf4, xf5, LSL #16
+ PKHTB rb45, xf5, xf4, ASR #16
+
+ PKHBT ra67, xf6, xf7, LSL #16
+ STMIA pDest!, {ra01, ra23, ra45, ra67}
+ PKHTB rb67, xf7, xf6, ASR #16
+ STMIA pDest!, {rb01, rb23, rb45, rb67}
+ BCC v6_idct_col$_F
+
+ SUB pSrc, pDest, #(64*2)
+ M_LDR pDest, ppDest
+ IF "$stride"="s"
+ M_LDR pScale, pStride
+ ENDIF
+ B v6_idct_row$_F
+
+v6OddZero$_F
+ SSUB16 xi2, xi6, xi7 ;// (j2-j6)
+ SHADD16 xi3, xi6, xi7 ;// (j2+j6)/2
+
+ SMULBB xi0, xi2, LoopRR2
+ SMULTB xi2, xi2, LoopRR2
+
+ MOV xi2, xi2, LSL #1
+ PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
+ SSUB16 xh2, xh2, xi3
+
+ ;// xi0, xi1 now free
+ ;// IStage 4,3 rows 0to1 x 1/2
+
+ SHADD16 xh0, xi4, xi5
+ SHSUB16 xh1, xi4, xi5
+
+ ;// IStage 2 rows 0to3 x 1/2
+ SHSUB16 xg2, xh1, xh2
+ SHADD16 xg1, xh1, xh2
+ SHSUB16 xg3, xh0, xh3
+ SHADD16 xg0, xh0, xh3
+
+ ;// IStage 1 all rows
+ MOV xf3, xg3
+ MOV xf4, xg3
+ MOV xf2, xg2
+ MOV xf5, xg2
+ MOV xf1, xg1
+ MOV xf6, xg1
+ MOV xf0, xg0
+ MOV xf7, xg0
+
+ ;// Transpose
+ PKHBT ra01, xf0, xf1, LSL #16
+ PKHTB rb01, xf1, xf0, ASR #16
+
+ PKHBT ra23, xf2, xf3, LSL #16
+ PKHTB rb23, xf3, xf2, ASR #16
+
+ PKHBT ra45, xf4, xf5, LSL #16
+ PKHTB rb45, xf5, xf4, ASR #16
+
+ PKHBT ra67, xf6, xf7, LSL #16
+ PKHTB rb67, xf7, xf6, ASR #16
+
+ STMIA pDest!, {ra01, ra23, ra45, ra67}
+ ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
+ STMIA pDest!, {rb01, rb23, rb45, rb67}
+
+ BCC v6_idct_col$_F
+ SUB pSrc, pDest, #(64*2)
+ M_LDR pDest, ppDest
+ IF "$stride"="s"
+ M_LDR pScale, pStride
+ ENDIF
+
+
+v6_idct_row$_F
+ ;// IStage 4,3, rows4to7 x1/4
+ LDR xit, =0x00010001 ;// rounding constant
+ LDR xi0, [pSrc, #1*16] ;// j1
+ LDR xi1, [pSrc, #7*16] ;// 4*j7
+ LDR xi2, [pSrc, #5*16] ;// j5
+ LDR xi3, [pSrc, #3*16] ;// j3
+
+ SHADD16 xi1, xi1, xit ;// 2*j7
+ SHADD16 xi1, xi1, xit ;// j7
+
+ SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2
+ SSUB16 xi6, xi0, xi1 ;// j1-j7
+ SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2
+ SSUB16 xi4, xi2, xi3 ;// j5-j3
+
+ SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
+
+ PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
+ PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
+
+ SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
+ SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
+ SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
+ SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
+
+ SMULBB xi1, xi3, LoopRR2
+ SMULTB xi3, xi3, LoopRR2
+
+ PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
+ PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
+ SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
+
+ MOV xi3, xi3, LSL #1
+ PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
+
+ ;// xi0,xi1,xi2,xi3 now free
+ ;// IStage 4,3, rows 2to3 x1/2
+
+ LDR xi0, [pSrc, #2*16] ;// j2
+ LDR xi1, [pSrc, #6*16] ;// 2*j6
+
+ ;// IStage 2, rows4to7
+ SSUB16 xg6, xh6, xh7
+ SSUB16 xg5, xh5, xg6
+ SSUB16 xg4, xh4, xg5
+
+ SHADD16 xi1, xi1, xit ;// j6
+ SSUB16 xi2, xi0, xi1 ;// (j2-j6)
+ SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
+
+ SMULBB xi0, xi2, LoopRR2
+ SMULTB xi2, xi2, LoopRR2
+
+ MOV xi2, xi2, LSL #1
+
+ PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
+
+ ;// xi0, xi1 now free
+ ;// IStage 4,3 rows 0to1 x 1/2
+ LDR xi1, [pSrc, #4*16] ;// j4
+ LDR xi0, [pSrc], #4 ;// j0
+
+ SSUB16 xh2, xh2, xi3
+ ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
+
+ ADD xi0, xi0, xit, LSL #2 ;// ensure correct round
+ SHADD16 xh0, xi0, xi1 ;// of DC result
+ SHSUB16 xh1, xi0, xi1
+
+ ;// IStage 2 rows 0to3 x 1/2
+ SHSUB16 xg2, xh1, xh2
+ SHADD16 xg1, xh1, xh2
+ SHSUB16 xg3, xh0, xh3
+ SHADD16 xg0, xh0, xh3
+
+ ;// IStage 1 all rows
+ SHADD16 xf3, xg3, xg4
+ SHSUB16 xf4, xg3, xg4
+ SHADD16 xf2, xg2, xg5
+ SHSUB16 xf5, xg2, xg5
+ SHADD16 xf1, xg1, xg6
+ SHSUB16 xf6, xg1, xg6
+ SHADD16 xf0, xg0, xg7
+ SHSUB16 xf7, xg0, xg7
+
+ ;// Saturate
+ IF ("$outsize"="u8")
+ USAT16 xf0, #8, xf0
+ USAT16 xf1, #8, xf1
+ USAT16 xf2, #8, xf2
+ USAT16 xf3, #8, xf3
+ USAT16 xf4, #8, xf4
+ USAT16 xf5, #8, xf5
+ USAT16 xf6, #8, xf6
+ USAT16 xf7, #8, xf7
+ ENDIF
+ IF ("$outsize"="s9")
+ SSAT16 xf0, #9, xf0
+ SSAT16 xf1, #9, xf1
+ SSAT16 xf2, #9, xf2
+ SSAT16 xf3, #9, xf3
+ SSAT16 xf4, #9, xf4
+ SSAT16 xf5, #9, xf5
+ SSAT16 xf6, #9, xf6
+ SSAT16 xf7, #9, xf7
+ ENDIF
+
+ ;// Transpose to Row, Pack and store
+ IF ("$outsize"="u8")
+ ORR xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
+ ORR xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
+ ORR xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
+ ORR xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
+ PKHBT ra01, xf0, xf2, LSL #16
+ PKHTB rb01, xf2, xf0, ASR #16
+ PKHBT ra23, xf4, xf6, LSL #16
+ PKHTB rb23, xf6, xf4, ASR #16
+ STMIA pDest, {ra01, ra23}
+ IF "$stride"="s"
+ ADD pDest, pDest, pScale
+ STMIA pDest, {rb01, rb23}
+ ADD pDest, pDest, pScale
+ ELSE
+ ADD pDest, pDest, #($stride)
+ STMIA pDest, {rb01, rb23}
+ ADD pDest, pDest, #($stride)
+ ENDIF
+ ENDIF
+ IF ("$outsize"="s9"):LOR:("$outsize"="s16")
+ PKHBT ra01, xf0, xf1, LSL #16
+ PKHTB rb01, xf1, xf0, ASR #16
+
+ PKHBT ra23, xf2, xf3, LSL #16
+ PKHTB rb23, xf3, xf2, ASR #16
+
+ PKHBT ra45, xf4, xf5, LSL #16
+ PKHTB rb45, xf5, xf4, ASR #16
+
+ PKHBT ra67, xf6, xf7, LSL #16
+ PKHTB rb67, xf7, xf6, ASR #16
+
+ STMIA pDest, {ra01, ra23, ra45, ra67}
+ IF "$stride"="s"
+ ADD pDest, pDest, pScale
+ STMIA pDest, {rb01, rb23, rb45, rb67}
+ ADD pDest, pDest, pScale
+ ELSE
+ ADD pDest, pDest, #($stride)
+ STMIA pDest, {rb01, rb23, rb45, rb67}
+ ADD pDest, pDest, #($stride)
+ ENDIF
+ ENDIF
+
+ BCC v6_idct_row$_F
+ ENDIF ;// ARM1136JS
+
+
+ IF CortexA8
+
+Src0 EQU 7
+Src1 EQU 8
+Src2 EQU 9
+Src3 EQU 10
+Src4 EQU 11
+Src5 EQU 12
+Src6 EQU 13
+Src7 EQU 14
+Tmp EQU 15
+
+qXj0 QN Src0.S16
+qXj1 QN Src1.S16
+qXj2 QN Src2.S16
+qXj3 QN Src3.S16
+qXj4 QN Src4.S16
+qXj5 QN Src5.S16
+qXj6 QN Src6.S16
+qXj7 QN Src7.S16
+qXjt QN Tmp.S16
+
+dXj0lo DN (Src0*2).S16
+dXj0hi DN (Src0*2+1).S16
+dXj1lo DN (Src1*2).S16
+dXj1hi DN (Src1*2+1).S16
+dXj2lo DN (Src2*2).S16
+dXj2hi DN (Src2*2+1).S16
+dXj3lo DN (Src3*2).S16
+dXj3hi DN (Src3*2+1).S16
+dXj4lo DN (Src4*2).S16
+dXj4hi DN (Src4*2+1).S16
+dXj5lo DN (Src5*2).S16
+dXj5hi DN (Src5*2+1).S16
+dXj6lo DN (Src6*2).S16
+dXj6hi DN (Src6*2+1).S16
+dXj7lo DN (Src7*2).S16
+dXj7hi DN (Src7*2+1).S16
+dXjtlo DN (Tmp*2).S16
+dXjthi DN (Tmp*2+1).S16
+
+qXi0 QN qXj0
+qXi1 QN qXj4
+qXi2 QN qXj2
+qXi3 QN qXj7
+qXi4 QN qXj5
+qXi5 QN qXjt
+qXi6 QN qXj1
+qXi7 QN qXj6
+qXit QN qXj3
+
+dXi0lo DN dXj0lo
+dXi0hi DN dXj0hi
+dXi1lo DN dXj4lo
+dXi1hi DN dXj4hi
+dXi2lo DN dXj2lo
+dXi2hi DN dXj2hi
+dXi3lo DN dXj7lo
+dXi3hi DN dXj7hi
+dXi4lo DN dXj5lo
+dXi4hi DN dXj5hi
+dXi5lo DN dXjtlo
+dXi5hi DN dXjthi
+dXi6lo DN dXj1lo
+dXi6hi DN dXj1hi
+dXi7lo DN dXj6lo
+dXi7hi DN dXj6hi
+dXitlo DN dXj3lo
+dXithi DN dXj3hi
+
+qXh0 QN qXit
+qXh1 QN qXi0
+qXh2 QN qXi2
+qXh3 QN qXi3
+qXh4 QN qXi7
+qXh5 QN qXi5
+qXh6 QN qXi4
+qXh7 QN qXi1
+qXht QN qXi6
+
+dXh0lo DN dXitlo
+dXh0hi DN dXithi
+dXh1lo DN dXi0lo
+dXh1hi DN dXi0hi
+dXh2lo DN dXi2lo
+dXh2hi DN dXi2hi
+dXh3lo DN dXi3lo
+dXh3hi DN dXi3hi
+dXh4lo DN dXi7lo
+dXh4hi DN dXi7hi
+dXh5lo DN dXi5lo
+dXh5hi DN dXi5hi
+dXh6lo DN dXi4lo
+dXh6hi DN dXi4hi
+dXh7lo DN dXi1lo
+dXh7hi DN dXi1hi
+dXhtlo DN dXi6lo
+dXhthi DN dXi6hi
+
+qXg0 QN qXh2
+qXg1 QN qXht
+qXg2 QN qXh1
+qXg3 QN qXh0
+qXg4 QN qXh4
+qXg5 QN qXh5
+qXg6 QN qXh6
+qXg7 QN qXh7
+qXgt QN qXh3
+
+qXf0 QN qXg6
+qXf1 QN qXg5
+qXf2 QN qXg4
+qXf3 QN qXgt
+qXf4 QN qXg3
+qXf5 QN qXg2
+qXf6 QN qXg1
+qXf7 QN qXg0
+qXft QN qXg7
+
+
+qXt0 QN 1.S32
+qXt1 QN 2.S32
+qT0lo QN 1.S32
+qT0hi QN 2.S32
+qT1lo QN 3.S32
+qT1hi QN 4.S32
+qScalelo QN 5.S32 ;// used to read post scale values
+qScalehi QN 6.S32
+qTemp0 QN 5.S32
+qTemp1 QN 6.S32
+
+
+Scale1 EQU 6
+Scale2 EQU 15
+qScale1 QN Scale1.S16
+qScale2 QN Scale2.S16
+dScale1lo DN (Scale1*2).S16
+dScale1hi DN (Scale1*2+1).S16
+dScale2lo DN (Scale2*2).S16
+dScale2hi DN (Scale2*2+1).S16
+
+dCoefs DN 0.S16 ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
+InvSqrt2 DN dCoefs[0] ;// 1/sqrt(2) in Q15
+S DN dCoefs[1] ;// Sin(PI/8) in Q15
+C DN dCoefs[2] ;// Cos(PI/8) in Q15
+
+pTemp RN 12
+
+
+ IMPORT armCOMM_IDCTCoef
+
+ VLD1 {qXj0,qXj1}, [pSrc @64]!
+ VLD1 {qXj2,qXj3}, [pSrc @64]!
+ VLD1 {qXj4,qXj5}, [pSrc @64]!
+ VLD1 {qXj6,qXj7}, [pSrc @64]!
+
+ ;// Load PreScale and multiply with Src
+ ;// IStage 4
+
+ IF "$inscale"="s16" ;// 16X16 Mul
+ M_IDCT_PRESCALE16
+ ENDIF
+
+ IF "$inscale"="s32" ;// 32X32 Mul
+ M_IDCT_PRESCALE32
+ ENDIF
+
+ ;// IStage 3
+ VQDMULH qXi2, qXi2, InvSqrt2 ;// i2/sqrt(2)
+ VHADD qXh0, qXi0, qXi1 ;// (i0+i1)/2
+ VHSUB qXh1, qXi0, qXi1 ;// (i0-i1)/2
+ VHADD qXh7, qXi5, qXi7 ;// (i5+i7)/4
+ VSUB qXh5, qXi5, qXi7 ;// (i5-i7)/2
+ VQDMULH qXh5, qXh5, InvSqrt2 ;// h5/sqrt(2)
+ VSUB qXh2, qXi2, qXi3 ;// h2, h3
+
+ VMULL qXt0, dXi4lo, C ;// c*i4
+ VMLAL qXt0, dXi6lo, S ;// c*i4+s*i6
+ VMULL qXt1, dXi4hi, C
+ VMLAL qXt1, dXi6hi, S
+ VSHRN dXh4lo, qXt0, #16 ;// h4
+ VSHRN dXh4hi, qXt1, #16
+
+ VMULL qXt0, dXi6lo, C ;// c*i6
+ VMLSL qXt0, dXi4lo, S ;// -s*i4 + c*h6
+ VMULL qXt1, dXi6hi, C
+ VMLSL qXt1, dXi4hi, S
+ VSHRN dXh6lo, qXt0, #16 ;// h6
+ VSHRN dXh6hi, qXt1, #16
+
+ ;// IStage 2
+ VSUB qXg6, qXh6, qXh7
+ VSUB qXg5, qXh5, qXg6
+ VSUB qXg4, qXh4, qXg5
+ VHADD qXg1, qXh1, qXh2 ;// (h1+h2)/2
+ VHSUB qXg2, qXh1, qXh2 ;// (h1-h2)/2
+ VHADD qXg0, qXh0, qXh3 ;// (h0+h3)/2
+ VHSUB qXg3, qXh0, qXh3 ;// (h0-h3)/2
+
+ ;// IStage 1 all rows
+ VADD qXf3, qXg3, qXg4
+ VSUB qXf4, qXg3, qXg4
+ VADD qXf2, qXg2, qXg5
+ VSUB qXf5, qXg2, qXg5
+ VADD qXf1, qXg1, qXg6
+ VSUB qXf6, qXg1, qXg6
+ VADD qXf0, qXg0, qXg7
+ VSUB qXf7, qXg0, qXg7
+
+ ;// Transpose, store and loop
+XTR0 EQU Src5
+XTR1 EQU Tmp
+XTR2 EQU Src6
+XTR3 EQU Src7
+XTR4 EQU Src3
+XTR5 EQU Src0
+XTR6 EQU Src1
+XTR7 EQU Src2
+XTRt EQU Src4
+
+qA0 QN XTR0.S32 ;// for XTRpose
+qA1 QN XTR1.S32
+qA2 QN XTR2.S32
+qA3 QN XTR3.S32
+qA4 QN XTR4.S32
+qA5 QN XTR5.S32
+qA6 QN XTR6.S32
+qA7 QN XTR7.S32
+
+dB0 DN XTR0*2+1 ;// for using VSWP
+dB1 DN XTR1*2+1
+dB2 DN XTR2*2+1
+dB3 DN XTR3*2+1
+dB4 DN XTR4*2
+dB5 DN XTR5*2
+dB6 DN XTR6*2
+dB7 DN XTR7*2
+
+
+ VTRN qXf0, qXf1
+ VTRN qXf2, qXf3
+ VTRN qXf4, qXf5
+ VTRN qXf6, qXf7
+ VTRN qA0, qA2
+ VTRN qA1, qA3
+ VTRN qA4, qA6
+ VTRN qA5, qA7
+ VSWP dB0, dB4
+ VSWP dB1, dB5
+ VSWP dB2, dB6
+ VSWP dB3, dB7
+
+
+qYj0 QN qXf0
+qYj1 QN qXf1
+qYj2 QN qXf2
+qYj3 QN qXf3
+qYj4 QN qXf4
+qYj5 QN qXf5
+qYj6 QN qXf6
+qYj7 QN qXf7
+qYjt QN qXft
+
+dYj0lo DN (XTR0*2).S16
+dYj0hi DN (XTR0*2+1).S16
+dYj1lo DN (XTR1*2).S16
+dYj1hi DN (XTR1*2+1).S16
+dYj2lo DN (XTR2*2).S16
+dYj2hi DN (XTR2*2+1).S16
+dYj3lo DN (XTR3*2).S16
+dYj3hi DN (XTR3*2+1).S16
+dYj4lo DN (XTR4*2).S16
+dYj4hi DN (XTR4*2+1).S16
+dYj5lo DN (XTR5*2).S16
+dYj5hi DN (XTR5*2+1).S16
+dYj6lo DN (XTR6*2).S16
+dYj6hi DN (XTR6*2+1).S16
+dYj7lo DN (XTR7*2).S16
+dYj7hi DN (XTR7*2+1).S16
+dYjtlo DN (XTRt*2).S16
+dYjthi DN (XTRt*2+1).S16
+
+qYi0 QN qYj0
+qYi1 QN qYj4
+qYi2 QN qYj2
+qYi3 QN qYj7
+qYi4 QN qYj5
+qYi5 QN qYjt
+qYi6 QN qYj1
+qYi7 QN qYj6
+qYit QN qYj3
+
+dYi0lo DN dYj0lo
+dYi0hi DN dYj0hi
+dYi1lo DN dYj4lo
+dYi1hi DN dYj4hi
+dYi2lo DN dYj2lo
+dYi2hi DN dYj2hi
+dYi3lo DN dYj7lo
+dYi3hi DN dYj7hi
+dYi4lo DN dYj5lo
+dYi4hi DN dYj5hi
+dYi5lo DN dYjtlo
+dYi5hi DN dYjthi
+dYi6lo DN dYj1lo
+dYi6hi DN dYj1hi
+dYi7lo DN dYj6lo
+dYi7hi DN dYj6hi
+dYitlo DN dYj3lo
+dYithi DN dYj3hi
+
+qYh0 QN qYit
+qYh1 QN qYi0
+qYh2 QN qYi2
+qYh3 QN qYi3
+qYh4 QN qYi7
+qYh5 QN qYi5
+qYh6 QN qYi4
+qYh7 QN qYi1
+qYht QN qYi6
+
+dYh0lo DN dYitlo
+dYh0hi DN dYithi
+dYh1lo DN dYi0lo
+dYh1hi DN dYi0hi
+dYh2lo DN dYi2lo
+dYh2hi DN dYi2hi
+dYh3lo DN dYi3lo
+dYh3hi DN dYi3hi
+dYh4lo DN dYi7lo
+dYh4hi DN dYi7hi
+dYh5lo DN dYi5lo
+dYh5hi DN dYi5hi
+dYh6lo DN dYi4lo
+dYh6hi DN dYi4hi
+dYh7lo DN dYi1lo
+dYh7hi DN dYi1hi
+dYhtlo DN dYi6lo
+dYhthi DN dYi6hi
+
+qYg0 QN qYh2
+qYg1 QN qYht
+qYg2 QN qYh1
+qYg3 QN qYh0
+qYg4 QN qYh4
+qYg5 QN qYh5
+qYg6 QN qYh6
+qYg7 QN qYh7
+qYgt QN qYh3
+
+qYf0 QN qYg6
+qYf1 QN qYg5
+qYf2 QN qYg4
+qYf3 QN qYgt
+qYf4 QN qYg3
+qYf5 QN qYg2
+qYf6 QN qYg1
+qYf7 QN qYg0
+qYft QN qYg7
+
+ VRSHR qYj7, qYj7, #2
+ VRSHR qYj6, qYj6, #1
+
+ VHADD qYi5, qYj1, qYj7 ;// i5 = (j1+j7)/2
+ VSUB qYi6, qYj1, qYj7 ;// i6 = j1-j7
+ VHADD qYi3, qYj2, qYj6 ;// i3 = (j2+j6)/2
+ VSUB qYi2, qYj2, qYj6 ;// i2 = j2-j6
+ VHADD qYi7, qYj5, qYj3 ;// i7 = (j5+j3)/2
+ VSUB qYi4, qYj5, qYj3 ;// i4 = j5-j3
+
+ VQDMULH qYi2, qYi2, InvSqrt2 ;// i2/sqrt(2)
+ ;// IStage 4,3 rows 0to1 x 1/2
+
+ MOV pTemp, #0x4 ;// ensure correct round
+ VDUP qScale1, pTemp ;// of DC result
+ VADD qYi0, qYi0, qScale1
+
+ VHADD qYh0, qYi0, qYi1 ;// (i0+i1)/2
+ VHSUB qYh1, qYi0, qYi1 ;// (i0-i1)/2
+
+ VHADD qYh7, qYi5, qYi7 ;// (i5+i7)/4
+ VSUB qYh5, qYi5, qYi7 ;// (i5-i7)/2
+ VSUB qYh2, qYi2, qYi3 ;// h2, h3
+ VQDMULH qYh5, qYh5, InvSqrt2 ;// h5/sqrt(2)
+
+ VMULL qXt0, dYi4lo, C ;// c*i4
+ VMLAL qXt0, dYi6lo, S ;// c*i4+s*i6
+ VMULL qXt1, dYi4hi, C
+ VMLAL qXt1, dYi6hi, S
+ VSHRN dYh4lo, qXt0, #16 ;// h4
+ VSHRN dYh4hi, qXt1, #16
+
+ VMULL qXt0, dYi6lo, C ;// c*i6
+ VMLSL qXt0, dYi4lo, S ;// -s*i4 + c*h6
+ VMULL qXt1, dYi6hi, C
+ VMLSL qXt1, dYi4hi, S
+ VSHRN dYh6lo, qXt0, #16 ;// h6
+ VSHRN dYh6hi, qXt1, #16
+
+ VSUB qYg6, qYh6, qYh7
+ VSUB qYg5, qYh5, qYg6
+ VSUB qYg4, qYh4, qYg5
+
+ ;// IStage 2 rows 0to3 x 1/2
+ VHADD qYg1, qYh1, qYh2 ;// (h1+h2)/2
+ VHSUB qYg2, qYh1, qYh2 ;// (h1-h2)/2
+ VHADD qYg0, qYh0, qYh3 ;// (h0+h3)/2
+ VHSUB qYg3, qYh0, qYh3 ;// (h0-h3)/2
+
+
+ ;// IStage 1 all rows
+ VHADD qYf3, qYg3, qYg4
+ VHSUB qYf4, qYg3, qYg4
+ VHADD qYf2, qYg2, qYg5
+ VHSUB qYf5, qYg2, qYg5
+ VHADD qYf1, qYg1, qYg6
+ VHSUB qYf6, qYg1, qYg6
+ VHADD qYf0, qYg0, qYg7
+ VHSUB qYf7, qYg0, qYg7
+
+YTR0 EQU Src0
+YTR1 EQU Src4
+YTR2 EQU Src1
+YTR3 EQU Src2
+YTR4 EQU Src7
+YTR5 EQU Src5
+YTR6 EQU Tmp
+YTR7 EQU Src6
+YTRt EQU Src3
+
+qC0 QN YTR0.S32 ;// for YTRpose
+qC1 QN YTR1.S32
+qC2 QN YTR2.S32
+qC3 QN YTR3.S32
+qC4 QN YTR4.S32
+qC5 QN YTR5.S32
+qC6 QN YTR6.S32
+qC7 QN YTR7.S32
+
+dD0 DN YTR0*2+1 ;// for using VSWP
+dD1 DN YTR1*2+1
+dD2 DN YTR2*2+1
+dD3 DN YTR3*2+1
+dD4 DN YTR4*2
+dD5 DN YTR5*2
+dD6 DN YTR6*2
+dD7 DN YTR7*2
+
+ VTRN qYf0, qYf1
+ VTRN qYf2, qYf3
+ VTRN qYf4, qYf5
+ VTRN qYf6, qYf7
+ VTRN qC0, qC2
+ VTRN qC1, qC3
+ VTRN qC4, qC6
+ VTRN qC5, qC7
+ VSWP dD0, dD4
+ VSWP dD1, dD5
+ VSWP dD2, dD6
+ VSWP dD3, dD7
+
+
+dYf0U8 DN YTR0*2.U8
+dYf1U8 DN YTR1*2.U8
+dYf2U8 DN YTR2*2.U8
+dYf3U8 DN YTR3*2.U8
+dYf4U8 DN YTR4*2.U8
+dYf5U8 DN YTR5*2.U8
+dYf6U8 DN YTR6*2.U8
+dYf7U8 DN YTR7*2.U8
+
+ ;//
+ ;// Do saturation if outsize is other than S16
+ ;//
+
+ IF ("$outsize"="u8")
+ ;// Output range [0-255]
+ VQMOVN dYf0U8, qYf0
+ VQMOVN dYf1U8, qYf1
+ VQMOVN dYf2U8, qYf2
+ VQMOVN dYf3U8, qYf3
+ VQMOVN dYf4U8, qYf4
+ VQMOVN dYf5U8, qYf5
+ VQMOVN dYf6U8, qYf6
+ VQMOVN dYf7U8, qYf7
+ ENDIF
+
+ IF ("$outsize"="s9")
+ ;// Output range [-256 to +255]
+ VQSHL qYf0, qYf0, #16-9
+ VQSHL qYf1, qYf1, #16-9
+ VQSHL qYf2, qYf2, #16-9
+ VQSHL qYf3, qYf3, #16-9
+ VQSHL qYf4, qYf4, #16-9
+ VQSHL qYf5, qYf5, #16-9
+ VQSHL qYf6, qYf6, #16-9
+ VQSHL qYf7, qYf7, #16-9
+
+ VSHR qYf0, qYf0, #16-9
+ VSHR qYf1, qYf1, #16-9
+ VSHR qYf2, qYf2, #16-9
+ VSHR qYf3, qYf3, #16-9
+ VSHR qYf4, qYf4, #16-9
+ VSHR qYf5, qYf5, #16-9
+ VSHR qYf6, qYf6, #16-9
+ VSHR qYf7, qYf7, #16-9
+ ENDIF
+
+ ;// Store output depending on the Stride size
+ IF "$stride"="s"
+ VST1 qYf0, [pDest @64], Stride
+ VST1 qYf1, [pDest @64], Stride
+ VST1 qYf2, [pDest @64], Stride
+ VST1 qYf3, [pDest @64], Stride
+ VST1 qYf4, [pDest @64], Stride
+ VST1 qYf5, [pDest @64], Stride
+ VST1 qYf6, [pDest @64], Stride
+ VST1 qYf7, [pDest @64]
+ ELSE
+ IF ("$outsize"="u8")
+ VST1 dYf0U8, [pDest @64], #8
+ VST1 dYf1U8, [pDest @64], #8
+ VST1 dYf2U8, [pDest @64], #8
+ VST1 dYf3U8, [pDest @64], #8
+ VST1 dYf4U8, [pDest @64], #8
+ VST1 dYf5U8, [pDest @64], #8
+ VST1 dYf6U8, [pDest @64], #8
+ VST1 dYf7U8, [pDest @64]
+ ELSE
+ ;// ("$outsize"="s9") or ("$outsize"="s16")
+ VST1 qYf0, [pDest @64], #16
+ VST1 qYf1, [pDest @64], #16
+ VST1 qYf2, [pDest @64], #16
+ VST1 qYf3, [pDest @64], #16
+ VST1 qYf4, [pDest @64], #16
+ VST1 qYf5, [pDest @64], #16
+ VST1 qYf6, [pDest @64], #16
+ VST1 qYf7, [pDest @64]
+ ENDIF
+
+ ENDIF
+
+
+
+ ENDIF ;// CortexA8
+
+
+
+ MEND
+
+ ;// Scale TWO input rows with TWO rows of 16 bit scale values
+ ;//
+ ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
+ ;// input (eight S16 values per row) with two rows of scale values.
+ ;// Each product is widened to 32 bits and then narrowed back to S16
+ ;// with a rounding, saturating right shift by 12. It also loads the
+ ;// scale values for the following two rows from pScale, but only when
+ ;// the $LastRow argument is 0.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $dAlo - Input D register with first four S16 values of row n
+ ;// $dAhi - Input D register with next four S16 values of row n
+ ;// $dBlo - Input D register with first four S16 values of row n+1
+ ;// $dBhi - Input D register with next four S16 values of row n+1
+ ;// pScale - Pointer to next row of scale values
+ ;// qT0lo - Temporary scratch register
+ ;// qT0hi - Temporary scratch register
+ ;// qT1lo - Temporary scratch register
+ ;// qT1hi - Temporary scratch register
+ ;// dScale1lo - Scale value of row n
+ ;// dScale1hi - Scale value of row n
+ ;// dScale2lo - Scale value of row n+1
+ ;// dScale2hi - Scale value of row n+1
+ ;//
+ ;// Input Flag
+ ;//
+ ;// $LastRow - Pass 0 to load the next two rows of scale values;
+ ;// any other value (1 is used for the final row pair) skips the load
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $dAlo - Scaled output values (first four S16 of row n)
+ ;// $dAhi - Scaled output values (next four S16 of row n)
+ ;// $dBlo - Scaled output values (first four S16 of row n+1)
+ ;// $dBhi - Scaled output values (next four S16 of row n+1)
+ ;// qScale1 - Scale values for row n+2 (only when $LastRow is 0)
+ ;// qScale2 - Scale values for row n+3 (only when $LastRow is 0)
+ ;// pScale - Advanced by two rows (32 bytes) when $LastRow is 0
+ ;//
+ MACRO
+ M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
+ VMULL qT0lo, $dAlo, dScale1lo
+ VMULL qT0hi, $dAhi, dScale1hi
+ VMULL qT1lo, $dBlo, dScale2lo
+ VMULL qT1hi, $dBhi, dScale2hi
+ IF "$LastRow"="0"
+ VLD1 qScale1, [pScale], #16 ;// Load scale for row n+2 (scales of row n already consumed above)
+ VLD1 qScale2, [pScale], #16 ;// Load scale for row n+3
+ ENDIF
+ VQRSHRN $dAlo, qT0lo, #12
+ VQRSHRN $dAhi, qT0hi, #12
+ VQRSHRN $dBlo, qT1lo, #12
+ VQRSHRN $dBhi, qT1hi, #12
+ MEND
+
+ ;// Scale 8x8 block input values with 16 bit scale values
+ ;//
+ ;// This macro is used to pre-scale a block of 8x8 input.
+ ;// It also performs the first stage transformations of the IDCT.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// dXjnlo - n th input D register with first four S16 values
+ ;// dXjnhi - n th input D register with next four S16 values
+ ;// qXjn - n th input Q register with eight S16 values
+ ;// pScale - Pointer to scale values
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// qXin - n th output Q register with eight S16 output values of 1st stage
+ ;// (qXi2..qXi7 are computed below; qXi0 and qXi1 are declared as
+ ;// aliases of qXj0 and qXj4, so the scaled rows 0 and 4 serve as
+ ;// i0 and i1 without a further instruction)
+ ;//
+ ;// Side effects:
+ ;//
+ ;// pScale - Advanced past the eight rows of scale values read
+ ;// pSrc - Overwritten with the address of armCOMM_IDCTCoef
+ ;// dCoefs - Loaded with the inverse DCT AAN constants
+ ;// qScale1, qScale2 and the qT0/qT1 scratch registers are clobbered
+ ;//
+ MACRO
+ M_IDCT_PRESCALE16
+ VLD1 qScale1, [pScale], #16 ;// Load Pre scale for row 0
+ VLD1 qScale2, [pScale], #16 ;// Load Pre scale for row 1
+ M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0 ;// Pre scale row 0 & 1
+ M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0 ;// Pre scale row 2 & 3
+ M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0 ;// Pre scale row 4 & 5
+ M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1 ;// Pre scale row 6 & 7, no further scale load
+ VHADD qXi5, qXj1, qXj7 ;// (j1+j7)/2
+ VSUB qXi6, qXj1, qXj7 ;// j1-j7
+ LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
+ VHADD qXi3, qXj2, qXj6 ;// (j2+j6)/2
+ VSUB qXi2, qXj2, qXj6 ;// j2-j6
+ VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
+ VHADD qXi7, qXj5, qXj3 ;// (j5+j3)/2
+ VSUB qXi4, qXj5, qXj3 ;// j5-j3
+ MEND
+
+
+ ;// Scale 8x8 block input values with 32 bit scale values
+ ;//
+ ;// This macro is used to pre-scale a block of 8x8 input.
+ ;// It also performs the first stage transformations of the IDCT.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// dXjnlo - n th input D register with first four S16 values
+ ;// dXjnhi - n th input D register with next four S16 values
+ ;// qXjn - n th input Q register with eight S16 values
+ ;// pScale - Pointer to 32bit scale values in Q23 format
+ ;// pSrc - Used to re-read input row 4 from 64 bytes below its
+ ;// current value; this assumes pSrc points just past the
+ ;// 8x8 S16 input block, as the post-incremented loads in
+ ;// M_IDCT leave it (confirm for any other caller)
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// dXinlo - n th output D register with first four S16 output values of 1st stage
+ ;// dXinhi - n th output D register with next four S16 output values of 1st stage
+ ;//
+ ;// Side effects:
+ ;//
+ ;// pScale - Advanced by four rows of scale values
+ ;// pTemp - Used to walk the scale rows 7, 6, 5 backwards
+ ;// pSrc - Overwritten with the address of armCOMM_IDCTCoef
+ ;// dCoefs - Loaded with the inverse DCT AAN constants
+ ;// Q0-Q6 are used as scratch through the aliases declared below
+ ;//
+ MACRO
+ M_IDCT_PRESCALE32
+ ;// Scale rows: only two register pairs exist, recycled row by row
+qScale0lo QN 0.S32
+qScale0hi QN 1.S32
+qScale1lo QN 2.S32
+qScale1hi QN 3.S32
+qScale2lo QN qScale1lo
+qScale2hi QN qScale1hi
+qScale3lo QN qScale1lo
+qScale3hi QN qScale1hi
+qScale4lo QN qScale1lo
+qScale4hi QN qScale1hi
+qScale5lo QN qScale0lo
+qScale5hi QN qScale0hi
+qScale6lo QN qScale0lo
+qScale6hi QN qScale0hi
+qScale7lo QN qScale0lo
+qScale7hi QN qScale0hi
+
+ ;// Widened (S32) source rows: two working register pairs.
+ ;// The second pair borrows Src4, the register that also holds
+ ;// input row 4 (qXj4), so row 4 is reloaded from memory later.
+qSrc0lo QN 4.S32
+qSrc0hi QN 5.S32
+qSrc1lo QN 6.S32
+qSrc1hi QN Src4.S32
+qSrc2lo QN qSrc0lo
+qSrc2hi QN qSrc0hi
+qSrc3lo QN qSrc0lo
+qSrc3hi QN qSrc0hi
+qSrc4lo QN qSrc0lo
+qSrc4hi QN qSrc0hi
+qSrc5lo QN qSrc1lo
+qSrc5hi QN qSrc1hi
+qSrc6lo QN qSrc1lo
+qSrc6hi QN qSrc1hi
+qSrc7lo QN qSrc0lo
+qSrc7hi QN qSrc0hi
+
+ ;// Butterfly results reuse the first scale register pair
+qRes17lo QN qScale0lo
+qRes17hi QN qScale0hi
+qRes26lo QN qScale0lo
+qRes26hi QN qScale0hi
+qRes53lo QN qScale0lo
+qRes53hi QN qScale0hi
+
+ ADD pTemp, pScale, #4*8*7 ;// Address of pScale[7]
+
+ ;// Row 0
+ VLD1 {qScale0lo, qScale0hi}, [pScale]!
+ VSHLL qSrc0lo, dXj0lo, #(12-1)
+ VSHLL qSrc0hi, dXj0hi, #(12-1)
+ VLD1 {qScale1lo, qScale1hi}, [pScale]!
+ VQRDMULH qSrc0lo, qScale0lo, qSrc0lo
+ VQRDMULH qSrc0hi, qScale0hi, qSrc0hi
+ VLD1 {qScale7lo, qScale7hi}, [pTemp]!
+ VSHLL qSrc1lo, dXj1lo, #(12-1)
+ VSHLL qSrc1hi, dXj1hi, #(12-1)
+ VMOVN dXi0lo, qSrc0lo ;// Output i0
+ VMOVN dXi0hi, qSrc0hi
+ VSHLL qSrc7lo, dXj7lo, #(12-1)
+ VSHLL qSrc7hi, dXj7hi, #(12-1)
+ SUB pTemp, pTemp, #((16*2)+(4*8*1)) ;// undo the post-increment and step back one row: pScale[6]
+ VQRDMULH qSrc1lo, qScale1lo, qSrc1lo
+ VQRDMULH qSrc1hi, qScale1hi, qSrc1hi
+ VQRDMULH qSrc7lo, qScale7lo, qSrc7lo
+ VQRDMULH qSrc7hi, qScale7hi, qSrc7hi
+ VLD1 {qScale2lo, qScale2hi}, [pScale]!
+
+ ;// Row 1 & 7
+ VHADD qRes17lo, qSrc1lo, qSrc7lo ;// (j1+j7)/2
+ VHADD qRes17hi, qSrc1hi, qSrc7hi ;// (j1+j7)/2
+ VMOVN dXi5lo, qRes17lo ;// Output i5
+ VMOVN dXi5hi, qRes17hi
+ VSUB qRes17lo, qSrc1lo, qSrc7lo ;// j1-j7
+ VSUB qRes17hi, qSrc1hi, qSrc7hi ;// j1-j7
+ VMOVN dXi6lo, qRes17lo ;// Output i6
+ VMOVN dXi6hi, qRes17hi
+ VSHLL qSrc2lo, dXj2lo, #(12-1)
+ VSHLL qSrc2hi, dXj2hi, #(12-1)
+ VLD1 {qScale6lo, qScale6hi}, [pTemp]!
+ VSHLL qSrc6lo, dXj6lo, #(12-1)
+ VSHLL qSrc6hi, dXj6hi, #(12-1)
+ SUB pTemp, pTemp, #((16*2)+(4*8*1)) ;// step back one more row: pScale[5]
+ VQRDMULH qSrc2lo, qScale2lo, qSrc2lo
+ VQRDMULH qSrc2hi, qScale2hi, qSrc2hi
+ VQRDMULH qSrc6lo, qScale6lo, qSrc6lo
+ VQRDMULH qSrc6hi, qScale6hi, qSrc6hi
+ VLD1 {qScale3lo, qScale3hi}, [pScale]!
+
+ ;// Row 2 & 6
+ VHADD qRes26lo, qSrc2lo, qSrc6lo ;// (j2+j6)/2
+ VHADD qRes26hi, qSrc2hi, qSrc6hi ;// (j2+j6)/2
+ VMOVN dXi3lo, qRes26lo ;// Output i3
+ VMOVN dXi3hi, qRes26hi
+ VSUB qRes26lo, qSrc2lo, qSrc6lo ;// j2-j6
+ VSUB qRes26hi, qSrc2hi, qSrc6hi ;// j2-j6
+ VMOVN dXi2lo, qRes26lo ;// Output i2
+ VMOVN dXi2hi, qRes26hi
+ VSHLL qSrc3lo, dXj3lo, #(12-1)
+ VSHLL qSrc3hi, dXj3hi, #(12-1)
+ VLD1 {qScale5lo, qScale5hi}, [pTemp]!
+ VSHLL qSrc5lo, dXj5lo, #(12-1)
+ VSHLL qSrc5hi, dXj5hi, #(12-1)
+ VQRDMULH qSrc3lo, qScale3lo, qSrc3lo
+ VQRDMULH qSrc3hi, qScale3hi, qSrc3hi
+ VQRDMULH qSrc5lo, qScale5lo, qSrc5lo
+ VQRDMULH qSrc5hi, qScale5hi, qSrc5hi
+
+ ;// Row 3 & 5
+ VHADD qRes53lo, qSrc5lo, qSrc3lo ;// (j5+j3)/2
+ VHADD qRes53hi, qSrc5hi, qSrc3hi ;// (j5+j3)/2
+ SUB pSrc, pSrc, #16*2*2 ;// back up four 16-byte rows, presumably to input row 4
+ VMOVN dXi7lo, qRes53lo ;// Output i7
+ VMOVN dXi7hi, qRes53hi
+ VSUB qRes53lo, qSrc5lo, qSrc3lo ;// j5-j3
+ VSUB qRes53hi, qSrc5hi, qSrc3hi ;// j5-j3
+ VLD1 qXj4, [pSrc @64] ;// reload j4: its register was used as scratch above
+ VMOVN dXi4lo, qRes53lo ;// Output i4
+ VMOVN dXi4hi, qRes53hi
+ VSHLL qSrc4lo, dXj4lo, #(12-1)
+ VSHLL qSrc4hi, dXj4hi, #(12-1)
+ VLD1 {qScale4lo, qScale4hi}, [pScale]
+ LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
+ VQRDMULH qSrc4lo, qScale4lo, qSrc4lo
+ VQRDMULH qSrc4hi, qScale4hi, qSrc4hi
+ VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
+ ;// Row 4
+ VMOVN dXi1lo, qSrc4lo ;// Output i1
+ VMOVN dXi1hi, qSrc4hi
+
+ MEND
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
new file mode 100755
index 0000000..51118fd
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
@@ -0,0 +1,27 @@
+/**
+ *
+ * File Name: armCOMM_MaskTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Mask Table to mask the end of array
+ *
+ * NOTE(review): this header uses OMX_U16 and OMX_U8 but includes nothing
+ * itself, so the header that declares those types has to be included
+ * before this one.
+ */
+
+
+
+#ifndef _ARMCOMM_MASKTABLE_H_
+#define _ARMCOMM_MASKTABLE_H_
+
+/* Number of entries in each of the mask tables declared below. */
+#define MaskTableSize 72
+
+/* Mask tables with 16-bit and 8-bit entries; only declared here, the
+ * definitions live in another translation unit. */
+
+extern const OMX_U16 armCOMM_qMaskTable16[MaskTableSize];
+extern const OMX_U8 armCOMM_qMaskTable8[MaskTableSize];
+
+#endif
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
new file mode 100755
index 0000000..41b3e1e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
@@ -0,0 +1,43 @@
+/* Guard the header against multiple inclusion. */
+#ifndef __ARM_COMM_VERSION_H__
+#define __ARM_COMM_VERSION_H__
+
+
+/* The following line should be in omxtypes.h but hasn't been approved by OpenMAX yet */
+#define OMX_VERSION 102
+
+/* We need to define these macros in order to convert a #define number into a #define string. */
+/* ARM_QUOTE stringizes its argument as written; ARM_INDIRECT adds one level of
+ * expansion so that a macro argument is replaced by its value before being quoted. */
+#define ARM_QUOTE(a) #a
+#define ARM_INDIRECT(A) ARM_QUOTE(A)
+
+/* Convert the OMX_VERSION number into a string that can be used, for example, to print it out. */
+#define ARM_VERSION_STRING ARM_INDIRECT(OMX_VERSION)
+
+
+/* Define this in order to turn on ARM version/release/build strings in each domain */
+/* It is defined unconditionally here, so the declarations below are always visible. */
+#define ARM_INCLUDE_VERSION_DESCRIPTIONS
+
+#ifdef ARM_INCLUDE_VERSION_DESCRIPTIONS
+ /* One version description string per domain; declared only, defined elsewhere. */
+ extern const char * const omxAC_VersionDescription;
+ extern const char * const omxIC_VersionDescription;
+ extern const char * const omxIP_VersionDescription;
+ extern const char * const omxSP_VersionDescription;
+ extern const char * const omxVC_VersionDescription;
+#endif /* ARM_INCLUDE_VERSION_DESCRIPTIONS */
+
+
+/* The following entries should be automatically updated by the release script */
+/* They are used in the ARM version strings defined for each domain. */
+
+/* The release tag associated with this release of the library. - used for source and object releases */
+#define OMX_ARM_RELEASE_TAG "r1p0-00bet0"
+
+/* The ARM architecture used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_ARCHITECTURE "ARM Architecture V7 with NEON"
+
+/* The ARM Toolchain used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_TOOLCHAIN "ARM RVCT 3.1"
+
+
+#endif /* __ARM_COMM_VERSION_H__ */
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
new file mode 100755
index 0000000..0956bd1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
@@ -0,0 +1,1157 @@
+;//
+;//
+;// File Name: armCOMM_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// ARM optimized OpenMAX common header file
+;//
+
+;// Protect against multiple inclusion
+ IF :LNOT::DEF:ARMCOMM_S_H
+ GBLL ARMCOMM_S_H
+
+ REQUIRE8 ;// Requires 8-byte stack alignment
+ PRESERVE8 ;// Preserves 8-byte stack alignment
+
+ GBLL ARM_ERRORCHECK
+ARM_ERRORCHECK SETL {FALSE}
+
+;// Globals
+
+ GBLS _RRegList ;// R saved register list
+ GBLS _DRegList ;// D saved register list
+ GBLS _Variant ;// Selected processor variant
+ GBLS _CPU ;// CPU name
+ GBLS _Struct ;// Structure name
+
+ GBLL _InFunc ;// Inside function assembly flag
+ GBLL _SwLong ;// Long switch flag
+
+ GBLA _RBytes ;// Number of register bytes on stack
+ GBLA _SBytes ;// Number of scratch bytes on stack
+ GBLA _ABytes ;// Stack offset of next argument
+ GBLA _Workspace ;// Stack offset of scratch workspace
+ GBLA _F ;// Function number
+ GBLA _StOff ;// Struct offset
+ GBLA _SwNum ;// Switch number
+ GBLS _32 ;// Suffix for 32 byte alignment
+ GBLS _16 ;// Suffix for 16 byte alignment
+
+_InFunc SETL {FALSE}
+_SBytes SETA 0
+_F SETA 0
+_SwNum SETA 0
+_32 SETS "ALIGN32"
+_16 SETS "ALIGN16"
+
+;/////////////////////////////////////////////////////////
+;// Override the tools settings of the CPU if the symbol
+;// OVERRIDECPU is defined, otherwise use the CPU defined
+;// by the assembler settings.
+;/////////////////////////////////////////////////////////
+
+ IF :DEF: OVERRIDECPU
+_CPU SETS OVERRIDECPU
+ ELSE
+_CPU SETS {CPU}
+ ENDIF
+
+
+
+;/////////////////////////////////////////////////////////
+;// Work out which code to build
+;/////////////////////////////////////////////////////////
+
+ ;// If ARM1136JS, CortexA8 or ARM_GENERIC is already defined when this
+ ;// header is assembled, report it and point the user at M_VARIANTS.
+ IF :DEF:ARM1136JS:LOR::DEF:CortexA8:LOR::DEF:ARM_GENERIC
+ INFO 1,"Please switch to using M_VARIANTS"
+ ENDIF
+
+ ;// Define and reset all officially recognised variants.
+ ;// Invokes _M_DEF_VARIANT once for every variant name known to
+ ;// this header, which leaves each variant flag set to FALSE.
+ MACRO
+ _M_DEF_VARIANTS
+ _M_DEF_VARIANT ARM926EJS
+ _M_DEF_VARIANT ARM1136JS
+ _M_DEF_VARIANT ARM1136JS_U
+ _M_DEF_VARIANT CortexA8
+ _M_DEF_VARIANT ARM7TDMI
+ MEND
+
+ ;// Declare a single variant. Two global logical variables are
+ ;// declared: one named after the variant, which is set to FALSE,
+ ;// and a marker named "_ok" followed by the variant name. The
+ ;// marker is what _M_VARIANT tests to recognise a valid name.
+ MACRO
+ _M_DEF_VARIANT $var
+ GBLL $var
+ GBLL _ok$var
+$var SETL {FALSE}
+ MEND
+
+
+ ;// Variant declaration
+ ;//
+ ;// Define a list of code variants supported by this
+ ;// source file. This macro then chooses the most
+ ;// appropriate variant to build for the currently configured
+ ;// core.
+ ;//
+ ;// Up to eight variant names may be given; unused positions are
+ ;// simply left empty. On exit exactly one variant flag is TRUE
+ ;// (the one named in _Variant), or none if no match was found,
+ ;// in which case the failure is reported through INFO.
+ ;//
+ MACRO
+ M_VARIANTS $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7
+ ;// Set to TRUE variants that are supported
+ _M_DEF_VARIANTS
+ _M_VARIANT $v0
+ _M_VARIANT $v1
+ _M_VARIANT $v2
+ _M_VARIANT $v3
+ _M_VARIANT $v4
+ _M_VARIANT $v5
+ _M_VARIANT $v6
+ _M_VARIANT $v7
+
+ ;// Look for first available variant to match a CPU
+ ;// _M_TRY cpu, variant fall back list
+ ;// Only the line whose cpu equals _CPU has any effect.
+ ;// NOTE(review): the ARM7TDMI line passes no variant list, so
+ ;// no variant can be selected for that CPU - confirm intended.
+_Variant SETS ""
+ _M_TRY ARM926EJ-S, ARM926EJS
+ _M_TRY ARM1176JZ-S, ARM1136JS
+ _M_TRY ARM1176JZF-S, ARM1136JS
+ _M_TRY ARM1156T2-S, ARM1136JS
+ _M_TRY ARM1156T2F-S, ARM1136JS
+ _M_TRY ARM1136J-S, ARM1136JS
+ _M_TRY ARM1136JF-S, ARM1136JS
+ _M_TRY MPCore, ARM1136JS
+ _M_TRY falcon-vfp, ARM1136JS
+ _M_TRY falcon-full-neon, CortexA8
+ _M_TRY Cortex-A8NoNeon, ARM1136JS
+ _M_TRY Cortex-A8, CortexA8, ARM1136JS
+ _M_TRY Cortex-R4, ARM1136JS
+ _M_TRY ARM7TDMI
+
+ ;// Select the correct variant
+ ;// All flags are reset first so that only the chosen variant
+ ;// ends up TRUE, not every variant the source file supports.
+ _M_DEF_VARIANTS
+ IF _Variant=""
+ INFO 1, "No match found for CPU '$_CPU'"
+ ELSE
+$_Variant SETL {TRUE}
+ ENDIF
+ MEND
+
+ ;// Register a variant as available
+ ;// An empty argument is ignored. A name that was not declared by
+ ;// _M_DEF_VARIANT (no "_ok" marker exists for it) is reported
+ ;// through INFO before its flag is set to TRUE.
+ MACRO
+ _M_VARIANT $var
+ IF "$var"=""
+ MEXIT
+ ENDIF
+ IF :LNOT::DEF:_ok$var
+ INFO 1, "Unrecognized variant '$var'"
+ ENDIF
+$var SETL {TRUE}
+ MEND
+
+ ;// For a given CPU, see if any of the variants supporting
+ ;// this CPU are available. The first available variant is
+ ;// chosen
+ ;// Does nothing unless the cpu argument equals _CPU. The variants
+ ;// are tried in argument order, so earlier ones take priority.
+ MACRO
+ _M_TRY $cpu, $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7
+ IF "$cpu"<>_CPU
+ MEXIT
+ ENDIF
+ _M_TRY1 $v0
+ _M_TRY1 $v1
+ _M_TRY1 $v2
+ _M_TRY1 $v3
+ _M_TRY1 $v4
+ _M_TRY1 $v5
+ _M_TRY1 $v6
+ _M_TRY1 $v7
+ ;// Check a match was found
+ IF _Variant=""
+ INFO 1, "No variant match found for CPU '$_CPU'"
+ ENDIF
+ MEND
+
+ ;// Try one candidate variant. An empty argument is ignored.
+ ;// The candidate is recorded in _Variant only if nothing has
+ ;// been chosen yet and the candidate's flag is TRUE, i.e. the
+ ;// source file registered it as available.
+ MACRO
+ _M_TRY1 $var
+ IF "$var"=""
+ MEXIT
+ ENDIF
+ IF (_Variant=""):LAND:$var
+_Variant SETS "$var"
+ ENDIF
+ MEND
+
+;////////////////////////////////////////////////////////
+;// Structure definition
+;////////////////////////////////////////////////////////
+
+ ;// Declare a structure of given name
+ ;// Records the name in _Struct and resets the running field
+ ;// offset _StOff to zero, ready for M_FIELD declarations.
+ MACRO
+ M_STRUCT $sname
+_Struct SETS "$sname"
+_StOff SETA 0
+ MEND
+
+ ;// Declare a structure field
+ ;// The field is called $sname_$fname
+ ;// $size = the size of each entry, must be power of 2
+ ;// $number = (if provided) the number of entries for an array
+ ;//
+ ;// The running offset _StOff is first rounded up to a multiple of
+ ;// the entry size, the field symbol is set to that offset, and
+ ;// _StOff then advances past the field (size times number for an
+ ;// array, size otherwise).
+ ;//
+ ;// Both arguments are parenthesised wherever they are used so that
+ ;// an expression argument (for example a number of 2+1) is taken
+ ;// as a whole and not split by operator precedence, matching the
+ ;// way the M_ALLOCn macros treat their size argument.
+ MACRO
+ M_FIELD $fname, $size, $number
+ IF (_StOff:AND:(($size)-1))!=0
+_StOff SETA _StOff + (($size) - (_StOff:AND:(($size)-1)))
+ ENDIF
+$_Struct._$fname EQU _StOff
+ IF "$number"<>""
+_StOff SETA _StOff + ($size)*($number)
+ ELSE
+_StOff SETA _StOff + ($size)
+ ENDIF
+ MEND
+
+
+ ;// Close the current structure: the total size reached so far is
+ ;// published as a symbol named "sizeof_" followed by the structure
+ ;// name, and the current structure name is cleared.
+ MACRO
+ M_ENDSTRUCT
+sizeof_$_Struct EQU _StOff
+_Struct SETS ""
+ MEND
+
+;//////////////////////////////////////////////////////////
+;// Switch and table macros
+;//////////////////////////////////////////////////////////
+
+ ;// Start a relative switch table with register to switch on
+ ;//
+ ;// $v = the register to switch on
+ ;// $s = if specified must be "L" to indicate long
+ ;// this allows a greater range to the case code
+ ;//
+ ;// Each use increments _SwNum. When {CONFIG} is 16 (Thumb) a
+ ;// table branch is emitted, halfword (TBH) for long and byte
+ ;// (TBB) otherwise, followed by a label marking the table base
+ ;// that M_CASE measures its offsets from. Otherwise (ARM) pc is
+ ;// advanced by four bytes per index so that execution lands on
+ ;// one of the branch instructions emitted by the M_CASE lines
+ ;// that follow.
+ MACRO
+ M_SWITCH $v, $s
+ ASSERT "$s"="":LOR:"$s"="L"
+_SwLong SETL {FALSE}
+ IF "$s"="L"
+_SwLong SETL {TRUE}
+ ENDIF
+_SwNum SETA _SwNum+1
+ IF {CONFIG}=16
+ ;// Thumb
+ IF _SwLong
+ TBH [pc, $v, LSL#1]
+ ELSE
+ TBB [pc, $v]
+ ENDIF
+_Switch$_SwNum
+ ELSE
+ ;// ARM
+ ADD pc, pc, $v, LSL #2
+ NOP
+ ENDIF
+ MEND
+
+ ;// Add a case to the switch statement
+ ;// In Thumb this emits one table entry holding half the distance
+ ;// from the table base label to the case label (a halfword for a
+ ;// long switch, a byte otherwise). In ARM it emits a branch to the
+ ;// case label. Cases must appear in index order after M_SWITCH.
+ MACRO
+ M_CASE $label
+ IF {CONFIG}=16
+ ;// Thumb
+ IF _SwLong
+ DCW ($label - _Switch$_SwNum)/2
+ ELSE
+ DCB ($label - _Switch$_SwNum)/2
+ ENDIF
+ ELSE
+ ;// ARM
+ B $label
+ ENDIF
+ MEND
+
+ ;// End of switch statement
+ ;// Realigns the location counter to a 2-byte boundary, which is
+ ;// needed after a Thumb table made of an odd number of bytes.
+ MACRO
+ M_ENDSWITCH
+ ALIGN 2
+ MEND
+
+
+;////////////////////////////////////////////////////////
+;// Data area allocation
+;////////////////////////////////////////////////////////
+
+ ;// Constant table allocator macro
+ ;//
+ ;// Switches to a read-only data area and defines the table label.
+ ;// $name is symbol through which the table can be accessed.
+ ;// $align is the optional alignment of the table, log2 of
+ ;// the byte alignment - $align=4 is 16 byte aligned
+ ;// Must be used outside a function (asserts that _InFunc is FALSE).
+ MACRO
+ M_TABLE $name, $align
+ ASSERT :LNOT:_InFunc
+ IF "$align"=""
+ AREA |.constdata|, READONLY, DATA
+ ELSE
+ ;// AREAs inherit the alignment of the first declaration.
+ ;// Therefore for each alignment size we must have an area
+ ;// of a different name.
+ AREA constdata_a$align, READONLY, DATA, ALIGN=$align
+
+ ;// We also force alignment in case we are tagging onto
+ ;// an already started area.
+ ALIGN (1<<$align)
+ ENDIF
+$name
+ MEND
+
+;/////////////////////////////////////////////////////
+;// Macros to allocate space on the stack
+;//
+;// These all assume that the stack is 8-byte aligned
+;// at entry to the function, which means that the
+;// 32-byte alignment macro needs to work in a
+;// bit more of a special way...
+;/////////////////////////////////////////////////////
+
+
+
+
+ ;// Allocate 1-byte aligned area of name
+ ;// $name size $size bytes.
+ ;//
+ ;// Must be used before M_START (asserts that _InFunc is false).
+ ;// Defines <name><_F> as the current stack allocation offset _SBytes
+ ;// and then grows _SBytes by $size.
+ MACRO
+ M_ALLOC1 $name, $size
+ ASSERT :LNOT:_InFunc
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+ ;// Allocate 2-byte aligned area of name
+ ;// $name size $size bytes.
+ ;//
+ ;// Must be used before M_START (asserts that _InFunc is false).
+ ;// _SBytes is first rounded up to a multiple of 2, then <name><_F> is
+ ;// defined as that offset and _SBytes grows by $size.
+ MACRO
+ M_ALLOC2 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// pad to the next 2-byte boundary if needed
+ IF (_SBytes:AND:1)!=0
+_SBytes SETA _SBytes + (2 - (_SBytes:AND:1))
+ ENDIF
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+ ;// Allocate 4-byte aligned area of name
+ ;// $name size $size bytes.
+ ;//
+ ;// Must be used before M_START (asserts that _InFunc is false).
+ ;// _SBytes is first rounded up to a multiple of 4, then <name><_F> is
+ ;// defined as that offset and _SBytes grows by $size.
+ MACRO
+ M_ALLOC4 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// pad to the next 4-byte boundary if needed
+ IF (_SBytes:AND:3)!=0
+_SBytes SETA _SBytes + (4 - (_SBytes:AND:3))
+ ENDIF
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+ ;// Allocate 8-byte aligned area of name
+ ;// $name size $size bytes.
+ ;//
+ ;// Must be used before M_START (asserts that _InFunc is false).
+ ;// _SBytes is first rounded up to a multiple of 8, then <name><_F> is
+ ;// defined as that offset and _SBytes grows by $size.
+ MACRO
+ M_ALLOC8 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// pad to the next 8-byte boundary if needed
+ IF (_SBytes:AND:7)!=0
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+
+ ;// Allocate an area of name $name that M_ADR16 can align to 16 bytes.
+ ;// ($size+8) bytes are reserved, starting at an 8-byte aligned offset.
+ ;// The recorded symbol is that offset plus 8: M_ADR16 adds it to sp and
+ ;// clears the low four address bits, so (with the 8-byte aligned stack
+ ;// assumed by these macros) the extra 8 bytes are what allow the
+ ;// pointer to be rounded down to a 16-byte boundary.
+
+ MACRO
+ M_ALLOC16 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// pad to the next 8-byte boundary if needed
+ IF (_SBytes:AND:7)!=0
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+$name$_F$_16 EQU (_SBytes + 8)
+_SBytes SETA _SBytes + ($size) + 8
+ MEND
+
+ ;// Allocate an area of name $name that M_ADR32 can align to 32 bytes.
+ ;// ($size+24) bytes are reserved, starting at an 8-byte aligned offset.
+ ;// The recorded symbol is that offset plus 24: M_ADR32 adds it to sp and
+ ;// clears the low five address bits, so (with the 8-byte aligned stack
+ ;// assumed by these macros) the extra 24 bytes are what allow the
+ ;// pointer to be rounded down to a 32-byte boundary.
+
+ MACRO
+ M_ALLOC32 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// pad to the next 8-byte boundary if needed
+ IF (_SBytes:AND:7)!=0
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+$name$_F$_32 EQU (_SBytes + 24)
+_SBytes SETA _SBytes + ($size) + 24
+ MEND
+
+
+
+
+ ;// Argument Declaration Macro
+ ;//
+ ;// Allocate an argument name $name
+ ;// size $size bytes
+ ;//
+ ;// Must be used after M_START (asserts that _InFunc is true).
+ ;// Defines <name><_F> as the current argument offset _ABytes, which
+ ;// M_START initialises to _SBytes + _RBytes - _Workspace, and then
+ ;// grows _ABytes by $size.
+ MACRO
+ M_ARG $name, $size
+ ASSERT _InFunc
+$name$_F EQU _ABytes
+_ABytes SETA _ABytes + ($size)
+ MEND
+
+;///////////////////////////////////////////////
+;// Macros to access stacked variables
+;///////////////////////////////////////////////
+
+ ;// Macro to perform a data processing operation
+ ;// with a constant second operand
+ ;//
+ ;// $op = instruction mnemonic (may already carry a condition)
+ ;// $rd = destination register
+ ;// $rn = first operand register
+ ;// $const = assembly-time constant expression
+ ;//
+ ;// Emits one instruction, or two when the constant does not fit in a
+ ;// single 8-bit field: the second instruction applies the remaining
+ ;// upper bits to $rd.
+ MACRO
+ _M_OPC $op,$rd,$rn,$const
+ LCLA _sh
+ LCLA _cst
+_sh SETA 0
+_cst SETA $const
+ ;// zero: a single instruction and done
+ IF _cst=0
+ $op $rd, $rn, #_cst
+ MEXIT
+ ENDIF
+ ;// strip trailing zero bit pairs, remembering the shift in _sh
+ WHILE (_cst:AND:3)=0
+_cst SETA _cst>>2
+_sh SETA _sh+2
+ WEND
+ ;// low 8 significant bits, moved back to their original position
+ $op $rd, $rn, #(_cst:AND:0x000000FF)<<_sh
+ ;// anything above those 8 bits goes into a second instruction
+ IF _cst>=256
+ $op $rd, $rd, #(_cst:AND:0xFFFFFF00)<<_sh
+ ENDIF
+ MEND
+
+ ;// Macro to perform a data access operation
+ ;// Such as LDR or STR
+ ;// The addressing mode is modified such that
+ ;// 1. If no address is given then the name is taken
+ ;// as a stack offset
+ ;// 2. If the addressing mode is not available for the
+ ;// state being assembled for (eg Thumb) then a suitable
+ ;// addressing mode is substituted.
+ ;//
+ ;// On Entry:
+ ;// $i = Instruction to perform (eg "LDRB")
+ ;// $a = Required byte alignment
+ ;// $r = Register(s) to transfer (eg "r1")
+ ;// $a0,$a1,$a2. Addressing mode and condition. One of:
+ ;// label {,cc}
+ ;// [base] {,,,cc}
+ ;// [base, offset]{!} {,,cc}
+ ;// [base, offset, shift]{!} {,cc}
+ ;// [base], offset {,,cc}
+ ;// [base], offset, shift {,cc}
+ ;// $a3 = condition code for every form that starts with "["
+ ;// (for the label form the condition is passed in $a1)
+ MACRO
+ _M_DATA $i,$a,$r,$a0,$a1,$a2,$a3
+ IF "$a0":LEFT:1="["
+ ;// explicit address given
+ IF "$a1"=""
+ ;// plain [base]
+ $i$a3 $r, $a0
+ ELSE
+ IF "$a0":RIGHT:1="]"
+ ;// first part is a closed [base], so this is post-indexed
+ IF "$a2"=""
+ _M_POSTIND $i$a3, "$r", $a0, $a1
+ ELSE
+ _M_POSTIND $i$a3, "$r", $a0, "$a1,$a2"
+ ENDIF
+ ELSE
+ ;// bracket still open, so this is offset / pre-indexed
+ IF "$a2"=""
+ _M_PREIND $i$a3, "$r", $a0, $a1
+ ELSE
+ _M_PREIND $i$a3, "$r", $a0, "$a1,$a2"
+ ENDIF
+ ENDIF
+ ENDIF
+ ELSE
+ ;// name of a stack variable or argument: access it relative to sp
+ LCLA _Offset
+_Offset SETA _Workspace + $a0$_F
+ ;// the offset must be a multiple of the access size
+ ASSERT (_Offset:AND:($a-1))=0
+ $i$a1 $r, [sp, #_Offset]
+ ENDIF
+ MEND
+
+ ;// Handle post indexed load/stores
+ ;// op reg, [base], offset
+ ;//
+ ;// $i = instruction (with any condition already appended)
+ ;// $r = register(s) to transfer
+ ;// $a0 = "[base]"
+ ;// $a1 = offset text (may include a shift)
+ ;//
+ ;// When {CONFIG}=16 the access is emitted without write-back and the
+ ;// base register is then adjusted by a separate ADD or SUB.
+ MACRO
+ _M_POSTIND $i,$r,$a0,$a1
+ LCLS _base
+ LCLS _offset
+ IF {CONFIG}=16 ;// Thumb
+_base SETS ("$a0":LEFT:(:LEN:"$a0"-1)):RIGHT:(:LEN:"$a0"-2) ;// remove []
+_offset SETS "$a1"
+ ;// drop a leading "+" sign from the offset text
+ IF _offset:LEFT:1="+"
+_offset SETS _offset:RIGHT:(:LEN:_offset-1)
+ ENDIF
+ $i $r, $a0
+ ;// a leading "-" sign turns the base update into a subtraction
+ IF _offset:LEFT:1="-"
+_offset SETS _offset:RIGHT:(:LEN:_offset-1)
+ SUB $_base, $_base, $_offset
+ ELSE
+ ADD $_base, $_base, $_offset
+ ENDIF
+ ELSE ;// ARM
+ $i $r, $a0, $a1
+ ENDIF
+ MEND
+
+ ;// Handle pre indexed load/store
+ ;// op reg, [base, offset]{!}
+ ;//
+ ;// $i = instruction (with any condition already appended)
+ ;// $r = register(s) to transfer
+ ;// $a0 = "[base" (opening bracket included)
+ ;// $a1 = "offset]" or "offset]!"
+ ;//
+ ;// When {CONFIG}=16 and write-back is requested, the access is emitted
+ ;// without write-back and the base register is then advanced by a
+ ;// separate ADD. Every other case is emitted exactly as given.
+ MACRO
+ _M_PREIND $i,$r,$a0,$a1
+ LCLS _base
+ LCLS _offset
+ IF ({CONFIG}=16):LAND:(("$a1":RIGHT:2)="]!")
+ ;// strip the leading "[" and the trailing "]!"
+_base SETS "$a0":RIGHT:(:LEN:("$a0")-1)
+_offset SETS "$a1":LEFT:(:LEN:("$a1")-2)
+ $i $r, [$_base, $_offset]
+ ADD $_base, $_base, $_offset
+ ELSE
+ $i $r, $a0, $a1
+ ENDIF
+ MEND
+
+ ;// Stack-aware load/store wrappers.
+ ;// Each macro below forwards to _M_DATA with the instruction name and
+ ;// the access size in bytes (1, 2, 4 or 8) used as the required
+ ;// alignment. $a0..$a3 are passed through unchanged: either the name
+ ;// of a stack variable/argument or an explicit addressing mode, as
+ ;// described at _M_DATA.
+
+ ;// Load unsigned byte from stack
+ MACRO
+ M_LDRB $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRB",1,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load signed byte from stack
+ MACRO
+ M_LDRSB $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRSB",1,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store byte to stack
+ MACRO
+ M_STRB $r,$a0,$a1,$a2,$a3
+ _M_DATA "STRB",1,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load unsigned half word from stack
+ MACRO
+ M_LDRH $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRH",2,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load signed half word from stack
+ MACRO
+ M_LDRSH $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRSH",2,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store half word to stack
+ MACRO
+ M_STRH $r,$a0,$a1,$a2,$a3
+ _M_DATA "STRH",2,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load word from stack
+ MACRO
+ M_LDR $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDR",4,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store word to stack
+ MACRO
+ M_STR $r,$a0,$a1,$a2,$a3
+ _M_DATA "STR",4,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load double word from stack ($r0,$r1 = the register pair)
+ MACRO
+ M_LDRD $r0,$r1,$a0,$a1,$a2,$a3
+ _M_DATA "LDRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store double word to stack ($r0,$r1 = the register pair)
+ MACRO
+ M_STRD $r0,$r1,$a0,$a1,$a2,$a3
+ _M_DATA "STRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Get absolute address of stack allocated location
+ ;//
+ ;// $a = destination register
+ ;// $b = name given to M_ALLOC1/2/4/8 or M_ARG
+ ;// $cc = optional condition code
+ ;//
+ ;// Computes sp + _Workspace + <name><_F> via _M_OPC, which may
+ ;// emit one or two ADD instructions.
+ MACRO
+ M_ADR $a, $b, $cc
+ _M_OPC ADD$cc, $a, sp, (_Workspace + $b$_F)
+ MEND
+
+ ;// Get absolute address of stack allocated location and align the address to 16 bytes
+ ;//
+ ;// $a = destination register
+ ;// $b = name given to M_ALLOC16
+ ;// $cc = optional condition code
+ ;//
+ ;// The symbol defined by M_ALLOC16 already includes the 8 bytes of
+ ;// slack, so rounding the sum down stays inside the reserved area.
+ MACRO
+ M_ADR16 $a, $b, $cc
+ _M_OPC ADD$cc, $a, sp, (_Workspace + $b$_F$_16)
+
+ ;// Now align the result to 16 bytes by clearing its low four bits
+ BIC$cc $a,$a,#0x0F
+ MEND
+
+ ;// Get absolute address of stack allocated location and align the address to 32 bytes
+ ;//
+ ;// $a = destination register
+ ;// $b = name given to M_ALLOC32
+ ;// $cc = optional condition code
+ ;//
+ ;// The symbol defined by M_ALLOC32 already includes the 24 bytes of
+ ;// slack, so rounding the sum down stays inside the reserved area.
+ MACRO
+ M_ADR32 $a, $b, $cc
+ _M_OPC ADD$cc, $a, sp, (_Workspace + $b$_F$_32)
+
+ ;// Now align the result to 32 bytes by clearing its low five bits
+ BIC$cc $a,$a,#0x1F
+ MEND
+
+;//////////////////////////////////////////////////////////
+;// Function header and footer macros
+;//////////////////////////////////////////////////////////
+
+ ;// Function Header Macro
+ ;// Generates the function prologue
+ ;// Note that functions should all be "stack-moves-once"
+ ;// The FNSTART and FNEND macros should be the only places
+ ;// where the stack moves.
+ ;//
+ ;// $name = function name
+ ;// $rreg = "" don't stack any registers
+ ;// "lr" stack "r4,lr" (r4 keeps the stacked size a
+ ;// multiple of 8 bytes)
+ ;// "rN" stack registers "r4-rN,lr"; when N is odd the list
+ ;// is extended to r(N+1), see _M_GETRREGLIST
+ ;// $dreg = "" don't stack any D registers
+ ;// "dN" stack registers "d8-dN"
+ ;//
+ ;// Note: ARM Architecture procedure call standard AAPCS
+ ;// states that r4-r11, sp, d8-d15 must be preserved by
+ ;// a compliant function.
+ ;//
+ ;// Any M_ALLOCn declarations for the function must come before
+ ;// M_START, any M_ARG declarations after it.
+ MACRO
+ M_START $name, $rreg, $dreg
+ ASSERT :LNOT:_InFunc
+ ASSERT "$name"!=""
+_InFunc SETL {TRUE}
+_RBytes SETA 0
+_Workspace SETA 0
+
+ ;// Create an area for the function
+ AREA |.text|, CODE
+ EXPORT $name
+$name FUNCTION
+
+ ;// Save R registers
+ _M_GETRREGLIST $rreg
+ IF _RRegList<>""
+ STMFD sp!, {$_RRegList, lr}
+ ENDIF
+
+ ;// Save D registers
+ _M_GETDREGLIST $dreg
+ IF _DRegList<>""
+ VSTMFD sp!, {$_DRegList}
+ ENDIF
+
+
+ ;// Ensure size claimed on stack is 8-byte aligned
+ IF ((_SBytes:AND:7)!=0)
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+
+ ;// Claim the space requested by the M_ALLOCn declarations
+ IF (_SBytes!=0)
+ _M_OPC SUB, sp, sp, _SBytes
+ ENDIF
+
+
+ ;// Offset from sp to the first argument declared with M_ARG:
+ ;// past the local area and the saved registers
+_ABytes SETA _SBytes + _RBytes - _Workspace
+
+
+ ;// Print function name if debug enabled
+ M_PRINTF "$name\n",
+ MEND
+
+ ;// Work out a list of R saved registers
+ ;//
+ ;// $rreg = "", "lr" or "r4".."r12"
+ ;//
+ ;// Sets _RRegList to the register range to stack (M_START and _M_RET
+ ;// add lr / pc to it) and adds the stacked size, including lr, to
+ ;// _RBytes. Limits are handled in pairs so that the stacked size is
+ ;// always a multiple of 8 bytes. Any other value is reported with INFO.
+ MACRO
+ _M_GETRREGLIST $rreg
+ IF "$rreg"=""
+_RRegList SETS ""
+ MEXIT
+ ENDIF
+ ;// r4 + lr = 8 bytes
+ IF "$rreg"="lr":LOR:"$rreg"="r4"
+_RRegList SETS "r4"
+_RBytes SETA _RBytes+8
+ MEXIT
+ ENDIF
+ ;// r4-r6 + lr = 16 bytes
+ IF "$rreg"="r5":LOR:"$rreg"="r6"
+_RRegList SETS "r4-r6"
+_RBytes SETA _RBytes+16
+ MEXIT
+ ENDIF
+ ;// r4-r8 + lr = 24 bytes
+ IF "$rreg"="r7":LOR:"$rreg"="r8"
+_RRegList SETS "r4-r8"
+_RBytes SETA _RBytes+24
+ MEXIT
+ ENDIF
+ ;// r4-r10 + lr = 32 bytes
+ IF "$rreg"="r9":LOR:"$rreg"="r10"
+_RRegList SETS "r4-r10"
+_RBytes SETA _RBytes+32
+ MEXIT
+ ENDIF
+ ;// r4-r12 + lr = 40 bytes
+ IF "$rreg"="r11":LOR:"$rreg"="r12"
+_RRegList SETS "r4-r12"
+_RBytes SETA _RBytes+40
+ MEXIT
+ ENDIF
+ INFO 1, "Unrecognized saved r register limit '$rreg'"
+ MEND
+
+ ;// Work out a list of D saved registers
+ ;//
+ ;// $dreg = "" or "d8".."d15"
+ ;//
+ ;// Sets _DRegList to the range d8-dN (empty when nothing is to be
+ ;// stacked) and adds 8 bytes per stacked D register to _RBytes.
+ ;// Any other value is reported with INFO.
+ MACRO
+ _M_GETDREGLIST $dreg
+ IF "$dreg"=""
+_DRegList SETS ""
+ MEXIT
+ ENDIF
+ IF "$dreg"="d8"
+_DRegList SETS "d8"
+_RBytes SETA _RBytes+8
+ MEXIT
+ ENDIF
+ IF "$dreg"="d9"
+_DRegList SETS "d8-d9"
+_RBytes SETA _RBytes+16
+ MEXIT
+ ENDIF
+ IF "$dreg"="d10"
+_DRegList SETS "d8-d10"
+_RBytes SETA _RBytes+24
+ MEXIT
+ ENDIF
+ IF "$dreg"="d11"
+_DRegList SETS "d8-d11"
+_RBytes SETA _RBytes+32
+ MEXIT
+ ENDIF
+ IF "$dreg"="d12"
+_DRegList SETS "d8-d12"
+_RBytes SETA _RBytes+40
+ MEXIT
+ ENDIF
+ IF "$dreg"="d13"
+_DRegList SETS "d8-d13"
+_RBytes SETA _RBytes+48
+ MEXIT
+ ENDIF
+ IF "$dreg"="d14"
+_DRegList SETS "d8-d14"
+_RBytes SETA _RBytes+56
+ MEXIT
+ ENDIF
+ IF "$dreg"="d15"
+_DRegList SETS "d8-d15"
+_RBytes SETA _RBytes+64
+ MEXIT
+ ENDIF
+ INFO 1, "Unrecognized saved d register limit '$dreg'"
+ MEND
+
+ ;// Produce function return instructions
+ ;//
+ ;// $cc = optional condition code applied to every instruction
+ ;//
+ ;// Undoes the register saves made by M_START, in reverse order: the
+ ;// D registers first, then the R registers. It does not release the
+ ;// local stack area; M_END does that before invoking this macro.
+ MACRO
+ _M_RET $cc
+ IF _DRegList<>""
+ VPOP$cc {$_DRegList}
+ ENDIF
+ IF _RRegList=""
+ ;// nothing was stacked, lr is still valid
+ BX$cc lr
+ ELSE
+ ;// reload the saved registers, loading the stacked lr into pc
+ LDM$cc.FD sp!, {$_RRegList, pc}
+ ENDIF
+ MEND
+
+ ;// Early Function Exit Macro
+ ;// $cc = condition to exit with
+ ;// (Example: M_EXIT EQ)
+ ;//
+ ;// Only valid between M_START and M_END (asserts that _InFunc is true).
+ MACRO
+ M_EXIT $cc
+ ASSERT _InFunc
+ IF _SBytes!=0
+ ;// Restore stack frame and exit: branch to the label that M_END
+ ;// places in front of the epilogue
+ B$cc _End$_F
+ ELSE
+ ;// Can return directly
+ _M_RET $cc
+ ENDIF
+ MEND
+
+ ;// Function Footer Macro
+ ;// Generates the function epilogue
+ ;//
+ ;// Must close a function opened with M_START (asserts that _InFunc is
+ ;// true). Places the _End<_F> label that M_EXIT branches to.
+ MACRO
+ M_END
+ ASSERT _InFunc
+_InFunc SETL {FALSE}
+_End$_F
+
+ ;// Restore the stack pointer to its original value on function entry
+ IF _SBytes!=0
+ _M_OPC ADD, sp, sp, _SBytes
+ ENDIF
+ ;// unconditional return
+ _M_RET
+ ENDFUNC
+
+ ;// Reset the global stack tracking variables back to their
+ ;// initial values, and increment the function count
+_SBytes SETA 0
+_F SETA _F+1
+ MEND
+
+
+;//==========================================================================
+;// Debug Macros
+;//==========================================================================
+
+ ;// Assembly-time debug switches, both off by default.
+ ;// DEBUG_ON enables the code generated by M_PRINTF.
+ GBLL DEBUG_ON
+DEBUG_ON SETL {FALSE}
+ ;// DEBUG_STALLS_ON enables the no-ops generated by M_STALL.
+ GBLL DEBUG_STALLS_ON
+DEBUG_STALLS_ON SETL {FALSE}
+
+ ;//==========================================================================
+ ;// Debug call to printf
+ ;// M_PRINTF $format, $val0, $val1, $val2
+ ;//
+ ;// $format = printf format string (quoted)
+ ;// $val0..$val2 = up to three optional registers to print; they must be
+ ;// supplied left to right without gaps
+ ;//
+ ;// Examples:
+ ;// M_PRINTF "x=%08x\n", r0
+ ;//
+ ;// This macro preserves the value of all registers including the
+ ;// flags. It expands to nothing unless DEBUG_ON is true.
+ ;//==========================================================================
+
+ MACRO
+ M_PRINTF $format, $val0, $val1, $val2
+ IF DEBUG_ON
+
+ IMPORT printf
+ LCLA nArgs
+nArgs SETA 0
+
+ ;// save registers so we don't corrupt them
+ STMFD sp!, {r0-r12, lr}
+
+ ;// Drop stack to give us some workspace
+ SUB sp, sp, #16
+
+ ;// Save registers we need to print to the stack
+ ;// (done before r0-r3 are overwritten, so the values may live there)
+ IF "$val2" <> ""
+ ASSERT "$val1" <> ""
+ STR $val2, [sp, #8]
+nArgs SETA nArgs+1
+ ENDIF
+ IF "$val1" <> ""
+ ASSERT "$val0" <> ""
+ STR $val1, [sp, #4]
+nArgs SETA nArgs+1
+ ENDIF
+ IF "$val0"<>""
+ STR $val0, [sp]
+nArgs SETA nArgs+1
+ ENDIF
+
+ ;// Now we are safe to corrupt registers
+ ;// r0 = format string, r1-r3 = the values saved above
+ ADR r0, %FT00
+ IF nArgs=1
+ LDR r1, [sp]
+ ENDIF
+ IF nArgs=2
+ LDMIA sp, {r1,r2}
+ ENDIF
+ IF nArgs=3
+ LDMIA sp, {r1,r2,r3}
+ ENDIF
+
+ ;// print the values
+ ;// the flags are parked in r4 across the call
+ MRS r4, cpsr ;// preserve flags
+ BL printf
+ MSR cpsr_f, r4 ;// restore flags
+ ;// jump over the inline string
+ B %FT01
+00 ;// string to print
+ DCB "$format", 0
+ ALIGN
+01 ;// Finished
+ ADD sp, sp, #16
+ ;// Restore registers
+ LDMFD sp!, {r0-r12,lr}
+
+ ENDIF ;// DEBUG_ON
+ MEND
+
+
+ ;// Stall Simulation Macro
+ ;// Inserts a given number of NOPs for the currently
+ ;// defined platform
+ ;//
+ ;// Takes up to six stall specifications and hands each one to
+ ;// _M_STALL_SUB, which defines their format.
+ ;// Expands to nothing unless DEBUG_STALLS_ON is true.
+ MACRO
+ M_STALL $plat1stall, $plat2stall, $plat3stall, $plat4stall, $plat5stall, $plat6stall
+ IF DEBUG_STALLS_ON
+ _M_STALL_SUB $plat1stall
+ _M_STALL_SUB $plat2stall
+ _M_STALL_SUB $plat3stall
+ _M_STALL_SUB $plat4stall
+ _M_STALL_SUB $plat5stall
+ _M_STALL_SUB $plat6stall
+ ENDIF
+ MEND
+
+ ;// Helper for M_STALL: process one stall specification.
+ ;//
+ ;// $platstall = "" (ignored) or a string made of a platform symbol
+ ;// name, one separator character and a single digit N.
+ ;// If the platform symbol is defined and true, N no-op instructions
+ ;// (MOV sp, sp) are emitted.
+ MACRO
+ _M_STALL_SUB $platstall
+ IF "$platstall"!=""
+ LCLA _pllen
+ LCLS _pl
+_pllen SETA :LEN:"$platstall"
+ ;// platform symbol = everything except the last two characters
+_pl SETS "$platstall":LEFT:(_pllen - 2)
+ IF :DEF:$_pl
+ IF $_pl
+ LCLS _st
+ LCLA _stnum
+ ;// stall count = the final character, read as a number
+_st SETS "$platstall":RIGHT:1
+_stnum SETA $_st
+ WHILE _stnum>0
+ MOV sp, sp
+_stnum SETA _stnum - 1
+ WEND
+ ENDIF
+ ENDIF
+ ENDIF
+ MEND
+
+
+
+;//==========================================================================
+;// Endian Invariance Macros
+;//
+;// The idea behind these macros is that if an array is
+;// loaded as words then the SMUL00 macro will multiply
+;// array elements 0 regardless of the endianness of the
+;// system. For little endian SMUL00=SMULBB, for big
+;// endian SMUL00=SMULTT and similarly for other packed operations.
+;//
+;//==========================================================================
+
+ ;// Little/big endian instruction selector, four operands.
+ ;//
+ ;// $comli = mnemonic to use when assembling little endian
+ ;// $combi = mnemonic to use when assembling big endian
+ ;// $a,$b,$c,$d = operands, passed through in this order
+ ;// $cc = optional condition code appended to the mnemonic
+ MACRO
+ LIBI4 $comli, $combi, $a, $b, $c, $d, $cc
+ IF {ENDIAN}="big"
+ $combi.$cc $a, $b, $c, $d
+ ELSE
+ $comli.$cc $a, $b, $c, $d
+ ENDIF
+ MEND
+
+ ;// Little/big endian instruction selector, three operands.
+ ;//
+ ;// $comli = mnemonic to use when assembling little endian
+ ;// $combi = mnemonic to use when assembling big endian
+ ;// $a,$b,$c = operands, passed through in this order
+ ;// $cc = optional condition code appended to the mnemonic
+ MACRO
+ LIBI3 $comli, $combi, $a, $b, $c, $cc
+ IF {ENDIAN}="big"
+ $combi.$cc $a, $b, $c
+ ELSE
+ $comli.$cc $a, $b, $c
+ ENDIF
+ MEND
+
+ ;// SMLAxy macros
+ ;//
+ ;// Endian-neutral forms of SMLA<x><y>. x selects the halfword of the
+ ;// first multiplicand ($b), y that of the second ($c):
+ ;// 0 / 1 = array element 0 / 1 of a pair loaded as a word
+ ;// (B / T when little endian, T / B when big endian)
+ ;// B / T = bottom / top halfword regardless of endianness
+ ;// Operands and the optional condition $cc are passed to LIBI4 unchanged.
+
+ MACRO
+ SMLA00 $a, $b, $c, $d, $cc
+ LIBI4 SMLABB, SMLATT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA01 $a, $b, $c, $d, $cc
+ LIBI4 SMLABT, SMLATB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA0B $a, $b, $c, $d, $cc
+ LIBI4 SMLABB, SMLATB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA0T $a, $b, $c, $d, $cc
+ LIBI4 SMLABT, SMLATT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA10 $a, $b, $c, $d, $cc
+ LIBI4 SMLATB, SMLABT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA11 $a, $b, $c, $d, $cc
+ LIBI4 SMLATT, SMLABB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA1B $a, $b, $c, $d, $cc
+ LIBI4 SMLATB, SMLABB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA1T $a, $b, $c, $d, $cc
+ LIBI4 SMLATT, SMLABT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAB0 $a, $b, $c, $d, $cc
+ LIBI4 SMLABB, SMLABT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAB1 $a, $b, $c, $d, $cc
+ LIBI4 SMLABT, SMLABB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAT0 $a, $b, $c, $d, $cc
+ LIBI4 SMLATB, SMLATT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAT1 $a, $b, $c, $d, $cc
+ LIBI4 SMLATT, SMLATB, $a, $b, $c, $d, $cc
+ MEND
+
+ ;// SMULxy macros
+ ;//
+ ;// Endian-neutral forms of SMUL<x><y>. x selects the halfword of the
+ ;// first multiplicand ($b), y that of the second ($c):
+ ;// 0 / 1 = array element 0 / 1 of a pair loaded as a word
+ ;// (B / T when little endian, T / B when big endian)
+ ;// B / T = bottom / top halfword regardless of endianness
+ ;// Operands and the optional condition $cc are passed to LIBI3 unchanged.
+
+ MACRO
+ SMUL00 $a, $b, $c, $cc
+ LIBI3 SMULBB, SMULTT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL01 $a, $b, $c, $cc
+ LIBI3 SMULBT, SMULTB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL0B $a, $b, $c, $cc
+ LIBI3 SMULBB, SMULTB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL0T $a, $b, $c, $cc
+ LIBI3 SMULBT, SMULTT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL10 $a, $b, $c, $cc
+ LIBI3 SMULTB, SMULBT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL11 $a, $b, $c, $cc
+ LIBI3 SMULTT, SMULBB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL1B $a, $b, $c, $cc
+ LIBI3 SMULTB, SMULBB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL1T $a, $b, $c, $cc
+ LIBI3 SMULTT, SMULBT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULB0 $a, $b, $c, $cc
+ LIBI3 SMULBB, SMULBT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULB1 $a, $b, $c, $cc
+ LIBI3 SMULBT, SMULBB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULT0 $a, $b, $c, $cc
+ LIBI3 SMULTB, SMULTT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULT1 $a, $b, $c, $cc
+ LIBI3 SMULTT, SMULTB, $a, $b, $c, $cc
+ MEND
+
+ ;// SMLAWx, SMULWx macros
+ ;//
+ ;// Endian-neutral forms of SMLAW<y> / SMULW<y>. The digit selects the
+ ;// halfword of the second multiplicand ($c): 0 / 1 = array element
+ ;// 0 / 1 of a pair loaded as a word (B / T when little endian,
+ ;// T / B when big endian).
+
+ MACRO
+ SMLAW0 $a, $b, $c, $d, $cc
+ LIBI4 SMLAWB, SMLAWT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAW1 $a, $b, $c, $d, $cc
+ LIBI4 SMLAWT, SMLAWB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMULW0 $a, $b, $c, $cc
+ LIBI3 SMULWB, SMULWT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULW1 $a, $b, $c, $cc
+ LIBI3 SMULWT, SMULWB, $a, $b, $c, $cc
+ MEND
+
+ ;// SMLALxy macros
+ ;//
+ ;// Endian-neutral forms of SMLAL<x><y>. x selects the halfword of the
+ ;// first multiplicand ($c), y that of the second ($d); $a and $b are
+ ;// the first two operands of the underlying instruction:
+ ;// 0 / 1 = array element 0 / 1 of a pair loaded as a word
+ ;// (B / T when little endian, T / B when big endian)
+ ;// B / T = bottom / top halfword regardless of endianness
+ ;// Operands and the optional condition $cc are passed to LIBI4 unchanged.
+
+
+ MACRO
+ SMLAL00 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBB, SMLALTT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL01 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBT, SMLALTB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL0B $a, $b, $c, $d, $cc
+ LIBI4 SMLALBB, SMLALTB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL0T $a, $b, $c, $d, $cc
+ LIBI4 SMLALBT, SMLALTT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL10 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTB, SMLALBT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL11 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTT, SMLALBB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL1B $a, $b, $c, $d, $cc
+ LIBI4 SMLALTB, SMLALBB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL1T $a, $b, $c, $d, $cc
+ LIBI4 SMLALTT, SMLALBT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALB0 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBB, SMLALBT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALB1 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBT, SMLALBB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALT0 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTB, SMLALTT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALT1 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTT, SMLALTB, $a, $b, $c, $d, $cc
+ MEND
+
+ ENDIF ;// ARMCOMM_S_H
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
new file mode 100755
index 0000000..7a68d14
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
@@ -0,0 +1,274 @@
+/*
+ *
+ * File Name: armOMX_ReleaseVersion.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * This file allows a version of the OMX DL libraries to be built where some or
+ * all of the function names can be given a user specified suffix.
+ *
+ * You might want to use it where:
+ *
+ * - you want to rename a function "out of the way" so that you could replace
+ * a function with a different version (the original version would still be
+ * in the library just with a different name - so you could debug the new
+ * version by comparing it to the output of the old)
+ *
+ * - you want to rename all the functions to versions with a suffix so that
+ * you can include two versions of the library and choose between functions
+ * at runtime.
+ *
+ * e.g. omxIPBM_Copy_U8_C1R could be renamed omxIPBM_Copy_U8_C1R_CortexA8
+ *
+ */
+
+
+#ifndef _armOMX_H_
+#define _armOMX_H_
+
+
+/* We need to define these two macros in order to expand and concatenate the names.
+ * OMXCAT2BAR pastes "omx", A and B into a single identifier. Arguments of the
+ * ## operator are not macro-expanded, so OMXCATBAR adds one level of
+ * indirection: its arguments (in particular the *_SUFFIX macros) are expanded
+ * first and only then pasted. */
+#define OMXCAT2BAR(A, B) omx ## A ## B
+#define OMXCATBAR(A, B) OMXCAT2BAR(A, B)
+
+/* Define the suffix to add to all functions - the default is no suffix.
+ * BARE_SUFFIX expands to nothing, so every name built with it is unchanged. */
+#define BARE_SUFFIX
+
+
+
+/* Define what happens to the bare suffix-less functions, down to the sub-domain accuracy.
+ * Each macro is the suffix used by the rename defines of the matching
+ * omx<DOMAIN>_ name prefix; all of them default to BARE_SUFFIX. */
+#define OMXACAAC_SUFFIX BARE_SUFFIX
+#define OMXACMP3_SUFFIX BARE_SUFFIX
+#define OMXICJP_SUFFIX BARE_SUFFIX
+#define OMXIPBM_SUFFIX BARE_SUFFIX
+#define OMXIPCS_SUFFIX BARE_SUFFIX
+#define OMXIPPP_SUFFIX BARE_SUFFIX
+#define OMXSP_SUFFIX BARE_SUFFIX
+#define OMXVCCOMM_SUFFIX BARE_SUFFIX
+#define OMXVCM4P10_SUFFIX BARE_SUFFIX
+#define OMXVCM4P2_SUFFIX BARE_SUFFIX
+
+
+
+
+/* Define what each bare, un-suffixed OpenMAX API function name is to be renamed to.
+ * Every define maps a name onto itself plus the suffix of its domain; with an
+ * empty suffix the name is unchanged. */
+/* omxACAAC_* names: suffix taken from OMXACAAC_SUFFIX */
+#define omxACAAC_DecodeChanPairElt OMXCATBAR(ACAAC_DecodeChanPairElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeDatStrElt OMXCATBAR(ACAAC_DecodeDatStrElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeFillElt OMXCATBAR(ACAAC_DecodeFillElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeIsStereo_S32 OMXCATBAR(ACAAC_DecodeIsStereo_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsPNS_S32_I OMXCATBAR(ACAAC_DecodeMsPNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsStereo_S32_I OMXCATBAR(ACAAC_DecodeMsStereo_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodePrgCfgElt OMXCATBAR(ACAAC_DecodePrgCfgElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeTNS_S32_I OMXCATBAR(ACAAC_DecodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DeinterleaveSpectrum_S32 OMXCATBAR(ACAAC_DeinterleaveSpectrum_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_EncodeTNS_S32_I OMXCATBAR(ACAAC_EncodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermPredict_S32 OMXCATBAR(ACAAC_LongTermPredict_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermReconstruct_S32_I OMXCATBAR(ACAAC_LongTermReconstruct_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTFwd_S32 OMXCATBAR(ACAAC_MDCTFwd_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTInv_S32_S16 OMXCATBAR(ACAAC_MDCTInv_S32_S16, OMXACAAC_SUFFIX)
+#define omxACAAC_NoiselessDecode OMXCATBAR(ACAAC_NoiselessDecode, OMXACAAC_SUFFIX)
+#define omxACAAC_QuantInv_S32_I OMXCATBAR(ACAAC_QuantInv_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADIFHeader OMXCATBAR(ACAAC_UnpackADIFHeader, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADTSFrameHeader OMXCATBAR(ACAAC_UnpackADTSFrameHeader, OMXACAAC_SUFFIX)
+
+
+/* omxACMP3_* names: suffix taken from OMXACMP3_SUFFIX */
+#define omxACMP3_HuffmanDecode_S32 OMXCATBAR(ACMP3_HuffmanDecode_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfb_S32 OMXCATBAR(ACMP3_HuffmanDecodeSfb_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfbMbp_S32 OMXCATBAR(ACMP3_HuffmanDecodeSfbMbp_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_MDCTInv_S32 OMXCATBAR(ACMP3_MDCTInv_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantize_S32_I OMXCATBAR(ACMP3_ReQuantize_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantizeSfb_S32_I OMXCATBAR(ACMP3_ReQuantizeSfb_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_SynthPQMF_S32_S16 OMXCATBAR(ACMP3_SynthPQMF_S32_S16, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackFrameHeader OMXCATBAR(ACMP3_UnpackFrameHeader, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackScaleFactors_S8 OMXCATBAR(ACMP3_UnpackScaleFactors_S8, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackSideInfo OMXCATBAR(ACMP3_UnpackSideInfo, OMXACMP3_SUFFIX)
+
+/* omxICJP_* names: suffix taken from OMXICJP_SUFFIX */
+#define omxICJP_CopyExpand_U8_C3 OMXCATBAR(ICJP_CopyExpand_U8_C3, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16 OMXCATBAR(ICJP_DCTFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16_I OMXCATBAR(ICJP_DCTFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16 OMXCATBAR(ICJP_DCTInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16_I OMXCATBAR(ICJP_DCTInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_Multiple_S16 OMXCATBAR(ICJP_DCTQuantFwd_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16 OMXCATBAR(ICJP_DCTQuantFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16_I OMXCATBAR(ICJP_DCTQuantFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwdTableInit OMXCATBAR(ICJP_DCTQuantFwdTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_Multiple_S16 OMXCATBAR(ICJP_DCTQuantInv_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16 OMXCATBAR(ICJP_DCTQuantInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16_I OMXCATBAR(ICJP_DCTQuantInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInvTableInit OMXCATBAR(ICJP_DCTQuantInvTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffman8x8_Direct_S16_C1 OMXCATBAR(ICJP_DecodeHuffman8x8_Direct_S16_C1, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecGetBufSize_U8 OMXCATBAR(ICJP_DecodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecInit_U8 OMXCATBAR(ICJP_DecodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffman8x8_Direct_S16_U1_C1 OMXCATBAR(ICJP_EncodeHuffman8x8_Direct_S16_U1_C1, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecGetBufSize_U8 OMXCATBAR(ICJP_EncodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecInit_U8 OMXCATBAR(ICJP_EncodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+
+/* omxIPBM_* names: suffix taken from OMXIPBM_SUFFIX */
+#define omxIPBM_AddC_U8_C1R_Sfs OMXCATBAR(IPBM_AddC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C1R OMXCATBAR(IPBM_Copy_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C3R OMXCATBAR(IPBM_Copy_U8_C3R, OMXIPBM_SUFFIX)
+#define omxIPBM_Mirror_U8_C1R OMXCATBAR(IPBM_Mirror_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_MulC_U8_C1R_Sfs OMXCATBAR(IPBM_MulC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+
+/* omxIPCS_* names: suffix taken from OMXIPCS_SUFFIX */
+#define omxIPCS_ColorTwistQ14_U8_C3R OMXCATBAR(IPCS_ColorTwistQ14_U8_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R OMXCATBAR(IPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R OMXCATBAR(IPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R OMXCATBAR(IPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R OMXCATBAR(IPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R OMXCATBAR(IPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R OMXCATBAR(IPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszCscRotBGR_U8_P3C3R OMXCATBAR(IPCS_YCbCr420RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszRot_U8_P3R OMXCATBAR(IPCS_YCbCr420RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565_U8_U16_P3C3R OMXCATBAR(IPCS_YCbCr420ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R OMXCATBAR(IPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R OMXCATBAR(IPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszCscRotBGR_U8_P3C3R OMXCATBAR(IPCS_YCbCr422RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R OMXCATBAR(IPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszRot_U8_P3R OMXCATBAR(IPCS_YCbCr422RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR565_U8_U16_C2C3R OMXCATBAR(IPCS_YCbYCr422ToBGR565_U8_U16_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R OMXCATBAR(IPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR888_U8_C2C3R OMXCATBAR(IPCS_YCbYCr422ToBGR888_U8_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+/* NOTE(review): the next define repeats the previous one verbatim. An
+ * identical redefinition is harmless, but check whether a different name
+ * was intended here. */
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R OMXCATBAR(IPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToYCbCr420Rotate_U8_P3R OMXCATBAR(IPCS_YCbCr422ToYCbCr420Rotate_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_C3R OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_P3C3R OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R OMXCATBAR(IPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR888_U8_C3R OMXCATBAR(IPCS_YCbCr444ToBGR888_U8_C3R, OMXIPCS_SUFFIX)
+
+/* omxIPPP_* names: suffix taken from OMXIPPP_SUFFIX */
+#define omxIPPP_Deblock_HorEdge_U8_I OMXCATBAR(IPPP_Deblock_HorEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_Deblock_VerEdge_U8_I OMXCATBAR(IPPP_Deblock_VerEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterFIR_U8_C1R OMXCATBAR(IPPP_FilterFIR_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterMedian_U8_C1R OMXCATBAR(IPPP_FilterMedian_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_GetCentralMoment_S64 OMXCATBAR(IPPP_GetCentralMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_GetSpatialMoment_S64 OMXCATBAR(IPPP_GetSpatialMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentGetStateSize OMXCATBAR(IPPP_MomentGetStateSize, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentInit OMXCATBAR(IPPP_MomentInit, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C1R OMXCATBAR(IPPP_Moments_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C3R OMXCATBAR(IPPP_Moments_U8_C3R, OMXIPPP_SUFFIX)
+
+/* omxSP_* names: suffix taken from OMXSP_SUFFIX */
+#define omxSP_BlockExp_S16 OMXCATBAR(SP_BlockExp_S16, OMXSP_SUFFIX)
+#define omxSP_BlockExp_S32 OMXCATBAR(SP_BlockExp_S32, OMXSP_SUFFIX)
+#define omxSP_Copy_S16 OMXCATBAR(SP_Copy_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16 OMXCATBAR(SP_DotProd_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16_Sfs OMXCATBAR(SP_DotProd_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC16_Sfs OMXCATBAR(SP_FFTFwd_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC32_Sfs OMXCATBAR(SP_FFTFwd_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S16S32_Sfs OMXCATBAR(SP_FFTFwd_RToCCS_S16S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S32_Sfs OMXCATBAR(SP_FFTFwd_RToCCS_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC16 OMXCATBAR(SP_FFTGetBufSize_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC32 OMXCATBAR(SP_FFTGetBufSize_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S16S32 OMXCATBAR(SP_FFTGetBufSize_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S32 OMXCATBAR(SP_FFTGetBufSize_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC16 OMXCATBAR(SP_FFTInit_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC32 OMXCATBAR(SP_FFTInit_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S16S32 OMXCATBAR(SP_FFTInit_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S32 OMXCATBAR(SP_FFTInit_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32_Sfs OMXCATBAR(SP_FFTInv_CCSToR_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32S16_Sfs OMXCATBAR(SP_FFTInv_CCSToR_S32S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC16_Sfs OMXCATBAR(SP_FFTInv_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC32_Sfs OMXCATBAR(SP_FFTInv_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32 OMXCATBAR(SP_FilterMedian_S32, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32_I OMXCATBAR(SP_FilterMedian_S32_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16 OMXCATBAR(SP_FIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_I OMXCATBAR(SP_FIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_ISfs OMXCATBAR(SP_FIR_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_Sfs OMXCATBAR(SP_FIR_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16 OMXCATBAR(SP_FIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_I OMXCATBAR(SP_FIROne_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_ISfs OMXCATBAR(SP_FIROne_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_Sfs OMXCATBAR(SP_FIROne_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16 OMXCATBAR(SP_IIR_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16_I OMXCATBAR(SP_IIR_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16 OMXCATBAR(SP_IIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16_I OMXCATBAR(SP_IIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16 OMXCATBAR(SP_IIROne_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16_I OMXCATBAR(SP_IIROne_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16 OMXCATBAR(SP_IIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16_I OMXCATBAR(SP_IIROne_Direct_S16_I, OMXSP_SUFFIX)
+
+/* omxVCCOMM_* names: suffix taken from OMXVCCOMM_SUFFIX */
+#define omxVCCOMM_Average_16x OMXCATBAR(VCCOMM_Average_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Average_8x OMXCATBAR(VCCOMM_Average_8x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock OMXCATBAR(VCCOMM_ComputeTextureErrorBlock, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock_SAD OMXCATBAR(VCCOMM_ComputeTextureErrorBlock_SAD, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy16x16 OMXCATBAR(VCCOMM_Copy16x16, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy8x8 OMXCATBAR(VCCOMM_Copy8x8, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ExpandFrame_I OMXCATBAR(VCCOMM_ExpandFrame_I, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_LimitMVToRect OMXCATBAR(VCCOMM_LimitMVToRect, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_16x OMXCATBAR(VCCOMM_SAD_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_8x OMXCATBAR(VCCOMM_SAD_8x, OMXVCCOMM_SUFFIX)
+/* omxVCM4P10_* names: each expands to OMXCATBAR(<name without the omx prefix>, OMXVCM4P10_SUFFIX); OMXCATBAR and the suffix are defined outside this excerpt. */
+#define omxVCM4P10_Average_4x OMXCATBAR(VCM4P10_Average_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Half OMXCATBAR(VCM4P10_BlockMatch_Half, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Integer OMXCATBAR(VCM4P10_BlockMatch_Integer, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Quarter OMXCATBAR(VCM4P10_BlockMatch_Quarter, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockChroma_I OMXCATBAR(VCM4P10_DeblockChroma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockLuma_I OMXCATBAR(VCM4P10_DeblockLuma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC OMXCATBAR(VCM4P10_DecodeChromaDcCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeCoeffsToPairCAVLC OMXCATBAR(VCM4P10_DecodeCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DequantTransformResidualFromPairAndAdd OMXCATBAR(VCM4P10_DequantTransformResidualFromPairAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_HorEdge_I OMXCATBAR(VCM4P10_FilterDeblockingChroma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_VerEdge_I OMXCATBAR(VCM4P10_FilterDeblockingChroma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_HorEdge_I OMXCATBAR(VCM4P10_FilterDeblockingLuma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_VerEdge_I OMXCATBAR(VCM4P10_FilterDeblockingLuma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_GetVLCInfo OMXCATBAR(VCM4P10_GetVLCInfo, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateChroma OMXCATBAR(VCM4P10_InterpolateChroma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfHor_Luma OMXCATBAR(VCM4P10_InterpolateHalfHor_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfVer_Luma OMXCATBAR(VCM4P10_InterpolateHalfVer_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateLuma OMXCATBAR(VCM4P10_InterpolateLuma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_ChromaDC OMXCATBAR(VCM4P10_InvTransformDequant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_LumaDC OMXCATBAR(VCM4P10_InvTransformDequant_LumaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformResidualAndAdd OMXCATBAR(VCM4P10_InvTransformResidualAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEGetBufSize OMXCATBAR(VCM4P10_MEGetBufSize, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEInit OMXCATBAR(VCM4P10_MEInit, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MotionEstimationMB OMXCATBAR(VCM4P10_MotionEstimationMB, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_16x16 OMXCATBAR(VCM4P10_PredictIntra_16x16, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_4x4 OMXCATBAR(VCM4P10_PredictIntra_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntraChroma_8x8 OMXCATBAR(VCM4P10_PredictIntraChroma_8x8, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SAD_4x OMXCATBAR(VCM4P10_SAD_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_16x OMXCATBAR(VCM4P10_SADQuar_16x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_4x OMXCATBAR(VCM4P10_SADQuar_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_8x OMXCATBAR(VCM4P10_SADQuar_8x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SATD_4x4 OMXCATBAR(VCM4P10_SATD_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SubAndTransformQDQResidual OMXCATBAR(VCM4P10_SubAndTransformQDQResidual, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantChromaDCFromPair OMXCATBAR(VCM4P10_TransformDequantChromaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantLumaDCFromPair OMXCATBAR(VCM4P10_TransformDequantLumaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_ChromaDC OMXCATBAR(VCM4P10_TransformQuant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_LumaDC OMXCATBAR(VCM4P10_TransformQuant_LumaDC, OMXVCM4P10_SUFFIX)
+/* omxVCM4P2_* names: each expands to OMXCATBAR(<name without the omx prefix>, OMXVCM4P2_SUFFIX); OMXCATBAR and the suffix are defined outside this excerpt. */
+#define omxVCM4P2_BlockMatch_Half_16x16 OMXCATBAR(VCM4P2_BlockMatch_Half_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Half_8x8 OMXCATBAR(VCM4P2_BlockMatch_Half_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_16x16 OMXCATBAR(VCM4P2_BlockMatch_Integer_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_8x8 OMXCATBAR(VCM4P2_BlockMatch_Integer_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DCT8x8blk OMXCATBAR(VCM4P2_DCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Inter OMXCATBAR(VCM4P2_DecodeBlockCoef_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Intra OMXCATBAR(VCM4P2_DecodeBlockCoef_Intra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodePadMV_PVOP OMXCATBAR(VCM4P2_DecodePadMV_PVOP, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_Inter OMXCATBAR(VCM4P2_DecodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraACVLC OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraDCVLC OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeMV OMXCATBAR(VCM4P2_EncodeMV, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_Inter OMXCATBAR(VCM4P2_EncodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraACVLC OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraDCVLC OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_FindMVpred OMXCATBAR(VCM4P2_FindMVpred, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_IDCT8x8blk OMXCATBAR(VCM4P2_IDCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MCReconBlock OMXCATBAR(VCM4P2_MCReconBlock, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEGetBufSize OMXCATBAR(VCM4P2_MEGetBufSize, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEInit OMXCATBAR(VCM4P2_MEInit, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MotionEstimationMB OMXCATBAR(VCM4P2_MotionEstimationMB, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_PredictReconCoefIntra OMXCATBAR(VCM4P2_PredictReconCoefIntra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInter_I OMXCATBAR(VCM4P2_QuantInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantIntra_I OMXCATBAR(VCM4P2_QuantIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvInter_I OMXCATBAR(VCM4P2_QuantInvInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvIntra_I OMXCATBAR(VCM4P2_QuantInvIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_inter OMXCATBAR(VCM4P2_TransRecBlockCoef_inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_intra OMXCATBAR(VCM4P2_TransRecBlockCoef_intra, OMXVCM4P2_SUFFIX)
+
+
+#endif /* _armOMX_h_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
new file mode 100755
index 0000000..8b295a6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
@@ -0,0 +1,252 @@
+/**
+ * File: omxtypes.h
+ * Brief: Defines basic Data types used in OpenMAX v1.0.2 header files.
+ *
+ * Copyright © 2005-2008 The Khronos Group Inc. All Rights Reserved.
+ *
+ * These materials are protected by copyright laws and contain material
+ * proprietary to the Khronos Group, Inc. You may use these materials
+ * for implementing Khronos specifications, without altering or removing
+ * any trademark, copyright or other notice from the specification.
+ *
+ * Khronos Group makes no, and expressly disclaims any, representations
+ * or warranties, express or implied, regarding these materials, including,
+ * without limitation, any implied warranties of merchantability or fitness
+ * for a particular purpose or non-infringement of any intellectual property.
+ * Khronos Group makes no, and expressly disclaims any, warranties, express
+ * or implied, regarding the correctness, accuracy, completeness, timeliness,
+ * and reliability of these materials.
+ *
+ * Under no circumstances will the Khronos Group, or any of its Promoters,
+ * Contributors or Members or their respective partners, officers, directors,
+ * employees, agents or representatives be liable for any damages, whether
+ * direct, indirect, special or consequential damages for lost revenues,
+ * lost profits, or otherwise, arising from or in connection with these
+ * materials.
+ *
+ * Khronos and OpenMAX are trademarks of the Khronos Group Inc.
+ *
+ */
+
+#ifndef _OMXTYPES_H_
+#define _OMXTYPES_H_
+
+#include <limits.h>
+/* Parameter annotations: all three expand to nothing; presumably they tag prototype arguments as input, output, or both. */
+#define OMX_IN
+#define OMX_OUT
+#define OMX_INOUT
+
+
+typedef enum {
+ /* Status codes (OMXResult): OMX_Sts_NoErr is 0 and every named error below is negative. */
+ /* Mandatory return codes - use cases are explicitly described for each function */
+ OMX_Sts_NoErr = 0, /* No error, the function completed successfully */
+ OMX_Sts_Err = -2, /* Unknown/unspecified error */
+ OMX_Sts_InvalidBitstreamValErr = -182, /* Invalid value detected during bitstream processing */
+ OMX_Sts_MemAllocErr = -9, /* Not enough memory allocated for the operation */
+ OMX_StsACAAC_GainCtrErr = -159, /* AAC: Unsupported gain control data detected */
+ OMX_StsACAAC_PrgNumErr = -167, /* AAC: Invalid number of elements for one program */
+ OMX_StsACAAC_CoefValErr = -163, /* AAC: Invalid quantized coefficient value */
+ OMX_StsACAAC_MaxSfbErr = -162, /* AAC: Invalid maxSfb value in relation to numSwb */
+ OMX_StsACAAC_PlsDataErr = -160, /* AAC: pulse escape sequence data error */
+
+ /* Optional return codes - use cases are explicitly described for each function */
+ OMX_Sts_BadArgErr = -5, /* Bad Arguments */
+
+ OMX_StsACAAC_TnsNumFiltErr = -157, /* AAC: Invalid number of TNS filters */
+ OMX_StsACAAC_TnsLenErr = -156, /* AAC: Invalid TNS region length */
+ OMX_StsACAAC_TnsOrderErr = -155, /* AAC: Invalid order of TNS filter */
+ OMX_StsACAAC_TnsCoefResErr = -154, /* AAC: Invalid bit-resolution for TNS filter coefficients */
+ OMX_StsACAAC_TnsCoefErr = -153, /* AAC: Invalid TNS filter coefficients */
+ OMX_StsACAAC_TnsDirectErr = -152, /* AAC: Invalid TNS filter direction */
+
+ OMX_StsICJP_JPEGMarkerErr = -183, /* JPEG marker encountered within an entropy-coded block; */
+ /* Huffman decoding operation terminated early. */
+ OMX_StsICJP_JPEGMarker = -181, /* JPEG marker encountered; Huffman decoding */
+ /* operation terminated early. */
+ OMX_StsIPPP_ContextMatchErr = -17, /* Context parameter does not match the operation */
+
+ OMX_StsSP_EvenMedianMaskSizeErr = -180, /* Even size of the Median Filter mask was replaced by the odd one */
+
+ OMX_Sts_MaximumEnumeration = INT_MAX /* Placeholder set to INT_MAX; intended to force the enum to the size of OMX_INT */
+
+ } OMXResult; /** Return value or error value returned from a function. Intended to have the same size as OMX_INT */
+
+
+/* OMX_U8: unsigned 8-bit integer; uses the first candidate type whose <limits.h> maximum equals 0xff, otherwise the build stops with #error */
+#if UCHAR_MAX == 0xff
+typedef unsigned char OMX_U8;
+#elif USHRT_MAX == 0xff
+typedef unsigned short int OMX_U8;
+#else
+#error OMX_U8 undefined
+#endif
+
+
+/* OMX_S8: signed 8-bit integer; uses the first candidate type whose <limits.h> maximum equals 0x7f, otherwise the build stops with #error */
+#if SCHAR_MAX == 0x7f
+typedef signed char OMX_S8;
+#elif SHRT_MAX == 0x7f
+typedef signed short int OMX_S8;
+#else
+#error OMX_S8 undefined
+#endif
+
+
+/* OMX_U16: unsigned 16-bit integer; uses the first candidate type whose <limits.h> maximum equals 0xffff, otherwise the build stops with #error */
+#if USHRT_MAX == 0xffff
+typedef unsigned short int OMX_U16;
+#elif UINT_MAX == 0xffff
+typedef unsigned int OMX_U16;
+#else
+#error OMX_U16 undefined
+#endif
+
+
+/* OMX_S16: signed 16-bit integer; uses the first candidate type whose <limits.h> maximum equals 0x7fff, otherwise the build stops with #error */
+#if SHRT_MAX == 0x7fff
+typedef signed short int OMX_S16;
+#elif INT_MAX == 0x7fff
+typedef signed int OMX_S16;
+#else
+#error OMX_S16 undefined
+#endif
+
+
+/* OMX_U32 */
+#if UINT_MAX == 0xffffffff
+typedef unsigned int OMX_U32;
+#elif LONG_MAX == 0xffffffff
+typedef unsigned long int OMX_U32;
+#else
+#error OMX_U32 undefined
+#endif
+
+
+/* OMX_S32: signed 32-bit integer; uses the first candidate type whose <limits.h> maximum equals 0x7fffffff, otherwise the build stops with #error */
+#if INT_MAX == 0x7fffffff
+typedef signed int OMX_S32;
+#elif LONG_MAX == 0x7fffffff
+typedef long signed int OMX_S32;
+#else
+#error OMX_S32 undefined
+#endif
+
+
+/* OMX_U64 & OMX_S64 */
+#if defined( _WIN32 ) || defined ( _WIN64 )
+ typedef __int64 OMX_S64; /** Signed 64-bit integer */
+ typedef unsigned __int64 OMX_U64; /** Unsigned 64-bit integer */
+ #define OMX_MIN_S64 (0x8000000000000000i64)
+ #define OMX_MIN_U64 (0x0000000000000000i64)
+ #define OMX_MAX_S64 (0x7FFFFFFFFFFFFFFFi64)
+ #define OMX_MAX_U64 (0xFFFFFFFFFFFFFFFFi64)
+#else
+ typedef long long OMX_S64; /** Signed 64-bit integer */
+ typedef unsigned long long OMX_U64; /** Unsigned 64-bit integer */
+ #define OMX_MIN_S64 (0x8000000000000000LL)
+ #define OMX_MIN_U64 (0x0000000000000000LL)
+ #define OMX_MAX_S64 (0x7FFFFFFFFFFFFFFFLL)
+ #define OMX_MAX_U64 (0xFFFFFFFFFFFFFFFFLL)
+#endif
+
+
+/* OMX_SC8: complex value stored as two OMX_S8 members, Re followed by Im */
+typedef struct
+{
+ OMX_S8 Re; /** Real part */
+ OMX_S8 Im; /** Imaginary part */
+
+} OMX_SC8; /** Signed 8-bit complex number */
+
+
+/* OMX_SC16: complex value stored as two OMX_S16 members, Re followed by Im */
+typedef struct
+{
+ OMX_S16 Re; /** Real part */
+ OMX_S16 Im; /** Imaginary part */
+
+} OMX_SC16; /** Signed 16-bit complex number */
+
+
+/* OMX_SC32: complex value stored as two OMX_S32 members, Re followed by Im */
+typedef struct
+{
+ OMX_S32 Re; /** Real part */
+ OMX_S32 Im; /** Imaginary part */
+
+} OMX_SC32; /** Signed 32-bit complex number */
+
+
+/* OMX_SC64: complex value stored as two OMX_S64 members, Re followed by Im */
+typedef struct
+{
+ OMX_S64 Re; /** Real part */
+ OMX_S64 Im; /** Imaginary part */
+
+} OMX_SC64; /** Signed 64-bit complex number */
+
+
+/* OMX_F32: alias for the native float type */
+typedef float OMX_F32; /** Single precision floating point, IEEE 754 */
+
+
+/* OMX_F64: alias for the native double type */
+typedef double OMX_F64; /** Double precision floating point, IEEE 754 */
+
+
+/* OMX_INT: alias for the native int type */
+typedef int OMX_INT; /** Signed integer corresponding to machine word length; its maximum value is INT_MAX */
+
+/* Minimum and maximum values of the 8-, 16- and 32-bit OMX integer types. */
+#define OMX_MIN_S8 (-128)
+#define OMX_MIN_U8 0
+#define OMX_MIN_S16 (-32768)
+#define OMX_MIN_U16 0
+#define OMX_MIN_S32 (-2147483647-1) /* spelled as -MAX-1; a bare 2147483648 would not fit in a 32-bit int */
+#define OMX_MIN_U32 0
+
+#define OMX_MAX_S8 (127)
+#define OMX_MAX_U8 (255)
+#define OMX_MAX_S16 (32767)
+#define OMX_MAX_U16 (0xFFFF)
+#define OMX_MAX_S32 (2147483647)
+#define OMX_MAX_U32 (0xFFFFFFFF)
+
+typedef void OMXVoid; /* alias for void */
+/* Fallback NULL, defined only when no earlier header has already provided one. */
+#ifndef NULL
+#define NULL ((void*)0)
+#endif
+
+/** Defines the geometric position and size of a rectangle:
+ * (x, y) are the coordinates of the top left corner
+ * of the rectangle, width is its extent in the x-direction
+ * and height its extent in the y-direction. All four members are OMX_INT. */
+typedef struct {
+ OMX_INT x; /** x-coordinate of top left corner of rectangle */
+ OMX_INT y; /** y-coordinate of top left corner of rectangle */
+ OMX_INT width; /** Width in the x-direction. */
+ OMX_INT height; /** Height in the y-direction. */
+}OMXRect;
+
+
+/** Defines the geometric position of a point as a pair of OMX_INT coordinates. */
+typedef struct
+{
+ OMX_INT x; /** x-coordinate */
+ OMX_INT y; /** y-coordinate */
+
+} OMXPoint;
+
+
+/** Defines the dimensions of a rectangle, or region of interest in an image, as OMX_INT width and height (no position). */
+typedef struct
+{
+ OMX_INT width; /** Width of the rectangle, in the x-direction */
+ OMX_INT height; /** Height of the rectangle, in the y-direction */
+
+} OMXSize;
+
+#endif /* _OMXTYPES_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
new file mode 100755
index 0000000..48703d1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
@@ -0,0 +1,77 @@
+;//
+;//
+;// File Name: omxtypes_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+;// Mandatory return codes (assembler EQU constants; success is 0, the errors are negative) - use cases are explicitly described for each function
+OMX_Sts_NoErr EQU 0 ;// No error, the function completed successfully
+OMX_Sts_Err EQU -2 ;// Unknown/unspecified error
+OMX_Sts_InvalidBitstreamValErr EQU -182 ;// Invalid value detected during bitstream processing
+OMX_Sts_MemAllocErr EQU -9 ;// Not enough memory allocated for the operation
+OMX_StsACAAC_GainCtrErr EQU -159 ;// AAC: Unsupported gain control data detected
+OMX_StsACAAC_PrgNumErr EQU -167 ;// AAC: Invalid number of elements for one program
+OMX_StsACAAC_CoefValErr EQU -163 ;// AAC: Invalid quantized coefficient value
+OMX_StsACAAC_MaxSfbErr EQU -162 ;// AAC: Invalid maxSfb value in relation to numSwb
+OMX_StsACAAC_PlsDataErr EQU -160 ;// AAC: pulse escape sequence data error
+
+;// Optional return codes - use cases are explicitly described for each function
+OMX_Sts_BadArgErr EQU -5 ;// Bad Arguments
+
+OMX_StsACAAC_TnsNumFiltErr EQU -157 ;// AAC: Invalid number of TNS filters
+OMX_StsACAAC_TnsLenErr EQU -156 ;// AAC: Invalid TNS region length
+OMX_StsACAAC_TnsOrderErr EQU -155 ;// AAC: Invalid order of TNS filter
+OMX_StsACAAC_TnsCoefResErr EQU -154 ;// AAC: Invalid bit-resolution for TNS filter coefficients
+OMX_StsACAAC_TnsCoefErr EQU -153 ;// AAC: Invalid TNS filter coefficients
+OMX_StsACAAC_TnsDirectErr EQU -152 ;// AAC: Invalid TNS filter direction
+
+OMX_StsICJP_JPEGMarkerErr EQU -183 ;// JPEG marker encountered within an entropy-coded block;
+ ;// Huffman decoding operation terminated early.
+OMX_StsICJP_JPEGMarker EQU -181 ;// JPEG marker encountered; Huffman decoding
+ ;// operation terminated early.
+OMX_StsIPPP_ContextMatchErr EQU -17 ;// Context parameter does not match the operation
+
+OMX_StsSP_EvenMedianMaskSizeErr EQU -180 ;// Even size of the Median Filter mask was replaced by the odd one
+
+OMX_Sts_MaximumEnumeration EQU 0x7FFFFFFF ;// Placeholder: largest positive signed 32-bit value
+
+
+;// Minimum and maximum values for 8-, 16- and 32-bit signed (S) and unsigned (U) integers
+OMX_MIN_S8 EQU (-128)
+OMX_MIN_U8 EQU 0
+OMX_MIN_S16 EQU (-32768)
+OMX_MIN_U16 EQU 0
+
+
+OMX_MIN_S32 EQU (-2147483647-1)
+OMX_MIN_U32 EQU 0
+
+OMX_MAX_S8 EQU (127)
+OMX_MAX_U8 EQU (255)
+OMX_MAX_S16 EQU (32767)
+OMX_MAX_U16 EQU (0xFFFF)
+OMX_MAX_S32 EQU (2147483647)
+OMX_MAX_U32 EQU (0xFFFFFFFF)
+;// Flag constants for the PredictIntra functions; each value has exactly one bit set
+OMX_VC_UPPER EQU 0x1 ;// Used by the PredictIntra functions
+OMX_VC_LEFT EQU 0x2 ;// Used by the PredictIntra functions
+OMX_VC_UPPER_RIGHT EQU 0x40 ;// Used by the PredictIntra functions
+;// Null pointer value for assembly sources
+NULL EQU 0
+
+;// Structures: layout of OMXPoint (fields x then y), declared with the M_STRUCT / M_FIELD / M_ENDSTRUCT macros
+;// NOTE(review): the macros presumably come from armCOMM_s.h, and the 4 after each field name looks like its size in bytes - confirm there
+ INCLUDE armCOMM_s.h
+
+ M_STRUCT OMXPoint
+ M_FIELD x, 4
+ M_FIELD y, 4
+ M_ENDSTRUCT
+
+ END