Initial-checkin for ON2 Software AVC/H264 decoder

o when neon is present, the performance gain of On2 AVC software decoder over PV software decoder is more than 30%. o In addition, it fixes some known PV software decoder issues like missing output frames o allow both pv and on2 software avc to be available for easy comparison o change output frames from 8 to 16 Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
author: James Dong <jdong@google.com> 2011-05-31 18:53:46 -0700
committer: James Dong <jdong@google.com> 2011-06-02 12:32:46 -0700
commit: 0c1bc742181ded4930842b46e9507372f0b1b963 (patch)
tree: c952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api
parent: 92a746c3b18d035189f596ce32847bf26247aaca (diff)
download: frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2
11 files changed, 4988 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
new file mode 100755
index 0000000..64c1958
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
@@ -0,0 +1,785 @@
+/**
+ * 
+ * File Name:  armCOMM.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *   
+ * File: armCOMM.h
+ * Brief: Declares Common APIs/Data Types used across OpenMAX API's
+ *
+ */
+ 
+  
+#ifndef _armCommon_H_
+#define _armCommon_H_
+
+#include "omxtypes.h"
+
+typedef struct
+{
+  OMX_F32 Re; /** Real part */
+  OMX_F32 Im; /** Imaginary part */	
+        
+} OMX_FC32; /** single precision floating point complex number */
+
+typedef struct
+{
+  OMX_F64 Re; /** Real part */
+  OMX_F64 Im; /** Imaginary part */	
+        
+} OMX_FC64; /** double precision floating point complex number */
+
+
+/* Used by both IP and IC domains for 8x8 JPEG blocks. */
+typedef OMX_S16 ARM_BLOCK8x8[64];
+
+
+#include "armOMX.h"
+
+#define  armPI (OMX_F64)(3.1415926535897932384626433832795)
+
+/***********************************************************************/
+
+/* Compiler extensions: diagnostic helpers; each one is a plain (void) cast unless ARM_DEBUG is defined */
+#ifdef ARM_DEBUG
+/* debug version: str is printed as data via "%s", never interpreted as a printf format string */
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#define armError(str) {printf("%s", (str)); printf("\n"); exit(-1);}
+#define armWarn(str) {printf("%s", (str)); printf("\n");}
+#define armIgnore(a) ((void)(a))
+#define armAssert(a) assert(a)
+#else 
+/* release version: every helper evaluates its argument once and discards the result */
+#define armError(str) ((void) (str))
+#define armWarn(str)  ((void) (str))
+#define armIgnore(a)  ((void) (a))
+#define armAssert(a)  ((void) (a))
+#endif /* ARM_DEBUG */
+
+/* Arithmetic operations (function-like macros: an argument may be evaluated twice, so avoid side effects) */
+
+#define armMin(a,b)             ( (a) > (b) ?  (b):(a) )
+#define armMax(a,b)             ( (a) > (b) ?  (a):(b) )
+#define armAbs(a)               ( (a) <  0  ? -(a):(a) )
+
+/* Alignment operation: advance Ptr by the byte gap to the next N-byte boundary, masked with N-1 (N: power of two) */
+
+#define armAlignToBytes(Ptr,N)      ((Ptr) + ( (((N)-(int)(Ptr))&((N)-1)) / sizeof(*(Ptr)) ))
+#define armAlignTo2Bytes(Ptr)       armAlignToBytes(Ptr,2)
+#define armAlignTo4Bytes(Ptr)       armAlignToBytes(Ptr,4)
+#define armAlignTo8Bytes(Ptr)       armAlignToBytes(Ptr,8)
+#define armAlignTo16Bytes(Ptr)      armAlignToBytes(Ptr,16)
+
+/* Error and Alignment check */
+
+#define armRetArgErrIf(condition, code)  if(condition) { return (code); }
+#define armRetDataErrIf(condition, code) if(condition) { return (code); }
+
+#ifndef ALIGNMENT_DOESNT_MATTER /* real checks: test the pointer's integer value modulo N */
+#define armIsByteAligned(Ptr,N)     ((((int)(Ptr)) % (N))==0)
+#define armNotByteAligned(Ptr,N)    ((((int)(Ptr)) % (N))!=0)
+#else /* ALIGNMENT_DOESNT_MATTER: every pointer is reported as aligned */
+#define armIsByteAligned(Ptr,N)     (1)
+#define armNotByteAligned(Ptr,N)    (0)
+#endif
+
+#define armIs2ByteAligned(Ptr)      armIsByteAligned(Ptr,2)
+#define armIs4ByteAligned(Ptr)      armIsByteAligned(Ptr,4)
+#define armIs8ByteAligned(Ptr)      armIsByteAligned(Ptr,8)
+#define armIs16ByteAligned(Ptr)     armIsByteAligned(Ptr,16)
+
+#define armNot2ByteAligned(Ptr)     armNotByteAligned(Ptr,2)
+#define armNot4ByteAligned(Ptr)     armNotByteAligned(Ptr,4)
+#define armNot8ByteAligned(Ptr)     armNotByteAligned(Ptr,8)
+#define armNot16ByteAligned(Ptr)    armNotByteAligned(Ptr,16)
+#define armNot32ByteAligned(Ptr)    armNotByteAligned(Ptr,32)
+
+/**
+ * Function: armRoundFloatToS16/armRoundFloatToS32/armRoundFloatToS64
+ *
+ * Description:
+ * Converts a double precision value into a short int/int after rounding
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32/OMX_S64 format
+ *
+ */
+
+OMX_S16 armRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armRoundFloatToS32 (OMX_F64 Value);
+OMX_S64 armRoundFloatToS64 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToS16/armSatRoundFloatToS32
+ *
+ * Description:
+ * Converts a double precision value into a short int/int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32 format
+ *
+ */
+
+OMX_S16 armSatRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armSatRoundFloatToS32 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToU16/armSatRoundFloatToU32
+ *
+ * Description:
+ * Converts a double precision value into an unsigned short int/int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U16/OMX_U32 format
+ *
+ */
+
+OMX_U16 armSatRoundFloatToU16 (OMX_F64 Value);
+OMX_U32 armSatRoundFloatToU32 (OMX_F64 Value);
+
+/**
+ * Function: armSignCheck
+ *
+ * Description:
+ * Checks the sign of a variable:
+ * returns 1 if it is Positive
+ * returns 0 if it is 0
+ * returns -1 if it is Negative 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	    var     Variable to be checked
+ *
+ * Return Value:
+ * OMX_INT --   returns 1 if it is Positive
+ *              returns 0 if it is 0
+ *              returns -1 if it is Negative 
+ */ 
+ 
+OMX_INT armSignCheck (OMX_S16 var);
+
+/**
+ * Function: armClip
+ *
+ * Description: Clips the input between MAX and MIN value
+ * 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] Min     lower bound
+ * [in] Max     upper bound
+ * [in] src     variable to be clipped
+ *
+ * Return Value:
+ * OMX_S32 --   returns clipped value
+ */ 
+ 
+OMX_S32 armClip (
+        OMX_INT min,
+        OMX_INT max, 
+        OMX_S32 src
+        );
+
+/**
+ * Function: armClip_F32
+ *
+ * Description: Clips the input between MAX and MIN value
+ * 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] Min     lower bound
+ * [in] Max     upper bound
+ * [in] src     variable to be clipped
+ *
+ * Return Value:
+ * OMX_F32 --   returns clipped value
+ */ 
+ 
+OMX_F32 armClip_F32 (
+        OMX_F32 min,
+        OMX_F32 max, 
+        OMX_F32 src
+        );
+
+/**
+ * Function: armShiftSat_F32
+ *
+ * Description: Divides a float value by 2^shift and 
+ * saturates it for unsigned value range for satBits.
+ * Second parameter is like "shifting" the corresponding 
+ * integer value. Takes care of rounding while clipping the final 
+ * value.
+ *
+ * Parameters:
+ * [in] v          Number to be operated upon
+ * [in] shift      Divides the input "v" by "2^shift"
+ * [in] satBits    Final range is [0, 2^satBits)
+ *
+ * Return Value:
+ * OMX_U32 --   returns "shifted" saturated value
+ */ 
+ 
+OMX_U32 armShiftSat_F32(
+        OMX_F32 v, 
+        OMX_INT shift, 
+        OMX_INT satBits
+        );
+
+/**
+ * Functions: armSwapElem
+ *
+ * Description:
+ * This function swaps two elements at the specified pointer locations.
+ * The size of each element could be anything as specified by <elemSize>
+ *
+ * Return Value:
+ * OMXResult -- Error status from the function
+ */
+OMXResult armSwapElem(OMX_U8 *pBuf1, OMX_U8 *pBuf2, OMX_INT elemSize);
+
+
+/**
+ * Function: armMedianOf3
+ *
+ * Description: Finds the median of three numbers
+ * 
+ * Remarks:
+ *
+ * Parameters:
+ * [in] fEntry     First entry
+ * [in] sEntry     second entry
+ * [in] tEntry     Third entry
+ *
+ * Return Value:
+ * OMX_S32 --   returns the median value
+ */ 
+ 
+OMX_S32 armMedianOf3 (
+    OMX_S32 fEntry,
+    OMX_S32 sEntry, 
+    OMX_S32 tEntry 
+    );
+
+/**
+ * Function: armLogSize
+ *
+ * Description: Finds the size of a positive value and returns the same
+ * 
+ * Remarks:
+ *
+ * Parameters:
+ * [in] value    Positive value
+ *
+ * Return Value:
+ * OMX_U8 --   returns the size of the positive value
+ */ 
+ 
+OMX_U8 armLogSize (
+    OMX_U16 value 
+    );    
+
+/***********************************************************************/
+                /* Saturating Arithmetic operations */
+
+/**
+ * Function :armSatAdd_S32()
+ *
+ * Description :
+ *   Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ *    
+ **/
+
+OMX_S32 armSatAdd_S32(
+                OMX_S32 Value1,
+                OMX_S32 Value2
+                );
+
+/**
+ * Function :armSatAdd_S64()
+ *
+ * Description :
+ *   Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ *    
+ **/
+
+OMX_S64 armSatAdd_S64(
+                OMX_S64 Value1,
+                OMX_S64 Value2
+                );
+
+/** Function :armSatSub_S32()
+ * 
+ * Description :
+ *     Returns the result of saturated subtraction of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ **/
+
+OMX_S32 armSatSub_S32(
+                    OMX_S32 Value1,
+                    OMX_S32 Value2
+                    );
+
+/**
+ * Function :armSatMac_S32()
+ *
+ * Description :
+ *     Returns the result of Multiplication of Value1 and Value2 and subsequent saturated
+ *     accumulation with Mac
+ *
+ * Parameters:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ * [in] Mac          Accumulator
+ *
+ * Return:
+ * [out]             Result of operation
+ **/
+
+OMX_S32 armSatMac_S32(
+                    OMX_S32 Mac,
+                    OMX_S16 Value1,
+                    OMX_S16 Value2
+                    );
+
+/**
+ * Function :armSatMac_S16S32_S32
+ *
+ * Description :
+ *   Returns the result of saturated MAC operation of the three inputs delayElem, filTap , mac
+ *
+ *   mac = mac + Saturate_in_32Bits(delayElem * filTap)
+ *
+ * Parameters:
+ * [in] delayElem    First 32 bit Operand
+ * [in] filTap       Second 16 bit Operand
+ * [in] mac          Result of MAC operation
+ *
+ * Return:
+ * [out]  mac        Result of operation
+ *    
+ **/
+ 
+OMX_S32 armSatMac_S16S32_S32(
+                        OMX_S32 mac, 
+                        OMX_S32 delayElem, 
+                        OMX_S16 filTap );
+
+/**
+ * Function :armSatRoundRightShift_S32_S16
+ *
+ * Description :
+ *   Returns the result of rounded right shift operation of input by the scalefactor
+ *
+ *   output = Saturate_in_16Bits( RightShift( Round(input), scaleFactor ) )
+ *
+ * Parameters:
+ * [in] input       The input to be operated on
+ * [in] scaleFactor The shift number
+ *
+ * Return:
+ * [out]            Result of operation
+ *    
+ **/
+
+
+OMX_S16 armSatRoundRightShift_S32_S16(
+                        OMX_S32 input, 
+                        OMX_INT scaleFactor);
+
+/**
+ * Function :armSatRoundLeftShift_S32()
+ *
+ * Description :
+ *     Returns the result of saturating left-shift operation on input
+ *     Or rounded Right shift if the input Shift is negative.
+ *
+ * Parameters:
+ * [in] Value        Operand
+ * [in] shift        Operand for shift operation
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+ 
+OMX_S32 armSatRoundLeftShift_S32(
+                        OMX_S32 Value,
+                        OMX_INT shift
+                        );
+
+/**
+ * Function :armSatRoundLeftShift_S64()
+ *
+ * Description :
+ *     Returns the result of saturating left-shift operation on input
+ *     Or rounded Right shift if the input Shift is negative.
+ *
+ * Parameters:
+ * [in] Value        Operand
+ * [in] shift        Operand for shift operation
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+ 
+OMX_S64 armSatRoundLeftShift_S64(
+                        OMX_S64 Value,
+                        OMX_INT shift
+                        );
+
+/**
+ * Function :armSatMulS16S32_S32()
+ *
+ * Description :
+ *     Returns the result of a S16 data type multiplied with an S32 data type
+ *     in a S32 container
+ *
+ * Parameters:
+ * [in] input1       Operand 1
+ * [in] input2       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+
+
+OMX_S32 armSatMulS16S32_S32(
+                    OMX_S16 input1,
+                    OMX_S32 input2);
+
+/**
+ * Function :armSatMulS32S32_S32()
+ *
+ * Description :
+ *     Returns the result of a S32 data type multiplied with an S32 data type
+ *     in a S32 container
+ *
+ * Parameters:
+ * [in] input1       Operand 1
+ * [in] input2       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+
+OMX_S32 armSatMulS32S32_S32(
+                    OMX_S32 input1,
+                    OMX_S32 input2);
+
+
+/**
+ * Function :armIntDivAwayFromZero()
+ *
+ * Description : Integer division with rounding to the nearest integer. 
+ *               Half-integer values are rounded away from zero
+ *               unless otherwise specified. For example 3//2 is rounded 
+ *               to 2, and -3//2 is rounded to -2.
+ *
+ * Parameters:
+ * [in] Num        Operand 1
+ * [in] Deno       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation Num//Deno
+ *    
+ **/
+
+OMX_S32 armIntDivAwayFromZero (OMX_S32 Num, OMX_S32 Deno);
+
+
+/***********************************************************************/
+/*
+ * Debugging macros
+ *
+ */
+
+
+/*
+ * Definition of output stream - change to stderr if necessary
+ */
+#define DEBUG_STREAM stdout
+
+/*
+ * Debug printf macros, one for each argument count.
+ * Add more if needed.
+ */
+#ifdef DEBUG_ON
+#include <stdio.h>
+
+#define DEBUG_PRINTF_0(a)                                               fprintf(DEBUG_STREAM, a)
+#define DEBUG_PRINTF_1(a, b)                                            fprintf(DEBUG_STREAM, a, b)
+#define DEBUG_PRINTF_2(a, b, c)                                         fprintf(DEBUG_STREAM, a, b, c)
+#define DEBUG_PRINTF_3(a, b, c, d)                                      fprintf(DEBUG_STREAM, a, b, c, d)
+#define DEBUG_PRINTF_4(a, b, c, d, e)                                   fprintf(DEBUG_STREAM, a, b, c, d, e)
+#define DEBUG_PRINTF_5(a, b, c, d, e, f)                                fprintf(DEBUG_STREAM, a, b, c, d, e, f)
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g)                             fprintf(DEBUG_STREAM, a, b, c, d, e, f, g)
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h)                          fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h)
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i)                       fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i)
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j)                    fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j)
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k)                fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k)
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l)             fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l)
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m)          fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m)
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n)       fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n)
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)    fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)
+#else /* DEBUG_ON */
+#define DEBUG_PRINTF_0(a)                                  
+#define DEBUG_PRINTF_1(a, b)                               
+#define DEBUG_PRINTF_2(a, b, c)                            
+#define DEBUG_PRINTF_3(a, b, c, d)                         
+#define DEBUG_PRINTF_4(a, b, c, d, e)                      
+#define DEBUG_PRINTF_5(a, b, c, d, e, f)                   
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g)                
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h)             
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i)          
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j)       
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k)    
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l)             
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m)          
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)   
+#endif /* DEBUG_ON */
+
+
+/*
+ * Domain and sub domain definitions
+ *
+ * In order to turn on debug for an entire domain or sub-domain
+ * at compile time, one of the DEBUG_DOMAIN_* below may be defined,
+ * which will activate debug in all of the defines it contains.
+ */
+
+#ifdef DEBUG_DOMAIN_AC
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT_MPEG4
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT
+#define DEBUG_OMXACAAC_DECODEDATSTRELT
+#define DEBUG_OMXACAAC_DECODEFILLELT
+#define DEBUG_OMXACAAC_DECODEISSTEREO_S32
+#define DEBUG_OMXACAAC_DECODEMSPNS_S32
+#define DEBUG_OMXACAAC_DECODEMSSTEREO_S32_I
+#define DEBUG_OMXACAAC_DECODEPRGCFGELT
+#define DEBUG_OMXACAAC_DECODETNS_S32_I
+#define DEBUG_OMXACAAC_DEINTERLEAVESPECTRUM_S32
+#define DEBUG_OMXACAAC_ENCODETNS_S32_I
+#define DEBUG_OMXACAAC_LONGTERMPREDICT_S32
+#define DEBUG_OMXACAAC_LONGTERMRECONSTRUCT_S32
+#define DEBUG_OMXACAAC_MDCTFWD_S32
+#define DEBUG_OMXACAAC_MDCTINV_S32_S16
+#define DEBUG_OMXACAAC_NOISELESSDECODE
+#define DEBUG_OMXACAAC_QUANTINV_S32_I
+#define DEBUG_OMXACAAC_UNPACKADIFHEADER
+#define DEBUG_OMXACAAC_UNPACKADTSFRAMEHEADER
+#define DEBUG_OMXACMP3_HUFFMANDECODESFBMBP_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODESFB_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODE_S32
+#define DEBUG_OMXACMP3_MDCTINV_S32
+#define DEBUG_OMXACMP3_REQUANTIZESFB_S32_I
+#define DEBUG_OMXACMP3_REQUANTIZE_S32_I
+#define DEBUG_OMXACMP3_SYNTHPQMF_S32_S16
+#define DEBUG_OMXACMP3_UNPACKFRAMEHEADER
+#define DEBUG_OMXACMP3_UNPACKSCALEFACTORS_S8
+#define DEBUG_OMXACMP3_UNPACKSIDEINFO
+#endif /* DEBUG_DOMAIN_AC */
+
+
+#ifdef DEBUG_DOMAIN_VC
+#define DEBUG_OMXVCM4P10_AVERAGE_16X
+#define DEBUG_OMXVCM4P10_AVERAGE_4X
+#define DEBUG_OMXVCM4P10_AVERAGE_8X
+#define DEBUG_OMXVCM4P10_DEBLOCKCHROMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DEBLOCKLUMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DECODECHROMADCCOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DECODECOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DEQUANTTRANSFORMACFROMPAIR_U8_S16_C1_DLX
+#define DEBUG_OMXVCM4P10_EXPANDFRAME
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_PREDICTINTRACHROMA8X8_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_16X16_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_4X4_U8_C1R
+#define DEBUG_OMXVCM4P10_SADQUAR_16X
+#define DEBUG_OMXVCM4P10_SADQUAR_4X
+#define DEBUG_OMXVCM4P10_SADQUAR_8X
+#define DEBUG_OMXVCM4P10_SAD_16X
+#define DEBUG_OMXVCM4P10_SAD_4X
+#define DEBUG_OMXVCM4P10_SAD_8X
+#define DEBUG_OMXVCM4P10_SATD_4X4
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTCHROMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTLUMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_CHROMADC
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_LUMADC
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_8X8
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_8X8
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_SAD_U8_S16
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_U8_S16
+#define DEBUG_OMXVCM4P2_DCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_DECODEBLOCKCOEF_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEPADMV_PVOP
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEMV_U8_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_FINDMVPRED
+#define DEBUG_OMXVCM4P2_IDCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_LIMITMVTORECT
+#define DEBUG_OMXVCM4P2_MOTIONESTIMATIONMB
+#define DEBUG_OMXVCM4P2_PADMBGRAY_U8
+#define DEBUG_OMXVCM4P2_PADMBHORIZONTAL_U8
+#define DEBUG_OMXVCM4P2_PADMBVERTICAL_U8
+#define DEBUG_OMXVCM4P2_PADMV
+#define DEBUG_OMXVCM4P2_QUANTINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINTRA_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTRA_S16_I
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTER
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTRA
+#endif /* DEBUG_DOMAIN_VC */
+
+
+#ifdef DEBUG_DOMAIN_IC
+/* To be filled in */
+#endif /* DEBUG_DOMAIN_IC */
+
+
+#ifdef DEBUG_DOMAIN_SP
+#define DEBUG_OMXACSP_DOTPROD_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S32
+#define DEBUG_OMXACSP_COPY_S16
+#define DEBUG_OMXACSP_DOTPROD_S16
+#define DEBUG_OMXACSP_DOTPROD_S16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S16S32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S32_SFS
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC16
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S16_S32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S32
+#define DEBUG_OMXACSP_FFTINIT_C_SC16
+#define DEBUG_OMXACSP_FFTINIT_C_SC32
+#define DEBUG_OMXACSP_FFTINIT_R_S16_S32
+#define DEBUG_OMXACSP_FFTINIT_R_S32
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32S16_SFS
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32_I
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIR_DIRECT_S16
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_DIRECT_S16
+#endif /* DEBUG_DOMAIN_SP */
+
+
+#ifdef DEBUG_DOMAIN_IP
+#define DEBUG_OMXIPBM_ADDC_U8_C1R_SFS
+#define DEBUG_OMXIPBM_COPY_U8_C1R
+#define DEBUG_OMXIPBM_COPY_U8_C3R
+#define DEBUG_OMXIPBM_MIRROR_U8_C1R
+#define DEBUG_OMXIPBM_MULC_U8_C1R_SFS
+#define DEBUG_OMXIPCS_COLORTWISTQ14_U8_C3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR420LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR422LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR444LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR420LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR422LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR444LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_YCBCR420RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422RSZCSCROTRGB_U8_C2R
+#define DEBUG_OMXIPCS_YCBCR422RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565_U8_U16_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB_U8_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_C2P3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR444TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR444TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB_U8_C3R
+#define DEBUG_OMXIPPP_GETCENTRALMOMENT_S64
+#define DEBUG_OMXIPPP_GETSPATIALMOMENT_S64
+#define DEBUG_OMXIPPP_MOMENTGETSTATESIZE_S64
+#define DEBUG_OMXIPPP_MOMENTINIT_S64
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C1R
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C3R
+#endif /* DEBUG_DOMAIN_IP */
+
+
+#endif /* _armCommon_H_ */
+
+/*End of File*/
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
new file mode 100755
index 0000000..c738f72
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
@@ -0,0 +1,670 @@
+;//
+;// 
+;// File Name:  armCOMM_BitDec_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;// 
+;// OpenMAX optimized bitstream decode module
+;//
+;// You must include armCOMM_s.h before including this file
+;//
+;// This module provides macros to perform assembly optimized fixed and
+;// variable length decoding from a read-only bitstream. The variable
+;// length decode modules take as input a pointer to a table of 16-bit
+;// entries of the following format.
+;//
+;// VLD Table Entry format
+;//
+;//        15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+;//       +------------------------------------------------+
+;//       |  Len   |               Symbol              | 1 |
+;//       +------------------------------------------------+
+;//       |                Offset                      | 0 |
+;//       +------------------------------------------------+
+;//
+;// If the table entry is a leaf entry then bit 0 set:
+;//    Len    = Number of bits overread (0 to 7)
+;//    Symbol = Symbol payload (unsigned 12 bits)
+;//
+;// If the table entry is an internal node then bit 0 is clear:
+;//    Offset = Number of (16-bit) half words from the table
+;//             start to the next table node
+;//
+;// The table is accessed by successive lookups on the
+;// next Step bits of the input bitstream until a leaf node
+;// is obtained. The Step sizes are supplied to the VLD macro.
+;//
+;// USAGE:
+;//
+;// To use any of the macros in this package, first call, in this order:
+;//
+;//    M_BD_INIT0 ppBitStream, pBitOffset, pBitStream, RBitBuffer, RBitCount
+;//    M_BD_INIT1 T1, T2, T3   then   M_BD_INIT2 T1, T2, T3
+;// This caches the current bitstream position and next available
+;// bits in registers pBitStream, RBitBuffer, RBitCount. These registers
+;// are reserved for use by the bitstream decode package until you
+;// call M_BD_FINI.
+;//
+;// Next call the following macro(s) as many times as you need:
+;//
+;//    M_BD_LOOK8       - Look ahead constant 1<=N<=8  bits into the bitstream
+;//    M_BD_LOOK16      - Look ahead constant 1<=N<=16 bits into the bitstream
+;//    M_BD_READ8       - Read constant 1<=N<=8  bits from the bitstream
+;//    M_BD_READ16      - Read constant 1<=N<=16 bits from the bitstream
+;//    M_BD_VREAD8      - Read variable 1<=N<=8  bits from the bitstream
+;//    M_BD_VREAD16     - Read variable 1<=N<=16 bits from the bitstream
+;//    M_BD_VLD         - Perform variable length decode using lookup table
+;//
+;// Finally call the macro:
+;//
+;//    M_BD_FINI ppBitStream, pBitOffset
+;//
+;// This writes the bitstream state back to memory.
+;//
+;// The three bitstream cache register names are assigned to the following global
+;// variables:
+;//
+
+        GBLS    pBitStream  ;// Register name for pBitStream (assigned by M_BD_INIT0)
+        GBLS    BitBuffer   ;// Register name for BitBuffer (assigned by M_BD_INIT0)
+        GBLS    BitCount    ;// Register name for BitCount (assigned by M_BD_INIT0)
+   
+;//        
+;// These register variables must have a certain defined state on entry to every bitstream
+;// macro (except M_BD_INIT) and on exit from every bitstream macro (except M_BD_FINI).
+;// The state may depend on implementation.
+;//
+;// For the default (ARM11) implementation the following hold:
+;//    pBitStream - points to the first byte not held in the BitBuffer
+;//    BitBuffer  - is a cache of (4 bytes) 32 bits, bit 31 the first bit
+;//    BitCount   - is offset (from the top bit) to the next unused bitstream bit
+;//    0<=BitCount<=15 (so BitBuffer holds at least 17 unused bits)
+;//
+;//
+
+        ;// Bitstream Decode initialise
+        ;//
+        ;// Initialises the bitstream decode global registers from
+        ;// bitstream pointers. This macro is split into 3 parts to enable
+        ;// scheduling.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $ppBitStream    - pointer to pointer to the next bitstream byte
+        ;// $pBitOffset     - pointer to the number of bits used in the current byte (0..7)
+        ;// $RBitStream     - register to use for pBitStream (can be $ppBitStream)
+        ;// $RBitBuffer     - register to use for BitBuffer
+        ;// $RBitCount      - register to use for BitCount   (can be $pBitOffset)
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $T1,$T2,$T3     - registers that must be preserved between calls to
+        ;//                   M_BD_INIT1 and M_BD_INIT2
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_INIT0  $ppBitStream, $pBitOffset, $RBitStream, $RBitBuffer, $RBitCount
+
+pBitStream  SETS "$RBitStream"             ;// record the caller's register choices in the
+BitBuffer   SETS "$RBitBuffer"             ;// global string variables so that every later
+BitCount    SETS "$RBitCount"              ;// bitstream macro refers to the same registers
+        
+        ;// load inputs: the current byte pointer and the bit offset within that byte
+        LDR     $pBitStream, [$ppBitStream]   ;// pBitStream = *ppBitStream
+        LDR     $BitCount, [$pBitOffset]      ;// BitCount   = *pBitOffset
+        MEND
+        
+        MACRO
+        M_BD_INIT1  $T1, $T2, $T3
+        LDRB    $T2, [$pBitStream, #2]          ;// T2 = third bitstream byte (T3 is not referenced here)
+        LDRB    $T1, [$pBitStream, #1]          ;// T1 = second bitstream byte
+        LDRB    $BitBuffer,  [$pBitStream], #3  ;// BitBuffer = first byte; step pointer past all 3 bytes
+        ADD     $BitCount, $BitCount, #8        ;// skip the top 8 bits of BitBuffer, left empty by M_BD_INIT2
+        MEND
+        
+        MACRO
+        M_BD_INIT2  $T1, $T2, $T3
+        ORR     $T2, $T2, $T1, LSL #8                 ;// T2 = (T1 << 8) OR T2, bytes as loaded by M_BD_INIT1
+        ORR     $BitBuffer, $T2, $BitBuffer, LSL #16  ;// BitBuffer = (first byte << 16) OR T2, i.e. 24 bits
+        MEND    
+        
+        ;//
+        ;// Look ahead fixed 1<=N<=8 bits without consuming any bits
+        ;// The next bits will be placed at bit 31..24 of destination register
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to look
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_LOOK8  $Symbol, $N
+        ASSERT  ($N>=1):LAND:($N<=8)                ;// assembly-time check that 1<=N<=8
+        MOV     $Symbol, $BitBuffer, LSL $BitCount  ;// left-align the unread bits; result is not masked to N bits
+        MEND
+        
+        ;//
+        ;// Look ahead fixed 1<=N<=16 bits without consuming any bits
+        ;// The next bits will be placed at bit 31..16 of destination register
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to look
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_LOOK16  $Symbol, $N, $T1
+        ASSERT  ($N >= 1):LAND:($N <= 16)           ;// assembly-time check that 1<=N<=16
+        MOV     $Symbol, $BitBuffer, LSL $BitCount  ;// left-align the unread bits; not masked to N bits, T1 is not referenced
+        MEND
+        
+        ;//
+        ;// Skips fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_SKIP8 $N, $T1
+        ASSERT  ($N>=1):LAND:($N<=8)                  ;// assembly-time check that 1<=N<=8
+        SUBS    $BitCount, $BitCount, #(8-$N)         ;// BitCount += N-8; CS if at least 8 bits are now used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch one refill byte
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: no refill, undo the -8
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: shift buffer up a byte and merge the refill
+        MEND
+        
+        
+        ;//
+        ;// Read fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_READ8 $Symbol, $N, $T1
+        ASSERT  ($N>=1):LAND:($N<=8)                  ;// assembly-time check that 1<=N<=8
+        MOVS    $Symbol, $BitBuffer, LSL $BitCount    ;// left-align unread bits (flags are overwritten by the SUBS below)
+        SUBS    $BitCount, $BitCount, #(8-$N)         ;// BitCount += N-8; CS if at least 8 bits are now used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch one refill byte
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: no refill, undo the -8
+        MOV     $Symbol, $Symbol, LSR #(32-$N)        ;// right-align: keep only the top N bits
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: shift buffer up a byte and merge the refill
+        MEND
+
+        ;//
+        ;// Read fixed 1<=N<=16 bits from the bitstream, advancing the bitstream pointer
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_READ16 $Symbol, $N, $T1, $T2
+        ASSERT  ($N>=1):LAND:($N<=16)
+        ASSERT  $Symbol<>$T1
+        IF ($N<=8)
+            M_BD_READ8  $Symbol, $N, $T1
+        ELSE        
+            ;// N>8 so we will be able to refill at least one byte (T2 is not referenced by this macro)
+            LDRB    $T1, [$pBitStream], #1                ;// unconditional first refill byte
+            MOVS    $Symbol, $BitBuffer, LSL $BitCount    ;// left-align unread bits before the buffer is shifted
+            ORR     $BitBuffer, $T1, $BitBuffer, LSL #8   ;// shift buffer up a byte and merge the first refill
+            SUBS    $BitCount, $BitCount, #(16-$N)        ;// BitCount += N-16; CS if a second byte is used up
+            LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch second refill byte
+            MOV     $Symbol, $Symbol, LSR #(32-$N)        ;// right-align: keep only the top N bits
+            ADDCC   $BitCount, $BitCount, #8              ;// CC: no second refill, undo 8 of the -16
+            ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge the second refill byte
+        ENDIF
+        MEND
+        
+        ;//
+        ;// Skip variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits. 1<=N<=8
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VSKIP8 $N, $T1
+        ADD     $BitCount, $BitCount, $N              ;// consume N bits (N is a register here)
+        SUBS    $BitCount, $BitCount, #8              ;// CS if at least 8 bits are now used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch one refill byte
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: no refill, undo the -8
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: shift buffer up a byte and merge the refill
+        MEND        
+        
+        ;//
+        ;// Skip variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits. 1<=N<=16
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VSKIP16 $N, $T1, $T2
+        ADD     $BitCount, $BitCount, $N              ;// consume N bits (T2 is not referenced by this macro)
+        SUBS    $BitCount, $BitCount, #8              ;// CS if at least 8 bits are now used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch first refill byte
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge first refill byte
+        SUBCSS  $BitCount, $BitCount, #8              ;// after a first refill, test for a second whole byte
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch second refill byte
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: undo the last -8 that found no whole byte
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge second refill byte
+        MEND        
+
+        ;//
+        ;// Read variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read. 1<=N<=8
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VREAD8 $Symbol, $N, $T1, $T2
+        MOV     $Symbol, $BitBuffer, LSL $BitCount    ;// left-align the unread bits
+        ADD     $BitCount, $BitCount, $N              ;// consume N bits (N is a register here)
+        SUBS    $BitCount, $BitCount, #8              ;// CS if at least 8 bits are now used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch one refill byte
+        RSB     $T2, $N, #32                          ;// T2 = 32-N, the right-shift amount
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: no refill, undo the -8
+        MOV     $Symbol, $Symbol, LSR $T2             ;// right-align: keep only the top N bits
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: shift buffer up a byte and merge the refill
+        MEND
+
+
+        ;//
+        ;// Read variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read. 1<=N<=16
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VREAD16 $Symbol, $N, $T1, $T2
+        MOV     $Symbol, $BitBuffer, LSL $BitCount    ;// left-align the unread bits
+        ADD     $BitCount, $BitCount, $N              ;// consume N bits (N is a register here)
+        SUBS    $BitCount, $BitCount, #8              ;// CS if at least 8 bits are now used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch first refill byte
+        RSB     $T2, $N, #32                          ;// T2 = 32-N, the right-shift amount
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge first refill byte
+        SUBCSS  $BitCount, $BitCount, #8              ;// after a first refill, test for a second whole byte
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch second refill byte
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: undo the last -8 that found no whole byte
+        MOV     $Symbol, $Symbol, LSR $T2             ;// right-align: keep only the top N bits
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge second refill byte
+        MEND
+
+
+        ;//
+        ;// Decode a code of the form 0000...001 where there
+        ;// are N zeros before the 1 and N<=15 (code length<=16)
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the number of zeros before the next 1
+        ;//                   >=16 is an illegal code
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//        
+        MACRO
+        M_BD_CLZ16 $Symbol, $T1, $T2
+        MOVS    $Symbol, $BitBuffer, LSL $BitCount    ;// left-align unread bits (T2 is not referenced by this macro)
+        CLZ     $Symbol, $Symbol                      ;// Symbol = number of leading zero bits
+        ADD     $BitCount, $BitCount, $Symbol         ;// consume the zeros
+        SUBS    $BitCount, $BitCount, #7        ;// length is Symbol+1, so +1-8; CS if a whole byte is used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch first refill byte
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge first refill byte
+        SUBCSS  $BitCount, $BitCount, #8              ;// after a first refill, test for a second whole byte
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch second refill byte
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: undo the last -8 that found no whole byte
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge second refill byte
+        MEND  
+
+        ;//
+        ;// Decode a code of the form 1111...110 where there
+        ;// are N ones before the 0 and N<=15 (code length<=16)
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the number of ones before the next 0
+        ;//                   >=16 is an illegal code
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//        
+        MACRO
+        M_BD_CLO16 $Symbol, $T1, $T2
+        MOV     $Symbol, $BitBuffer, LSL $BitCount    ;// left-align unread bits (T2 is not referenced by this macro)
+        MVN     $Symbol, $Symbol                      ;// invert so that leading ones become leading zeros
+        CLZ     $Symbol, $Symbol                      ;// Symbol = number of leading one bits in the stream
+        ADD     $BitCount, $BitCount, $Symbol         ;// consume the ones
+        SUBS    $BitCount, $BitCount, #7        ;// length is Symbol+1, so +1-8; CS if a whole byte is used
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch first refill byte
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge first refill byte
+        SUBCSS  $BitCount, $BitCount, #8              ;// after a first refill, test for a second whole byte
+        LDRCSB  $T1, [$pBitStream], #1                ;// CS: fetch second refill byte
+        ADDCC   $BitCount, $BitCount, #8              ;// CC: undo the last -8 that found no whole byte
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8   ;// CS: merge second refill byte
+        MEND  
+
+
+        ;//
+        ;// Variable Length Decode module
+        ;//
+        ;// Decodes one VLD Symbol from a bitstream and refills the bitstream
+        ;// buffer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pVLDTable      - pointer to VLD decode table of 16-bit entries.
+        ;//                   The format is described above at the start of
+        ;//                   this file.
+        ;// $S0             - The number of bits to look up for the first step
+        ;//                   1<=$S0<=8
+        ;// $S1             - The number of bits to look up for each subsequent
+        ;//                   step 1<=$S1<=$S0.
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - decoded VLD symbol value
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VLD $Symbol, $T1, $T2, $pVLDTable, $S0, $S1
+        ASSERT (1<=$S0):LAND:($S0<=8)
+        ASSERT (1<=$S1):LAND:($S1<=$S0)
+        
+        ;// Note 0<=BitCount<=15 on entry and exit
+        
+        MOVS    $T1, $BitBuffer, LSL $BitCount       ;// left align next bits
+        MOVS    $Symbol, #(2<<$S0)-2                 ;// create mask
+        AND     $Symbol, $Symbol, $T1, LSR #(31-$S0) ;// 2*(next $S0 bits)
+        SUBS    $BitCount, $BitCount, #8             ;// CS if buffer can be filled
+01
+        LDRCSB  $T1, [$pBitStream], #1               ;// load refill byte
+        LDRH    $Symbol, [$pVLDTable, $Symbol]       ;// load table entry
+        ADDCC   $BitCount, $BitCount, #8             ;// refill not possible
+        ADD     $BitCount, $BitCount, #$S0           ;// assume $S0 bits used
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8  ;// merge in refill byte
+        MOVS    $T1, $Symbol, LSR #1                 ;// CS=leaf entry
+        BCS     %FT02                                ;// leaf found: finish at label 02
+        
+        MOVS    $T1, $BitBuffer, LSL $BitCount       ;// left align next bit
+        IF (2*$S0-$S1<=8)
+            ;// Can combine refill check and -S0+S1 and keep $BitCount<=15
+            SUBS    $BitCount, $BitCount, #8+($S0-$S1) ;// CS if buffer can be filled
+        ELSE
+            ;// Separate refill check and -S0+S1 offset
+            SUBS  $BitCount, $BitCount, #8           ;// CS if buffer can be filled
+            SUB   $BitCount, $BitCount, #($S0-$S1)   ;// loop adds $S0 but only $S1 bits are used; flags untouched
+        ENDIF
+        ADD     $Symbol, $Symbol, $T1, LSR #(31-$S1) ;// add 2*(next $S1 bits) to
+        BIC     $Symbol, $Symbol, #1                 ;//   table offset
+        B       %BT01                                ;// load next table entry
+02
+        ;// BitCount range now depends on the route here
+        ;// if (first step)       S0 <= BitCount <= 7+S0        <=15
+        ;// else if (2*S0-S1<=8)  S0 <= BitCount <= 7+(2*S0-S1) <=15
+        ;// else                  S1 <= BitCount <= 7+S1        <=15
+        
+        SUB     $BitCount, $BitCount, $Symbol, LSR#13 ;// give back the overread bits: Len is entry bits 15..13
+        BIC     $Symbol, $T1, #0xF000                 ;// T1 = entry>>1: clear Len, keep the 12-bit symbol
+        MEND
+        
+
+        ;// Add an offset number of bits
+        ;//
+        ;// Outputs destination byte and bit index values which corresponds to an offset number of bits 
+        ;// from the current location. This is used to compare bitstream positions using M_BD_CMP.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $Offset         - Offset to be added in bits.
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $ByteIndex      - Destination pBitStream pointer after adding the Offset. 
+        ;//                   This value is 4 bytes ahead; subtract 4 to get the exact
+        ;//                   pointer (as in M_BD_FINI). For use with M_BD_CMP the subtraction is not needed.
+        ;// $BitIndex       - Destination BitCount after the addition of Offset number of bits
+        ;//
+        MACRO
+        M_BD_ADD  $ByteIndex, $BitIndex, $Offset
+
+        ;// ($ByteIndex,$BitIndex) = Current position + $Offset bits
+        ADD     $Offset, $Offset, $BitCount              ;// note: overwrites $Offset with $Offset+BitCount
+        AND     $BitIndex, $Offset, #7                   ;// bit position within the destination byte
+        ADD     $ByteIndex, $pBitStream, $Offset, ASR #3 ;// byte pointer: pBitStream + (total bits / 8)
+        MEND
+
+        ;// Move bitstream pointers to the location given
+        ;//
+        ;// Outputs destination byte and bit index values which corresponds to  
+        ;// the current location given (calculated using M_BD_ADD). 
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// $ByteIndex      - Destination pBitStream pointer after move. 
+        ;//                   This value is 4 bytes ahead; subtract 4 to get the exact
+        ;//                   pointer (as in M_BD_FINI).
+        ;// $BitIndex       - Destination BitCount after the move
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;//                  } See description above.  
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_MOV  $ByteIndex, $BitIndex
+
+        ;// ($pBitStream, $BitCount) = ($ByteIndex,$BitIndex); BitBuffer is not modified here
+        MOV     $BitCount, $BitIndex
+        MOV     $pBitStream, $ByteIndex
+        MEND
+
+        ;// Bitstream Compare
+        ;//
+        ;// Compares bitstream position with that of a destination position. Destination position 
+        ;// is held in two input registers which are calculated using M_BD_ADD macro
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $ByteIndex      - Destination pBitStream pointer, (4 byte ahead as described in M_BD_ADD)
+        ;// $BitIndex       - Destination BitCount
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// FLAGS           - GE if destination is reached, LT if destination is ahead
+        ;// $T1             - corrupted temp/scratch register
+        ;//
+        MACRO
+        M_BD_CMP  $ByteIndex, $BitIndex, $T1
+        
+        ;// Return flags set by (current position)-($ByteIndex,$BitIndex)
+        ;// so GE means that we have reached the indicated position
+
+        ADD         $T1, $pBitStream, $BitCount, LSR #3  ;// T1 = current byte position
+        CMP         $T1, $ByteIndex                      ;// compare byte positions first
+        AND         $T1, $BitCount, #7                   ;// T1 = current bit within the byte
+        CMPEQ       $T1, $BitIndex                       ;// bytes equal: the bit positions decide
+        MEND
+
+        
+        ;// Bitstream Decode finalise
+        ;//
+        ;// Writes back the bitstream state to the bitstream pointers
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $ppBitStream    - pointer to pointer to the next bitstream byte
+        ;// $pBitOffset     - pointer to the number of bits used in the current byte (0..7)
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } these register are corrupted
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_FINI  $ppBitStream, $pBitOffset
+        
+        ;// Advance pointer by the number of free bits in the buffer
+        ADD     $pBitStream, $pBitStream, $BitCount, LSR#3   ;// add BitCount/8 whole bytes
+        AND     $BitCount, $BitCount, #7                     ;// keep the bit offset within the byte (0..7)
+        
+        ;// Now move back 32 bits to reach the first unused bit
+        SUB     $pBitStream, $pBitStream, #4
+        
+        ;// Store out bitstream state
+        STR     $BitCount, [$pBitOffset]       ;// *pBitOffset  = BitCount
+        STR     $pBitStream, [$ppBitStream]    ;// *ppBitStream = pBitStream
+        MEND
+        
+        END
+        
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
new file mode 100755
index 0000000..b699034
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
@@ -0,0 +1,212 @@
+/**
+ * 
+ * File Name:  armCOMM_Bitstream.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File: armCOMM_Bitstream.h
+ * Brief: Declares common API's/Data types used across the OpenMax Encoders/Decoders.
+ *
+ */
+
+#ifndef _armCodec_H_
+#define _armCodec_H_
+
+#include "omxtypes.h"
+
+typedef struct {
+    OMX_U8   codeLen;   /* presumably the length of codeWord in bits -- TODO confirm against armPackVLC32 */
+    OMX_U32	 codeWord;  /* the variable-length code word bits */
+} ARM_VLC32;            /* one VLC entry; used by armUnPackVLC32() code books and armPackVLC32() */
+
+/* The above should be renamed as "ARM_VLC32" */
+
+/**
+ * Function: armLookAheadBits()
+ *
+ * Description:
+ * Get the next N bits from the bitstream without advancing the bitstream pointer
+ *
+ * Parameters:
+ * [in]     **ppBitStream
+ * [in]     *pOffset
+ * [in]     N=1...32
+ *
+ * Returns  Value
+ */
+
+OMX_U32 armLookAheadBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armGetBits()
+ *
+ * Description:
+ * Read N bits from the bitstream
+ *    
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ * [in]     N=1..32
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ * Returns  Value
+ */
+
+OMX_U32 armGetBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armByteAlign()
+ *
+ * Description:
+ * Align the pointer *ppBitStream to the next byte boundary
+ *
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ *
+ **/
+ 
+OMXVoid armByteAlign(const OMX_U8 **ppBitStream,OMX_INT *pOffset);
+
+/** 
+ * Function: armSkipBits()
+ *
+ * Description:
+ * Skip N bits from the value at *ppBitStream
+ *
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ * [in]     N
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ *
+ **/
+
+OMXVoid armSkipBits(const OMX_U8 **ppBitStream,OMX_INT *pOffset,OMX_INT N);
+
+/***************************************
+ * Variable bit length Decode
+ ***************************************/
+
+/**
+ * Function: armUnPackVLC32()
+ *
+ * Description:
+ * Variable length decode of variable length symbol (max size 32 bits) read from
+ * the bit stream pointed by *ppBitStream at *pOffset by using the table
+ * pointed by pCodeBook
+ * 
+ * Parameters:
+ * [in]     **ppBitStream
+ * [in]     *pOffset
+ * [in]     pCodeBook
+ * 
+ * [out]    **ppBitStream
+ * [out]    *pOffset
+ *
+ * Returns : Code Book Index if successful. 
+ *         : "ARM_NO_CODEBOOK_INDEX = 0xFFFF" if search fails.
+ **/
+
+#define ARM_NO_CODEBOOK_INDEX ((OMX_U16)(0xFFFF)) /* sentinel returned by armUnPackVLC32() when the search fails; fully parenthesized so it expands as one operand */
+
+OMX_U16 armUnPackVLC32(
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pOffset,
+    const ARM_VLC32 *pCodeBook
+);
+
+/***************************************
+ * Fixed bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackBits
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte 
+ *                      in the bit stream.
+ * [in]	pOffset	        pointer to the bit position in the byte 
+ *                      pointed by *ppBitStream. Valid within 0
+ *                      to 7.
+ * [in]	codeWord		Code word that need to be inserted in to the
+ *                          bitstream
+ * [in]	codeLength		Length of the code word valid range 1...32
+ *
+ * [out] ppBitStream	*ppBitStream is updated after the block is encoded,
+ *	                        so that it points to the current byte in the bit
+ *							stream buffer.
+ * [out] pOffset		*pOffset is updated so that it points to the
+ *							current bit position in the byte pointed by
+ *							*ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+ 
+OMXResult armPackBits (
+    OMX_U8  **ppBitStream, 
+    OMX_INT *pOffset,
+    OMX_U32 codeWord, 
+    OMX_INT codeLength 
+);
+ 
+/***************************************
+ * Variable bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackVLC32
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte 
+ *                      in the bit stream.
+ * [in]	pBitOffset	    pointer to the bit position in the byte 
+ *                      pointed by *ppBitStream. Valid within 0
+ *                      to 7.
+ * [in]	 code     		VLC code word that need to be inserted in to the
+ *                      bitstream
+ *
+ * [out] ppBitStream	*ppBitStream is updated after the block is encoded,
+ *	                    so that it points to the current byte in the bit
+ *						stream buffer.
+ * [out] pBitOffset		*pBitOffset is updated so that it points to the
+ *						current bit position in the byte pointed by
+ *						*ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+ 
+OMXResult armPackVLC32 (
+    OMX_U8 **ppBitStream, 
+    OMX_INT *pBitOffset,
+    ARM_VLC32 code 
+);
+
+#endif      /*_armCodec_H_*/
+
+/*End of File*/
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
new file mode 100755
index 0000000..e0cfdaa
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
@@ -0,0 +1,40 @@
+/**
+ *
+ * 
+ * File Name:  armCOMM_IDCTTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File         : armCOMM_IDCTTable.h
+ * Description  : Contains declarations of tables for IDCT calculation.
+ *
+ */
+  
+#ifndef _armCOMM_IDCTTable_H_
+#define _armCOMM_IDCTTable_H_
+
+#include "omxtypes.h"
+
+     /*  Table of s(u)*A(u)*A(v)/16 at Q15
+      *  (presumably describes armCOMM_IDCTPreScale, the 64-entry table
+      *  declared below -- one entry per (u,v) pair; confirm against the
+      *  table definition)
+      *  s(u)=1.0 for 0 <= u <= 5
+      *  s(6)=2.0
+      *  s(7)=4.0
+      *  A(0) = 2*sqrt(2)
+      *  A(u) = 4*cos(u*pi/16)  for (u!=0)
+      *
+      *  armCOMM_IDCTCoef is imported by the IDCT assembly macro
+      *  (armCOMM_IDCT_s.h). Presumably it holds the Q15 constants
+      *  1/sqrt(2), sin(pi/8) and cos(pi/8) that the macro names for its
+      *  coefficient register -- confirm against the table definition.
+      */
+extern const OMX_U16 armCOMM_IDCTPreScale [64];
+extern const OMX_U16 armCOMM_IDCTCoef [4];
+
+#endif /* _armCOMM_IDCTTable_H_ */
+
+
+/* End of File */
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
new file mode 100755
index 0000000..0baa087
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
@@ -0,0 +1,1451 @@
+;//
+;// This confidential and proprietary software may be used only as
+;// authorised by a licensing agreement from ARM Limited
+;//   (C) COPYRIGHT 2004 ARM Limited
+;//       ALL RIGHTS RESERVED
+;// The entire notice above must be reproduced on all authorised
+;// copies and copies may only be made to the extent permitted
+;// by a licensing agreement from ARM Limited.
+;//
+;// IDCT_s.s
+;//
+;// Inverse DCT module
+;//
+;// 
+;// ALGORITHM DESCRIPTION
+;//
+;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
+;// column and then a 1D IDCT for each row.
+;//
+;// The 8-point 1D IDCT is defined by
+;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
+;//
+;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
+;//   c(u,x) = cos( (2x+1)*u*pi/16 )
+;//
+;// We compute the 8-point 1D IDCT using the reverse of
+;// the Arai-Agui-Nakajima flow graph which we split into
+;// 5 stages named in reverse order to identify with the
+;// forward DCT. Direct inversion of the forward formulae
+;// in file FDCT_s.s gives:
+;//
+;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
+;//             [ A(0) = 2*sqrt(2)
+;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
+;//
+;// IStage 4:   i0 = j0             i1 = j4
+;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
+;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
+;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
+;//
+;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
+;//             h2 = (i2*sqrt2)-i3  h3 = i3
+;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
+;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
+;//             [ The above two lines rotate by -(pi/8) ]
+;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2 
+;//             
+;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
+;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
+;//             g7 = h7             g6 = h6 - h7
+;//             g5 = h5 - g6        g4 = h4 - g5
+;//
+;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
+;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
+;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
+;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
+;//
+;// Note that most coefficients are halved 3 times during the
+;// above calculation. We can rescale the algorithm dividing
+;// the input by 8 to remove the halvings.
+;//
+;// IStage 5:   j(u) = T(u)*A(u)/8
+;//
+;// IStage 4:   i0 = j0             i1 = j4
+;//             i3 = j2 + j6        i2 = j2 - j6
+;//             i7 = j5 + j3        i4 = j5 - j3
+;//             i5 = j1 + j7        i6 = j1 - j7
+;//
+;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
+;//             h2 = (i2*sqrt2)-i3  h3 = i3
+;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
+;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
+;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7 
+;//             
+;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
+;//             g1 = h1 + h2        g2 = h1 - h2
+;//             g7 = h7             g6 = h6 - h7
+;//             g5 = h5 - g6        g4 = h4 - g5
+;//
+;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
+;//             f1 = g1 + g6        f6 = g1 - g6
+;//             f2 = g2 + g5        f5 = g2 - g5
+;//             f3 = g3 + g4        f4 = g3 - g4
+;//
+;// Note:
+;// 1. The scaling by A(u)/8 can often be combined with inverse
+;//    quantization. The column and row scalings can be combined.
+;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
+;//    to the above code but is otherwise identical.
+;// 3. The rotation by -pi/8 can be performed using three multiplies
+;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
+;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
+;// 4. If |T(u)|<=1 then from the IDCT definition,
+;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
+;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
+;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
+;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
+;//            = (approx)2.64
+;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
+;//    The table below shows input patterns generating the maximum
+;//    value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
+;//    InputPattern      Max |f(x)|
+;//      PPPPPPPP        |f0| =  2.64
+;//      PPPMMMMM        |f1| =  2.64
+;//      PPMMMPPP        |f2| =  2.64
+;//      PPMMPPMM        |f3| =  2.64
+;//      PMMPPMMP        |f4| =  2.64
+;//      PMMPMMPM        |f5| =  2.64
+;//      PMPPMPMP        |f6| =  2.64
+;//      PMPMPMPM        |f7| =  2.64
+;//   Note that this input pattern is the transpose of the
+;//   corresponding max input pattern for the FDCT.
+
+;// Arguments
+
+pSrc    RN 0    ;// r0: source data buffer
+Stride  RN 1    ;// r1: destination stride in bytes
+pDest   RN 2    ;// r2: destination data buffer
+pScale  RN 3    ;// r3: pointer to scaling table (aan-scale table, format set by $inscale)
+
+
+        ;// DCT Inverse Macro
+        ;// The DCT code should be parametrized according
+        ;// to the following inputs:
+        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
+        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
+        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
+        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
+        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
+        ;//
+        ;// Inputs:
+        ;// pSrc   = r0 = Pointer to input data
+        ;//               Range is -256 to +255 (9-bit)
+        ;// Stride = r1 = Stride between output (destination) lines, in bytes
+        ;// pDest  = r2 = Pointer to output data
+        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
+        
+        
+        
+        MACRO
+        M_IDCT  $outsize, $inscale, $stride
+        LCLA    SHIFT
+        
+        
+        IF ARM1136JS
+        
+;// REGISTER ALLOCATION
+;// This is hard since we have 8 values, 9 free registers and each
+;// butterfly requires a temporary register. We also want to 
+;// maintain register order so we can use LDM/STM. The table below
+;// summarises the register allocation that meets all these criteria.
+;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
+;//
+;// r1  a01     g0  h0
+;// r4  b01 f0  g1  h1  i0
+;// r5  a23 f1  g2      i1
+;// r6  b23 f2  g3  h2  i2
+;// r7  a45 f3      h3  i3
+;// r8  b45 f4  g4  h4  i4
+;// r9  a67 f5  g5  h5  i5
+;// r10 b67 f6  g6  h6  i6
+;// r11     f7  g7  h7  i7
+;//
+ra01    RN 1
+rb01    RN 4
+ra23    RN 5
+rb23    RN 6
+ra45    RN 7
+rb45    RN 8
+ra67    RN 9
+rb67    RN 10
+rtmp    RN 11
+csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
+LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
+;// Transpose allocation
+xft     RN ra01
+xf0     RN rb01
+xf1     RN ra23
+xf2     RN rb23
+xf3     RN ra45
+xf4     RN rb45
+xf5     RN ra67
+xf6     RN rb67
+xf7     RN rtmp
+;// IStage 1 allocation
+xg0     RN xft
+xg1     RN xf0
+xg2     RN xf1
+xg3     RN xf2
+xgt     RN xf3
+xg4     RN xf4
+xg5     RN xf5
+xg6     RN xf6
+xg7     RN xf7
+;// IStage 2 allocation
+xh0     RN xg0
+xh1     RN xg1
+xht     RN xg2
+xh2     RN xg3
+xh3     RN xgt
+xh4     RN xg4
+xh5     RN xg5
+xh6     RN xg6
+xh7     RN xg7
+;// IStage 3,4 allocation
+xit     RN xh0
+xi0     RN xh1
+xi1     RN xht
+xi2     RN xh2
+xi3     RN xh3
+xi4     RN xh4
+xi5     RN xh5
+xi6     RN xh6
+xi7     RN xh7
+        
+        M_STR   pDest,  ppDest
+        IF "$stride"="s"
+            M_STR   Stride, pStride
+        ENDIF
+        M_ADR   pDest,  pBlk
+        LDR     csPiBy8, =0x30fc7642
+        LDR     LoopRR2, =0x00005a82
+  
+v6_idct_col$_F
+        ;// Load even values
+        LDR     xi4, [pSrc], #4  ;// j0
+        LDR     xi5, [pSrc, #4*16-4]  ;// j4
+        LDR     xi6, [pSrc, #2*16-4]  ;// j2
+        LDR     xi7, [pSrc, #6*16-4]  ;// j6
+        
+        ;// Scale Even Values
+        IF "$inscale"="s16" ;// 16x16 mul
+SHIFT       SETA    12
+            LDR     xi0, [pScale], #4
+            LDR     xi1, [pScale, #4*16-4]        
+            LDR     xi2, [pScale, #2*16-4]
+            MOV     xit, #1<<(SHIFT-1)
+            SMLABB  xi3, xi0, xi4, xit
+            SMLATT  xi4, xi0, xi4, xit
+            SMLABB  xi0, xi1, xi5, xit
+            SMLATT  xi5, xi1, xi5, xit
+            MOV     xi3, xi3, ASR #SHIFT
+            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
+            LDR     xi3, [pScale, #6*16-4]
+            SMLABB  xi1, xi2, xi6, xit
+            SMLATT  xi6, xi2, xi6, xit
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
+            SMLABB  xi2, xi3, xi7, xit
+            SMLATT  xi7, xi3, xi7, xit
+            MOV     xi1, xi1, ASR #SHIFT
+            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT
+            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
+        ENDIF
+        IF "$inscale"="s32" ;// 32x16 mul
+SHIFT       SETA    (12+8-16)
+            MOV     xit, #1<<(SHIFT-1)
+            LDR     xi0, [pScale], #8
+            LDR     xi1, [pScale, #0*32+4-8]
+            LDR     xi2, [pScale, #4*32-8]
+            LDR     xi3, [pScale, #4*32+4-8]            
+            SMLAWB  xi0, xi0, xi4, xit
+            SMLAWT  xi1, xi1, xi4, xit
+            SMLAWB  xi2, xi2, xi5, xit
+            SMLAWT  xi3, xi3, xi5, xit            
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT            
+            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
+            LDR     xi0, [pScale, #2*32-8]
+            LDR     xi1, [pScale, #2*32+4-8]
+            LDR     xi2, [pScale, #6*32-8]
+            LDR     xi3, [pScale, #6*32+4-8]            
+            SMLAWB  xi0, xi0, xi6, xit
+            SMLAWT  xi1, xi1, xi6, xit
+            SMLAWB  xi2, xi2, xi7, xit
+            SMLAWT  xi3, xi3, xi7, xit            
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT            
+            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
+        ENDIF
+                
+        ;// Load odd values
+        LDR     xi0, [pSrc, #1*16-4]      ;// j1
+        LDR     xi1, [pSrc, #7*16-4]      ;// j7
+        LDR     xi2, [pSrc, #5*16-4]      ;// j5
+        LDR     xi3, [pSrc, #3*16-4]      ;// j3
+        
+        IF  {TRUE}
+            ;// shortcut if odd values 0
+            TEQ     xi0, #0
+            TEQEQ   xi1, #0
+            TEQEQ   xi2, #0
+            TEQEQ   xi3, #0
+            BEQ     v6OddZero$_F
+        ENDIF
+        
+        ;// Store scaled even values
+        STMIA   pDest, {xi4, xi5, xi6, xi7}
+        
+        ;// Scale odd values
+        IF "$inscale"="s16"
+            ;// Perform AAN Scale
+            LDR     xi4, [pScale, #1*16-4]
+            LDR     xi5, [pScale, #7*16-4]        
+            LDR     xi6, [pScale, #5*16-4]
+            SMLABB  xi7, xi0, xi4, xit
+            SMLATT  xi0, xi0, xi4, xit
+            SMLABB  xi4, xi1, xi5, xit
+            SMLATT  xi1, xi1, xi5, xit
+            MOV     xi7, xi7, ASR #SHIFT
+            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
+            LDR     xi7, [pScale, #3*16-4]
+            SMLABB  xi5, xi2, xi6, xit
+            SMLATT  xi2, xi2, xi6, xit
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
+            SMLABB  xi6, xi3, xi7, xit
+            SMLATT  xi3, xi3, xi7, xit
+            MOV     xi5, xi5, ASR #SHIFT
+            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT
+            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
+        ENDIF
+        IF "$inscale"="s32" ;// 32x16 mul
+            LDR     xi4, [pScale, #1*32-8]
+            LDR     xi5, [pScale, #1*32+4-8]
+            LDR     xi6, [pScale, #7*32-8]
+            LDR     xi7, [pScale, #7*32+4-8]            
+            SMLAWB  xi4, xi4, xi0, xit
+            SMLAWT  xi5, xi5, xi0, xit
+            SMLAWB  xi6, xi6, xi1, xit
+            SMLAWT  xi7, xi7, xi1, xit            
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT            
+            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
+            LDR     xi4, [pScale, #5*32-8]
+            LDR     xi5, [pScale, #5*32+4-8]
+            LDR     xi6, [pScale, #3*32-8]
+            LDR     xi7, [pScale, #3*32+4-8]            
+            SMLAWB  xi4, xi4, xi2, xit
+            SMLAWT  xi5, xi5, xi2, xit
+            SMLAWB  xi6, xi6, xi3, xit
+            SMLAWT  xi7, xi7, xi3, xit            
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT            
+            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
+        ENDIF
+        
+        LDR     xit, =0x00010001        ;// rounding constant
+        SADD16 xi5, xi0, xi1           ;// (j1+j7)/2
+        SHADD16 xi5, xi5, xit
+        
+        SSUB16  xi6, xi0, xi1           ;// j1-j7
+        SADD16 xi7, xi2, xi3           ;// (j5+j3)/2
+        SHADD16 xi7, xi7, xit
+        
+        SSUB16  xi4, xi2, xi3           ;// j5-j3
+        
+        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
+        
+        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
+        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
+        
+        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
+        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
+        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]   
+        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
+                
+        SMULBB  xi1, xi3, LoopRR2
+        SMULTB  xi3, xi3, LoopRR2
+                
+        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
+        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
+        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
+                
+        ;// xi0,xi1,xi2,xi3 now free
+        ;// IStage 4,3, rows 2to3 x1/2
+        
+        MOV     xi3, xi3, LSL #1
+        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
+        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
+                
+        ;// IStage 2, rows4to7
+        SSUB16  xg6, xh6, xh7
+        SSUB16  xg5, xh5, xg6        
+        SSUB16  xg4, xh4, xg5
+                
+        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
+        
+        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        LDRD    xi0, [pDest]            ;// j0, j4 scaled
+        SSUB16  xh2, xh2, xi3
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        
+        SHADD16 xh0, xi0, xi1
+        SHSUB16 xh1, xi0, xi1                
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+        
+        ;// IStage 1 all rows
+        SADD16  xf3, xg3, xg4
+        SSUB16  xf4, xg3, xg4
+        SADD16  xf2, xg2, xg5
+        SSUB16  xf5, xg2, xg5
+        SADD16  xf1, xg1, xg6
+        SSUB16  xf6, xg1, xg6
+        SADD16  xf0, xg0, xg7
+        SSUB16  xf7, xg0, xg7
+        
+        ;// Transpose, store and loop
+        PKHBT   ra01, xf0, xf1, LSL #16
+        PKHTB   rb01, xf1, xf0, ASR #16
+        
+        PKHBT   ra23, xf2, xf3, LSL #16
+        PKHTB   rb23, xf3, xf2, ASR #16
+        
+        PKHBT   ra45, xf4, xf5, LSL #16
+        PKHTB   rb45, xf5, xf4, ASR #16
+        
+        PKHBT   ra67, xf6, xf7, LSL #16
+        STMIA   pDest!, {ra01, ra23, ra45, ra67}      
+        PKHTB   rb67, xf7, xf6, ASR #16
+        STMIA   pDest!, {rb01, rb23, rb45, rb67}                              
+        BCC     v6_idct_col$_F
+        
+        SUB     pSrc, pDest, #(64*2)
+        M_LDR   pDest, ppDest
+        IF "$stride"="s"
+            M_LDR   pScale, pStride 
+        ENDIF
+        B       v6_idct_row$_F
+        
+v6OddZero$_F
+        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
+        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        SSUB16  xh2, xh2, xi3
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        
+        SHADD16 xh0, xi4, xi5
+        SHSUB16 xh1, xi4, xi5                
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+               
+        ;// IStage 1 all rows
+        MOV  xf3, xg3
+        MOV  xf4, xg3
+        MOV  xf2, xg2
+        MOV  xf5, xg2
+        MOV  xf1, xg1
+        MOV  xf6, xg1
+        MOV  xf0, xg0
+        MOV  xf7, xg0
+        
+        ;// Transpose
+        PKHBT   ra01, xf0, xf1, LSL #16
+        PKHTB   rb01, xf1, xf0, ASR #16
+        
+        PKHBT   ra23, xf2, xf3, LSL #16
+        PKHTB   rb23, xf3, xf2, ASR #16
+        
+        PKHBT   ra45, xf4, xf5, LSL #16
+        PKHTB   rb45, xf5, xf4, ASR #16
+        
+        PKHBT   ra67, xf6, xf7, LSL #16
+        PKHTB   rb67, xf7, xf6, ASR #16
+                
+        STMIA   pDest!, {ra01, ra23, ra45, ra67}      
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        STMIA   pDest!, {rb01, rb23, rb45, rb67}      
+        
+        BCC     v6_idct_col$_F
+        SUB     pSrc, pDest, #(64*2)
+        M_LDR   pDest, ppDest
+        IF "$stride"="s"
+            M_LDR   pScale, pStride 
+        ENDIF
+               
+        
+v6_idct_row$_F
+        ;// IStage 4,3, rows4to7 x1/4
+        LDR     xit, =0x00010001        ;// rounding constant
+        LDR     xi0, [pSrc, #1*16]      ;// j1
+        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
+        LDR     xi2, [pSrc, #5*16]      ;// j5
+        LDR     xi3, [pSrc, #3*16]      ;// j3
+        
+        SHADD16 xi1, xi1, xit           ;// 2*j7
+        SHADD16 xi1, xi1, xit           ;// j7                
+        
+        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
+        SSUB16  xi6, xi0, xi1           ;// j1-j7
+        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
+        SSUB16  xi4, xi2, xi3           ;// j5-j3
+        
+        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
+        
+        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
+        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
+        
+        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
+        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
+        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]   
+        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
+                
+        SMULBB  xi1, xi3, LoopRR2
+        SMULTB  xi3, xi3, LoopRR2
+                
+        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
+        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
+        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
+        
+        MOV     xi3, xi3, LSL #1
+        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
+               
+        ;// xi0,xi1,xi2,xi3 now free
+        ;// IStage 4,3, rows 2to3 x1/2
+        
+        LDR     xi0, [pSrc, #2*16]      ;// j2
+        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
+        
+        ;// IStage 2, rows4to7
+        SSUB16  xg6, xh6, xh7
+        SSUB16  xg5, xh5, xg6
+        SSUB16  xg4, xh4, xg5
+        
+        SHADD16 xi1, xi1, xit           ;// j6
+        SSUB16  xi2, xi0, xi1           ;// (j2-j6)        
+        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        LDR     xi1, [pSrc, #4*16]      ;// j4
+        LDR     xi0, [pSrc], #4         ;// j0
+
+        SSUB16  xh2, xh2, xi3
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        
+        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
+        SHADD16 xh0, xi0, xi1           ;// of DC result
+        SHSUB16 xh1, xi0, xi1
+                
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+        
+        ;// IStage 1 all rows
+        SHADD16 xf3, xg3, xg4
+        SHSUB16 xf4, xg3, xg4
+        SHADD16 xf2, xg2, xg5
+        SHSUB16 xf5, xg2, xg5
+        SHADD16 xf1, xg1, xg6
+        SHSUB16 xf6, xg1, xg6
+        SHADD16 xf0, xg0, xg7
+        SHSUB16 xf7, xg0, xg7
+        
+        ;// Saturate
+        IF ("$outsize"="u8")
+            USAT16  xf0, #8, xf0
+            USAT16  xf1, #8, xf1
+            USAT16  xf2, #8, xf2
+            USAT16  xf3, #8, xf3
+            USAT16  xf4, #8, xf4
+            USAT16  xf5, #8, xf5
+            USAT16  xf6, #8, xf6
+            USAT16  xf7, #8, xf7        
+        ENDIF
+        IF ("$outsize"="s9")
+            SSAT16  xf0, #9, xf0
+            SSAT16  xf1, #9, xf1
+            SSAT16  xf2, #9, xf2
+            SSAT16  xf3, #9, xf3
+            SSAT16  xf4, #9, xf4
+            SSAT16  xf5, #9, xf5
+            SSAT16  xf6, #9, xf6
+            SSAT16  xf7, #9, xf7        
+        ENDIF
+        
+        ;// Transpose to Row, Pack and store
+        IF ("$outsize"="u8")
+            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
+            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
+            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
+            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
+            PKHBT   ra01, xf0, xf2, LSL #16
+            PKHTB   rb01, xf2, xf0, ASR #16
+            PKHBT   ra23, xf4, xf6, LSL #16
+            PKHTB   rb23, xf6, xf4, ASR #16
+            STMIA   pDest, {ra01, ra23}
+            IF "$stride"="s"
+                ADD     pDest, pDest, pScale
+                STMIA   pDest, {rb01, rb23}
+                ADD     pDest, pDest, pScale
+            ELSE                
+                ADD     pDest, pDest, #($stride)
+                STMIA   pDest, {rb01, rb23}
+                ADD     pDest, pDest, #($stride)
+            ENDIF
+        ENDIF
+        IF ("$outsize"="s9"):LOR:("$outsize"="s16")        
+            PKHBT   ra01, xf0, xf1, LSL #16
+            PKHTB   rb01, xf1, xf0, ASR #16
+        
+            PKHBT   ra23, xf2, xf3, LSL #16
+            PKHTB   rb23, xf3, xf2, ASR #16
+            
+            PKHBT   ra45, xf4, xf5, LSL #16
+            PKHTB   rb45, xf5, xf4, ASR #16
+            
+            PKHBT   ra67, xf6, xf7, LSL #16
+            PKHTB   rb67, xf7, xf6, ASR #16
+            
+            STMIA   pDest, {ra01, ra23, ra45, ra67}      
+            IF "$stride"="s"
+                ADD     pDest, pDest, pScale
+                STMIA   pDest, {rb01, rb23, rb45, rb67}      
+                ADD     pDest, pDest, pScale
+            ELSE                
+                ADD     pDest, pDest, #($stride)
+                STMIA   pDest, {rb01, rb23, rb45, rb67}      
+                ADD     pDest, pDest, #($stride)
+            ENDIF
+        ENDIF
+        
+        BCC     v6_idct_row$_F
+        ENDIF ;// ARM1136JS
+
+
+        IF CortexA8
+        
+Src0            EQU  7              
+Src1            EQU  8              
+Src2            EQU  9              
+Src3            EQU  10              
+Src4            EQU  11              
+Src5            EQU  12              
+Src6            EQU  13
+Src7            EQU  14
+Tmp             EQU  15
+
+qXj0            QN Src0.S16 
+qXj1            QN Src1.S16
+qXj2            QN Src2.S16
+qXj3            QN Src3.S16
+qXj4            QN Src4.S16
+qXj5            QN Src5.S16
+qXj6            QN Src6.S16
+qXj7            QN Src7.S16
+qXjt            QN Tmp.S16
+
+dXj0lo          DN (Src0*2).S16
+dXj0hi          DN (Src0*2+1).S16
+dXj1lo          DN (Src1*2).S16
+dXj1hi          DN (Src1*2+1).S16
+dXj2lo          DN (Src2*2).S16
+dXj2hi          DN (Src2*2+1).S16
+dXj3lo          DN (Src3*2).S16
+dXj3hi          DN (Src3*2+1).S16
+dXj4lo          DN (Src4*2).S16
+dXj4hi          DN (Src4*2+1).S16
+dXj5lo          DN (Src5*2).S16
+dXj5hi          DN (Src5*2+1).S16
+dXj6lo          DN (Src6*2).S16
+dXj6hi          DN (Src6*2+1).S16
+dXj7lo          DN (Src7*2).S16
+dXj7hi          DN (Src7*2+1).S16
+dXjtlo          DN (Tmp*2).S16
+dXjthi          DN (Tmp*2+1).S16
+
+qXi0            QN qXj0
+qXi1            QN qXj4
+qXi2            QN qXj2
+qXi3            QN qXj7
+qXi4            QN qXj5
+qXi5            QN qXjt
+qXi6            QN qXj1
+qXi7            QN qXj6
+qXit            QN qXj3
+
+dXi0lo          DN dXj0lo
+dXi0hi          DN dXj0hi
+dXi1lo          DN dXj4lo
+dXi1hi          DN dXj4hi
+dXi2lo          DN dXj2lo
+dXi2hi          DN dXj2hi
+dXi3lo          DN dXj7lo
+dXi3hi          DN dXj7hi
+dXi4lo          DN dXj5lo
+dXi4hi          DN dXj5hi
+dXi5lo          DN dXjtlo
+dXi5hi          DN dXjthi
+dXi6lo          DN dXj1lo
+dXi6hi          DN dXj1hi
+dXi7lo          DN dXj6lo
+dXi7hi          DN dXj6hi
+dXitlo          DN dXj3lo
+dXithi          DN dXj3hi
+
+qXh0            QN qXit
+qXh1            QN qXi0
+qXh2            QN qXi2
+qXh3            QN qXi3
+qXh4            QN qXi7
+qXh5            QN qXi5
+qXh6            QN qXi4
+qXh7            QN qXi1
+qXht            QN qXi6
+
+dXh0lo          DN dXitlo
+dXh0hi          DN dXithi
+dXh1lo          DN dXi0lo
+dXh1hi          DN dXi0hi
+dXh2lo          DN dXi2lo
+dXh2hi          DN dXi2hi
+dXh3lo          DN dXi3lo
+dXh3hi          DN dXi3hi
+dXh4lo          DN dXi7lo
+dXh4hi          DN dXi7hi
+dXh5lo          DN dXi5lo
+dXh5hi          DN dXi5hi
+dXh6lo          DN dXi4lo
+dXh6hi          DN dXi4hi
+dXh7lo          DN dXi1lo
+dXh7hi          DN dXi1hi
+dXhtlo          DN dXi6lo
+dXhthi          DN dXi6hi
+
+qXg0            QN qXh2
+qXg1            QN qXht
+qXg2            QN qXh1
+qXg3            QN qXh0
+qXg4            QN qXh4
+qXg5            QN qXh5
+qXg6            QN qXh6
+qXg7            QN qXh7
+qXgt            QN qXh3
+
+qXf0            QN qXg6
+qXf1            QN qXg5
+qXf2            QN qXg4
+qXf3            QN qXgt
+qXf4            QN qXg3
+qXf5            QN qXg2
+qXf6            QN qXg1
+qXf7            QN qXg0
+qXft            QN qXg7
+
+
+qXt0            QN 1.S32
+qXt1            QN 2.S32
+qT0lo           QN 1.S32         
+qT0hi           QN 2.S32         
+qT1lo           QN 3.S32         
+qT1hi           QN 4.S32         
+qScalelo        QN 5.S32        ;// used to read post scale values
+qScalehi        QN 6.S32
+qTemp0          QN 5.S32         
+qTemp1          QN 6.S32    
+
+
+Scale1          EQU 6
+Scale2          EQU 15
+qScale1         QN Scale1.S16     
+qScale2         QN Scale2.S16     
+dScale1lo       DN (Scale1*2).S16     
+dScale1hi       DN (Scale1*2+1).S16
+dScale2lo       DN (Scale2*2).S16     
+dScale2hi       DN (Scale2*2+1).S16
+
+dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
+InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
+S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
+C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
+
+pTemp           RN 12
+
+                
+        IMPORT  armCOMM_IDCTCoef
+                    
+        VLD1        {qXj0,qXj1}, [pSrc @64]!
+        VLD1        {qXj2,qXj3}, [pSrc @64]!
+        VLD1        {qXj4,qXj5}, [pSrc @64]!
+        VLD1        {qXj6,qXj7}, [pSrc @64]!
+        
+        ;// Load PreScale and multiply with Src
+        ;// IStage 4
+        
+        IF "$inscale"="s16"                         ;// 16X16 Mul
+            M_IDCT_PRESCALE16
+        ENDIF
+        
+        IF "$inscale"="s32"                         ;// 32X32 Mul
+            M_IDCT_PRESCALE32
+        ENDIF
+
+        ;// IStage 3
+        VQDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
+        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
+        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
+        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
+        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
+        VQDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
+        VSUB        qXh2, qXi2, qXi3                ;// h2, h3
+
+        VMULL       qXt0, dXi4lo, C                 ;// c*i4
+        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
+        VMULL       qXt1, dXi4hi, C
+        VMLAL       qXt1, dXi6hi, S
+        VSHRN       dXh4lo, qXt0, #16               ;// h4
+        VSHRN       dXh4hi, qXt1, #16
+        
+        VMULL       qXt0, dXi6lo, C                 ;// c*i6
+        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*i6
+        VMULL       qXt1, dXi6hi, C
+        VMLSL       qXt1, dXi4hi, S
+        VSHRN       dXh6lo, qXt0, #16               ;// h6
+        VSHRN       dXh6hi, qXt1, #16
+        
+        ;// IStage 2
+        VSUB        qXg6, qXh6, qXh7
+        VSUB        qXg5, qXh5, qXg6
+        VSUB        qXg4, qXh4, qXg5
+        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
+        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
+        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
+        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
+
+        ;// IStage 1 all rows
+        VADD        qXf3, qXg3, qXg4        
+        VSUB        qXf4, qXg3, qXg4        
+        VADD        qXf2, qXg2, qXg5        
+        VSUB        qXf5, qXg2, qXg5        
+        VADD        qXf1, qXg1, qXg6
+        VSUB        qXf6, qXg1, qXg6        
+        VADD        qXf0, qXg0, qXg7
+        VSUB        qXf7, qXg0, qXg7      
+
+        ;// Transpose, store and loop
+XTR0            EQU Src5
+XTR1            EQU Tmp
+XTR2            EQU Src6
+XTR3            EQU Src7
+XTR4            EQU Src3
+XTR5            EQU Src0
+XTR6            EQU Src1
+XTR7            EQU Src2
+XTRt            EQU Src4
+                
+qA0             QN  XTR0.S32  ;// for XTRpose
+qA1             QN  XTR1.S32
+qA2             QN  XTR2.S32
+qA3             QN  XTR3.S32
+qA4             QN  XTR4.S32
+qA5             QN  XTR5.S32
+qA6             QN  XTR6.S32
+qA7             QN  XTR7.S32
+
+dB0             DN  XTR0*2+1      ;// for using VSWP
+dB1             DN  XTR1*2+1
+dB2             DN  XTR2*2+1
+dB3             DN  XTR3*2+1
+dB4             DN  XTR4*2
+dB5             DN  XTR5*2
+dB6             DN  XTR6*2
+dB7             DN  XTR7*2
+
+          
+        VTRN        qXf0, qXf1
+        VTRN        qXf2, qXf3
+        VTRN        qXf4, qXf5
+        VTRN        qXf6, qXf7
+        VTRN        qA0, qA2
+        VTRN        qA1, qA3
+        VTRN        qA4, qA6
+        VTRN        qA5, qA7        
+        VSWP        dB0, dB4
+        VSWP        dB1, dB5
+        VSWP        dB2, dB6
+        VSWP        dB3, dB7
+        
+
+qYj0            QN qXf0
+qYj1            QN qXf1
+qYj2            QN qXf2
+qYj3            QN qXf3
+qYj4            QN qXf4
+qYj5            QN qXf5
+qYj6            QN qXf6
+qYj7            QN qXf7
+qYjt            QN qXft
+
+dYj0lo          DN (XTR0*2).S16
+dYj0hi          DN (XTR0*2+1).S16
+dYj1lo          DN (XTR1*2).S16
+dYj1hi          DN (XTR1*2+1).S16
+dYj2lo          DN (XTR2*2).S16
+dYj2hi          DN (XTR2*2+1).S16
+dYj3lo          DN (XTR3*2).S16
+dYj3hi          DN (XTR3*2+1).S16
+dYj4lo          DN (XTR4*2).S16
+dYj4hi          DN (XTR4*2+1).S16
+dYj5lo          DN (XTR5*2).S16
+dYj5hi          DN (XTR5*2+1).S16
+dYj6lo          DN (XTR6*2).S16
+dYj6hi          DN (XTR6*2+1).S16
+dYj7lo          DN (XTR7*2).S16
+dYj7hi          DN (XTR7*2+1).S16
+dYjtlo          DN (XTRt*2).S16
+dYjthi          DN (XTRt*2+1).S16
+
+qYi0            QN qYj0
+qYi1            QN qYj4
+qYi2            QN qYj2
+qYi3            QN qYj7
+qYi4            QN qYj5
+qYi5            QN qYjt
+qYi6            QN qYj1
+qYi7            QN qYj6
+qYit            QN qYj3
+
+dYi0lo          DN dYj0lo
+dYi0hi          DN dYj0hi
+dYi1lo          DN dYj4lo
+dYi1hi          DN dYj4hi
+dYi2lo          DN dYj2lo
+dYi2hi          DN dYj2hi
+dYi3lo          DN dYj7lo
+dYi3hi          DN dYj7hi
+dYi4lo          DN dYj5lo
+dYi4hi          DN dYj5hi
+dYi5lo          DN dYjtlo
+dYi5hi          DN dYjthi
+dYi6lo          DN dYj1lo
+dYi6hi          DN dYj1hi
+dYi7lo          DN dYj6lo
+dYi7hi          DN dYj6hi
+dYitlo          DN dYj3lo
+dYithi          DN dYj3hi
+
+qYh0            QN qYit
+qYh1            QN qYi0
+qYh2            QN qYi2
+qYh3            QN qYi3
+qYh4            QN qYi7
+qYh5            QN qYi5
+qYh6            QN qYi4
+qYh7            QN qYi1
+qYht            QN qYi6
+
+dYh0lo          DN dYitlo
+dYh0hi          DN dYithi
+dYh1lo          DN dYi0lo
+dYh1hi          DN dYi0hi
+dYh2lo          DN dYi2lo
+dYh2hi          DN dYi2hi
+dYh3lo          DN dYi3lo
+dYh3hi          DN dYi3hi
+dYh4lo          DN dYi7lo
+dYh4hi          DN dYi7hi
+dYh5lo          DN dYi5lo
+dYh5hi          DN dYi5hi
+dYh6lo          DN dYi4lo
+dYh6hi          DN dYi4hi
+dYh7lo          DN dYi1lo
+dYh7hi          DN dYi1hi
+dYhtlo          DN dYi6lo
+dYhthi          DN dYi6hi
+
+qYg0            QN qYh2
+qYg1            QN qYht
+qYg2            QN qYh1
+qYg3            QN qYh0
+qYg4            QN qYh4
+qYg5            QN qYh5
+qYg6            QN qYh6
+qYg7            QN qYh7
+qYgt            QN qYh3
+
+qYf0            QN qYg6
+qYf1            QN qYg5
+qYf2            QN qYg4
+qYf3            QN qYgt
+qYf4            QN qYg3
+qYf5            QN qYg2
+qYf6            QN qYg1
+qYf7            QN qYg0
+qYft            QN qYg7
+
+        VRSHR       qYj7, qYj7, #2
+        VRSHR       qYj6, qYj6, #1
+        
+        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
+        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
+        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
+        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
+        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
+        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
+
+        VQDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
+        ;// IStage 4,3 rows 0to1 x 1/2
+        
+        MOV         pTemp, #0x4             ;// ensure correct round
+        VDUP        qScale1, pTemp           ;// of DC result
+        VADD        qYi0, qYi0, qScale1
+        
+        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
+        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
+
+        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
+        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
+        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
+        VQDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
+
+        VMULL       qXt0, dYi4lo, C         ;// c*i4
+        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
+        VMULL       qXt1, dYi4hi, C
+        VMLAL       qXt1, dYi6hi, S
+        VSHRN       dYh4lo, qXt0, #16       ;// h4
+        VSHRN       dYh4hi, qXt1, #16
+        
+        VMULL       qXt0, dYi6lo, C         ;// c*i6
+        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
+        VMULL       qXt1, dYi6hi, C
+        VMLSL       qXt1, dYi4hi, S
+        VSHRN       dYh6lo, qXt0, #16       ;// h6
+        VSHRN       dYh6hi, qXt1, #16
+        
+        VSUB        qYg6, qYh6, qYh7
+        VSUB        qYg5, qYh5, qYg6
+        VSUB        qYg4, qYh4, qYg5
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
+        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
+        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
+        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
+        
+
+        ;// IStage 1 all rows
+        VHADD        qYf3, qYg3, qYg4        
+        VHSUB        qYf4, qYg3, qYg4        
+        VHADD        qYf2, qYg2, qYg5        
+        VHSUB        qYf5, qYg2, qYg5        
+        VHADD        qYf1, qYg1, qYg6
+        VHSUB        qYf6, qYg1, qYg6        
+        VHADD        qYf0, qYg0, qYg7
+        VHSUB        qYf7, qYg0, qYg7      
+
+YTR0            EQU Src0
+YTR1            EQU Src4
+YTR2            EQU Src1
+YTR3            EQU Src2
+YTR4            EQU Src7
+YTR5            EQU Src5
+YTR6            EQU Tmp
+YTR7            EQU Src6
+YTRt            EQU Src3
+
+qC0             QN  YTR0.S32                ;// for YTRpose
+qC1             QN  YTR1.S32
+qC2             QN  YTR2.S32
+qC3             QN  YTR3.S32
+qC4             QN  YTR4.S32
+qC5             QN  YTR5.S32
+qC6             QN  YTR6.S32
+qC7             QN  YTR7.S32
+
+dD0             DN  YTR0*2+1                ;// for using VSWP
+dD1             DN  YTR1*2+1
+dD2             DN  YTR2*2+1
+dD3             DN  YTR3*2+1
+dD4             DN  YTR4*2
+dD5             DN  YTR5*2
+dD6             DN  YTR6*2
+dD7             DN  YTR7*2
+          
+        VTRN        qYf0, qYf1
+        VTRN        qYf2, qYf3
+        VTRN        qYf4, qYf5
+        VTRN        qYf6, qYf7
+        VTRN        qC0, qC2
+        VTRN        qC1, qC3
+        VTRN        qC4, qC6
+        VTRN        qC5, qC7        
+        VSWP        dD0, dD4
+        VSWP        dD1, dD5
+        VSWP        dD2, dD6
+        VSWP        dD3, dD7
+
+        
+dYf0U8          DN YTR0*2.U8
+dYf1U8          DN YTR1*2.U8
+dYf2U8          DN YTR2*2.U8
+dYf3U8          DN YTR3*2.U8
+dYf4U8          DN YTR4*2.U8
+dYf5U8          DN YTR5*2.U8
+dYf6U8          DN YTR6*2.U8
+dYf7U8          DN YTR7*2.U8
+        
+        ;//
+        ;// Do saturation if outsize is other than S16
+        ;//
+        
+        IF ("$outsize"="u8")
+            ;// Output range [0-255]
+            VQMOVN            dYf0U8, qYf0
+            VQMOVN            dYf1U8, qYf1
+            VQMOVN            dYf2U8, qYf2
+            VQMOVN            dYf3U8, qYf3
+            VQMOVN            dYf4U8, qYf4
+            VQMOVN            dYf5U8, qYf5
+            VQMOVN            dYf6U8, qYf6
+            VQMOVN            dYf7U8, qYf7
+        ENDIF
+        
+        IF ("$outsize"="s9")
+            ;// Output range [-256 to +255]
+            VQSHL            qYf0, qYf0, #16-9
+            VQSHL            qYf1, qYf1, #16-9
+            VQSHL            qYf2, qYf2, #16-9
+            VQSHL            qYf3, qYf3, #16-9
+            VQSHL            qYf4, qYf4, #16-9
+            VQSHL            qYf5, qYf5, #16-9
+            VQSHL            qYf6, qYf6, #16-9
+            VQSHL            qYf7, qYf7, #16-9
+            
+            VSHR             qYf0, qYf0, #16-9
+            VSHR             qYf1, qYf1, #16-9
+            VSHR             qYf2, qYf2, #16-9
+            VSHR             qYf3, qYf3, #16-9
+            VSHR             qYf4, qYf4, #16-9
+            VSHR             qYf5, qYf5, #16-9
+            VSHR             qYf6, qYf6, #16-9
+            VSHR             qYf7, qYf7, #16-9
+        ENDIF
+
+        ;// Store output depending on the Stride size
+        IF "$stride"="s"
+            VST1        qYf0, [pDest @64], Stride
+            VST1        qYf1, [pDest @64], Stride
+            VST1        qYf2, [pDest @64], Stride
+            VST1        qYf3, [pDest @64], Stride
+            VST1        qYf4, [pDest @64], Stride
+            VST1        qYf5, [pDest @64], Stride
+            VST1        qYf6, [pDest @64], Stride
+            VST1        qYf7, [pDest @64]            
+        ELSE
+            IF ("$outsize"="u8")
+                VST1        dYf0U8, [pDest @64], #8
+                VST1        dYf1U8, [pDest @64], #8
+                VST1        dYf2U8, [pDest @64], #8
+                VST1        dYf3U8, [pDest @64], #8
+                VST1        dYf4U8, [pDest @64], #8
+                VST1        dYf5U8, [pDest @64], #8
+                VST1        dYf6U8, [pDest @64], #8
+                VST1        dYf7U8, [pDest @64]
+            ELSE
+                ;// ("$outsize"="s9") or ("$outsize"="s16")
+                VST1        qYf0, [pDest @64], #16
+                VST1        qYf1, [pDest @64], #16
+                VST1        qYf2, [pDest @64], #16
+                VST1        qYf3, [pDest @64], #16
+                VST1        qYf4, [pDest @64], #16
+                VST1        qYf5, [pDest @64], #16
+                VST1        qYf6, [pDest @64], #16
+                VST1        qYf7, [pDest @64]
+            ENDIF
+        
+        ENDIF
+
+
+
+        ENDIF ;// CortexA8
+
+
+
+        MEND        
+
+        ;// Scale TWO input rows with TWO rows of 16 bit scale values
+        ;//
+        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
+        ;// input (eight S16 values each) with two rows of scale values. It also
+        ;// loads the next two rows of scale values from pScale if $LastRow is "0".
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $dAlo           - Input D register with first four S16 values of row n
+        ;// $dAhi           - Input D register with next four S16 values of row n
+        ;// $dBlo           - Input D register with first four S16 values of row n+1
+        ;// $dBhi           - Input D register with next four S16 values of row n+1
+        ;// pScale          - Pointer to next row of scale values
+        ;// qT0lo           - Temporary scratch register (widened products of $dAlo)
+        ;// qT0hi           - Temporary scratch register (widened products of $dAhi)
+        ;// qT1lo           - Temporary scratch register (widened products of $dBlo)
+        ;// qT1hi           - Temporary scratch register (widened products of $dBhi)
+        ;// dScale1lo       - Scale values for the first four values of row n
+        ;// dScale1hi       - Scale values for the next four values of row n
+        ;// dScale2lo       - Scale values for the first four values of row n+1
+        ;// dScale2hi       - Scale values for the next four values of row n+1
+        ;// (dScale1*/dScale2* are presumably the D halves of qScale1/qScale2 - TODO confirm)
+        ;// Input Flag
+        ;//
+        ;// $LastRow        - "0" if more rows follow (scales are reloaded); any other value skips the reload
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $dAlo           - Scaled output values (first four S16 of row n)
+        ;// $dAhi           - Scaled output values (next four S16 of row n)
+        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
+        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
+        ;// qScale1         - Scale values for row n+2 (only reloaded when $LastRow is "0")
+        ;// qScale2         - Scale values for row n+3 (only reloaded when $LastRow is "0")
+        ;// pScale          - Advanced by 32 bytes when the scales are reloaded, else unchanged
+        ;//
+        MACRO
+        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
+        VMULL       qT0lo, $dAlo, dScale1lo     ;// widening multiply: row n * scale
+        VMULL       qT0hi, $dAhi, dScale1hi
+        VMULL       qT1lo, $dBlo, dScale2lo     ;// widening multiply: row n+1 * scale
+        VMULL       qT1hi, $dBhi, dScale2hi
+        IF "$LastRow"="0"
+            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+2
+            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+3
+        ENDIF
+        VQRSHRN       $dAlo, qT0lo, #12        ;// rounding shift right by 12, saturate, narrow
+        VQRSHRN       $dAhi, qT0hi, #12        
+        VQRSHRN       $dBlo, qT1lo, #12        
+        VQRSHRN       $dBhi, qT1hi, #12        
+        MEND
+
+        ;// Scale 8x8 block input values with 16 bit scale values
+        ;//
+        ;// This macro is used to pre-scale a block of 8x8 input values.
+        ;// It also performs the first stage transformations of the IDCT.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// dXjnlo          - n th input D register with first four S16 values
+        ;// dXjnhi          - n th input D register with next four S16 values
+        ;// qXjn            - n th input Q register with eight S16 values
+        ;// pScale          - Pointer to scale values (advanced by 8 rows of 16 bytes)
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// qXin            - n th output Q register with eight S16 output values of 1st stage (qXi2..qXi7 are written here)
+        ;// pSrc, dCoefs    - pSrc is overwritten with the address of armCOMM_IDCTCoef; dCoefs is loaded from it
+        MACRO
+        M_IDCT_PRESCALE16
+        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
+        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 1
+        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
+        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        ;// Pre scale row 2 & 3
+        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        ;// Pre scale row 4 & 5
+        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        ;// Pre scale row 6 & 7, no further scale reload
+        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
+        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
+        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
+        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
+        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
+        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
+        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
+        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
+        MEND    
+        
+        
+        ;// Scale 8x8 block input values with 32 bit scale values
+        ;//
+        ;// This macro is used to pre-scale a block of 8x8 input values.
+        ;// It also performs the first stage transformations of the IDCT.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// dXjnlo          - n th input D register with first four S16 values
+        ;// dXjnhi          - n th input D register with next four S16 values
+        ;// qXjn            - n th input Q register with eight S16 values
+        ;// pScale          - Pointer to 32bit scale values in Q23 format
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
+        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
+        ;// Also modified: pScale, pTemp, pSrc (ends as address of armCOMM_IDCTCoef), dCoefs, qXj4
+        MACRO
+        M_IDCT_PRESCALE32
+qScale0lo       QN 0.S32      ;// all eight scale rows are mapped onto Q0..Q3
+qScale0hi       QN 1.S32
+qScale1lo       QN 2.S32
+qScale1hi       QN 3.S32
+qScale2lo       QN qScale1lo
+qScale2hi       QN qScale1hi
+qScale3lo       QN qScale1lo
+qScale3hi       QN qScale1hi
+qScale4lo       QN qScale1lo
+qScale4hi       QN qScale1hi
+qScale5lo       QN qScale0lo
+qScale5hi       QN qScale0hi
+qScale6lo       QN qScale0lo
+qScale6hi       QN qScale0hi
+qScale7lo       QN qScale0lo
+qScale7hi       QN qScale0hi
+        ;// Widened source rows share two register pairs (Q4/Q5 and Q6/Src4)
+qSrc0lo         QN 4.S32
+qSrc0hi         QN 5.S32
+qSrc1lo         QN 6.S32
+qSrc1hi         QN Src4.S32
+qSrc2lo         QN qSrc0lo
+qSrc2hi         QN qSrc0hi
+qSrc3lo         QN qSrc0lo
+qSrc3hi         QN qSrc0hi
+qSrc4lo         QN qSrc0lo
+qSrc4hi         QN qSrc0hi
+qSrc5lo         QN qSrc1lo
+qSrc5hi         QN qSrc1hi
+qSrc6lo         QN qSrc1lo
+qSrc6hi         QN qSrc1hi
+qSrc7lo         QN qSrc0lo
+qSrc7hi         QN qSrc0hi
+        ;// Butterfly results reuse the Q0/Q1 scale registers
+qRes17lo        QN qScale0lo
+qRes17hi        QN qScale0hi
+qRes26lo        QN qScale0lo
+qRes26hi        QN qScale0hi
+qRes53lo        QN qScale0lo
+qRes53hi        QN qScale0hi
+        ;// NOTE: because of the register sharing above, the instruction order below must be kept
+            ADD         pTemp, pScale, #4*8*7           ;// Address of scale row 7 (7 rows * 8 values * 4 bytes)
+            
+            ;// Row 0 (plus scaling of rows 1 and 7)
+            VLD1        {qScale0lo, qScale0hi}, [pScale]!
+            VSHLL       qSrc0lo, dXj0lo, #(12-1)        ;// widen and shift left by 11
+            VSHLL       qSrc0hi, dXj0hi, #(12-1)            
+            VLD1        {qScale1lo, qScale1hi}, [pScale]!
+            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo     ;// rounding doubling multiply high: src*scale >> 20
+            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
+            VLD1        {qScale7lo, qScale7hi}, [pTemp]!
+            VSHLL       qSrc1lo, dXj1lo, #(12-1)
+            VSHLL       qSrc1hi, dXj1hi, #(12-1)            
+            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
+            VMOVN       dXi0hi, qSrc0hi
+            VSHLL       qSrc7lo, dXj7lo, #(12-1)
+            VSHLL       qSrc7hi, dXj7hi, #(12-1)
+            SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// undo the 32 byte writeback and step back one row: scale row 6
+            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
+            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
+            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
+            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
+            VLD1        {qScale2lo, qScale2hi}, [pScale]!
+
+            ;// Row 1 & 7
+            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
+            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
+            VMOVN       dXi5lo, qRes17lo                ;// Output i5
+            VMOVN       dXi5hi, qRes17hi              
+            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
+            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
+            VMOVN       dXi6lo, qRes17lo                ;// Output i6
+            VMOVN       dXi6hi, qRes17hi      
+            VSHLL       qSrc2lo, dXj2lo, #(12-1)
+            VSHLL       qSrc2hi, dXj2hi, #(12-1)
+            VLD1        {qScale6lo, qScale6hi}, [pTemp]!
+            VSHLL       qSrc6lo, dXj6lo, #(12-1)
+            VSHLL       qSrc6hi, dXj6hi, #(12-1)
+            SUB         pTemp, pTemp, #((16*2)+(4*8*1)) ;// undo the 32 byte writeback and step back one row: scale row 5
+            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
+            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
+            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
+            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
+            VLD1        {qScale3lo, qScale3hi}, [pScale]!
+
+            ;// Row 2 & 6
+            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
+            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
+            VMOVN       dXi3lo, qRes26lo                ;// Output i3
+            VMOVN       dXi3hi, qRes26hi              
+            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
+            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
+            VMOVN       dXi2lo, qRes26lo                ;// Output i2
+            VMOVN       dXi2hi, qRes26hi      
+            VSHLL       qSrc3lo, dXj3lo, #(12-1)
+            VSHLL       qSrc3hi, dXj3hi, #(12-1)
+            VLD1        {qScale5lo, qScale5hi}, [pTemp]!
+            VSHLL       qSrc5lo, dXj5lo, #(12-1)
+            VSHLL       qSrc5hi, dXj5hi, #(12-1)
+            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
+            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
+            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
+            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
+            
+            ;// Row 3 & 5
+            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
+            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
+            SUB         pSrc, pSrc, #16*2*2             ;// NOTE(review): rewinds pSrc by 64 bytes, presumably to input row 4 - confirm against caller
+            VMOVN       dXi7lo, qRes53lo                ;// Output i7
+            VMOVN       dXi7hi, qRes53hi              
+            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
+            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
+            VLD1        qXj4, [pSrc @64]                ;// load j4 from memory
+            VMOVN       dXi4lo, qRes53lo                ;// Output i4
+            VMOVN       dXi4hi, qRes53hi                              
+            VSHLL       qSrc4lo, dXj4lo, #(12-1)
+            VSHLL       qSrc4hi, dXj4hi, #(12-1)
+            VLD1        {qScale4lo, qScale4hi}, [pScale]            
+            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
+            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
+            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
+            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
+            ;// Row 4
+            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
+            VMOVN       dXi1hi, qSrc4hi              
+        
+        MEND
+                                                
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
new file mode 100755
index 0000000..51118fd
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
@@ -0,0 +1,27 @@
+/**
+ * 
+ * File Name:  armCOMM_MaskTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Mask Table to mask the end of array
+ */
+ 
+
+
+#ifndef _ARMCOMM_MASKTABLE_H_
+#define _ARMCOMM_MASKTABLE_H_
+/* NOTE: this header includes nothing itself; OMX_U16 and OMX_U8 must already be declared (presumably via omxtypes.h). */
+#define MaskTableSize 72   /* number of entries in each mask table */
+  
+/* Mask tables (OMX_U16 and OMX_U8 entries); only declared here, the definitions live elsewhere */
+
+extern const OMX_U16 armCOMM_qMaskTable16[MaskTableSize];
+extern const OMX_U8 armCOMM_qMaskTable8[MaskTableSize];
+
+#endif /* _ARMCOMM_MASKTABLE_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
new file mode 100755
index 0000000..41b3e1e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
@@ -0,0 +1,43 @@
+/* Guard the header against multiple inclusion. */
+#ifndef __ARM_COMM_VERSION_H__
+#define __ARM_COMM_VERSION_H__
+
+
+/* The following line should be in omxtypes.h but hasn't been approved by OpenMAX yet */
+#define OMX_VERSION 102
+
+/* Two-level stringizing: ARM_INDIRECT macro-expands its argument first, then ARM_QUOTE turns the result into a string literal. */
+#define ARM_QUOTE(a) #a
+#define ARM_INDIRECT(A) ARM_QUOTE(A)
+
+/* Convert the OMX_VERSION number into a string that can be used, for example, to print it out. */
+#define ARM_VERSION_STRING ARM_INDIRECT(OMX_VERSION)   /* expands to "102" */
+
+
+/* Define this in order to turn on ARM version/release/build strings in each domain */
+#define ARM_INCLUDE_VERSION_DESCRIPTIONS   /* defined unconditionally here, so the declarations below are always visible */
+
+#ifdef ARM_INCLUDE_VERSION_DESCRIPTIONS    /* per-domain description strings: declared here only */
+  extern const char * const omxAC_VersionDescription;
+  extern const char * const omxIC_VersionDescription;
+  extern const char * const omxIP_VersionDescription;
+  extern const char * const omxSP_VersionDescription;
+  extern const char * const omxVC_VersionDescription;
+#endif /* ARM_INCLUDE_VERSION_DESCRIPTIONS */
+
+
+/* The following entries should be automatically updated by the release script */
+/* They are used in the ARM version strings defined for each domain.             */
+
+/* The release tag associated with this release of the library. - used for source and object releases */
+#define OMX_ARM_RELEASE_TAG  "r1p0-00bet0"
+
+/* The ARM architecture used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_ARCHITECTURE "ARM Architecture V7 with NEON"
+
+/* The ARM Toolchain used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_TOOLCHAIN    "ARM RVCT 3.1"
+
+
+#endif /* __ARM_COMM_VERSION_H__ */
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
new file mode 100755
index 0000000..0956bd1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
@@ -0,0 +1,1157 @@
+;//
+;// 
+;// File Name:  armCOMM_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// ARM optimized OpenMAX common header file
+;//
+
+;// Protect against multiple inclusion (the matching ENDIF is not within this section of the file)
+ IF :LNOT::DEF:ARMCOMM_S_H
+ GBLL ARMCOMM_S_H
+
+        REQUIRE8            ;// Requires 8-byte stack alignment
+        PRESERVE8           ;// Preserves 8-byte stack alignment
+        
+        GBLL    ARM_ERRORCHECK
+ARM_ERRORCHECK  SETL {FALSE}
+
+;// Globals (assembly-time variables shared by the macros in this file)
+
+        GBLS    _RRegList   ;// R saved register list
+        GBLS    _DRegList   ;// D saved register list
+        GBLS    _Variant    ;// Selected processor variant
+        GBLS    _CPU        ;// CPU name
+        GBLS    _Struct     ;// Structure name
+        
+        GBLL    _InFunc     ;// Inside function assembly flag
+        GBLL    _SwLong     ;// Long switch flag
+        
+        GBLA    _RBytes     ;// Number of register bytes on stack
+        GBLA    _SBytes     ;// Number of scratch bytes on stack 
+        GBLA    _ABytes     ;// Stack offset of next argument
+        GBLA    _Workspace  ;// Stack offset of scratch workspace
+        GBLA    _F          ;// Function number
+        GBLA    _StOff      ;// Struct offset
+        GBLA    _SwNum      ;// Switch number
+        GBLS    _32         ;// Suffix for 32 byte alignment
+        GBLS    _16         ;// Suffix for 16 byte alignment
+        
+_InFunc         SETL    {FALSE}
+_SBytes         SETA    0
+_F              SETA    0
+_SwNum          SETA    0
+_32             SETS    "ALIGN32"
+_16             SETS    "ALIGN16"
+
+;/////////////////////////////////////////////////////////
+;// Override the tools settings of the CPU if the symbol
+;// OVERRIDECPU is defined, otherwise use the CPU defined by
+;// the assembler settings.
+;/////////////////////////////////////////////////////////
+
+       IF :DEF: OVERRIDECPU
+_CPU       SETS  OVERRIDECPU
+       ELSE
+_CPU       SETS    {CPU}       ;// built-in variable: name of the CPU selected for the assembler
+       ENDIF
+
+
+
+;/////////////////////////////////////////////////////////
+;// Work out which code to build
+;/////////////////////////////////////////////////////////
+;// Reject the old scheme of pre-defining a variant symbol: INFO with a non-zero first operand reports an error
+        IF :DEF:ARM1136JS:LOR::DEF:CortexA8:LOR::DEF:ARM_GENERIC
+            INFO 1,"Please switch to using M_VARIANTS"
+        ENDIF
+
+        ;// Define and reset all officially recognised variants
+        MACRO
+        _M_DEF_VARIANTS
+        _M_DEF_VARIANT ARM926EJS        ;// each call declares the variant flag and sets it FALSE
+        _M_DEF_VARIANT ARM1136JS
+        _M_DEF_VARIANT ARM1136JS_U
+        _M_DEF_VARIANT CortexA8
+        _M_DEF_VARIANT ARM7TDMI
+        MEND
+        
+        MACRO
+        _M_DEF_VARIANT $var
+        GBLL $var           ;// logical flag named after the variant
+        GBLL _ok$var        ;// marker symbol: its existence is what _M_VARIANT tests to recognise the name
+$var    SETL {FALSE}
+        MEND        
+        
+
+        ;// Variant declaration
+        ;// $v0..$v7 = up to eight variant names implemented by the source file
+        ;// Define a list of code variants supported by this
+        ;// source file. This macro then chooses the most
+        ;// appropriate variant to build for the currently configured
+        ;// core.
+        ;// On exit only the chosen variant flag is TRUE and _Variant holds its name; an error is reported if none matches.
+        MACRO
+        M_VARIANTS $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7        
+        ;// Set to TRUE variants that are supported
+        _M_DEF_VARIANTS
+        _M_VARIANT $v0
+        _M_VARIANT $v1
+        _M_VARIANT $v2
+        _M_VARIANT $v3
+        _M_VARIANT $v4
+        _M_VARIANT $v5
+        _M_VARIANT $v6
+        _M_VARIANT $v7
+        
+        ;// Look for first available variant to match a CPU
+        ;// _M_TRY cpu, variant fall back list
+_Variant SETS ""                
+        _M_TRY ARM926EJ-S,   ARM926EJS
+        _M_TRY ARM1176JZ-S,  ARM1136JS
+        _M_TRY ARM1176JZF-S, ARM1136JS
+        _M_TRY ARM1156T2-S,  ARM1136JS
+        _M_TRY ARM1156T2F-S, ARM1136JS
+        _M_TRY ARM1136J-S,   ARM1136JS
+        _M_TRY ARM1136JF-S,  ARM1136JS
+        _M_TRY MPCore,       ARM1136JS
+        _M_TRY falcon-vfp, ARM1136JS
+        _M_TRY falcon-full-neon, CortexA8
+        _M_TRY Cortex-A8NoNeon, ARM1136JS
+        _M_TRY Cortex-A8,    CortexA8, ARM1136JS
+        _M_TRY Cortex-R4,    ARM1136JS
+        _M_TRY ARM7TDMI,     ARM7TDMI   ;// an empty fallback list could never select the ARM7TDMI variant
+        
+        ;// Select the correct variant: clear every flag, then raise only the chosen one
+        _M_DEF_VARIANTS
+        IF _Variant=""
+            INFO 1, "No match found for CPU '$_CPU'"
+        ELSE
+$_Variant   SETL {TRUE}
+        ENDIF
+        MEND
+        
+        ;// Register a variant as available (sets its flag TRUE); empty arguments are ignored
+        MACRO
+        _M_VARIANT $var
+        IF "$var"=""
+            MEXIT                       ;// unused argument slot: nothing to register
+        ENDIF
+        IF :LNOT::DEF:_ok$var           ;// marker missing: the name was never declared by _M_DEF_VARIANT
+            INFO 1, "Unrecognized variant '$var'"
+        ENDIF
+$var    SETL {TRUE}
+        MEND
+        
+        ;// For a given CPU, see if any of the variants supporting
+        ;// this CPU are available. The first available variant is
+        ;// chosen and recorded in _Variant. Does nothing unless $cpu equals _CPU.
+        MACRO
+        _M_TRY $cpu, $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7
+        IF "$cpu"<>_CPU
+            MEXIT                       ;// not the CPU being built for
+        ENDIF
+        _M_TRY1 $v0                     ;// candidates are tried in order of preference
+        _M_TRY1 $v1
+        _M_TRY1 $v2
+        _M_TRY1 $v3
+        _M_TRY1 $v4
+        _M_TRY1 $v5
+        _M_TRY1 $v6
+        _M_TRY1 $v7
+        ;// Check a match was found (INFO with a non-zero first operand reports an error)
+        IF _Variant=""
+            INFO 1, "No variant match found for CPU '$_CPU'"
+        ENDIF
+        MEND
+        
+        MACRO
+        _M_TRY1 $var
+        IF "$var"=""
+            MEXIT                       ;// unused argument slot
+        ENDIF
+        IF (_Variant=""):LAND:$var      ;// nothing chosen yet and this variant flag is TRUE
+_Variant SETS "$var"
+        ENDIF
+        MEND
+        
+;////////////////////////////////////////////////////////
+;// Structure definition
+;////////////////////////////////////////////////////////
+
+        ;// Declare a structure of given name: starts a new layout at offset 0
+        MACRO
+        M_STRUCT $sname
+_Struct SETS "$sname"
+_StOff  SETA 0
+        MEND
+        
+        ;// Declare a structure field
+        ;// The field offset symbol is called <struct name>_$fname
+        ;// $size   = the size of each entry, must be power of 2 (the field is aligned to it)
+        ;// $number = (if provided) the number of entries for an array
+        MACRO
+        M_FIELD $fname, $size, $number
+        IF (_StOff:AND:($size-1))!=0                            ;// current offset is not a multiple of the entry size
+_StOff      SETA _StOff + ($size - (_StOff:AND:($size-1)))      ;// round the offset up to the next multiple
+        ENDIF
+$_Struct._$fname EQU _StOff
+        IF "$number"<>""
+_StOff      SETA _StOff + $size*$number
+        ELSE
+_StOff      SETA _StOff + $size
+        ENDIF
+        MEND
+        
+        
+        MACRO
+        M_ENDSTRUCT
+sizeof_$_Struct EQU _StOff      ;// total size = offset just past the last field (no tail padding is added)
+_Struct SETS ""                 ;// no structure is open any more
+        MEND
+
+;//////////////////////////////////////////////////////////
+;// Switch and table macros
+;//////////////////////////////////////////////////////////
+
+        ;// Start a relative switch table with register to switch on
+        ;// Must be followed directly by the M_CASE entries, in case order.
+        ;// $v = the register to switch on
+        ;// $s = if specified must be "L" to indicate long
+        ;//      this allows a greater range to the case code
+        MACRO
+        M_SWITCH $v, $s
+        ASSERT "$s"="":LOR:"$s"="L"
+_SwLong SETL {FALSE}
+        IF "$s"="L"
+_SwLong     SETL {TRUE}
+        ENDIF
+_SwNum  SETA _SwNum+1           ;// new switch number, gives the table label below a unique name
+        IF {CONFIG}=16
+            ;// Thumb
+            IF _SwLong
+                TBH [pc, $v, LSL#1]     ;// table branch through halfword entries
+            ELSE
+                TBB [pc, $v]            ;// table branch through byte entries
+            ENDIF
+_Switch$_SwNum                          ;// start of the offset table filled in by M_CASE
+        ELSE
+            ;// ARM
+            ADD pc, pc, $v, LSL #2      ;// pc reads as this instruction + 8, so index 0 lands just after the NOP
+            NOP
+        ENDIF
+        MEND
+        
+        ;// Add a case to the switch statement (one table entry that transfers control to $label)
+        MACRO
+        M_CASE  $label
+        IF {CONFIG}=16
+            ;// Thumb
+            IF _SwLong
+                DCW ($label - _Switch$_SwNum)/2     ;// halfword entry: half the byte offset from the table start
+            ELSE
+                DCB ($label - _Switch$_SwNum)/2     ;// byte entry: half the byte offset from the table start
+            ENDIF
+        ELSE
+            ;// ARM
+            B   $label                              ;// the table is a list of branch instructions
+        ENDIF
+        MEND
+        
+        ;// End of switch statement
+        ;// Pads to a 2-byte boundary so that whatever follows the table
+        ;// is halfword aligned even after an odd number of byte entries.
+        MACRO
+        M_ENDSWITCH
+        ALIGN 2
+        MEND       
+
+
+;////////////////////////////////////////////////////////
+;// Data area allocation
+;////////////////////////////////////////////////////////
+
+        ;// Constant table allocator macro
+        ;//
+        ;// Creates a new section for each constant table
+        ;// $name is symbol through which the table can be accessed.
+        ;// $align is the optional alignment of the table, log2 of 
+        ;//  the byte alignment - $align=4 is 16 byte aligned
+        ;//
+        ;// Must be used outside a function (between M_END and the next
+        ;// M_START). The table data itself follows the macro call.
+        MACRO
+        M_TABLE  $name, $align
+        ASSERT :LNOT:_InFunc
+        IF "$align"=""
+            AREA |.constdata|, READONLY, DATA
+        ELSE
+            ;// AREAs inherit the alignment of the first declaration.
+            ;// Therefore for each alignment size we must have an area
+            ;// of a different name.
+            AREA constdata_a$align, READONLY, DATA, ALIGN=$align
+            
+            ;// We also force alignment in case we are tagging onto
+            ;// an already started area.
+            ALIGN (1<<$align)
+        ENDIF
+$name
+        MEND
+        
+;/////////////////////////////////////////////////////
+;// Macros to allocate space on the stack
+;//
+;// These all assume that the stack is 8-byte aligned
+;// at entry to the function, which means that the 
+;// 32-byte alignment macro needs to work in a
+;// bit more of a special way...
+;/////////////////////////////////////////////////////
+
+        
+
+
+        ;// Allocate 1-byte aligned area of name
+        ;// $name size $size bytes.
+        ;//
+        ;// Must be used before M_START of the function that owns the
+        ;// area. Defines <name><function number> as the current offset
+        ;// into the local stack workspace, then grows the workspace
+        ;// (_SBytes) by $size.
+        MACRO
+        M_ALLOC1  $name, $size
+        ASSERT :LNOT:_InFunc
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND
+            
+        ;// Allocate 2-byte aligned area of name
+        ;// $name size $size bytes.
+        ;//
+        ;// Must be used before M_START of the function that owns the
+        ;// area. The workspace offset is rounded up to a multiple of 2
+        ;// before the symbol is defined.
+        MACRO
+        M_ALLOC2  $name, $size
+        ASSERT :LNOT:_InFunc
+        ;// Round the running offset up to the next multiple of 2
+        IF (_SBytes:AND:1)!=0
+_SBytes     SETA _SBytes + (2 - (_SBytes:AND:1))
+        ENDIF
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND
+            
+        ;// Allocate 4-byte aligned area of name
+        ;// $name size $size bytes.
+        ;//
+        ;// Must be used before M_START of the function that owns the
+        ;// area. The workspace offset is rounded up to a multiple of 4
+        ;// before the symbol is defined.
+        MACRO
+        M_ALLOC4  $name, $size
+        ASSERT :LNOT:_InFunc
+        ;// Round the running offset up to the next multiple of 4
+        IF (_SBytes:AND:3)!=0
+_SBytes     SETA _SBytes + (4 - (_SBytes:AND:3))
+        ENDIF
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND
+            
+        ;// Allocate 8-byte aligned area of name
+        ;// $name size $size bytes.
+        ;//
+        ;// Must be used before M_START of the function that owns the
+        ;// area. The workspace offset is rounded up to a multiple of 8
+        ;// before the symbol is defined.
+        MACRO
+        M_ALLOC8  $name, $size
+        ASSERT :LNOT:_InFunc
+        ;// Round the running offset up to the next multiple of 8
+        IF (_SBytes:AND:7)!=0
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND        
+
+        
+        ;// Allocate an area of name $name that M_ADR16 can align to
+        ;// 16 bytes. The area starts 8-byte aligned and occupies
+        ;// ($size+8) bytes.
+        ;// The extra 8 bytes are the slack used when M_ADR16 rounds the
+        ;// address down to a 16 byte boundary: the symbol is placed 8
+        ;// bytes into the area, so with the 8-byte aligned stack assumed
+        ;// by this section the rounded address stays inside the area.
+        ;// The symbol carries an extra _16 suffix, so it is only
+        ;// reachable through M_ADR16.
+        
+        MACRO
+        M_ALLOC16  $name, $size
+        ASSERT :LNOT:_InFunc
+        ;// Round the running offset up to the next multiple of 8
+        IF (_SBytes:AND:7)!=0
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+$name$_F$_16   EQU (_SBytes + 8)
+_SBytes SETA _SBytes + ($size) + 8
+        MEND        
+        
+        ;// Allocate an area of name $name that M_ADR32 can align to
+        ;// 32 bytes. The area starts 8-byte aligned and occupies
+        ;// ($size+24) bytes.
+        ;// The extra 24 bytes are the slack used when M_ADR32 rounds the
+        ;// address down to a 32 byte boundary: the symbol is placed 24
+        ;// bytes into the area, so with the 8-byte aligned stack assumed
+        ;// by this section the rounded address stays inside the area.
+        ;// The symbol carries an extra _32 suffix, so it is only
+        ;// reachable through M_ADR32.
+        
+        MACRO
+        M_ALLOC32  $name, $size
+        ASSERT :LNOT:_InFunc
+        ;// Round the running offset up to the next multiple of 8
+        IF (_SBytes:AND:7)!=0
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+$name$_F$_32   EQU (_SBytes + 24)
+_SBytes SETA _SBytes + ($size) + 24
+        MEND        
+        
+        
+        
+        
+        ;// Argument Declaration Macro
+        ;//
+        ;// Allocate an argument name $name
+        ;// size $size bytes
+        ;//
+        ;// Must be used after M_START. Defines <name><function number>
+        ;// as the current argument offset _ABytes, which M_START sets to
+        ;// the local workspace size plus the bytes of saved registers,
+        ;// and then advances _ABytes by $size. Arguments therefore have
+        ;// to be declared in the order they lie on the stack.
+        MACRO
+        M_ARG     $name, $size
+        ASSERT _InFunc
+$name$_F    EQU _ABytes
+_ABytes SETA _ABytes + ($size)
+        MEND        
+        
+;///////////////////////////////////////////////
+;// Macros to access stacked variables
+;///////////////////////////////////////////////
+
+        ;// Macro to perform a data processing operation
+        ;// with a constant second operand
+        ;//
+        ;// $op    = instruction mnemonic (may include a condition)
+        ;// $rd    = destination register
+        ;// $rn    = first operand register
+        ;// $const = constant value
+        ;//
+        ;// The constant is split into at most two immediates: its lowest
+        ;// eight significant bits (counted from an even bit position) and
+        ;// whatever remains above them. The second instruction operates
+        ;// on $rd, so $op must be an operation that can be applied in
+        ;// two steps (ADD and SUB are the ones used in this file).
+        MACRO
+        _M_OPC $op,$rd,$rn,$const
+        LCLA    _sh
+        LCLA    _cst
+_sh     SETA    0
+_cst    SETA    $const
+        ;// Zero needs special handling: the loop below would never end
+        IF _cst=0
+        $op $rd, $rn, #_cst
+            MEXIT
+        ENDIF
+        ;// Strip trailing zero bits two at a time, remembering the shift
+        WHILE (_cst:AND:3)=0
+_cst        SETA _cst>>2
+_sh         SETA _sh+2
+        WEND
+        ;// Low eight bits first, then the remainder if there is any
+        $op $rd, $rn, #(_cst:AND:0x000000FF)<<_sh
+        IF _cst>=256
+            $op $rd, $rd, #(_cst:AND:0xFFFFFF00)<<_sh
+        ENDIF
+        MEND
+
+        ;// Macro to perform a data access operation
+        ;// Such as LDR or STR
+        ;// The addressing mode is modified such that
+        ;// 1. If no address is given then the name is taken
+        ;//    as a stack offset
+        ;// 2. If the addressing mode is not available for the
+        ;//    state being assembled for (eg Thumb) then a suitable
+        ;//    addressing mode is substituted.
+        ;//
+        ;// On Entry:
+        ;// $i = Instruction to perform (eg "LDRB")
+        ;// $a = Required byte alignment
+        ;// $r = Register(s) to transfer (eg "r1")
+        ;// $a0,$a1,$a2. Addressing mode and condition. One of:
+        ;//     label {,cc}
+        ;//     [base]                    {,,,cc}
+        ;//     [base, offset]{!}         {,,cc}
+        ;//     [base, offset, shift]{!}  {,cc}
+        ;//     [base], offset            {,,cc}
+        ;//     [base], offset, shift     {,cc}
+        ;// $a3 = condition code for the bracketed forms; the commas in
+        ;//       the table above pad the argument list so that cc always
+        ;//       arrives here. For the label form the condition is $a1.
+        ;//
+        ;// The alignment $a is only checked for the label form.
+        MACRO
+        _M_DATA $i,$a,$r,$a0,$a1,$a2,$a3
+        IF "$a0":LEFT:1="["
+            IF "$a1"=""
+                ;// Plain [base]
+                $i$a3   $r, $a0
+            ELSE
+                ;// The first argument ending in a bracket means the
+                ;// offset is outside the brackets, ie post indexed
+                IF "$a0":RIGHT:1="]"
+                    IF "$a2"=""
+                        _M_POSTIND $i$a3, "$r", $a0, $a1
+                    ELSE
+                        _M_POSTIND $i$a3, "$r", $a0, "$a1,$a2"
+                    ENDIF
+                ELSE
+                    IF "$a2"=""
+                        _M_PREIND  $i$a3, "$r", $a0, $a1
+                    ELSE
+                        _M_PREIND  $i$a3, "$r", $a0, "$a1,$a2"
+                    ENDIF
+                ENDIF
+            ENDIF
+        ELSE
+            ;// Named stack location: offset symbol plus _Workspace
+            LCLA    _Offset
+_Offset     SETA    _Workspace + $a0$_F
+            ASSERT  (_Offset:AND:($a-1))=0
+            $i$a1   $r, [sp, #_Offset]
+        ENDIF
+        MEND
+        
+        ;// Handle post indexed load/stores
+        ;// op  reg, [base], offset
+        ;//
+        ;// $i  = instruction, including any condition
+        ;// $r  = register(s) to transfer
+        ;// $a0 = bracketed base register
+        ;// $a1 = offset, optionally with a leading sign and a shift
+        ;//
+        ;// ARM state emits the instruction unchanged. When assembling
+        ;// for Thumb the access is done without an offset and the base
+        ;// register is updated by a separate ADD or SUB. Note that this
+        ;// ADD or SUB is not given the condition of the access.
+        MACRO
+        _M_POSTIND $i,$r,$a0,$a1
+        LCLS _base
+        LCLS _offset
+        IF {CONFIG}=16 ;// Thumb
+_base       SETS ("$a0":LEFT:(:LEN:"$a0"-1)):RIGHT:(:LEN:"$a0"-2)   ;// remove []
+_offset     SETS "$a1"
+            ;// A leading plus sign is redundant, drop it
+            IF _offset:LEFT:1="+"
+_offset         SETS _offset:RIGHT:(:LEN:_offset-1)
+            ENDIF
+            $i  $r, $a0
+            ;// A leading minus sign turns the base update into a SUB
+            IF _offset:LEFT:1="-"
+_offset         SETS _offset:RIGHT:(:LEN:_offset-1)
+                SUB $_base, $_base, $_offset
+            ELSE                
+                ADD $_base, $_base, $_offset
+            ENDIF
+        ELSE ;// ARM
+            $i  $r, $a0, $a1
+        ENDIF
+        MEND
+        
+        ;// Handle pre indexed load/store
+        ;// op  reg, [base, offset]{!}
+        ;//
+        ;// $i  = instruction, including any condition
+        ;// $r  = register(s) to transfer
+        ;// $a0 = opening bracket and base register
+        ;// $a1 = offset, closing bracket and optional writeback mark
+        ;//
+        ;// Only the writeback form is rewritten, and only when
+        ;// assembling for Thumb: the access is done without writeback
+        ;// and the base register is then updated by a separate ADD.
+        ;// Note that this ADD is not given the condition of the access.
+        ;// Every other case emits the instruction unchanged.
+        MACRO
+        _M_PREIND $i,$r,$a0,$a1
+        LCLS _base
+        LCLS _offset
+        IF ({CONFIG}=16):LAND:(("$a1":RIGHT:2)="]!")
+            ;// Strip the opening bracket from the base and the
+            ;// closing bracket and writeback mark from the offset
+_base       SETS "$a0":RIGHT:(:LEN:("$a0")-1)
+_offset     SETS "$a1":LEFT:(:LEN:("$a1")-2)
+            $i $r, [$_base, $_offset]
+            ADD $_base, $_base, $_offset
+        ELSE
+            $i  $r, $a0, $a1
+        ENDIF
+        MEND
+
+        ;// Load unsigned byte from stack
+        ;// $r = destination register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. No alignment requirement (1 byte).
+        MACRO
+        M_LDRB  $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRB",1,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Load signed byte from stack
+        ;// $r = destination register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. No alignment requirement (1 byte).
+        MACRO
+        M_LDRSB $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRSB",1,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Store byte to stack
+        ;// $r = source register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. No alignment requirement (1 byte).
+        MACRO
+        M_STRB  $r,$a0,$a1,$a2,$a3
+        _M_DATA "STRB",1,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Load unsigned half word from stack
+        ;// $r = destination register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. A named location must be at a 2-byte
+        ;// aligned offset.
+        MACRO
+        M_LDRH  $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRH",2,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Load signed half word from stack
+        ;// $r = destination register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. A named location must be at a 2-byte
+        ;// aligned offset.
+        MACRO
+        M_LDRSH $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRSH",2,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Store half word to stack
+        ;// $r = source register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. A named location must be at a 2-byte
+        ;// aligned offset.
+        MACRO
+        M_STRH  $r,$a0,$a1,$a2,$a3
+        _M_DATA "STRH",2,$r,$a0,$a1,$a2,$a3
+        MEND
+
+        ;// Load word from stack
+        ;// $r = destination register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. A named location must be at a 4-byte
+        ;// aligned offset.
+        MACRO
+        M_LDR   $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDR",4,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Store word to stack
+        ;// $r = source register; the remaining arguments are either
+        ;// the name of a stack location or a bracketed addressing mode,
+        ;// as accepted by _M_DATA. A named location must be at a 4-byte
+        ;// aligned offset.
+        MACRO
+        M_STR   $r,$a0,$a1,$a2,$a3
+        _M_DATA "STR",4,$r,$a0,$a1,$a2,$a3
+        MEND
+
+        ;// Load double word from stack
+        ;// $r0,$r1 = destination register pair; the remaining arguments
+        ;// are either the name of a stack location or a bracketed
+        ;// addressing mode, as accepted by _M_DATA. A named location
+        ;// must be at an 8-byte aligned offset.
+        MACRO
+        M_LDRD  $r0,$r1,$a0,$a1,$a2,$a3
+        _M_DATA "LDRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+        MEND
+                
+        ;// Store double word to stack
+        ;// $r0,$r1 = source register pair; the remaining arguments
+        ;// are either the name of a stack location or a bracketed
+        ;// addressing mode, as accepted by _M_DATA. A named location
+        ;// must be at an 8-byte aligned offset.
+        MACRO
+        M_STRD  $r0,$r1,$a0,$a1,$a2,$a3
+        _M_DATA "STRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Get absolute address of stack allocated location
+        ;// $a  = register that receives the address
+        ;// $b  = name of the stack location
+        ;// $cc = optional condition code
+        ;// The address is sp plus _Workspace plus the offset symbol of
+        ;// the named location.
+        MACRO
+        M_ADR   $a, $b, $cc
+        _M_OPC  ADD$cc, $a, sp, (_Workspace + $b$_F)
+        MEND
+        
+        ;// Get absolute address of stack allocated location and align the address to 16 bytes
+        ;// $a  = register that receives the address
+        ;// $b  = name of a location declared with M_ALLOC16
+        ;// $cc = optional condition code
+        ;// The address is rounded DOWN; M_ALLOC16 places the symbol far
+        ;// enough into its area to allow for that.
+        MACRO
+        M_ADR16 $a, $b, $cc
+            _M_OPC  ADD$cc, $a, sp, (_Workspace + $b$_F$_16)
+        
+            ;// Now align $a to 16 bytes
+            BIC$cc  $a,$a,#0x0F
+        MEND
+        
+        ;// Get absolute address of stack allocated location and align the address to 32 bytes
+        ;// $a  = register that receives the address
+        ;// $b  = name of a location declared with M_ALLOC32
+        ;// $cc = optional condition code
+        ;// The address is rounded DOWN; M_ALLOC32 places the symbol far
+        ;// enough into its area to allow for that.
+        MACRO
+        M_ADR32 $a, $b, $cc
+            _M_OPC  ADD$cc, $a, sp, (_Workspace + $b$_F$_32)
+        
+            ;// Now align $a to 32 bytes
+            BIC$cc  $a,$a,#0x1F
+        MEND
+
+;//////////////////////////////////////////////////////////
+;// Function header and footer macros
+;//////////////////////////////////////////////////////////      
+        
+        ;// Function Header Macro    
+        ;// Generates the function prologue
+        ;// Note that functions should all be "stack-moves-once"
+        ;// The FNSTART and FNEND macros should be the only places
+        ;// where the stack moves.
+        ;//    
+        ;// $name  = function name
+        ;// $rreg  = ""   don't stack any registers
+        ;//          "lr" stack "r4,lr" (treated the same as "r4")
+        ;//          "rN" stack registers "r4-rN,lr"; an odd N is rounded
+        ;//               up to the next even register, so an even number
+        ;//               of words is always pushed (see _M_GETRREGLIST)
+        ;// $dreg  = ""   don't stack any D registers
+        ;//          "dN" stack registers "d8-dN"
+        ;//
+        ;// Note: ARM Architecture procedure call standard AAPCS
+        ;// states that r4-r11, sp, d8-d15 must be preserved by
+        ;// a compliant function.
+        ;//
+        ;// Any M_ALLOCn declarations for the function must come before
+        ;// M_START and any M_ARG declarations after it.
+        MACRO
+        M_START $name, $rreg, $dreg
+        ASSERT :LNOT:_InFunc
+        ASSERT "$name"!=""
+_InFunc SETL {TRUE}
+_RBytes SETA 0
+_Workspace SETA 0
+
+        ;// Create an area for the function        
+        AREA    |.text|, CODE
+        EXPORT  $name
+$name   FUNCTION
+        
+        ;// Save R registers
+        _M_GETRREGLIST $rreg
+        IF _RRegList<>""
+            STMFD   sp!, {$_RRegList, lr}
+        ENDIF
+                
+        ;// Save D registers
+        _M_GETDREGLIST  $dreg        
+        IF _DRegList<>""
+            VSTMFD  sp!, {$_DRegList}
+        ENDIF            
+            
+                    
+        ;// Ensure size claimed on stack is 8-byte aligned
+        IF ((_SBytes:AND:7)!=0)
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+        
+        ;// Claim the local workspace declared with the M_ALLOCn macros
+        IF (_SBytes!=0)
+            _M_OPC SUB, sp, sp, _SBytes
+        ENDIF
+        
+        
+        ;// Offset that M_ARG hands out first: the local workspace plus
+        ;// the bytes of registers saved above
+_ABytes SETA _SBytes + _RBytes - _Workspace
+
+                        
+        ;// Print function name if debug enabled
+        M_PRINTF "$name\n",
+        MEND
+        
+        ;// Work out a list of R saved registers
+        ;//
+        ;// $rreg = highest register to save, "lr", or empty for none
+        ;//
+        ;// Sets _RRegList to the register range to push (lr is added by
+        ;// the caller of this macro) and adds the number of bytes pushed,
+        ;// lr included, to _RBytes. Limits are handled in pairs so that
+        ;// the range plus lr is always an even number of words.
+        ;// Any other value of $rreg is reported through INFO.
+        MACRO
+        _M_GETRREGLIST $rreg
+        IF "$rreg"=""
+_RRegList   SETS ""
+            MEXIT
+        ENDIF        
+        ;// r4 + lr = 2 words
+        IF "$rreg"="lr":LOR:"$rreg"="r4"
+_RRegList   SETS "r4"
+_RBytes     SETA _RBytes+8
+            MEXIT
+        ENDIF
+        ;// r4-r6 + lr = 4 words
+        IF "$rreg"="r5":LOR:"$rreg"="r6"
+_RRegList   SETS "r4-r6"
+_RBytes     SETA _RBytes+16
+            MEXIT
+        ENDIF
+        ;// r4-r8 + lr = 6 words
+        IF "$rreg"="r7":LOR:"$rreg"="r8"
+_RRegList   SETS "r4-r8"
+_RBytes     SETA _RBytes+24
+            MEXIT
+        ENDIF
+        ;// r4-r10 + lr = 8 words
+        IF "$rreg"="r9":LOR:"$rreg"="r10"
+_RRegList   SETS "r4-r10"
+_RBytes     SETA _RBytes+32
+            MEXIT
+        ENDIF
+        ;// r4-r12 + lr = 10 words
+        IF "$rreg"="r11":LOR:"$rreg"="r12"
+_RRegList   SETS "r4-r12"
+_RBytes     SETA _RBytes+40
+            MEXIT
+        ENDIF
+        INFO 1, "Unrecognized saved r register limit '$rreg'"
+        MEND        
+        
+        ;// Work out a list of D saved registers
+        ;//
+        ;// $dreg = highest D register to save ("d8" to "d15"), or empty
+        ;//         for none
+        ;//
+        ;// Sets _DRegList to the register range starting at d8 and adds
+        ;// 8 bytes per saved register to _RBytes.
+        ;// Any other value of $dreg is reported through INFO.
+        MACRO
+        _M_GETDREGLIST $dreg
+        IF "$dreg"=""
+_DRegList   SETS ""
+            MEXIT
+        ENDIF        
+        IF "$dreg"="d8"
+_DRegList   SETS "d8"
+_RBytes     SETA _RBytes+8
+            MEXIT
+        ENDIF
+        IF "$dreg"="d9"
+_DRegList   SETS "d8-d9"
+_RBytes     SETA _RBytes+16
+            MEXIT
+        ENDIF
+        IF "$dreg"="d10"
+_DRegList   SETS "d8-d10"
+_RBytes     SETA _RBytes+24
+            MEXIT
+        ENDIF
+        IF "$dreg"="d11"
+_DRegList   SETS "d8-d11"
+_RBytes     SETA _RBytes+32
+            MEXIT
+        ENDIF
+        IF "$dreg"="d12"
+_DRegList   SETS "d8-d12"
+_RBytes     SETA _RBytes+40
+            MEXIT
+        ENDIF
+        IF "$dreg"="d13"
+_DRegList   SETS "d8-d13"
+_RBytes     SETA _RBytes+48
+            MEXIT
+        ENDIF
+        IF "$dreg"="d14"
+_DRegList   SETS "d8-d14"
+_RBytes     SETA _RBytes+56
+            MEXIT
+        ENDIF
+        IF "$dreg"="d15"
+_DRegList   SETS "d8-d15"
+_RBytes     SETA _RBytes+64
+            MEXIT
+        ENDIF
+        INFO 1, "Unrecognized saved d register limit '$dreg'"
+        MEND
+        
+        ;// Produce function return instructions
+        ;//
+        ;// $cc = optional condition code applied to every instruction
+        ;//
+        ;// Restores the registers in the reverse order of the prologue:
+        ;// D registers first, then the R registers. When R registers were
+        ;// saved the return is done by loading pc from the stacked lr,
+        ;// otherwise by BX lr. The local workspace must already have
+        ;// been released by the caller of this macro.
+        MACRO
+        _M_RET $cc
+        IF _DRegList<>""
+            VPOP$cc {$_DRegList}
+        ENDIF
+        IF _RRegList=""
+            BX$cc lr
+        ELSE
+            LDM$cc.FD sp!, {$_RRegList, pc}
+        ENDIF
+        MEND        
+        
+        ;// Early Function Exit Macro
+        ;// $cc = condition to exit with
+        ;// (Example: M_EXIT EQ)
+        ;//
+        ;// If the function has local stack workspace the exit branches
+        ;// to the end label emitted by M_END, where the workspace is
+        ;// released. Otherwise the return sequence is emitted in place.
+        MACRO
+        M_EXIT  $cc
+        ASSERT  _InFunc
+        IF  _SBytes!=0
+            ;// Restore stack frame and exit
+            B$cc  _End$_F
+        ELSE
+            ;// Can return directly
+            _M_RET $cc
+        ENDIF        
+        MEND        
+
+        ;// Function Footer Macro        
+        ;// Generates the function epilogue
+        ;//
+        ;// Emits the end label that M_EXIT branches to, releases the
+        ;// local workspace, restores the saved registers and returns.
+        MACRO
+        M_END
+        ASSERT _InFunc
+_InFunc SETL {FALSE}
+_End$_F
+
+        ;// Restore the stack pointer to its original value on function entry
+        IF _SBytes!=0
+            _M_OPC ADD, sp, sp, _SBytes
+        ENDIF
+        _M_RET
+        ENDFUNC
+
+        ;// Reset the global stack tracking variables back to their 
+        ;// initial values, and increment the function count
+        ;// (the function count is the suffix that keeps the stack and
+        ;// argument symbols of different functions apart)
+_SBytes        SETA 0
+_F             SETA _F+1
+        MEND
+
+                
+;//==========================================================================
+;// Debug Macros
+;//==========================================================================
+
+        ;// DEBUG_ON: when TRUE, M_PRINTF expands to a call to printf;
+        ;// when FALSE it expands to nothing.
+        GBLL    DEBUG_ON
+DEBUG_ON SETL   {FALSE}
+        ;// DEBUG_STALLS_ON: when TRUE, M_STALL inserts its stall
+        ;// instructions; when FALSE it expands to nothing.
+        GBLL    DEBUG_STALLS_ON
+DEBUG_STALLS_ON SETL {FALSE}
+        
+        ;//==========================================================================
+        ;// Debug call to printf
+        ;//  M_PRINTF $format, $val0, $val1, $val2
+        ;//
+        ;// $format      = format string, without the surrounding quotes
+        ;// $val0..$val2 = up to three registers to print; they must be
+        ;//                supplied in order (a later value requires all
+        ;//                earlier ones)
+        ;//
+        ;// Examples:
+        ;//  M_PRINTF "x=%08x\n", r0
+        ;//
+        ;// This macro preserves the value of all registers including the
+        ;// flags.
+        ;//
+        ;// Expands to nothing unless DEBUG_ON is TRUE.
+        ;//==========================================================================
+
+        MACRO
+        M_PRINTF  $format, $val0, $val1, $val2
+        IF DEBUG_ON
+        
+        IMPORT  printf
+        LCLA    nArgs
+nArgs	SETA    0
+        
+        ;// save registers so we don't corrupt them
+        STMFD   sp!, {r0-r12, lr}
+        
+        ;// Drop stack to give us some workspace
+        SUB     sp, sp, #16
+        
+        ;// Save registers we need to print to the stack
+        ;// (the values are staged in memory first so that loading
+        ;// r1-r3 below cannot overwrite one that is still needed)
+        IF "$val2" <> ""
+            ASSERT "$val1" <> ""
+            STR    $val2, [sp, #8]
+nArgs       SETA   nArgs+1
+        ENDIF
+        IF "$val1" <> ""
+            ASSERT "$val0" <> ""
+            STR    $val1, [sp, #4]
+nArgs	    SETA   nArgs+1
+        ENDIF
+        IF "$val0"<>""
+            STR    $val0, [sp]
+nArgs	    SETA   nArgs+1
+        ENDIF
+        
+        ;// Now we are safe to corrupt registers
+        ;// r0 = format string, r1-r3 = the values to print
+        ADR     r0, %FT00
+        IF nArgs=1
+          LDR   r1, [sp]
+        ENDIF
+        IF nArgs=2
+          LDMIA sp, {r1,r2}
+        ENDIF
+        IF nArgs=3
+          LDMIA sp, {r1,r2,r3}
+        ENDIF
+        
+        ;// print the values
+        ;// (r4 is a register the callee must preserve under the AAPCS,
+        ;// so the saved flags survive the call)
+        MRS     r4, cpsr        ;// preserve flags
+        BL      printf
+        MSR     cpsr_f, r4      ;// restore flags
+        ;// Jump over the string, which is stored inline in the code
+        B       %FT01
+00      ;// string to print
+        DCB     "$format", 0
+        ALIGN
+01      ;// Finished
+        ADD     sp, sp, #16
+        ;// Restore registers
+        LDMFD	sp!, {r0-r12,lr}
+
+        ENDIF   ;// DEBUG_ON
+        MEND
+
+
+        ;// Stall Simulation Macro
+        ;// Inserts a given number of NOPs for the currently
+        ;//  defined platform
+        ;//
+        ;// Each of the up to six arguments describes the stall for one
+        ;// platform and is handled independently by _M_STALL_SUB.
+        ;// Expands to nothing unless DEBUG_STALLS_ON is TRUE.
+        MACRO
+        M_STALL $plat1stall, $plat2stall, $plat3stall, $plat4stall, $plat5stall, $plat6stall
+        IF DEBUG_STALLS_ON
+            _M_STALL_SUB $plat1stall    
+            _M_STALL_SUB $plat2stall    
+            _M_STALL_SUB $plat3stall    
+            _M_STALL_SUB $plat4stall    
+            _M_STALL_SUB $plat5stall    
+            _M_STALL_SUB $plat6stall    
+        ENDIF
+        MEND
+        
+        ;// Insert the stall instructions for one platform
+        ;//
+        ;// $platstall = platform stall descriptor, or empty to do nothing.
+        ;//   Everything except the last two characters is taken as the
+        ;//   name of a logical variable selecting the platform, and the
+        ;//   last character as the number of stall instructions, so the
+        ;//   count is a single digit. The character in between is not
+        ;//   examined (presumably a separator - confirm against callers).
+        ;//
+        ;// Stalls are emitted only if the platform variable is defined
+        ;// and TRUE. Each stall is a MOV sp, sp.
+        MACRO
+        _M_STALL_SUB $platstall
+        IF "$platstall"!=""
+            LCLA _pllen
+            LCLS _pl
+            LCLL _pllog
+_pllen      SETA :LEN:"$platstall"
+_pl         SETS "$platstall":LEFT:(_pllen - 2)
+            IF :DEF:$_pl
+                IF $_pl
+                    LCLS _st
+                    LCLA _stnum
+_st                 SETS "$platstall":RIGHT:1        
+_stnum              SETA $_st
+                    WHILE _stnum>0
+			MOV sp, sp
+_stnum                  SETA _stnum - 1
+                    WEND
+                ENDIF
+            ENDIF
+        ENDIF
+        MEND
+        
+        
+        
+;//==========================================================================
+;// Endian Invariance Macros
+;// 
+;// The idea behind these macros is that if an array is
+;// loaded as words then the SMUL00 macro will multiply
+;// array elements 0 regardless of the endianness of the
+;// system. For little endian SMUL00=SMULBB, for big
+;// endian SMUL00=SMULTT and similarly for other packed operations.
+;//
+;//==========================================================================
+
+        ;// Emit one of two four-operand instructions depending on the
+        ;// endianness being assembled for
+        ;//
+        ;// $comli = mnemonic used for little endian
+        ;// $combi = mnemonic used for big endian
+        ;// $a-$d  = operands, passed through in order
+        ;// $cc    = optional condition code appended to the mnemonic
+        MACRO
+        LIBI4   $comli, $combi, $a, $b, $c, $d, $cc
+        IF {ENDIAN}="big"
+        $combi.$cc $a, $b, $c, $d
+        ELSE
+        $comli.$cc $a, $b, $c, $d
+        ENDIF
+        MEND
+        
+        ;// Emit one of two three-operand instructions depending on the
+        ;// endianness being assembled for
+        ;//
+        ;// $comli = mnemonic used for little endian
+        ;// $combi = mnemonic used for big endian
+        ;// $a-$c  = operands, passed through in order
+        ;// $cc    = optional condition code appended to the mnemonic
+        MACRO
+        LIBI3   $comli, $combi, $a, $b, $c, $cc
+        IF {ENDIAN}="big"
+        $combi.$cc $a, $b, $c
+        ELSE
+        $comli.$cc $a, $b, $c
+        ENDIF
+        MEND
+        
+        ;// SMLAxy macros
+        ;//
+        ;// Naming: the two characters after SMLA select the halfword of
+        ;// the first and of the second multiplicand respectively.
+        ;//   0 / 1 = array element 0 / 1 of a word loaded from memory;
+        ;//           element 0 is the bottom half on little endian and
+        ;//           the top half on big endian, element 1 the opposite
+        ;//   B / T = bottom / top half regardless of endianness
+        ;// Operands $a-$d and the optional condition $cc are passed
+        ;// through unchanged to the selected SMLAxy instruction.
+        
+        MACRO
+        SMLA00  $a, $b, $c, $d, $cc
+        LIBI4 SMLABB, SMLATT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA01  $a, $b, $c, $d, $cc
+        LIBI4 SMLABT, SMLATB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA0B  $a, $b, $c, $d, $cc
+        LIBI4 SMLABB, SMLATB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA0T  $a, $b, $c, $d, $cc
+        LIBI4 SMLABT, SMLATT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA10  $a, $b, $c, $d, $cc
+        LIBI4 SMLATB, SMLABT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA11  $a, $b, $c, $d, $cc
+        LIBI4 SMLATT, SMLABB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA1B  $a, $b, $c, $d, $cc
+        LIBI4 SMLATB, SMLABB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA1T  $a, $b, $c, $d, $cc
+        LIBI4 SMLATT, SMLABT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAB0  $a, $b, $c, $d, $cc
+        LIBI4 SMLABB, SMLABT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAB1  $a, $b, $c, $d, $cc
+        LIBI4 SMLABT, SMLABB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAT0  $a, $b, $c, $d, $cc
+        LIBI4 SMLATB, SMLATT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAT1  $a, $b, $c, $d, $cc
+        LIBI4 SMLATT, SMLATB, $a, $b, $c, $d, $cc
+        MEND
+        
+        ;// SMULxy macros
+        ;//
+        ;// Naming: the two characters after SMUL select the halfword of
+        ;// the first and of the second multiplicand respectively.
+        ;//   0 / 1 = array element 0 / 1 of a word loaded from memory;
+        ;//           element 0 is the bottom half on little endian and
+        ;//           the top half on big endian, element 1 the opposite
+        ;//   B / T = bottom / top half regardless of endianness
+        ;// Operands $a-$c and the optional condition $cc are passed
+        ;// through unchanged to the selected SMULxy instruction.
+        
+        MACRO
+        SMUL00  $a, $b, $c, $cc
+        LIBI3 SMULBB, SMULTT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL01  $a, $b, $c, $cc
+        LIBI3 SMULBT, SMULTB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL0B  $a, $b, $c, $cc
+        LIBI3 SMULBB, SMULTB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL0T  $a, $b, $c, $cc
+        LIBI3 SMULBT, SMULTT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL10  $a, $b, $c, $cc
+        LIBI3 SMULTB, SMULBT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL11  $a, $b, $c, $cc
+        LIBI3 SMULTT, SMULBB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL1B  $a, $b, $c, $cc
+        LIBI3 SMULTB, SMULBB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL1T  $a, $b, $c, $cc
+        LIBI3 SMULTT, SMULBT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULB0  $a, $b, $c, $cc
+        LIBI3 SMULBB, SMULBT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULB1  $a, $b, $c, $cc
+        LIBI3 SMULBT, SMULBB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULT0  $a, $b, $c, $cc
+        LIBI3 SMULTB, SMULTT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULT1  $a, $b, $c, $cc
+        LIBI3 SMULTT, SMULTB, $a, $b, $c, $cc
+        MEND
+        
+        ;// SMLAWx, SMULWx macros
+        ;//
+        ;// Naming: the final digit selects which array element of the
+        ;// halfword operand is used: 0 is the bottom half on little
+        ;// endian and the top half on big endian, 1 the opposite.
+        ;// Operands and the optional condition $cc are passed through
+        ;// unchanged to the selected instruction.
+        
+        MACRO
+        SMLAW0  $a, $b, $c, $d, $cc
+        LIBI4 SMLAWB, SMLAWT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAW1  $a, $b, $c, $d, $cc
+        LIBI4 SMLAWT, SMLAWB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMULW0  $a, $b, $c, $cc
+        LIBI3 SMULWB, SMULWT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULW1  $a, $b, $c, $cc
+        LIBI3 SMULWT, SMULWB, $a, $b, $c, $cc
+        MEND
+
+        ;// SMLALxy macros
+        ;//
+        ;// Naming: the two characters after SMLAL select the halfword of
+        ;// the first and of the second multiplicand respectively.
+        ;//   0 / 1 = array element 0 / 1 of a word loaded from memory;
+        ;//           element 0 is the bottom half on little endian and
+        ;//           the top half on big endian, element 1 the opposite
+        ;//   B / T = bottom / top half regardless of endianness
+        ;// Operands $a-$d and the optional condition $cc are passed
+        ;// through unchanged to the selected SMLALxy instruction.
+
+
+        MACRO
+        SMLAL00  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBB, SMLALTT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL01  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBT, SMLALTB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL0B  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBB, SMLALTB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL0T  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBT, SMLALTT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL10  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTB, SMLALBT, $a, $b, $c, $d, $cc
+        MEND
+
+        MACRO
+        SMLAL11  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTT, SMLALBB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL1B  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTB, SMLALBB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL1T  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTT, SMLALBT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALB0  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBB, SMLALBT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALB1  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBT, SMLALBB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALT0  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTB, SMLALTT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALT1  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTT, SMLALTB, $a, $b, $c, $d, $cc
+        MEND
+        
+  ENDIF ;// ARMCOMM_S_H
+            
+  END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
new file mode 100755
index 0000000..7a68d14
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
@@ -0,0 +1,274 @@
+/* 
+ * 
+ * File Name:  armOMX.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * This file allows a version of the OMX DL libraries to be built where some or
+ * all of the function names can be given a user specified suffix. 
+ *
+ * You might want to use it where:
+ *
+ * - you want to rename a function "out of the way" so that you could replace
+ *   a function with a different version (the original version would still be
+ *   in the library just with a different name - so you could debug the new
+ *   version by comparing it to the output of the old)
+ *
+ * - you want to rename all the functions to versions with a suffix so that 
+ *   you can include two versions of the library and choose between functions
+ *   at runtime.
+ *
+ *     e.g. omxIPBM_Copy_U8_C1R could be renamed omxIPBM_Copy_U8_C1R_CortexA8
+ * 
+ */
+
+  
+#ifndef _armOMX_H_
+#define _armOMX_H_
+
+
+/* Token pasting needs two levels: OMXCATBAR macro-expands its arguments, then OMXCAT2BAR glues them onto "omx" */
+#define OMXCAT2BAR(Name, Suffix) omx ## Name ## Suffix
+#define OMXCATBAR(Name, Suffix) OMXCAT2BAR(Name, Suffix)
+
+/* Suffix appended to the renamed functions; it is defined empty here, so by default the names stay unchanged */
+#define BARE_SUFFIX 
+
+
+
+/* Per sub-domain suffixes: every omx<DOMAIN>_* rename in this file uses its own macro, all defaulting to BARE_SUFFIX */
+#define OMXACAAC_SUFFIX    BARE_SUFFIX   
+#define OMXACMP3_SUFFIX    BARE_SUFFIX
+#define OMXICJP_SUFFIX     BARE_SUFFIX
+#define OMXIPBM_SUFFIX     BARE_SUFFIX
+#define OMXIPCS_SUFFIX     BARE_SUFFIX
+#define OMXIPPP_SUFFIX     BARE_SUFFIX
+#define OMXSP_SUFFIX       BARE_SUFFIX
+#define OMXVCCOMM_SUFFIX   BARE_SUFFIX
+#define OMXVCM4P10_SUFFIX  BARE_SUFFIX
+#define OMXVCM4P2_SUFFIX   BARE_SUFFIX
+
+
+
+
+/* Define what each bare, un-suffixed OpenMAX API function name is renamed to (omxACAAC_* group first, using OMXACAAC_SUFFIX) */
+#define omxACAAC_DecodeChanPairElt                        OMXCATBAR(ACAAC_DecodeChanPairElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeDatStrElt                          OMXCATBAR(ACAAC_DecodeDatStrElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeFillElt                            OMXCATBAR(ACAAC_DecodeFillElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeIsStereo_S32                       OMXCATBAR(ACAAC_DecodeIsStereo_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsPNS_S32_I                        OMXCATBAR(ACAAC_DecodeMsPNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsStereo_S32_I                     OMXCATBAR(ACAAC_DecodeMsStereo_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodePrgCfgElt                          OMXCATBAR(ACAAC_DecodePrgCfgElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeTNS_S32_I                          OMXCATBAR(ACAAC_DecodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DeinterleaveSpectrum_S32                 OMXCATBAR(ACAAC_DeinterleaveSpectrum_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_EncodeTNS_S32_I                          OMXCATBAR(ACAAC_EncodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermPredict_S32                      OMXCATBAR(ACAAC_LongTermPredict_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermReconstruct_S32_I                OMXCATBAR(ACAAC_LongTermReconstruct_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTFwd_S32                              OMXCATBAR(ACAAC_MDCTFwd_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTInv_S32_S16                          OMXCATBAR(ACAAC_MDCTInv_S32_S16, OMXACAAC_SUFFIX)
+#define omxACAAC_NoiselessDecode                          OMXCATBAR(ACAAC_NoiselessDecode, OMXACAAC_SUFFIX)
+#define omxACAAC_QuantInv_S32_I                           OMXCATBAR(ACAAC_QuantInv_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADIFHeader                         OMXCATBAR(ACAAC_UnpackADIFHeader, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADTSFrameHeader                    OMXCATBAR(ACAAC_UnpackADTSFrameHeader, OMXACAAC_SUFFIX)
+
+/* omxACMP3_* functions: each name is rebuilt with OMXACMP3_SUFFIX appended */
+#define omxACMP3_HuffmanDecode_S32                        OMXCATBAR(ACMP3_HuffmanDecode_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfb_S32                     OMXCATBAR(ACMP3_HuffmanDecodeSfb_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfbMbp_S32                  OMXCATBAR(ACMP3_HuffmanDecodeSfbMbp_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_MDCTInv_S32                              OMXCATBAR(ACMP3_MDCTInv_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantize_S32_I                         OMXCATBAR(ACMP3_ReQuantize_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantizeSfb_S32_I                      OMXCATBAR(ACMP3_ReQuantizeSfb_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_SynthPQMF_S32_S16                        OMXCATBAR(ACMP3_SynthPQMF_S32_S16, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackFrameHeader                        OMXCATBAR(ACMP3_UnpackFrameHeader, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackScaleFactors_S8                    OMXCATBAR(ACMP3_UnpackScaleFactors_S8, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackSideInfo                           OMXCATBAR(ACMP3_UnpackSideInfo, OMXACMP3_SUFFIX)
+/* omxICJP_* functions: each name is rebuilt with OMXICJP_SUFFIX appended */
+#define omxICJP_CopyExpand_U8_C3                          OMXCATBAR(ICJP_CopyExpand_U8_C3, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16                                OMXCATBAR(ICJP_DCTFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16_I                              OMXCATBAR(ICJP_DCTFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16                                OMXCATBAR(ICJP_DCTInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16_I                              OMXCATBAR(ICJP_DCTInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantFwd_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16                           OMXCATBAR(ICJP_DCTQuantFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16_I                         OMXCATBAR(ICJP_DCTQuantFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwdTableInit                      OMXCATBAR(ICJP_DCTQuantFwdTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantInv_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16                           OMXCATBAR(ICJP_DCTQuantInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16_I                         OMXCATBAR(ICJP_DCTQuantInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInvTableInit                      OMXCATBAR(ICJP_DCTQuantInvTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffman8x8_Direct_S16_C1            OMXCATBAR(ICJP_DecodeHuffman8x8_Direct_S16_C1, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_DecodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_DecodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffman8x8_Direct_S16_U1_C1         OMXCATBAR(ICJP_EncodeHuffman8x8_Direct_S16_U1_C1, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_EncodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_EncodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+/* omxIPBM_* functions: each name is rebuilt with OMXIPBM_SUFFIX appended */
+#define omxIPBM_AddC_U8_C1R_Sfs                           OMXCATBAR(IPBM_AddC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C1R                               OMXCATBAR(IPBM_Copy_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C3R                               OMXCATBAR(IPBM_Copy_U8_C3R, OMXIPBM_SUFFIX)
+#define omxIPBM_Mirror_U8_C1R                             OMXCATBAR(IPBM_Mirror_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_MulC_U8_C1R_Sfs                           OMXCATBAR(IPBM_MulC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+/* omxIPCS_* functions: each name is rebuilt with OMXIPCS_SUFFIX appended */
+#define omxIPCS_ColorTwistQ14_U8_C3R                      OMXCATBAR(IPCS_ColorTwistQ14_U8_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr420RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr420RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr420ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr422RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R          OMXCATBAR(IPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr422RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR565_U8_U16_C2C3R            OMXCATBAR(IPCS_YCbYCr422ToBGR565_U8_U16_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR888_U8_C2C3R                OMXCATBAR(IPCS_YCbYCr422ToBGR888_U8_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX) /* NOTE(review): exact duplicate of the previous line (a legal identical redefinition); possibly a different function name was intended - confirm against the API list */
+#define omxIPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R        OMXCATBAR(IPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToYCbCr420Rotate_U8_P3R           OMXCATBAR(IPCS_YCbCr422ToYCbCr420Rotate_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_C3R               OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR888_U8_C3R                   OMXCATBAR(IPCS_YCbCr444ToBGR888_U8_C3R, OMXIPCS_SUFFIX)
+/* omxIPPP_* functions: each name is rebuilt with OMXIPPP_SUFFIX appended */
+#define omxIPPP_Deblock_HorEdge_U8_I                      OMXCATBAR(IPPP_Deblock_HorEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_Deblock_VerEdge_U8_I                      OMXCATBAR(IPPP_Deblock_VerEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterFIR_U8_C1R                          OMXCATBAR(IPPP_FilterFIR_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterMedian_U8_C1R                       OMXCATBAR(IPPP_FilterMedian_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_GetCentralMoment_S64                      OMXCATBAR(IPPP_GetCentralMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_GetSpatialMoment_S64                      OMXCATBAR(IPPP_GetSpatialMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentGetStateSize                        OMXCATBAR(IPPP_MomentGetStateSize, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentInit                                OMXCATBAR(IPPP_MomentInit, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C1R                            OMXCATBAR(IPPP_Moments_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C3R                            OMXCATBAR(IPPP_Moments_U8_C3R, OMXIPPP_SUFFIX)
+/* omxSP_* functions: each name is rebuilt with OMXSP_SUFFIX appended */
+#define omxSP_BlockExp_S16                                OMXCATBAR(SP_BlockExp_S16, OMXSP_SUFFIX)
+#define omxSP_BlockExp_S32                                OMXCATBAR(SP_BlockExp_S32, OMXSP_SUFFIX)
+#define omxSP_Copy_S16                                    OMXCATBAR(SP_Copy_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16                                 OMXCATBAR(SP_DotProd_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16_Sfs                             OMXCATBAR(SP_DotProd_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S16S32_Sfs                    OMXCATBAR(SP_FFTFwd_RToCCS_S16S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S32_Sfs                       OMXCATBAR(SP_FFTFwd_RToCCS_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC16                        OMXCATBAR(SP_FFTGetBufSize_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC32                        OMXCATBAR(SP_FFTGetBufSize_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S16S32                      OMXCATBAR(SP_FFTGetBufSize_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S32                         OMXCATBAR(SP_FFTGetBufSize_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC16                              OMXCATBAR(SP_FFTInit_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC32                              OMXCATBAR(SP_FFTInit_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S16S32                            OMXCATBAR(SP_FFTInit_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S32                               OMXCATBAR(SP_FFTInit_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32_Sfs                       OMXCATBAR(SP_FFTInv_CCSToR_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32S16_Sfs                    OMXCATBAR(SP_FFTInv_CCSToR_S32S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32                            OMXCATBAR(SP_FilterMedian_S32, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32_I                          OMXCATBAR(SP_FilterMedian_S32_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16                              OMXCATBAR(SP_FIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_I                            OMXCATBAR(SP_FIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_ISfs                         OMXCATBAR(SP_FIR_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_Sfs                          OMXCATBAR(SP_FIR_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16                           OMXCATBAR(SP_FIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_I                         OMXCATBAR(SP_FIROne_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_ISfs                      OMXCATBAR(SP_FIROne_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_Sfs                       OMXCATBAR(SP_FIROne_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16                        OMXCATBAR(SP_IIR_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16_I                      OMXCATBAR(SP_IIR_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16                              OMXCATBAR(SP_IIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16_I                            OMXCATBAR(SP_IIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16                     OMXCATBAR(SP_IIROne_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16_I                   OMXCATBAR(SP_IIROne_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16                           OMXCATBAR(SP_IIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16_I                         OMXCATBAR(SP_IIROne_Direct_S16_I, OMXSP_SUFFIX)
+/* omxVCCOMM_* functions: each name is rebuilt with OMXVCCOMM_SUFFIX appended */
+#define omxVCCOMM_Average_16x                             OMXCATBAR(VCCOMM_Average_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Average_8x                              OMXCATBAR(VCCOMM_Average_8x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock                OMXCATBAR(VCCOMM_ComputeTextureErrorBlock, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock_SAD            OMXCATBAR(VCCOMM_ComputeTextureErrorBlock_SAD, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy16x16                               OMXCATBAR(VCCOMM_Copy16x16, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy8x8                                 OMXCATBAR(VCCOMM_Copy8x8, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ExpandFrame_I                           OMXCATBAR(VCCOMM_ExpandFrame_I, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_LimitMVToRect                           OMXCATBAR(VCCOMM_LimitMVToRect, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_16x                                 OMXCATBAR(VCCOMM_SAD_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_8x                                  OMXCATBAR(VCCOMM_SAD_8x, OMXVCCOMM_SUFFIX)
+/* omxVCM4P10_* functions: each name is rebuilt with OMXVCM4P10_SUFFIX appended */
+#define omxVCM4P10_Average_4x                             OMXCATBAR(VCM4P10_Average_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Half                        OMXCATBAR(VCM4P10_BlockMatch_Half, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Integer                     OMXCATBAR(VCM4P10_BlockMatch_Integer, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Quarter                     OMXCATBAR(VCM4P10_BlockMatch_Quarter, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockChroma_I                        OMXCATBAR(VCM4P10_DeblockChroma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockLuma_I                          OMXCATBAR(VCM4P10_DeblockLuma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC        OMXCATBAR(VCM4P10_DecodeChromaDcCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeCoeffsToPairCAVLC                OMXCATBAR(VCM4P10_DecodeCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DequantTransformResidualFromPairAndAdd OMXCATBAR(VCM4P10_DequantTransformResidualFromPairAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_HorEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_VerEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_HorEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_VerEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_GetVLCInfo                             OMXCATBAR(VCM4P10_GetVLCInfo, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateChroma                      OMXCATBAR(VCM4P10_InterpolateChroma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfHor_Luma                OMXCATBAR(VCM4P10_InterpolateHalfHor_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfVer_Luma                OMXCATBAR(VCM4P10_InterpolateHalfVer_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateLuma                        OMXCATBAR(VCM4P10_InterpolateLuma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_ChromaDC           OMXCATBAR(VCM4P10_InvTransformDequant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_LumaDC             OMXCATBAR(VCM4P10_InvTransformDequant_LumaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformResidualAndAdd             OMXCATBAR(VCM4P10_InvTransformResidualAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEGetBufSize                           OMXCATBAR(VCM4P10_MEGetBufSize, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEInit                                 OMXCATBAR(VCM4P10_MEInit, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MotionEstimationMB                     OMXCATBAR(VCM4P10_MotionEstimationMB, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_16x16                     OMXCATBAR(VCM4P10_PredictIntra_16x16, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_4x4                       OMXCATBAR(VCM4P10_PredictIntra_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntraChroma_8x8                  OMXCATBAR(VCM4P10_PredictIntraChroma_8x8, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SAD_4x                                 OMXCATBAR(VCM4P10_SAD_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_16x                            OMXCATBAR(VCM4P10_SADQuar_16x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_4x                             OMXCATBAR(VCM4P10_SADQuar_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_8x                             OMXCATBAR(VCM4P10_SADQuar_8x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SATD_4x4                               OMXCATBAR(VCM4P10_SATD_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SubAndTransformQDQResidual             OMXCATBAR(VCM4P10_SubAndTransformQDQResidual, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantChromaDCFromPair       OMXCATBAR(VCM4P10_TransformDequantChromaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantLumaDCFromPair         OMXCATBAR(VCM4P10_TransformDequantLumaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_ChromaDC                OMXCATBAR(VCM4P10_TransformQuant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_LumaDC                  OMXCATBAR(VCM4P10_TransformQuant_LumaDC, OMXVCM4P10_SUFFIX)
+/* omxVCM4P2_* functions: each name is rebuilt with OMXVCM4P2_SUFFIX appended */
+#define omxVCM4P2_BlockMatch_Half_16x16                   OMXCATBAR(VCM4P2_BlockMatch_Half_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Half_8x8                     OMXCATBAR(VCM4P2_BlockMatch_Half_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_16x16                OMXCATBAR(VCM4P2_BlockMatch_Integer_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_8x8                  OMXCATBAR(VCM4P2_BlockMatch_Integer_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DCT8x8blk                               OMXCATBAR(VCM4P2_DCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Inter                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Intra                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Intra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodePadMV_PVOP                        OMXCATBAR(VCM4P2_DecodePadMV_PVOP, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_DecodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeMV                                OMXCATBAR(VCM4P2_EncodeMV, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_EncodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_FindMVpred                              OMXCATBAR(VCM4P2_FindMVpred, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_IDCT8x8blk                              OMXCATBAR(VCM4P2_IDCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MCReconBlock                            OMXCATBAR(VCM4P2_MCReconBlock, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEGetBufSize                            OMXCATBAR(VCM4P2_MEGetBufSize, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEInit                                  OMXCATBAR(VCM4P2_MEInit, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MotionEstimationMB                      OMXCATBAR(VCM4P2_MotionEstimationMB, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_PredictReconCoefIntra                   OMXCATBAR(VCM4P2_PredictReconCoefIntra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInter_I                            OMXCATBAR(VCM4P2_QuantInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantIntra_I                            OMXCATBAR(VCM4P2_QuantIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvInter_I                         OMXCATBAR(VCM4P2_QuantInvInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvIntra_I                         OMXCATBAR(VCM4P2_QuantInvIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_inter                 OMXCATBAR(VCM4P2_TransRecBlockCoef_inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_intra                 OMXCATBAR(VCM4P2_TransRecBlockCoef_intra, OMXVCM4P2_SUFFIX)
+
+
+#endif /* _armOMX_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
new file mode 100755
index 0000000..8b295a6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
@@ -0,0 +1,252 @@
+/**
+ * File: omxtypes.h
+ * Brief: Defines basic Data types used in OpenMAX v1.0.2 header files.
+ *
+ * Copyright © 2005-2008 The Khronos Group Inc. All Rights Reserved. 
+ *
+ * These materials are protected by copyright laws and contain material 
+ * proprietary to the Khronos Group, Inc.  You may use these materials 
+ * for implementing Khronos specifications, without altering or removing 
+ * any trademark, copyright or other notice from the specification.
+ * 
+ * Khronos Group makes no, and expressly disclaims any, representations 
+ * or warranties, express or implied, regarding these materials, including, 
+ * without limitation, any implied warranties of merchantability or fitness 
+ * for a particular purpose or non-infringement of any intellectual property. 
+ * Khronos Group makes no, and expressly disclaims any, warranties, express 
+ * or implied, regarding the correctness, accuracy, completeness, timeliness, 
+ * and reliability of these materials. 
+ *
+ * Under no circumstances will the Khronos Group, or any of its Promoters, 
+ * Contributors or Members or their respective partners, officers, directors, 
+ * employees, agents or representatives be liable for any damages, whether 
+ * direct, indirect, special or consequential damages for lost revenues, 
+ * lost profits, or otherwise, arising from or in connection with these 
+ * materials.
+ * 
+ * Khronos and OpenMAX are trademarks of the Khronos Group Inc. 
+ *
+ */
+  
+#ifndef _OMXTYPES_H_
+#define _OMXTYPES_H_
+
+#include <limits.h> 
+
+#define OMX_IN    /* expands to nothing; presumably marks input parameters in prototypes - TODO confirm at use sites */
+#define OMX_OUT   /* expands to nothing; presumably marks output parameters - TODO confirm at use sites */
+#define OMX_INOUT /* expands to nothing; presumably marks parameters that are both read and written - TODO confirm */
+
+
+typedef enum {
+
+    /* Mandatory return codes (every function documents when it returns them) */
+    OMX_Sts_NoErr                   =    0,  /* Success: the function ran to completion */
+    OMX_Sts_Err                     =   -2,  /* Unknown or unspecified error */
+    OMX_Sts_InvalidBitstreamValErr  = -182,  /* Bitstream processing hit an invalid value */
+    OMX_Sts_MemAllocErr             =   -9,  /* Not enough memory allocated for the operation */
+    OMX_StsACAAC_GainCtrErr         = -159,  /* AAC: unsupported gain control data detected */
+    OMX_StsACAAC_PrgNumErr          = -167,  /* AAC: invalid number of elements for one program */
+    OMX_StsACAAC_CoefValErr         = -163,  /* AAC: invalid quantized coefficient value */
+    OMX_StsACAAC_MaxSfbErr          = -162,  /* AAC: invalid maxSfb value in relation to numSwb */
+    OMX_StsACAAC_PlsDataErr         = -160,  /* AAC: pulse escape sequence data error */
+
+    /* Optional return codes (every function documents when it returns them) */
+    OMX_Sts_BadArgErr               =   -5,  /* Bad arguments */
+
+    OMX_StsACAAC_TnsNumFiltErr      = -157,  /* AAC: invalid number of TNS filters */
+    OMX_StsACAAC_TnsLenErr          = -156,  /* AAC: invalid TNS region length */
+    OMX_StsACAAC_TnsOrderErr        = -155,  /* AAC: invalid order of TNS filter */
+    OMX_StsACAAC_TnsCoefResErr      = -154,  /* AAC: invalid bit-resolution for TNS filter coefficients */
+    OMX_StsACAAC_TnsCoefErr         = -153,  /* AAC: invalid TNS filter coefficients */
+    OMX_StsACAAC_TnsDirectErr       = -152,  /* AAC: invalid TNS filter direction */
+
+    OMX_StsICJP_JPEGMarkerErr       = -183,  /* JPEG marker found inside an entropy-coded block; */
+                                             /* Huffman decoding stopped early.                  */
+    OMX_StsICJP_JPEGMarker          = -181,  /* JPEG marker found; Huffman decoding */
+                                             /* stopped early.                      */
+    OMX_StsIPPP_ContextMatchErr     =  -17,  /* Context parameter does not match the operation */
+
+    OMX_StsSP_EvenMedianMaskSizeErr = -180,  /* Even median-filter mask size was replaced by an odd one */
+
+    OMX_Sts_MaximumEnumeration      = INT_MAX  /* Placeholder that forces the enum to the size of OMX_INT */
+
+} OMXResult;  /** Value or error code returned from a function. Identical to OMX_INT */
+
+ 
+/* OMX_U8: unsigned 8-bit integer, chosen by comparing the <limits.h> maxima */
+#if (UCHAR_MAX == 0xFF)
+typedef unsigned char OMX_U8;
+#elif (USHRT_MAX == 0xFF)
+typedef unsigned short OMX_U8;
+#else
+#error OMX_U8 undefined
+#endif
+
+ 
+/* OMX_S8: signed 8-bit integer, chosen by comparing the <limits.h> maxima */
+#if (SCHAR_MAX == 0x7F)
+typedef signed char OMX_S8;
+#elif (SHRT_MAX == 0x7F)
+typedef short OMX_S8;
+#else
+#error OMX_S8 undefined
+#endif
+ 
+ 
+/* OMX_U16: unsigned 16-bit integer, chosen by comparing the <limits.h> maxima */
+#if (USHRT_MAX == 0xFFFF)
+typedef unsigned short OMX_U16;
+#elif (UINT_MAX == 0xFFFF)
+typedef unsigned OMX_U16;
+#else
+#error OMX_U16 undefined
+#endif
+
+
+/* OMX_S16: signed 16-bit integer, chosen by comparing the <limits.h> maxima */
+#if (SHRT_MAX == 0x7FFF)
+typedef short OMX_S16;
+#elif (INT_MAX == 0x7FFF)
+typedef int OMX_S16;
+#else
+#error OMX_S16 undefined
+#endif
+
+
+/* OMX_U32: unsigned 32-bit integer; unsigned int when it is 32 bits wide, otherwise unsigned long */
+#if UINT_MAX == 0xffffffff
+typedef unsigned int OMX_U32;
+#elif ULONG_MAX == 0xffffffff /* was LONG_MAX, which is 0x7fffffff for a 32-bit long and so could never match */
+typedef unsigned long int OMX_U32; 
+#else
+#error OMX_U32 undefined
+#endif
+
+
+/* OMX_S32: signed 32-bit integer, chosen by comparing the <limits.h> maxima */
+#if (INT_MAX == 0x7FFFFFFF)
+typedef int OMX_S32;
+#elif (LONG_MAX == 0x7FFFFFFF)
+typedef long OMX_S32;
+#else
+#error OMX_S32 undefined
+#endif
+
+
+/* OMX_U64 & OMX_S64: 64-bit types and limits; MIN_S64 is spelled -MAX-1 because a bare 0x8000000000000000 constant is unsigned */
+#if defined( _WIN32 ) || defined ( _WIN64 )
+    typedef __int64 OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned __int64 OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(-0x7FFFFFFFFFFFFFFFi64 - 1)
+    #define OMX_MIN_U64			(0x0000000000000000i64)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFi64)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFui64)
+#else
+    typedef long long OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned long long OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(-0x7FFFFFFFFFFFFFFFLL - 1)
+    #define OMX_MIN_U64			(0x0000000000000000LL)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFLL)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFULL)
+#endif
+
+
+/* OMX_SC8: signed 8-bit complex number */
+typedef struct {
+    /* real component */
+    OMX_S8 Re;
+    /* imaginary component */
+    OMX_S8 Im;
+} OMX_SC8;
+
+
+/* OMX_SC16: signed 16-bit complex number */
+typedef struct {
+    /* real component */
+    OMX_S16 Re;
+    /* imaginary component */
+    OMX_S16 Im;
+} OMX_SC16;
+
+
+/* OMX_SC32: signed 32-bit complex number */
+typedef struct {
+    /* real component */
+    OMX_S32 Re;
+    /* imaginary component */
+    OMX_S32 Im;
+} OMX_SC32;
+
+
+/* OMX_SC64: signed 64-bit complex number */
+typedef struct {
+    /* real component */
+    OMX_S64 Re;
+    /* imaginary component */
+    OMX_S64 Im;
+} OMX_SC64;
+
+
+/* OMX_F32: alias of the C type float */
+typedef float OMX_F32; /** Single precision floating point, IEEE 754 */
+
+
+/* OMX_F64: alias of the C type double */
+typedef double OMX_F64; /** Double precision floating point, IEEE 754 */
+
+
+/* OMX_INT: alias of the C type int */
+typedef int OMX_INT; /** Signed integer corresponding to the machine word length; its maximum value is INT_MAX */
+
+
+/* Inclusive value ranges of the 8-, 16- and 32-bit OMX integer types, one MIN/MAX pair per type */
+#define OMX_MIN_S8      (-128)
+#define OMX_MAX_S8      (127)
+#define OMX_MIN_U8      0
+#define OMX_MAX_U8      (255)
+#define OMX_MIN_S16     (-32768)
+#define OMX_MAX_S16     (32767)
+#define OMX_MIN_U16     0
+#define OMX_MAX_U16     (0xFFFF)
+#define OMX_MIN_S32     (-2147483647-1)
+#define OMX_MAX_S32     (2147483647)
+#define OMX_MIN_U32     0
+#define OMX_MAX_U32     (0xFFFFFFFF)
+
+typedef void OMXVoid; /* OMX-prefixed alias of the C type void */
+
+#ifndef NULL /* only supply NULL when nothing included earlier has defined it */
+#define NULL ((void*)0)
+#endif /* NULL */
+
+/**
+  * Rectangle given by the coordinates (x, y) of its top left corner
+  * and by its extent: width along the x-direction and height along
+  * the y-direction. */
+typedef struct {
+    OMX_INT x;       /* top left corner, x-coordinate */
+    OMX_INT y;       /* top left corner, y-coordinate */
+    OMX_INT width;   /* extent in the x-direction */
+    OMX_INT height;  /* extent in the y-direction */
+} OMXRect;
+
+
+/** Geometric position of a point */
+typedef struct {
+    /* x-coordinate */
+    OMX_INT x;
+    /* y-coordinate */
+    OMX_INT y;
+} OMXPoint;
+
+
+/** Dimensions of a rectangle, or of a region of interest in an image */
+typedef struct {
+    /* extent in the x-direction */
+    OMX_INT width;
+    /* extent in the y-direction */
+    OMX_INT height;
+} OMXSize;
+
+#endif /* _OMXTYPES_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
new file mode 100755
index 0000000..48703d1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
@@ -0,0 +1,77 @@
+;//
+;// 
+;// File Name:  omxtypes_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+;// Mandatory return codes - use cases are explicitly described for each function
+OMX_Sts_NoErr                    EQU  0    ;// No error; the function completed successfully
+OMX_Sts_Err                      EQU -2    ;// Unknown/unspecified error
+OMX_Sts_InvalidBitstreamValErr   EQU -182  ;// Invalid value detected during bitstream processing
+OMX_Sts_MemAllocErr              EQU -9    ;// Not enough memory allocated for the operation
+OMX_StsACAAC_GainCtrErr    	     EQU -159  ;// AAC: Unsupported gain control data detected
+OMX_StsACAAC_PrgNumErr           EQU -167  ;// AAC: Invalid number of elements for one program
+OMX_StsACAAC_CoefValErr          EQU -163  ;// AAC: Invalid quantized coefficient value
+OMX_StsACAAC_MaxSfbErr           EQU -162  ;// AAC: Invalid maxSfb value in relation to numSwb
+OMX_StsACAAC_PlsDataErr		     EQU -160  ;// AAC: Pulse escape sequence data error
+
+;// Optional return codes - use cases are explicitly described for each function
+OMX_Sts_BadArgErr                EQU -5    ;// Bad arguments
+
+OMX_StsACAAC_TnsNumFiltErr       EQU -157  ;// AAC: Invalid number of TNS filters
+OMX_StsACAAC_TnsLenErr           EQU -156  ;// AAC: Invalid TNS region length
+OMX_StsACAAC_TnsOrderErr         EQU -155  ;// AAC: Invalid order of TNS filter
+OMX_StsACAAC_TnsCoefResErr       EQU -154  ;// AAC: Invalid bit-resolution for TNS filter coefficients
+OMX_StsACAAC_TnsCoefErr          EQU -153  ;// AAC: Invalid TNS filter coefficients
+OMX_StsACAAC_TnsDirectErr        EQU -152  ;// AAC: Invalid TNS filter direction
+
+OMX_StsICJP_JPEGMarkerErr        EQU -183  ;// JPEG marker encountered within an entropy-coded block;
+                                            ;// Huffman decoding operation terminated early.
+OMX_StsICJP_JPEGMarker           EQU -181  ;// JPEG marker encountered; Huffman decoding
+                                            ;// operation terminated early.
+OMX_StsIPPP_ContextMatchErr      EQU -17   ;// Context parameter does not match the operation
+
+OMX_StsSP_EvenMedianMaskSizeErr  EQU -180  ;// Even size of the median filter mask was replaced by the odd one
+
+OMX_Sts_MaximumEnumeration       EQU 0x7FFFFFFF
+
+
+
+OMX_MIN_S8      EQU 	   	(-128)    ;// These integer limits use the same values as the C #defines of the same names
+OMX_MIN_U8  	EQU     	0
+OMX_MIN_S16		EQU      	(-32768)
+OMX_MIN_U16		EQU	        0
+
+
+OMX_MIN_S32		EQU	(-2147483647-1)
+OMX_MIN_U32		EQU	0
+
+OMX_MAX_S8		EQU	(127)
+OMX_MAX_U8		EQU	(255)
+OMX_MAX_S16		EQU	(32767)
+OMX_MAX_U16		EQU	(0xFFFF)
+OMX_MAX_S32		EQU	(2147483647)
+OMX_MAX_U32		EQU	(0xFFFFFFFF)
+
+OMX_VC_UPPER    EQU 0x1                 ;// Used by the PredictIntra functions
+OMX_VC_LEFT     EQU 0x2                 ;// Used by the PredictIntra functions
+OMX_VC_UPPER_RIGHT    EQU 0x40          ;// Used by the PredictIntra functions
+
+NULL    EQU 0                           ;// Zero, for use where the C code would write NULL
+
+;// Structures
+
+    INCLUDE     armCOMM_s.h             ;// presumably supplies the M_STRUCT / M_FIELD / M_ENDSTRUCT macros used below - confirm
+
+    M_STRUCT    OMXPoint                ;// Assembly-side description of the OMXPoint structure
+    M_FIELD     x, 4                    ;// NOTE(review): second argument looks like the field size in bytes - confirm in armCOMM_s.h
+    M_FIELD     y, 4
+    M_ENDSTRUCT
+
+        END
author	James Dong <jdong@google.com>	2011-05-31 18:53:46 -0700
committer	James Dong <jdong@google.com>	2011-06-02 12:32:46 -0700
commit	0c1bc742181ded4930842b46e9507372f0b1b963 (patch)
tree	c952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api
parent	92a746c3b18d035189f596ce32847bf26247aaca (diff)
download	frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2