109 files changed, 28123 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_DELIVERY.TXT b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_DELIVERY.TXT
new file mode 100755
index 0000000..cc2d70a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_DELIVERY.TXT
@@ -0,0 +1,63 @@
+The contents of this transaction was created by Hedley Francis
+of ARM on 19-Feb-2008.
+
+It contains the ARM data versions listed below.
+
+This data, unless otherwise stated, is ARM Proprietary and access to it
+is subject to the agreements indicated below.
+
+If you experience problems with this data, please contact ARM support
+quoting transaction reference <97414>.
+
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+- OX002-SW-98010-r0p0-00bet1
+  Video codecs - optimised code
+  V7 code release for Hantro (Ver 1.0.2)
+  internal access
+
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+This transaction contains deliverables which are designated as being of
+beta release status (BET).
+
+Beta release status has a particular meaning to ARM of which the recipient
+must be aware. Beta is a pre-release status indicating that the deliverable
+so described is believed to robustly demonstrate specified behaviour, to be
+consistent across its included aspects and be ready for general deployment.
+But Beta also indicates that pre-release reliability trials are ongoing and
+that it is possible residual defects or errors in operation, consistency
+and documentation may still be encountered. The recipient should consider
+this position when using this Beta material supplied. ARM will normally
+attempt to provide fixes or a work-around for defects identified by the
+recipient, but the provision or timeliness of this support cannot be
+guaranteed. ARM shall not be responsible for direct or consequential
+damages as a result of encountering one or more of these residual defects.
+By accepting a Beta release, the recipient agrees to these constraints and
+to providing reasonable information to ARM to enable the replication of the
+defects identified by the recipient. The specific Beta version supplied
+will not be supported after release of a later or higher status version.
+It should be noted that Support for the Beta release of the deliverable
+will only be provided by ARM to a recipient who has a current support and
+maintenance contract for the deliverable.
+
+
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+In addition to the data versions listed above, this transaction contains
+two additional files at the top level.
+
+The first is this file, ARM_DELIVERY_97414.TXT, which is the delivery
+note.
+
+The second is ARM_MANIFEST_97414.TXT which contains a manifest of all the
+files included in this transaction, together with their checksums.
+
+The checksums provided are calculated using the RSA Data Security, Inc.
+MD5 Message-Digest Algorithm.
+
+The checksums can be used to verify the integrity of this data using the
+"md5sum" tool (which is part of the GNU "textutils" package) by running:
+
+  % md5sum --check ARM_MANIFEST_97414.TXT
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_MANIFEST.TXT b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_MANIFEST.TXT
new file mode 100755
index 0000000..8310f67
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_MANIFEST.TXT
@@ -0,0 +1,91 @@
+				  OX002-SW-98010-r0p0-00bet1/
+				  OX002-SW-98010-r0p0-00bet1/api/
+e049791cfab6060a08cbac7b3ad767d6  OX002-SW-98010-r0p0-00bet1/api/armCOMM_s.h
+ed798face25497b2703ede736d6d52b6  OX002-SW-98010-r0p0-00bet1/api/omxtypes_s.h
+4eebd63af087376811d6749f0646b864  OX002-SW-98010-r0p0-00bet1/api/armCOMM_BitDec_s.h
+43cf46c2cf2fe1f93c615b57bcbe4809  OX002-SW-98010-r0p0-00bet1/api/armCOMM.h
+8f248ceaac8f602e277a521b679dcbbe  OX002-SW-98010-r0p0-00bet1/api/armCOMM_IDCTTable.h
+8ac5fa80ea98e391f5730a375280b5bd  OX002-SW-98010-r0p0-00bet1/api/armCOMM_Version.h
+3a2f420ddf6a1b950470bd0f5ebd5c62  OX002-SW-98010-r0p0-00bet1/api/armCOMM_IDCT_s.h
+511c0bb534fe223599e2c84eff24c9ed  OX002-SW-98010-r0p0-00bet1/api/armCOMM_MaskTable.h
+8971932d56eed6b1ad1ba507f0bff5f0  OX002-SW-98010-r0p0-00bet1/api/armCOMM_Bitstream.h
+f87fedd9ca432fefa757008176864ef8  OX002-SW-98010-r0p0-00bet1/api/armOMX.h
+8e49899a428822c36ef9dd94e0e05f18  OX002-SW-98010-r0p0-00bet1/api/omxtypes.h
+323008b72e9f04099a8cb42e99a1face  OX002-SW-98010-r0p0-00bet1/build_vc.pl
+e72d96c0a415459748df9807f3dae72f  OX002-SW-98010-r0p0-00bet1/filelist_vc.txt
+				  OX002-SW-98010-r0p0-00bet1/src/
+5eeae659a29477f5c52296d24afffd3c  OX002-SW-98010-r0p0-00bet1/src/armCOMM_IDCTTable.c
+d64cdcf38f7749dc7f77465e5b7d356d  OX002-SW-98010-r0p0-00bet1/src/armCOMM_MaskTable.c
+				  OX002-SW-98010-r0p0-00bet1/vc/
+				  OX002-SW-98010-r0p0-00bet1/vc/m4p10/
+				  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/
+e7e0c320978564a7c9b2c723749a98d6  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_CAVLCTables.c
+4adcd0df081990bdfc4729041a2a9152  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
+852e0404142965dc1f3aa7f00ee5127b  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
+7054151c5bfea6b5e74feee86b2d7b01  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
+5f7213a4f37627b3c58f6294ba477e30  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DequantTables_s.s
+32ff4b8be62e2f0f3e764b83c1e5e2fd  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
+d066e3c81d82616f37ec1810ea49e7b7  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
+fe629a3e9d55395a6098bdf2431b5f02  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
+5b13fb954b7679de20076bb6a7f4ee1d  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
+01ba60eff66ea49a4f833ce6279f8e2f  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
+fa1072cf1d17e9666c9f1e215fa302b1  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
+db387b9e66d32787f47ef9cf0347da2a  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
+ea537e4e2ad03a1940981055fa3ace01  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
+29a4283885b9473a3550a81eff2559d2  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
+2ddcaf60a8ea1e6e6b77737f768bfb9d  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_QuantTables_s.s
+c3002aad5600f872b70a5d7fe3915846  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
+a2900f2c47f1c61d20bd6c1eda33d6d4  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
+c921df73397a32c947dc996ba6858553  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
+3769e14f2fc3f514d025fe6ab73ff67a  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+c029d1cebea0a09e1d235a37e2155002  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
+076a033f8161750a685756f9f51f04c9  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+c5b5d22842822e6e5e31094882cbeb46  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
+f6bdf6d914a4a1479f524951a3409846  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
+ebeb0713a9b2ea25986360ef262138c4  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
+78ed9ea200faa7be665445a713859af1  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
+c2d995f787b6f44ef10c751c12d1935f  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
+40bed679a9f6e0d3efe216b7d4a9cf45  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
+4a52b3e9e268b8a8f07829bf500d03af  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
+11249f8a98c5d4b84cb5575b0e37ca9c  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
+2513b60559ba71ae495c6053fb779fa9  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
+2fb1ee17c36e3c1469c170f6dac11bf1  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
+cc4a6f32db0b72a91d3f278f6855df69  OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
+				  OX002-SW-98010-r0p0-00bet1/vc/m4p10/api/
+6e530ddaa7c2b57ffe88162c020cb662  OX002-SW-98010-r0p0-00bet1/vc/m4p10/api/armVCM4P10_CAVLCTables.h
+				  OX002-SW-98010-r0p0-00bet1/vc/m4p2/
+				  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/
+bec6de348b113438498867b869001622  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Clip8_s.s
+dba9824e959b21d401cac925e68a11a6  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
+dfa7e5b58027be3542dda0593b77b2d3  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
+4fba4c431a783a78a2eb6497a94ac967  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
+39991961179ca03b6381b6e653b1f14b  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
+1b0b2990c2669dfb87cf6b810611c01b  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
+1c9b87abf3283e957816b3937c680701  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
+4fe1afca659a9055fc1172e58f78a506  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
+2ea067f0436f91ba1351edaf411cb4ea  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Lookup_Tables.c
+6ce363aadc9d65c308b40cca8902e4f6  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
+bf212f786772aed2bc705d22ff4e74f5  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
+293a48a648a3085456e6665bb7366fad  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_SetPredDir_s.s
+2bb47ed9c9e25c5709c6d9b4ad39a38a  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
+437dfa204508850d61d4b87091446e9f  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
+bc9778898dd41101dc0fb0139eaf83cc  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
+fc191eeae43f8ce735dbd311cc7bcb8d  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
+a0d85f4f517c945a4c9317ac021f2d08  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
+386020dee8b725c7fe2526f1fc211d7d  OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
+				  OX002-SW-98010-r0p0-00bet1/vc/m4p2/api/
+4624e7c838e10a249abcc3d3f4f40748  OX002-SW-98010-r0p0-00bet1/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
+65e1057d04e2cb844559dc9f6e09795a  OX002-SW-98010-r0p0-00bet1/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
+				  OX002-SW-98010-r0p0-00bet1/vc/src/
+e627b3346b0dc9aff14446005ce0fa43  OX002-SW-98010-r0p0-00bet1/vc/src/armVC_Version.c
+				  OX002-SW-98010-r0p0-00bet1/vc/api/
+7ca94b1c33ac0211e17d38baadd7d1dd  OX002-SW-98010-r0p0-00bet1/vc/api/armVC.h
+12cf7596edbbf6048b626d15e8d0ed48  OX002-SW-98010-r0p0-00bet1/vc/api/omxVC.h
+11726e286a81257cb45f5547fb4d374c  OX002-SW-98010-r0p0-00bet1/vc/api/omxVC_s.h
+a5b2af605c319cd2491319e430741377  OX002-SW-98010-r0p0-00bet1/vc/api/armVCCOMM_s.h
+				  OX002-SW-98010-r0p0-00bet1/vc/comm/
+				  OX002-SW-98010-r0p0-00bet1/vc/comm/src/
+1f81187b48487a8ea6dbc327648e3e4f  OX002-SW-98010-r0p0-00bet1/vc/comm/src/omxVCCOMM_Copy16x16_s.s
+936d3f2038a6f8613ec25e50cc601fe8  OX002-SW-98010-r0p0-00bet1/vc/comm/src/omxVCCOMM_Copy8x8_s.s
+8f6708a249130962e0bc5c044ac6dd93  OX002-SW-98010-r0p0-00bet1/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
+aab7713414428e95de0ba799a2679b36  ARM_DELIVERY_97414.TXT
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
new file mode 100755
index 0000000..64c1958
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
@@ -0,0 +1,785 @@
+/**
+ * 
+ * File Name:  armCOMM.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *   
+ * File: armCOMM.h
+ * Brief: Declares Common APIs/Data Types used across OpenMAX API's
+ *
+ */
+ 
+  
+#ifndef _armCommon_H_
+#define _armCommon_H_
+
+#include "omxtypes.h"
+
+typedef struct
+{
+  OMX_F32 Re; /** Real part */
+  OMX_F32 Im; /** Imaginary part */	
+        
+} OMX_FC32; /** single precision floating point complex number */
+
+typedef struct
+{
+  OMX_F64 Re; /** Real part */
+  OMX_F64 Im; /** Imaginary part */	
+        
+} OMX_FC64; /** double precision floating point complex number */
+
+
+/* Used by both IP and IC domains for 8x8 JPEG blocks. */
+typedef OMX_S16 ARM_BLOCK8x8[64];
+
+
+#include "armOMX.h"
+
+#define  armPI (OMX_F64)(3.1415926535897932384626433832795)
+
+/***********************************************************************/
+
+/* Compiler extensions */
+#ifdef ARM_DEBUG
+/* debug version */
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#define armError(str) {printf((str)); printf("\n"); exit(-1);}
+#define armWarn(str) {printf((str)); printf("\n");}
+#define armIgnore(a) ((void)a)
+#define armAssert(a) assert(a)
+#else 
+/* release version */
+#define armError(str) ((void) (str))
+#define armWarn(str)  ((void) (str))
+#define armIgnore(a)  ((void) (a))
+#define armAssert(a)  ((void) (a))
+#endif /* ARM_DEBUG */
+
+/* Arithmetic operations */
+
+#define armMin(a,b)             ( (a) > (b) ?  (b):(a) )
+#define armMax(a,b)             ( (a) > (b) ?  (a):(b) )
+#define armAbs(a)               ( (a) <  0  ? -(a):(a) )
+
+/* Alignment operation */
+
+#define armAlignToBytes(Ptr,N)      (Ptr + ( ((N-(int)Ptr)&(N-1)) / sizeof(*Ptr) ))
+#define armAlignTo2Bytes(Ptr)       armAlignToBytes(Ptr,2)
+#define armAlignTo4Bytes(Ptr)       armAlignToBytes(Ptr,4)
+#define armAlignTo8Bytes(Ptr)       armAlignToBytes(Ptr,8)
+#define armAlignTo16Bytes(Ptr)      armAlignToBytes(Ptr,16)
+
+/* Error and Alignment check */
+
+#define armRetArgErrIf(condition, code)  if(condition) { return (code); }
+#define armRetDataErrIf(condition, code) if(condition) { return (code); }
+
+#ifndef ALIGNMENT_DOESNT_MATTER
+#define armIsByteAligned(Ptr,N)     ((((int)(Ptr)) % N)==0)
+#define armNotByteAligned(Ptr,N)    ((((int)(Ptr)) % N)!=0)
+#else
+#define armIsByteAligned(Ptr,N)     (1)
+#define armNotByteAligned(Ptr,N)    (0)
+#endif
+
+#define armIs2ByteAligned(Ptr)      armIsByteAligned(Ptr,2)
+#define armIs4ByteAligned(Ptr)      armIsByteAligned(Ptr,4)
+#define armIs8ByteAligned(Ptr)      armIsByteAligned(Ptr,8)
+#define armIs16ByteAligned(Ptr)     armIsByteAligned(Ptr,16)
+
+#define armNot2ByteAligned(Ptr)     armNotByteAligned(Ptr,2)
+#define armNot4ByteAligned(Ptr)     armNotByteAligned(Ptr,4)
+#define armNot8ByteAligned(Ptr)     armNotByteAligned(Ptr,8)
+#define armNot16ByteAligned(Ptr)    armNotByteAligned(Ptr,16)
+#define armNot32ByteAligned(Ptr)    armNotByteAligned(Ptr,32)
+
+/**
+ * Function: armRoundFloatToS16_ref/armRoundFloatToS32_ref/armRoundFloatToS64
+ *
+ * Description:
+ * Converts a double precision value into a short int/int after rounding
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32 format
+ *
+ */
+
+OMX_S16 armRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armRoundFloatToS32 (OMX_F64 Value);
+OMX_S64 armRoundFloatToS64 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToS16_ref/armSatRoundFloatToS32
+ *
+ * Description:
+ * Converts a double precision value into a short int/int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32 format
+ *
+ */
+
+OMX_S16 armSatRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armSatRoundFloatToS32 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToU16_ref/armSatRoundFloatToU32
+ *
+ * Description:
+ * Converts a double precision value into a unsigned short int/int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U16/OMX_U32 format
+ *
+ */
+
+OMX_U16 armSatRoundFloatToU16 (OMX_F64 Value);
+OMX_U32 armSatRoundFloatToU32 (OMX_F64 Value);
+
+/**
+ * Function: armSignCheck
+ *
+ * Description:
+ * Checks the sign of a variable:
+ * returns 1 if it is Positive
+ * returns 0 if it is 0
+ * returns -1 if it is Negative 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	    var     Variable to be checked
+ *
+ * Return Value:
+ * OMX_INT --   returns 1 if it is Positive
+ *              returns 0 if it is 0
+ *              returns -1 if it is Negative 
+ */ 
+ 
+OMX_INT armSignCheck (OMX_S16 var);
+
+/**
+ * Function: armClip
+ *
+ * Description: Clips the input between MAX and MIN value
+ * 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] Min     lower bound
+ * [in] Max     upper bound
+ * [in] src     variable to the clipped
+ *
+ * Return Value:
+ * OMX_S32 --   returns clipped value
+ */ 
+ 
+OMX_S32 armClip (
+        OMX_INT min,
+        OMX_INT max, 
+        OMX_S32 src
+        );
+
+/**
+ * Function: armClip_F32
+ *
+ * Description: Clips the input between MAX and MIN value
+ * 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] Min     lower bound
+ * [in] Max     upper bound
+ * [in] src     variable to the clipped
+ *
+ * Return Value:
+ * OMX_F32 --   returns clipped value
+ */ 
+ 
+OMX_F32 armClip_F32 (
+        OMX_F32 min,
+        OMX_F32 max, 
+        OMX_F32 src
+        );
+
+/**
+ * Function: armShiftSat_F32
+ *
+ * Description: Divides a float value by 2^shift and 
+ * saturates it for unsigned value range for satBits.
+ * Second parameter is like "shifting" the corresponding 
+ * integer value. Takes care of rounding while clipping the final 
+ * value.
+ *
+ * Parameters:
+ * [in] v          Number to be operated upon
+ * [in] shift      Divides the input "v" by "2^shift"
+ * [in] satBits    Final range is [0, 2^satBits)
+ *
+ * Return Value:
+ * OMX_S32 --   returns "shifted" saturated value
+ */ 
+ 
+OMX_U32 armShiftSat_F32(
+        OMX_F32 v, 
+        OMX_INT shift, 
+        OMX_INT satBits
+        );
+
+/**
+ * Functions: armSwapElem
+ *
+ * Description:
+ * This function swaps two elements at the specified pointer locations.
+ * The size of each element could be anything as specified by <elemSize>
+ *
+ * Return Value:
+ * OMXResult -- Error status from the function
+ */
+OMXResult armSwapElem(OMX_U8 *pBuf1, OMX_U8 *pBuf2, OMX_INT elemSize);
+
+
+/**
+ * Function: armMedianOf3
+ *
+ * Description: Finds the median of three numbers
+ * 
+ * Remarks:
+ *
+ * Parameters:
+ * [in] fEntry     First entry
+ * [in] sEntry     second entry
+ * [in] tEntry     Third entry
+ *
+ * Return Value:
+ * OMX_S32 --   returns the median value
+ */ 
+ 
+OMX_S32 armMedianOf3 (
+    OMX_S32 fEntry,
+    OMX_S32 sEntry, 
+    OMX_S32 tEntry 
+    );
+
+/**
+ * Function: armLogSize
+ *
+ * Description: Finds the size of a positive value and returns the same
+ * 
+ * Remarks:
+ *
+ * Parameters:
+ * [in] value    Positive value
+ *
+ * Return Value:
+ * OMX_U8 --   returns the size of the positive value
+ */ 
+ 
+OMX_U8 armLogSize (
+    OMX_U16 value 
+    );    
+
+/***********************************************************************/
+                /* Saturating Arithmetic operations */
+
+/**
+ * Function :armSatAdd_S32()
+ *
+ * Description :
+ *   Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ *    
+ **/
+
+OMX_S32 armSatAdd_S32(
+                OMX_S32 Value1,
+                OMX_S32 Value2
+                );
+
+/**
+ * Function :armSatAdd_S64()
+ *
+ * Description :
+ *   Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ *    
+ **/
+
+OMX_S64 armSatAdd_S64(
+                OMX_S64 Value1,
+                OMX_S64 Value2
+                );
+
+/** Function :armSatSub_S32()
+ * 
+ * Description :
+ *     Returns the result of saturated substraction of the two inputs Value1, Value2
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ **/
+
+OMX_S32 armSatSub_S32(
+                    OMX_S32 Value1,
+                    OMX_S32 Value2
+                    );
+
+/**
+ * Function :armSatMac_S32()
+ *
+ * Description :
+ *     Returns the result of Multiplication of Value1 and Value2 and subesquent saturated
+ *     accumulation with Mac
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ * [in] Mac          Accumulator
+ *
+ * Return:
+ * [out]             Result of operation
+ **/
+
+OMX_S32 armSatMac_S32(
+                    OMX_S32 Mac,
+                    OMX_S16 Value1,
+                    OMX_S16 Value2
+                    );
+
+/**
+ * Function :armSatMac_S16S32_S32
+ *
+ * Description :
+ *   Returns the result of saturated MAC operation of the three inputs delayElem, filTap , mac
+ *
+ *   mac = mac + Saturate_in_32Bits(delayElem * filTap)
+ *
+ * Parametrs:
+ * [in] delayElem    First 32 bit Operand
+ * [in] filTap       Second 16 bit Operand
+ * [in] mac          Result of MAC operation
+ *
+ * Return:
+ * [out]  mac        Result of operation
+ *    
+ **/
+ 
+OMX_S32 armSatMac_S16S32_S32(
+                        OMX_S32 mac, 
+                        OMX_S32 delayElem, 
+                        OMX_S16 filTap );
+
+/**
+ * Function :armSatRoundRightShift_S32_S16
+ *
+ * Description :
+ *   Returns the result of rounded right shift operation of input by the scalefactor
+ *
+ *   output = Saturate_in_16Bits( ( RightShift( (Round(input) , scaleFactor ) )
+ *
+ * Parametrs:
+ * [in] input       The input to be operated on
+ * [in] scaleFactor The shift number
+ *
+ * Return:
+ * [out]            Result of operation
+ *    
+ **/
+
+
+OMX_S16 armSatRoundRightShift_S32_S16(
+                        OMX_S32 input, 
+                        OMX_INT scaleFactor);
+
+/**
+ * Function :armSatRoundLeftShift_S32()
+ *
+ * Description :
+ *     Returns the result of saturating left-shift operation on input
+ *     Or rounded Right shift if the input Shift is negative.
+ *
+ * Parametrs:
+ * [in] Value        Operand
+ * [in] shift        Operand for shift operation
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+ 
+OMX_S32 armSatRoundLeftShift_S32(
+                        OMX_S32 Value,
+                        OMX_INT shift
+                        );
+
+/**
+ * Function :armSatRoundLeftShift_S64()
+ *
+ * Description :
+ *     Returns the result of saturating left-shift operation on input
+ *     Or rounded Right shift if the input Shift is negative.
+ *
+ * Parametrs:
+ * [in] Value        Operand
+ * [in] shift        Operand for shift operation
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+ 
+OMX_S64 armSatRoundLeftShift_S64(
+                        OMX_S64 Value,
+                        OMX_INT shift
+                        );
+
+/**
+ * Function :armSatMulS16S32_S32()
+ *
+ * Description :
+ *     Returns the result of a S16 data type multiplied with an S32 data type
+ *     in a S32 container
+ *
+ * Parametrs:
+ * [in] input1       Operand 1
+ * [in] input2       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+
+
+OMX_S32 armSatMulS16S32_S32(
+                    OMX_S16 input1,
+                    OMX_S32 input2);
+
+/**
+ * Function :armSatMulS32S32_S32()
+ *
+ * Description :
+ *     Returns the result of a S32 data type multiplied with an S32 data type
+ *     in a S32 container
+ *
+ * Parametrs:
+ * [in] input1       Operand 1
+ * [in] input2       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+
+OMX_S32 armSatMulS32S32_S32(
+                    OMX_S32 input1,
+                    OMX_S32 input2);
+
+
+/**
+ * Function :armIntDivAwayFromZero()
+ *
+ * Description : Integer division with rounding to the nearest integer. 
+ *               Half-integer values are rounded away from zero
+ *               unless otherwise specified. For example 3//2 is rounded 
+ *               to 2, and -3//2 is rounded to -2.
+ *
+ * Parametrs:
+ * [in] Num        Operand 1
+ * [in] Deno       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation input1//input2
+ *    
+ **/
+
+OMX_S32 armIntDivAwayFromZero (OMX_S32 Num, OMX_S32 Deno);
+
+
+/***********************************************************************/
+/*
+ * Debugging macros
+ *
+ */
+
+
+/*
+ * Definition of output stream - change to stderr if necessary
+ */
+#define DEBUG_STREAM stdout
+
+/*
+ * Debug printf macros, one for each argument count.
+ * Add more if needed.
+ */
+#ifdef DEBUG_ON
+#include <stdio.h>
+
+#define DEBUG_PRINTF_0(a)                                               fprintf(DEBUG_STREAM, a)
+#define DEBUG_PRINTF_1(a, b)                                            fprintf(DEBUG_STREAM, a, b)
+#define DEBUG_PRINTF_2(a, b, c)                                         fprintf(DEBUG_STREAM, a, b, c)
+#define DEBUG_PRINTF_3(a, b, c, d)                                      fprintf(DEBUG_STREAM, a, b, c, d)
+#define DEBUG_PRINTF_4(a, b, c, d, e)                                   fprintf(DEBUG_STREAM, a, b, c, d, e)
+#define DEBUG_PRINTF_5(a, b, c, d, e, f)                                fprintf(DEBUG_STREAM, a, b, c, d, e, f)
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g)                             fprintf(DEBUG_STREAM, a, b, c, d, e, f, g)
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h)                          fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h)
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i)                       fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i)
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j)                    fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j)
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k)                fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k)
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l)             fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l)
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m)          fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m)
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n)       fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n)
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)    fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)
+#else /* DEBUG_ON */
+#define DEBUG_PRINTF_0(a)                                  
+#define DEBUG_PRINTF_1(a, b)                               
+#define DEBUG_PRINTF_2(a, b, c)                            
+#define DEBUG_PRINTF_3(a, b, c, d)                         
+#define DEBUG_PRINTF_4(a, b, c, d, e)                      
+#define DEBUG_PRINTF_5(a, b, c, d, e, f)                   
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g)                
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h)             
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i)          
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j)       
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k)    
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l)             
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m)          
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n)      
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)   
+#endif /* DEBUG_ON */
+
+
+/*
+ * Domain and sub domain definitions
+ *
+ * In order to turn on debug for an entire domain or sub-domain
+ * at compile time, one of the DEBUG_DOMAIN_* below may be defined,
+ * which will activate debug in all of the defines it contains.
+ */
+
+#ifdef DEBUG_DOMAIN_AC
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT_MPEG4
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT
+#define DEBUG_OMXACAAC_DECODEDATSTRELT
+#define DEBUG_OMXACAAC_DECODEFILLELT
+#define DEBUG_OMXACAAC_DECODEISSTEREO_S32
+#define DEBUG_OMXACAAC_DECODEMSPNS_S32
+#define DEBUG_OMXACAAC_DECODEMSSTEREO_S32_I
+#define DEBUG_OMXACAAC_DECODEPRGCFGELT
+#define DEBUG_OMXACAAC_DECODETNS_S32_I
+#define DEBUG_OMXACAAC_DEINTERLEAVESPECTRUM_S32
+#define DEBUG_OMXACAAC_ENCODETNS_S32_I
+#define DEBUG_OMXACAAC_LONGTERMPREDICT_S32
+#define DEBUG_OMXACAAC_LONGTERMRECONSTRUCT_S32
+#define DEBUG_OMXACAAC_MDCTFWD_S32
+#define DEBUG_OMXACAAC_MDCTINV_S32_S16
+#define DEBUG_OMXACAAC_NOISELESSDECODE
+#define DEBUG_OMXACAAC_QUANTINV_S32_I
+#define DEBUG_OMXACAAC_UNPACKADIFHEADER
+#define DEBUG_OMXACAAC_UNPACKADTSFRAMEHEADER
+#define DEBUG_OMXACMP3_HUFFMANDECODESFBMBP_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODESFB_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODE_S32
+#define DEBUG_OMXACMP3_MDCTINV_S32
+#define DEBUG_OMXACMP3_REQUANTIZESFB_S32_I
+#define DEBUG_OMXACMP3_REQUANTIZE_S32_I
+#define DEBUG_OMXACMP3_SYNTHPQMF_S32_S16
+#define DEBUG_OMXACMP3_UNPACKFRAMEHEADER
+#define DEBUG_OMXACMP3_UNPACKSCALEFACTORS_S8
+#define DEBUG_OMXACMP3_UNPACKSIDEINFO
+#endif /* DEBUG_DOMAIN_AC */
+
+
+#ifdef DEBUG_DOMAIN_VC
+#define DEBUG_OMXVCM4P10_AVERAGE_16X
+#define DEBUG_OMXVCM4P10_AVERAGE_4X
+#define DEBUG_OMXVCM4P10_AVERAGE_8X
+#define DEBUG_OMXVCM4P10_DEBLOCKCHROMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DEBLOCKLUMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DECODECHROMADCCOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DECODECOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DEQUANTTRANSFORMACFROMPAIR_U8_S16_C1_DLX
+#define DEBUG_OMXVCM4P10_EXPANDFRAME
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_PREDICTINTRACHROMA8X8_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_16X16_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_4X4_U8_C1R
+#define DEBUG_OMXVCM4P10_SADQUAR_16X
+#define DEBUG_OMXVCM4P10_SADQUAR_4X
+#define DEBUG_OMXVCM4P10_SADQUAR_8X
+#define DEBUG_OMXVCM4P10_SAD_16X
+#define DEBUG_OMXVCM4P10_SAD_4X
+#define DEBUG_OMXVCM4P10_SAD_8X
+#define DEBUG_OMXVCM4P10_SATD_4X4
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTCHROMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTLUMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_CHROMADC
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_LUMADC
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_8X8
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_8X8
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_SAD_U8_S16
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_U8_S16
+#define DEBUG_OMXVCM4P2_DCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_DECODEBLOCKCOEF_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEPADMV_PVOP
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEMV_U8_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_FINDMVPRED
+#define DEBUG_OMXVCM4P2_IDCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_LIMITMVTORECT
+#define DEBUG_OMXVCM4P2_MOTIONESTIMATIONMB
+#define DEBUG_OMXVCM4P2_PADMBGRAY_U8
+#define DEBUG_OMXVCM4P2_PADMBHORIZONTAL_U8
+#define DEBUG_OMXVCM4P2_PADMBVERTICAL_U8
+#define DEBUG_OMXVCM4P2_PADMV
+#define DEBUG_OMXVCM4P2_QUANTINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINTRA_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTRA_S16_I
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTER
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTRA
+#endif /* DEBUG_DOMAIN_VC */
+
+
+#ifdef DEBUG_DOMAIN_IC
+/* To be filled in */
+#endif /* DEBUG_DOMAIN_IC */
+
+
+#ifdef DEBUG_DOMAIN_SP
+#define DEBUG_OMXACSP_DOTPROD_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S32
+#define DEBUG_OMXACSP_COPY_S16
+#define DEBUG_OMXACSP_DOTPROD_S16
+#define DEBUG_OMXACSP_DOTPROD_S16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S16S32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S32_SFS
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC16
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S16_S32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S32
+#define DEBUG_OMXACSP_FFTINIT_C_SC16
+#define DEBUG_OMXACSP_FFTINIT_C_SC32
+#define DEBUG_OMXACSP_FFTINIT_R_S16_S32
+#define DEBUG_OMXACSP_FFTINIT_R_S32
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32S16_SFS
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32_I
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIR_DIRECT_S16
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_DIRECT_S16
+#endif /* DEBUG_DOMAIN_SP */
+
+
+#ifdef DEBUG_DOMAIN_IP
+#define DEBUG_OMXIPBM_ADDC_U8_C1R_SFS
+#define DEBUG_OMXIPBM_COPY_U8_C1R
+#define DEBUG_OMXIPBM_COPY_U8_C3R
+#define DEBUG_OMXIPBM_MIRROR_U8_C1R
+#define DEBUG_OMXIPBM_MULC_U8_C1R_SFS
+#define DEBUG_OMXIPCS_COLORTWISTQ14_U8_C3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR420LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR422LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR444LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR420LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR422LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR444LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_YCBCR420RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422RSZCSCROTRGB_U8_C2R
+#define DEBUG_OMXIPCS_YCBCR422RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565_U8_U16_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB_U8_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_C2P3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR444TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR444TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB_U8_C3R
+#define DEBUG_OMXIPPP_GETCENTRALMOMENT_S64
+#define DEBUG_OMXIPPP_GETSPATIALMOMENT_S64
+#define DEBUG_OMXIPPP_MOMENTGETSTATESIZE_S64
+#define DEBUG_OMXIPPP_MOMENTINIT_S64
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C1R
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C3R
+#endif /* DEBUG_DOMAIN_IP */
+
+
+#endif /* _armCommon_H_ */
+
+/*End of File*/
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
new file mode 100755
index 0000000..c738f72
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
@@ -0,0 +1,670 @@
+;//
+;// 
+;// File Name:  armCOMM_BitDec_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;// 
+;// OpenMAX optimized bitstream decode module
+;//
+;// You must include armCOMM_s.h before including this file
+;//
+;// This module provides macros to perform assembly optimized fixed and
+;// variable length decoding from a read-only bitstream. The variable
+;// length decode modules take as input a pointer to a table of 16-bit
+;// entries of the following format.
+;//
+;// VLD Table Entry format
+;//
+;//        15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+;//       +------------------------------------------------+
+;//       |  Len   |               Symbol              | 1 |
+;//       +------------------------------------------------+
+;//       |                Offset                      | 0 |
+;//       +------------------------------------------------+
+;//
+;// If the table entry is a leaf entry then bit 0 set:
+;//    Len    = Number of bits overread (0 to 7)
+;//    Symbol = Symbol payload (unsigned 12 bits)
+;//
+;// If the table entry is an internal node then bit 0 is clear:
+;//    Offset = Number of (16-bit) half words from the table
+;//             start to the next table node
+;//
+;// The table is accessed by successive lookup up on the
+;// next Step bits of the input bitstream until a leaf node
+;// is obtained. The Step sizes are supplied to the VLD macro.
+;//
+;// USAGE:
+;//
+;// To use any of the macros in this package, first call:
+;//
+;//    M_BD_INIT ppBitStream, pBitOffset, pBitStream, RBitBuffer, RBitCount, Tmp
+;//
+;// This caches the current bitstream position and next available
+;// bits in registers pBitStream, RBitBuffer, RBitCount. These registers
+;// are reserved for use by the bitstream decode package until you
+;// call M_BD_FINI.
+;//
+;// Next call the following macro(s) as many times as you need:
+;//
+;//    M_BD_LOOK8       - Look ahead constant 1<=N<=8  bits into the bitstream
+;//    M_BD_LOOK16      - Look ahead constant 1<=N<=16 bits into the bitstream
+;//    M_BD_READ8       - Read constant 1<=N<=8  bits from the bitstream
+;//    M_BD_READ16      - Read constant 1<=N<=16 bits from the bitstream
+;//    M_BD_VREAD8      - Read variable 1<=N<=8  bits from the bitstream
+;//    M_BD_VREAD16     - Read variable 1<=N<=16 bits from the bitstream
+;//    M_BD_VLD         - Perform variable length decode using lookup table
+;//
+;// Finally call the macro:
+;//
+;//    M_BD_FINI ppBitStream, pBitOffset
+;//
+;// This writes the bitstream state back to memory.
+;//
+;// The three bitstream cache register names are assigned to the following global
+;// variables:
+;//
+
+        GBLS    pBitStream  ;// Register name for pBitStream
+        GBLS    BitBuffer   ;// Register name for BitBuffer
+        GBLS    BitCount    ;// Register name for BitCount
+   
+;//        
+;// These register variables must have a certain defined state on entry to every bitstream
+;// macro (except M_BD_INIT) and on exit from every bitstream macro (except M_BD_FINI).
+;// The state may depend on implementation.
+;//
+;// For the default (ARM11) implementation the following hold:
+;//    pBitStream - points to the first byte not held in the BitBuffer
+;//    BitBuffer  - is a cache of (4 bytes) 32 bits, bit 31 the first bit
+;//    BitCount   - is offset (from the top bit) to the next unused bitstream bit
+;//    0<=BitCount<=15 (so BitBuffer holds at least 17 unused bits)
+;//
+;//
+
+        ;// Bitstream Decode initialise
+        ;//
+        ;// Initialises the bitstream decode global registers from
+        ;// bitstream pointers. This macro is split into 3 parts to enable
+        ;// scheduling.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $ppBitStream    - pointer to pointer to the next bitstream byte
+        ;// $pBitOffset     - pointer to the number of bits used in the current byte (0..7)
+        ;// $RBitStream     - register to use for pBitStream (can be $ppBitStream)
+        ;// $RBitBuffer     - register to use for BitBuffer
+        ;// $RBitCount      - register to use for BitCount   (can be $pBitOffset)
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $T1,$T2,$T3     - registers that must be preserved between calls to
+        ;//                   M_BD_INIT1 and M_BD_INIT2
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_INIT0  $ppBitStream, $pBitOffset, $RBitStream, $RBitBuffer, $RBitCount
+
+pBitStream  SETS "$RBitStream"
+BitBuffer   SETS "$RBitBuffer"
+BitCount    SETS "$RBitCount"        
+        
+        ;// load inputs
+        LDR     $pBitStream, [$ppBitStream]
+        LDR     $BitCount, [$pBitOffset]
+        MEND
+        
+        MACRO
+        M_BD_INIT1  $T1, $T2, $T3
+        LDRB    $T2, [$pBitStream, #2]
+        LDRB    $T1, [$pBitStream, #1]
+        LDRB    $BitBuffer,  [$pBitStream], #3
+        ADD     $BitCount, $BitCount, #8
+        MEND
+        
+        MACRO
+        M_BD_INIT2  $T1, $T2, $T3
+        ORR     $T2, $T2, $T1, LSL #8
+        ORR     $BitBuffer, $T2, $BitBuffer, LSL #16
+        MEND    
+        
+        ;//
+        ;// Look ahead fixed 1<=N<=8 bits without consuming any bits
+        ;// The next bits will be placed at bit 31..24 of destination register
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to look
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_LOOK8  $Symbol, $N
+        ASSERT  ($N>=1):LAND:($N<=8)
+        MOV     $Symbol, $BitBuffer, LSL $BitCount
+        MEND
+        
+        ;//
+        ;// Look ahead fixed 1<=N<=16 bits without consuming any bits
+        ;// The next bits will be placed at bit 31..16 of destination register
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to look
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_LOOK16  $Symbol, $N, $T1
+        ASSERT  ($N >= 1):LAND:($N <= 16)
+        MOV     $Symbol, $BitBuffer, LSL $BitCount
+        MEND
+        
+        ;//
+        ;// Skips fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_SKIP8 $N, $T1
+        ASSERT  ($N>=1):LAND:($N<=8)        
+        SUBS    $BitCount, $BitCount, #(8-$N)
+        LDRCSB  $T1, [$pBitStream], #1   
+        ADDCC   $BitCount, $BitCount, #8
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND
+        
+        
+        ;//
+        ;// Read fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_READ8 $Symbol, $N, $T1
+        ASSERT  ($N>=1):LAND:($N<=8)                
+        MOVS    $Symbol, $BitBuffer, LSL $BitCount        
+        SUBS    $BitCount, $BitCount, #(8-$N)
+        LDRCSB  $T1, [$pBitStream], #1   
+        ADDCC   $BitCount, $BitCount, #8
+        MOV     $Symbol, $Symbol, LSR #(32-$N)
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND
+
+        ;//
+        ;// Read fixed 1<=N<=16 bits from the bitstream, advancing the bitstream pointer
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_READ16 $Symbol, $N, $T1, $T2
+        ASSERT  ($N>=1):LAND:($N<=16)
+        ASSERT  $Symbol<>$T1
+        IF ($N<=8)
+            M_BD_READ8  $Symbol, $N, $T1
+        ELSE        
+            ;// N>8 so we will be able to refill at least one byte            
+            LDRB    $T1, [$pBitStream], #1            
+            MOVS    $Symbol, $BitBuffer, LSL $BitCount
+            ORR     $BitBuffer, $T1, $BitBuffer, LSL #8                       
+            SUBS    $BitCount, $BitCount, #(16-$N)
+            LDRCSB  $T1, [$pBitStream], #1            
+            MOV     $Symbol, $Symbol, LSR #(32-$N)
+            ADDCC   $BitCount, $BitCount, #8
+            ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        ENDIF
+        MEND
+        
+        ;//
+        ;// Skip variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits. 1<=N<=8
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VSKIP8 $N, $T1
+        ADD     $BitCount, $BitCount, $N
+        SUBS    $BitCount, $BitCount, #8
+        LDRCSB  $T1, [$pBitStream], #1        
+        ADDCC   $BitCount, $BitCount, #8
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND        
+        
+        ;//
+        ;// Skip variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits. 1<=N<=16
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VSKIP16 $N, $T1, $T2
+        ADD     $BitCount, $BitCount, $N
+        SUBS    $BitCount, $BitCount, #8
+        LDRCSB  $T1, [$pBitStream], #1        
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        SUBCSS  $BitCount, $BitCount, #8        
+        LDRCSB  $T1, [$pBitStream], #1
+        ADDCC   $BitCount, $BitCount, #8
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND        
+
+        ;//
+        ;// Read variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read. 1<=N<=8
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VREAD8 $Symbol, $N, $T1, $T2
+        MOV     $Symbol, $BitBuffer, LSL $BitCount        
+        ADD     $BitCount, $BitCount, $N
+        SUBS    $BitCount, $BitCount, #8
+        LDRCSB  $T1, [$pBitStream], #1        
+        RSB     $T2, $N, #32        
+        ADDCC   $BitCount, $BitCount, #8
+        MOV     $Symbol, $Symbol, LSR $T2
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND
+
+
+        ;//
+        ;// Read variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $N              - number of bits to read. 1<=N<=16
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the next N bits of the bitstream
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VREAD16 $Symbol, $N, $T1, $T2
+        MOV     $Symbol, $BitBuffer, LSL $BitCount        
+        ADD     $BitCount, $BitCount, $N
+        SUBS    $BitCount, $BitCount, #8
+        LDRCSB  $T1, [$pBitStream], #1        
+        RSB     $T2, $N, #32        
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        SUBCSS  $BitCount, $BitCount, #8        
+        LDRCSB  $T1, [$pBitStream], #1
+        ADDCC   $BitCount, $BitCount, #8
+        MOV     $Symbol, $Symbol, LSR $T2
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND
+
+
+        ;//
+        ;// Decode a code of the form 0000...001 where there
+        ;// are N zeros before the 1 and N<=15 (code length<=16)
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the number of zeros before the next 1
+        ;//                   >=16 is an illegal code
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//        
+        MACRO
+        M_BD_CLZ16 $Symbol, $T1, $T2
+        MOVS    $Symbol, $BitBuffer, LSL $BitCount
+        CLZ     $Symbol, $Symbol                
+        ADD     $BitCount, $BitCount, $Symbol
+        SUBS    $BitCount, $BitCount, #7        ;// length is Symbol+1
+        LDRCSB  $T1, [$pBitStream], #1
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        SUBCSS  $BitCount, $BitCount, #8        
+        LDRCSB  $T1, [$pBitStream], #1
+        ADDCC   $BitCount, $BitCount, #8
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND  
+
+        ;//
+        ;// Decode a code of the form 1111...110 where there
+        ;// are N ones before the 0 and N<=15 (code length<=16)
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - the number of zeros before the next 1
+        ;//                   >=16 is an illegal code
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//        
+        MACRO
+        M_BD_CLO16 $Symbol, $T1, $T2
+        MOV     $Symbol, $BitBuffer, LSL $BitCount
+        MVN     $Symbol, $Symbol
+        CLZ     $Symbol, $Symbol                
+        ADD     $BitCount, $BitCount, $Symbol
+        SUBS    $BitCount, $BitCount, #7        ;// length is Symbol+1
+        LDRCSB  $T1, [$pBitStream], #1
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        SUBCSS  $BitCount, $BitCount, #8        
+        LDRCSB  $T1, [$pBitStream], #1
+        ADDCC   $BitCount, $BitCount, #8
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8
+        MEND  
+
+
+        ;//
+        ;// Variable Length Decode module
+        ;//
+        ;// Decodes one VLD Symbol from a bitstream and refill the bitstream
+        ;// buffer.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pVLDTable      - pointer to VLD decode table of 16-bit entries.
+        ;//                   The format is described above at the start of
+        ;//                   this file.
+        ;// $S0             - The number of bits to look up for the first step
+        ;//                   1<=$S0<=8
+        ;// $S1             - The number of bits to look up for each subsequent
+        ;//                   step 1<=$S1<=$S0.
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// 
+        ;// Output Registers:
+        ;//
+        ;// $Symbol         - decoded VLD symbol value
+        ;// $T1             - corrupted temp/scratch register
+        ;// $T2             - corrupted temp/scratch register
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_VLD $Symbol, $T1, $T2, $pVLDTable, $S0, $S1
+        ASSERT (1<=$S0):LAND:($S0<=8)
+        ASSERT (1<=$S1):LAND:($S1<=$S0)
+        
+        ;// Note 0<=BitCount<=15 on entry and exit
+        
+        MOVS    $T1, $BitBuffer, LSL $BitCount       ;// left align next bits
+        MOVS    $Symbol, #(2<<$S0)-2                 ;// create mask
+        AND     $Symbol, $Symbol, $T1, LSR #(31-$S0) ;// 2*(next $S0 bits)
+        SUBS    $BitCount, $BitCount, #8             ;// CS if buffer can be filled
+01
+        LDRCSB  $T1, [$pBitStream], #1               ;// load refill byte
+        LDRH    $Symbol, [$pVLDTable, $Symbol]       ;// load table entry
+        ADDCC   $BitCount, $BitCount, #8             ;// refill not possible
+        ADD     $BitCount, $BitCount, #$S0           ;// assume $S0 bits used
+        ORRCS   $BitBuffer, $T1, $BitBuffer, LSL #8  ;// merge in refill byte
+        MOVS    $T1, $Symbol, LSR #1                 ;// CS=leaf entry
+        BCS     %FT02
+        
+        MOVS    $T1, $BitBuffer, LSL $BitCount       ;// left align next bit
+        IF (2*$S0-$S1<=8)
+            ;// Can combine refill check and -S0+S1 and keep $BitCount<=15
+            SUBS    $BitCount, $BitCount, #8+($S0-$S1)
+        ELSE
+            ;// Separate refill check and -S0+S1 offset
+            SUBS  $BitCount, $BitCount, #8
+            SUB   $BitCount, $BitCount, #($S0-$S1)
+        ENDIF
+        ADD     $Symbol, $Symbol, $T1, LSR #(31-$S1) ;// add 2*(next $S1 bits) to
+        BIC     $Symbol, $Symbol, #1                 ;//   table offset
+        B       %BT01                                ;// load next table entry
+02
+        ;// BitCount range now depend on the route here
+        ;// if (first step)       S0 <= BitCount <= 7+S0        <=15
+        ;// else if (2*S0-S1<=8)  S0 <= BitCount <= 7+(2*S0-S1) <=15
+        ;// else                  S1 <= BitCount <= 7+S1        <=15
+        
+        SUB     $BitCount, $BitCount, $Symbol, LSR#13
+        BIC     $Symbol, $T1, #0xF000
+        MEND
+        
+
+        ;// Add an offset number of bits
+        ;//
+        ;// Outputs destination byte and bit index values which corresponds to an offset number of bits 
+        ;// from the current location. This is used to compare bitstream positions using. M_BD_CMP.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $Offset         - Offset to be added in bits.
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $ByteIndex      - Destination pBitStream pointer after adding the Offset. 
+        ;//                   This value will be 4 byte ahead and needs to subtract by 4 to get exact 
+        ;//                   pointer (as in M_BD_FINI). But for using with M_BD_CMP subtract is not needed.
+        ;// $BitIndex       - Destination BitCount after the addition of Offset number of bits
+        ;//
+        MACRO
+        M_BD_ADD  $ByteIndex, $BitIndex, $Offset
+
+        ;// ($ByteIndex,$BitIndex) = Current position + $Offset bits
+        ADD     $Offset, $Offset, $BitCount
+        AND     $BitIndex, $Offset, #7
+        ADD     $ByteIndex, $pBitStream, $Offset, ASR #3        
+        MEND
+
+        ;// Move bitstream pointers to the location given
+        ;//
+        ;// Outputs destination byte and bit index values which corresponds to  
+        ;// the current location given (calculated using M_BD_ADD). 
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;// $ByteIndex      - Destination pBitStream pointer after move. 
+        ;//                   This value will be 4 byte ahead and needs to subtract by 4 to get exact 
+        ;//                   pointer (as in M_BD_FINI).
+        ;// $BitIndex       - Destination BitCount after the move
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;//                  } See description above.  
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_MOV  $ByteIndex, $BitIndex
+
+        ;// ($pBitStream, $Offset) = ($ByteIndex,$BitIndex)
+        MOV     $BitCount, $BitIndex
+        MOV     $pBitStream, $ByteIndex
+        MEND
+
+        ;// Bitstream Compare
+        ;//
+        ;// Compares bitstream position with that of a destination position. Destination position 
+        ;// is held in two input registers which are calculated using M_BD_ADD macro
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $ByteIndex      - Destination pBitStream pointer, (4 byte ahead as described in M_BD_ADD)
+        ;// $BitIndex       - Destination BitCount
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// FLAGS           - GE if destination is reached, LT = is destination is ahead
+        ;// $T1             - corrupted temp/scratch register
+        ;//
+        MACRO
+        M_BD_CMP  $ByteIndex, $BitIndex, $T1
+        
+        ;// Return flags set by (current positon)-($ByteIndex,$BitIndex)
+        ;// so GE means that we have reached the indicated position
+
+        ADD         $T1, $pBitStream, $BitCount, LSR #3
+        CMP         $T1, $ByteIndex
+        AND         $T1, $BitCount, #7
+        CMPEQ       $T1, $BitIndex        
+        MEND
+
+        
+        ;// Bitstream Decode finalise
+        ;//
+        ;// Writes back the bitstream state to the bitstream pointers
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } See description above.
+        ;// $BitCount       / 
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $ppBitStream    - pointer to pointer to the next bitstream byte
+        ;// $pBitOffset     - pointer to the number of bits used in the current byte (0..7)
+        ;// $pBitStream     \ 
+        ;// $BitBuffer       } these register are corrupted
+        ;// $BitCount       / 
+        ;//
+        MACRO
+        M_BD_FINI  $ppBitStream, $pBitOffset
+        
+        ;// Advance pointer by the number of free bits in the buffer
+        ADD     $pBitStream, $pBitStream, $BitCount, LSR#3
+        AND     $BitCount, $BitCount, #7
+        
+        ;// Now move back 32 bits to reach the first usued bit
+        SUB     $pBitStream, $pBitStream, #4
+        
+        ;// Store out bitstream state
+        STR     $BitCount, [$pBitOffset]
+        STR     $pBitStream, [$ppBitStream]
+        MEND
+        
+        END
+        
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
new file mode 100755
index 0000000..b699034
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
@@ -0,0 +1,212 @@
+/**
+ * 
+ * File Name:  armCOMM_Bitstream.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File: armCOMM_Bitstream.h
+ * Brief: Declares common API's/Data types used across the OpenMax Encoders/Decoders.
+ *
+ */
+
+#ifndef _armCodec_H_
+#define _armCodec_H_
+
+#include "omxtypes.h"
+
+typedef struct {
+    OMX_U8   codeLen;
+    OMX_U32	 codeWord;
+} ARM_VLC32;
+
+/* The above should be renamed as "ARM_VLC32" */
+
+/**
+ * Function: armLookAheadBits()
+ *
+ * Description:
+ * Get the next N bits from the bitstream without advancing the bitstream pointer
+ *
+ * Parameters:
+ * [in]     **ppBitStream
+ * [in]     *pOffset
+ * [in]     N=1...32
+ *
+ * Returns  Value
+ */
+
+OMX_U32 armLookAheadBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armGetBits()
+ *
+ * Description:
+ * Read N bits from the bitstream
+ *    
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ * [in]     N=1..32
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ * Returns  Value
+ */
+
+OMX_U32 armGetBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armByteAlign()
+ *
+ * Description:
+ * Align the pointer *ppBitStream to the next byte boundary
+ *
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ *
+ **/
+ 
+OMXVoid armByteAlign(const OMX_U8 **ppBitStream,OMX_INT *pOffset);
+
+/** 
+ * Function: armSkipBits()
+ *
+ * Description:
+ * Skip N bits from the value at *ppBitStream
+ *
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ * [in]     N
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ *
+ **/
+
+OMXVoid armSkipBits(const OMX_U8 **ppBitStream,OMX_INT *pOffset,OMX_INT N);
+
+/***************************************
+ * Variable bit length Decode
+ ***************************************/
+
+/**
+ * Function: armUnPackVLC32()
+ *
+ * Description:
+ * Variable length decode of variable length symbol (max size 32 bits) read from
+ * the bit stream pointed by *ppBitStream at *pOffset by using the table
+ * pointed by pCodeBook
+ * 
+ * Parameters:
+ * [in]     **ppBitStream
+ * [in]     *pOffset
+ * [in]     pCodeBook
+ * 
+ * [out]    **ppBitStream
+ * [out]    *pOffset
+ *
+ * Returns : Code Book Index if successfull. 
+ *         : "ARM_NO_CODEBOOK_INDEX = 0xFFFF" if search fails.
+ **/
+
+#define ARM_NO_CODEBOOK_INDEX (OMX_U16)(0xFFFF)
+
+OMX_U16 armUnPackVLC32(
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pOffset,
+    const ARM_VLC32 *pCodeBook
+);
+
+/***************************************
+ * Fixed bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackBits
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte 
+ *                      in the bit stream.
+ * [in]	pOffset	        pointer to the bit position in the byte 
+ *                      pointed by *ppBitStream. Valid within 0
+ *                      to 7.
+ * [in]	codeWord		Code word that need to be inserted in to the
+ *                          bitstream
+ * [in]	codeLength		Length of the code word valid range 1...32
+ *
+ * [out] ppBitStream	*ppBitStream is updated after the block is encoded,
+ *	                        so that it points to the current byte in the bit
+ *							stream buffer.
+ * [out] pBitOffset		*pBitOffset is updated so that it points to the
+ *							current bit position in the byte pointed by
+ *							*ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+ 
+OMXResult armPackBits (
+    OMX_U8  **ppBitStream, 
+    OMX_INT *pOffset,
+    OMX_U32 codeWord, 
+    OMX_INT codeLength 
+);
+ 
+/***************************************
+ * Variable bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackVLC32
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte 
+ *                      in the bit stream.
+ * [in]	pBitOffset	    pointer to the bit position in the byte 
+ *                      pointed by *ppBitStream. Valid within 0
+ *                      to 7.
+ * [in]	 code     		VLC code word that need to be inserted in to the
+ *                      bitstream
+ *
+ * [out] ppBitStream	*ppBitStream is updated after the block is encoded,
+ *	                    so that it points to the current byte in the bit
+ *						stream buffer.
+ * [out] pBitOffset		*pBitOffset is updated so that it points to the
+ *						current bit position in the byte pointed by
+ *						*ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+ 
+OMXResult armPackVLC32 (
+    OMX_U8 **ppBitStream, 
+    OMX_INT *pBitOffset,
+    ARM_VLC32 code 
+);
+
+#endif      /*_armCodec_H_*/
+
+/*End of File*/
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
new file mode 100755
index 0000000..e0cfdaa
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
@@ -0,0 +1,40 @@
+/**
+ *
+ * 
+ * File Name:  armCOMM_IDCTTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File         : armCOMM_IDCTTable.h
+ * Description  : Contains declarations of tables for IDCT calculation.
+ *
+ */
+  
+#ifndef _armCOMM_IDCTTable_H_
+#define _armCOMM_IDCTTable_H_
+
+#include "omxtypes.h"
+
+     /*  Table of s(u)*A(u)*A(v)/16 at Q15
+      *  s(u)=1.0 0 <= u <= 5
+      *  s(6)=2.0
+      *  s(7)=4.0
+      *  A(0) = 2*sqrt(2)
+      *  A(u) = 4*cos(u*pi/16)  for (u!=0)
+	  */
+extern const OMX_U16 armCOMM_IDCTPreScale [64];
+extern const OMX_U16 armCOMM_IDCTCoef [4];
+
+#endif /* _armCOMM_IDCTTable_H_ */
+
+
+/* End of File */
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
new file mode 100755
index 0000000..0baa087
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
@@ -0,0 +1,1451 @@
+;//
+;// This confidential and proprietary software may be used only as
+;// authorised by a licensing agreement from ARM Limited
+;//   (C) COPYRIGHT 2004 ARM Limited
+;//       ALL RIGHTS RESERVED
+;// The entire notice above must be reproduced on all authorised
+;// copies and copies may only be made to the extent permitted
+;// by a licensing agreement from ARM Limited.
+;//
+;// IDCT_s.s
+;//
+;// Inverse DCT module
+;//
+;// 
+;// ALGORITHM DESCRIPTION
+;//
+;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
+;// column and then a 1D IDCT for each row.
+;//
+;// The 8-point 1D IDCT is defined by
+;//   f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
+;//
+;//   C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
+;//   c(u,x) = cos( (2x+1)*u*pi/16 )
+;//
+;// We compute the 8-point 1D IDCT using the reverse of
+;// the Arai-Agui-Nakajima flow graph which we split into
+;// 5 stages named in reverse order to identify with the
+;// forward DCT. Direct inversion of the forward formulae
+;// in file FDCT_s.s gives:
+;//
+;// IStage 5:   j(u) = T(u)*A(u)  [ A(u)=4*C(u)*c(u,0) ]
+;//             [ A(0) = 2*sqrt(2)
+;//               A(u) = 4*cos(u*pi/16)  for (u!=0) ]
+;//
+;// IStage 4:   i0 = j0             i1 = j4
+;//             i3 = (j2+j6)/2      i2 = (j2-j6)/2
+;//             i7 = (j5+j3)/2      i4 = (j5-j3)/2
+;//             i5 = (j1+j7)/2      i6 = (j1-j7)/2
+;//
+;// IStage 3:   h0 = (i0+i1)/2      h1 = (i0-i1)/2
+;//             h2 = (i2*sqrt2)-i3  h3 = i3
+;//             h4 =  cos(pi/8)*i4 + sin(pi/8)*i6
+;//             h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
+;//             [ The above two lines rotate by -(pi/8) ]
+;//             h5 = (i5-i7)/sqrt2  h7 = (i5+i7)/2 
+;//             
+;// IStage 2:   g0 = (h0+h3)/2      g3 = (h0-h3)/2
+;//             g1 = (h1+h2)/2      g2 = (h1-h2)/2
+;//             g7 = h7             g6 = h6 - h7
+;//             g5 = h5 - g6        g4 = h4 - g5
+;//
+;// IStage 1:   f0 = (g0+g7)/2      f7 = (g0-g7)/2
+;//             f1 = (g1+g6)/2      f6 = (g1-g6)/2
+;//             f2 = (g2+g5)/2      f5 = (g2-g5)/2
+;//             f3 = (g3+g4)/2      f4 = (g3-g4)/2
+;//
+;// Note that most coefficients are halved 3 times during the
+;// above calculation. We can rescale the algorithm dividing
+;// the input by 8 to remove the halvings.
+;//
+;// IStage 5:   j(u) = T(u)*A(u)/8
+;//
+;// IStage 4:   i0 = j0             i1 = j4
+;//             i3 = j2 + j6        i2 = j2 - j6
+;//             i7 = j5 + j3        i4 = j5 - j3
+;//             i5 = j1 + j7        i6 = j1 - j7
+;//
+;// IStage 3:   h0 = i0 + i1        h1 = i0 - i1
+;//             h2 = (i2*sqrt2)-i3  h3 = i3
+;//             h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
+;//             h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
+;//             h5 = (i5-i7)*sqrt2  h7 = i5 + i7 
+;//             
+;// IStage 2:   g0 = h0 + h3        g3 = h0 - h3
+;//             g1 = h1 + h2        g2 = h1 - h2
+;//             g7 = h7             g6 = h6 - h7
+;//             g5 = h5 - g6        g4 = h4 - g5
+;//
+;// IStage 1:   f0 = g0 + g7        f7 = g0 - g7
+;//             f1 = g1 + g6        f6 = g1 - g6
+;//             f2 = g2 + g5        f5 = g2 - g5
+;//             f3 = g3 + g4        f4 = g3 - g4
+;//
+;// Note:
+;// 1. The scaling by A(u)/8 can often be combined with inverse
+;//    quantization. The column and row scalings can be combined.
+;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
+;//    to the above code but is otherwise identical.
+;// 3. The rotation by -pi/8 can be peformed using three multiplies
+;//    Eg  c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
+;//       -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
+;// 4. If |T(u)|<=1 then from the IDCT definition,
+;//    |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
+;//            = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
+;//            = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
+;//            = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
+;//            = (approx)2.64
+;//    So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
+;//    The table below shows input patterns generating the maximum
+;//    value of |f(u)| for input in the range |T(x)|<=1. M=-1, P=+1
+;//    InputPattern      Max |f(x)|
+;//      PPPPPPPP        |f0| =  2.64
+;//      PPPMMMMM        |f1| =  2.64
+;//      PPMMMPPP        |f2| =  2.64
+;//      PPMMPPMM        |f3| =  2.64
+;//      PMMPPMMP        |f4| =  2.64
+;//      PMMPMMPM        |f5| =  2.64
+;//      PMPPMPMP        |f6| =  2.64
+;//      PMPMPMPM        |f7| =  2.64
+;//   Note that this input pattern is the transpose of the
+;//   corresponding max input patter for the FDCT.
+
+;// Arguments
+
+pSrc    RN 0    ;// source data buffer
+Stride  RN 1    ;// destination stride in bytes
+pDest   RN 2    ;// destination data buffer
+pScale  RN 3    ;// pointer to scaling table
+
+
+        ;// DCT Inverse Macro
+        ;// The DCT code should be parametrized according
+        ;// to the following inputs:
+        ;// $outsize = "u8"  :  8-bit unsigned data saturated (0 to +255)
+        ;//            "s9"  : 16-bit signed data saturated to 9-bit (-256 to +255)
+        ;//            "s16" : 16-bit signed data not saturated (max size ~+/-14273)
+        ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
+        ;//            "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
+        ;//
+        ;// Inputs:
+        ;// pSrc   = r0 = Pointer to input data
+        ;//               Range is -256 to +255 (9-bit)
+        ;// Stride = r1 = Stride between input lines
+        ;// pDest  = r2 = Pointer to output data
+        ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
+        
+        
+        
+        MACRO
+        M_IDCT  $outsize, $inscale, $stride
+        LCLA    SHIFT
+        
+        
+        IF ARM1136JS
+        
+;// REGISTER ALLOCATION
+;// This is hard since we have 8 values, 9 free registers and each
+;// butterfly requires a temporary register. We also want to 
+;// maintain register order so we can use LDM/STM. The table below
+;// summarises the register allocation that meets all these criteria.
+;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
+;//
+;// r1  a01     g0  h0
+;// r4  b01 f0  g1  h1  i0
+;// r5  a23 f1  g2      i1
+;// r6  b23 f2  g3  h2  i2
+;// r7  a45 f3      h3  i3
+;// r8  b45 f4  g4  h4  i4
+;// r9  a67 f5  g5  h5  i5
+;// r10 b67 f6  g6  h6  i6
+;// r11     f7  g7  h7  i7
+;//
+ra01    RN 1
+rb01    RN 4
+ra23    RN 5
+rb23    RN 6
+ra45    RN 7
+rb45    RN 8
+ra67    RN 9
+rb67    RN 10
+rtmp    RN 11
+csPiBy8 RN 12   ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
+LoopRR2 RN 14   ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
+;// Transpose allocation
+xft     RN ra01
+xf0     RN rb01
+xf1     RN ra23
+xf2     RN rb23
+xf3     RN ra45
+xf4     RN rb45
+xf5     RN ra67
+xf6     RN rb67
+xf7     RN rtmp
+;// IStage 1 allocation
+xg0     RN xft
+xg1     RN xf0
+xg2     RN xf1
+xg3     RN xf2
+xgt     RN xf3
+xg4     RN xf4
+xg5     RN xf5
+xg6     RN xf6
+xg7     RN xf7
+;// IStage 2 allocation
+xh0     RN xg0
+xh1     RN xg1
+xht     RN xg2
+xh2     RN xg3
+xh3     RN xgt
+xh4     RN xg4
+xh5     RN xg5
+xh6     RN xg6
+xh7     RN xg7
+;// IStage 3,4 allocation
+xit     RN xh0
+xi0     RN xh1
+xi1     RN xht
+xi2     RN xh2
+xi3     RN xh3
+xi4     RN xh4
+xi5     RN xh5
+xi6     RN xh6
+xi7     RN xh7
+        
+        M_STR   pDest,  ppDest
+        IF "$stride"="s"
+            M_STR   Stride, pStride
+        ENDIF
+        M_ADR   pDest,  pBlk
+        LDR     csPiBy8, =0x30fc7642
+        LDR     LoopRR2, =0x00005a82
+  
+v6_idct_col$_F
+        ;// Load even values
+        LDR     xi4, [pSrc], #4  ;// j0
+        LDR     xi5, [pSrc, #4*16-4]  ;// j4
+        LDR     xi6, [pSrc, #2*16-4]  ;// j2
+        LDR     xi7, [pSrc, #6*16-4]  ;// j6
+        
+        ;// Scale Even Values
+        IF "$inscale"="s16" ;// 16x16 mul
+SHIFT       SETA    12
+            LDR     xi0, [pScale], #4
+            LDR     xi1, [pScale, #4*16-4]        
+            LDR     xi2, [pScale, #2*16-4]
+            MOV     xit, #1<<(SHIFT-1)
+            SMLABB  xi3, xi0, xi4, xit
+            SMLATT  xi4, xi0, xi4, xit
+            SMLABB  xi0, xi1, xi5, xit
+            SMLATT  xi5, xi1, xi5, xit
+            MOV     xi3, xi3, ASR #SHIFT
+            PKHBT   xi4, xi3, xi4, LSL #(16-SHIFT)
+            LDR     xi3, [pScale, #6*16-4]
+            SMLABB  xi1, xi2, xi6, xit
+            SMLATT  xi6, xi2, xi6, xit
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi5, xi0, xi5, LSL #(16-SHIFT)
+            SMLABB  xi2, xi3, xi7, xit
+            SMLATT  xi7, xi3, xi7, xit
+            MOV     xi1, xi1, ASR #SHIFT
+            PKHBT   xi6, xi1, xi6, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT
+            PKHBT   xi7, xi2, xi7, LSL #(16-SHIFT)
+        ENDIF
+        IF "$inscale"="s32" ;// 32x16 mul
+SHIFT       SETA    (12+8-16)
+            MOV     xit, #1<<(SHIFT-1)
+            LDR     xi0, [pScale], #8
+            LDR     xi1, [pScale, #0*32+4-8]
+            LDR     xi2, [pScale, #4*32-8]
+            LDR     xi3, [pScale, #4*32+4-8]            
+            SMLAWB  xi0, xi0, xi4, xit
+            SMLAWT  xi1, xi1, xi4, xit
+            SMLAWB  xi2, xi2, xi5, xit
+            SMLAWT  xi3, xi3, xi5, xit            
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi4, xi0, xi1, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT            
+            PKHBT   xi5, xi2, xi3, LSL #(16-SHIFT)
+            LDR     xi0, [pScale, #2*32-8]
+            LDR     xi1, [pScale, #2*32+4-8]
+            LDR     xi2, [pScale, #6*32-8]
+            LDR     xi3, [pScale, #6*32+4-8]            
+            SMLAWB  xi0, xi0, xi6, xit
+            SMLAWT  xi1, xi1, xi6, xit
+            SMLAWB  xi2, xi2, xi7, xit
+            SMLAWT  xi3, xi3, xi7, xit            
+            MOV     xi0, xi0, ASR #SHIFT
+            PKHBT   xi6, xi0, xi1, LSL #(16-SHIFT)
+            MOV     xi2, xi2, ASR #SHIFT            
+            PKHBT   xi7, xi2, xi3, LSL #(16-SHIFT)
+        ENDIF
+                
+        ;// Load odd values
+        LDR     xi0, [pSrc, #1*16-4]      ;// j1
+        LDR     xi1, [pSrc, #7*16-4]      ;// j7
+        LDR     xi2, [pSrc, #5*16-4]      ;// j5
+        LDR     xi3, [pSrc, #3*16-4]      ;// j3
+        
+        IF  {TRUE}
+            ;// shortcut if odd values 0
+            TEQ     xi0, #0
+            TEQEQ   xi1, #0
+            TEQEQ   xi2, #0
+            TEQEQ   xi3, #0
+            BEQ     v6OddZero$_F
+        ENDIF
+        
+        ;// Store scaled even values
+        STMIA   pDest, {xi4, xi5, xi6, xi7}
+        
+        ;// Scale odd values
+        IF "$inscale"="s16"
+            ;// Perform AAN Scale
+            LDR     xi4, [pScale, #1*16-4]
+            LDR     xi5, [pScale, #7*16-4]        
+            LDR     xi6, [pScale, #5*16-4]
+            SMLABB  xi7, xi0, xi4, xit
+            SMLATT  xi0, xi0, xi4, xit
+            SMLABB  xi4, xi1, xi5, xit
+            SMLATT  xi1, xi1, xi5, xit
+            MOV     xi7, xi7, ASR #SHIFT
+            PKHBT   xi0, xi7, xi0, LSL #(16-SHIFT)
+            LDR     xi7, [pScale, #3*16-4]
+            SMLABB  xi5, xi2, xi6, xit
+            SMLATT  xi2, xi2, xi6, xit
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi1, xi4, xi1, LSL #(16-SHIFT)
+            SMLABB  xi6, xi3, xi7, xit
+            SMLATT  xi3, xi3, xi7, xit
+            MOV     xi5, xi5, ASR #SHIFT
+            PKHBT   xi2, xi5, xi2, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT
+            PKHBT   xi3, xi6, xi3, LSL #(16-SHIFT)
+        ENDIF
+        IF "$inscale"="s32" ;// 32x16 mul
+            LDR     xi4, [pScale, #1*32-8]
+            LDR     xi5, [pScale, #1*32+4-8]
+            LDR     xi6, [pScale, #7*32-8]
+            LDR     xi7, [pScale, #7*32+4-8]            
+            SMLAWB  xi4, xi4, xi0, xit
+            SMLAWT  xi5, xi5, xi0, xit
+            SMLAWB  xi6, xi6, xi1, xit
+            SMLAWT  xi7, xi7, xi1, xit            
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi0, xi4, xi5, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT            
+            PKHBT   xi1, xi6, xi7, LSL #(16-SHIFT)
+            LDR     xi4, [pScale, #5*32-8]
+            LDR     xi5, [pScale, #5*32+4-8]
+            LDR     xi6, [pScale, #3*32-8]
+            LDR     xi7, [pScale, #3*32+4-8]            
+            SMLAWB  xi4, xi4, xi2, xit
+            SMLAWT  xi5, xi5, xi2, xit
+            SMLAWB  xi6, xi6, xi3, xit
+            SMLAWT  xi7, xi7, xi3, xit            
+            MOV     xi4, xi4, ASR #SHIFT
+            PKHBT   xi2, xi4, xi5, LSL #(16-SHIFT)
+            MOV     xi6, xi6, ASR #SHIFT            
+            PKHBT   xi3, xi6, xi7, LSL #(16-SHIFT)
+        ENDIF
+        
+        LDR     xit, =0x00010001        ;// rounding constant
+        SADD16 xi5, xi0, xi1           ;// (j1+j7)/2
+        SHADD16 xi5, xi5, xit
+        
+        SSUB16  xi6, xi0, xi1           ;// j1-j7
+        SADD16 xi7, xi2, xi3           ;// (j5+j3)/2
+        SHADD16 xi7, xi7, xit
+        
+        SSUB16  xi4, xi2, xi3           ;// j5-j3
+        
+        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
+        
+        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
+        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
+        
+        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
+        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
+        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]   
+        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
+                
+        SMULBB  xi1, xi3, LoopRR2
+        SMULTB  xi3, xi3, LoopRR2
+                
+        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
+        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
+        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
+                
+        ;// xi0,xi1,xi2,xi3 now free
+        ;// IStage 4,3, rows 2to3 x1/2
+        
+        MOV     xi3, xi3, LSL #1
+        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
+        LDRD    xi0, [pDest, #8]        ;// j2,j6 scaled
+                
+        ;// IStage 2, rows4to7
+        SSUB16  xg6, xh6, xh7
+        SSUB16  xg5, xh5, xg6        
+        SSUB16  xg4, xh4, xg5
+                
+        SSUB16  xi2, xi0, xi1           ;// (j2-j6)
+        
+        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        LDRD    xi0, [pDest]            ;// j0, j4 scaled
+        SSUB16  xh2, xh2, xi3
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        
+        SHADD16 xh0, xi0, xi1
+        SHSUB16 xh1, xi0, xi1                
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+        
+        ;// IStage 1 all rows
+        SADD16  xf3, xg3, xg4
+        SSUB16  xf4, xg3, xg4
+        SADD16  xf2, xg2, xg5
+        SSUB16  xf5, xg2, xg5
+        SADD16  xf1, xg1, xg6
+        SSUB16  xf6, xg1, xg6
+        SADD16  xf0, xg0, xg7
+        SSUB16  xf7, xg0, xg7
+        
+        ;// Transpose, store and loop
+        PKHBT   ra01, xf0, xf1, LSL #16
+        PKHTB   rb01, xf1, xf0, ASR #16
+        
+        PKHBT   ra23, xf2, xf3, LSL #16
+        PKHTB   rb23, xf3, xf2, ASR #16
+        
+        PKHBT   ra45, xf4, xf5, LSL #16
+        PKHTB   rb45, xf5, xf4, ASR #16
+        
+        PKHBT   ra67, xf6, xf7, LSL #16
+        STMIA   pDest!, {ra01, ra23, ra45, ra67}      
+        PKHTB   rb67, xf7, xf6, ASR #16
+        STMIA   pDest!, {rb01, rb23, rb45, rb67}                              
+        BCC     v6_idct_col$_F
+        
+        SUB     pSrc, pDest, #(64*2)
+        M_LDR   pDest, ppDest
+        IF "$stride"="s"
+            M_LDR   pScale, pStride 
+        ENDIF
+        B       v6_idct_row$_F
+        
+v6OddZero$_F
+        SSUB16  xi2, xi6, xi7           ;// (j2-j6)
+        SHADD16 xi3, xi6, xi7           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        SSUB16  xh2, xh2, xi3
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        
+        SHADD16 xh0, xi4, xi5
+        SHSUB16 xh1, xi4, xi5                
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+               
+        ;// IStage 1 all rows
+        MOV  xf3, xg3
+        MOV  xf4, xg3
+        MOV  xf2, xg2
+        MOV  xf5, xg2
+        MOV  xf1, xg1
+        MOV  xf6, xg1
+        MOV  xf0, xg0
+        MOV  xf7, xg0
+        
+        ;// Transpose
+        PKHBT   ra01, xf0, xf1, LSL #16
+        PKHTB   rb01, xf1, xf0, ASR #16
+        
+        PKHBT   ra23, xf2, xf3, LSL #16
+        PKHTB   rb23, xf3, xf2, ASR #16
+        
+        PKHBT   ra45, xf4, xf5, LSL #16
+        PKHTB   rb45, xf5, xf4, ASR #16
+        
+        PKHBT   ra67, xf6, xf7, LSL #16
+        PKHTB   rb67, xf7, xf6, ASR #16
+                
+        STMIA   pDest!, {ra01, ra23, ra45, ra67}      
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        STMIA   pDest!, {rb01, rb23, rb45, rb67}      
+        
+        BCC     v6_idct_col$_F
+        SUB     pSrc, pDest, #(64*2)
+        M_LDR   pDest, ppDest
+        IF "$stride"="s"
+            M_LDR   pScale, pStride 
+        ENDIF
+               
+        
+v6_idct_row$_F
+        ;// IStage 4,3, rows4to7 x1/4
+        LDR     xit, =0x00010001        ;// rounding constant
+        LDR     xi0, [pSrc, #1*16]      ;// j1
+        LDR     xi1, [pSrc, #7*16]      ;// 4*j7
+        LDR     xi2, [pSrc, #5*16]      ;// j5
+        LDR     xi3, [pSrc, #3*16]      ;// j3
+        
+        SHADD16 xi1, xi1, xit           ;// 2*j7
+        SHADD16 xi1, xi1, xit           ;// j7                
+        
+        SHADD16 xi5, xi0, xi1           ;// (j1+j7)/2
+        SSUB16  xi6, xi0, xi1           ;// j1-j7
+        SHADD16 xi7, xi2, xi3           ;// (j5+j3)/2
+        SSUB16  xi4, xi2, xi3           ;// j5-j3
+        
+        SSUB16  xi3, xi5, xi7           ;// (i5-i7)/2
+        
+        PKHBT   xi0, xi6, xi4, LSL#16   ;// [i4,i6] row a
+        PKHTB   xi1, xi4, xi6, ASR#16   ;// [i4,i6] row b
+        
+        SMUADX  xi2, xi0, csPiBy8       ;// rowa by [c,s]
+        SMUADX  xi4, xi1, csPiBy8       ;// rowb by [c,s]
+        SMUSD   xi0, xi0, csPiBy8       ;// rowa by [-s,c]   
+        SMUSD   xi6, xi1, csPiBy8       ;// rowb by [-s,c]
+                
+        SMULBB  xi1, xi3, LoopRR2
+        SMULTB  xi3, xi3, LoopRR2
+                
+        PKHTB   xh4, xi4, xi2, ASR#16   ;// h4/4
+        PKHTB   xh6, xi6, xi0, ASR#16   ;// h6/4
+        SHADD16 xh7, xi5, xi7           ;// (i5+i7)/4
+        
+        MOV     xi3, xi3, LSL #1
+        PKHTB   xh5, xi3, xi1, ASR#15   ;// h5/4
+               
+        ;// xi0,xi1,xi2,xi3 now free
+        ;// IStage 4,3, rows 2to3 x1/2
+        
+        LDR     xi0, [pSrc, #2*16]      ;// j2
+        LDR     xi1, [pSrc, #6*16]      ;// 2*j6
+        
+        ;// IStage 2, rows4to7
+        SSUB16  xg6, xh6, xh7
+        SSUB16  xg5, xh5, xg6
+        SSUB16  xg4, xh4, xg5
+        
+        SHADD16 xi1, xi1, xit           ;// j6
+        SSUB16  xi2, xi0, xi1           ;// (j2-j6)        
+        SHADD16 xi3, xi0, xi1           ;// (j2+j6)/2
+        
+        SMULBB  xi0, xi2, LoopRR2
+        SMULTB  xi2, xi2, LoopRR2
+        
+        MOV     xi2, xi2, LSL #1
+        
+        PKHTB   xh2, xi2, xi0, ASR#15   ;// i2*sqrt(2)/4
+        
+        ;// xi0, xi1 now free
+        ;// IStage 4,3 rows 0to1 x 1/2
+        LDR     xi1, [pSrc, #4*16]      ;// j4
+        LDR     xi0, [pSrc], #4         ;// j0
+
+        SSUB16  xh2, xh2, xi3
+        ADDS    LoopRR2, LoopRR2, #2<<29    ;// done two rows
+        
+        ADD     xi0, xi0, xit, LSL #2   ;// ensure correct round
+        SHADD16 xh0, xi0, xi1           ;// of DC result
+        SHSUB16 xh1, xi0, xi1
+                
+        ;// IStage 2 rows 0to3 x 1/2
+        SHSUB16 xg2, xh1, xh2
+        SHADD16 xg1, xh1, xh2
+        SHSUB16 xg3, xh0, xh3
+        SHADD16 xg0, xh0, xh3
+        
+        ;// IStage 1 all rows
+        SHADD16 xf3, xg3, xg4
+        SHSUB16 xf4, xg3, xg4
+        SHADD16 xf2, xg2, xg5
+        SHSUB16 xf5, xg2, xg5
+        SHADD16 xf1, xg1, xg6
+        SHSUB16 xf6, xg1, xg6
+        SHADD16 xf0, xg0, xg7
+        SHSUB16 xf7, xg0, xg7
+        
+        ;// Saturate
+        IF ("$outsize"="u8")
+            USAT16  xf0, #8, xf0
+            USAT16  xf1, #8, xf1
+            USAT16  xf2, #8, xf2
+            USAT16  xf3, #8, xf3
+            USAT16  xf4, #8, xf4
+            USAT16  xf5, #8, xf5
+            USAT16  xf6, #8, xf6
+            USAT16  xf7, #8, xf7        
+        ENDIF
+        IF ("$outsize"="s9")
+            SSAT16  xf0, #9, xf0
+            SSAT16  xf1, #9, xf1
+            SSAT16  xf2, #9, xf2
+            SSAT16  xf3, #9, xf3
+            SSAT16  xf4, #9, xf4
+            SSAT16  xf5, #9, xf5
+            SSAT16  xf6, #9, xf6
+            SSAT16  xf7, #9, xf7        
+        ENDIF
+        
+        ;// Transpose to Row, Pack and store
+        IF ("$outsize"="u8")
+            ORR     xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
+            ORR     xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
+            ORR     xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
+            ORR     xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
+            PKHBT   ra01, xf0, xf2, LSL #16
+            PKHTB   rb01, xf2, xf0, ASR #16
+            PKHBT   ra23, xf4, xf6, LSL #16
+            PKHTB   rb23, xf6, xf4, ASR #16
+            STMIA   pDest, {ra01, ra23}
+            IF "$stride"="s"
+                ADD     pDest, pDest, pScale
+                STMIA   pDest, {rb01, rb23}
+                ADD     pDest, pDest, pScale
+            ELSE                
+                ADD     pDest, pDest, #($stride)
+                STMIA   pDest, {rb01, rb23}
+                ADD     pDest, pDest, #($stride)
+            ENDIF
+        ENDIF
+        IF ("$outsize"="s9"):LOR:("$outsize"="s16")        
+            PKHBT   ra01, xf0, xf1, LSL #16
+            PKHTB   rb01, xf1, xf0, ASR #16
+        
+            PKHBT   ra23, xf2, xf3, LSL #16
+            PKHTB   rb23, xf3, xf2, ASR #16
+            
+            PKHBT   ra45, xf4, xf5, LSL #16
+            PKHTB   rb45, xf5, xf4, ASR #16
+            
+            PKHBT   ra67, xf6, xf7, LSL #16
+            PKHTB   rb67, xf7, xf6, ASR #16
+            
+            STMIA   pDest, {ra01, ra23, ra45, ra67}      
+            IF "$stride"="s"
+                ADD     pDest, pDest, pScale
+                STMIA   pDest, {rb01, rb23, rb45, rb67}      
+                ADD     pDest, pDest, pScale
+            ELSE                
+                ADD     pDest, pDest, #($stride)
+                STMIA   pDest, {rb01, rb23, rb45, rb67}      
+                ADD     pDest, pDest, #($stride)
+            ENDIF
+        ENDIF
+        
+        BCC     v6_idct_row$_F
+        ENDIF ;// ARM1136JS
+
+
+        IF CortexA8
+        
+Src0            EQU  7              
+Src1            EQU  8              
+Src2            EQU  9              
+Src3            EQU  10              
+Src4            EQU  11              
+Src5            EQU  12              
+Src6            EQU  13
+Src7            EQU  14
+Tmp             EQU  15
+
+qXj0            QN Src0.S16 
+qXj1            QN Src1.S16
+qXj2            QN Src2.S16
+qXj3            QN Src3.S16
+qXj4            QN Src4.S16
+qXj5            QN Src5.S16
+qXj6            QN Src6.S16
+qXj7            QN Src7.S16
+qXjt            QN Tmp.S16
+
+dXj0lo          DN (Src0*2).S16
+dXj0hi          DN (Src0*2+1).S16
+dXj1lo          DN (Src1*2).S16
+dXj1hi          DN (Src1*2+1).S16
+dXj2lo          DN (Src2*2).S16
+dXj2hi          DN (Src2*2+1).S16
+dXj3lo          DN (Src3*2).S16
+dXj3hi          DN (Src3*2+1).S16
+dXj4lo          DN (Src4*2).S16
+dXj4hi          DN (Src4*2+1).S16
+dXj5lo          DN (Src5*2).S16
+dXj5hi          DN (Src5*2+1).S16
+dXj6lo          DN (Src6*2).S16
+dXj6hi          DN (Src6*2+1).S16
+dXj7lo          DN (Src7*2).S16
+dXj7hi          DN (Src7*2+1).S16
+dXjtlo          DN (Tmp*2).S16
+dXjthi          DN (Tmp*2+1).S16
+
+qXi0            QN qXj0
+qXi1            QN qXj4
+qXi2            QN qXj2
+qXi3            QN qXj7
+qXi4            QN qXj5
+qXi5            QN qXjt
+qXi6            QN qXj1
+qXi7            QN qXj6
+qXit            QN qXj3
+
+dXi0lo          DN dXj0lo
+dXi0hi          DN dXj0hi
+dXi1lo          DN dXj4lo
+dXi1hi          DN dXj4hi
+dXi2lo          DN dXj2lo
+dXi2hi          DN dXj2hi
+dXi3lo          DN dXj7lo
+dXi3hi          DN dXj7hi
+dXi4lo          DN dXj5lo
+dXi4hi          DN dXj5hi
+dXi5lo          DN dXjtlo
+dXi5hi          DN dXjthi
+dXi6lo          DN dXj1lo
+dXi6hi          DN dXj1hi
+dXi7lo          DN dXj6lo
+dXi7hi          DN dXj6hi
+dXitlo          DN dXj3lo
+dXithi          DN dXj3hi
+
+qXh0            QN qXit
+qXh1            QN qXi0
+qXh2            QN qXi2
+qXh3            QN qXi3
+qXh4            QN qXi7
+qXh5            QN qXi5
+qXh6            QN qXi4
+qXh7            QN qXi1
+qXht            QN qXi6
+
+dXh0lo          DN dXitlo
+dXh0hi          DN dXithi
+dXh1lo          DN dXi0lo
+dXh1hi          DN dXi0hi
+dXh2lo          DN dXi2lo
+dXh2hi          DN dXi2hi
+dXh3lo          DN dXi3lo
+dXh3hi          DN dXi3hi
+dXh4lo          DN dXi7lo
+dXh4hi          DN dXi7hi
+dXh5lo          DN dXi5lo
+dXh5hi          DN dXi5hi
+dXh6lo          DN dXi4lo
+dXh6hi          DN dXi4hi
+dXh7lo          DN dXi1lo
+dXh7hi          DN dXi1hi
+dXhtlo          DN dXi6lo
+dXhthi          DN dXi6hi
+
+qXg0            QN qXh2
+qXg1            QN qXht
+qXg2            QN qXh1
+qXg3            QN qXh0
+qXg4            QN qXh4
+qXg5            QN qXh5
+qXg6            QN qXh6
+qXg7            QN qXh7
+qXgt            QN qXh3
+
+qXf0            QN qXg6
+qXf1            QN qXg5
+qXf2            QN qXg4
+qXf3            QN qXgt
+qXf4            QN qXg3
+qXf5            QN qXg2
+qXf6            QN qXg1
+qXf7            QN qXg0
+qXft            QN qXg7
+
+
+qXt0            QN 1.S32
+qXt1            QN 2.S32
+qT0lo           QN 1.S32         
+qT0hi           QN 2.S32         
+qT1lo           QN 3.S32         
+qT1hi           QN 4.S32         
+qScalelo        QN 5.S32        ;// used to read post scale values
+qScalehi        QN 6.S32
+qTemp0          QN 5.S32         
+qTemp1          QN 6.S32    
+
+
+Scale1          EQU 6
+Scale2          EQU 15
+qScale1         QN Scale1.S16     
+qScale2         QN Scale2.S16     
+dScale1lo       DN (Scale1*2).S16     
+dScale1hi       DN (Scale1*2+1).S16
+dScale2lo       DN (Scale2*2).S16     
+dScale2hi       DN (Scale2*2+1).S16
+
+dCoefs          DN 0.S16        ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
+InvSqrt2        DN dCoefs[0]    ;// 1/sqrt(2) in Q15
+S               DN dCoefs[1]    ;// Sin(PI/8) in Q15
+C               DN dCoefs[2]    ;// Cos(PI/8) in Q15
+
+pTemp           RN 12
+
+                
+        IMPORT  armCOMM_IDCTCoef
+                    
+        VLD1        {qXj0,qXj1}, [pSrc @64]!
+        VLD1        {qXj2,qXj3}, [pSrc @64]!
+        VLD1        {qXj4,qXj5}, [pSrc @64]!
+        VLD1        {qXj6,qXj7}, [pSrc @64]!
+        
+        ;// Load PreScale and multiply with Src
+        ;// IStage 4
+        
+        IF "$inscale"="s16"                         ;// 16X16 Mul
+            M_IDCT_PRESCALE16
+        ENDIF
+        
+        IF "$inscale"="s32"                         ;// 32X32 ,ul
+            M_IDCT_PRESCALE32
+        ENDIF
+
+        ;// IStage 3
+        VQDMULH     qXi2, qXi2, InvSqrt2            ;// i2/sqrt(2)
+        VHADD       qXh0, qXi0, qXi1                ;// (i0+i1)/2
+        VHSUB       qXh1, qXi0, qXi1                ;// (i0-i1)/2
+        VHADD       qXh7, qXi5, qXi7                ;// (i5+i7)/4
+        VSUB        qXh5, qXi5, qXi7                ;// (i5-i7)/2
+        VQDMULH     qXh5, qXh5, InvSqrt2            ;// h5/sqrt(2)
+        VSUB        qXh2, qXi2, qXi3                ;// h2, h3
+
+        VMULL       qXt0, dXi4lo, C                 ;// c*i4
+        VMLAL       qXt0, dXi6lo, S                 ;// c*i4+s*i6
+        VMULL       qXt1, dXi4hi, C
+        VMLAL       qXt1, dXi6hi, S
+        VSHRN       dXh4lo, qXt0, #16               ;// h4
+        VSHRN       dXh4hi, qXt1, #16
+        
+        VMULL       qXt0, dXi6lo, C                 ;// c*i6
+        VMLSL       qXt0, dXi4lo, S                 ;// -s*i4 + c*h6
+        VMULL       qXt1, dXi6hi, C
+        VMLSL       qXt1, dXi4hi, S
+        VSHRN       dXh6lo, qXt0, #16               ;// h6
+        VSHRN       dXh6hi, qXt1, #16
+        
+        ;// IStage 2
+        VSUB        qXg6, qXh6, qXh7
+        VSUB        qXg5, qXh5, qXg6
+        VSUB        qXg4, qXh4, qXg5
+        VHADD       qXg1, qXh1, qXh2        ;// (h1+h2)/2
+        VHSUB       qXg2, qXh1, qXh2        ;// (h1-h2)/2
+        VHADD       qXg0, qXh0, qXh3        ;// (h0+h3)/2
+        VHSUB       qXg3, qXh0, qXh3        ;// (h0-h3)/2
+
+        ;// IStage 1 all rows
+        VADD        qXf3, qXg3, qXg4        
+        VSUB        qXf4, qXg3, qXg4        
+        VADD        qXf2, qXg2, qXg5        
+        VSUB        qXf5, qXg2, qXg5        
+        VADD        qXf1, qXg1, qXg6
+        VSUB        qXf6, qXg1, qXg6        
+        VADD        qXf0, qXg0, qXg7
+        VSUB        qXf7, qXg0, qXg7      
+
+        ;// Transpose, store and loop
+XTR0            EQU Src5
+XTR1            EQU Tmp
+XTR2            EQU Src6
+XTR3            EQU Src7
+XTR4            EQU Src3
+XTR5            EQU Src0
+XTR6            EQU Src1
+XTR7            EQU Src2
+XTRt            EQU Src4
+                
+qA0             QN  XTR0.S32  ;// for XTRpose
+qA1             QN  XTR1.S32
+qA2             QN  XTR2.S32
+qA3             QN  XTR3.S32
+qA4             QN  XTR4.S32
+qA5             QN  XTR5.S32
+qA6             QN  XTR6.S32
+qA7             QN  XTR7.S32
+
+dB0             DN  XTR0*2+1      ;// for using VSWP
+dB1             DN  XTR1*2+1
+dB2             DN  XTR2*2+1
+dB3             DN  XTR3*2+1
+dB4             DN  XTR4*2
+dB5             DN  XTR5*2
+dB6             DN  XTR6*2
+dB7             DN  XTR7*2
+
+          
+        VTRN        qXf0, qXf1
+        VTRN        qXf2, qXf3
+        VTRN        qXf4, qXf5
+        VTRN        qXf6, qXf7
+        VTRN        qA0, qA2
+        VTRN        qA1, qA3
+        VTRN        qA4, qA6
+        VTRN        qA5, qA7        
+        VSWP        dB0, dB4
+        VSWP        dB1, dB5
+        VSWP        dB2, dB6
+        VSWP        dB3, dB7
+        
+
+qYj0            QN qXf0
+qYj1            QN qXf1
+qYj2            QN qXf2
+qYj3            QN qXf3
+qYj4            QN qXf4
+qYj5            QN qXf5
+qYj6            QN qXf6
+qYj7            QN qXf7
+qYjt            QN qXft
+
+dYj0lo          DN (XTR0*2).S16
+dYj0hi          DN (XTR0*2+1).S16
+dYj1lo          DN (XTR1*2).S16
+dYj1hi          DN (XTR1*2+1).S16
+dYj2lo          DN (XTR2*2).S16
+dYj2hi          DN (XTR2*2+1).S16
+dYj3lo          DN (XTR3*2).S16
+dYj3hi          DN (XTR3*2+1).S16
+dYj4lo          DN (XTR4*2).S16
+dYj4hi          DN (XTR4*2+1).S16
+dYj5lo          DN (XTR5*2).S16
+dYj5hi          DN (XTR5*2+1).S16
+dYj6lo          DN (XTR6*2).S16
+dYj6hi          DN (XTR6*2+1).S16
+dYj7lo          DN (XTR7*2).S16
+dYj7hi          DN (XTR7*2+1).S16
+dYjtlo          DN (XTRt*2).S16
+dYjthi          DN (XTRt*2+1).S16
+
+qYi0            QN qYj0
+qYi1            QN qYj4
+qYi2            QN qYj2
+qYi3            QN qYj7
+qYi4            QN qYj5
+qYi5            QN qYjt
+qYi6            QN qYj1
+qYi7            QN qYj6
+qYit            QN qYj3
+
+dYi0lo          DN dYj0lo
+dYi0hi          DN dYj0hi
+dYi1lo          DN dYj4lo
+dYi1hi          DN dYj4hi
+dYi2lo          DN dYj2lo
+dYi2hi          DN dYj2hi
+dYi3lo          DN dYj7lo
+dYi3hi          DN dYj7hi
+dYi4lo          DN dYj5lo
+dYi4hi          DN dYj5hi
+dYi5lo          DN dYjtlo
+dYi5hi          DN dYjthi
+dYi6lo          DN dYj1lo
+dYi6hi          DN dYj1hi
+dYi7lo          DN dYj6lo
+dYi7hi          DN dYj6hi
+dYitlo          DN dYj3lo
+dYithi          DN dYj3hi
+
+qYh0            QN qYit
+qYh1            QN qYi0
+qYh2            QN qYi2
+qYh3            QN qYi3
+qYh4            QN qYi7
+qYh5            QN qYi5
+qYh6            QN qYi4
+qYh7            QN qYi1
+qYht            QN qYi6
+
+dYh0lo          DN dYitlo
+dYh0hi          DN dYithi
+dYh1lo          DN dYi0lo
+dYh1hi          DN dYi0hi
+dYh2lo          DN dYi2lo
+dYh2hi          DN dYi2hi
+dYh3lo          DN dYi3lo
+dYh3hi          DN dYi3hi
+dYh4lo          DN dYi7lo
+dYh4hi          DN dYi7hi
+dYh5lo          DN dYi5lo
+dYh5hi          DN dYi5hi
+dYh6lo          DN dYi4lo
+dYh6hi          DN dYi4hi
+dYh7lo          DN dYi1lo
+dYh7hi          DN dYi1hi
+dYhtlo          DN dYi6lo
+dYhthi          DN dYi6hi
+
+qYg0            QN qYh2
+qYg1            QN qYht
+qYg2            QN qYh1
+qYg3            QN qYh0
+qYg4            QN qYh4
+qYg5            QN qYh5
+qYg6            QN qYh6
+qYg7            QN qYh7
+qYgt            QN qYh3
+
+qYf0            QN qYg6
+qYf1            QN qYg5
+qYf2            QN qYg4
+qYf3            QN qYgt
+qYf4            QN qYg3
+qYf5            QN qYg2
+qYf6            QN qYg1
+qYf7            QN qYg0
+qYft            QN qYg7
+
+        VRSHR       qYj7, qYj7, #2
+        VRSHR       qYj6, qYj6, #1
+        
+        VHADD       qYi5, qYj1, qYj7        ;// i5 = (j1+j7)/2
+        VSUB        qYi6, qYj1, qYj7        ;// i6 = j1-j7
+        VHADD       qYi3, qYj2, qYj6        ;// i3 = (j2+j6)/2
+        VSUB        qYi2, qYj2, qYj6        ;// i2 = j2-j6
+        VHADD       qYi7, qYj5, qYj3        ;// i7 = (j5+j3)/2
+        VSUB        qYi4, qYj5, qYj3        ;// i4 = j5-j3
+
+        VQDMULH     qYi2, qYi2, InvSqrt2    ;// i2/sqrt(2)
+        ;// IStage 4,3 rows 0to1 x 1/2
+        
+        MOV         pTemp, #0x4             ;// ensure correct round
+        VDUP        qScale1, pTemp           ;// of DC result
+        VADD        qYi0, qYi0, qScale1
+        
+        VHADD       qYh0, qYi0, qYi1        ;// (i0+i1)/2
+        VHSUB       qYh1, qYi0, qYi1        ;// (i0-i1)/2
+
+        VHADD       qYh7, qYi5, qYi7        ;// (i5+i7)/4
+        VSUB        qYh5, qYi5, qYi7        ;// (i5-i7)/2
+        VSUB        qYh2, qYi2, qYi3        ;// h2, h3
+        VQDMULH     qYh5, qYh5, InvSqrt2    ;// h5/sqrt(2)
+
+        VMULL       qXt0, dYi4lo, C         ;// c*i4
+        VMLAL       qXt0, dYi6lo, S         ;// c*i4+s*i6
+        VMULL       qXt1, dYi4hi, C
+        VMLAL       qXt1, dYi6hi, S
+        VSHRN       dYh4lo, qXt0, #16       ;// h4
+        VSHRN       dYh4hi, qXt1, #16
+        
+        VMULL       qXt0, dYi6lo, C         ;// c*i6
+        VMLSL       qXt0, dYi4lo, S         ;// -s*i4 + c*h6
+        VMULL       qXt1, dYi6hi, C
+        VMLSL       qXt1, dYi4hi, S
+        VSHRN       dYh6lo, qXt0, #16       ;// h6
+        VSHRN       dYh6hi, qXt1, #16
+        
+        VSUB        qYg6, qYh6, qYh7
+        VSUB        qYg5, qYh5, qYg6
+        VSUB        qYg4, qYh4, qYg5
+        
+        ;// IStage 2 rows 0to3 x 1/2
+        VHADD       qYg1, qYh1, qYh2        ;// (h1+h2)/2
+        VHSUB       qYg2, qYh1, qYh2        ;// (h1-h2)/2
+        VHADD       qYg0, qYh0, qYh3        ;// (h0+h3)/2
+        VHSUB       qYg3, qYh0, qYh3        ;// (h0-h3)/2
+        
+
+        ;// IStage 1 all rows
+        VHADD        qYf3, qYg3, qYg4        
+        VHSUB        qYf4, qYg3, qYg4        
+        VHADD        qYf2, qYg2, qYg5        
+        VHSUB        qYf5, qYg2, qYg5        
+        VHADD        qYf1, qYg1, qYg6
+        VHSUB        qYf6, qYg1, qYg6        
+        VHADD        qYf0, qYg0, qYg7
+        VHSUB        qYf7, qYg0, qYg7      
+
+YTR0            EQU Src0
+YTR1            EQU Src4
+YTR2            EQU Src1
+YTR3            EQU Src2
+YTR4            EQU Src7
+YTR5            EQU Src5
+YTR6            EQU Tmp
+YTR7            EQU Src6
+YTRt            EQU Src3
+
+qC0             QN  YTR0.S32                ;// for YTRpose
+qC1             QN  YTR1.S32
+qC2             QN  YTR2.S32
+qC3             QN  YTR3.S32
+qC4             QN  YTR4.S32
+qC5             QN  YTR5.S32
+qC6             QN  YTR6.S32
+qC7             QN  YTR7.S32
+
+dD0             DN  YTR0*2+1                ;// for using VSWP
+dD1             DN  YTR1*2+1
+dD2             DN  YTR2*2+1
+dD3             DN  YTR3*2+1
+dD4             DN  YTR4*2
+dD5             DN  YTR5*2
+dD6             DN  YTR6*2
+dD7             DN  YTR7*2
+          
+        VTRN        qYf0, qYf1
+        VTRN        qYf2, qYf3
+        VTRN        qYf4, qYf5
+        VTRN        qYf6, qYf7
+        VTRN        qC0, qC2
+        VTRN        qC1, qC3
+        VTRN        qC4, qC6
+        VTRN        qC5, qC7        
+        VSWP        dD0, dD4
+        VSWP        dD1, dD5
+        VSWP        dD2, dD6
+        VSWP        dD3, dD7
+
+        
+dYf0U8          DN YTR0*2.U8
+dYf1U8          DN YTR1*2.U8
+dYf2U8          DN YTR2*2.U8
+dYf3U8          DN YTR3*2.U8
+dYf4U8          DN YTR4*2.U8
+dYf5U8          DN YTR5*2.U8
+dYf6U8          DN YTR6*2.U8
+dYf7U8          DN YTR7*2.U8
+        
+        ;//
+        ;// Do saturation if outsize is other than S16
+        ;//
+        
+        IF ("$outsize"="u8")
+            ;// Output range [0-255]
+            VQMOVN            dYf0U8, qYf0
+            VQMOVN            dYf1U8, qYf1
+            VQMOVN            dYf2U8, qYf2
+            VQMOVN            dYf3U8, qYf3
+            VQMOVN            dYf4U8, qYf4
+            VQMOVN            dYf5U8, qYf5
+            VQMOVN            dYf6U8, qYf6
+            VQMOVN            dYf7U8, qYf7
+        ENDIF
+        
+        IF ("$outsize"="s9")
+            ;// Output range [-256 to +255]
+            VQSHL            qYf0, qYf0, #16-9
+            VQSHL            qYf1, qYf1, #16-9
+            VQSHL            qYf2, qYf2, #16-9
+            VQSHL            qYf3, qYf3, #16-9
+            VQSHL            qYf4, qYf4, #16-9
+            VQSHL            qYf5, qYf5, #16-9
+            VQSHL            qYf6, qYf6, #16-9
+            VQSHL            qYf7, qYf7, #16-9
+            
+            VSHR             qYf0, qYf0, #16-9
+            VSHR             qYf1, qYf1, #16-9
+            VSHR             qYf2, qYf2, #16-9
+            VSHR             qYf3, qYf3, #16-9
+            VSHR             qYf4, qYf4, #16-9
+            VSHR             qYf5, qYf5, #16-9
+            VSHR             qYf6, qYf6, #16-9
+            VSHR             qYf7, qYf7, #16-9
+        ENDIF
+
+        ;// Store output depending on the Stride size
+        IF "$stride"="s"
+            VST1        qYf0, [pDest @64], Stride
+            VST1        qYf1, [pDest @64], Stride
+            VST1        qYf2, [pDest @64], Stride
+            VST1        qYf3, [pDest @64], Stride
+            VST1        qYf4, [pDest @64], Stride
+            VST1        qYf5, [pDest @64], Stride
+            VST1        qYf6, [pDest @64], Stride
+            VST1        qYf7, [pDest @64]            
+        ELSE
+            IF ("$outsize"="u8")
+                VST1        dYf0U8, [pDest @64], #8
+                VST1        dYf1U8, [pDest @64], #8
+                VST1        dYf2U8, [pDest @64], #8
+                VST1        dYf3U8, [pDest @64], #8
+                VST1        dYf4U8, [pDest @64], #8
+                VST1        dYf5U8, [pDest @64], #8
+                VST1        dYf6U8, [pDest @64], #8
+                VST1        dYf7U8, [pDest @64]
+            ELSE
+                ;// ("$outsize"="s9") or ("$outsize"="s16")
+                VST1        qYf0, [pDest @64], #16
+                VST1        qYf1, [pDest @64], #16
+                VST1        qYf2, [pDest @64], #16
+                VST1        qYf3, [pDest @64], #16
+                VST1        qYf4, [pDest @64], #16
+                VST1        qYf5, [pDest @64], #16
+                VST1        qYf6, [pDest @64], #16
+                VST1        qYf7, [pDest @64]
+            ENDIF
+        
+        ENDIF
+
+
+
+        ENDIF ;// CortexA8
+
+
+
+        MEND        
+
+        ;// Scale TWO input rows with TWO rows of 16 bit scale values
+        ;//
+        ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale one row
+        ;// input (Eight input values) with one row of scale values. Also 
+        ;// Loads next scale values from pScale, if $LastRow flag is not set.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// $dAlo           - Input D register with first four S16 values of row n
+        ;// $dAhi           - Input D register with next four S16 values of row n
+        ;// $dBlo           - Input D register with first four S16 values of row n+1
+        ;// $dBhi           - Input D register with next four S16 values of row n+1
+        ;// pScale          - Pointer to next row of scale values
+        ;// qT0lo           - Temporary scratch register
+        ;// qT0hi           - Temporary scratch register
+        ;// qT1lo           - Temporary scratch register
+        ;// qT1hi           - Temporary scratch register
+        ;// dScale1lo       - Scale value of row n
+        ;// dScale1hi       - Scale value of row n
+        ;// dScale2lo       - Scale value of row n+1
+        ;// dScale2hi       - Scale value of row n+1
+        ;//
+        ;// Input Flag
+        ;//
+        ;// $LastRow        - Flag to indicate whether current row is last row
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// $dAlo           - Scaled output values (first four S16 of row n)
+        ;// $dAhi           - Scaled output values (next four S16 of row n)
+        ;// $dBlo           - Scaled output values (first four S16 of row n+1)
+        ;// $dBhi           - Scaled output values (next four S16 of row n+1)
+        ;// qScale1         - Scale values for next row
+        ;// qScale2         - Scale values for next row+1
+        ;// pScale          - Pointer to next row of scale values
+        ;//
+        MACRO
+        M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
+        VMULL       qT0lo, $dAlo, dScale1lo
+        VMULL       qT0hi, $dAhi, dScale1hi
+        VMULL       qT1lo, $dBlo, dScale2lo
+        VMULL       qT1hi, $dBhi, dScale2hi
+        IF "$LastRow"="0"
+            VLD1        qScale1, [pScale], #16  ;// Load scale for row n+1
+            VLD1        qScale2, [pScale], #16  ;// Load scale for row n+2
+        ENDIF
+        VQRSHRN       $dAlo, qT0lo, #12        
+        VQRSHRN       $dAhi, qT0hi, #12        
+        VQRSHRN       $dBlo, qT1lo, #12        
+        VQRSHRN       $dBhi, qT1hi, #12        
+        MEND
+
+        ;// Scale 8x8 block input values with 16 bit scale values
+        ;//
+        ;// This macro is used to pre-scale block of 8x8 input.
+        ;// This also do the Ist stage transformations of IDCT.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// dXjnlo          - n th input D register with first four S16 values
+        ;// dXjnhi          - n th input D register with next four S16 values
+        ;// qXjn            - n th input Q register with eight S16 values
+        ;// pScale          - Pointer to scale values
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// qXin            - n th output Q register with eight S16 output values of 1st stage
+        ;//
+        MACRO
+        M_IDCT_PRESCALE16
+        VLD1        qScale1, [pScale], #16      ;// Load Pre scale for row 0
+        VLD1        qScale2, [pScale], #16      ;// Load Pre scale for row 0
+        M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0        ;// Pre scale row 0 & 1
+        M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0        
+        M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0        
+        M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1        
+        VHADD       qXi5, qXj1, qXj7            ;// (j1+j7)/2
+        VSUB        qXi6, qXj1, qXj7            ;// j1-j7
+        LDR         pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
+        VHADD       qXi3, qXj2, qXj6            ;// (j2+j6)/2
+        VSUB        qXi2, qXj2, qXj6            ;// j2-j6
+        VLDR        dCoefs, [pSrc]              ;// Load DCT inverse AAN constants
+        VHADD       qXi7, qXj5, qXj3            ;// (j5+j3)/2
+        VSUB        qXi4, qXj5, qXj3            ;// j5-j3
+        MEND    
+        
+        
+        ;// Scale 8x8 block input values with 32 bit scale values
+        ;//
+        ;// This macro is used to pre-scale block of 8x8 input.
+        ;// This also do the Ist stage transformations of IDCT.
+        ;//
+        ;// Input Registers:
+        ;//
+        ;// dXjnlo          - n th input D register with first four S16 values
+        ;// dXjnhi          - n th input D register with next four S16 values
+        ;// qXjn            - n th input Q register with eight S16 values
+        ;// pScale          - Pointer to 32bit scale values in Q23 format
+        ;//
+        ;// Output Registers:
+        ;//
+        ;// dXinlo          - n th output D register with first four S16 output values of 1st stage
+        ;// dXinhi          - n th output D register with next four S16 output values of 1st stage
+        ;//
+        MACRO
+        M_IDCT_PRESCALE32
+qScale0lo       QN 0.S32
+qScale0hi       QN 1.S32
+qScale1lo       QN 2.S32
+qScale1hi       QN 3.S32
+qScale2lo       QN qScale1lo
+qScale2hi       QN qScale1hi
+qScale3lo       QN qScale1lo
+qScale3hi       QN qScale1hi
+qScale4lo       QN qScale1lo
+qScale4hi       QN qScale1hi
+qScale5lo       QN qScale0lo
+qScale5hi       QN qScale0hi
+qScale6lo       QN qScale0lo
+qScale6hi       QN qScale0hi
+qScale7lo       QN qScale0lo
+qScale7hi       QN qScale0hi
+
+qSrc0lo         QN 4.S32
+qSrc0hi         QN 5.S32
+qSrc1lo         QN 6.S32
+qSrc1hi         QN Src4.S32
+qSrc2lo         QN qSrc0lo
+qSrc2hi         QN qSrc0hi
+qSrc3lo         QN qSrc0lo
+qSrc3hi         QN qSrc0hi
+qSrc4lo         QN qSrc0lo
+qSrc4hi         QN qSrc0hi
+qSrc5lo         QN qSrc1lo
+qSrc5hi         QN qSrc1hi
+qSrc6lo         QN qSrc1lo
+qSrc6hi         QN qSrc1hi
+qSrc7lo         QN qSrc0lo
+qSrc7hi         QN qSrc0hi
+
+qRes17lo        QN qScale0lo
+qRes17hi        QN qScale0hi
+qRes26lo        QN qScale0lo
+qRes26hi        QN qScale0hi
+qRes53lo        QN qScale0lo
+qRes53hi        QN qScale0hi
+
+            ADD         pTemp, pScale, #4*8*7           ;// Address of  pScale[7]
+            
+            ;// Row 0
+            VLD1        {qScale0lo, qScale0hi}, [pScale]!
+            VSHLL       qSrc0lo, dXj0lo, #(12-1)
+            VSHLL       qSrc0hi, dXj0hi, #(12-1)            
+            VLD1        {qScale1lo, qScale1hi}, [pScale]!
+            VQRDMULH    qSrc0lo, qScale0lo, qSrc0lo
+            VQRDMULH    qSrc0hi, qScale0hi, qSrc0hi
+            VLD1        {qScale7lo, qScale7hi}, [pTemp]!
+            VSHLL       qSrc1lo, dXj1lo, #(12-1)
+            VSHLL       qSrc1hi, dXj1hi, #(12-1)            
+            VMOVN       dXi0lo, qSrc0lo                 ;// Output i0
+            VMOVN       dXi0hi, qSrc0hi
+            VSHLL       qSrc7lo, dXj7lo, #(12-1)
+            VSHLL       qSrc7hi, dXj7hi, #(12-1)
+            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
+            VQRDMULH    qSrc1lo, qScale1lo, qSrc1lo
+            VQRDMULH    qSrc1hi, qScale1hi, qSrc1hi
+            VQRDMULH    qSrc7lo, qScale7lo, qSrc7lo
+            VQRDMULH    qSrc7hi, qScale7hi, qSrc7hi
+            VLD1        {qScale2lo, qScale2hi}, [pScale]!
+
+            ;// Row 1 & 7
+            VHADD       qRes17lo, qSrc1lo, qSrc7lo      ;// (j1+j7)/2
+            VHADD       qRes17hi, qSrc1hi, qSrc7hi      ;// (j1+j7)/2
+            VMOVN       dXi5lo, qRes17lo                ;// Output i5
+            VMOVN       dXi5hi, qRes17hi              
+            VSUB        qRes17lo, qSrc1lo, qSrc7lo      ;// j1-j7
+            VSUB        qRes17hi, qSrc1hi, qSrc7hi      ;// j1-j7
+            VMOVN       dXi6lo, qRes17lo                ;// Output i6
+            VMOVN       dXi6hi, qRes17hi      
+            VSHLL       qSrc2lo, dXj2lo, #(12-1)
+            VSHLL       qSrc2hi, dXj2hi, #(12-1)
+            VLD1        {qScale6lo, qScale6hi}, [pTemp]!
+            VSHLL       qSrc6lo, dXj6lo, #(12-1)
+            VSHLL       qSrc6hi, dXj6hi, #(12-1)
+            SUB         pTemp, pTemp, #((16*2)+(4*8*1))
+            VQRDMULH    qSrc2lo, qScale2lo, qSrc2lo
+            VQRDMULH    qSrc2hi, qScale2hi, qSrc2hi
+            VQRDMULH    qSrc6lo, qScale6lo, qSrc6lo
+            VQRDMULH    qSrc6hi, qScale6hi, qSrc6hi
+            VLD1        {qScale3lo, qScale3hi}, [pScale]!
+
+            ;// Row 2 & 6
+            VHADD       qRes26lo, qSrc2lo, qSrc6lo      ;// (j2+j6)/2
+            VHADD       qRes26hi, qSrc2hi, qSrc6hi      ;// (j2+j6)/2
+            VMOVN       dXi3lo, qRes26lo                ;// Output i3
+            VMOVN       dXi3hi, qRes26hi              
+            VSUB        qRes26lo, qSrc2lo, qSrc6lo      ;// j2-j6
+            VSUB        qRes26hi, qSrc2hi, qSrc6hi      ;// j2-j6
+            VMOVN       dXi2lo, qRes26lo                ;// Output i2
+            VMOVN       dXi2hi, qRes26hi      
+            VSHLL       qSrc3lo, dXj3lo, #(12-1)
+            VSHLL       qSrc3hi, dXj3hi, #(12-1)
+            VLD1        {qScale5lo, qScale5hi}, [pTemp]!
+            VSHLL       qSrc5lo, dXj5lo, #(12-1)
+            VSHLL       qSrc5hi, dXj5hi, #(12-1)
+            VQRDMULH    qSrc3lo, qScale3lo, qSrc3lo
+            VQRDMULH    qSrc3hi, qScale3hi, qSrc3hi
+            VQRDMULH    qSrc5lo, qScale5lo, qSrc5lo
+            VQRDMULH    qSrc5hi, qScale5hi, qSrc5hi
+            
+            ;// Row 3 & 5
+            VHADD       qRes53lo, qSrc5lo, qSrc3lo      ;// (j5+j3)/2
+            VHADD       qRes53hi, qSrc5hi, qSrc3hi      ;// (j5+j3)/2
+            SUB         pSrc, pSrc, #16*2*2
+            VMOVN       dXi7lo, qRes53lo                ;// Output i7
+            VMOVN       dXi7hi, qRes53hi              
+            VSUB        qRes53lo, qSrc5lo, qSrc3lo      ;// j5-j3
+            VSUB        qRes53hi, qSrc5hi, qSrc3hi      ;// j5-j3
+            VLD1        qXj4, [pSrc @64]
+            VMOVN       dXi4lo, qRes53lo                ;// Output i4
+            VMOVN       dXi4hi, qRes53hi                              
+            VSHLL       qSrc4lo, dXj4lo, #(12-1)
+            VSHLL       qSrc4hi, dXj4hi, #(12-1)
+            VLD1        {qScale4lo, qScale4hi}, [pScale]            
+            LDR         pSrc, =armCOMM_IDCTCoef     ;// Address of DCT inverse AAN constants
+            VQRDMULH    qSrc4lo, qScale4lo, qSrc4lo
+            VQRDMULH    qSrc4hi, qScale4hi, qSrc4hi
+            VLDR        dCoefs, [pSrc]                  ;// Load DCT inverse AAN constants
+            ;// Row 4
+            VMOVN       dXi1lo, qSrc4lo                 ;// Output i1
+            VMOVN       dXi1hi, qSrc4hi              
+        
+        MEND
+                                                
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
new file mode 100755
index 0000000..51118fd
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
@@ -0,0 +1,27 @@
+/**
+ * 
+ * File Name:  armCOMM_MaskTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Mask Table to mask the end of array
+ */
+ 
+
+
+#ifndef _ARMCOMM_MASKTABLE_H_
+#define _ARMCOMM_MASKTABLE_H_
+
+#define MaskTableSize 72
+  
+/* Mask table */
+
+extern const OMX_U16 armCOMM_qMaskTable16[MaskTableSize];
+extern const OMX_U8 armCOMM_qMaskTable8[MaskTableSize];
+
+#endif
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
new file mode 100755
index 0000000..41b3e1e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
@@ -0,0 +1,43 @@
+/* Guard the header against multiple inclusion. */
+#ifndef __ARM_COMM_VERSION_H__
+#define __ARM_COMM_VERSION_H__
+
+
+/* The following line should be in omxtypes.h but hasn't been approved by OpenMAX yet */
+#define OMX_VERSION 102
+
+/* We need to define these macros in order to convert a #define number into a #define string. */
+#define ARM_QUOTE(a) #a
+#define ARM_INDIRECT(A) ARM_QUOTE(A)
+
+/* Convert the OMX_VERSION number into a string that can be used, for example, to print it out. */
+#define ARM_VERSION_STRING ARM_INDIRECT(OMX_VERSION)
+
+
+/* Define this in order to turn on ARM version/release/build strings in each domain */
+#define ARM_INCLUDE_VERSION_DESCRIPTIONS
+
+#ifdef ARM_INCLUDE_VERSION_DESCRIPTIONS
+  extern const char * const omxAC_VersionDescription;
+  extern const char * const omxIC_VersionDescription;
+  extern const char * const omxIP_VersionDescription;
+  extern const char * const omxSP_VersionDescription;
+  extern const char * const omxVC_VersionDescription;
+#endif /* ARM_INCLUDE_VERSION_DESCRIPTIONS */
+
+
+/* The following entries should be automatically updated by the release script */
+/* They are used in the ARM version strings defined for each domain.             */
+
+/* The release tag associated with this release of the library. - used for source and object releases */
+#define OMX_ARM_RELEASE_TAG  "r1p0-00bet0"
+
+/* The ARM architecture used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_ARCHITECTURE "ARM Architecture V7 with NEON"
+
+/* The ARM Toolchain used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_TOOLCHAIN    "ARM RVCT 3.1"
+
+
+#endif /* __ARM_COMM_VERSION_H__ */
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
new file mode 100755
index 0000000..0956bd1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
@@ -0,0 +1,1157 @@
+;//
+;// 
+;// File Name:  armCOMM_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// ARM optimized OpenMAX common header file
+;//
+
+;// Protect against multiple inclusion
+ IF :LNOT::DEF:ARMCOMM_S_H
+ GBLL ARMCOMM_S_H
+
+        REQUIRE8            ;// Requires 8-byte stack alignment
+        PRESERVE8           ;// Preserves 8-byte stack alignment
+        
+        GBLL    ARM_ERRORCHECK
+ARM_ERRORCHECK  SETL {FALSE}
+
+;// Globals
+
+        GBLS    _RRegList   ;// R saved register list
+        GBLS    _DRegList   ;// D saved register list
+        GBLS    _Variant    ;// Selected processor variant
+        GBLS    _CPU        ;// CPU name
+        GBLS    _Struct     ;// Structure name
+        
+        GBLL    _InFunc     ;// Inside function assembly flag
+        GBLL    _SwLong     ;// Long switch flag
+        
+        GBLA    _RBytes     ;// Number of register bytes on stack
+        GBLA    _SBytes     ;// Number of scratch bytes on stack 
+        GBLA    _ABytes     ;// Stack offset of next argument
+        GBLA    _Workspace  ;// Stack offset of scratch workspace
+        GBLA    _F          ;// Function number
+        GBLA    _StOff      ;// Struct offset
+        GBLA    _SwNum      ;// Switch number
+        GBLS    _32         ;// Suffix for 32 byte alignmnet
+        GBLS    _16         ;// Suffix for 16 byte alignmnet
+        
+_InFunc         SETL    {FALSE}
+_SBytes         SETA    0
+_F              SETA    0
+_SwNum          SETA    0
+_32             SETS    "ALIGN32"
+_16             SETS    "ALIGN16"
+
+;/////////////////////////////////////////////////////////
+;// Override the tools settings of the CPU if the #define
+;// USECPU is set, otherwise use the CPU defined by the
+;// assembler settings.
+;/////////////////////////////////////////////////////////
+
+       IF :DEF: OVERRIDECPU
+_CPU       SETS  OVERRIDECPU
+       ELSE
+_CPU       SETS    {CPU}       
+       ENDIF
+
+
+
+;/////////////////////////////////////////////////////////
+;// Work out which code to build
+;/////////////////////////////////////////////////////////
+
+        IF :DEF:ARM1136JS:LOR::DEF:CortexA8:LOR::DEF:ARM_GENERIC
+            INFO 1,"Please switch to using M_VARIANTS"
+        ENDIF
+
+        ;// Define and reset all officially recongnised variants
+        MACRO
+        _M_DEF_VARIANTS
+        _M_DEF_VARIANT ARM926EJS
+        _M_DEF_VARIANT ARM1136JS
+        _M_DEF_VARIANT ARM1136JS_U
+        _M_DEF_VARIANT CortexA8
+        _M_DEF_VARIANT ARM7TDMI
+        MEND
+        
+        MACRO
+        _M_DEF_VARIANT $var
+        GBLL $var
+        GBLL _ok$var
+$var    SETL {FALSE}
+        MEND        
+        
+
+        ;// Variant declaration
+        ;//
+        ;// Define a list of code variants supported by this
+        ;// source file. This macro then chooses the most
+        ;// appropriate variant to build for the currently configured
+        ;// core.
+        ;//        
+        MACRO
+        M_VARIANTS $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7        
+        ;// Set to TRUE variants that are supported
+        _M_DEF_VARIANTS
+        _M_VARIANT $v0
+        _M_VARIANT $v1
+        _M_VARIANT $v2
+        _M_VARIANT $v3
+        _M_VARIANT $v4
+        _M_VARIANT $v5
+        _M_VARIANT $v6
+        _M_VARIANT $v7
+        
+        ;// Look for first available variant to match a CPU
+        ;// _M_TRY cpu, variant fall back list
+_Variant SETS ""                
+        _M_TRY ARM926EJ-S,   ARM926EJS
+        _M_TRY ARM1176JZ-S,  ARM1136JS
+        _M_TRY ARM1176JZF-S, ARM1136JS
+        _M_TRY ARM1156T2-S,  ARM1136JS
+        _M_TRY ARM1156T2F-S, ARM1136JS
+        _M_TRY ARM1136J-S,   ARM1136JS
+        _M_TRY ARM1136JF-S,  ARM1136JS
+        _M_TRY MPCore,       ARM1136JS
+        _M_TRY falcon-vfp, ARM1136JS
+        _M_TRY falcon-full-neon, CortexA8
+        _M_TRY Cortex-A8NoNeon, ARM1136JS
+        _M_TRY Cortex-A8,    CortexA8, ARM1136JS
+        _M_TRY Cortex-R4,    ARM1136JS
+        _M_TRY ARM7TDMI
+        
+        ;// Select the correct variant
+        _M_DEF_VARIANTS
+        IF _Variant=""
+            INFO 1, "No match found for CPU '$_CPU'"
+        ELSE
+$_Variant   SETL {TRUE}
+        ENDIF
+        MEND
+        
+        ;// Register a variant as available
+        MACRO
+        _M_VARIANT $var
+        IF "$var"=""
+            MEXIT
+        ENDIF
+        IF :LNOT::DEF:_ok$var
+            INFO 1, "Unrecognized variant '$var'"
+        ENDIF
+$var    SETL {TRUE}
+        MEND
+        
+        ;// For a given CPU, see if any of the variants supporting
+        ;// this CPU are available. The first available variant is
+        ;// chosen
+        MACRO
+        _M_TRY $cpu, $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7
+        IF "$cpu"<>_CPU
+            MEXIT
+        ENDIF
+        _M_TRY1 $v0
+        _M_TRY1 $v1
+        _M_TRY1 $v2
+        _M_TRY1 $v3
+        _M_TRY1 $v4
+        _M_TRY1 $v5
+        _M_TRY1 $v6
+        _M_TRY1 $v7
+        ;// Check a match was found
+        IF _Variant=""
+            INFO 1, "No variant match found for CPU '$_CPU'"
+        ENDIF
+        MEND
+        
+        MACRO
+        _M_TRY1 $var
+        IF "$var"=""
+            MEXIT
+        ENDIF
+        IF (_Variant=""):LAND:$var
+_Variant SETS "$var"
+        ENDIF
+        MEND
+        
+;////////////////////////////////////////////////////////
+;// Structure definition
+;////////////////////////////////////////////////////////
+
+        ;// Declare a structure of given name
+        MACRO
+        M_STRUCT $sname
+_Struct SETS "$sname"
+_StOff  SETA 0
+        MEND
+        
+        ;// Declare a structure field
+        ;// The field is called $sname_$fname
+        ;// $size   = the size of each entry, must be power of 2 
+        ;// $number = (if provided) the number of entries for an array
+        MACRO
+        M_FIELD $fname, $size, $number
+        IF (_StOff:AND:($size-1))!=0
+_StOff      SETA _StOff + ($size - (_StOff:AND:($size-1)))
+        ENDIF
+$_Struct._$fname EQU _StOff
+        IF "$number"<>""
+_StOff      SETA _StOff + $size*$number
+        ELSE
+_StOff      SETA _StOff + $size
+        ENDIF
+        MEND
+        
+        
+        MACRO
+        M_ENDSTRUCT
+sizeof_$_Struct EQU _StOff
+_Struct SETS ""
+        MEND
+
+;//////////////////////////////////////////////////////////
+;// Switch and table macros
+;//////////////////////////////////////////////////////////
+
+        ;// Start a relative switch table with register to switch on
+        ;//
+        ;// $v = the register to switch on
+        ;// $s = if specified must be "L" to indicate long
+        ;//      this allows a greater range to the case code
+        MACRO
+        M_SWITCH $v, $s
+        ASSERT "$s"="":LOR:"$s"="L"
+_SwLong SETL {FALSE}
+        IF "$s"="L"
+_SwLong     SETL {TRUE}
+        ENDIF
+_SwNum  SETA _SwNum+1        
+        IF {CONFIG}=16
+            ;// Thumb
+            IF _SwLong
+                TBH [pc, $v, LSL#1]
+            ELSE
+                TBB [pc, $v]
+            ENDIF
+_Switch$_SwNum
+        ELSE
+            ;// ARM
+            ADD pc, pc, $v, LSL #2
+            NOP
+        ENDIF
+        MEND
+        
+        ;// Add a case to the switch statement
+        MACRO
+        M_CASE  $label
+        IF {CONFIG}=16
+            ;// Thumb
+            IF _SwLong
+                DCW ($label - _Switch$_SwNum)/2
+            ELSE
+                DCB ($label - _Switch$_SwNum)/2
+            ENDIF
+        ELSE
+            ;// ARM
+            B   $label
+        ENDIF
+        MEND
+        
+        ;// End of switch statement
+        MACRO
+        M_ENDSWITCH
+        ALIGN 2
+        MEND       
+
+
+;////////////////////////////////////////////////////////
+;// Data area allocation
+;////////////////////////////////////////////////////////
+
+        ;// Constant table allocator macro
+        ;//
+        ;// Creates a new section for each constant table
+        ;// $name is symbol through which the table can be accessed.
+        ;// $align is the optional alignment of the table, log2 of 
+        ;//  the byte alignment - $align=4 is 16 byte aligned
+        MACRO
+        M_TABLE  $name, $align
+        ASSERT :LNOT:_InFunc
+        IF "$align"=""
+            AREA |.constdata|, READONLY, DATA
+        ELSE
+            ;// AREAs inherit the alignment of the first declaration.
+            ;// Therefore for each alignment size we must have an area
+            ;// of a different name.
+            AREA constdata_a$align, READONLY, DATA, ALIGN=$align
+            
+            ;// We also force alignment incase we are tagging onto
+            ;// an already started area.
+            ALIGN (1<<$align)
+        ENDIF
+$name
+        MEND
+        
+;/////////////////////////////////////////////////////
+;// Macros to allocate space on the stack
+;//
+;// These all assume that the stack is 8-byte aligned
+;// at entry to the function, which means that the 
+;// 32-byte alignment macro needs to work in a
+;// bit more of a special way...
+;/////////////////////////////////////////////////////
+
+        
+
+
+        ;// Allocate 1-byte aligned area of name
+        ;// $name size $size bytes.
+        MACRO
+        M_ALLOC1  $name, $size
+        ASSERT :LNOT:_InFunc
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND
+            
+        ;// Allocate 2-byte aligned area of name
+        ;// $name size $size bytes.
+        MACRO
+        M_ALLOC2  $name, $size
+        ASSERT :LNOT:_InFunc
+        IF (_SBytes:AND:1)!=0
+_SBytes     SETA _SBytes + (2 - (_SBytes:AND:1))
+        ENDIF
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND
+            
+        ;// Allocate 4-byte aligned area of name
+        ;// $name size $size bytes.
+        MACRO
+        M_ALLOC4  $name, $size
+        ASSERT :LNOT:_InFunc
+        IF (_SBytes:AND:3)!=0
+_SBytes     SETA _SBytes + (4 - (_SBytes:AND:3))
+        ENDIF
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND
+            
+        ;// Allocate 8-byte aligned area of name
+        ;// $name size $size bytes.
+        MACRO
+        M_ALLOC8  $name, $size
+        ASSERT :LNOT:_InFunc
+        IF (_SBytes:AND:7)!=0
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+$name$_F   EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+        MEND        
+
+        
+        ;// Allocate 8-byte aligned area of name
+        ;// $name size ($size+16) bytes.
+        ;// The extra 16 bytes are later used to align the pointer to 16 bytes
+        
+        MACRO
+        M_ALLOC16  $name, $size
+        ASSERT :LNOT:_InFunc
+        IF (_SBytes:AND:7)!=0
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+$name$_F$_16   EQU (_SBytes + 8)
+_SBytes SETA _SBytes + ($size) + 8
+        MEND        
+        
+        ;// Allocate 8-byte aligned area of name
+        ;// $name size ($size+32) bytes.
+        ;// The extra 32 bytes are later used to align the pointer to 32 bytes
+        
+        MACRO
+        M_ALLOC32  $name, $size
+        ASSERT :LNOT:_InFunc
+        IF (_SBytes:AND:7)!=0
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+$name$_F$_32   EQU (_SBytes + 24)
+_SBytes SETA _SBytes + ($size) + 24
+        MEND        
+        
+        
+        
+        
+        ;// Argument Declaration Macro
+        ;//
+        ;// Allocate an argument name $name
+        ;// size $size bytes
+        MACRO
+        M_ARG     $name, $size
+        ASSERT _InFunc
+$name$_F    EQU _ABytes
+_ABytes SETA _ABytes + ($size)
+        MEND        
+        
+;///////////////////////////////////////////////
+;// Macros to access stacked variables
+;///////////////////////////////////////////////
+
+        ;// Macro to perform a data processing operation
+        ;// with a constant second operand
+        MACRO
+        _M_OPC $op,$rd,$rn,$const
+        LCLA    _sh
+        LCLA    _cst
+_sh     SETA    0
+_cst    SETA    $const
+        IF _cst=0
+        $op $rd, $rn, #_cst
+            MEXIT
+        ENDIF
+        WHILE (_cst:AND:3)=0
+_cst        SETA _cst>>2
+_sh         SETA _sh+2
+        WEND
+        $op $rd, $rn, #(_cst:AND:0x000000FF)<<_sh
+        IF _cst>=256
+            $op $rd, $rd, #(_cst:AND:0xFFFFFF00)<<_sh
+        ENDIF
+        MEND
+
+        ;// Macro to perform a data access operation
+        ;// Such as LDR or STR
+        ;// The addressing mode is modified such that
+        ;// 1. If no address is given then the name is taken
+        ;//    as a stack offset
+        ;// 2. If the addressing mode is not available for the
+        ;//    state being assembled for (eg Thumb) then a suitable
+        ;//    addressing mode is substituted.
+        ;//
+        ;// On Entry:
+        ;// $i = Instruction to perform (eg "LDRB")
+        ;// $a = Required byte alignment
+        ;// $r = Register(s) to transfer (eg "r1")
+        ;// $a0,$a1,$a2. Addressing mode and condition. One of:
+        ;//     label {,cc}
+        ;//     [base]                    {,,,cc}
+        ;//     [base, offset]{!}         {,,cc}
+        ;//     [base, offset, shift]{!}  {,cc}
+        ;//     [base], offset            {,,cc}
+        ;//     [base], offset, shift     {,cc}
+        MACRO
+        _M_DATA $i,$a,$r,$a0,$a1,$a2,$a3
+        IF "$a0":LEFT:1="["
+            IF "$a1"=""
+                $i$a3   $r, $a0
+            ELSE
+                IF "$a0":RIGHT:1="]"
+                    IF "$a2"=""
+                        _M_POSTIND $i$a3, "$r", $a0, $a1
+                    ELSE
+                        _M_POSTIND $i$a3, "$r", $a0, "$a1,$a2"
+                    ENDIF
+                ELSE
+                    IF "$a2"=""
+                        _M_PREIND  $i$a3, "$r", $a0, $a1
+                    ELSE
+                        _M_PREIND  $i$a3, "$r", $a0, "$a1,$a2"
+                    ENDIF
+                ENDIF
+            ENDIF
+        ELSE
+            LCLA    _Offset
+_Offset     SETA    _Workspace + $a0$_F
+            ASSERT  (_Offset:AND:($a-1))=0
+            $i$a1   $r, [sp, #_Offset]
+        ENDIF
+        MEND
+        
+        ;// Handle post indexed load/stores
+        ;// op  reg, [base], offset
+        MACRO
+        _M_POSTIND $i,$r,$a0,$a1
+        LCLS _base
+        LCLS _offset
+        IF {CONFIG}=16 ;// Thumb
+_base       SETS ("$a0":LEFT:(:LEN:"$a0"-1)):RIGHT:(:LEN:"$a0"-2)   ;// remove []
+_offset     SETS "$a1"
+            IF _offset:LEFT:1="+"
+_offset         SETS _offset:RIGHT:(:LEN:_offset-1)
+            ENDIF
+            $i  $r, $a0
+            IF _offset:LEFT:1="-"
+_offset         SETS _offset:RIGHT:(:LEN:_offset-1)
+                SUB $_base, $_base, $_offset
+            ELSE                
+                ADD $_base, $_base, $_offset
+            ENDIF
+        ELSE ;// ARM
+            $i  $r, $a0, $a1
+        ENDIF
+        MEND
+        
+        ;// Handle pre indexed load/store
+        ;// op  reg, [base, offset]{!}
+        MACRO
+        _M_PREIND $i,$r,$a0,$a1
+        LCLS _base
+        LCLS _offset
+        IF ({CONFIG}=16):LAND:(("$a1":RIGHT:2)="]!")
+_base       SETS "$a0":RIGHT:(:LEN:("$a0")-1)
+_offset     SETS "$a1":LEFT:(:LEN:("$a1")-2)
+            $i $r, [$_base, $_offset]
+            ADD $_base, $_base, $_offset
+        ELSE
+            $i  $r, $a0, $a1
+        ENDIF
+        MEND
+
+        ;// Load unsigned byte from stack
+        MACRO
+        M_LDRB  $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRB",1,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Load signed byte from stack
+        MACRO
+        M_LDRSB $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRSB",1,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Store byte to stack
+        MACRO
+        M_STRB  $r,$a0,$a1,$a2,$a3
+        _M_DATA "STRB",1,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Load unsigned half word from stack
+        MACRO
+        M_LDRH  $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRH",2,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Load signed half word from stack
+        MACRO
+        M_LDRSH $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDRSH",2,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Store half word to stack
+        MACRO
+        M_STRH  $r,$a0,$a1,$a2,$a3
+        _M_DATA "STRH",2,$r,$a0,$a1,$a2,$a3
+        MEND
+
+        ;// Load word from stack
+        MACRO
+        M_LDR   $r,$a0,$a1,$a2,$a3
+        _M_DATA "LDR",4,$r,$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Store word to stack
+        MACRO
+        M_STR   $r,$a0,$a1,$a2,$a3
+        _M_DATA "STR",4,$r,$a0,$a1,$a2,$a3
+        MEND
+
+        ;// Load double word from stack
+        MACRO
+        M_LDRD  $r0,$r1,$a0,$a1,$a2,$a3
+        _M_DATA "LDRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+        MEND
+                
+        ;// Store double word to stack
+        MACRO
+        M_STRD  $r0,$r1,$a0,$a1,$a2,$a3
+        _M_DATA "STRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+        MEND
+        
+        ;// Get absolute address of stack allocated location
+        MACRO
+        M_ADR   $a, $b, $cc
+        _M_OPC  ADD$cc, $a, sp, (_Workspace + $b$_F)
+        MEND
+        
+        ;// Get absolute address of stack allocated location and align the address to 16 bytes
+        MACRO
+        M_ADR16 $a, $b, $cc
+            _M_OPC  ADD$cc, $a, sp, (_Workspace + $b$_F$_16)
+        
+            ;// Now align $a to 16 bytes
+            BIC$cc  $a,$a,#0x0F
+        MEND
+        
+        ;// Get absolute address of stack allocated location and align the address to 32 bytes
+        MACRO
+        M_ADR32 $a, $b, $cc
+            _M_OPC  ADD$cc, $a, sp, (_Workspace + $b$_F$_32)
+        
+            ;// Now align $a to 32 bytes
+            BIC$cc  $a,$a,#0x1F
+        MEND
+
+;//////////////////////////////////////////////////////////
+;// Function header and footer macros
+;//////////////////////////////////////////////////////////      
+        
+        ;// Function Header Macro    
+        ;// Generates the function prologue
+        ;// Note that functions should all be "stack-moves-once"
+        ;// The FNSTART and FNEND macros should be the only places
+        ;// where the stack moves.
+        ;//    
+        ;// $name  = function name
+        ;// $rreg  = ""   don't stack any registers
+        ;//          "lr" stack "lr" only
+        ;//          "rN" stack registers "r4-rN,lr"
+        ;// $dreg  = ""   don't stack any D registers
+        ;//          "dN" stack registers "d8-dN"
+        ;//
+        ;// Note: ARM Archicture procedure call standard AAPCS
+        ;// states that r4-r11, sp, d8-d15 must be preserved by
+        ;// a compliant function.
+        MACRO
+        M_START $name, $rreg, $dreg
+        ASSERT :LNOT:_InFunc
+        ASSERT "$name"!=""
+_InFunc SETL {TRUE}
+_RBytes SETA 0
+_Workspace SETA 0
+
+        ;// Create an area for the function        
+        AREA    |.text|, CODE
+        EXPORT  $name
+$name   FUNCTION
+        
+        ;// Save R registers
+        _M_GETRREGLIST $rreg
+        IF _RRegList<>""
+            STMFD   sp!, {$_RRegList, lr}
+        ENDIF
+                
+        ;// Save D registers
+        _M_GETDREGLIST  $dreg        
+        IF _DRegList<>""
+            VSTMFD  sp!, {$_DRegList}
+        ENDIF            
+            
+                    
+        ;// Ensure size claimed on stack is 8-byte aligned
+        IF ((_SBytes:AND:7)!=0)
+_SBytes     SETA _SBytes + (8 - (_SBytes:AND:7))
+        ENDIF
+        
+        IF (_SBytes!=0)
+            _M_OPC SUB, sp, sp, _SBytes
+        ENDIF
+        
+        
+_ABytes SETA _SBytes + _RBytes - _Workspace
+
+                        
+        ;// Print function name if debug enabled
+        M_PRINTF "$name\n",
+        MEND
+        
+        ;// Work out a list of R saved registers
+        MACRO
+        _M_GETRREGLIST $rreg
+        IF "$rreg"=""
+_RRegList   SETS ""
+            MEXIT
+        ENDIF        
+        IF "$rreg"="lr":LOR:"$rreg"="r4"
+_RRegList   SETS "r4"
+_RBytes     SETA _RBytes+8
+            MEXIT
+        ENDIF
+        IF "$rreg"="r5":LOR:"$rreg"="r6"
+_RRegList   SETS "r4-r6"
+_RBytes     SETA _RBytes+16
+            MEXIT
+        ENDIF
+        IF "$rreg"="r7":LOR:"$rreg"="r8"
+_RRegList   SETS "r4-r8"
+_RBytes     SETA _RBytes+24
+            MEXIT
+        ENDIF
+        IF "$rreg"="r9":LOR:"$rreg"="r10"
+_RRegList   SETS "r4-r10"
+_RBytes     SETA _RBytes+32
+            MEXIT
+        ENDIF
+        IF "$rreg"="r11":LOR:"$rreg"="r12"
+_RRegList   SETS "r4-r12"
+_RBytes     SETA _RBytes+40
+            MEXIT
+        ENDIF
+        INFO 1, "Unrecognized saved r register limit '$rreg'"
+        MEND        
+        
+        ;// Work out a list of D saved registers
+        MACRO
+        _M_GETDREGLIST $dreg
+        IF "$dreg"=""
+_DRegList   SETS ""
+            MEXIT
+        ENDIF        
+        IF "$dreg"="d8"
+_DRegList   SETS "d8"
+_RBytes     SETA _RBytes+8
+            MEXIT
+        ENDIF
+        IF "$dreg"="d9"
+_DRegList   SETS "d8-d9"
+_RBytes     SETA _RBytes+16
+            MEXIT
+        ENDIF
+        IF "$dreg"="d10"
+_DRegList   SETS "d8-d10"
+_RBytes     SETA _RBytes+24
+            MEXIT
+        ENDIF
+        IF "$dreg"="d11"
+_DRegList   SETS "d8-d11"
+_RBytes     SETA _RBytes+32
+            MEXIT
+        ENDIF
+        IF "$dreg"="d12"
+_DRegList   SETS "d8-d12"
+_RBytes     SETA _RBytes+40
+            MEXIT
+        ENDIF
+        IF "$dreg"="d13"
+_DRegList   SETS "d8-d13"
+_RBytes     SETA _RBytes+48
+            MEXIT
+        ENDIF
+        IF "$dreg"="d14"
+_DRegList   SETS "d8-d14"
+_RBytes     SETA _RBytes+56
+            MEXIT
+        ENDIF
+        IF "$dreg"="d15"
+_DRegList   SETS "d8-d15"
+_RBytes     SETA _RBytes+64
+            MEXIT
+        ENDIF
+        INFO 1, "Unrecognized saved d register limit '$dreg'"
+        MEND
+        
+        ;// Produce function return instructions
+        MACRO
+        _M_RET $cc
+        IF _DRegList<>""
+            VPOP$cc {$_DRegList}
+        ENDIF
+        IF _RRegList=""
+            BX$cc lr
+        ELSE
+            LDM$cc.FD sp!, {$_RRegList, pc}
+        ENDIF
+        MEND        
+        
+        ;// Early Function Exit Macro
+        ;// $cc = condition to exit with
+        ;// (Example: M_EXIT EQ)
+        MACRO
+        M_EXIT  $cc
+        ASSERT  _InFunc
+        IF  _SBytes!=0
+            ;// Restore stack frame and exit
+            B$cc  _End$_F
+        ELSE
+            ;// Can return directly
+            _M_RET $cc
+        ENDIF        
+        MEND        
+
+        ;// Function Footer Macro        
+        ;// Generates the function epilogue
+        MACRO
+        M_END
+        ASSERT _InFunc
+_InFunc SETL {FALSE}
+_End$_F
+
+        ;// Restore the stack pointer to its original value on function entry
+        IF _SBytes!=0
+            _M_OPC ADD, sp, sp, _SBytes
+        ENDIF
+        _M_RET
+        ENDFUNC
+
+        ;// Reset the global stack tracking variables back to their 
+        ;// initial values, and increment the function count
+_SBytes        SETA 0
+_F             SETA _F+1
+        MEND
+
+                
+;//==========================================================================
+;// Debug Macros
+;//==========================================================================
+
+        GBLL    DEBUG_ON
+DEBUG_ON SETL   {FALSE}
+        GBLL    DEBUG_STALLS_ON
+DEBUG_STALLS_ON SETL {FALSE}
+        
+        ;//==========================================================================
+        ;// Debug call to printf
+        ;//  M_PRINTF $format, $val0, $val1, $val2
+        ;//
+        ;// Examples:
+        ;//  M_PRINTF "x=%08x\n", r0
+        ;//
+        ;// This macro preserves the value of all registers including the
+        ;// flags.
+        ;//==========================================================================
+
+        MACRO
+        M_PRINTF  $format, $val0, $val1, $val2
+        IF DEBUG_ON
+        
+        IMPORT  printf
+        LCLA    nArgs
+nArgs	SETA    0
+        
+        ;// save registers so we don't corrupt them
+        STMFD   sp!, {r0-r12, lr}
+        
+        ;// Drop stack to give us some workspace
+        SUB     sp, sp, #16
+        
+        ;// Save registers we need to print to the stack
+        IF "$val2" <> ""
+            ASSERT "$val1" <> ""
+            STR    $val2, [sp, #8]
+nArgs       SETA   nArgs+1
+        ENDIF
+        IF "$val1" <> ""
+            ASSERT "$val0" <> ""
+            STR    $val1, [sp, #4]
+nArgs	    SETA   nArgs+1
+        ENDIF
+        IF "$val0"<>""
+            STR    $val0, [sp]
+nArgs	    SETA   nArgs+1
+        ENDIF
+        
+        ;// Now we are safe to corrupt registers
+        ADR     r0, %FT00
+        IF nArgs=1
+          LDR   r1, [sp]
+        ENDIF
+        IF nArgs=2
+          LDMIA sp, {r1,r2}
+        ENDIF
+        IF nArgs=3
+          LDMIA sp, {r1,r2,r3}
+        ENDIF
+        
+        ;// print the values
+        MRS     r4, cpsr        ;// preserve flags
+        BL      printf
+        MSR     cpsr_f, r4      ;// restore flags
+        B       %FT01
+00      ;// string to print
+        DCB     "$format", 0
+        ALIGN
+01      ;// Finished
+        ADD     sp, sp, #16
+        ;// Restore registers
+        LDMFD	sp!, {r0-r12,lr}
+
+        ENDIF   ;// DEBUG_ON
+        MEND
+
+
+        ;// Stall Simulation Macro
+        ;// Inserts a given number of NOPs for the currently
+        ;//  defined platform
+        MACRO
+        M_STALL $plat1stall, $plat2stall, $plat3stall, $plat4stall, $plat5stall, $plat6stall
+        IF DEBUG_STALLS_ON
+            _M_STALL_SUB $plat1stall    
+            _M_STALL_SUB $plat2stall    
+            _M_STALL_SUB $plat3stall    
+            _M_STALL_SUB $plat4stall    
+            _M_STALL_SUB $plat5stall    
+            _M_STALL_SUB $plat6stall    
+        ENDIF
+        MEND
+        
+        MACRO
+        _M_STALL_SUB $platstall
+        IF "$platstall"!=""
+            LCLA _pllen
+            LCLS _pl
+            LCLL _pllog
+_pllen      SETA :LEN:"$platstall"
+_pl         SETS "$platstall":LEFT:(_pllen - 2)
+            IF :DEF:$_pl
+                IF $_pl
+                    LCLS _st
+                    LCLA _stnum
+_st                 SETS "$platstall":RIGHT:1        
+_stnum              SETA $_st
+                    WHILE _stnum>0
+			MOV sp, sp
+_stnum                  SETA _stnum - 1
+                    WEND
+                ENDIF
+            ENDIF
+        ENDIF
+        MEND
+        
+        
+        
+;//==========================================================================
+;// Endian Invarience Macros
+;// 
+;// The idea behind these macros is that if an array is
+;// loaded as words then the SMUL00 macro will multiply
+;// array elements 0 regardless of the endianess of the
+;// system. For little endian SMUL00=SMULBB, for big
+;// endian SMUL00=SMULTT and similarly for other packed operations.
+;//
+;//==========================================================================
+
+        MACRO
+        LIBI4   $comli, $combi, $a, $b, $c, $d, $cc
+        IF {ENDIAN}="big"
+        $combi.$cc $a, $b, $c, $d
+        ELSE
+        $comli.$cc $a, $b, $c, $d
+        ENDIF
+        MEND
+        
+        MACRO
+        LIBI3   $comli, $combi, $a, $b, $c, $cc
+        IF {ENDIAN}="big"
+        $combi.$cc $a, $b, $c
+        ELSE
+        $comli.$cc $a, $b, $c
+        ENDIF
+        MEND
+        
+        ;// SMLAxy macros
+        
+        MACRO
+        SMLA00  $a, $b, $c, $d, $cc
+        LIBI4 SMLABB, SMLATT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA01  $a, $b, $c, $d, $cc
+        LIBI4 SMLABT, SMLATB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA0B  $a, $b, $c, $d, $cc
+        LIBI4 SMLABB, SMLATB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA0T  $a, $b, $c, $d, $cc
+        LIBI4 SMLABT, SMLATT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA10  $a, $b, $c, $d, $cc
+        LIBI4 SMLATB, SMLABT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA11  $a, $b, $c, $d, $cc
+        LIBI4 SMLATT, SMLABB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA1B  $a, $b, $c, $d, $cc
+        LIBI4 SMLATB, SMLABB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLA1T  $a, $b, $c, $d, $cc
+        LIBI4 SMLATT, SMLABT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAB0  $a, $b, $c, $d, $cc
+        LIBI4 SMLABB, SMLABT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAB1  $a, $b, $c, $d, $cc
+        LIBI4 SMLABT, SMLABB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAT0  $a, $b, $c, $d, $cc
+        LIBI4 SMLATB, SMLATT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAT1  $a, $b, $c, $d, $cc
+        LIBI4 SMLATT, SMLATB, $a, $b, $c, $d, $cc
+        MEND
+        
+        ;// SMULxy macros
+        
+        MACRO
+        SMUL00  $a, $b, $c, $cc
+        LIBI3 SMULBB, SMULTT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL01  $a, $b, $c, $cc
+        LIBI3 SMULBT, SMULTB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL0B  $a, $b, $c, $cc
+        LIBI3 SMULBB, SMULTB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL0T  $a, $b, $c, $cc
+        LIBI3 SMULBT, SMULTT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL10  $a, $b, $c, $cc
+        LIBI3 SMULTB, SMULBT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL11  $a, $b, $c, $cc
+        LIBI3 SMULTT, SMULBB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL1B  $a, $b, $c, $cc
+        LIBI3 SMULTB, SMULBB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMUL1T  $a, $b, $c, $cc
+        LIBI3 SMULTT, SMULBT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULB0  $a, $b, $c, $cc
+        LIBI3 SMULBB, SMULBT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULB1  $a, $b, $c, $cc
+        LIBI3 SMULBT, SMULBB, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULT0  $a, $b, $c, $cc
+        LIBI3 SMULTB, SMULTT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULT1  $a, $b, $c, $cc
+        LIBI3 SMULTT, SMULTB, $a, $b, $c, $cc
+        MEND
+        
+        ;// SMLAWx, SMULWx macros
+        
+        MACRO
+        SMLAW0  $a, $b, $c, $d, $cc
+        LIBI4 SMLAWB, SMLAWT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAW1  $a, $b, $c, $d, $cc
+        LIBI4 SMLAWT, SMLAWB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMULW0  $a, $b, $c, $cc
+        LIBI3 SMULWB, SMULWT, $a, $b, $c, $cc
+        MEND
+        
+        MACRO
+        SMULW1  $a, $b, $c, $cc
+        LIBI3 SMULWT, SMULWB, $a, $b, $c, $cc
+        MEND
+
+        ;// SMLALxy macros
+
+
+        MACRO
+        SMLAL00  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBB, SMLALTT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL01  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBT, SMLALTB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL0B  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBB, SMLALTB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL0T  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBT, SMLALTT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL10  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTB, SMLALBT, $a, $b, $c, $d, $cc
+        MEND
+
+        MACRO
+        SMLAL11  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTT, SMLALBB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL1B  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTB, SMLALBB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLAL1T  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTT, SMLALBT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALB0  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBB, SMLALBT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALB1  $a, $b, $c, $d, $cc
+        LIBI4 SMLALBT, SMLALBB, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALT0  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTB, SMLALTT, $a, $b, $c, $d, $cc
+        MEND
+        
+        MACRO
+        SMLALT1  $a, $b, $c, $d, $cc
+        LIBI4 SMLALTT, SMLALTB, $a, $b, $c, $d, $cc
+        MEND
+        
+  ENDIF ;// ARMCOMM_S_H
+            
+  END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
new file mode 100755
index 0000000..7a68d14
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
@@ -0,0 +1,274 @@
+/* 
+ * 
+ * File Name:  armOMX_ReleaseVersion.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * This file allows a version of the OMX DL libraries to be built where some or
+ * all of the function names can be given a user specified suffix. 
+ *
+ * You might want to use it where:
+ *
+ * - you want to rename a function "out of the way" so that you could replace
+ *   a function with a different version (the original version would still be
+ *   in the library just with a different name - so you could debug the new
+ *   version by comparing it to the output of the old)
+ *
+ * - you want to rename all the functions to versions with a suffix so that 
+ *   you can include two versions of the library and choose between functions
+ *   at runtime.
+ *
+ *     e.g. omxIPBM_Copy_U8_C1R could be renamed omxIPBM_Copy_U8_C1R_CortexA8
+ * 
+ */
+
+  
+#ifndef _armOMX_H_
+#define _armOMX_H_
+
+
+/* We need to define these two macros in order to expand and concatenate the names */
+#define OMXCAT2BAR(A, B) omx ## A ## B
+#define OMXCATBAR(A, B) OMXCAT2BAR(A, B)
+
+/* Define the suffix to add to all functions - the default is no suffix */
+#define BARE_SUFFIX 
+
+
+
+/* Define what happens to the bare suffix-less functions, down to the sub-domain accuracy */
+#define OMXACAAC_SUFFIX    BARE_SUFFIX   
+#define OMXACMP3_SUFFIX    BARE_SUFFIX
+#define OMXICJP_SUFFIX     BARE_SUFFIX
+#define OMXIPBM_SUFFIX     BARE_SUFFIX
+#define OMXIPCS_SUFFIX     BARE_SUFFIX
+#define OMXIPPP_SUFFIX     BARE_SUFFIX
+#define OMXSP_SUFFIX       BARE_SUFFIX
+#define OMXVCCOMM_SUFFIX   BARE_SUFFIX
+#define OMXVCM4P10_SUFFIX  BARE_SUFFIX
+#define OMXVCM4P2_SUFFIX   BARE_SUFFIX
+
+
+
+
+/* Define what the each bare, un-suffixed OpenMAX API function names is to be renamed */
+#define omxACAAC_DecodeChanPairElt                        OMXCATBAR(ACAAC_DecodeChanPairElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeDatStrElt                          OMXCATBAR(ACAAC_DecodeDatStrElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeFillElt                            OMXCATBAR(ACAAC_DecodeFillElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeIsStereo_S32                       OMXCATBAR(ACAAC_DecodeIsStereo_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsPNS_S32_I                        OMXCATBAR(ACAAC_DecodeMsPNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsStereo_S32_I                     OMXCATBAR(ACAAC_DecodeMsStereo_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodePrgCfgElt                          OMXCATBAR(ACAAC_DecodePrgCfgElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeTNS_S32_I                          OMXCATBAR(ACAAC_DecodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DeinterleaveSpectrum_S32                 OMXCATBAR(ACAAC_DeinterleaveSpectrum_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_EncodeTNS_S32_I                          OMXCATBAR(ACAAC_EncodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermPredict_S32                      OMXCATBAR(ACAAC_LongTermPredict_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermReconstruct_S32_I                OMXCATBAR(ACAAC_LongTermReconstruct_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTFwd_S32                              OMXCATBAR(ACAAC_MDCTFwd_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTInv_S32_S16                          OMXCATBAR(ACAAC_MDCTInv_S32_S16, OMXACAAC_SUFFIX)
+#define omxACAAC_NoiselessDecode                          OMXCATBAR(ACAAC_NoiselessDecode, OMXACAAC_SUFFIX)
+#define omxACAAC_QuantInv_S32_I                           OMXCATBAR(ACAAC_QuantInv_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADIFHeader                         OMXCATBAR(ACAAC_UnpackADIFHeader, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADTSFrameHeader                    OMXCATBAR(ACAAC_UnpackADTSFrameHeader, OMXACAAC_SUFFIX)
+
+
+#define omxACMP3_HuffmanDecode_S32                        OMXCATBAR(ACMP3_HuffmanDecode_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfb_S32                     OMXCATBAR(ACMP3_HuffmanDecodeSfb_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfbMbp_S32                  OMXCATBAR(ACMP3_HuffmanDecodeSfbMbp_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_MDCTInv_S32                              OMXCATBAR(ACMP3_MDCTInv_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantize_S32_I                         OMXCATBAR(ACMP3_ReQuantize_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantizeSfb_S32_I                      OMXCATBAR(ACMP3_ReQuantizeSfb_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_SynthPQMF_S32_S16                        OMXCATBAR(ACMP3_SynthPQMF_S32_S16, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackFrameHeader                        OMXCATBAR(ACMP3_UnpackFrameHeader, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackScaleFactors_S8                    OMXCATBAR(ACMP3_UnpackScaleFactors_S8, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackSideInfo                           OMXCATBAR(ACMP3_UnpackSideInfo, OMXACMP3_SUFFIX)
+
+#define omxICJP_CopyExpand_U8_C3                          OMXCATBAR(ICJP_CopyExpand_U8_C3, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16                                OMXCATBAR(ICJP_DCTFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16_I                              OMXCATBAR(ICJP_DCTFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16                                OMXCATBAR(ICJP_DCTInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16_I                              OMXCATBAR(ICJP_DCTInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantFwd_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16                           OMXCATBAR(ICJP_DCTQuantFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16_I                         OMXCATBAR(ICJP_DCTQuantFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwdTableInit                      OMXCATBAR(ICJP_DCTQuantFwdTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantInv_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16                           OMXCATBAR(ICJP_DCTQuantInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16_I                         OMXCATBAR(ICJP_DCTQuantInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInvTableInit                      OMXCATBAR(ICJP_DCTQuantInvTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffman8x8_Direct_S16_C1            OMXCATBAR(ICJP_DecodeHuffman8x8_Direct_S16_C1, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_DecodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_DecodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffman8x8_Direct_S16_U1_C1         OMXCATBAR(ICJP_EncodeHuffman8x8_Direct_S16_U1_C1, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_EncodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_EncodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+
+#define omxIPBM_AddC_U8_C1R_Sfs                           OMXCATBAR(IPBM_AddC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C1R                               OMXCATBAR(IPBM_Copy_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C3R                               OMXCATBAR(IPBM_Copy_U8_C3R, OMXIPBM_SUFFIX)
+#define omxIPBM_Mirror_U8_C1R                             OMXCATBAR(IPBM_Mirror_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_MulC_U8_C1R_Sfs                           OMXCATBAR(IPBM_MulC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+
+#define omxIPCS_ColorTwistQ14_U8_C3R                      OMXCATBAR(IPCS_ColorTwistQ14_U8_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr420RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr420RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr420ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr422RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R          OMXCATBAR(IPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr422RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR565_U8_U16_C2C3R            OMXCATBAR(IPCS_YCbYCr422ToBGR565_U8_U16_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR888_U8_C2C3R                OMXCATBAR(IPCS_YCbYCr422ToBGR888_U8_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R        OMXCATBAR(IPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToYCbCr420Rotate_U8_P3R           OMXCATBAR(IPCS_YCbCr422ToYCbCr420Rotate_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_C3R               OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR888_U8_C3R                   OMXCATBAR(IPCS_YCbCr444ToBGR888_U8_C3R, OMXIPCS_SUFFIX)
+
+#define omxIPPP_Deblock_HorEdge_U8_I                      OMXCATBAR(IPPP_Deblock_HorEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_Deblock_VerEdge_U8_I                      OMXCATBAR(IPPP_Deblock_VerEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterFIR_U8_C1R                          OMXCATBAR(IPPP_FilterFIR_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterMedian_U8_C1R                       OMXCATBAR(IPPP_FilterMedian_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_GetCentralMoment_S64                      OMXCATBAR(IPPP_GetCentralMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_GetSpatialMoment_S64                      OMXCATBAR(IPPP_GetSpatialMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentGetStateSize                        OMXCATBAR(IPPP_MomentGetStateSize, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentInit                                OMXCATBAR(IPPP_MomentInit, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C1R                            OMXCATBAR(IPPP_Moments_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C3R                            OMXCATBAR(IPPP_Moments_U8_C3R, OMXIPPP_SUFFIX)
+
+#define omxSP_BlockExp_S16                                OMXCATBAR(SP_BlockExp_S16, OMXSP_SUFFIX)
+#define omxSP_BlockExp_S32                                OMXCATBAR(SP_BlockExp_S32, OMXSP_SUFFIX)
+#define omxSP_Copy_S16                                    OMXCATBAR(SP_Copy_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16                                 OMXCATBAR(SP_DotProd_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16_Sfs                             OMXCATBAR(SP_DotProd_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S16S32_Sfs                    OMXCATBAR(SP_FFTFwd_RToCCS_S16S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S32_Sfs                       OMXCATBAR(SP_FFTFwd_RToCCS_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC16                        OMXCATBAR(SP_FFTGetBufSize_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC32                        OMXCATBAR(SP_FFTGetBufSize_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S16S32                      OMXCATBAR(SP_FFTGetBufSize_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S32                         OMXCATBAR(SP_FFTGetBufSize_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC16                              OMXCATBAR(SP_FFTInit_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC32                              OMXCATBAR(SP_FFTInit_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S16S32                            OMXCATBAR(SP_FFTInit_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S32                               OMXCATBAR(SP_FFTInit_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32_Sfs                       OMXCATBAR(SP_FFTInv_CCSToR_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32S16_Sfs                    OMXCATBAR(SP_FFTInv_CCSToR_S32S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32                            OMXCATBAR(SP_FilterMedian_S32, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32_I                          OMXCATBAR(SP_FilterMedian_S32_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16                              OMXCATBAR(SP_FIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_I                            OMXCATBAR(SP_FIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_ISfs                         OMXCATBAR(SP_FIR_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_Sfs                          OMXCATBAR(SP_FIR_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16                           OMXCATBAR(SP_FIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_I                         OMXCATBAR(SP_FIROne_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_ISfs                      OMXCATBAR(SP_FIROne_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_Sfs                       OMXCATBAR(SP_FIROne_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16                        OMXCATBAR(SP_IIR_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16_I                      OMXCATBAR(SP_IIR_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16                              OMXCATBAR(SP_IIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16_I                            OMXCATBAR(SP_IIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16                     OMXCATBAR(SP_IIROne_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16_I                   OMXCATBAR(SP_IIROne_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16                           OMXCATBAR(SP_IIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16_I                         OMXCATBAR(SP_IIROne_Direct_S16_I, OMXSP_SUFFIX)
+
+#define omxVCCOMM_Average_16x                             OMXCATBAR(VCCOMM_Average_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Average_8x                              OMXCATBAR(VCCOMM_Average_8x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock                OMXCATBAR(VCCOMM_ComputeTextureErrorBlock, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock_SAD            OMXCATBAR(VCCOMM_ComputeTextureErrorBlock_SAD, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy16x16                               OMXCATBAR(VCCOMM_Copy16x16, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy8x8                                 OMXCATBAR(VCCOMM_Copy8x8, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ExpandFrame_I                           OMXCATBAR(VCCOMM_ExpandFrame_I, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_LimitMVToRect                           OMXCATBAR(VCCOMM_LimitMVToRect, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_16x                                 OMXCATBAR(VCCOMM_SAD_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_8x                                  OMXCATBAR(VCCOMM_SAD_8x, OMXVCCOMM_SUFFIX)
+
+#define omxVCM4P10_Average_4x                             OMXCATBAR(VCM4P10_Average_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Half                        OMXCATBAR(VCM4P10_BlockMatch_Half, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Integer                     OMXCATBAR(VCM4P10_BlockMatch_Integer, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Quarter                     OMXCATBAR(VCM4P10_BlockMatch_Quarter, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockChroma_I                        OMXCATBAR(VCM4P10_DeblockChroma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockLuma_I                          OMXCATBAR(VCM4P10_DeblockLuma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC        OMXCATBAR(VCM4P10_DecodeChromaDcCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeCoeffsToPairCAVLC                OMXCATBAR(VCM4P10_DecodeCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DequantTransformResidualFromPairAndAdd OMXCATBAR(VCM4P10_DequantTransformResidualFromPairAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_HorEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_VerEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_HorEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_VerEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_GetVLCInfo                             OMXCATBAR(VCM4P10_GetVLCInfo, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateChroma                      OMXCATBAR(VCM4P10_InterpolateChroma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfHor_Luma                OMXCATBAR(VCM4P10_InterpolateHalfHor_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfVer_Luma                OMXCATBAR(VCM4P10_InterpolateHalfVer_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateLuma                        OMXCATBAR(VCM4P10_InterpolateLuma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_ChromaDC           OMXCATBAR(VCM4P10_InvTransformDequant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_LumaDC             OMXCATBAR(VCM4P10_InvTransformDequant_LumaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformResidualAndAdd             OMXCATBAR(VCM4P10_InvTransformResidualAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEGetBufSize                           OMXCATBAR(VCM4P10_MEGetBufSize, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEInit                                 OMXCATBAR(VCM4P10_MEInit, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MotionEstimationMB                     OMXCATBAR(VCM4P10_MotionEstimationMB, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_16x16                     OMXCATBAR(VCM4P10_PredictIntra_16x16, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_4x4                       OMXCATBAR(VCM4P10_PredictIntra_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntraChroma_8x8                  OMXCATBAR(VCM4P10_PredictIntraChroma_8x8, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SAD_4x                                 OMXCATBAR(VCM4P10_SAD_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_16x                            OMXCATBAR(VCM4P10_SADQuar_16x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_4x                             OMXCATBAR(VCM4P10_SADQuar_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_8x                             OMXCATBAR(VCM4P10_SADQuar_8x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SATD_4x4                               OMXCATBAR(VCM4P10_SATD_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SubAndTransformQDQResidual             OMXCATBAR(VCM4P10_SubAndTransformQDQResidual, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantChromaDCFromPair       OMXCATBAR(VCM4P10_TransformDequantChromaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantLumaDCFromPair         OMXCATBAR(VCM4P10_TransformDequantLumaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_ChromaDC                OMXCATBAR(VCM4P10_TransformQuant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_LumaDC                  OMXCATBAR(VCM4P10_TransformQuant_LumaDC, OMXVCM4P10_SUFFIX)
+
+#define omxVCM4P2_BlockMatch_Half_16x16                   OMXCATBAR(VCM4P2_BlockMatch_Half_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Half_8x8                     OMXCATBAR(VCM4P2_BlockMatch_Half_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_16x16                OMXCATBAR(VCM4P2_BlockMatch_Integer_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_8x8                  OMXCATBAR(VCM4P2_BlockMatch_Integer_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DCT8x8blk                               OMXCATBAR(VCM4P2_DCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Inter                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Intra                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Intra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodePadMV_PVOP                        OMXCATBAR(VCM4P2_DecodePadMV_PVOP, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_DecodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeMV                                OMXCATBAR(VCM4P2_EncodeMV, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_EncodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_FindMVpred                              OMXCATBAR(VCM4P2_FindMVpred, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_IDCT8x8blk                              OMXCATBAR(VCM4P2_IDCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MCReconBlock                            OMXCATBAR(VCM4P2_MCReconBlock, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEGetBufSize                            OMXCATBAR(VCM4P2_MEGetBufSize, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEInit                                  OMXCATBAR(VCM4P2_MEInit, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MotionEstimationMB                      OMXCATBAR(VCM4P2_MotionEstimationMB, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_PredictReconCoefIntra                   OMXCATBAR(VCM4P2_PredictReconCoefIntra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInter_I                            OMXCATBAR(VCM4P2_QuantInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantIntra_I                            OMXCATBAR(VCM4P2_QuantIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvInter_I                         OMXCATBAR(VCM4P2_QuantInvInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvIntra_I                         OMXCATBAR(VCM4P2_QuantInvIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_inter                 OMXCATBAR(VCM4P2_TransRecBlockCoef_inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_intra                 OMXCATBAR(VCM4P2_TransRecBlockCoef_intra, OMXVCM4P2_SUFFIX)
+
+
+#endif /* _armOMX_h_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
new file mode 100755
index 0000000..8b295a6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
@@ -0,0 +1,252 @@
+/**
+ * File: omxtypes.h
+ * Brief: Defines basic Data types used in OpenMAX v1.0.2 header files.
+ *
+ * Copyright � 2005-2008 The Khronos Group Inc. All Rights Reserved. 
+ *
+ * These materials are protected by copyright laws and contain material 
+ * proprietary to the Khronos Group, Inc.  You may use these materials 
+ * for implementing Khronos specifications, without altering or removing 
+ * any trademark, copyright or other notice from the specification.
+ * 
+ * Khronos Group makes no, and expressly disclaims any, representations 
+ * or warranties, express or implied, regarding these materials, including, 
+ * without limitation, any implied warranties of merchantability or fitness 
+ * for a particular purpose or non-infringement of any intellectual property. 
+ * Khronos Group makes no, and expressly disclaims any, warranties, express 
+ * or implied, regarding the correctness, accuracy, completeness, timeliness, 
+ * and reliability of these materials. 
+ *
+ * Under no circumstances will the Khronos Group, or any of its Promoters, 
+ * Contributors or Members or their respective partners, officers, directors, 
+ * employees, agents or representatives be liable for any damages, whether 
+ * direct, indirect, special or consequential damages for lost revenues, 
+ * lost profits, or otherwise, arising from or in connection with these 
+ * materials.
+ * 
+ * Khronos and OpenMAX are trademarks of the Khronos Group Inc. 
+ *
+ */
+  
+#ifndef _OMXTYPES_H_
+#define _OMXTYPES_H_
+
+#include <limits.h> 
+
+#define OMX_IN
+#define OMX_OUT
+#define OMX_INOUT
+
+
+typedef enum {
+    
+    /* Mandatory return codes - use cases are explicitly described for each function */
+    OMX_Sts_NoErr                    =  0,    /* No error, the function completed successfully */
+    OMX_Sts_Err                      = -2,    /* Unknown/unspecified error */    
+    OMX_Sts_InvalidBitstreamValErr   = -182,  /* Invalid value detected during bitstream processing */    
+    OMX_Sts_MemAllocErr              = -9,    /* Not enough memory allocated for the operation */
+    OMX_StsACAAC_GainCtrErr    	     = -159,  /* AAC: Unsupported gain control data detected */
+    OMX_StsACAAC_PrgNumErr           = -167,  /* AAC: Invalid number of elements for one program   */
+    OMX_StsACAAC_CoefValErr          = -163,  /* AAC: Invalid quantized coefficient value          */     
+    OMX_StsACAAC_MaxSfbErr           = -162,  /* AAC: Invalid maxSfb value in relation to numSwb */    
+	OMX_StsACAAC_PlsDataErr		     = -160,  /* AAC: pulse escape sequence data error */
+
+    /* Optional return codes - use cases are explicitly described for each function*/
+    OMX_Sts_BadArgErr                = -5,    /* Bad Arguments */
+
+    OMX_StsACAAC_TnsNumFiltErr       = -157,  /* AAC: Invalid number of TNS filters  */
+    OMX_StsACAAC_TnsLenErr           = -156,  /* AAC: Invalid TNS region length  */   
+    OMX_StsACAAC_TnsOrderErr         = -155,  /* AAC: Invalid order of TNS filter  */                  
+    OMX_StsACAAC_TnsCoefResErr       = -154,  /* AAC: Invalid bit-resolution for TNS filter coefficients  */
+    OMX_StsACAAC_TnsCoefErr          = -153,  /* AAC: Invalid TNS filter coefficients  */                  
+    OMX_StsACAAC_TnsDirectErr        = -152,  /* AAC: Invalid TNS filter direction  */  
+
+    OMX_StsICJP_JPEGMarkerErr        = -183,  /* JPEG marker encountered within an entropy-coded block; */
+                                              /* Huffman decoding operation terminated early.           */
+    OMX_StsICJP_JPEGMarker           = -181,  /* JPEG marker encountered; Huffman decoding */
+                                              /* operation terminated early.                         */
+    OMX_StsIPPP_ContextMatchErr      = -17,   /* Context parameter doesn't match to the operation */
+
+    OMX_StsSP_EvenMedianMaskSizeErr  = -180,  /* Even size of the Median Filter mask was replaced by the odd one */
+
+    OMX_Sts_MaximumEnumeration       = INT_MAX  /*Placeholder, forces enum of size OMX_INT*/
+    
+ } OMXResult;          /** Return value or error value returned from a function. Identical to OMX_INT */
+
+ 
+/* OMX_U8 */
+#if UCHAR_MAX == 0xff
+typedef unsigned char OMX_U8;
+#elif USHRT_MAX == 0xff 
+typedef unsigned short int OMX_U8; 
+#else
+#error OMX_U8 undefined
+#endif 
+
+ 
+/* OMX_S8 */
+#if SCHAR_MAX == 0x7f 
+typedef signed char OMX_S8;
+#elif SHRT_MAX == 0x7f 
+typedef signed short int OMX_S8; 
+#else
+#error OMX_S8 undefined
+#endif
+ 
+ 
+/* OMX_U16 */
+#if USHRT_MAX == 0xffff
+typedef unsigned short int OMX_U16;
+#elif UINT_MAX == 0xffff
+typedef unsigned int OMX_U16; 
+#else
+#error OMX_U16 undefined
+#endif
+
+
+/* OMX_S16 */
+#if SHRT_MAX == 0x7fff 
+typedef signed short int OMX_S16;
+#elif INT_MAX == 0x7fff 
+typedef signed int OMX_S16; 
+#else
+#error OMX_S16 undefined
+#endif
+
+
+/* OMX_U32 */
+#if UINT_MAX == 0xffffffff
+typedef unsigned int OMX_U32;
+#elif LONG_MAX == 0xffffffff
+typedef unsigned long int OMX_U32; 
+#else
+#error OMX_U32 undefined
+#endif
+
+
+/* OMX_S32 */
+#if INT_MAX == 0x7fffffff
+typedef signed int OMX_S32;
+#elif LONG_MAX == 0x7fffffff
+typedef long signed int OMX_S32; 
+#else
+#error OMX_S32 undefined
+#endif
+
+
+/* OMX_U64 & OMX_S64 */
+#if defined( _WIN32 ) || defined ( _WIN64 )
+    typedef __int64 OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned __int64 OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(0x8000000000000000i64)
+    #define OMX_MIN_U64			(0x0000000000000000i64)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFi64)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFi64)
+#else
+    typedef long long OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned long long OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(0x8000000000000000LL)
+    #define OMX_MIN_U64			(0x0000000000000000LL)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFLL)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFLL)
+#endif
+
+
+/* OMX_SC8 */
+typedef struct
+{
+  OMX_S8 Re; /** Real part */
+  OMX_S8 Im; /** Imaginary part */	
+	
+} OMX_SC8; /** Signed 8-bit complex number */
+
+
+/* OMX_SC16 */
+typedef struct
+{
+  OMX_S16 Re; /** Real part */
+  OMX_S16 Im; /** Imaginary part */	
+	
+} OMX_SC16; /** Signed 16-bit complex number */
+
+
+/* OMX_SC32 */
+typedef struct
+{
+  OMX_S32 Re; /** Real part */
+  OMX_S32 Im; /** Imaginary part */	
+	
+} OMX_SC32; /** Signed 32-bit complex number */
+
+
+/* OMX_SC64 */
+typedef struct
+{
+  OMX_S64 Re; /** Real part */
+  OMX_S64 Im; /** Imaginary part */	
+	
+} OMX_SC64; /** Signed 64-bit complex number */
+
+
+/* OMX_F32 */
+typedef float OMX_F32; /** Single precision floating point,IEEE 754 */
+
+
+/* OMX_F64 */
+typedef double OMX_F64; /** Double precision floating point,IEEE 754 */
+
+
+/* OMX_INT */
+typedef int OMX_INT; /** signed integer corresponding to machine word length, has maximum signed value INT_MAX*/
+
+
+#define OMX_MIN_S8  	   	(-128)
+#define OMX_MIN_U8  		0
+#define OMX_MIN_S16		 	(-32768)
+#define OMX_MIN_U16			0
+#define OMX_MIN_S32			(-2147483647-1)
+#define OMX_MIN_U32			0
+
+#define OMX_MAX_S8			(127)
+#define OMX_MAX_U8			(255)
+#define OMX_MAX_S16			(32767)
+#define OMX_MAX_U16			(0xFFFF)
+#define OMX_MAX_S32			(2147483647)
+#define OMX_MAX_U32			(0xFFFFFFFF)
+
+typedef void OMXVoid;
+
+#ifndef NULL
+#define NULL ((void*)0)
+#endif
+
+/** Defines the geometric position and size of a rectangle, 
+  * where x,y defines the coordinates of the top left corner
+  * of the rectangle, with dimensions width in the x-direction 
+  * and height in the y-direction */
+typedef struct {
+	OMX_INT x;      /** x-coordinate of top left corner of rectangle */
+	OMX_INT y;      /** y-coordinate of top left corner of rectangle */
+	OMX_INT width;  /** Width in the x-direction. */
+	OMX_INT height; /** Height in the y-direction. */
+}OMXRect;
+
+
+/** Defines the geometric position of a point, */
+typedef struct 
+{
+ OMX_INT x; /** x-coordinate */
+ OMX_INT y;	/** y-coordinate */
+	
+} OMXPoint;
+
+
+/** Defines the dimensions of a rectangle, or region of interest in an image */
+typedef struct 
+{
+ OMX_INT width;  /** Width of the rectangle, in the x-direction */
+ OMX_INT height; /** Height of the rectangle, in the y-direction */
+	
+} OMXSize;
+
+#endif /* _OMXTYPES_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
new file mode 100755
index 0000000..48703d1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
@@ -0,0 +1,77 @@
+;//
+;// 
+;// File Name:  omxtypes_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+;// Mandatory return codes - use cases are explicitly described for each function 
+OMX_Sts_NoErr                    EQU  0    ;// No error the function completed successfully 
+OMX_Sts_Err                      EQU -2    ;// Unknown/unspecified error     
+OMX_Sts_InvalidBitstreamValErr   EQU -182  ;// Invalid value detected during bitstream processing     
+OMX_Sts_MemAllocErr              EQU -9    ;// Not enough memory allocated for the operation 
+OMX_StsACAAC_GainCtrErr    	     EQU -159  ;// AAC: Unsupported gain control data detected 
+OMX_StsACAAC_PrgNumErr           EQU -167  ;// AAC: Invalid number of elements for one program   
+OMX_StsACAAC_CoefValErr          EQU -163  ;// AAC: Invalid quantized coefficient value               
+OMX_StsACAAC_MaxSfbErr           EQU -162  ;// AAC: Invalid maxSfb value in relation to numSwb     
+OMX_StsACAAC_PlsDataErr		     EQU -160  ;// AAC: pulse escape sequence data error 
+
+;// Optional return codes - use cases are explicitly described for each function
+OMX_Sts_BadArgErr                EQU -5    ;// Bad Arguments 
+
+OMX_StsACAAC_TnsNumFiltErr       EQU -157  ;// AAC: Invalid number of TNS filters  
+OMX_StsACAAC_TnsLenErr           EQU -156  ;// AAC: Invalid TNS region length     
+OMX_StsACAAC_TnsOrderErr         EQU -155  ;// AAC: Invalid order of TNS filter                    
+OMX_StsACAAC_TnsCoefResErr       EQU -154  ;// AAC: Invalid bit-resolution for TNS filter coefficients  
+OMX_StsACAAC_TnsCoefErr          EQU -153  ;// AAC: Invalid TNS filter coefficients                    
+OMX_StsACAAC_TnsDirectErr        EQU -152  ;// AAC: Invalid TNS filter direction    
+
+OMX_StsICJP_JPEGMarkerErr        EQU -183  ;// JPEG marker encountered within an entropy-coded block; 
+                                            ;// Huffman decoding operation terminated early.           
+OMX_StsICJP_JPEGMarker           EQU -181  ;// JPEG marker encountered; Huffman decoding 
+                                            ;// operation terminated early.                         
+OMX_StsIPPP_ContextMatchErr      EQU -17   ;// Context parameter doesn't match to the operation 
+
+OMX_StsSP_EvenMedianMaskSizeErr  EQU -180  ;// Even size of the Median Filter mask was replaced by the odd one 
+
+OMX_Sts_MaximumEnumeration       EQU 0x7FFFFFFF
+
+
+
+OMX_MIN_S8      EQU 	   	(-128)
+OMX_MIN_U8  	EQU     	0
+OMX_MIN_S16		EQU      	(-32768)
+OMX_MIN_U16		EQU	        0
+
+
+OMX_MIN_S32		EQU	(-2147483647-1)
+OMX_MIN_U32		EQU	0
+
+OMX_MAX_S8		EQU	(127)
+OMX_MAX_U8		EQU	(255)
+OMX_MAX_S16		EQU	(32767)
+OMX_MAX_U16		EQU	(0xFFFF)
+OMX_MAX_S32		EQU	(2147483647)
+OMX_MAX_U32		EQU	(0xFFFFFFFF)
+
+OMX_VC_UPPER    EQU 0x1                 ;// Used by the PredictIntra functions   
+OMX_VC_LEFT     EQU 0x2                 ;// Used by the PredictIntra functions 
+OMX_VC_UPPER_RIGHT    EQU 0x40          ;// Used by the PredictIntra functions   
+
+NULL    EQU 0
+
+;// Structures
+
+    INCLUDE     armCOMM_s.h
+
+    M_STRUCT    OMXPoint
+    M_FIELD     x, 4
+    M_FIELD     y, 4
+    M_ENDSTRUCT
+
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/build_vc.pl b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/build_vc.pl
new file mode 100755
index 0000000..649e74c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/build_vc.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/perl
+#
+# 
+# File Name:  build_vc.pl
+# OpenMAX DL: v1.0.2
+# Revision:   12290
+# Date:       Wednesday, April 9, 2008
+# 
+# (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+# 
+# 
+#
+# This file builds the OpenMAX DL vc domain library omxVC.o.
+#
+
+use File::Spec;
+use strict;
+
+my ($CC, $CC_OPTS, $AS, $AS_OPTS, $LIB, $LIB_OPTS, $LIB_TYPE);
+
+$CC       = 'armcc';
+$CC_OPTS  = '--no_unaligned_access --cpu Cortex-A8 -c';
+$AS       = 'armasm';
+$AS_OPTS  = '--no_unaligned_access --cpu Cortex-A8';
+# $LIB      = 'armlink';
+# $LIB_OPTS = '--partial -o';
+# $LIB_TYPE = '.o';
+$LIB      = 'armar';
+$LIB_OPTS = '--create -r';
+$LIB_TYPE = '.a';
+
+#------------------------
+
+my (@headerlist, @filelist, $hd, $file, $ofile, $command, $objlist, $libfile, $h);
+
+# Define the list of directories containing included header files.
+@headerlist = qw(api vc/api vc/m4p2/api vc/m4p10/api);
+
+# Define the list of source files to compile.
+open(FILES, '<filelist_vc.txt') or die("Can't open source file list\n");
+@filelist = <FILES>;
+close(FILES);
+
+# Fix the file separators in the header paths
+foreach $h (@headerlist)
+{
+        $h = File::Spec->canonpath($h);
+}
+
+# Create the include path to be passed to the compiler
+$hd = '-I' . join(' -I', @headerlist);
+
+# Create the build directories "/lib/" and "/obj/" (if they are not there already)
+mkdir "obj", 0777 if (! -d "obj");
+mkdir "lib", 0777 if (! -d "lib");
+
+$objlist = '';
+
+# Compile each file
+foreach $file (@filelist)
+{
+	my $f;
+	my $base;
+	my $ext;
+	my $objfile;
+
+	chomp($file);
+	$file = File::Spec->canonpath($file);
+
+	(undef, undef, $f) = File::Spec->splitpath($file);
+    $f=~s/[\n\f\r]//g; # Remove any end-of-line characters
+
+	if(($base, $ext) = $f =~ /(.+)\.(\w)$/)
+	{
+		$objfile = File::Spec->catfile('obj', $base.'.o');
+
+		if($ext eq 'c')
+		{
+			$objlist .= "$objfile ";
+			$command = $CC.' '.$CC_OPTS.' '.$hd.' -o '.$objfile.' '.$file;
+			print "$command\n";
+			system($command);
+		}
+		elsif($ext eq 's')
+		{
+			$objlist .= "$objfile ";
+			$command = $AS.' '.$AS_OPTS.' '.$hd.' -o '.$objfile.' '.$file;
+			print "$command\n";
+			system($command);
+		}
+		else
+		{
+			print "Ignoring file: $f\n";
+		}
+	}
+	else
+	{
+		die "No file extension found: $f\n";
+	}
+}
+
+# Do the final link stage to create the libraries.
+$libfile = File::Spec->catfile('lib', 'omxVC'.$LIB_TYPE);
+$command = $LIB.' '.$LIB_OPTS.' '.$libfile.' '.$objlist;
+print "$command\n";
+(system($command) == 0) and print "Build successful\n";
+
+
+
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/filelist_vc.txt b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/filelist_vc.txt
new file mode 100755
index 0000000..8db8eeb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/filelist_vc.txt
@@ -0,0 +1,75 @@
+./api/armCOMM.h
+./api/armCOMM_BitDec_s.h
+./api/armCOMM_Bitstream.h
+./api/armCOMM_IDCT_s.h
+./api/armCOMM_IDCTTable.h
+./api/armCOMM_MaskTable.h
+./api/armCOMM_s.h
+./api/armCOMM_Version.h
+./api/armOMX_ReleaseVersion.h
+./api/omxtypes.h
+./api/omxtypes_s.h
+./src/armCOMM_IDCTTable.c
+./src/armCOMM_MaskTable.c
+./vc/api/armVC.h
+./vc/api/armVCCOMM_s.h
+./vc/api/omxVC.h
+./vc/api/omxVC_s.h
+./vc/comm/src/omxVCCOMM_Copy16x16_s.s
+./vc/comm/src/omxVCCOMM_Copy8x8_s.s
+./vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
+./vc/m4p10/api/armVCM4P10_CAVLCTables.h
+./vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_CAVLCTables.c
+./vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
+./vc/m4p10/src/armVCM4P10_DequantTables_s.s
+./vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_QuantTables_s.s
+./vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
+./vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
+./vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
+./vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
+./vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
+./vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
+./vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
+./vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
+./vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
+./vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
+./vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
+./vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
+./vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+./vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
+./vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
+./vc/m4p2/src/armVCM4P2_Clip8_s.s
+./vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
+./vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
+./vc/m4p2/src/armVCM4P2_Lookup_Tables.c
+./vc/m4p2/src/armVCM4P2_SetPredDir_s.s
+./vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
+./vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
+./vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
+./vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
+./vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
+./vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
+./vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
+./vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
+./vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
+./vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
+./vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
+./vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
+./vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
+./vc/src/armVC_Version.c
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM.c
new file mode 100755
index 0000000..e572a89
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM.c
@@ -0,0 +1,936 @@
+/**
+ * 
+ * File Name:  armCOMM.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   9641
+ * Date:       Thursday, February 7, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Defines Common APIs used across OpenMAX API's
+ */
+
+#include "omxtypes.h"
+#include "armCOMM.h"
+
+/***********************************************************************/
+                /* Miscellaneous Arithmetic operations */
+
+/**
+ * Function: armRoundFloatToS16
+ *
+ * Description:
+ * Converts a double precision value into a short int after rounding
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16 format
+ *
+ */
+
+OMX_S16 armRoundFloatToS16 (OMX_F64 Value)
+{
+    if (Value > 0)
+    {
+        return (OMX_S16)(Value + .5);
+    }
+    else
+    {
+        return (OMX_S16)(Value - .5);
+    }
+}
+
+/**
+ * Function: armRoundFloatToS32
+ *
+ * Description:
+ * Converts a double precision value into a int after rounding
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S32 format
+ *
+ */
+
+OMX_S32 armRoundFloatToS32 (OMX_F64 Value)
+{
+    if (Value > 0)
+    {
+        return (OMX_S32)(Value + .5);
+    }
+    else
+    {
+        return (OMX_S32)(Value - .5);
+    }
+}
+/**
+ * Function: armSatRoundFloatToS16
+ *
+ * Description:
+ * Converts a double precision value into a short int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16 format
+ *
+ */
+
+OMX_S16 armSatRoundFloatToS16 (OMX_F64 Value)
+{
+    if (Value > 0)
+    {
+        Value += 0.5;
+        
+        if(Value > (OMX_S16)OMX_MAX_S16 )
+        {
+            return (OMX_S16)OMX_MAX_S16;
+        }
+        else
+        {
+            return (OMX_S16)Value;
+        }
+    }
+    else
+    {
+        Value -= 0.5;
+
+        if(Value < (OMX_S16)OMX_MIN_S16 )
+        {
+            return (OMX_S16)OMX_MIN_S16;
+        }
+        else
+        {
+            return (OMX_S16)Value;
+        }
+    }
+}
+
+/**
+ * Function: armSatRoundFloatToS32
+ *
+ * Description:
+ * Converts a double precision value into a int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S32 format
+ *
+ */
+
+OMX_S32 armSatRoundFloatToS32 (OMX_F64 Value)
+{
+    if (Value > 0)
+    {
+        Value += 0.5;
+        
+        if(Value > (OMX_S32)OMX_MAX_S32 )
+        {
+            return (OMX_S32)OMX_MAX_S32;
+        }
+        else
+        {
+            return (OMX_S32)Value;
+        }
+    }
+    else
+    {
+        Value -= 0.5;
+
+        if(Value < (OMX_S32)OMX_MIN_S32 )
+        {
+            return (OMX_S32)OMX_MIN_S32;
+        }
+        else
+        {
+            return (OMX_S32)Value;
+        }
+    }
+}
+
+/**
+ * Function: armSatRoundFloatToU16
+ *
+ * Description:
+ * Converts a double precision value into a unsigned short int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U16 format
+ *
+ */
+
+OMX_U16 armSatRoundFloatToU16 (OMX_F64 Value)
+{
+    Value += 0.5;
+    
+    if(Value > (OMX_U16)OMX_MAX_U16 )
+    {
+        return (OMX_U16)OMX_MAX_U16;
+    }
+    else
+    {
+        return (OMX_U16)Value;
+    }
+}
+
+/**
+ * Function: armSatRoundFloatToU32
+ *
+ * Description:
+ * Converts a double precision value into a unsigned int after rounding and saturation
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U32 format
+ *
+ */
+
+OMX_U32 armSatRoundFloatToU32 (OMX_F64 Value)
+{
+    Value += 0.5;
+    
+    if(Value > (OMX_U32)OMX_MAX_U32 )
+    {
+        return (OMX_U32)OMX_MAX_U32;
+    }
+    else
+    {
+        return (OMX_U32)Value;
+    }
+}
+
+/**
+ * Function: armRoundFloatToS64
+ *
+ * Description:
+ * Converts a double precision value into a 64 bit int after rounding
+ *
+ * Parameters:
+ * [in]  Value                 Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S64 format
+ *
+ */
+
+OMX_S64 armRoundFloatToS64 (OMX_F64 Value)
+{
+    if (Value > 0)
+    {
+        return (OMX_S64)(Value + .5);
+    }
+    else
+    {
+        return (OMX_S64)(Value - .5);
+    }
+}
+
+/**
+ * Function: armSignCheck
+ *
+ * Description:
+ * Checks the sign of a variable:
+ * returns 1 if it is Positive
+ * returns 0 if it is 0
+ * returns -1 if it is Negative 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	    var     Variable to be checked
+ *
+ * Return Value:
+ * OMX_INT --   returns 1 if it is Positive
+ *              returns 0 if it is 0
+ *              returns -1 if it is Negative 
+ */ 
+
+OMX_INT armSignCheck (
+    OMX_S16 var
+)
+
+{
+    OMX_INT Sign;
+    
+    if (var < 0)
+    {
+        Sign = -1;
+    }
+    else if ( var > 0)
+    {
+        Sign = 1;
+    }
+    else
+    {
+        Sign = 0;
+    }
+    
+    return Sign;
+}
+
+/**
+ * Function: armClip
+ *
+ * Description: Clips the input between MAX and MIN value
+ * 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] Min     lower bound
+ * [in] Max     upper bound
+ * [in] src     variable to the clipped
+ *
+ * Return Value:
+ * OMX_S32 --   returns clipped value
+ */ 
+ 
+OMX_S32 armClip (
+    OMX_INT min,
+    OMX_INT max, 
+    OMX_S32 src 
+)
+ 
+{
+    if (src > max)
+    {
+        src = max;
+    }
+    else if (src < min)
+    {
+        src = min;
+    }
+    
+    return src;
+}
+
+/**
+ * Function: armClip_F32
+ *
+ * Description: Clips the input between MAX and MIN value
+ * 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] Min     lower bound
+ * [in] Max     upper bound
+ * [in] src     variable to the clipped
+ *
+ * Return Value:
+ * OMX_F32 --   returns clipped value
+ */ 
+ 
+OMX_F32 armClip_F32 (
+    OMX_F32 min,
+    OMX_F32 max, 
+    OMX_F32 src 
+)
+ 
+{
+    if (src > max)
+    {
+        src = max;
+    }
+    else if (src < min)
+    {
+        src = min;
+    }
+    
+    return src;
+}
+
+/**
+ * Function: armShiftSat_F32
+ *
+ * Description: Divides a float value by 2^shift and 
+ * saturates it for unsigned value range for satBits.
+ * Second parameter is like "shifting" the corresponding 
+ * integer value. Takes care of rounding while clipping the final 
+ * value.
+ *
+ * Parameters:
+ * [in] v          Number to be operated upon
+ * [in] shift      Divides the input "v" by "2^shift"
+ * [in] satBits    Final range is [0, 2^satBits)
+ *
+ * Return Value:
+ * OMX_S32 --   returns "shifted" saturated value
+ */ 
+ 
+OMX_U32 armShiftSat_F32(OMX_F32 v, OMX_INT shift, OMX_INT satBits) 
+{
+    OMX_U32 allOnes = (OMX_U32)(-1);
+    OMX_U32 maxV = allOnes >> (32-satBits);
+    OMX_F32 vShifted, vRounded, shiftDiv = (OMX_F32)(1 << shift);
+    OMX_U32 vInt;
+    OMX_U32 vIntSat;
+    
+    if(v <= 0)
+        return 0;
+    
+    vShifted = v / shiftDiv;
+    vRounded = (OMX_F32)(vShifted + 0.5);
+    vInt = (OMX_U32)vRounded;
+    vIntSat = vInt;
+    if(vIntSat > maxV) 
+        vIntSat = maxV;
+    return vIntSat;
+}
+
+/**
+ * Functions: armSwapElem
+ *
+ * Description:
+ * These function swaps two elements at the specified pointer locations.
+ * The size of each element could be anything as specified by <elemSize>
+ *
+ * Return Value:
+ * OMXResult -- Error status from the function
+ */
+OMXResult armSwapElem(
+        OMX_U8 *pBuf1,
+        OMX_U8 *pBuf2,
+        OMX_INT elemSize
+       )
+{
+    OMX_INT i;
+    OMX_U8 temp;
+    armRetArgErrIf(!pBuf1 || !pBuf2, OMX_Sts_BadArgErr);
+    
+    for(i = 0; i < elemSize; i++)
+    {
+        temp = *(pBuf1 + i);
+        *(pBuf1 + i) = *(pBuf2 + i);
+        *(pBuf2 + i) = temp;
+    }
+    return OMX_Sts_NoErr;
+}
+
+/**
+ * Function: armMedianOf3
+ *
+ * Description: Finds the median of three numbers
+ * 
+ * Remarks:
+ *
+ * Parameters:
+ * [in] fEntry     First entry
+ * [in] sEntry     second entry
+ * [in] tEntry     Third entry
+ *
+ * Return Value:
+ * OMX_S32 --   returns the median value
+ */ 
+ 
+OMX_S32 armMedianOf3 (
+    OMX_S32 fEntry,
+    OMX_S32 sEntry, 
+    OMX_S32 tEntry 
+)
+{
+    OMX_S32 a, b, c;
+    
+    a = armMin (fEntry, sEntry);
+    b = armMax (fEntry, sEntry);
+    c = armMin (b, tEntry);
+    return (armMax (a, c));
+}
+
+/**
+ * Function: armLogSize
+ *
+ * Description: Finds the size of a positive value and returns the same
+ * 
+ * Remarks:
+ *
+ * Parameters:
+ * [in] value    Positive value
+ *
+ * Return Value:
+ * OMX_U8 --     Returns the minimum number of bits required to represent the positive value. 
+                 This is the smallest k>=0 such that that value is less than (1<<k).
+ */ 
+ 
+OMX_U8 armLogSize (
+    OMX_U16 value 
+)
+{
+    OMX_U8 i;    
+    for ( i = 0; value > 0; value = value >> 1) 
+    {
+        i++;
+    }
+    return i;
+}
+
+/***********************************************************************/
+                /* Saturating Arithmetic operations */
+
+/**
+ * Function :armSatAdd_S32()
+ *
+ * Description :
+ *   Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ *    
+ **/
+ 
+OMX_S32 armSatAdd_S32(OMX_S32 Value1,OMX_S32 Value2)
+{
+    OMX_S32 Result;
+    
+    Result = Value1 + Value2;
+
+    if( (Value1^Value2) >= 0)
+    {
+        /*Same sign*/
+        if( (Result^Value1) >= 0)
+        {
+            /*Result has not saturated*/
+            return Result;
+        }
+        else
+        {
+            if(Value1 >= 0)
+            {
+                /*Result has saturated in positive side*/
+                return OMX_MAX_S32;
+            }
+            else
+            {
+                /*Result has saturated in negative side*/
+                return OMX_MIN_S32;
+            }
+        
+        }
+   
+    }
+    else
+    {
+        return Result;
+    }
+    
+}
+
+/**
+ * Function :armSatAdd_S64()
+ *
+ * Description :
+ *   Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ *    
+ **/
+ 
+OMX_S64 armSatAdd_S64(OMX_S64 Value1,OMX_S64 Value2)
+{
+    OMX_S64 Result;
+    
+    Result = Value1 + Value2;
+
+    if( (Value1^Value2) >= 0)
+    {
+        /*Same sign*/
+        if( (Result^Value1) >= 0)
+        {
+            /*Result has not saturated*/
+            return Result;
+        }
+        else
+        {
+            if(Value1 >= 0)
+            {
+                /*Result has saturated in positive side*/
+                Result = OMX_MAX_S64;
+                return Result;
+            }
+            else
+            {
+                /*Result has saturated in negative side*/
+                return OMX_MIN_S64;
+            }
+        
+        }
+   
+    }
+    else
+    {
+        return Result;
+    }
+    
+}
+
+/** Function :armSatSub_S32()
+ * 
+ * Description :
+ *     Returns the result of saturated substraction of the two inputs Value1, Value2
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ *
+ * Return:
+ * [out]             Result of operation
+ * 
+ **/
+
+OMX_S32 armSatSub_S32(OMX_S32 Value1,OMX_S32 Value2)
+{
+    OMX_S32 Result;
+    
+    Result = Value1 - Value2;
+
+    if( (Value1^Value2) < 0)
+    {
+        /*Opposite sign*/
+        if( (Result^Value1) >= 0)
+        {
+            /*Result has not saturated*/
+            return Result;
+        }
+        else
+        {
+            if(Value1 >= 0)
+            {
+                /*Result has saturated in positive side*/
+                return OMX_MAX_S32;
+            }
+            else
+            {
+                /*Result has saturated in negative side*/
+                return OMX_MIN_S32;
+            }
+        
+        }
+   
+    }
+    else
+    {
+        return Result;
+    }
+    
+}
+
+/**
+ * Function :armSatMac_S32()
+ *
+ * Description :
+ *     Returns the result of Multiplication of Value1 and Value2 and subesquent saturated
+ *     accumulation with Mac
+ *
+ * Parametrs:
+ * [in] Value1       First Operand
+ * [in] Value2       Second Operand
+ * [in] Mac          Accumulator
+ *
+ * Return:
+ * [out]             Result of operation
+ **/
+
+OMX_S32 armSatMac_S32(OMX_S32 Mac,OMX_S16 Value1,OMX_S16 Value2)
+{
+    OMX_S32 Result;
+    
+    Result = (OMX_S32)(Value1*Value2);
+    Result = armSatAdd_S32( Mac , Result );
+
+    return Result;    
+}
+
+/**
+ * Function :armSatMac_S16S32_S32
+ *
+ * Description :
+ *   Returns the result of saturated MAC operation of the three inputs delayElem, filTap , mac
+ *
+ *   mac = mac + Saturate_in_32Bits(delayElem * filTap)
+ *
+ * Parametrs:
+ * [in] delayElem    First 32 bit Operand
+ * [in] filTap       Second 16 bit Operand
+ * [in] mac          Result of MAC operation
+ *
+ * Return:
+ * [out]  mac        Result of operation
+ *    
+ **/
+ 
+OMX_S32 armSatMac_S16S32_S32(OMX_S32 mac, OMX_S32 delayElem, OMX_S16 filTap )
+{
+    
+    OMX_S32 result;
+
+    result = armSatMulS16S32_S32(filTap,delayElem); 
+
+    if ( result > OMX_MAX_S16 )
+    {
+        result = OMX_MAX_S32;
+    }
+    else if( result < OMX_MIN_S16 )
+    {
+        result = OMX_MIN_S32;
+    }
+    else
+    {
+        result = delayElem * filTap;
+    }
+
+    mac = armSatAdd_S32(mac,result);
+    
+    return mac;
+}
+
+
+/**
+ * Function :armSatRoundRightShift_S32_S16
+ *
+ * Description :
+ *   Returns the result of rounded right shift operation of input by the scalefactor
+ *
+ *   output = Saturate_in_16Bits( ( Right/LeftShift( (Round(input) , shift ) )
+ *
+ * Parametrs:
+ * [in] input       The input to be operated on
+ * [in] shift The shift number
+ *
+ * Return:
+ * [out]            Result of operation
+ *    
+ **/
+
+
+OMX_S16 armSatRoundRightShift_S32_S16(OMX_S32 input, OMX_INT shift)
+{
+    input = armSatRoundLeftShift_S32(input,-shift);
+
+    if ( input > OMX_MAX_S16 )
+    {
+        return (OMX_S16)OMX_MAX_S16;
+    }
+    else if (input < OMX_MIN_S16)
+    {
+        return (OMX_S16)OMX_MIN_S16;
+    }
+    else
+    {
+       return (OMX_S16)input;
+    }
+
+}
+
+/**
+ * Function :armSatRoundLeftShift_S32()
+ *
+ * Description :
+ *     Returns the result of saturating left-shift operation on input
+ *     Or rounded Right shift if the input Shift is negative.
+ *     
+ * Parametrs:
+ * [in] Value        Operand
+ * [in] Shift        Operand for shift operation
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+
+OMX_S32 armSatRoundLeftShift_S32(OMX_S32 Value, OMX_INT Shift)
+{
+    OMX_INT i;
+    
+    if (Shift < 0)
+    {
+        Shift = -Shift;
+        Value = armSatAdd_S32(Value, (1 << (Shift - 1)));
+        Value = Value >> Shift;
+    }
+    else
+    {
+        for (i = 0; i < Shift; i++)
+        {
+            Value = armSatAdd_S32(Value, Value);
+        }
+    }
+    return Value;
+}
+
+/**
+ * Function :armSatRoundLeftShift_S64()
+ *
+ * Description :
+ *     Returns the result of saturating left-shift operation on input
+ *     Or rounded Right shift if the input Shift is negative.
+ *
+ * Parametrs:
+ * [in] Value        Operand
+ * [in] shift        Operand for shift operation
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+ 
+OMX_S64 armSatRoundLeftShift_S64(OMX_S64 Value, OMX_INT Shift)
+{
+    OMX_INT i;
+    
+    if (Shift < 0)
+    {
+        Shift = -Shift;
+        Value = armSatAdd_S64(Value, ((OMX_S64)1 << (Shift - 1)));
+        Value = Value >> Shift;
+    }
+    else
+    {
+        for (i = 0; i < Shift; i++)
+        {
+            Value = armSatAdd_S64(Value, Value);
+        }
+    }
+    return Value;
+}
+
+/**
+ * Function :armSatMulS16S32_S32()
+ *
+ * Description :
+ *     Returns the result of a S16 data type multiplied with an S32 data type
+ *     in a S32 container
+ *
+ * Parametrs:
+ * [in] input1       Operand 1
+ * [in] input2       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+
+
+OMX_S32 armSatMulS16S32_S32(OMX_S16 input1,OMX_S32 input2)
+{
+    OMX_S16 hi2,lo1;
+    OMX_U16 lo2;
+    
+    OMX_S32 temp1,temp2;
+    OMX_S32 result;
+    
+    lo1  = input1;
+
+    hi2  = ( input2 >>  16 );
+    lo2  = ( (OMX_U32)( input2 << 16 ) >> 16 );
+    
+    temp1 = hi2 * lo1;
+    temp2 = ( lo2* lo1 ) >> 16;
+
+    result =  armSatAdd_S32(temp1,temp2);
+
+    return result;
+}
+
+/**
+ * Function :armSatMulS32S32_S32()
+ *
+ * Description :
+ *     Returns the result of a S32 data type multiplied with an S32 data type
+ *     in a S32 container
+ *
+ * Parametrs:
+ * [in] input1       Operand 1
+ * [in] input2       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation
+ *    
+ **/
+
+OMX_S32 armSatMulS32S32_S32(OMX_S32 input1,OMX_S32 input2)
+{
+    OMX_S16 hi1,hi2;
+    OMX_U16 lo1,lo2;
+    
+    OMX_S32 temp1,temp2,temp3;
+    OMX_S32 result;
+
+    hi1  = ( input1 >>  16 );
+    lo1  = ( (OMX_U32)( input1 << 16 ) >> 16 );
+
+    hi2  = ( input2 >>  16 );
+    lo2  = ( (OMX_U32)( input2 << 16 ) >> 16 );
+    
+    temp1 =   hi1 * hi2;
+    temp2 = ( hi1* lo2 ) >> 16;
+    temp3 = ( hi2* lo1 ) >> 16;
+
+    result = armSatAdd_S32(temp1,temp2);
+    result = armSatAdd_S32(result,temp3);
+
+    return result;
+}
+
+/**
+ * Function :armIntDivAwayFromZero()
+ *
+ * Description : Integer division with rounding to the nearest integer. 
+ *               Half-integer values are rounded away from zero
+ *               unless otherwise specified. For example 3//2 is rounded 
+ *               to 2, and -3//2 is rounded to -2.
+ *
+ * Parametrs:
+ * [in] Num        Operand 1
+ * [in] Deno       Operand 2
+ *
+ * Return:
+ * [out]             Result of operation input1//input2
+ *    
+ **/
+
+OMX_S32 armIntDivAwayFromZero (OMX_S32 Num, OMX_S32 Deno)
+{
+    OMX_F64 result;
+    
+    result = ((OMX_F64)Num)/((OMX_F64)Deno);
+    
+    if (result >= 0)
+    {
+        result += 0.5;
+    }
+    else
+    {
+        result -= 0.5;
+    }
+
+    return (OMX_S32)(result);
+}
+
+
+/*End of File*/
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_Bitstream.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_Bitstream.c
new file mode 100755
index 0000000..9ef9319
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_Bitstream.c
@@ -0,0 +1,329 @@
+/**
+ * 
+ * File Name:  armCOMM_Bitstream.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   9641
+ * Date:       Thursday, February 7, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Defines bitstream encode and decode functions common to all codecs
+ */
+
+#include "omxtypes.h"
+#include "armCOMM.h"
+#include "armCOMM_Bitstream.h"
+
+/***************************************
+ * Fixed bit length Decode
+ ***************************************/
+
+/**
+ * Function: armLookAheadBits()
+ *
+ * Description:
+ * Get the next N bits from the bitstream without advancing the bitstream pointer
+ *
+ * Parameters:
+ * [in]     **ppBitStream
+ * [in]     *pOffset
+ * [in]     N=1...32
+ *
+ * Returns  Value
+ */
+
+OMX_U32 armLookAheadBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N)
+{
+    const OMX_U8 *pBitStream = *ppBitStream;
+    OMX_INT Offset = *pOffset;
+    OMX_U32 Value;
+
+    armAssert(Offset>=0 && Offset<=7);
+    armAssert(N>=1 && N<=32);
+
+    /* Read next 32 bits from stream */
+    Value = (pBitStream[0] << 24 ) | ( pBitStream[1] << 16)  | (pBitStream[2] << 8 ) | (pBitStream[3]) ;
+    Value = (Value << Offset ) | (pBitStream[4] >> (8-Offset));
+
+    /* Return N bits */
+    return Value >> (32-N);
+}
+
+
+/**
+ * Function: armGetBits()
+ *
+ * Description:
+ * Read N bits from the bitstream
+ *    
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ * [in]     N=1..32
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ * Returns  Value
+ */
+
+
+OMX_U32 armGetBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N)
+{
+    const OMX_U8 *pBitStream = *ppBitStream;
+    OMX_INT Offset = *pOffset;
+    OMX_U32 Value;
+    
+    if(N == 0)
+    {
+      return 0;
+    }
+
+    armAssert(Offset>=0 && Offset<=7);
+    armAssert(N>=1 && N<=32);
+
+    /* Read next 32 bits from stream */
+    Value = (pBitStream[0] << 24 ) | ( pBitStream[1] << 16)  | (pBitStream[2] << 8 ) | (pBitStream[3]) ;
+    Value = (Value << Offset ) | (pBitStream[4] >> (8-Offset));
+
+    /* Advance bitstream pointer by N bits */
+    Offset += N;
+    *ppBitStream = pBitStream + (Offset>>3);
+    *pOffset = Offset & 7;
+
+    /* Return N bits */
+    return Value >> (32-N);
+}
+
+/**
+ * Function: armByteAlign()
+ *
+ * Description:
+ * Align the pointer *ppBitStream to the next byte boundary
+ *
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ *
+ **/
+ 
+OMXVoid armByteAlign(const OMX_U8 **ppBitStream,OMX_INT *pOffset)
+{
+    if(*pOffset > 0)
+    {
+        *ppBitStream += 1;
+        *pOffset = 0;
+    }    
+}
+
+/** 
+ * Function: armSkipBits()
+ *
+ * Description:
+ * Skip N bits from the value at *ppBitStream
+ *
+ * Parameters:
+ * [in]     *ppBitStream
+ * [in]     *pOffset
+ * [in]     N
+ *
+ * [out]    *ppBitStream
+ * [out]    *pOffset
+ *
+ **/
+
+
+OMXVoid armSkipBits(const OMX_U8 **ppBitStream,OMX_INT *pOffset,OMX_INT N)
+{
+    OMX_INT Offset = *pOffset;
+    const OMX_U8 *pBitStream = *ppBitStream;
+   
+    /* Advance bitstream pointer by N bits */
+    Offset += N;
+    *ppBitStream = pBitStream + (Offset>>3);
+    *pOffset = Offset & 7;
+}
+
+/***************************************
+ * Variable bit length Decode
+ ***************************************/
+
+/**
+ * Function: armUnPackVLC32()
+ *
+ * Description:
+ * Variable length decode of variable length symbol (max size 32 bits) read from
+ * the bit stream pointed by *ppBitStream at *pOffset by using the table
+ * pointed by pCodeBook
+ * 
+ * Parameters:
+ * [in]     *pBitStream
+ * [in]     *pOffset
+ * [in]     pCodeBook
+ * 
+ * [out]    *pBitStream
+ * [out]    *pOffset
+ *
+ * Returns : Code Book Index if successfull. 
+ *         : ARM_NO_CODEBOOK_INDEX = -1 if search fails.
+ **/
+#ifndef C_OPTIMIZED_IMPLEMENTATION 
+
+OMX_U16 armUnPackVLC32(
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pOffset,
+    const ARM_VLC32 *pCodeBook
+)
+{    
+    const OMX_U8 *pBitStream = *ppBitStream;
+    OMX_INT Offset = *pOffset;
+    OMX_U32 Value;
+    OMX_INT Index;
+        
+    armAssert(Offset>=0 && Offset<=7);
+
+    /* Read next 32 bits from stream */
+    Value = (pBitStream[0] << 24 ) | ( pBitStream[1] << 16)  | (pBitStream[2] << 8 ) | (pBitStream[3]) ;
+    Value = (Value << Offset ) | (pBitStream[4] >> (8-Offset));
+
+    /* Search through the codebook */    
+    for (Index=0; pCodeBook->codeLen != 0; Index++)
+    {
+        if (pCodeBook->codeWord == (Value >> (32 - pCodeBook->codeLen)))
+        {
+            Offset       = Offset + pCodeBook->codeLen;
+            *ppBitStream = pBitStream + (Offset >> 3) ;
+            *pOffset     = Offset & 7;
+            
+            return Index;
+        }        
+        pCodeBook++;
+    }
+
+    /* No code match found */
+    return ARM_NO_CODEBOOK_INDEX;
+}
+
+#endif
+
+/***************************************
+ * Fixed bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackBits
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream     pointer to the pointer to the current byte 
+ *                      in the bit stream.
+ * [in] pOffset         pointer to the bit position in the byte 
+ *                      pointed by *ppBitStream. Valid within 0
+ *                      to 7.
+ * [in] codeWord        Code word that need to be inserted in to the
+ *                          bitstream
+ * [in] codeLength      Length of the code word valid range 1...32
+ *
+ * [out] ppBitStream    *ppBitStream is updated after the block is encoded,
+ *                          so that it points to the current byte in the bit
+ *                          stream buffer.
+ * [out] pBitOffset     *pBitOffset is updated so that it points to the
+ *                          current bit position in the byte pointed by
+ *                          *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+ 
+OMXResult armPackBits (
+    OMX_U8  **ppBitStream, 
+    OMX_INT *pOffset,
+    OMX_U32 codeWord, 
+    OMX_INT codeLength 
+)
+{
+    OMX_U8  *pBitStream = *ppBitStream;
+    OMX_INT Offset = *pOffset;
+    OMX_U32 Value;
+        
+    /* checking argument validity */
+    armRetArgErrIf(Offset < 0, OMX_Sts_BadArgErr);
+    armRetArgErrIf(Offset > 7, OMX_Sts_BadArgErr);
+    armRetArgErrIf(codeLength < 1, OMX_Sts_BadArgErr);
+    armRetArgErrIf(codeLength > 32, OMX_Sts_BadArgErr);
+
+    /* Prepare the first byte */
+    codeWord = codeWord << (32-codeLength);
+    Value = (pBitStream[0] >> (8-Offset)) << (8-Offset);
+    Value = Value | (codeWord >> (24+Offset));
+
+    /* Write out whole bytes */
+    while (8-Offset <= codeLength)
+    {
+        *pBitStream++ = (OMX_U8)Value;
+        codeWord   = codeWord  << (8-Offset);
+        codeLength = codeLength - (8-Offset);
+        Offset = 0;
+        Value = codeWord >> 24;
+    }
+
+    /* Write out final partial byte */
+    *pBitStream  = (OMX_U8)Value;
+    *ppBitStream = pBitStream;
+    *pOffset = Offset + codeLength;
+    
+    return  OMX_Sts_NoErr;
+}
+ 
+/***************************************
+ * Variable bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackVLC32
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte 
+ *                      in the bit stream.
+ * [in]	pBitOffset	    pointer to the bit position in the byte 
+ *                      pointed by *ppBitStream. Valid within 0
+ *                      to 7.
+ * [in]	 code     		VLC code word that need to be inserted in to the
+ *                      bitstream
+ *
+ * [out] ppBitStream	*ppBitStream is updated after the block is encoded,
+ *	                    so that it points to the current byte in the bit
+ *						stream buffer.
+ * [out] pBitOffset		*pBitOffset is updated so that it points to the
+ *						current bit position in the byte pointed by
+ *						*ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+ 
+OMXResult armPackVLC32 (
+    OMX_U8 **ppBitStream, 
+    OMX_INT *pBitOffset,
+    ARM_VLC32 code 
+)
+{
+    return (armPackBits(ppBitStream, pBitOffset, code.codeWord, code.codeLen));
+}
+
+/*End of File*/
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_IDCTTable.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_IDCTTable.c
new file mode 100755
index 0000000..3f5e279
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_IDCTTable.c
@@ -0,0 +1,60 @@
+/**
+ * 
+ * File Name:  armCOMM_IDCTTable.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *   
+ * File: armCOMM_IDCTTable.c
+ * Brief: Defines Tables used in IDCT computation
+ *
+ */
+
+#include "armCOMM_IDCTTable.h"
+
+     /*  Table of s(u)*A(u)*A(v)/16 at Q15
+      *  s(u)=1.0 0 <= u <= 5
+      *  s(6)=2.0
+      *  s(7)=4.0
+      *  A(0) = 2*sqrt(2)
+      *  A(u) = 4*cos(u*pi/16)  for (u!=0)
+	  */
+	  
+__align(4) const OMX_U16 armCOMM_IDCTPreScale [64] =
+{
+    0x4000, 0x58c5, 0x539f, 0x4b42, 0x4000, 0x3249, 0x4546, 0x46a1,
+    0x58c5, 0x7b21, 0x73fc, 0x6862, 0x58c5, 0x45bf, 0x6016, 0x61f8,
+    0x539f, 0x73fc, 0x6d41, 0x6254, 0x539f, 0x41b3, 0x5a82, 0x5c48,
+    0x4b42, 0x6862, 0x6254, 0x587e, 0x4b42, 0x3b21, 0x5175, 0x530d,
+    0x4000, 0x58c5, 0x539f, 0x4b42, 0x4000, 0x3249, 0x4546, 0x46a1,
+    0x3249, 0x45bf, 0x41b3, 0x3b21, 0x3249, 0x2782, 0x366d, 0x377e,
+    0x22a3, 0x300b, 0x2d41, 0x28ba, 0x22a3, 0x1b37, 0x257e, 0x263a,
+    0x11a8, 0x187e, 0x1712, 0x14c3, 0x11a8, 0x0de0, 0x131d, 0x137d    
+};
+    /* Above array armCOMM_IDCTPreScale,  in Q23 format */
+const OMX_U32 armCOMM_IDCTPreScaleU32 [64] =
+{
+    0x400000, 0x58c543, 0x539eba, 0x4b418c, 0x400000, 0x3248d4, 0x4545ea, 0x46a157,
+    0x58c543, 0x7b20d8, 0x73fbfc, 0x686214, 0x58c543, 0x45bf1f, 0x6015a5, 0x61f78b,
+    0x539eba, 0x73fbfc, 0x6d413d, 0x6253a6, 0x539eba, 0x41b328, 0x5a827a, 0x5c4869,
+    0x4b418c, 0x686214, 0x6253a6, 0x587de3, 0x4b418c, 0x3b20d8, 0x5174e0, 0x530d69,
+    0x400000, 0x58c543, 0x539eba, 0x4b418c, 0x400000, 0x3248d4, 0x4545ea, 0x46a157,
+    0x3248d4, 0x45bf1f, 0x41b328, 0x3b20d8, 0x3248d4, 0x27821d, 0x366d72, 0x377e6b,
+    0x22a2f5, 0x300ad3, 0x2d413d, 0x28ba70, 0x22a2f5, 0x1b36b9, 0x257d86, 0x26398d,
+    0x11a856, 0x187de3, 0x17121a, 0x14c35a, 0x11a856, 0x0ddf9b, 0x131cc7, 0x137ca2
+};
+   
+const OMX_U16 armCOMM_IDCTCoef [4] =
+{
+    0x5a82, /* InvSqrt2 */
+    0x30fc, /* SinPIBy8 */
+    0x7642, /* CosPIBy8 */
+    0x0000    
+};
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_MaskTable.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_MaskTable.c
new file mode 100755
index 0000000..09f88c3
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_MaskTable.c
@@ -0,0 +1,45 @@
+/* ----------------------------------------------------------------
+ *
+ * 
+ * File Name:  armCOMM_MaskTable.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Mask Table to mask the end of array.
+ * 
+ */
+ 
+#include "omxtypes.h"
+
+#define MaskTableSize 72
+
+const OMX_U16 armCOMM_qMaskTable16[MaskTableSize] = 
+{
+        0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+        0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+        0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+        0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF    
+};
+
+const OMX_U8 armCOMM_qMaskTable8[MaskTableSize] = 
+{
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
+        0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
+        0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
+        0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 
+        0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,  
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,  
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 
+        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF    
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h
new file mode 100755
index 0000000..35b510b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h
@@ -0,0 +1,1153 @@
+/**
+ * 
+ * File Name:  armVC.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File: armVideo.h
+ * Brief: Declares API's/Basic Data types used across the OpenMAX Video domain
+ *
+ */
+
+
+#ifndef _armVideo_H_
+#define _armVideo_H_
+
+#include "omxVC.h"
+#include "armCOMM_Bitstream.h"
+
+/**
+ * ARM specific state structure to hold Motion Estimation information.
+ */
+ 
+struct m4p2_MESpec
+{
+    OMXVCM4P2MEParams MEParams;
+    OMXVCM4P2MEMode   MEMode;
+};
+
+struct m4p10_MESpec
+{
+    OMXVCM4P10MEParams MEParams;
+    OMXVCM4P10MEMode   MEMode;
+};
+
+typedef struct m4p2_MESpec  ARMVCM4P2_MESpec;
+typedef struct m4p10_MESpec ARMVCM4P10_MESpec;
+
+/**
+ * Function: armVCM4P2_CompareMV
+ *
+ * Description:
+ * Performs comparision of motion vectors and SAD's to decide the
+ * best MV and SAD
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]     mvX     x coordinate of the candidate motion vector
+ * [in]     mvY     y coordinate of the candidate motion vector
+ * [in]     candSAD Candidate SAD
+ * [in]     bestMVX x coordinate of the best motion vector
+ * [in]     bestMVY y coordinate of the best motion vector
+ * [in]     bestSAD best SAD
+ *
+ * Return Value:
+ * OMX_INT -- 1 to indicate that the current sad is the best
+ *            0 to indicate that it is NOT the best SAD
+ */
+
+OMX_INT armVCM4P2_CompareMV (
+    OMX_S16 mvX,
+    OMX_S16 mvY,
+    OMX_INT candSAD,
+    OMX_S16 bestMVX,
+    OMX_S16 bestMVY,
+    OMX_INT bestSAD);
+
+/**
+ * Function: armVCM4P2_ACDCPredict
+ *
+ * Description:
+ * Performs adaptive DC/AC coefficient prediction for an intra block. Prior
+ * to the function call, prediction direction (predDir) should be selected
+ * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcDst     pointer to the coefficient buffer which contains
+ *                          the quantized coefficient residuals (PQF) of the
+ *                          current block
+ * [in] pPredBufRow pointer to the coefficient row buffer
+ * [in] pPredBufCol pointer to the coefficient column buffer
+ * [in] curQP       quantization parameter of the current block. curQP
+ *                          may equal to predQP especially when the current
+ *                          block and the predictor block are in the same
+ *                          macroblock.
+ * [in] predQP      quantization parameter of the predictor block
+ * [in] predDir     indicates the prediction direction which takes one
+ *                          of the following values:
+ *                          OMX_VIDEO_HORIZONTAL    predict horizontally
+ *                          OMX_VIDEO_VERTICAL      predict vertically
+ * [in] ACPredFlag  a flag indicating if AC prediction should be
+ *                          performed. It is equal to ac_pred_flag in the bit
+ *                          stream syntax of MPEG-4
+ * [in] videoComp   video component type (luminance, chrominance or
+ *                          alpha) of the current block
+ * [in] flag        This flag defines the if one wants to use this functions to
+ *                  calculate PQF (set 1, prediction) or QF (set 0, reconstruction)
+ * [out]    pPreACPredict   pointer to the predicted coefficients buffer.
+ *                          Filled ONLY if it is not NULL
+ * [out]    pSrcDst     pointer to the coefficient buffer which contains
+ *                          the quantized coefficients (QF) of the current
+ *                          block
+ * [out]    pPredBufRow pointer to the updated coefficient row buffer
+ * [out]    pPredBufCol pointer to the updated coefficient column buffer
+ * [out]    pSumErr     pointer to the updated sum of the difference
+ *                      between predicted and unpredicted coefficients
+ *                      If this is NULL, do not update
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_ACDCPredict(
+     OMX_S16 * pSrcDst,
+     OMX_S16 * pPreACPredict,
+     OMX_S16 * pPredBufRow,
+     OMX_S16 * pPredBufCol,
+     OMX_INT curQP,
+     OMX_INT predQP,
+     OMX_INT predDir,
+     OMX_INT ACPredFlag,
+     OMXVCM4P2VideoComponent  videoComp,
+     OMX_U8 flag,
+     OMX_INT *pSumErr
+);
+
+/**
+ * Function: armVCM4P2_SetPredDir
+ *
+ * Description:
+ * Performs detecting the prediction direction
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] blockIndex  block index indicating the component type and
+ *                          position as defined in subclause 6.1.3.8, of ISO/IEC
+ *                          14496-2. Furthermore, indexes 6 to 9 indicate the
+ *                          alpha blocks spatially corresponding to luminance
+ *                          blocks 0 to 3 in the same macroblock.
+ * [in] pCoefBufRow pointer to the coefficient row buffer
+ * [in] pQpBuf      pointer to the quantization parameter buffer
+ * [out]    predQP      quantization parameter of the predictor block
+ * [out]    predDir     indicates the prediction direction which takes one
+ *                          of the following values:
+ *                          OMX_VIDEO_HORIZONTAL    predict horizontally
+ *                          OMX_VIDEO_VERTICAL      predict vertically
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_SetPredDir(
+     OMX_INT blockIndex,
+     OMX_S16 *pCoefBufRow,
+     OMX_S16 *pCoefBufCol,
+     OMX_INT *predDir,
+     OMX_INT *predQP,
+     const OMX_U8 *pQpBuf
+);
+
+/**
+ * Function: armVCM4P2_EncodeVLCZigzag_Intra
+ *
+ * Description:
+ * Performs zigzag scanning and VLC encoding for one intra block.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream     pointer to the pointer to the current byte in
+ *                              the bit stream
+ * [in] pBitOffset      pointer to the bit position in the byte pointed
+ *                              by *ppBitStream. Valid within 0 to 7.
+ * [in] pQDctBlkCoef    pointer to the quantized DCT coefficient
+ * [in] predDir         AC prediction direction, which is used to decide
+ *                              the zigzag scan pattern. This takes one of the
+ *                              following values:
+ *                              OMX_VIDEO_NONE          AC prediction not used.
+ *                                                      Performs classical zigzag
+ *                                                      scan.
+ *                              OMX_VIDEO_HORIZONTAL    Horizontal prediction.
+ *                                                      Performs alternate-vertical
+ *                                                      zigzag scan.
+ *                              OMX_VIDEO_VERTICAL      Vertical prediction.
+ *                                                      Performs alternate-horizontal
+ *                                                      zigzag scan.
+ * [in] pattern         block pattern which is used to decide whether
+ *                              this block is encoded
+ * [in] start           start indicates whether the encoding begins with 0th element
+ *                      or 1st.
+ * [out]    ppBitStream     *ppBitStream is updated after the block is encoded,
+ *                              so that it points to the current byte in the bit
+ *                              stream buffer.
+ * [out]    pBitOffset      *pBitOffset is updated so that it points to the
+ *                              current bit position in the byte pointed by
+ *                              *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_EncodeVLCZigzag_Intra(
+     OMX_U8 **ppBitStream,
+     OMX_INT *pBitOffset,
+     const OMX_S16 *pQDctBlkCoef,
+     OMX_U8 predDir,
+     OMX_U8 pattern,
+     OMX_INT shortVideoHeader,
+     OMX_U8 start
+);
+
+/**
+ * Function: armVCM4P2_DecodeVLCZigzag_Intra
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan for one intra coded block.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream     pointer to the pointer to the current byte in
+ *                              the bitstream buffer
+ * [in] pBitOffset      pointer to the bit position in the byte pointed
+ *                              to by *ppBitStream. *pBitOffset is valid within
+ *                              [0-7].
+ * [in] predDir         AC prediction direction which is used to decide
+ *                              the zigzag scan pattern. It takes one of the
+ *                              following values:
+ *                              OMX_VIDEO_NONE  AC prediction not used;
+ *                                              perform classical zigzag scan;
+ *                              OMX_VIDEO_HORIZONTAL    Horizontal prediction;
+ *                                                      perform alternate-vertical
+ *                                                      zigzag scan;
+ *                              OMX_VIDEO_VERTICAL      Vertical prediction;
+ *                                                      thus perform
+ *                                                      alternate-horizontal
+ *                                                      zigzag scan.
+ * [in] videoComp       video component type (luminance, chrominance or
+ *                              alpha) of the current block
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ *                           and escape mode 4 is used when shortVideoHeader==1.
+ * [in] start           start indicates whether the encoding begins with 0th element
+ *                      or 1st.
+ * [out]    ppBitStream     *ppBitStream is updated after the block is
+ *                              decoded, so that it points to the current byte
+ *                              in the bit stream buffer
+ * [out]    pBitOffset      *pBitOffset is updated so that it points to the
+ *                              current bit position in the byte pointed by
+ *                              *ppBitStream
+ * [out]    pDst            pointer to the coefficient buffer of current
+ *                              block. Should be 32-bit aligned
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_DecodeVLCZigzag_Intra(
+     const OMX_U8 ** ppBitStream,
+     OMX_INT * pBitOffset,
+     OMX_S16 * pDst,
+     OMX_U8 predDir,
+     OMX_INT shortVideoHeader, 
+     OMX_U8  start
+);
+
+/**
+ * Function: armVCM4P2_FillVLDBuffer
+ *
+ * Description:
+ * Performs filling of the coefficient buffer according to the run, level
+ * and sign, also updates the index
+ * 
+ * Parameters:
+ * [in]  storeRun        Stored Run value (count of zeros)   
+ * [in]  storeLevel      Stored Level value (non-zero value)
+ * [in]  sign            Flag indicating the sign of level
+ * [in]  last            status of the last flag
+ * [in]  pIndex          pointer to coefficient index in 8x8 matrix
+ * [out] pIndex          pointer to updated coefficient index in 8x8 
+ *                       matrix
+ * [in]  pZigzagTable    pointer to the zigzag tables
+ * [out] pDst            pointer to the coefficient buffer of current
+ *                       block. Should be 32-bit aligned
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_FillVLDBuffer(
+    OMX_U32 storeRun,
+    OMX_S16 * pDst,
+    OMX_S16 storeLevel,
+    OMX_U8  sign,
+    OMX_U8  last,
+    OMX_U8  * index,
+    const OMX_U8 * pZigzagTable
+);
+
+/**
+ * Function: armVCM4P2_GetVLCBits
+ *
+ * Description:
+ * Performs escape mode decision based on the run, run+, level, level+ and 
+ * last combinations.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte in
+ *								the bit stream
+ * [in]	pBitOffset		pointer to the bit position in the byte pointed
+ *								by *ppBitStream. Valid within 0 to 7
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ *                           and escape mode 4 is used when shortVideoHeader==1.
+ * [in] start           start indicates whether the encoding begins with 
+ *                      0th element or 1st.
+ * [in/out] pLast       pointer to last status flag
+ * [in] runBeginSingleLevelEntriesL0      The run value from which level 
+ *                                        will be equal to 1: last == 0
+ * [in] IndexBeginSingleLevelEntriesL0    Array index in the VLC table 
+ *                                        pointing to the  
+ *                                        runBeginSingleLevelEntriesL0 
+ * [in] runBeginSingleLevelEntriesL1      The run value from which level 
+ *                                        will be equal to 1: last == 1
+ * [in] IndexBeginSingleLevelEntriesL1    Array index in the VLC table 
+ *                                        pointing to the  
+ *                                        runBeginSingleLevelEntriesL0 
+ * [in] pRunIndexTableL0    Run Index table defined in 
+ *                          armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pVlcTableL0         VLC table for last == 0
+ * [in] pRunIndexTableL1    Run Index table defined in 
+ *                          armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in] pVlcTableL1         VLC table for last == 1
+ * [in] pLMAXTableL0        Level MAX table defined in 
+ *                          armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pLMAXTableL1        Level MAX table defined in 
+ *                          armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in] pRMAXTableL0        Run MAX table defined in 
+ *                          armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pRMAXTableL1        Run MAX table defined in 
+ *                          armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [out]pDst			    pointer to the coefficient buffer of current
+ *							block. Should be 32-bit aligned
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_GetVLCBits (
+              const OMX_U8 **ppBitStream,
+              OMX_INT * pBitOffset,
+			  OMX_S16 * pDst,
+			  OMX_INT shortVideoHeader,
+			  OMX_U8    start,			  
+			  OMX_U8  * pLast,
+			  OMX_U8    runBeginSingleLevelEntriesL0,
+			  OMX_U8    maxIndexForMultipleEntriesL0,
+			  OMX_U8    maxRunForMultipleEntriesL1,
+			  OMX_U8    maxIndexForMultipleEntriesL1,
+              const OMX_U8  * pRunIndexTableL0,
+              const ARM_VLC32 *pVlcTableL0,
+			  const OMX_U8  * pRunIndexTableL1,
+              const ARM_VLC32 *pVlcTableL1,
+              const OMX_U8  * pLMAXTableL0,
+              const OMX_U8  * pLMAXTableL1,
+              const OMX_U8  * pRMAXTableL0,
+              const OMX_U8  * pRMAXTableL1,
+              const OMX_U8  * pZigzagTable
+);
+
+/**
+ * Function: armVCM4P2_PutVLCBits
+ *
+ * Description:
+ * Checks the type of Escape Mode and put encoded bits for 
+ * quantized DCT coefficients.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	 ppBitStream      pointer to the pointer to the current byte in
+ *						  the bit stream
+ * [in]	 pBitOffset       pointer to the bit position in the byte pointed
+ *                        by *ppBitStream. Valid within 0 to 7
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ *                           and escape mode 4 is used when shortVideoHeader==1.
+ * [in]  start            start indicates whether the encoding begins with 
+ *                        0th element or 1st.
+ * [in]  maxStoreRunL0    Max store possible (considering last and inter/intra)
+ *                        for last = 0
+ * [in]  maxStoreRunL1    Max store possible (considering last and inter/intra)
+ *                        for last = 1
+ * [in]  maxRunForMultipleEntriesL0 
+ *                        The run value after which level 
+ *                        will be equal to 1: 
+ *                        (considering last and inter/intra status) for last = 0
+ * [in]  maxRunForMultipleEntriesL1 
+ *                        The run value after which level 
+ *                        will be equal to 1: 
+ *                        (considering last and inter/intra status) for last = 1
+ * [in]  pRunIndexTableL0 Run Index table defined in 
+ *                        armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in]  pVlcTableL0      VLC table for last == 0
+ * [in]  pRunIndexTableL1 Run Index table defined in 
+ *                        armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in]  pVlcTableL1      VLC table for last == 1
+ * [in]  pLMAXTableL0     Level MAX table defined in 
+ *                        armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in]  pLMAXTableL1     Level MAX table defined in 
+ *                        armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in]  pRMAXTableL0     Run MAX table defined in 
+ *                        armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in]  pRMAXTableL1     Run MAX table defined in 
+ *                        armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [out] pQDctBlkCoef     pointer to the quantized DCT coefficient
+ * [out] ppBitStream      *ppBitStream is updated after the block is encoded
+ *                        so that it points to the current byte in the bit
+ *                        stream buffer.
+ * [out] pBitOffset       *pBitOffset is updated so that it points to the
+ *                        current bit position in the byte pointed by
+ *                        *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+
+OMXResult armVCM4P2_PutVLCBits (
+              OMX_U8 **ppBitStream,
+              OMX_INT * pBitOffset,
+              const OMX_S16 *pQDctBlkCoef,
+              OMX_INT shortVideoHeader,
+              OMX_U8 start,
+              OMX_U8 maxStoreRunL0,
+              OMX_U8 maxStoreRunL1,
+              OMX_U8  maxRunForMultipleEntriesL0,
+              OMX_U8  maxRunForMultipleEntriesL1,
+              const OMX_U8  * pRunIndexTableL0,
+              const ARM_VLC32 *pVlcTableL0,
+			  const OMX_U8  * pRunIndexTableL1,
+              const ARM_VLC32 *pVlcTableL1,
+              const OMX_U8  * pLMAXTableL0,
+              const OMX_U8  * pLMAXTableL1,
+              const OMX_U8  * pRMAXTableL0,
+              const OMX_U8  * pRMAXTableL1,
+              const OMX_U8  * pZigzagTable
+);
+/**
+ * Function: armVCM4P2_FillVLCBuffer
+ *
+ * Description:
+ * Performs calculating the VLC bits depending on the escape type and insert 
+ * the same in the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	 ppBitStream		pointer to the pointer to the current byte in
+ *	                        the bit stream
+ * [in]	 pBitOffset         pointer to the bit position in the byte pointed
+ *                          by *ppBitStream. Valid within 0 to 7
+ * [in]  run                Run value (count of zeros) to be encoded  
+ * [in]  level              Level value (non-zero value) to be encoded
+ * [in]  runPlus            Calculated as runPlus = run - (RMAX + 1)  
+ * [in]  levelPlus          Calculated as 
+ *                          levelPlus = sign(level)*[abs(level) - LMAX]
+ * [in]  fMode              Flag indicating the escape modes
+ * [in]  last               status of the last flag
+ * [in]  maxRunForMultipleEntries 
+ *                          The run value after which level will be equal to 1: 
+ *                          (considering last and inter/intra status)
+ * [in]  pRunIndexTable     Run Index table defined in
+ *                          armVCM4P2_Huff_tables_VLC.h
+ * [in]  pVlcTable          VLC table defined in armVCM4P2_Huff_tables_VLC.h
+ * [out] ppBitStream		*ppBitStream is updated after the block is encoded
+ *                          so that it points to the current byte in the bit
+ *                          stream buffer.
+ * [out] pBitOffset         *pBitOffset is updated so that it points to the
+ *                          current bit position in the byte pointed by
+ *                          *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_FillVLCBuffer (
+              OMX_U8 **ppBitStream,
+              OMX_INT * pBitOffset,
+              OMX_U32 run,
+              OMX_S16 level, 
+			  OMX_U32 runPlus,
+              OMX_S16 levelPlus, 
+              OMX_U8  fMode,
+			  OMX_U8  last,
+              OMX_U8  maxRunForMultipleEntries, 
+              const OMX_U8  *pRunIndexTable,
+              const ARM_VLC32 *pVlcTable
+);
+
+/**
+ * Function: armVCM4P2_CheckVLCEscapeMode
+ *
+ * Description:
+ * Performs escape mode decision based on the run, run+, level, level+ and 
+ * last combinations.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] run             Run value (count of zeros) to be encoded  
+ * [in] level           Level value (non-zero value) to be encoded
+ * [in] runPlus         Calculated as runPlus = run - (RMAX + 1)  
+ * [in] levelPlus       Calculated as 
+ *                      levelPlus = sign(level)*[abs(level) - LMAX]
+ * [in] maxStoreRun     Max store possible (considering last and inter/intra)
+ * [in] maxRunForMultipleEntries 
+ *                      The run value after which level 
+ *                      will be equal to 1: 
+ *                      (considering last and inter/intra status)
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ *                           and escape mode 4 is used when shortVideoHeader==1.
+ * [in] pRunIndexTable  Run Index table defined in 
+ *                      armVCM4P2_Huff_Tables_VLC.c
+ *                      (considering last and inter/intra status)
+ *
+ *                      
+ * Return Value:
+ * Returns an Escape mode which can take values from 0 to 3
+ * 0 --> no escape mode, 1 --> escape type 1,
+ * 1 --> escape type 2, 3 --> escape type 3, check section 7.4.1.3
+ * in the MPEG ISO standard.
+ *
+ */
+
+OMX_U8 armVCM4P2_CheckVLCEscapeMode(
+     OMX_U32 run,
+     OMX_U32 runPlus,
+     OMX_S16 level,
+     OMX_S16 levelPlus,
+     OMX_U8  maxStoreRun,
+     OMX_U8  maxRunForMultipleEntries,
+     OMX_INT shortVideoHeader,
+     const OMX_U8  *pRunIndexTable
+);
+
+
+/**
+ * Function: armVCM4P2_BlockMatch_Integer
+ *
+ * Description:
+ * Performs a 16x16 block search; estimates motion vector and associated minimum SAD.  
+ * Both the input and output motion vectors are represented using half-pixel units, and 
+ * therefore a shift left or right by 1 bit may be required, respectively, to match the 
+ * input or output MVs with other functions that either generate output MVs or expect 
+ * input MVs represented using integer pixel units. 
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	pSrcRefBuf		pointer to the reference Y plane; points to the reference MB that 
+ *                    corresponds to the location of the current macroblock in the current 
+ *                    plane.
+ * [in]	refWidth		  width of the reference plane
+ * [in]	pRefRect		  pointer to the valid rectangular in reference plane. Relative to image origin. 
+ *                    It's not limited to the image boundary, but depended on the padding. For example, 
+ *                    if you pad 4 pixels outside the image border, then the value for left border 
+ *                    can be -4
+ * [in]	pSrcCurrBuf		pointer to the current macroblock extracted from original plane (linear array, 
+ *                    256 entries); must be aligned on an 8-byte boundary.
+ * [in] pCurrPointPos	position of the current macroblock in the current plane
+ * [in] pSrcPreMV		  pointer to predicted motion vector; NULL indicates no predicted MV
+ * [in] pSrcPreSAD		pointer to SAD associated with the predicted MV (referenced by pSrcPreMV)
+ * [in] searchRange		search range for 16X16 integer block,the units of it is full pixel,the search range 
+ *                    is the same in all directions.It is in inclusive of the boundary and specified in 
+ *                    terms of integer pixel units.
+ * [in] pMESpec			  vendor-specific motion estimation specification structure; must have been allocated 
+ *                    and then initialized using omxVCM4P2_MEInit prior to calling the block matching 
+ *                    function.
+ * [in] BlockSize     MacroBlock Size i.e either 16x16 or 8x8.
+ * [out]	pDstMV			pointer to estimated MV
+ * [out]	pDstSAD			pointer to minimum SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr �C no error.
+ * OMX_Sts_BadArgErr �C bad arguments
+ *
+ */
+
+OMXResult armVCM4P2_BlockMatch_Integer(
+     const OMX_U8 *pSrcRefBuf,
+     OMX_INT refWidth,
+     const OMXRect *pRefRect,
+     const OMX_U8 *pSrcCurrBuf,
+     const OMXVCM4P2Coordinate *pCurrPointPos,
+     const OMXVCMotionVector *pSrcPreMV,
+     const OMX_INT *pSrcPreSAD,
+     void *pMESpec,
+     OMXVCMotionVector *pDstMV,
+     OMX_INT *pDstSAD,
+     OMX_U8 BlockSize
+);
+
+/**
+ * Function: armVCM4P2_BlockMatch_Half
+ *
+ * Description:
+ * Performs a 16x16 block match with half-pixel resolution.  Returns the estimated 
+ * motion vector and associated minimum SAD.  This function estimates the half-pixel 
+ * motion vector by interpolating the integer resolution motion vector referenced 
+ * by the input parameter pSrcDstMV, i.e., the initial integer MV is generated 
+ * externally.  The input parameters pSrcRefBuf and pSearchPointRefPos should be 
+ * shifted by the winning MV of 16x16 integer search prior to calling BlockMatch_Half_16x16.  
+ * The function BlockMatch_Integer_16x16 may be used for integer motion estimation.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	pSrcRefBuf		pointer to the reference Y plane; points to the reference MB 
+ *                    that corresponds to the location of the current macroblock in 
+ *                    the	current plane.
+ * [in]	refWidth		  width of the reference plane
+ * [in]	pRefRect		  reference plane valid region rectangle
+ * [in]	pSrcCurrBuf		pointer to the current macroblock extracted from original plane 
+ *                    (linear array, 256 entries); must be aligned on an 8-byte boundary. 
+ * [in]	pSearchPointRefPos	position of the starting point for half pixel search (specified 
+ *                          in terms of integer pixel units) in the reference plane.
+ * [in]	rndVal			  rounding control bit for half pixel motion estimation; 
+ *                    0=rounding control disabled; 1=rounding control enabled
+ * [in]	pSrcDstMV		pointer to the initial MV estimate; typically generated during a prior 
+ *                  16X16 integer search and its unit is half pixel.
+ * [in] BlockSize     MacroBlock Size i.e either 16x16 or 8x8.
+ * [out]pSrcDstMV		pointer to estimated MV
+ * [out]pDstSAD			pointer to minimum SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr �C no error
+ * OMX_Sts_BadArgErr �C bad arguments
+ *
+ */
+
+OMXResult armVCM4P2_BlockMatch_Half(
+     const OMX_U8 *pSrcRefBuf,
+     OMX_INT refWidth,
+     const OMXRect *pRefRect,
+     const OMX_U8 *pSrcCurrBuf,
+     const OMXVCM4P2Coordinate *pSearchPointRefPos,
+     OMX_INT rndVal,
+     OMXVCMotionVector *pSrcDstMV,
+     OMX_INT *pDstSAD,
+     OMX_U8 BlockSize
+);
+/**
+ * Function: armVCM4P2_PadMV
+ *
+ * Description:
+ * Performs motion vector padding for a macroblock.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcDstMV       pointer to motion vector buffer of the current
+ *                              macroblock
+ * [in] pTransp         pointer to transparent status buffer of the
+ *                              current macroblock
+ * [out]    pSrcDstMV       pointer to motion vector buffer in which the
+ *                              motion vectors have been padded
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_PadMV(
+     OMXVCMotionVector * pSrcDstMV,
+     OMX_U8 * pTransp
+);
+
+/* 
+ * H.264 Specific Declarations 
+ */
+/* Defines */
+#define ARM_M4P10_Q_OFFSET        (15)
+
+
+/* Dequant tables */
+
+extern const OMX_U8 armVCM4P10_PosToVCol4x4[16];
+extern const OMX_U8 armVCM4P10_PosToVCol2x2[4];
+extern const OMX_U8 armVCM4P10_VMatrix[6][3];
+extern const OMX_U32 armVCM4P10_MFMatrix[6][3];
+
+
+/*
+ * Description:
+ * This function perform the work required by the OpenMAX
+ * DecodeCoeffsToPair function and DecodeChromaDCCoeffsToPair.
+ * Since most of the code is common we share it here.
+ *
+ * Parameters:
+ * [in]	ppBitStream		Double pointer to current byte in bit stream buffer
+ * [in]	pOffset			Pointer to current bit position in the byte pointed
+ *								to by *ppBitStream
+ * [in]	sMaxNumCoeff	Maximum number of non-zero coefficients in current
+ *								block (4,15 or 16)
+ * [in]	nTable          Table number (0 to 4) according to the five columns
+ *                      of Table 9-5 in the H.264 spec
+ * [out]	ppBitStream		*ppBitStream is updated after each block is decoded
+ * [out]	pOffset			*pOffset is updated after each block is decoded
+ * [out]	pNumCoeff		Pointer to the number of nonzero coefficients in
+ *								this block
+ * [out]	ppPosCoefbuf	Double pointer to destination residual
+ *								coefficient-position pair buffer
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+
+ */
+
+OMXResult armVCM4P10_DecodeCoeffsToPair(
+     const OMX_U8** ppBitStream,
+     OMX_S32* pOffset,
+     OMX_U8* pNumCoeff,
+     OMX_U8**ppPosCoefbuf,
+     OMX_INT nTable,
+     OMX_INT sMaxNumCoeff        
+ );
+
+/*
+ * Description:
+ * Perform DC style intra prediction, averaging upper and left block
+ *
+ * Parameters:
+ * [in]	pSrcLeft		Pointer to the buffer of 16 left coefficients:
+ *								p[x, y] (x = -1, y = 0..3)
+ * [in]	pSrcAbove		Pointer to the buffer of 16 above coefficients:
+ *								p[x,y] (x = 0..3, y = -1)
+ * [in]	leftStep		Step of left coefficient buffer
+ * [in]	dstStep			Step of the destination buffer
+ * [in]	availability	Neighboring 16x16 MB availability flag
+ * [out]	pDst			Pointer to the destination buffer
+ *
+ * Return Value:
+ * None
+ */
+
+void armVCM4P10_PredictIntraDC4x4(
+     const OMX_U8* pSrcLeft,
+     const OMX_U8 *pSrcAbove,
+     OMX_U8* pDst,
+     OMX_INT leftStep,
+     OMX_INT dstStep,
+     OMX_S32 availability        
+);
+
+/*
+ * Description
+ * Unpack a 4x4 block of coefficient-residual pair values
+ *
+ * Parameters:
+ * [in]	ppSrc	Double pointer to residual coefficient-position pair
+ *						buffer output by CALVC decoding
+ * [out]	ppSrc	*ppSrc is updated to the start of next non empty block
+ * [out]	pDst	Pointer to unpacked 4x4 block
+ */
+
+void armVCM4P10_UnpackBlock4x4(
+     const OMX_U8 **ppSrc,
+     OMX_S16* pDst
+);
+
+/*
+ * Description
+ * Unpack a 2x2 block of coefficient-residual pair values
+ *
+ * Parameters:
+ * [in]	ppSrc	Double pointer to residual coefficient-position pair
+ *						buffer output by CALVC decoding
+ * [out]	ppSrc	*ppSrc is updated to the start of next non empty block
+ * [out]	pDst	Pointer to unpacked 4x4 block
+ */
+
+void armVCM4P10_UnpackBlock2x2(
+     const OMX_U8 **ppSrc,
+     OMX_S16* pDst
+);
+
+/*
+ * Description
+ * Deblock one boundary pixel
+ *
+ * Parameters:
+ * [in]	pQ0         Pointer to pixel q0
+ * [in] Step        Step between pixels q0 and q1
+ * [in] tC0         Edge threshold value
+ * [in] alpha       alpha threshold value
+ * [in] beta        beta threshold value
+ * [in] bS          deblocking strength
+ * [in] ChromaFlag  True for chroma blocks
+ * [out] pQ0        Deblocked pixels
+ * 
+ */
+
+void armVCM4P10_DeBlockPixel(
+    OMX_U8 *pQ0,    /* pointer to the pixel q0 */
+    int Step,       /* step between pixels q0 and q1 */
+    int tC0,        /* edge threshold value */
+    int alpha,      /* alpha */
+    int beta,       /* beta */
+    int bS,         /* deblocking strength */
+    int ChromaFlag
+);
+
+/**
+ * Function: armVCM4P10_InterpolateHalfHor_Luma
+ *
+ * Description:
+ * This function performs interpolation for horizontal 1/2-pel positions
+ *
+ * Remarks:
+ *
+ *	[in]	pSrc			Pointer to top-left corner of block used to interpolate 
+ 													in the reconstructed frame plane
+ *	[in]	iSrcStep	Step of the source buffer.
+ *	[in]	iDstStep	Step of the destination(interpolation) buffer.
+ *	[in]	iWidth		Width of the current block
+ *	[in]	iHeight		Height of the current block
+ *	[out]	pDst	    Pointer to the interpolation buffer of the 1/2-pel 
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+OMXResult armVCM4P10_InterpolateHalfHor_Luma(
+        const OMX_U8*		pSrc, 
+		OMX_U32 	iSrcStep, 
+		OMX_U8* 	pDst, 
+		OMX_U32 	iDstStep, 
+		OMX_U32 	iWidth, 
+		OMX_U32 	iHeight
+);
+
+/**
+ * Function: armVCM4P10_InterpolateHalfVer_Luma
+ * 
+ * Description:
+ * This function performs interpolation for vertical 1/2-pel positions 
+ * around a full-pel position.
+ *
+ * Remarks:
+ *
+ *	[in]	pSrc			Pointer to top-left corner of block used to interpolate 
+ *												in the reconstructed frame plane
+ *	[in]	iSrcStep	Step of the source buffer.
+ *	[in]	iDstStep	Step of the destination(interpolation) buffer.
+ *	[in]	iWidth		Width of the current block
+ *	[in]	iHeight		Height of the current block
+ *	[out]	pDst    	Pointer to the interpolation buffer of the 1/2-pel
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+OMXResult armVCM4P10_InterpolateHalfVer_Luma(	
+	 const OMX_U8* 	pSrc, 
+	 OMX_U32 	iSrcStep, 
+ 	 OMX_U8* 	pDst,
+ 	 OMX_U32 	iDstStep, 
+ 	 OMX_U32 	iWidth, 
+ 	 OMX_U32 	iHeight
+);
+
+/**
+ * Function: armVCM4P10_InterpolateHalfDiag_Luma
+ * 
+ * Description:
+ * This function performs interpolation for (1/2, 1/2)  positions 
+ * around a full-pel position.
+ *
+ * Remarks:
+ *
+ *  [in]    pSrc        Pointer to top-left corner of block used to interpolate 
+ *                      in the reconstructed frame plane
+ *  [in]    iSrcStep    Step of the source buffer.
+ *  [in]    iDstStep    Step of the destination(interpolation) buffer.
+ *  [in]    iWidth      Width of the current block
+ *  [in]    iHeight     Height of the current block
+ *  [out]   pDst        Pointer to the interpolation buffer of the (1/2,1/2)-pel
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+OMXResult armVCM4P10_InterpolateHalfDiag_Luma(  
+        const OMX_U8*     pSrc, 
+        OMX_U32     iSrcStep, 
+        OMX_U8*     pDst, 
+        OMX_U32     iDstStep,
+        OMX_U32     iWidth, 
+        OMX_U32     iHeight
+);
+
+/*
+ * Description:
+ * Transform Residual 4x4 Coefficients
+ *
+ * Parameters:
+ * [in]  pSrc		Source 4x4 block
+ * [out] pDst		Destination 4x4 block
+ *
+ */
+
+void armVCM4P10_TransformResidual4x4(OMX_S16* pDst, OMX_S16 *pSrc);
+
+/*
+ * Description:
+ * Forward Transform Residual 4x4 Coefficients
+ *
+ * Parameters:
+ * [in]  pSrc		Source 4x4 block
+ * [out] pDst		Destination 4x4 block
+ *
+ */
+
+void armVCM4P10_FwdTransformResidual4x4(OMX_S16* pDst, OMX_S16 *pSrc);
+
+OMX_INT armVCM4P10_CompareMotionCostToMV (
+    OMX_S16  mvX,
+    OMX_S16  mvY,
+    OMXVCMotionVector diffMV, 
+    OMX_INT candSAD, 
+    OMXVCMotionVector *bestMV, 
+    OMX_U32 nLamda,
+    OMX_S32 *pBestCost);
+
+/**
+ * Function: armVCCOMM_SAD
+ *
+ * Description:
+ * This function calculate the SAD for NxM blocks.
+ *
+ * Remarks:
+ *
+ * [in]		pSrcOrg		Pointer to the original block
+ * [in]		iStepOrg	Step of the original block buffer
+ * [in]		pSrcRef		Pointer to the reference block
+ * [in]		iStepRef	Step of the reference block buffer
+ * [in]		iHeight		Height of the block
+ * [in]		iWidth		Width of the block
+ * [out]	pDstSAD		Pointer of result SAD
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+OMXResult armVCCOMM_SAD(	
+	const OMX_U8* 	pSrcOrg,
+	OMX_U32 	iStepOrg,
+	const OMX_U8* 	pSrcRef,
+	OMX_U32 	iStepRef,
+	OMX_S32*	pDstSAD,
+	OMX_U32		iHeight,
+	OMX_U32		iWidth);
+
+/**
+ * Function: armVCCOMM_Average
+ *
+ * Description:
+ * This function calculates the average of two blocks and stores the result.
+ *
+ * Remarks:
+ *
+ *	[in]	pPred0			Pointer to the top-left corner of reference block 0
+ *	[in]	pPred1			Pointer to the top-left corner of reference block 1
+ *	[in]	iPredStep0	    Step of reference block 0
+ *	[in]	iPredStep1	    Step of reference block 1
+ *	[in]	iDstStep 		Step of the destination buffer
+ *	[in]	iWidth			Width of the blocks
+ *	[in]	iHeight			Height of the blocks
+ *	[out]	pDstPred		Pointer to the destination buffer
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+ OMXResult armVCCOMM_Average (
+	 const OMX_U8* 	    pPred0,
+	 const OMX_U8* 	    pPred1,	
+	 OMX_U32		iPredStep0,
+	 OMX_U32		iPredStep1,
+	 OMX_U8*		pDstPred,
+	 OMX_U32		iDstStep, 
+	 OMX_U32		iWidth,
+	 OMX_U32		iHeight
+);
+
+/**
+ * Function: armVCM4P10_SADQuar
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the 
+ * average of the other two (pSrcRef0 and pSrcRef1)
+ *
+ * Remarks:
+ *
+ * [in]		pSrc				Pointer to the original block
+ * [in]		pSrcRef0		Pointer to reference block 0
+ * [in]		pSrcRef1		Pointer to reference block 1
+ * [in]		iSrcStep 		Step of the original block buffer
+ * [in]		iRefStep0		Step of reference block 0 
+ * [in]		iRefStep1 	Step of reference block 1 
+ * [in]		iHeight			Height of the block
+ * [in]		iWidth			Width of the block
+ * [out]	pDstSAD			Pointer of result SAD
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+OMXResult armVCM4P10_SADQuar(
+	const OMX_U8* 	pSrc,
+    const OMX_U8* 	pSrcRef0,
+	const OMX_U8* 	pSrcRef1,	
+    OMX_U32 	iSrcStep,
+    OMX_U32		iRefStep0,
+    OMX_U32		iRefStep1,
+    OMX_U32*	pDstSAD,
+    OMX_U32     iHeight,
+    OMX_U32     iWidth
+);
+
+/**
+ * Function: armVCM4P10_Interpolate_Chroma
+ *
+ * Description:
+ * This function performs interpolation for chroma components.
+ *
+ * Remarks:
+ *
+ *  [in]    pSrc            Pointer to top-left corner of block used to 
+ *                                              interpolate in the reconstructed frame plane
+ *  [in]    iSrcStep    Step of the source buffer.
+ *  [in]    iDstStep    Step of the destination(interpolation) buffer.
+ *  [in]    iWidth      Width of the current block
+ *  [in]    iHeight     Height of the current block
+ *  [in]    dx              Fractional part of horizontal motion vector 
+ *                                              component in 1/8 pixel unit (0~7) 
+ *  [in]    dy              Fractional part of vertical motion vector 
+ *                                              component in 1/8 pixel unit (0~7)
+ *  [out]   pDst            Pointer to the interpolation buffer
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+ OMXResult armVCM4P10_Interpolate_Chroma(
+        OMX_U8      *pSrc,
+        OMX_U32     iSrcStep,
+        OMX_U8      *pDst,
+        OMX_U32     iDstStep,
+        OMX_U32     iWidth,
+        OMX_U32     iHeight,
+        OMX_U32     dx,
+        OMX_U32     dy
+);
+
+/**
+ * Function: armVCM4P10_Interpolate_Luma
+ *
+ * Description:
+ * This function performs interpolation for luma components.
+ *
+ * Remarks:
+ *
+ *  [in]    pSrc            Pointer to top-left corner of block used to 
+ *                                              interpolate in the reconstructed frame plane
+ *  [in]    iSrcStep    Step of the source buffer.
+ *  [in]    iDstStep    Step of the destination(interpolation) buffer.
+ *  [in]    iWidth      Width of the current block
+ *  [in]    iHeight     Height of the current block
+ *  [in]    dx              Fractional part of horizontal motion vector 
+ *                                              component in 1/4 pixel unit (0~3) 
+ *  [in]    dy              Fractional part of vertical motion vector 
+ *                                              component in 1/4 pixel unit (0~3) 
+ *  [out]   pDst            Pointer to the interpolation buffer
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+ OMXResult armVCM4P10_Interpolate_Luma(
+     const OMX_U8     *pSrc,
+     OMX_U32    iSrcStep,
+     OMX_U8     *pDst,
+     OMX_U32    iDstStep,
+     OMX_U32    iWidth,
+     OMX_U32    iHeight,
+     OMX_U32    dx,
+     OMX_U32    dy
+);
+
+/**
+ * Function: omxVCH264_DequantTransformACFromPair_U8_S16_C1_DLx
+ *
+ * Description:
+ * Reconstruct the 4x4 residual block from coefficient-position pair buffer,
+ * perform dequantisation and integer inverse transformation for 4x4 block of
+ * residuals and update the pair buffer pointer to next non-empty block.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppSrc		Double pointer to residual coefficient-position
+ *							pair buffer output by CALVC decoding
+ * [in]	pDC			Pointer to the DC coefficient of this block, NULL
+ *							if it doesn't exist
+ * [in]	QP			Quantization parameter
+ * [in] AC          Flag indicating if at least one non-zero coefficient exists
+ * [out]	pDst		pointer to the reconstructed 4x4 block data
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P10_DequantTransformACFromPair_U8_S16_C1_DLx(
+     OMX_U8 **ppSrc,
+     OMX_S16 *pDst,
+     OMX_INT QP,
+     OMX_S16* pDC,
+     int AC
+);
+
+#endif  /*_armVideo_H_*/
+
+/*End of File*/
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h
new file mode 100755
index 0000000..32a0166
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h
@@ -0,0 +1,72 @@
+;//
+;// 
+;// File Name:  armVCCOMM_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// ARM optimized OpenMAX AC header file
+;// 
+;// Formula used:
+;// MACRO for calculating median for three values.
+
+
+
+    IF :LNOT::DEF:ARMVCCOMM_S_H
+        INCLUDE armCOMM_s.h
+    M_VARIANTS      CortexA8, ARM1136JS
+    
+    IF ARM1136JS :LOR: CortexA8 
+     
+     ;///*
+     ;// * Macro: M_MEDIAN3
+     ;// *
+     ;// * Description: Finds the median of three numbers
+     ;// * 
+     ;// * Remarks:
+     ;// *
+     ;// * Parameters:
+     ;// * [in] x     First entry for the list of three numbers.
+     ;// * [in] y     Second entry for the list of three numbers.
+     ;// *            Input value may be corrupted at the end of
+     ;// *            the execution of this macro.
+     ;// * [in] z     Third entry of the list of three numbers.
+     ;// *            Input value corrupted at the end of the 
+     ;// *            execution of this macro.
+     ;// * [in] t     Temporary scratch  register.
+     ;// * [out]z     Median of the three numbers.       
+     ;// */
+     
+     MACRO
+
+     M_MEDIAN3 $x, $y, $z, $t 
+     
+     SUBS  $t, $y, $z; // if (y < z)
+     ADDLT $z, $z, $t; //  swap y and z
+     SUBLT $y, $y, $t;
+
+     ;// Now z' <= y', so there are three cases for the
+     ;// median value, depending on x.
+
+     ;// 1) x <= z'      <= y'      : median value is z'
+     ;// 2)      z' <= x <= y'      : median value is x
+     ;// 3)      z'      <= y' <= x : median value is y'
+
+     CMP   $z, $x;     // if ( x > min(y,z) )
+     MOVLT $z, $x;     // ans = x 
+
+     CMP   $x, $y;     // if ( x > max(y,z) )
+     MOVGT $z, $y;     // ans = max(y,z)
+     
+     MEND
+    ENDIF      
+    
+    
+        
+    ENDIF ;// ARMACCOMM_S_H
+
+ END
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h
new file mode 100755
index 0000000..7b3cc72
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h
@@ -0,0 +1,4381 @@
+/**
+ * File: omxVC.h
+ * Brief: OpenMAX DL v1.0.2 - Video Coding library
+ *
+ * Copyright � 2005-2008 The Khronos Group Inc. All Rights Reserved. 
+ *
+ * These materials are protected by copyright laws and contain material 
+ * proprietary to the Khronos Group, Inc.  You may use these materials 
+ * for implementing Khronos specifications, without altering or removing 
+ * any trademark, copyright or other notice from the specification.
+ * 
+ * Khronos Group makes no, and expressly disclaims any, representations 
+ * or warranties, express or implied, regarding these materials, including, 
+ * without limitation, any implied warranties of merchantability or fitness 
+ * for a particular purpose or non-infringement of any intellectual property. 
+ * Khronos Group makes no, and expressly disclaims any, warranties, express 
+ * or implied, regarding the correctness, accuracy, completeness, timeliness, 
+ * and reliability of these materials. 
+ *
+ * Under no circumstances will the Khronos Group, or any of its Promoters, 
+ * Contributors or Members or their respective partners, officers, directors, 
+ * employees, agents or representatives be liable for any damages, whether 
+ * direct, indirect, special or consequential damages for lost revenues, 
+ * lost profits, or otherwise, arising from or in connection with these 
+ * materials.
+ * 
+ * Khronos and OpenMAX are trademarks of the Khronos Group Inc. 
+ *
+ */
+
+/* *****************************************************************************************/
+
+#ifndef _OMXVC_H_
+#define _OMXVC_H_
+
+#include "omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* 6.1.1.1 Motion Vectors  */
+/* In omxVC, motion vectors are represented as follows:  */
+
+typedef struct {
+    OMX_S16 dx;
+    OMX_S16 dy;
+} OMXVCMotionVector;
+
+
+
+/**
+ * Function:  omxVCCOMM_Average_8x   (6.1.3.1.1)
+ *
+ * Description:
+ * This function calculates the average of two 8x4, 8x8, or 8x16 blocks.  The 
+ * result is rounded according to (a+b+1)/2.  The block average function can 
+ * be used in conjunction with half-pixel interpolation to obtain quarter 
+ * pixel motion estimates, as described in [ISO14496-10], subclause 8.4.2.2.1. 
+ *
+ * Input Arguments:
+ *   
+ *   pPred0     - Pointer to the top-left corner of reference block 0 
+ *   pPred1     - Pointer to the top-left corner of reference block 1 
+ *   iPredStep0 - Step of reference block 0 
+ *   iPredStep1 - Step of reference block 1 
+ *   iDstStep   - Step of the destination buffer. 
+ *   iHeight    - Height of the blocks 
+ *
+ * Output Arguments:
+ *   
+ *   pDstPred - Pointer to the destination buffer. 8-byte aligned. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned under any of the following 
+ *              conditions: 
+ *    -   one or more of the following pointers is NULL: pPred0, pPred1, or 
+ *              pDstPred. 
+ *    -   pDstPred is not aligned on an 8-byte boundary. 
+ *    -   iPredStep0 <= 0 or iPredStep0 is not a multiple of 8. 
+ *    -   iPredStep1 <= 0 or iPredStep1 is not a multiple of 8. 
+ *    -   iDstStep   <= 0 or iDstStep is not a multiple of 8. 
+ *    -   iHeight is not 4, 8, or 16. 
+ *
+ */
+OMXResult omxVCCOMM_Average_8x (
+    const OMX_U8 *pPred0,
+    const OMX_U8 *pPred1,
+    OMX_U32 iPredStep0,
+    OMX_U32 iPredStep1,
+    OMX_U8 *pDstPred,
+    OMX_U32 iDstStep,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_Average_16x   (6.1.3.1.2)
+ *
+ * Description:
+ * This function calculates the average of two 16x16 or 16x8 blocks.  The 
+ * result is rounded according to (a+b+1)/2.  The block average function can 
+ * be used in conjunction with half-pixel interpolation to obtain quarter 
+ * pixel motion estimates, as described in [ISO14496-10], subclause 8.4.2.2.1. 
+ *
+ * Input Arguments:
+ *   
+ *   pPred0 - Pointer to the top-left corner of reference block 0 
+ *   pPred1 - Pointer to the top-left corner of reference block 1 
+ *   iPredStep0 - Step of reference block 0 
+ *   iPredStep1 - Step of reference block 1 
+ *   iDstStep - Step of the destination buffer 
+ *   iHeight - Height of the blocks 
+ *
+ * Output Arguments:
+ *   
+ *   pDstPred - Pointer to the destination buffer. 16-byte aligned. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned under any of the following 
+ *              conditions: 
+ *    -   one or more of the following pointers is NULL: pPred0, pPred1, or 
+ *              pDstPred. 
+ *    -   pDstPred is not aligned on a 16-byte boundary. 
+ *    -   iPredStep0 <= 0 or iPredStep0 is not a multiple of 16. 
+ *    -   iPredStep1 <= 0 or iPredStep1 is not a multiple of 16. 
+ *    -   iDstStep <= 0 or iDstStep is not a multiple of 16. 
+ *    -   iHeight is not 8 or 16. 
+ *
+ */
+OMXResult omxVCCOMM_Average_16x (
+    const OMX_U8 *pPred0,
+    const OMX_U8 *pPred1,
+    OMX_U32 iPredStep0,
+    OMX_U32 iPredStep1,
+    OMX_U8 *pDstPred,
+    OMX_U32 iDstStep,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_ExpandFrame_I   (6.1.3.2.1)
+ *
+ * Description:
+ * This function expands a reconstructed frame in-place.  The unexpanded 
+ * source frame should be stored in a plane buffer with sufficient space 
+ * pre-allocated for edge expansion, and the input frame should be located in 
+ * the plane buffer center.  This function executes the pixel expansion by 
+ * replicating source frame edge pixel intensities in the empty pixel 
+ * locations (expansion region) between the source frame edge and the plane 
+ * buffer edge.  The width/height of the expansion regions on the 
+ * horizontal/vertical edges is controlled by the parameter iExpandPels. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDstPlane - pointer to the top-left corner of the frame to be 
+ *            expanded; must be aligned on an 8-byte boundary. 
+ *   iFrameWidth - frame width; must be a multiple of 8. 
+ *   iFrameHeight -frame height; must be a multiple of 8. 
+ *   iExpandPels - number of pixels to be expanded in the horizontal and 
+ *            vertical directions; must be a multiple of 8. 
+ *   iPlaneStep - distance, in bytes, between the start of consecutive lines 
+ *            in the plane buffer; must be larger than or equal to 
+ *            (iFrameWidth + 2 * iExpandPels). 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDstPlane -Pointer to the top-left corner of the frame (NOT the 
+ *            top-left corner of the plane); must be aligned on an 8-byte 
+ *            boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned under any of the following 
+ *              conditions: 
+ *    -    pSrcDstPlane is NULL. 
+ *    -    pSrcDstPlane is not aligned on an 8-byte boundary. 
+ *    -    one of the following parameters is either equal to zero or is a 
+ *              non-multiple of 8: iFrameHeight, iFrameWidth, iPlaneStep, or 
+ *              iExpandPels. 
+ *    -    iPlaneStep < (iFrameWidth + 2 * iExpandPels). 
+ *
+ */
+OMXResult omxVCCOMM_ExpandFrame_I (
+    OMX_U8 *pSrcDstPlane,
+    OMX_U32 iFrameWidth,
+    OMX_U32 iFrameHeight,
+    OMX_U32 iExpandPels,
+    OMX_U32 iPlaneStep
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_Copy8x8   (6.1.3.3.1)
+ *
+ * Description:
+ * Copies the reference 8x8 block to the current block. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the reference block in the source frame; must be 
+ *            aligned on an 8-byte boundary. 
+ *   step - distance between the starts of consecutive lines in the reference 
+ *            frame, in bytes; must be a multiple of 8 and must be larger than 
+ *            or equal to 8. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the destination block; must be aligned on an 8-byte 
+ *            boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned under any of the following 
+ *              conditions: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst 
+ *    -   one or more of the following pointers is not aligned on an 8-byte 
+ *              boundary: pSrc, pDst 
+ *    -    step <8 or step is not a multiple of 8. 
+ *
+ */
+OMXResult omxVCCOMM_Copy8x8 (
+    const OMX_U8 *pSrc,
+    OMX_U8 *pDst,
+    OMX_INT step
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_Copy16x16   (6.1.3.3.2)
+ *
+ * Description:
+ * Copies the reference 16x16 macroblock to the current macroblock. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the reference macroblock in the source frame; must be 
+ *            aligned on a 16-byte boundary. 
+ *   step - distance between the starts of consecutive lines in the reference 
+ *            frame, in bytes; must be a multiple of 16 and must be larger 
+ *            than or equal to 16. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the destination macroblock; must be aligned on a 
+ *            16-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned under any of the following 
+ *              conditions: 
+ *    -   one or more of the following pointers is NULL: pSrc, pDst 
+ *    -   one or more of the following pointers is not aligned on a 16-byte 
+ *              boundary: pSrc, pDst 
+ *    -    step <16 or step is not a multiple of 16. 
+ *
+ */
+OMXResult omxVCCOMM_Copy16x16 (
+    const OMX_U8 *pSrc,
+    OMX_U8 *pDst,
+    OMX_INT step
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_ComputeTextureErrorBlock_SAD   (6.1.4.1.1)
+ *
+ * Description:
+ * Computes texture error of the block; also returns SAD. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the source plane; must be aligned on an 8-byte 
+ *            boundary. 
+ *   srcStep - step of the source plane 
+ *   pSrcRef - pointer to the reference buffer, an 8x8 block; must be aligned 
+ *            on an 8-byte boundary. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the destination buffer, an 8x8 block; must be aligned 
+ *            on an 8-byte boundary. 
+ *   pDstSAD - pointer to the Sum of Absolute Differences (SAD) value 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments 
+ *    -    At least one of the following 
+ *         pointers is NULL: pSrc, pSrcRef, pDst and pDstSAD. 
+ *    -    pSrc is not 8-byte aligned. 
+ *    -    SrcStep <= 0 or srcStep is not a multiple of 8. 
+ *    -    pSrcRef is not 8-byte aligned. 
+ *    -    pDst is not 8-byte aligned. 
+ *
+ */
+OMXResult omxVCCOMM_ComputeTextureErrorBlock_SAD (
+    const OMX_U8 *pSrc,
+    OMX_INT srcStep,
+    const OMX_U8 *pSrcRef,
+    OMX_S16 *pDst,
+    OMX_INT *pDstSAD
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_ComputeTextureErrorBlock   (6.1.4.1.2)
+ *
+ * Description:
+ * Computes the texture error of the block. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the source plane. This should be aligned on an 8-byte 
+ *            boundary. 
+ *   srcStep - step of the source plane 
+ *   pSrcRef - pointer to the reference buffer, an 8x8 block. This should be 
+ *            aligned on an 8-byte boundary. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the destination buffer, an 8x8 block. This should be 
+ *            aligned on an 8-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments:
+ *    -    At least one of the following pointers is NULL: 
+ *         pSrc, pSrcRef, pDst. 
+ *    -    pSrc is not 8-byte aligned. 
+ *    -    SrcStep <= 0 or srcStep is not a multiple of 8. 
+ *    -    pSrcRef is not 8-byte aligned. 
+ *    -    pDst is not 8-byte aligned 
+ *
+ */
+OMXResult omxVCCOMM_ComputeTextureErrorBlock (
+    const OMX_U8 *pSrc,
+    OMX_INT srcStep,
+    const OMX_U8 *pSrcRef,
+    OMX_S16 *pDst
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_LimitMVToRect   (6.1.4.1.3)
+ *
+ * Description:
+ * Limits the motion vector associated with the current block/macroblock to 
+ * prevent the motion compensated block/macroblock from moving outside a 
+ * bounding rectangle as shown in Figure 6-1. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcMV - pointer to the motion vector associated with the current block 
+ *            or macroblock 
+ *   pRectVOPRef - pointer to the bounding rectangle 
+ *   Xcoord, Ycoord  - coordinates of the current block or macroblock 
+ *   size - size of the current block or macroblock; must be equal to 8 or 
+ *            16. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstMV - pointer to the limited motion vector 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one or more of the 
+ *              following conditions is true: 
+ *    -    at least one of the following pointers is NULL: 
+ *         pSrcMV, pDstMV, or pRectVOPRef. 
+ *    -    size is not equal to either 8 or 16. 
+ *    -    the width or height of the bounding rectangle is less than 
+ *         twice the block size.
+ */
+OMXResult omxVCCOMM_LimitMVToRect (
+    const OMXVCMotionVector *pSrcMV,
+    OMXVCMotionVector *pDstMV,
+    const OMXRect *pRectVOPRef,
+    OMX_INT Xcoord,
+    OMX_INT Ycoord,
+    OMX_INT size
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_SAD_16x   (6.1.4.1.4)
+ *
+ * Description:
+ * This function calculates the SAD for 16x16 and 16x8 blocks. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrg - Pointer to the original block; must be aligned on a 16-byte 
+ *             boundary. 
+ *   iStepOrg - Step of the original block buffer 
+ *   pSrcRef  - Pointer to the reference block 
+ *   iStepRef - Step of the reference block buffer 
+ *   iHeight  - Height of the block 
+ *
+ * Output Arguments:
+ *   
+ *   pDstSAD - Pointer of result SAD 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one or more of the 
+ *              following conditions is true: 
+ *    -    at least one of the following pointers is NULL: 
+ *         pSrcOrg, pDstSAD, or pSrcRef 
+ *    -    pSrcOrg is not 16-byte aligned. 
+ *    -    iStepOrg  <= 0 or iStepOrg is not a multiple of 16 
+ *    -    iStepRef <= 0 or iStepRef is not a multiple of 16 
+ *    -    iHeight is not 8 or 16 
+ *
+ */
+OMXResult omxVCCOMM_SAD_16x (
+    const OMX_U8 *pSrcOrg,
+    OMX_U32 iStepOrg,
+    const OMX_U8 *pSrcRef,
+    OMX_U32 iStepRef,
+    OMX_S32 *pDstSAD,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCCOMM_SAD_8x   (6.1.4.1.5)
+ *
+ * Description:
+ * This function calculates the SAD for 8x16, 8x8, 8x4 blocks. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrg  - Pointer to the original block; must be aligned on a 8-byte 
+ *              boundary. 
+ *   iStepOrg - Step of the original block buffer 
+ *   pSrcRef  - Pointer to the reference block 
+ *   iStepRef - Step of the reference block buffer 
+ *   iHeight  - Height of the block 
+ *
+ * Output Arguments:
+ *   
+ *   pDstSAD -Pointer of result SAD 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one or more of the 
+ *              following conditions is true: 
+ *    -    at least one of the following pointers is NULL: 
+ *         pSrcOrg, pDstSAD, or pSrcRef 
+ *    -    pSrcOrg is not 8-byte aligned. 
+ *    -    iStepOrg  <= 0 or iStepOrg is not a multiple of 8 
+ *    -    iStepRef <= 0 or iStepRef is not a multiple of 8 
+ *    -    iHeight is not 4, 8 or 16 
+ *
+ */
+OMXResult omxVCCOMM_SAD_8x (
+    const OMX_U8 *pSrcOrg,
+    OMX_U32 iStepOrg,
+    const OMX_U8 *pSrcRef,
+    OMX_U32 iStepRef,
+    OMX_S32*pDstSAD,
+    OMX_U32 iHeight
+);
+
+
+
+/* 6.2.1.1 Direction  */
+/* The direction enumerator is used with functions that perform AC/DC prediction and zig-zag scan.  */
+
+enum {
+    OMX_VC_NONE       = 0,
+    OMX_VC_HORIZONTAL = 1,
+    OMX_VC_VERTICAL   = 2 
+};
+
+
+
+/* 6.2.1.2 Bilinear Interpolation  */
+/* The bilinear interpolation enumerator is used with motion estimation, motion compensation, and reconstruction functions.  */
+
+enum {
+    OMX_VC_INTEGER_PIXEL = 0, /* case a */
+    OMX_VC_HALF_PIXEL_X  = 1, /* case b */
+    OMX_VC_HALF_PIXEL_Y  = 2, /* case c */
+    OMX_VC_HALF_PIXEL_XY = 3  /* case d */ 
+};
+
+
+
+/* 6.2.1.3 Neighboring Macroblock Availability  */
+/* Neighboring macroblock availability is indicated using the following flags:   */
+
+enum {
+    OMX_VC_UPPER = 1,        /** above macroblock is available */
+    OMX_VC_LEFT = 2,         /** left macroblock is available */
+    OMX_VC_CENTER = 4,
+    OMX_VC_RIGHT = 8,
+    OMX_VC_LOWER = 16,
+    OMX_VC_UPPER_LEFT = 32,  /** above-left macroblock is available */
+    OMX_VC_UPPER_RIGHT = 64, /** above-right macroblock is available */
+    OMX_VC_LOWER_LEFT = 128,
+    OMX_VC_LOWER_RIGHT = 256 
+};
+
+
+
+/* 6.2.1.4 Video Components  */
+/* A data type that enumerates video components is defined as follows:  */
+
+typedef enum {
+    OMX_VC_LUMINANCE,    /** Luminance component */
+    OMX_VC_CHROMINANCE   /** chrominance component */ 
+} OMXVCM4P2VideoComponent;
+
+
+
+/* 6.2.1.5 MacroblockTypes  */
+/* A data type that enumerates macroblock types is defined as follows:  */
+
+typedef enum {
+    OMX_VC_INTER     = 0, /** P picture or P-VOP */
+    OMX_VC_INTER_Q   = 1, /** P picture or P-VOP */
+    OMX_VC_INTER4V   = 2, /** P picture or P-VOP */
+    OMX_VC_INTRA     = 3, /** I and P picture, I- and P-VOP */
+    OMX_VC_INTRA_Q   = 4, /** I and P picture, I- and P-VOP */
+    OMX_VC_INTER4V_Q = 5  /** P picture or P-VOP (H.263)*/
+} OMXVCM4P2MacroblockType;
+
+
+
+/* 6.2.1.6 Coordinates  */
+/* Coordinates are represented as follows:  */
+
+typedef struct {
+    OMX_INT x;
+    OMX_INT y;
+} OMXVCM4P2Coordinate;
+
+
+
+/* 6.2.1.7 Motion Estimation Algorithms  */
+/* A data type that enumerates motion estimation search methods is defined as follows:  */
+
+typedef enum {
+    OMX_VC_M4P2_FAST_SEARCH = 0,  /** Fast motion search */
+    OMX_VC_M4P2_FULL_SEARCH = 1   /** Full motion search */ 
+} OMXVCM4P2MEMode;
+
+
+
+/* 6.2.1.8 Motion Estimation Parameters  */
+/* A data structure containing control parameters for 
+ * motion estimation functions is defined as follows:  
+ */
+
+typedef struct {
+    OMX_INT searchEnable8x8;     /** enables 8x8 search */
+    OMX_INT halfPelSearchEnable; /** enables half-pel resolution */
+    OMX_INT searchRange;         /** search range */
+    OMX_INT rndVal;              /** rounding control; 0-disabled, 1-enabled*/
+} OMXVCM4P2MEParams;
+
+
+
+/* 6.2.1.9 Macroblock Information   */
+/* A data structure containing macroblock parameters for 
+ * motion estimation functions is defined as follows:  
+ */
+
+typedef struct {
+    OMX_S32 sliceId;                 /* slice number */
+    OMXVCM4P2MacroblockType mbType;  /* MB type: OMX_VC_INTRA, OMX_VC_INTER, or OMX_VC_INTER4 */
+    OMX_S32 qp;                      /* quantization parameter*/
+    OMX_U32 cbpy;                    /* CBP Luma */
+    OMX_U32 cbpc;                    /* CBP Chroma */
+    OMXVCMotionVector pMV0[2][2];    /* motion vector, represented using 1/2-pel units, 
+                                      * pMV0[blocky][blockx] (blocky = 0~1, blockx =0~1) 
+                                      */
+    OMXVCMotionVector pMVPred[2][2]; /* motion vector prediction, represented using 1/2-pel units, 
+                                      * pMVPred[blocky][blockx] (blocky = 0~1, blockx = 0~1) 
+                                      */
+    OMX_U8 pPredDir[2][2];           /* AC prediction direction: 
+                                      *   OMX_VC_NONE, OMX_VC_VERTICAL, OMX_VC_HORIZONTAL 
+                                      */
+} OMXVCM4P2MBInfo, *OMXVCM4P2MBInfoPtr;
+
+
+
+/**
+ * Function:  omxVCM4P2_FindMVpred   (6.2.3.1.1)
+ *
+ * Description:
+ * Predicts a motion vector for the current block using the procedure 
+ * specified in [ISO14496-2], subclause 7.6.5.  The resulting predicted MV is 
+ * returned in pDstMVPred. If the parameter pDstMVPredME if is not NULL then 
+ * the set of three MV candidates used for prediction is also returned, 
+ * otherwise pDstMVPredMEis NULL upon return. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcMVCurMB - pointer to the MV buffer associated with the current Y 
+ *            macroblock; a value of NULL indicates unavailability. 
+ *   pSrcCandMV1 - pointer to the MV buffer containing the 4 MVs associated 
+ *            with the MB located to the left of the current MB; set to NULL 
+ *            if there is no MB to the left. 
+ *   pSrcCandMV2 - pointer to the MV buffer containing the 4 MVs associated 
+ *            with the MB located above the current MB; set to NULL if there 
+ *            is no MB located above the current MB. 
+ *   pSrcCandMV3 - pointer to the MV buffer containing the 4 MVs associated 
+ *            with the MB located to the right and above the current MB; set 
+ *            to NULL if there is no MB located to the above-right. 
+ *   iBlk - the index of block in the current macroblock 
+ *   pDstMVPredME - MV candidate return buffer;  if set to NULL then 
+ *            prediction candidate MVs are not returned and pDstMVPredME will 
+ *            be NULL upon function return; if pDstMVPredME is non-NULL then it 
+ *            must point to a buffer containing sufficient space for three 
+ *            return MVs. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstMVPred - pointer to the predicted motion vector 
+ *   pDstMVPredME - if non-NULL upon input then pDstMVPredME  points upon 
+ *            return to a buffer containing the three motion vector candidates 
+ *            used for prediction as specified in [ISO14496-2], subclause 
+ *            7.6.5, otherwise if NULL upon input then pDstMVPredME is NULL 
+ *            upon output. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned under any of the following 
+ *              conditions: 
+ *    -    the pointer pDstMVPred is NULL 
+ *    -    the parameter iBlk does not fall into the range 0 <= iBlk<=3 
+ *
+ */
+OMXResult omxVCM4P2_FindMVpred (
+    const OMXVCMotionVector *pSrcMVCurMB,
+    const OMXVCMotionVector *pSrcCandMV1,
+    const OMXVCMotionVector *pSrcCandMV2,
+    const OMXVCMotionVector *pSrcCandMV3,
+    OMXVCMotionVector *pDstMVPred,
+    OMXVCMotionVector *pDstMVPredME,
+    OMX_INT iBlk
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_IDCT8x8blk   (6.2.3.2.1)
+ *
+ * Description:
+ * Computes a 2D inverse DCT for a single 8x8 block, as defined in 
+ * [ISO14496-2]. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the start of the linearly arranged IDCT input buffer; 
+ *            must be aligned on a 16-byte boundary.  According to 
+ *            [ISO14496-2], the input coefficient values should lie within the 
+ *            range [-2048, 2047]. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the start of the linearly arranged IDCT output buffer; 
+ *            must be aligned on a 16-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments:
+ *    -    pSrc or pDst is NULL. 
+ *    -    pSrc or pDst is not 16-byte aligned. 
+ *
+ */
+OMXResult omxVCM4P2_IDCT8x8blk (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_MEGetBufSize   (6.2.4.1.1)
+ *
+ * Description:
+ * Computes the size, in bytes, of the vendor-specific specification 
+ * structure for the following motion estimation functions: 
+ * BlockMatch_Integer_8x8, BlockMatch_Integer_16x16, and MotionEstimationMB. 
+ *
+ * Input Arguments:
+ *   
+ *   MEmode - motion estimation mode; available modes are defined by the 
+ *            enumerated type OMXVCM4P2MEMode 
+ *   pMEParams - motion estimation parameters 
+ *
+ * Output Arguments:
+ *   
+ *   pSize - pointer to the number of bytes required for the specification 
+ *            structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - one or more of the following is true: 
+ *    -    an invalid value was specified for the parameter MEmode 
+ *    -    a negative or zero value was specified for the 
+ *         parameter pMEParams->searchRange 
+ *
+ */
+OMXResult omxVCM4P2_MEGetBufSize (
+    OMXVCM4P2MEMode MEmode,
+    const OMXVCM4P2MEParams *pMEParams,
+    OMX_U32 *pSize
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_MEInit   (6.2.4.1.2)
+ *
+ * Description:
+ * Initializes the vendor-specific specification structure required for the 
+ * following motion estimation functions:  BlockMatch_Integer_8x8, 
+ * BlockMatch_Integer_16x16, and MotionEstimationMB. Memory for the 
+ * specification structure *pMESpec must be allocated prior to calling the 
+ * function, and should be aligned on a 4-byte boundary.  Following 
+ * initialization by this function, the vendor-specific structure *pMESpec 
+ * should contain an implementation-specific representation of all motion 
+ * estimation parameters received via the structure pMEParams, for example  
+ * rndVal, searchRange, etc.  The number of bytes required for the 
+ * specification structure can be determined using the function 
+ * omxVCM4P2_MEGetBufSize. 
+ *
+ * Input Arguments:
+ *   
+ *   MEmode - motion estimation mode; available modes are defined by the 
+ *            enumerated type OMXVCM4P2MEMode 
+ *   pMEParams - motion estimation parameters 
+ *   pMESpec - pointer to the uninitialized ME specification structure 
+ *
+ * Output Arguments:
+ *   
+ *   pMESpec - pointer to the initialized ME specification structure 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - one or more of the following is true: 
+ *    -    an invalid value was specified for the parameter MEmode 
+ *    -    a negative or zero value was specified for the 
+ *         parameter pMEParams->searchRange 
+ *
+ */
+OMXResult omxVCM4P2_MEInit (
+    OMXVCM4P2MEMode MEmode,
+    const OMXVCM4P2MEParams*pMEParams,
+    void *pMESpec
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_BlockMatch_Integer_16x16   (6.2.4.2.1)
+ *
+ * Description:
+ * Performs a 16x16 block search; estimates motion vector and associated 
+ * minimum SAD. Both the input and output motion vectors are represented using 
+ * half-pixel units, and therefore a shift left or right by 1 bit may be 
+ * required, respectively, to match the input or output MVs with other 
+ * functions that either generate output MVs or expect input MVs represented 
+ * using integer pixel units. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcRefBuf - pointer to the reference Y plane; points to the reference 
+ *            MB that corresponds to the location of the current macroblock in 
+ *            the current plane. 
+ *   refWidth - width of the reference plane 
+ *   pRefRect - pointer to the valid reference plane rectangle; coordinates 
+ *            are specified relative to the image origin.  Rectangle 
+ *            boundaries may extend beyond image boundaries if the image has 
+ *            been padded.  For example, if padding extends 4 pixels beyond 
+ *            frame border, then the value for the left border could be set to 
+ *            -4. 
+ *   pSrcCurrBuf - pointer to the current block in the current macroblock 
+ *            buffer extracted from the original plane (linear array, 256 
+ *            entries); must be aligned on a 16-byte boundary.  The number of 
+ *            bytes between lines (step) is 16. 
+ *   pCurrPointPos - position of the current macroblock in the current plane 
+ *   pSrcPreMV - pointer to predicted motion vector; NULL indicates no 
+ *            predicted MV 
+ *   pSrcPreSAD - pointer to SAD associated with the predicted MV (referenced 
+ *            by pSrcPreMV); may be set to NULL if unavailable. 
+ *   pMESpec - vendor-specific motion estimation specification structure; 
+ *            must have been allocated and then initialized using 
+ *            omxVCM4P2_MEInit prior to calling the block matching function. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstMV - pointer to estimated MV 
+ *   pDstSAD - pointer to minimum SAD 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one of the following 
+ *              conditions is true: 
+ *    -    at least one of the following pointers is NULL: pSrcRefBuf, 
+ *              pRefRect, pSrcCurrBuff, pCurrPointPos, pDstMV, pDstSAD or 
+ *              pMESpec, or 
+ *    -    pSrcCurrBuf is not 16-byte aligned 
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Integer_16x16 (
+    const OMX_U8 *pSrcRefBuf,
+    OMX_INT refWidth,
+    const OMXRect *pRefRect,
+    const OMX_U8 *pSrcCurrBuf,
+    const OMXVCM4P2Coordinate *pCurrPointPos,
+    const OMXVCMotionVector*pSrcPreMV,
+    const OMX_INT *pSrcPreSAD,
+    void *pMESpec,
+    OMXVCMotionVector*pDstMV,
+    OMX_INT *pDstSAD
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_BlockMatch_Integer_8x8   (6.2.4.2.2)
+ *
+ * Description:
+ * Performs an 8x8 block search; estimates motion vector and associated 
+ * minimum SAD.  Both the input and output motion vectors are represented 
+ * using half-pixel units, and therefore a shift left or right by 1 bit may be 
+ * required, respectively, to match the input or output MVs with other 
+ * functions that either generate output MVs or expect input MVs represented 
+ * using integer pixel units. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcRefBuf - pointer to the reference Y plane; points to the reference 
+ *            block that corresponds to the location of the current 8x8 block 
+ *            in the current plane. 
+ *   refWidth - width of the reference plane 
+ *   pRefRect - pointer to the valid reference plane rectangle; coordinates 
+ *            are specified relative to the image origin.  Rectangle 
+ *            boundaries may extend beyond image boundaries if the image has 
+ *            been padded. 
+ *   pSrcCurrBuf - pointer to the current block in the current macroblock 
+ *            buffer extracted from the original plane (linear array, 128 
+ *            entries); must be aligned on an 8-byte boundary.  The number of 
+ *            bytes between lines (step) is 16 bytes. 
+ *   pCurrPointPos - position of the current block in the current plane 
+ *   pSrcPreMV - pointer to predicted motion vector; NULL indicates no 
+ *            predicted MV 
+ *   pSrcPreSAD - pointer to SAD associated with the predicted MV (referenced 
+ *            by pSrcPreMV); may be set to NULL if unavailable. 
+ *   pMESpec - vendor-specific motion estimation specification structure; 
+ *            must have been allocated and then initialized using 
+ *            omxVCM4P2_MEInit prior to calling the block matching function. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstMV - pointer to estimated MV 
+ *   pDstSAD - pointer to minimum SAD 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one of the following 
+ *              conditions is true: 
+ *    -    at least one of the following pointers is NULL: pSrcRefBuf, 
+ *              pRefRect, pSrcCurrBuff, pCurrPointPos, pDstMV, pDstSAD or 
+ *              pMESpec, or 
+ *    -    pSrcCurrBuf is not 8-byte aligned 
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Integer_8x8 (
+    const OMX_U8 *pSrcRefBuf,
+    OMX_INT refWidth,
+    const OMXRect *pRefRect,
+    const OMX_U8 *pSrcCurrBuf,
+    const OMXVCM4P2Coordinate *pCurrPointPos,
+    const OMXVCMotionVector *pSrcPreMV,
+    const OMX_INT *pSrcPreSAD,
+    void *pMESpec,
+    OMXVCMotionVector *pDstMV,
+    OMX_INT *pDstSAD
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_BlockMatch_Half_16x16   (6.2.4.2.3)
+ *
+ * Description:
+ * Performs a 16x16 block match with half-pixel resolution.  Returns the 
+ * estimated motion vector and associated minimum SAD.  This function 
+ * estimates the half-pixel motion vector by interpolating the integer 
+ * resolution motion vector referenced by the input parameter pSrcDstMV, i.e., 
+ * the initial integer MV is generated externally.  The input parameters 
+ * pSrcRefBuf and pSearchPointRefPos should be shifted by the winning MV of 
+ * 16x16 integer search prior to calling BlockMatch_Half_16x16. The function 
+ * BlockMatch_Integer_16x16 may be used for integer motion estimation. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcRefBuf - pointer to the reference Y plane; points to the reference 
+ *            macroblock that corresponds to the location of the current 
+ *            macroblock in the current plane. 
+ *   refWidth - width of the reference plane 
+ *   pRefRect - reference plane valid region rectangle 
+ *   pSrcCurrBuf - pointer to the current block in the current macroblock 
+ *            buffer extracted from the original plane (linear array, 256 
+ *            entries); must be aligned on a 16-byte boundary.  The number of 
+ *            bytes between lines (step) is 16. 
+ *   pSearchPointRefPos - position of the starting point for half pixel 
+ *            search (specified in terms of integer pixel units) in the 
+ *            reference plane, i.e., the reference position pointed to by the 
+ *            predicted motion vector. 
+ *   rndVal - rounding control parameter: 0 - disabled; 1 - enabled. 
+ *   pSrcDstMV - pointer to the initial MV estimate; typically generated 
+ *            during a prior 16X16 integer search; specified in terms of 
+ *            half-pixel units. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDstMV - pointer to estimated MV 
+ *   pDstSAD - pointer to minimum SAD 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one of the following 
+ *              conditions is true: 
+ *    -    at least one of the following pointers is NULL: pSrcRefBuf, 
+ *         pRefRect, pSrcCurrBuff, pSearchPointRefPos, pSrcDstMV.
+ *    -    pSrcCurrBuf is not 16-byte aligned, or 
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Half_16x16 (
+    const OMX_U8 *pSrcRefBuf,
+    OMX_INT refWidth,
+    const OMXRect *pRefRect,
+    const OMX_U8 *pSrcCurrBuf,
+    const OMXVCM4P2Coordinate *pSearchPointRefPos,
+    OMX_INT rndVal,
+    OMXVCMotionVector *pSrcDstMV,
+    OMX_INT *pDstSAD
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_BlockMatch_Half_8x8   (6.2.4.2.4)
+ *
+ * Description:
+ * Performs an 8x8 block match with half-pixel resolution. Returns the 
+ * estimated motion vector and associated minimum SAD.  This function 
+ * estimates the half-pixel motion vector by interpolating the integer 
+ * resolution motion vector referenced by the input parameter pSrcDstMV, i.e., 
+ * the initial integer MV is generated externally.  The input parameters 
+ * pSrcRefBuf and pSearchPointRefPos should be shifted by the winning MV of 
+ * 8x8 integer search prior to calling BlockMatch_Half_8x8. The function 
+ * BlockMatch_Integer_8x8 may be used for integer motion estimation. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcRefBuf - pointer to the reference Y plane; points to the reference 
+ *            block that corresponds to the location of the current 8x8 block 
+ *            in the current plane. 
+ *   refWidth - width of the reference plane 
+ *   pRefRect - reference plane valid region rectangle 
+ *   pSrcCurrBuf - pointer to the current block in the current macroblock 
+ *            buffer extracted from the original plane (linear array, 128 
+ *            entries); must be aligned on a 8-byte boundary.  The number of 
+ *            bytes between lines (step) is 16. 
+ *   pSearchPointRefPos - position of the starting point for half pixel 
+ *            search (specified in terms of integer pixel units) in the 
+ *            reference plane. 
+ *   rndVal - rounding control parameter: 0 - disabled; 1 - enabled. 
+ *   pSrcDstMV - pointer to the initial MV estimate; typically generated 
+ *            during a prior 8x8 integer search, specified in terms of 
+ *            half-pixel units. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDstMV - pointer to estimated MV 
+ *   pDstSAD - pointer to minimum SAD 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one of the following 
+ *              conditions is true: 
+ *    -    at least one of the following pointers is NULL: 
+ *         pSrcRefBuf, pRefRect, pSrcCurrBuff, pSearchPointRefPos, pSrcDstMV
+ *    -    pSrcCurrBuf is not 8-byte aligned 
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Half_8x8 (
+    const OMX_U8 *pSrcRefBuf,
+    OMX_INT refWidth,
+    const OMXRect *pRefRect,
+    const OMX_U8 *pSrcCurrBuf,
+    const OMXVCM4P2Coordinate *pSearchPointRefPos,
+    OMX_INT rndVal,
+    OMXVCMotionVector *pSrcDstMV,
+    OMX_INT *pDstSAD
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_MotionEstimationMB   (6.2.4.3.1)
+ *
+ * Description:
+ * Performs motion search for a 16x16 macroblock.  Selects best motion search 
+ * strategy from among inter-1MV, inter-4MV, and intra modes.  Supports 
+ * integer and half pixel resolution. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcCurrBuf - pointer to the top-left corner of the current MB in the 
+ *            original picture plane; must be aligned on a 16-byte boundary.  
+ *            The function does not expect source data outside the region 
+ *            bounded by the MB to be available; for example it is not 
+ *            necessary for the caller to guarantee the availability of 
+ *            pSrcCurrBuf[-SrcCurrStep], i.e., the row of pixels above the MB 
+ *            to be processed. 
+ *   srcCurrStep - width of the original picture plane, in terms of full 
+ *            pixels; must be a multiple of 16. 
+ *   pSrcRefBuf - pointer to the reference Y plane; points to the reference 
+ *            plane location corresponding to the location of the current 
+ *            macroblock in the current plane; must be aligned on a 16-byte 
+ *            boundary. 
+ *   srcRefStep - width of the reference picture plane, in terms of full 
+ *            pixels; must be a multiple of 16. 
+ *   pRefRect - reference plane valid region rectangle, specified relative to 
+ *            the image origin 
+ *   pCurrPointPos - position of the current macroblock in the current plane 
+ *   pMESpec - pointer to the vendor-specific motion estimation specification 
+ *            structure; must be allocated and then initialized using 
+ *            omxVCM4P2_MEInit prior to calling this function. 
+ *   pMBInfo - array, of dimension four, containing pointers to information 
+ *            associated with four nearby MBs: 
+ *            -   pMBInfo[0] - pointer to left MB information 
+ *            -   pMBInfo[1] - pointer to top MB information 
+ *            -   pMBInfo[2] - pointer to top-left MB information 
+ *            -   pMBInfo[3] - pointer to top-right MB information 
+ *            Any pointer in the array may be set equal to NULL if the 
+ *            corresponding MB doesn't exist.  For each MB, the following structure 
+ *            members are used:    
+ *            -   mbType - macroblock type, either OMX_VC_INTRA, OMX_VC_INTER, or 
+ *                OMX_VC_INTER4V 
+ *            -   pMV0[2][2] - estimated motion vectors; represented 
+ *                in 1/2 pixel units 
+ *            -   sliceID - number of the slice to which the MB belongs 
+ *   pSrcDstMBCurr - pointer to information structure for the current MB.  
+ *            The following entries should be set prior to calling the 
+ *            function: sliceID - the number of the slice the to which the 
+ *            current MB belongs.  The structure elements cbpy and cbpc are 
+ *            ignored. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDstMBCurr - pointer to updated information structure for the current 
+ *            MB after MB-level motion estimation has been completed.  The 
+ *            following structure members are updated by the ME function:   
+ *              -  mbType - macroblock type: OMX_VC_INTRA, OMX_VC_INTER, or 
+ *                 OMX_VC_INTER4V. 
+ *              -  pMV0[2][2] - estimated motion vectors; represented in 
+ *                 terms of 1/2 pel units. 
+ *              -  pMVPred[2][2] - predicted motion vectors; represented 
+ *                 in terms of 1/2 pel units. 
+ *            The structure members cbpy and cbpc are not updated by the function. 
+ *   pDstSAD - pointer to the minimum SAD for INTER1V, or sum of minimum SADs 
+ *            for INTER4V 
+ *   pDstBlockSAD - pointer to an array of SAD values for each of the four 
+ *            8x8 luma blocks in the MB.  The block SADs are in scan order for 
+ *            each MB. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments.  Returned if one or more of the 
+ *              following conditions is true: 
+ *    -    at least one of the following pointers is NULL: pSrcCurrBuf, 
+ *              pSrcRefBuf, pRefRect, pCurrPointPos, pMBInter, pMBIntra, 
+ *              pSrcDstMBCurr, or pDstSAD. 
+ *
+ */
+OMXResult omxVCM4P2_MotionEstimationMB (
+    const OMX_U8 *pSrcCurrBuf,
+    OMX_S32 srcCurrStep,
+    const OMX_U8 *pSrcRefBuf,
+    OMX_S32 srcRefStep,
+    const OMXRect*pRefRect,
+    const OMXVCM4P2Coordinate *pCurrPointPos,
+    void *pMESpec,
+    const OMXVCM4P2MBInfoPtr *pMBInfo,
+    OMXVCM4P2MBInfo *pSrcDstMBCurr,
+    OMX_U16 *pDstSAD,
+    OMX_U16 *pDstBlockSAD
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_DCT8x8blk   (6.2.4.4.1)
+ *
+ * Description:
+ * Computes a 2D forward DCT for a single 8x8 block, as defined in 
+ * [ISO14496-2]. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the start of the linearly arranged input buffer; must 
+ *            be aligned on a 16-byte boundary.  Input values (pixel 
+ *            intensities) are valid in the range [-255,255]. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the start of the linearly arranged output buffer; must 
+ *            be aligned on a 16-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments, returned if:
+ *    -    pSrc or pDst is NULL. 
+ *    -    pSrc or pDst is not 16-byte aligned. 
+ *
+ */
+OMXResult omxVCM4P2_DCT8x8blk (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_QuantIntra_I   (6.2.4.4.2)
+ *
+ * Description:
+ * Performs quantization on intra block coefficients. This function supports 
+ * bits_per_pixel == 8. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input intra block coefficients; must be aligned 
+ *            on a 16-byte boundary. 
+ *   QP - quantization parameter (quantizer_scale). 
+ *   blockIndex - block index indicating the component type and position, 
+ *            valid in the range 0 to 5, as defined in [ISO14496-2], subclause 
+ *            6.1.3.8. 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; shortVideoHeader==1 selects linear intra DC 
+ *            mode, and shortVideoHeader==0 selects non linear intra DC mode. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to the output (quantized) interblock coefficients.  
+ *            When shortVideoHeader==1, AC coefficients are saturated on the 
+ *            interval [-127, 127], and DC coefficients are saturated on the 
+ *            interval [1, 254].  When shortVideoHeader==0, AC coefficients 
+ *            are saturated on the interval [-2047, 2047]. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments:
+ *    -    pSrcDst is NULL. 
+ *    -    blockIndex < 0 or blockIndex >= 10 
+ *    -    QP <= 0 or QP >= 32. 
+ *
+ */
+OMXResult omxVCM4P2_QuantIntra_I (
+    OMX_S16 *pSrcDst,
+    OMX_U8 QP,
+    OMX_INT blockIndex,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_QuantInter_I   (6.2.4.4.3)
+ *
+ * Description:
+ * Performs quantization on an inter coefficient block; supports 
+ * bits_per_pixel == 8. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input inter block coefficients; must be aligned 
+ *            on a 16-byte boundary. 
+ *   QP - quantization parameter (quantizer_scale) 
+ *   shortVideoHeader - binary flag indicating presence of short_video_header; 
+ *            shortVideoHeader==1 selects linear intra DC mode, and 
+ *            shortVideoHeader==0 selects non linear intra DC mode. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to the output (quantized) interblock coefficients.  
+ *            When shortVideoHeader==1, AC coefficients are saturated on the 
+ *            interval [-127, 127], and DC coefficients are saturated on the 
+ *            interval [1, 254].  When shortVideoHeader==0, AC coefficients 
+ *            are saturated on the interval [-2047, 2047]. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments:
+ *    -    pSrcDst is NULL. 
+ *    -    QP <= 0 or QP >= 32. 
+ *
+ */
+OMXResult omxVCM4P2_QuantInter_I (
+    OMX_S16 *pSrcDst,
+    OMX_U8 QP,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_TransRecBlockCoef_intra   (6.2.4.4.4)
+ *
+ * Description:
+ * Quantizes the DCT coefficients, implements intra block AC/DC coefficient 
+ * prediction, and reconstructs the current intra block texture for prediction 
+ * on the next frame.  Quantized row and column coefficients are returned in 
+ * the updated coefficient buffers. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the pixels of current intra block; must be aligned on 
+ *            an 8-byte boundary. 
+ *   pPredBufRow - pointer to the coefficient row buffer containing 
+ *            ((num_mb_per_row * 2 + 1) * 8) elements of type OMX_S16. 
+ *            Coefficients are organized into blocks of eight as described 
+ *            below (Internal Prediction Coefficient Update Procedures).  The 
+ *            DC coefficient is first, and the remaining buffer locations 
+ *            contain the quantized AC coefficients. Each group of eight row 
+ *            buffer elements combined with one element eight elements ahead 
+ *            contains the coefficient predictors of the neighboring block 
+ *            that is spatially above or to the left of the block currently to 
+ *            be decoded. A negative-valued DC coefficient indicates that this 
+ *            neighboring block is not INTRA-coded or out of bounds, and 
+ *            therefore the AC and DC coefficients are invalid.  Pointer must 
+ *            be aligned on an 8-byte boundary. 
+ *   pPredBufCol - pointer to the prediction coefficient column buffer 
+ *            containing 16 elements of type OMX_S16. Coefficients are 
+ *            organized as described in section 6.2.2.5.  Pointer must be 
+ *            aligned on an 8-byte boundary. 
+ *   pSumErr - pointer to a flag indicating whether or not AC prediction is 
+ *            required; AC prediction is enabled if *pSumErr >=0, but the 
+ *            value is not used for coefficient prediction, i.e., the sum of 
+ *            absolute differences starts from 0 for each call to this 
+ *            function.  Otherwise AC prediction is disabled if *pSumErr < 0 . 
+ *   blockIndex - block index indicating the component type and position, as 
+ *            defined in [ISO14496-2], subclause 6.1.3.8. 
+ *   curQp - quantization parameter of the macroblock to which the current 
+ *            block belongs 
+ *   pQpBuf - pointer to a 2-element quantization parameter buffer; pQpBuf[0] 
+ *            contains the quantization parameter associated with the 8x8 
+ *            block left of the current block (QPa), and pQpBuf[1] contains 
+ *            the quantization parameter associated with the 8x8 block above 
+ *            the current block (QPc).  In the event that the corresponding 
+ *            block is outside of the VOP bound, the Qp value will not affect 
+ *            the intra prediction process, as described in [ISO14496-2], 
+ *            sub-clause 7.4.3.3,  Adaptive AC Coefficient Prediction.  
+ *   srcStep - width of the source buffer; must be a multiple of 8. 
+ *   dstStep - width of the reconstructed destination buffer; must be a 
+ *            multiple of 16. 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; shortVideoHeader==1 selects linear intra DC 
+ *            mode, and shortVideoHeader==0 selects non linear intra DC mode. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the quantized DCT coefficient buffer; pDst[0] contains 
+ *            the predicted DC coefficient; the remaining entries contain the 
+ *            quantized AC coefficients (without prediction).  The pointer 
+ *            pDstmust be aligned on a 16-byte boundary. 
+ *   pRec - pointer to the reconstructed texture; must be aligned on an 
+ *            8-byte boundary. 
+ *   pPredBufRow - pointer to the updated coefficient row buffer 
+ *   pPredBufCol - pointer to the updated coefficient column buffer 
+ *   pPreACPredict - if prediction is enabled, the parameter points to the 
+ *            start of the buffer containing the coefficient differences for 
+ *            VLC encoding. The entry pPreACPredict[0]indicates prediction 
+ *            direction for the current block and takes one of the following 
+ *            values: OMX_VC_NONE (prediction disabled), OMX_VC_HORIZONTAL, or 
+ *            OMX_VC_VERTICAL.  The entries 
+ *            pPreACPredict[1]-pPreACPredict[7]contain predicted AC 
+ *            coefficients.  If prediction is disabled (*pSumErr<0) then the 
+ *            contents of this buffer are undefined upon return from the 
+ *            function 
+ *   pSumErr - pointer to the value of the accumulated AC coefficient errors, 
+ *            i.e., sum of the absolute differences between predicted and 
+ *            unpredicted AC coefficients 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - Bad arguments:
+ *    -    At least one of the following pointers is NULL: pSrc, pDst, pRec, 
+ *         pCoefBufRow, pCoefBufCol, pQpBuf, pPreACPredict, pSumErr. 
+ *    -    blockIndex < 0 or blockIndex >= 10; 
+ *    -    curQP <= 0 or curQP >= 32. 
+ *    -    srcStep, or dstStep <= 0 or not a multiple of 8. 
+ *    -    pDst is not 16-byte aligned: . 
+ *    -    At least one of the following pointers is not 8-byte aligned: 
+ *         pSrc, pRec.  
+ *
+ *  Note: The coefficient buffers must be updated in accordance with the 
+ *        update procedures defined in section in 6.2.2. 
+ *
+ */
+OMXResult omxVCM4P2_TransRecBlockCoef_intra (
+    const OMX_U8 *pSrc,
+    OMX_S16 *pDst,
+    OMX_U8 *pRec,
+    OMX_S16 *pPredBufRow,
+    OMX_S16 *pPredBufCol,
+    OMX_S16 *pPreACPredict,
+    OMX_INT *pSumErr,
+    OMX_INT blockIndex,
+    OMX_U8 curQp,
+    const OMX_U8 *pQpBuf,
+    OMX_INT srcStep,
+    OMX_INT dstStep,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_TransRecBlockCoef_inter   (6.2.4.4.5)
+ *
+ * Description:
+ * Implements DCT, and quantizes the DCT coefficients of the inter block 
+ * while reconstructing the texture residual. There is no boundary check for 
+ * the bit stream buffer. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc -pointer to the residuals to be encoded; must be aligned on an 
+ *            16-byte boundary. 
+ *   QP - quantization parameter. 
+ *   shortVideoHeader - binary flag indicating presence of short_video_header; 
+ *                      shortVideoHeader==1 selects linear intra DC mode, and 
+ *                      shortVideoHeader==0 selects non linear intra DC mode. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the quantized DCT coefficients buffer; must be aligned 
+ *            on a 16-byte boundary. 
+ *   pRec - pointer to the reconstructed texture residuals; must be aligned 
+ *            on a 16-byte boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments:
+ *    -    At least one of the following pointers is either NULL or 
+ *         not 16-byte aligned: 
+ *            - pSrc 
+ *            - pDst
+ *            - pRec
+ *    -    QP <= 0 or QP >= 32. 
+ *
+ */
+OMXResult omxVCM4P2_TransRecBlockCoef_inter (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_S16 *pRec,
+    OMX_U8 QP,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_EncodeVLCZigzag_IntraDCVLC   (6.2.4.5.2)
+ *
+ * Description:
+ * Performs zigzag scan and VLC encoding of AC and DC coefficients for one 
+ * intra block.  Two versions of the function (DCVLC and ACVLC) are provided 
+ * in order to support the two different methods of processing DC 
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, "Intra DC 
+ * Coefficient Decoding for the Case of Switched VLC Encoding".  
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - double pointer to the current byte in the bitstream 
+ *   pBitOffset - pointer to the bit position in the byte pointed by 
+ *            *ppBitStream. Valid within 0 to 7. 
+ *   pQDctBlkCoef - pointer to the quantized DCT coefficient 
+ *   predDir - AC prediction direction, which is used to decide the zigzag 
+ *            scan pattern; takes one of the following values: 
+ *            -  OMX_VC_NONE - AC prediction not used.  
+ *                             Performs classical zigzag scan. 
+ *            -  OMX_VC_HORIZONTAL - Horizontal prediction.  
+ *                             Performs alternate-vertical zigzag scan. 
+ *            -  OMX_VC_VERTICAL - Vertical prediction.  
+ *                             Performs alternate-horizontal zigzag scan. 
+ *   pattern - block pattern which is used to decide whether this block is 
+ *            encoded 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; escape modes 0-3 are used if 
+ *            shortVideoHeader==0, and escape mode 4 is used when 
+ *            shortVideoHeader==1. 
+ *   videoComp - video component type (luminance, chrominance) of the current 
+ *            block 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is encoded, so 
+ *            that it points to the current byte in the bit stream buffer. 
+ *   pBitOffset - *pBitOffset is updated so that it points to the current bit 
+ *            position in the byte pointed by *ppBitStream. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - Bad arguments:
+ *    -    At least one of the following pointers is NULL: ppBitStream, 
+ *              *ppBitStream, pBitOffset, pQDctBlkCoef. 
+ *    -   *pBitOffset < 0, or *pBitOffset >7. 
+ *    -    PredDir is not one of: OMX_VC_NONE, OMX_VC_HORIZONTAL, or 
+ *         OMX_VC_VERTICAL. 
+ *    -    VideoComp is not one component of enum OMXVCM4P2VideoComponent. 
+ *
+ */
+OMXResult omxVCM4P2_EncodeVLCZigzag_IntraDCVLC (
+    OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    const OMX_S16 *pQDctBlkCoef,
+    OMX_U8 predDir,
+    OMX_U8 pattern,
+    OMX_INT shortVideoHeader,
+    OMXVCM4P2VideoComponent videoComp
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_EncodeVLCZigzag_IntraACVLC   (6.2.4.5.2)
+ *
+ * Description:
+ * Performs zigzag scan and VLC encoding of AC and DC coefficients for one 
+ * intra block.  Two versions of the function (DCVLC and ACVLC) are provided 
+ * in order to support the two different methods of processing DC 
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4,  Intra DC 
+ * Coefficient Decoding for the Case of Switched VLC Encoding.  
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - double pointer to the current byte in the bitstream 
+ *   pBitOffset - pointer to the bit position in the byte pointed by 
+ *            *ppBitStream. Valid within 0 to 7. 
+ *   pQDctBlkCoef - pointer to the quantized DCT coefficient 
+ *   predDir - AC prediction direction, which is used to decide the zigzag 
+ *            scan pattern; takes one of the following values: 
+ *            -  OMX_VC_NONE - AC prediction not used.  
+ *                             Performs classical zigzag scan. 
+ *            -  OMX_VC_HORIZONTAL - Horizontal prediction.  
+ *                             Performs alternate-vertical zigzag scan. 
+ *            -  OMX_VC_VERTICAL - Vertical prediction.  
+ *                             Performs alternate-horizontal zigzag scan. 
+ *   pattern - block pattern which is used to decide whether this block is 
+ *            encoded 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; escape modes 0-3 are used if 
+ *            shortVideoHeader==0, and escape mode 4 is used when 
+ *            shortVideoHeader==1. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is encoded, so 
+ *            that it points to the current byte in the bit stream buffer. 
+ *   pBitOffset - *pBitOffset is updated so that it points to the current bit 
+ *            position in the byte pointed by *ppBitStream. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - Bad arguments:
+ *    -    At least one of the following pointers is NULL: ppBitStream, 
+ *              *ppBitStream, pBitOffset, pQDctBlkCoef. 
+ *    -   *pBitOffset < 0, or *pBitOffset >7. 
+ *    -    PredDir is not one of: OMX_VC_NONE, OMX_VC_HORIZONTAL, or 
+ *         OMX_VC_VERTICAL. 
+ *    -    VideoComp is not one component of enum OMXVCM4P2VideoComponent. 
+ *
+ */
+OMXResult omxVCM4P2_EncodeVLCZigzag_IntraACVLC (
+    OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    const OMX_S16 *pQDctBlkCoef,
+    OMX_U8 predDir,
+    OMX_U8 pattern,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_EncodeVLCZigzag_Inter   (6.2.4.5.3)
+ *
+ * Description:
+ * Performs classical zigzag scanning and VLC encoding for one inter block. 
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - pointer to the pointer to the current byte in the bit 
+ *            stream 
+ *   pBitOffset - pointer to the bit position in the byte pointed by 
+ *            *ppBitStream. Valid within 0 to 7 
+ *   pQDctBlkCoef - pointer to the quantized DCT coefficient 
+ *   pattern - block pattern which is used to decide whether this block is 
+ *            encoded 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; escape modes 0-3 are used if 
+ *            shortVideoHeader==0, and escape mode 4 is used when 
+ *            shortVideoHeader==1. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is encoded so that 
+ *            it points to the current byte in the bit stream buffer. 
+ *   pBitOffset - *pBitOffset is updated so that it points to the current bit 
+ *            position in the byte pointed by *ppBitStream. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - Bad arguments 
+ *    -    At least one of the pointers: is NULL: ppBitStream, *ppBitStream, 
+ *              pBitOffset, pQDctBlkCoef 
+ *    -   *pBitOffset < 0, or *pBitOffset >7. 
+ *
+ */
+OMXResult omxVCM4P2_EncodeVLCZigzag_Inter (
+    OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    const OMX_S16 *pQDctBlkCoef,
+    OMX_U8 pattern,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_EncodeMV   (6.2.4.5.4)
+ *
+ * Description:
+ * Predicts a motion vector for the current macroblock, encodes the 
+ * difference, and writes the output to the stream buffer. The input MVs 
+ * pMVCurMB, pSrcMVLeftMB, pSrcMVUpperMB, and pSrcMVUpperRightMB should lie 
+ * within the ranges associated with the input parameter fcodeForward, as 
+ * described in [ISO14496-2], subclause 7.6.3.  This function provides a 
+ * superset of the functionality associated with the function 
+ * omxVCM4P2_FindMVpred. 
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - double pointer to the current byte in the bitstream buffer 
+ *   pBitOffset - index of the first free (next available) bit in the stream 
+ *            buffer referenced by *ppBitStream, valid in the range 0 to 7. 
+ *   pMVCurMB - pointer to the current macroblock motion vector; a value of 
+ *            NULL indicates unavailability. 
+ *   pSrcMVLeftMB - pointer to the source left macroblock motion vector; a 
+ *            value of  NULLindicates unavailability. 
+ *   pSrcMVUpperMB - pointer to source upper macroblock motion vector; a 
+ *            value of NULL indicates unavailability. 
+ *   pSrcMVUpperRightMB - pointer to source upper right MB motion vector; a 
+ *            value of NULL indicates unavailability. 
+ *   fcodeForward - an integer with values from 1 to 7; used in encoding 
+ *            motion vectors related to search range, as described in 
+ *            [ISO14496-2], subclause 7.6.3. 
+ *   MBType - macro block type, valid in the range 0 to 5 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - updated pointer to the current byte in the bit stream 
+ *            buffer 
+ *   pBitOffset - updated index of the next available bit position in stream 
+ *            buffer referenced by *ppBitStream 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments 
+ *    -    At least one of the following pointers is NULL: ppBitStream, 
+ *              *ppBitStream, pBitOffset, pMVCurMB 
+ *    -    *pBitOffset < 0, or *pBitOffset >7. 
+ *    -    fcodeForward <= 0, or fcodeForward > 7, or MBType < 0. 
+ *
+ */
+OMXResult omxVCM4P2_EncodeMV (
+    OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    const OMXVCMotionVector *pMVCurMB,
+    const OMXVCMotionVector*pSrcMVLeftMB,
+    const OMXVCMotionVector *pSrcMVUpperMB,
+    const OMXVCMotionVector *pSrcMVUpperRightMB,
+    OMX_INT fcodeForward,
+    OMXVCM4P2MacroblockType MBType
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_DecodePadMV_PVOP   (6.2.5.1.1)
+ *
+ * Description:
+ * Decodes and pads the four motion vectors associated with a non-intra P-VOP 
+ * macroblock.  For macroblocks of type OMX_VC_INTER4V, the output MV is 
+ * padded as specified in [ISO14496-2], subclause 7.6.1.6. Otherwise, for 
+ * macroblocks of types other than OMX_VC_INTER4V, the decoded MV is copied to 
+ * all four output MV buffer entries. 
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - pointer to the pointer to the current byte in the bit 
+ *            stream buffer 
+ *   pBitOffset - pointer to the bit position in the byte pointed to by 
+ *            *ppBitStream. *pBitOffset is valid within [0-7]. 
+ *   pSrcMVLeftMB, pSrcMVUpperMB, and pSrcMVUpperRightMB - pointers to the 
+ *            motion vector buffers of the macroblocks specially at the left, 
+ *            upper, and upper-right side of the current macroblock, 
+ *            respectively; a value of NULL indicates unavailability.  Note: 
+ *            Any neighborhood macroblock outside the current VOP or video 
+ *            packet or outside the current GOB (when short_video_header is 
+ *             1 ) for which gob_header_empty is  0  is treated as 
+ *            transparent, according to [ISO14496-2], subclause 7.6.5. 
+ *   fcodeForward - a code equal to vop_fcode_forward in MPEG-4 bit stream 
+ *            syntax 
+ *   MBType - the type of the current macroblock. If MBType is not equal to 
+ *            OMX_VC_INTER4V, the destination motion vector buffer is still 
+ *            filled with the same decoded vector. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is decoded, so 
+ *            that it points to the current byte in the bit stream buffer 
+ *   pBitOffset - *pBitOffset is updated so that it points to the current bit 
+ *            position in the byte pointed by *ppBitStream 
+ *   pDstMVCurMB - pointer to the motion vector buffer for the current 
+ *            macroblock; contains four decoded motion vectors 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments:
+ *    -    At least one of the following pointers is NULL: 
+ *         ppBitStream, *ppBitStream, pBitOffset, pDstMVCurMB 
+ *    -    *pBitOffset exceeds [0,7]
+ *    -    fcodeForward exceeds (0,7]
+ *    -    MBType less than zero
+ *    -    motion vector buffer is not 4-byte aligned. 
+ *    OMX_Sts_Err - status error 
+ *
+ */
+OMXResult omxVCM4P2_DecodePadMV_PVOP (
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    OMXVCMotionVector *pSrcMVLeftMB,
+    OMXVCMotionVector*pSrcMVUpperMB,
+    OMXVCMotionVector *pSrcMVUpperRightMB,
+    OMXVCMotionVector*pDstMVCurMB,
+    OMX_INT fcodeForward,
+    OMXVCM4P2MacroblockType MBType
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_DecodeVLCZigzag_IntraDCVLC   (6.2.5.2.2)
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan of AC and DC coefficients 
+ * for one intra block.  Two versions of the function (DCVLC and ACVLC) are 
+ * provided in order to support the two different methods of processing DC 
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4,  Intra DC 
+ * Coefficient Decoding for the Case of Switched VLC Encoding.  
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - pointer to the pointer to the current byte in the 
+ *            bitstream buffer 
+ *   pBitOffset - pointer to the bit position in the current byte referenced 
+ *            by *ppBitStream.  The parameter *pBitOffset is valid in the 
+ *            range [0-7]. 
+ *            Bit Position in one byte:  |Most      Least| 
+ *                    *pBitOffset        |0 1 2 3 4 5 6 7| 
+ *   predDir - AC prediction direction; used to select the zigzag scan 
+ *            pattern; takes one of the following values: 
+ *            -  OMX_VC_NONE - AC prediction not used; 
+ *                             performs classical zigzag scan. 
+ *            -  OMX_VC_HORIZONTAL - Horizontal prediction; 
+ *                             performs alternate-vertical zigzag scan; 
+ *            -  OMX_VC_VERTICAL - Vertical prediction; 
+ *                             performs alternate-horizontal zigzag scan. 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; escape modes 0-3 are used if 
+ *            shortVideoHeader==0, and escape mode 4 is used when 
+ *            shortVideoHeader==1. 
+ *   videoComp - video component type (luminance or chrominance) of the 
+ *            current block 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is decoded such 
+ *            that it points to the current byte in the bit stream buffer 
+ *   pBitOffset - *pBitOffset is updated such that it points to the current 
+ *            bit position in the byte pointed by *ppBitStream 
+ *   pDst - pointer to the coefficient buffer of current block; must be 
+ *            4-byte aligned. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments, if:
+ *    -    At least one of the following pointers is NULL: 
+ *         ppBitStream, *ppBitStream, pBitOffset, pDst
+ *    -    *pBitOffset exceeds [0,7]
+ *    -    preDir exceeds [0,2]
+ *    -    pDst is not 4-byte aligned 
+ *    OMX_Sts_Err - if:
+ *    -    In DecodeVLCZigzag_IntraDCVLC, dc_size > 12 
+ *    -    At least one of mark bits equals zero 
+ *    -    Illegal stream encountered; code cannot be located in VLC table 
+ *    -    Forbidden code encountered in the VLC FLC table. 
+ *    -    The number of coefficients is greater than 64 
+ *
+ */
+OMXResult omxVCM4P2_DecodeVLCZigzag_IntraDCVLC (
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    OMX_S16 *pDst,
+    OMX_U8 predDir,
+    OMX_INT shortVideoHeader,
+    OMXVCM4P2VideoComponent videoComp
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_DecodeVLCZigzag_IntraACVLC   (6.2.5.2.2)
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan of AC and DC coefficients 
+ * for one intra block.  Two versions of the function (DCVLC and ACVLC) are 
+ * provided in order to support the two different methods of processing DC 
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4,  Intra DC 
+ * Coefficient Decoding for the Case of Switched VLC Encoding.  
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - pointer to the pointer to the current byte in the 
+ *            bitstream buffer 
+ *   pBitOffset - pointer to the bit position in the current byte referenced 
+ *            by *ppBitStream.  The parameter *pBitOffset is valid in the 
+ *            range [0-7]. Bit Position in one byte:  |Most Least| *pBitOffset 
+ *            |0 1 2 3 4 5 6 7| 
+ *   predDir - AC prediction direction; used to select the zigzag scan 
+ *            pattern; takes one of the following values: OMX_VC_NONE - AC 
+ *            prediction not used; performs classical zigzag scan. 
+ *            OMX_VC_HORIZONTAL - Horizontal prediction; performs 
+ *            alternate-vertical zigzag scan; OMX_VC_VERTICAL - Vertical 
+ *            prediction; performs alternate-horizontal zigzag scan. 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; escape modes 0-3 are used if 
+ *            shortVideoHeader==0, and escape mode 4 is used when 
+ *            shortVideoHeader==1. 
+ *   videoComp - video component type (luminance or chrominance) of the 
+ *            current block 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is decoded such 
+ *            that it points to the current byte in the bit stream buffer 
+ *   pBitOffset - *pBitOffset is updated such that it points to the current 
+ *            bit position in the byte pointed by *ppBitStream 
+ *   pDst - pointer to the coefficient buffer of current block; must be 
+ *            4-byte aligned. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments At least one of the following 
+ *              pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, 
+ *              or At least one of the following conditions is true: 
+ *              *pBitOffset exceeds [0,7], preDir exceeds [0,2], or pDst is 
+ *              not 4-byte aligned 
+ *    OMX_Sts_Err In DecodeVLCZigzag_IntraDCVLC, dc_size > 12 At least one of 
+ *              mark bits equals zero Illegal stream encountered; code cannot 
+ *              be located in VLC table Forbidden code encountered in the VLC 
+ *              FLC table The number of coefficients is greater than 64 
+ *
+ */
+OMXResult omxVCM4P2_DecodeVLCZigzag_IntraACVLC (
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    OMX_S16 *pDst,
+    OMX_U8 predDir,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_DecodeVLCZigzag_Inter   (6.2.5.2.3)
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan for one inter-coded block. 
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - double pointer to the current byte in the stream buffer 
+ *   pBitOffset - pointer to the next available bit in the current stream 
+ *            byte referenced by *ppBitStream. The parameter *pBitOffset is 
+ *            valid within the range [0-7]. 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; escape modes 0-3 are used if 
+ *            shortVideoHeader==0, and escape mode 4 is used when 
+ *            shortVideoHeader==1. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is decoded such 
+ *            that it points to the current byte in the stream buffer 
+ *   pBitOffset - *pBitOffset is updated after decoding such that it points 
+ *            to the next available bit in the stream byte referenced by 
+ *            *ppBitStream 
+ *   pDst - pointer to the coefficient buffer of current block; must be 
+ *            4-byte aligned. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_BadArgErr - bad arguments:
+ *    -    At least one of the following pointers is NULL: 
+ *         ppBitStream, *ppBitStream, pBitOffset, pDst
+ *    -    pDst is not 4-byte aligned
+ *    -   *pBitOffset exceeds [0,7]
+ *    OMX_Sts_Err - status error, if:
+ *    -    At least one mark bit is equal to zero 
+ *    -    Encountered an illegal stream code that cannot be found in the VLC table 
+ *    -    Encountered an illegal code in the VLC FLC table 
+ *    -    The number of coefficients is greater than 64 
+ *
+ */
+OMXResult omxVCM4P2_DecodeVLCZigzag_Inter (
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    OMX_S16 *pDst,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_QuantInvIntra_I   (6.2.5.3.2)
+ *
+ * Description:
+ * Performs the second inverse quantization mode on an intra/inter coded 
+ * block. Supports bits_per_pixel = 8. The output coefficients are clipped to 
+ * the range [-2048, 2047]. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input (quantized) intra/inter block; must be 
+ *            aligned on a 16-byte boundary. 
+ *   QP - quantization parameter (quantizer_scale) 
+ *   videoComp - video component type of the current block. Takes one of the 
+ *            following flags: OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE (intra 
+ *            version only). 
+ *   shortVideoHeader - binary flag indicating presence of short_video_header 
+ *            (intra version only). 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to the output (dequantized) intra/inter block 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; one or more of the following is 
+ *              true: 
+ *    -    pSrcDst is NULL 
+ *    -    QP <= 0 or QP >=31 
+ *    -    videoComp is neither OMX_VC_LUMINANCE nor OMX_VC_CHROMINANCE. 
+ *
+ */
+OMXResult omxVCM4P2_QuantInvIntra_I (
+    OMX_S16 *pSrcDst,
+    OMX_INT QP,
+    OMXVCM4P2VideoComponent videoComp,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_QuantInvInter_I   (6.2.5.3.2)
+ *
+ * Description:
+ * Performs the second inverse quantization mode on an intra/inter coded 
+ * block. Supports bits_per_pixel = 8. The output coefficients are clipped to 
+ * the range [-2048, 2047]. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input (quantized) intra/inter block; must be 
+ *            aligned on a 16-byte boundary. 
+ *   QP - quantization parameter (quantizer_scale) 
+ *   videoComp - video component type of the current block. Takes one of the 
+ *            following flags: OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE (intra 
+ *            version only). 
+ *   shortVideoHeader - binary flag indicating presence of short_video_header 
+ *            (intra version only). 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to the output (dequantized) intra/inter block 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; one or more of the following is 
+ *              true: 
+ *    -    pSrcDst is NULL 
+ *    -    QP <= 0 or QP >=31 
+ *    -    videoComp is neither OMX_VC_LUMINANCE nor OMX_VC_CHROMINANCE. 
+ *
+ */
+OMXResult omxVCM4P2_QuantInvInter_I (
+    OMX_S16 *pSrcDst,
+    OMX_INT QP
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_DecodeBlockCoef_Intra   (6.2.5.4.1)
+ *
+ * Description:
+ * Decodes the INTRA block coefficients. Inverse quantization, inversely 
+ * zigzag positioning, and IDCT, with appropriate clipping on each step, are 
+ * performed on the coefficients. The results are then placed in the output 
+ * frame/plane on a pixel basis.  Note: This function will be used only when 
+ * at least one non-zero AC coefficient of current block exists in the bit 
+ * stream. The DC only condition will be handled in another function. 
+ *
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - pointer to the pointer to the current byte in the bit 
+ *            stream buffer. There is no boundary check for the bit stream 
+ *            buffer. 
+ *   pBitOffset - pointer to the bit position in the byte pointed to by 
+ *            *ppBitStream. *pBitOffset is valid within [0-7]. 
+ *   step - width of the destination plane 
+ *   pCoefBufRow - pointer to the coefficient row buffer; must be aligned on 
+ *            an 8-byte boundary. 
+ *   pCoefBufCol - pointer to the coefficient column buffer; must be aligned 
+ *            on an 8-byte boundary. 
+ *   curQP - quantization parameter of the macroblock which the current block 
+ *            belongs to 
+ *   pQPBuf - pointer to the quantization parameter buffer 
+ *   blockIndex - block index indicating the component type and position as 
+ *            defined in [ISO14496-2], subclause 6.1.3.8, Figure 6-5. 
+ *   intraDCVLC - a code determined by intra_dc_vlc_thr and QP. This allows a 
+ *            mechanism to switch between two VLC for coding of Intra DC 
+ *            coefficients as per [ISO14496-2], Table 6-21. 
+ *   ACPredFlag - a flag equal to ac_pred_flag (of luminance) indicating if 
+ *            the ac coefficients of the first row or first column are 
+ *            differentially coded for intra coded macroblock. 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; shortVideoHeader==1 selects linear intra DC 
+ *            mode, and shortVideoHeader==0 selects non linear intra DC mode. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is decoded, so 
+ *            that it points to the current byte in the bit stream buffer 
+ *   pBitOffset - *pBitOffset is updated so that it points to the current bit 
+ *            position in the byte pointed by *ppBitStream 
+ *   pDst - pointer to the block in the destination plane; must be aligned on 
+ *            an 8-byte boundary. 
+ *   pCoefBufRow - pointer to the updated coefficient row buffer. 
+ *   pCoefBufCol - pointer to the updated coefficient column buffer  Note: 
+ *            The coefficient buffers must be updated in accordance with the 
+ *            update procedure defined in section 6.2.2. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments, if:
+ *    -    At least one of the following pointers is NULL: 
+ *         ppBitStream, *ppBitStream, pBitOffset, pCoefBufRow, pCoefBufCol, 
+ *         pQPBuf, pDst. 
+ *    -    *pBitOffset exceeds [0,7] 
+ *    -    curQP exceeds (1, 31)
+ *    -    blockIndex exceeds [0,5]
+ *    -    step is not the multiple of 8
+ *    -    a pointer alignment requirement was violated. 
+ *    OMX_Sts_Err - status error. Refer to OMX_Sts_Err of DecodeVLCZigzag_Intra.  
+ *
+ */
+OMXResult omxVCM4P2_DecodeBlockCoef_Intra (
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    OMX_U8 *pDst,
+    OMX_INT step,
+    OMX_S16 *pCoefBufRow,
+    OMX_S16 *pCoefBufCol,
+    OMX_U8 curQP,
+    const OMX_U8 *pQPBuf,
+    OMX_INT blockIndex,
+    OMX_INT intraDCVLC,
+    OMX_INT ACPredFlag,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_DecodeBlockCoef_Inter   (6.2.5.4.2)
+ *
+ * Description:
+ * Decodes the INTER block coefficients. This function performs inverse 
+ * quantization, inverse zigzag positioning, and IDCT (with appropriate 
+ * clipping on each step) on the coefficients. The results (residuals) are 
+ * placed in a contiguous array of 64 elements. For INTER block, the output 
+ * buffer holds the residuals for further reconstruction. 
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - pointer to the pointer to the current byte in the bit 
+ *            stream buffer. There is no boundary check for the bit stream 
+ *            buffer. 
+ *   pBitOffset - pointer to the bit position in the byte pointed to by 
+ *            *ppBitStream. *pBitOffset is valid within [0-7] 
+ *   QP - quantization parameter 
+ *   shortVideoHeader - binary flag indicating presence of 
+ *            short_video_header; shortVideoHeader==1 selects linear intra DC 
+ *            mode, and shortVideoHeader==0 selects non linear intra DC mode. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after the block is decoded, so 
+ *            that it points to the current byte in the bit stream buffer 
+ *   pBitOffset - *pBitOffset is updated so that it points to the current bit 
+ *            position in the byte pointed by *ppBitStream 
+ *   pDst - pointer to the decoded residual buffer (a contiguous array of 64 
+ *            elements of OMX_S16 data type); must be aligned on a 16-byte 
+ *            boundary. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments, if:
+ *    -    At least one of the following pointers is Null: 
+ *         ppBitStream, *ppBitStream, pBitOffset , pDst 
+ *    -    *pBitOffset exceeds [0,7]
+ *    -    QP <= 0. 
+ *    -    pDst is not 16-byte aligned 
+ *    OMX_Sts_Err - status error. Refer to OMX_Sts_Err of DecodeVLCZigzag_Inter . 
+ *
+ */
+OMXResult omxVCM4P2_DecodeBlockCoef_Inter (
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    OMX_S16 *pDst,
+    OMX_INT QP,
+    OMX_INT shortVideoHeader
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_PredictReconCoefIntra   (6.2.5.4.3)
+ *
+ * Description:
+ * Performs adaptive DC/AC coefficient prediction for an intra block.  Prior 
+ * to the function call, prediction direction (predDir) should be selected as 
+ * specified in [ISO14496-2], subclause 7.4.3.1. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the coefficient buffer which contains the quantized 
+ *            coefficient residuals (PQF) of the current block; must be 
+ *            aligned on a 4-byte boundary.  The output coefficients are 
+ *            saturated to the range [-2048, 2047]. 
+ *   pPredBufRow - pointer to the coefficient row buffer; must be aligned on 
+ *            a 4-byte boundary. 
+ *   pPredBufCol - pointer to the coefficient column buffer; must be aligned 
+ *            on a 4-byte boundary. 
+ *   curQP - quantization parameter of the current block. curQP may equal to 
+ *            predQP especially when the current block and the predictor block 
+ *            are in the same macroblock. 
+ *   predQP - quantization parameter of the predictor block 
+ *   predDir - indicates the prediction direction which takes one of the 
+ *            following values: OMX_VC_HORIZONTAL - predict horizontally 
+ *            OMX_VC_VERTICAL - predict vertically 
+ *   ACPredFlag - a flag indicating if AC prediction should be performed. It 
+ *            is equal to ac_pred_flag in the bit stream syntax of MPEG-4 
+ *   videoComp - video component type (luminance or chrominance) of the 
+ *            current block 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to the coefficient buffer which contains the quantized 
+ *            coefficients (QF) of the current block 
+ *   pPredBufRow - pointer to the updated coefficient row buffer 
+ *   pPredBufCol - pointer to the updated coefficient column buffer  Note: 
+ *            Buffer update: Update the AC prediction buffer (both row and 
+ *            column buffer). 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments, if:
+ *        -    At least one of the pointers is NULL: 
+ *              pSrcDst, pPredBufRow, or pPredBufCol. 
+ *        -    curQP <= 0, 
+ *        -    predQP <= 0, 
+ *        -    curQP >31, 
+ *        -    predQP > 31, 
+ *        -    preDir exceeds [1,2]
+ *        -    pSrcDst, pPredBufRow, or pPredBufCol is not 4-byte aligned. 
+ *
+ */
+OMXResult omxVCM4P2_PredictReconCoefIntra (
+    OMX_S16 *pSrcDst,
+    OMX_S16 *pPredBufRow,
+    OMX_S16 *pPredBufCol,
+    OMX_INT curQP,
+    OMX_INT predQP,
+    OMX_INT predDir,
+    OMX_INT ACPredFlag,
+    OMXVCM4P2VideoComponent videoComp
+);
+
+
+
+/**
+ * Function:  omxVCM4P2_MCReconBlock   (6.2.5.5.1)
+ *
+ * Description:
+ * Performs motion compensation prediction for an 8x8 block using 
+ * interpolation described in [ISO14496-2], subclause 7.6.2. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - pointer to the block in the reference plane. 
+ *   srcStep - distance between the start of consecutive lines in the 
+ *            reference plane, in bytes; must be a multiple of 8. 
+ *   dstStep - distance between the start of consecutive lines in the 
+ *            destination plane, in bytes; must be a multiple of 8. 
+ *   pSrcResidue - pointer to a buffer containing the 16-bit prediction 
+ *            residuals; must be 16-byte aligned. If the pointer is NULL, then 
+ *            no prediction is done, only motion compensation, i.e., the block 
+ *            is moved with interpolation. 
+ *   predictType - bilinear interpolation type, as defined in section 
+ *            6.2.1.2. 
+ *   rndVal - rounding control parameter: 0 - disabled; 1 - enabled. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the destination buffer; must be 8-byte aligned.  If 
+ *            prediction residuals are added then output intensities are 
+ *            clipped to the range [0,255]. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned under any of the following 
+ *              conditions: 
+ *    -    pDst is not 8-byte aligned. 
+ *    -    pSrcResidue is not 16-byte aligned. 
+ *    -    one or more of the following pointers is NULL: pSrc or pDst. 
+ *    -    either srcStep or dstStep is not a multiple of 8. 
+ *    -    invalid type specified for the parameter predictType. 
+ *    -    the parameter rndVal is not equal either to 0 or 1. 
+ *
+ */
+OMXResult omxVCM4P2_MCReconBlock (
+    const OMX_U8 *pSrc,
+    OMX_INT srcStep,
+    const OMX_S16 *pSrcResidue,
+    OMX_U8 *pDst,
+    OMX_INT dstStep,
+    OMX_INT predictType,
+    OMX_INT rndVal
+);
+
+
+
+/* 6.3.1.1 Intra 16x16 Prediction Modes  */
+/* A data type that enumerates intra_16x16 macroblock prediction modes is defined as follows:  */
+
+typedef enum {
+    OMX_VC_16X16_VERT = 0,  /** Intra_16x16_Vertical */
+    OMX_VC_16X16_HOR = 1,   /** Intra_16x16_Horizontal */
+    OMX_VC_16X16_DC = 2,    /** Intra_16x16_DC */
+    OMX_VC_16X16_PLANE = 3  /** Intra_16x16_Plane */ 
+} OMXVCM4P10Intra16x16PredMode;
+
+
+
+/* 6.3.1.2 Intra 4x4 Prediction Modes  */
+/* A data type that enumerates intra_4x4 macroblock prediction modes is defined as follows:  */
+
+typedef enum {
+    OMX_VC_4X4_VERT = 0,     /** Intra_4x4_Vertical */
+    OMX_VC_4X4_HOR = 1,      /** Intra_4x4_Horizontal */
+    OMX_VC_4X4_DC = 2,       /** Intra_4x4_DC */
+    OMX_VC_4X4_DIAG_DL = 3,  /** Intra_4x4_Diagonal_Down_Left */
+    OMX_VC_4X4_DIAG_DR = 4,  /** Intra_4x4_Diagonal_Down_Right */
+    OMX_VC_4X4_VR = 5,       /** Intra_4x4_Vertical_Right */
+    OMX_VC_4X4_HD = 6,       /** Intra_4x4_Horizontal_Down */
+    OMX_VC_4X4_VL = 7,       /** Intra_4x4_Vertical_Left */
+    OMX_VC_4X4_HU = 8        /** Intra_4x4_Horizontal_Up */ 
+} OMXVCM4P10Intra4x4PredMode;
+
+
+
+/* 6.3.1.3 Chroma Prediction Modes  */
+/* A data type that enumerates intra chroma prediction modes is defined as follows:  */
+
+typedef enum {
+    OMX_VC_CHROMA_DC = 0,    /** Intra_Chroma_DC */
+    OMX_VC_CHROMA_HOR = 1,   /** Intra_Chroma_Horizontal */
+    OMX_VC_CHROMA_VERT = 2,  /** Intra_Chroma_Vertical */
+    OMX_VC_CHROMA_PLANE = 3  /** Intra_Chroma_Plane */ 
+} OMXVCM4P10IntraChromaPredMode;
+
+
+
+/* 6.3.1.4 Motion Estimation Modes  */
+/* A data type that enumerates H.264 motion estimation modes is defined as follows:  */
+
+typedef enum {
+    OMX_VC_M4P10_FAST_SEARCH = 0, /** Fast motion search */
+    OMX_VC_M4P10_FULL_SEARCH = 1  /** Full motion search */ 
+} OMXVCM4P10MEMode;
+
+
+
+/* 6.3.1.5 Macroblock Types  */
+/* A data type that enumerates H.264 macroblock types is defined as follows:  */
+
+typedef enum {
+    OMX_VC_P_16x16  = 0, /* defined by [ISO14496-10] */
+    OMX_VC_P_16x8  = 1,
+    OMX_VC_P_8x16  = 2,
+    OMX_VC_P_8x8  = 3,
+    OMX_VC_PREF0_8x8  = 4,
+    OMX_VC_INTER_SKIP  = 5,
+    OMX_VC_INTRA_4x4  = 8,
+    OMX_VC_INTRA_16x16  = 9,
+    OMX_VC_INTRA_PCM = 10 
+} OMXVCM4P10MacroblockType;
+
+
+
+/* 6.3.1.6 Sub-Macroblock Types  */
+/* A data type that enumerates H.264 sub-macroblock types is defined as follows:  */
+
+typedef enum {
+    OMX_VC_SUB_P_8x8 = 0, /* defined by [ISO14496-10] */
+    OMX_VC_SUB_P_8x4 = 1,
+    OMX_VC_SUB_P_4x8 = 2,
+    OMX_VC_SUB_P_4x4 = 3 
+} OMXVCM4P10SubMacroblockType;
+
+
+
+/* 6.3.1.7 Variable Length Coding (VLC) Information  */
+
+typedef struct {
+    OMX_U8 uTrailing_Ones;      /* Trailing ones; 3 at most */
+    OMX_U8 uTrailing_One_Signs; /* Trailing ones signal */
+    OMX_U8 uNumCoeffs;          /* Total number of non-zero coefs, including trailing ones */
+    OMX_U8 uTotalZeros;         /* Total number of zero coefs */
+    OMX_S16 iLevels[16];        /* Levels of non-zero coefs, in reverse zig-zag order */
+    OMX_U8 uRuns[16];           /* Runs for levels and trailing ones, in reverse zig-zag order */
+} OMXVCM4P10VLCInfo;
+
+
+
+/* 6.3.1.8 Macroblock Information  */
+
+typedef struct {
+    OMX_S32 sliceId;                          /* slice number */
+    OMXVCM4P10MacroblockType mbType;          /* MB type */
+    OMXVCM4P10SubMacroblockType subMBType[4]; /* sub-block type */
+    OMX_S32 qpy;                              /* qp for luma */
+    OMX_S32 qpc;                              /* qp for chroma */
+    OMX_U32 cbpy;                             /* CBP Luma */
+    OMX_U32 cbpc;                             /* CBP Chroma */
+    OMXVCMotionVector pMV0[4][4]; /* motion vector, represented using 1/4-pel units, pMV0[blocky][blockx] (blocky = 0~3, blockx =0~3) */
+    OMXVCMotionVector pMVPred[4][4]; /* motion vector prediction, Represented using 1/4-pel units, pMVPred[blocky][blockx] (blocky = 0~3, blockx = 0~3) */
+    OMX_U8 pRefL0Idx[4];                      /* reference picture indices */
+    OMXVCM4P10Intra16x16PredMode Intra16x16PredMode; /* best intra 16x16 prediction mode */
+    OMXVCM4P10Intra4x4PredMode pIntra4x4PredMode[16]; /* best intra 4x4 prediction mode for each block, pMV0 indexed as above */
+} OMXVCM4P10MBInfo, *OMXVCM4P10MBInfoPtr;
+
+
+
+/* 6.3.1.9 Motion Estimation Parameters  */
+
+typedef struct {
+    OMX_S32 blockSplitEnable8x8; /* enables 16x8, 8x16, 8x8 */
+    OMX_S32 blockSplitEnable4x4; /* enable splitting of 8x4, 4x8, 4x4 blocks */
+    OMX_S32 halfSearchEnable;
+    OMX_S32 quarterSearchEnable;
+    OMX_S32 intraEnable4x4;      /* 1=enable, 0=disable */
+    OMX_S32 searchRange16x16;    /* integer pixel units */
+    OMX_S32 searchRange8x8;
+    OMX_S32 searchRange4x4;
+} OMXVCM4P10MEParams;
+
+
+
+/**
+ * Function:  omxVCM4P10_PredictIntra_4x4   (6.3.3.1.1)
+ *
+ * Description:
+ * Perform Intra_4x4 prediction for luma samples. If the upper-right block is 
+ * not available, then duplication work should be handled inside the function. 
+ * Users need not define them outside. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcLeft -  Pointer to the buffer of 4 left pixels: 
+ *                  p[x, y] (x = -1, y = 0..3) 
+ *   pSrcAbove - Pointer to the buffer of 8 above pixels: 
+ *                  p[x,y] (x = 0..7, y =-1); 
+ *               must be aligned on a 4-byte boundary. 
+ *   pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1) 
+ *   leftStep - Step of left pixel buffer; must be a multiple of 4. 
+ *   dstStep - Step of the destination buffer; must be a multiple of 4. 
+ *   predMode - Intra_4x4 prediction mode. 
+ *   availability - Neighboring 4x4 block availability flag, refer to 
+ *             "Neighboring Macroblock Availability" . 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - Pointer to the destination buffer; must be aligned on a 4-byte 
+ *            boundary. 
+ *
+ * Return Value:
+ *    If the function runs without error, it returns OMX_Sts_NoErr. 
+ *    If one of the following cases occurs, the function returns 
+ *              OMX_Sts_BadArgErr: 
+ *    pDst is NULL. 
+ *    dstStep < 4, or dstStep is not a multiple of 4. 
+ *    leftStep is not a multiple of 4. 
+ *    predMode is not in the valid range of enumeration 
+ *              OMXVCM4P10Intra4x4PredMode. 
+ *    predMode is OMX_VC_4x4_VERT, but availability doesn't set OMX_VC_UPPER 
+ *              indicating p[x,-1] (x = 0..3) is not available. 
+ *    predMode is OMX_VC_4x4_HOR, but availability doesn't set OMX_VC_LEFT 
+ *              indicating p[-1,y] (y = 0..3) is not available. 
+ *    predMode is OMX_VC_4x4_DIAG_DL, but availability doesn't set 
+ *              OMX_VC_UPPER indicating p[x, -1] (x = 0..3) is not available. 
+ *    predMode is OMX_VC_4x4_DIAG_DR, but availability doesn't set 
+ *              OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating 
+ *              p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not 
+ *              available. 
+ *    predMode is OMX_VC_4x4_VR, but availability doesn't set 
+ *              OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating 
+ *              p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not 
+ *              available. 
+ *    predMode is OMX_VC_4x4_HD, but availability doesn't set 
+ *              OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating 
+ *              p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not 
+ *              available. 
+ *    predMode is OMX_VC_4x4_VL, but availability doesn't set OMX_VC_UPPER 
+ *              indicating p[x,-1] (x = 0..3) is not available. 
+ *    predMode is OMX_VC_4x4_HU, but availability doesn't set OMX_VC_LEFT 
+ *              indicating p[-1,y] (y = 0..3) is not available. 
+ *    availability sets OMX_VC_UPPER, but pSrcAbove is NULL. 
+ *    availability sets OMX_VC_LEFT, but pSrcLeft is NULL. 
+ *    availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL. 
+ *    either pSrcAbove or pDst is not aligned on a 4-byte boundary.  
+ *
+ * Note: 
+ *     pSrcAbove, pSrcAbove, pSrcAboveLeft may be invalid pointers if 
+ *     they are not used by intra prediction as implied in predMode. 
+ *
+ */
+OMXResult omxVCM4P10_PredictIntra_4x4 (
+    const OMX_U8 *pSrcLeft,
+    const OMX_U8 *pSrcAbove,
+    const OMX_U8 *pSrcAboveLeft,
+    OMX_U8 *pDst,
+    OMX_INT leftStep,
+    OMX_INT dstStep,
+    OMXVCM4P10Intra4x4PredMode predMode,
+    OMX_S32 availability
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_PredictIntra_16x16   (6.3.3.1.2)
+ *
+ * Description:
+ * Perform Intra_16x16 prediction for luma samples. If the upper-right block 
+ * is not available, then duplication work should be handled inside the 
+ * function. Users need not define them outside. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcLeft - Pointer to the buffer of 16 left pixels: p[x, y] (x = -1, y = 
+ *            0..15) 
+ *   pSrcAbove - Pointer to the buffer of 16 above pixels: p[x,y] (x = 0..15, 
+ *            y= -1); must be aligned on a 16-byte boundary. 
+ *   pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1) 
+ *   leftStep - Step of left pixel buffer; must be a multiple of 16. 
+ *   dstStep - Step of the destination buffer; must be a multiple of 16. 
+ *   predMode - Intra_16x16 prediction mode, please refer to section 3.4.1. 
+ *   availability - Neighboring 16x16 MB availability flag. Refer to 
+ *                  section 3.4.4. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst -Pointer to the destination buffer; must be aligned on a 16-byte 
+ *            boundary. 
+ *
+ * Return Value:
+ *    If the function runs without error, it returns OMX_Sts_NoErr. 
+ *    If one of the following cases occurs, the function returns 
+ *              OMX_Sts_BadArgErr: 
+ *    pDst is NULL. 
+ *    dstStep < 16. or dstStep is not a multiple of 16. 
+ *    leftStep is not a multiple of 16. 
+ *    predMode is not in the valid range of enumeration 
+ *              OMXVCM4P10Intra16x16PredMode 
+ *    predMode is OMX_VC_16X16_VERT, but availability doesn't set 
+ *              OMX_VC_UPPER indicating p[x,-1] (x = 0..15) is not available. 
+ *    predMode is OMX_VC_16X16_HOR, but availability doesn't set OMX_VC_LEFT 
+ *              indicating p[-1,y] (y = 0..15) is not available. 
+ *    predMode is OMX_VC_16X16_PLANE, but availability doesn't set 
+ *              OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating 
+ *              p[x,-1](x = 0..15), or p[-1,y] (y = 0..15), or p[-1,-1] is not 
+ *              available. 
+ *    availability sets OMX_VC_UPPER, but pSrcAbove is NULL. 
+ *    availability sets OMX_VC_LEFT, but pSrcLeft is NULL. 
+ *    availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL. 
+ *    either pSrcAbove or pDst is not aligned on a 16-byte boundary.  
+ *
+ * Note: 
+ *     pSrcAbove, pSrcAbove, pSrcAboveLeft may be invalid pointers if 
+ *     they are not used by intra prediction implied in predMode. 
+ * Note: 
+ *     OMX_VC_UPPER_RIGHT is not used in intra_16x16 luma prediction. 
+ *
+ */
+OMXResult omxVCM4P10_PredictIntra_16x16 (
+    const OMX_U8 *pSrcLeft,
+    const OMX_U8 *pSrcAbove,
+    const OMX_U8 *pSrcAboveLeft,
+    OMX_U8 *pDst,
+    OMX_INT leftStep,
+    OMX_INT dstStep,
+    OMXVCM4P10Intra16x16PredMode predMode,
+    OMX_S32 availability
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_PredictIntraChroma_8x8   (6.3.3.1.3)
+ *
+ * Description:
+ * Performs intra prediction for chroma samples. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcLeft - Pointer to the buffer of 8 left pixels: p[x, y] (x = -1, y= 
+ *            0..7). 
+ *   pSrcAbove - Pointer to the buffer of 8 above pixels: p[x,y] (x = 0..7, y 
+ *            = -1); must be aligned on an 8-byte boundary. 
+ *   pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1) 
+ *   leftStep - Step of left pixel buffer; must be a multiple of 8. 
+ *   dstStep - Step of the destination buffer; must be a multiple of 8. 
+ *   predMode - Intra chroma prediction mode, please refer to section 3.4.3. 
+ *   availability - Neighboring chroma block availability flag, please refer 
+ *            to  "Neighboring Macroblock Availability". 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - Pointer to the destination buffer; must be aligned on an 8-byte 
+ *            boundary. 
+ *
+ * Return Value:
+ *    If the function runs without error, it returns OMX_Sts_NoErr. 
+ *    If any of the following cases occurs, the function returns 
+ *              OMX_Sts_BadArgErr: 
+ *    pDst is NULL. 
+ *    dstStep < 8 or dstStep is not a multiple of 8. 
+ *    leftStep is not a multiple of 8. 
+ *    predMode is not in the valid range of enumeration 
+ *              OMXVCM4P10IntraChromaPredMode. 
+ *    predMode is OMX_VC_CHROMA_VERT, but availability doesn't set 
+ *              OMX_VC_UPPER indicating p[x,-1] (x = 0..7) is not available. 
+ *    predMode is OMX_VC_CHROMA_HOR, but availability doesn't set OMX_VC_LEFT 
+ *              indicating p[-1,y] (y = 0..7) is not available. 
+ *    predMode is OMX_VC_CHROMA_PLANE, but availability doesn't set 
+ *              OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating 
+ *              p[x,-1](x = 0..7), or p[-1,y] (y = 0..7), or p[-1,-1] is not 
+ *              available. 
+ *    availability sets OMX_VC_UPPER, but pSrcAbove is NULL. 
+ *    availability sets OMX_VC_LEFT, but pSrcLeft is NULL. 
+ *    availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL. 
+ *    either pSrcAbove or pDst is not aligned on a 8-byte boundary.  
+ *
+ *  Note: pSrcAbove, pSrcAbove, pSrcAboveLeft may be invalid pointer if 
+ *  they are not used by intra prediction implied in predMode. 
+ *
+ *  Note: OMX_VC_UPPER_RIGHT is not used in intra chroma prediction. 
+ *
+ */
+OMXResult omxVCM4P10_PredictIntraChroma_8x8 (
+    const OMX_U8 *pSrcLeft,
+    const OMX_U8 *pSrcAbove,
+    const OMX_U8 *pSrcAboveLeft,
+    OMX_U8 *pDst,
+    OMX_INT leftStep,
+    OMX_INT dstStep,
+    OMXVCM4P10IntraChromaPredMode predMode,
+    OMX_S32 availability
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_InterpolateLuma   (6.3.3.2.1)
+ *
+ * Description:
+ * Performs quarter-pixel interpolation for inter luma MB. It is assumed that 
+ * the frame is already padded when calling this function. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to the source reference frame buffer 
+ *   srcStep - reference frame step, in bytes; must be a multiple of roi.width 
+ *   dstStep - destination frame step, in bytes; must be a multiple of 
+ *            roi.width 
+ *   dx - Fractional part of horizontal motion vector component in 1/4 pixel 
+ *            unit; valid in the range [0,3] 
+ *   dy - Fractional part of vertical motion vector y component in 1/4 pixel 
+ *            unit; valid in the range [0,3] 
+ *   roi - Dimension of the interpolation region; the parameters roi.width and 
+ *            roi.height must be equal to either 4, 8, or 16. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - Pointer to the destination frame buffer: 
+ *          if roi.width==4,  4-byte alignment required 
+ *          if roi.width==8,  8-byte alignment required 
+ *          if roi.width==16, 16-byte alignment required 
+ *
+ * Return Value:
+ *    If the function runs without error, it returns OMX_Sts_NoErr. 
+ *    If one of the following cases occurs, the function returns 
+ *              OMX_Sts_BadArgErr: 
+ *    pSrc or pDst is NULL. 
+ *    srcStep or dstStep < roi.width. 
+ *    dx or dy is out of range [0,3]. 
+ *    roi.width or roi.height is out of range {4, 8, 16}. 
+ *    roi.width is equal to 4, but pDst is not 4 byte aligned. 
+ *    roi.width is equal to 8 or 16, but pDst is not 8 byte aligned. 
+ *    srcStep or dstStep is not a multiple of 8. 
+ *
+ */
+OMXResult omxVCM4P10_InterpolateLuma (
+    const OMX_U8 *pSrc,
+    OMX_S32 srcStep,
+    OMX_U8 *pDst,
+    OMX_S32 dstStep,
+    OMX_S32 dx,
+    OMX_S32 dy,
+    OMXSize roi
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_InterpolateChroma   (6.3.3.2.2)
+ *
+ * Description:
+ * Performs 1/8-pixel interpolation for inter chroma MB. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc -Pointer to the source reference frame buffer 
+ *   srcStep -Reference frame step in bytes 
+ *   dstStep -Destination frame step in bytes; must be a multiple of 
+ *            roi.width. 
+ *   dx -Fractional part of horizontal motion vector component in 1/8 pixel 
+ *            unit; valid in the range [0,7] 
+ *   dy -Fractional part of vertical motion vector component in 1/8 pixel 
+ *            unit; valid in the range [0,7] 
+ *   roi -Dimension of the interpolation region; the parameters roi.width and 
+ *            roi.height must be equal to either 2, 4, or 8. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst -Pointer to the destination frame buffer:
+ *         if roi.width==2,  2-byte alignment required 
+ *         if roi.width==4,  4-byte alignment required 
+ *         if roi.width==8, 8-byte alignment required 
+ *
+ * Return Value:
+ *    If the function runs without error, it returns OMX_Sts_NoErr. 
+ *    If one of the following cases occurs, the function returns 
+ *              OMX_Sts_BadArgErr: 
+ *    pSrc or pDst is NULL. 
+ *    srcStep or dstStep < 8. 
+ *    dx or dy is out of range [0-7]. 
+ *    roi.width or roi.height is out of range {2,4,8}. 
+ *    roi.width is equal to 2, but pDst is not 2-byte aligned. 
+ *    roi.width is equal to 4, but pDst is not 4-byte aligned. 
+ *    roi.width is equal to 8, but pDst is not 8 byte aligned. 
+ *    srcStep or dstStep is not a multiple of 8. 
+ *
+ */
+OMXResult omxVCM4P10_InterpolateChroma (
+    const OMX_U8 *pSrc,
+    OMX_S32 srcStep,
+    OMX_U8 *pDst,
+    OMX_S32 dstStep,
+    OMX_S32 dx,
+    OMX_S32 dy,
+    OMXSize roi
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I   (6.3.3.3.1)
+ *
+ * Description:
+ * Performs in-place deblock filtering on four vertical edges of the luma 
+ * macroblock (16x16). 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - Pointer to the input macroblock; must be 16-byte aligned. 
+ *   srcdstStep -Step of the arrays; must be a multiple of 16. 
+ *   pAlpha -Array of size 2 of alpha thresholds (the first item is the alpha 
+ *            threshold for the external vertical edge, and the second item is 
+ *            for the internal vertical edge); per [ISO14496-10] alpha values 
+ *            must be in the range [0,255]. 
+ *   pBeta -Array of size 2 of beta thresholds (the first item is the beta 
+ *            threshold for the external vertical edge, and the second item is 
+ *            for the internal vertical edge); per [ISO14496-10] beta values 
+ *            must be in the range [0,18]. 
+ *   pThresholds -Array of size 16 of Thresholds (TC0) (values for the left 
+ *            edge of each 4x4 block, arranged in vertical block order); must 
+ *            be aligned on a 4-byte boundary..  Per [ISO14496-10] values must 
+ *            be in the range [0,25]. 
+ *   pBS -Array of size 16 of BS parameters (arranged in vertical block 
+ *            order); valid in the range [0,4] with the following 
+ *            restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) 
+ *            pBS[i]== 4 if and only if pBS[i^3]== 4.  Must be 4-byte aligned. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst -Pointer to filtered output macroblock. 
+ *
+ * Return Value:
+ *    If the function runs without error, it returns OMX_Sts_NoErr. 
+ *    If one of the following cases occurs, the function returns 
+ *              OMX_Sts_BadArgErr: 
+ *    Either of the pointers in pSrcDst, pAlpha, pBeta, pThresholds, or pBS 
+ *              is NULL. 
+ *    Either pThresholds or pBS is not aligned on a 4-byte boundary. 
+ *    pSrcDst is not 16-byte aligned. 
+ *    srcdstStep is not a multiple of 16. 
+ *    pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. 
+ *    pBeta[0] and/or pBeta[1] is outside the range [0,18]. 
+ *    One or more entries in the table pThresholds[0..15]is outside of the 
+ *              range [0,25]. 
+ *    pBS is out of range, i.e., one of the following conditions is true: 
+ *              pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && 
+ *              pBS[i^3]!=4) for 0<=i<=3. 
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingLuma_VerEdge_I (
+    OMX_U8 *pSrcDst,
+    OMX_S32 srcdstStep,
+    const OMX_U8 *pAlpha,
+    const OMX_U8 *pBeta,
+    const OMX_U8 *pThresholds,
+    const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_FilterDeblockingLuma_HorEdge_I   (6.3.3.3.2)
+ *
+ * Description:
+ * Performs in-place deblock filtering on four horizontal edges of the luma 
+ * macroblock (16x16). 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input macroblock; must be 16-byte aligned. 
+ *   srcdstStep - step of the arrays; must be a multiple of 16. 
+ *   pAlpha - array of size 2 of alpha thresholds (the first item is the alpha 
+ *            threshold for the external vertical edge, and the second item is 
+ *            for the internal horizontal edge); per [ISO14496-10] alpha 
+ *            values must be in the range [0,255]. 
+ *   pBeta - array of size 2 of beta thresholds (the first item is the beta 
+ *            threshold for the external horizontal edge, and the second item 
+ *            is for the internal horizontal edge). Per [ISO14496-10] beta 
+ *            values must be in the range [0,18]. 
+ *   pThresholds - array of size 16 containing thresholds, TC0, for the top 
+ *            horizontal edge of each 4x4 block, arranged in horizontal block 
+ *            order; must be aligned on a 4-byte boundary.  Per [ISO14496 10] 
+ *            values must be in the range [0,25]. 
+ *   pBS - array of size 16 of BS parameters (arranged in horizontal block 
+ *            order); valid in the range [0,4] with the following 
+ *            restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) 
+ *            pBS[i]== 4 if and only if pBS[i^3]== 4.  Must be 4-byte aligned. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst -Pointer to filtered output macroblock. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr, if the function runs without error.
+ * 
+ *    OMX_Sts_BadArgErr, if one of the following cases occurs: 
+ *    -    one or more of the following pointers is NULL: pSrcDst, pAlpha, 
+ *              pBeta, pThresholds, or pBS. 
+ *    -    either pThresholds or pBS is not aligned on a 4-byte boundary. 
+ *    -    pSrcDst is not 16-byte aligned. 
+ *    -    srcdstStep is not a multiple of 16. 
+ *    -    pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. 
+ *    -    pBeta[0] and/or pBeta[1] is outside the range [0,18]. 
+ *    -    One or more entries in the table pThresholds[0..15] is 
+ *         outside of the range [0,25]. 
+ *    -    pBS is out of range, i.e., one of the following conditions is true: 
+ *              pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or 
+ *              (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. 
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingLuma_HorEdge_I (
+    OMX_U8 *pSrcDst,
+    OMX_S32 srcdstStep,
+    const OMX_U8 *pAlpha,
+    const OMX_U8 *pBeta,
+    const OMX_U8 *pThresholds,
+    const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I   (6.3.3.3.3)
+ *
+ * Description:
+ * Performs in-place deblock filtering on four vertical edges of the chroma 
+ * macroblock (8x8). 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - Pointer to the input macroblock; must be 8-byte aligned. 
+ *   srcdstStep - Step of the arrays; must be a multiple of 8. 
+ *   pAlpha - Array of size 2 of alpha thresholds (the first item is alpha 
+ *            threshold for external vertical edge, and the second item is for 
+ *            internal vertical edge); per [ISO14496-10] alpha values must be 
+ *            in the range [0,255]. 
+ *   pBeta - Array of size 2 of beta thresholds (the first item is the beta 
+ *            threshold for the external vertical edge, and the second item is 
+ *            for the internal vertical edge); per [ISO14496-10] beta values 
+ *            must be in the range [0,18]. 
+ *   pThresholds - Array of size 8 containing thresholds, TC0, for the left 
+ *            vertical edge of each 4x2 chroma block, arranged in vertical 
+ *            block order; must be aligned on a 4-byte boundary.  Per 
+ *            [ISO14496-10] values must be in the range [0,25]. 
+ *   pBS - Array of size 16 of BS parameters (values for each 2x2 chroma 
+ *            block, arranged in vertical block order). This parameter is the 
+ *            same as the pBS parameter passed into FilterDeblockLuma_VerEdge; 
+ *            valid in the range [0,4] with the following restrictions: i) 
+ *            pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and 
+ *            only if pBS[i^3]== 4.  Must be 4 byte aligned. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst -Pointer to filtered output macroblock. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr, if the function runs without error.
+ * 
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    one or more of the following pointers is NULL: pSrcDst, pAlpha, 
+ *              pBeta, pThresholds, or pBS. 
+ *    -    pSrcDst is not 8-byte aligned. 
+ *    -    srcdstStep is not a multiple of 8. 
+ *    -    pThresholds is not 4-byte aligned. 
+ *    -    pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. 
+ *    -    pBeta[0] and/or pBeta[1] is outside the range [0,18]. 
+ *    -    One or more entries in the table pThresholds[0..7] is outside 
+ *         of the range [0,25]. 
+ *    -    pBS is out of range, i.e., one of the following conditions is true: 
+ *         pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or 
+ *         (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. 
+ *    -    pBS is not 4-byte aligned. 
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingChroma_VerEdge_I (
+    OMX_U8 *pSrcDst,
+    OMX_S32 srcdstStep,
+    const OMX_U8 *pAlpha,
+    const OMX_U8 *pBeta,
+    const OMX_U8 *pThresholds,
+    const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_FilterDeblockingChroma_HorEdge_I   (6.3.3.3.4)
+ *
+ * Description:
+ * Performs in-place deblock filtering on the horizontal edges of the chroma 
+ * macroblock (8x8). 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input macroblock; must be 8-byte aligned. 
+ *   srcdstStep - array step; must be a multiple of 8. 
+ *   pAlpha - array of size 2 containing alpha thresholds; the first element 
+ *            contains the threshold for the external horizontal edge, and the 
+ *            second element contains the threshold for internal horizontal 
+ *            edge.  Per [ISO14496-10] alpha values must be in the range 
+ *            [0,255]. 
+ *   pBeta - array of size 2 containing beta thresholds; the first element 
+ *            contains the threshold for the external horizontal edge, and the 
+ *            second element contains the threshold for the internal 
+ *            horizontal edge.  Per [ISO14496-10] beta values must be in the 
+ *            range [0,18]. 
+ *   pThresholds - array of size 8 containing thresholds, TC0, for the top 
+ *            horizontal edge of each 2x4 chroma block, arranged in horizontal 
+ *            block order; must be aligned on a 4-byte boundary.  Per 
+ *            [ISO14496-10] values must be in the range [0,25]. 
+ *   pBS - array of size 16 containing BS parameters for each 2x2 chroma 
+ *            block, arranged in horizontal block order; valid in the range 
+ *            [0,4] with the following restrictions: i) pBS[i]== 4 may occur 
+ *            only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== 4. 
+ *            Must be 4-byte aligned. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst -Pointer to filtered output macroblock. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr, if the function runs without error.
+ * 
+ *    OMX_Sts_BadArgErr, if one of the following cases occurs: 
+ *    -    any of the following pointers is NULL: 
+ *         pSrcDst, pAlpha, pBeta, pThresholds, or pBS. 
+ *    -    pSrcDst is not 8-byte aligned. 
+ *    -    srcdstStep is not a multiple of 8. 
+ *    -    pThresholds is not 4-byte aligned. 
+ *    -    pAlpha[0] and/or pAlpha[1] is outside the range [0,255]. 
+ *    -    pBeta[0] and/or pBeta[1] is outside the range [0,18]. 
+ *    -    One or more entries in the table pThresholds[0..7] is outside 
+ *         of the range [0,25]. 
+ *    -    pBS is out of range, i.e., one of the following conditions is true: 
+ *              pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or 
+ *              (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3.
+ *    -    pBS is not 4-byte aligned. 
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingChroma_HorEdge_I (
+    OMX_U8 *pSrcDst,
+    OMX_S32 srcdstStep,
+    const OMX_U8 *pAlpha,
+    const OMX_U8 *pBeta,
+    const OMX_U8 *pThresholds,
+    const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_DeblockLuma_I   (6.3.3.3.5)
+ *
+ * Description:
+ * This function performs in-place deblock filtering the horizontal and 
+ * vertical edges of a luma macroblock (16x16). 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input macroblock; must be 16-byte aligned. 
+ *   srcdstStep - image width; must be a multiple of 16. 
+ *   pAlpha - pointer to a 2x2 table of alpha thresholds, organized as 
+ *            follows: {external vertical edge, internal vertical edge, 
+ *            external horizontal edge, internal horizontal edge }.  Per 
+ *            [ISO14496-10] alpha values must be in the range [0,255]. 
+ *   pBeta - pointer to a 2x2 table of beta thresholds, organized as follows: 
+ *            {external vertical edge, internal vertical edge, external 
+ *            horizontal edge, internal horizontal edge }.  Per [ISO14496-10] 
+ *            beta values must be in the range [0,18]. 
+ *   pThresholds - pointer to a 16x2 table of threshold (TC0), organized as 
+ *            follows: {values for the left or above edge of each 4x4 block, 
+ *            arranged in vertical block order and then in horizontal block 
+ *            order}; must be aligned on a 4-byte boundary.  Per [ISO14496-10] 
+ *            values must be in the range [0,25]. 
+ *   pBS - pointer to a 16x2 table of BS parameters arranged in scan block 
+ *            order for vertical edges and then horizontal edges; valid in the 
+ *            range [0,4] with the following restrictions: i) pBS[i]== 4 may 
+ *            occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== 
+ *            4. Must be 4-byte aligned. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to filtered output macroblock. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments 
+ *    -     one or more of the following pointers is NULL: pSrcDst, pAlpha, 
+ *              pBeta, pThresholds or pBS. 
+ *    -    pSrcDst is not 16-byte aligned. 
+ *    -    either pThresholds or pBS is not aligned on a 4-byte boundary. 
+ *    -    one or more entries in the table pAlpha[0..3] is outside the range 
+ *              [0,255]. 
+ *    -    one or more entries in the table pBeta[0..3] is outside the range 
+ *              [0,18]. 
+ *    -    one or more entries in the table pThresholds[0..31]is outside of 
+ *              the range [0,25]. 
+ *    -    pBS is out of range, i.e., one of the following conditions is true: 
+ *              pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or 
+ *             (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. 
+ *    -    srcdstStep is not a multiple of 16. 
+ *
+ */
+OMXResult omxVCM4P10_DeblockLuma_I (
+    OMX_U8 *pSrcDst,
+    OMX_S32 srcdstStep,
+    const OMX_U8 *pAlpha,
+    const OMX_U8 *pBeta,
+    const OMX_U8 *pThresholds,
+    const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_DeblockChroma_I   (6.3.3.3.6)
+ *
+ * Description:
+ * Performs in-place deblocking filtering on all edges of the chroma 
+ * macroblock (16x16). 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - pointer to the input macroblock; must be 8-byte aligned. 
+ *   srcdstStep - step of the arrays; must be a multiple of 8. 
+ *   pAlpha - pointer to a 2x2 array of alpha thresholds, organized as 
+ *            follows: {external vertical edge, internal vertical edge, 
+ *            external horizontal edge, internal horizontal edge }.  Per 
+ *            [ISO14496-10] alpha values must be in the range [0,255]. 
+ *   pBeta - pointer to a 2x2 array of Beta Thresholds, organized as follows: 
+ *            { external vertical edge, internal vertical edge, external 
+ *            horizontal edge, internal horizontal edge }.  Per [ISO14496-10] 
+ *            beta values must be in the range [0,18]. 
+ *   pThresholds - array of size 8x2 of Thresholds (TC0) (values for the left 
+ *            or above edge of each 4x2 or 2x4 block, arranged in vertical 
+ *            block order and then in horizontal block order); must be aligned 
+ *            on a 4-byte boundary. Per [ISO14496-10] values must be in the 
+ *            range [0,25]. 
+ *   pBS - array of size 16x2 of BS parameters (arranged in scan block order 
+ *            for vertical edges and then horizontal edges); valid in the 
+ *            range [0,4] with the following restrictions: i) pBS[i]== 4 may 
+ *            occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== 
+ *            4.  Must be 4-byte aligned. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - pointer to filtered output macroblock. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments 
+ *    -   one or more of the following pointers is NULL: pSrcDst, pAlpha, 
+ *              pBeta, pThresholds, or pBS. 
+ *    -   pSrcDst is not 8-byte aligned. 
+ *    -   either pThresholds or pBS is not 4-byte aligned. 
+ *    -   one or more entries in the table pAlpha[0..3] is outside the range 
+ *              [0,255]. 
+ *    -   one or more entries in the table pBeta[0..3] is outside the range 
+ *              [0,18]. 
+ *    -   one or more entries in the table pThresholds[0..15]is outside of 
+ *              the range [0,25]. 
+ *    -   pBS is out of range, i.e., one of the following conditions is true: 
+ *            pBS[i]<0, pBS[i]>4, pBS[i]==4  for i>=4, or 
+ *            (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3. 
+ *    -   srcdstStep is not a multiple of 8. 
+ *
+ */
+OMXResult omxVCM4P10_DeblockChroma_I (
+    OMX_U8 *pSrcDst,
+    OMX_S32 srcdstStep,
+    const OMX_U8 *pAlpha,
+    const OMX_U8 *pBeta,
+    const OMX_U8 *pThresholds,
+    const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC   (6.3.4.1.1)
+ *
+ * Description:
+ * Performs CAVLC decoding and inverse raster scan for a 2x2 block of 
+ * ChromaDCLevel.  The decoded coefficients in the packed position-coefficient 
+ * buffer are stored in reverse zig-zag order, i.e., the first buffer element 
+ * contains the last non-zero postion-coefficient pair of the block. Within 
+ * each position-coefficient pair, the position entry indicates the 
+ * raster-scan position of the coefficient, while the coefficient entry 
+ * contains the coefficient value. 
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream - Double pointer to current byte in bit stream buffer 
+ *   pOffset - Pointer to current bit position in the byte pointed to by 
+ *            *ppBitStream; valid in the range [0,7]. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after each block is decoded 
+ *   pOffset - *pOffset is updated after each block is decoded 
+ *   pNumCoeff - Pointer to the number of nonzero coefficients in this block 
+ *   ppPosCoefBuf - Double pointer to destination residual 
+ *            coefficient-position pair buffer.  Buffer position 
+ *            (*ppPosCoefBuf) is updated upon return, unless there are only 
+ *            zero coefficients in the currently decoded block.  In this case 
+ *            the caller is expected to bypass the transform/dequantization of 
+ *            the empty blocks. 
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr, if the function runs without error.
+ * 
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    ppBitStream or pOffset is NULL. 
+ *    -    ppPosCoefBuf or pNumCoeff is NULL. 
+ *    OMX_Sts_Err - if one of the following is true: 
+ *    -    an illegal code is encountered in the bitstream 
+ *
+ */
+OMXResult omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC (
+    const OMX_U8 **ppBitStream,
+    OMX_S32*pOffset,
+    OMX_U8 *pNumCoeff,
+    OMX_U8 **ppPosCoefbuf
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_DecodeCoeffsToPairCAVLC   (6.3.4.1.2)
+ *
+ * Description:
+ * Performs CAVLC decoding and inverse zigzag scan for 4x4 block of 
+ * Intra16x16DCLevel, Intra16x16ACLevel, LumaLevel, and ChromaACLevel. Inverse 
+ * field scan is not supported. The decoded coefficients in the packed 
+ * position-coefficient buffer are stored in reverse zig-zag order, i.e., the 
+ * first buffer element contains the last non-zero postion-coefficient pair of 
+ * the block. Within each position-coefficient pair, the position entry 
+ * indicates the raster-scan position of the coefficient, while the 
+ * coefficient entry contains the coefficient value. 
+ *
+ * Input Arguments:
+ *   
+ *   ppBitStream -Double pointer to current byte in bit stream buffer 
+ *   pOffset - Pointer to current bit position in the byte pointed to by 
+ *            *ppBitStream; valid in the range [0,7]. 
+ *   sMaxNumCoeff - Maximum the number of non-zero coefficients in current 
+ *            block 
+ *   sVLCSelect - VLC table selector, obtained from the number of non-zero 
+ *            coefficients contained in the above and left 4x4 blocks.  It is 
+ *            equivalent to the variable nC described in H.264 standard table 
+ *            9 5, except its value can t be less than zero. 
+ *
+ * Output Arguments:
+ *   
+ *   ppBitStream - *ppBitStream is updated after each block is decoded.  
+ *            Buffer position (*ppPosCoefBuf) is updated upon return, unless 
+ *            there are only zero coefficients in the currently decoded block. 
+ *             In this case the caller is expected to bypass the 
+ *            transform/dequantization of the empty blocks. 
+ *   pOffset - *pOffset is updated after each block is decoded 
+ *   pNumCoeff - Pointer to the number of nonzero coefficients in this block 
+ *   ppPosCoefBuf - Double pointer to destination residual 
+ *            coefficient-position pair buffer 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ * 
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    ppBitStream or pOffset is NULL. 
+ *    -    ppPosCoefBuf or pNumCoeff is NULL. 
+ *    -    sMaxNumCoeff is not equal to either 15 or 16. 
+ *    -    sVLCSelect is less than 0. 
+ *
+ *    OMX_Sts_Err - if one of the following is true: 
+ *    -    an illegal code is encountered in the bitstream 
+ *
+ */
+OMXResult omxVCM4P10_DecodeCoeffsToPairCAVLC (
+    const OMX_U8 **ppBitStream,
+    OMX_S32 *pOffset,
+    OMX_U8 *pNumCoeff,
+    OMX_U8 **ppPosCoefbuf,
+    OMX_INT sVLCSelect,
+    OMX_INT sMaxNumCoeff
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_TransformDequantLumaDCFromPair   (6.3.4.2.1)
+ *
+ * Description:
+ * Reconstructs the 4x4 LumaDC block from the coefficient-position pair 
+ * buffer, performs integer inverse, and dequantization for 4x4 LumaDC 
+ * coefficients, and updates the pair buffer pointer to the next non-empty 
+ * block. 
+ *
+ * Input Arguments:
+ *   
+ *   ppSrc - Double pointer to residual coefficient-position pair buffer 
+ *            output by CALVC decoding 
+ *   QP - Quantization parameter QpY 
+ *
+ * Output Arguments:
+ *   
+ *   ppSrc - *ppSrc is updated to the start of next non empty block 
+ *   pDst - Pointer to the reconstructed 4x4 LumaDC coefficients buffer; must 
+ *            be aligned on a 8-byte boundary. 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    ppSrc or pDst is NULL. 
+ *    -    pDst is not 8 byte aligned. 
+ *    -    QP is not in the range of [0-51]. 
+ *
+ */
+OMXResult omxVCM4P10_TransformDequantLumaDCFromPair (
+    const OMX_U8 **ppSrc,
+    OMX_S16 *pDst,
+    OMX_INT QP
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_TransformDequantChromaDCFromPair   (6.3.4.2.2)
+ *
+ * Description:
+ * Reconstruct the 2x2 ChromaDC block from coefficient-position pair buffer, 
+ * perform integer inverse transformation, and dequantization for 2x2 chroma 
+ * DC coefficients, and update the pair buffer pointer to next non-empty 
+ * block. 
+ *
+ * Input Arguments:
+ *   
+ *   ppSrc - Double pointer to residual coefficient-position pair buffer 
+ *            output by CALVC decoding 
+ *   QP - Quantization parameter QpC 
+ *
+ * Output Arguments:
+ *   
+ *   ppSrc - *ppSrc is updated to the start of next non empty block 
+ *   pDst - Pointer to the reconstructed 2x2 ChromaDC coefficients buffer; 
+ *            must be aligned on a 4-byte boundary. 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    ppSrc or pDst is NULL. 
+ *    -    pDst is not 4-byte aligned. 
+ *    -    QP is not in the range of [0-51]. 
+ *
+ */
+OMXResult omxVCM4P10_TransformDequantChromaDCFromPair (
+    const OMX_U8 **ppSrc,
+    OMX_S16 *pDst,
+    OMX_INT QP
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_DequantTransformResidualFromPairAndAdd   (6.3.4.2.3)
+ *
+ * Description:
+ * Reconstruct the 4x4 residual block from coefficient-position pair buffer, 
+ * perform dequantization and integer inverse transformation for 4x4 block of 
+ * residuals with previous intra prediction or motion compensation data, and 
+ * update the pair buffer pointer to next non-empty block. If pDC == NULL, 
+ * there re 16 non-zero AC coefficients at most in the packed buffer starting 
+ * from 4x4 block position 0; If pDC != NULL, there re 15 non-zero AC 
+ * coefficients at most in the packet buffer starting from 4x4 block position 
+ * 1. 
+ *
+ * Input Arguments:
+ *   
+ *   ppSrc - Double pointer to residual coefficient-position pair buffer 
+ *            output by CALVC decoding 
+ *   pPred - Pointer to the predicted 4x4 block; must be aligned on a 4-byte 
+ *            boundary 
+ *   predStep - Predicted frame step size in bytes; must be a multiple of 4 
+ *   dstStep - Destination frame step in bytes; must be a multiple of 4 
+ *   pDC - Pointer to the DC coefficient of this block, NULL if it doesn't 
+ *            exist 
+ *   QP - QP Quantization parameter.  It should be QpC in chroma 4x4 block 
+ *            decoding, otherwise it should be QpY. 
+ *   AC - Flag indicating if at least one non-zero AC coefficient exists 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - pointer to the reconstructed 4x4 block data; must be aligned on a 
+ *            4-byte boundary 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    pPred or pDst is NULL. 
+ *    -    pPred or pDst is not 4-byte aligned. 
+ *    -    predStep or dstStep is not a multiple of 4. 
+ *    -    AC !=0 and Qp is not in the range of [0-51] or ppSrc == NULL. 
+ *    -    AC ==0 && pDC ==NULL. 
+ *
+ */
+OMXResult omxVCM4P10_DequantTransformResidualFromPairAndAdd (
+    const OMX_U8 **ppSrc,
+    const OMX_U8 *pPred,
+    const OMX_S16 *pDC,
+    OMX_U8 *pDst,
+    OMX_INT predStep,
+    OMX_INT dstStep,
+    OMX_INT QP,
+    OMX_INT AC
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_MEGetBufSize   (6.3.5.1.1)
+ *
+ * Description:
+ * Computes the size, in bytes, of the vendor-specific specification 
+ * structure for the omxVCM4P10 motion estimation functions BlockMatch_Integer 
+ * and MotionEstimationMB. 
+ *
+ * Input Arguments:
+ *   
+ *   MEmode - motion estimation mode; available modes are defined by the 
+ *            enumerated type OMXVCM4P10MEMode 
+ *   pMEParams -motion estimation parameters 
+ *
+ * Output Arguments:
+ *   
+ *   pSize - pointer to the number of bytes required for the motion 
+ *            estimation specification structure 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    pMEParams or pSize is NULL. 
+ *    -    an invalid MEMode is specified. 
+ *
+ */
+OMXResult omxVCM4P10_MEGetBufSize (
+    OMXVCM4P10MEMode MEmode,
+    const OMXVCM4P10MEParams *pMEParams,
+    OMX_U32 *pSize
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_MEInit   (6.3.5.1.2)
+ *
+ * Description:
+ * Initializes the vendor-specific specification structure required for the 
+ * omxVCM4P10 motion estimation functions:  BlockMatch_Integer and 
+ * MotionEstimationMB. Memory for the specification structure *pMESpec must be 
+ * allocated prior to calling the function, and should be aligned on a 4-byte 
+ * boundary.  The number of bytes required for the specification structure can 
+ * be determined using the function omxVCM4P10_MEGetBufSize. Following 
+ * initialization by this function, the vendor-specific structure *pMESpec 
+ * should contain an implementation-specific representation of all motion 
+ * estimation parameters received via the structure pMEParams, for example  
+ * searchRange16x16, searchRange8x8, etc. 
+ *
+ * Input Arguments:
+ *   
+ *   MEmode - motion estimation mode; available modes are defined by the 
+ *            enumerated type OMXVCM4P10MEMode 
+ *   pMEParams - motion estimation parameters 
+ *   pMESpec - pointer to the uninitialized ME specification structure 
+ *
+ * Output Arguments:
+ *   
+ *   pMESpec - pointer to the initialized ME specification structure 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    pMEParams or pSize is NULL. 
+ *    -    an invalid value was specified for the parameter MEmode 
+ *    -    a negative or zero value was specified for one of the search ranges 
+ *         (e.g.,  pMBParams >searchRange8x8, pMEParams->searchRange16x16, etc.) 
+ *    -    either in isolation or in combination, one or more of the enables or 
+ *         search ranges in the structure *pMEParams were configured such 
+ *         that the requested behavior fails to comply with [ISO14496-10]. 
+ *
+ */
+OMXResult omxVCM4P10_MEInit (
+    OMXVCM4P10MEMode MEmode,
+    const OMXVCM4P10MEParams *pMEParams,
+    void *pMESpec
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_BlockMatch_Integer   (6.3.5.2.1)
+ *
+ * Description:
+ * Performs integer block match.  Returns best MV and associated cost. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrgY - Pointer to the top-left corner of the current block:
+ *            If iBlockWidth==4,  4-byte alignment required. 
+ *            If iBlockWidth==8,  8-byte alignment required. 
+ *            If iBlockWidth==16, 16-byte alignment required. 
+ *   pSrcRefY - Pointer to the top-left corner of the co-located block in the 
+ *            reference picture: 
+ *            If iBlockWidth==4,  4-byte alignment required.  
+ *            If iBlockWidth==8,  8-byte alignment required.  
+ *            If iBlockWidth==16, 16-byte alignment required. 
+ *   nSrcOrgStep - Stride of the original picture plane, expressed in terms 
+ *            of integer pixels; must be a multiple of iBlockWidth. 
+ *   nSrcRefStep - Stride of the reference picture plane, expressed in terms 
+ *            of integer pixels 
+ *   pRefRect - pointer to the valid reference rectangle inside the reference 
+ *            picture plane 
+ *   nCurrPointPos - position of the current block in the current plane 
+ *   iBlockWidth - Width of the current block, expressed in terms of integer 
+ *            pixels; must be equal to either 4, 8, or 16. 
+ *   iBlockHeight - Height of the current block, expressed in terms of 
+ *            integer pixels; must be equal to either 4, 8, or 16. 
+ *   nLamda - Lamda factor; used to compute motion cost 
+ *   pMVPred - Predicted MV; used to compute motion cost, expressed in terms 
+ *            of 1/4-pel units 
+ *   pMVCandidate - Candidate MV; used to initialize the motion search, 
+ *            expressed in terms of integer pixels 
+ *   pMESpec - pointer to the ME specification structure 
+ *
+ * Output Arguments:
+ *   
+ *   pDstBestMV - Best MV resulting from integer search, expressed in terms 
+ *            of 1/4-pel units 
+ *   pBestCost - Motion cost associated with the best MV; computed as 
+ *            SAD+Lamda*BitsUsedByMV 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    any of the following pointers are NULL:
+ *         pSrcOrgY, pSrcRefY, pRefRect, pMVPred, pMVCandidate, or pMESpec. 
+ *    -    Either iBlockWidth or iBlockHeight are values other than 4, 8, or 16. 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_BlockMatch_Integer (
+    const OMX_U8 *pSrcOrgY,
+    OMX_S32 nSrcOrgStep,
+    const OMX_U8 *pSrcRefY,
+    OMX_S32 nSrcRefStep,
+    const OMXRect *pRefRect,
+    const OMXVCM4P2Coordinate *pCurrPointPos,
+    OMX_U8 iBlockWidth,
+    OMX_U8 iBlockHeight,
+    OMX_U32 nLamda,
+    const OMXVCMotionVector *pMVPred,
+    const OMXVCMotionVector *pMVCandidate,
+    OMXVCMotionVector *pBestMV,
+    OMX_S32 *pBestCost,
+    void *pMESpec
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_BlockMatch_Half   (6.3.5.2.2)
+ *
+ * Description:
+ * Performs a half-pel block match using results from a prior integer search. 
+ *  Returns the best MV and associated cost.  This function estimates the 
+ * half-pixel motion vector by interpolating the integer resolution motion 
+ * vector referenced by the input parameter pSrcDstBestMV, i.e., the initial 
+ * integer MV is generated externally.  The function 
+ * omxVCM4P10_BlockMatch_Integer may be used for integer motion estimation. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrgY - Pointer to the current position in original picture plane:
+ *              If iBlockWidth==4,  4-byte alignment required. 
+ *              If iBlockWidth==8,  8-byte alignment required. 
+ *              If iBlockWidth==16, 16-byte alignment required. 
+ *   pSrcRefY - Pointer to the top-left corner of the co-located block in the 
+ *            reference picture:  
+ *              If iBlockWidth==4,  4-byte alignment required.  
+ *              If iBlockWidth==8,  8-byte alignment required.  
+ *              If iBlockWidth==16, 16-byte alignment required. 
+ *   nSrcOrgStep - Stride of the original picture plane in terms of full 
+ *            pixels; must be a multiple of iBlockWidth. 
+ *   nSrcRefStep - Stride of the reference picture plane in terms of full 
+ *            pixels 
+ *   iBlockWidth - Width of the current block in terms of full pixels; must 
+ *            be equal to either 4, 8, or 16. 
+ *   iBlockHeight - Height of the current block in terms of full pixels; must 
+ *            be equal to either 4, 8, or 16. 
+ *   nLamda - Lamda factor, used to compute motion cost 
+ *   pMVPred - Predicted MV, represented in terms of 1/4-pel units; used to 
+ *            compute motion cost 
+ *   pSrcDstBestMV - The best MV resulting from a prior integer search, 
+ *            represented in terms of 1/4-pel units 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDstBestMV - Best MV resulting from the half-pel search, expressed in 
+ *            terms of 1/4-pel units 
+ *   pBestCost - Motion cost associated with the best MV; computed as 
+ *            SAD+Lamda*BitsUsedByMV 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    any of the following pointers is NULL: pSrcOrgY, pSrcRefY, 
+ *              pSrcDstBestMV, pMVPred, pBestCost 
+ *    -    iBlockWidth or iBlockHeight are equal to values other than 4, 8, or 16. 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_BlockMatch_Half (
+    const OMX_U8 *pSrcOrgY,
+    OMX_S32 nSrcOrgStep,
+    const OMX_U8 *pSrcRefY,
+    OMX_S32 nSrcRefStep,
+    OMX_U8 iBlockWidth,
+    OMX_U8 iBlockHeight,
+    OMX_U32 nLamda,
+    const OMXVCMotionVector *pMVPred,
+    OMXVCMotionVector *pSrcDstBestMV,
+    OMX_S32 *pBestCost
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_BlockMatch_Quarter   (6.3.5.2.3)
+ *
+ * Description:
+ * Performs a quarter-pel block match using results from a prior half-pel 
+ * search.  Returns the best MV and associated cost.  This function estimates 
+ * the quarter-pixel motion vector by interpolating the half-pel resolution 
+ * motion vector referenced by the input parameter pSrcDstBestMV, i.e., the 
+ * initial half-pel MV is generated externally.  The function 
+ * omxVCM4P10_BlockMatch_Half may be used for half-pel motion estimation. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrgY - Pointer to the current position in original picture plane:
+ *            If iBlockWidth==4,  4-byte alignment required. 
+ *            If iBlockWidth==8,  8-byte alignment required. 
+ *            If iBlockWidth==16, 16-byte alignment required. 
+ *   pSrcRefY - Pointer to the top-left corner of the co-located block in the 
+ *            reference picture:
+ *            If iBlockWidth==4,  4-byte alignment required.  
+ *            If iBlockWidth==8,  8-byte alignment required.  
+ *            If iBlockWidth==16, 16-byte alignment required. 
+ *   nSrcOrgStep - Stride of the original picture plane in terms of full 
+ *            pixels; must be a multiple of iBlockWidth. 
+ *   nSrcRefStep - Stride of the reference picture plane in terms of full 
+ *            pixels 
+ *   iBlockWidth - Width of the current block in terms of full pixels; must 
+ *            be equal to either 4, 8, or 16. 
+ *   iBlockHeight - Height of the current block in terms of full pixels; must 
+ *            be equal to either 4, 8, or 16. 
+ *   nLamda - Lamda factor, used to compute motion cost 
+ *   pMVPred - Predicted MV, represented in terms of 1/4-pel units; used to 
+ *            compute motion cost 
+ *   pSrcDstBestMV - The best MV resulting from a prior half-pel search, 
+ *            represented in terms of 1/4 pel units 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDstBestMV - Best MV resulting from the quarter-pel search, expressed 
+ *            in terms of 1/4-pel units 
+ *   pBestCost - Motion cost associated with the best MV; computed as 
+ *            SAD+Lamda*BitsUsedByMV 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    One or more of the following pointers is NULL: 
+ *         pSrcOrgY, pSrcRefY, pSrcDstBestMV, pMVPred, pBestCost 
+ *    -    iBlockWidth or iBlockHeight are equal to values other than 4, 8, or 16. 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_BlockMatch_Quarter (
+    const OMX_U8 *pSrcOrgY,
+    OMX_S32 nSrcOrgStep,
+    const OMX_U8 *pSrcRefY,
+    OMX_S32 nSrcRefStep,
+    OMX_U8 iBlockWidth,
+    OMX_U8 iBlockHeight,
+    OMX_U32 nLamda,
+    const OMXVCMotionVector *pMVPred,
+    OMXVCMotionVector *pSrcDstBestMV,
+    OMX_S32 *pBestCost
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_MotionEstimationMB   (6.3.5.3.1)
+ *
+ * Description:
+ * Performs MB-level motion estimation and selects best motion estimation 
+ * strategy from the set of modes supported in baseline profile [ISO14496-10]. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcCurrBuf - Pointer to the current position in original picture plane; 
+ *            16-byte alignment required 
+ *   pSrcRefBufList - Pointer to an array with 16 entries.  Each entry points 
+ *            to the top-left corner of the co-located MB in a reference 
+ *            picture.  The array is filled from low-to-high with valid 
+ *            reference frame pointers; the unused high entries should be set 
+ *            to NULL.  Ordering of the reference frames should follow 
+ *            [ISO14496-10] subclause 8.2.4  Decoding Process for Reference 
+ *            Picture Lists.   The entries must be 16-byte aligned. 
+ *   pSrcRecBuf - Pointer to the top-left corner of the co-located MB in the 
+ *            reconstructed picture; must be 16-byte aligned. 
+ *   SrcCurrStep - Width of the original picture plane in terms of full 
+ *            pixels; must be a multiple of 16. 
+ *   SrcRefStep - Width of the reference picture plane in terms of full 
+ *            pixels; must be a multiple of 16. 
+ *   SrcRecStep - Width of the reconstructed picture plane in terms of full 
+ *            pixels; must be a multiple of 16. 
+ *   pRefRect - Pointer to the valid reference rectangle; relative to the 
+ *            image origin. 
+ *   pCurrPointPos - Position of the current macroblock in the current plane. 
+ *   Lambda - Lagrange factor for computing the cost function 
+ *   pMESpec - Pointer to the motion estimation specification structure; must 
+ *            have been allocated and initialized prior to calling this 
+ *            function. 
+ *   pMBInter - Array, of dimension four, containing pointers to information 
+ *            associated with four adjacent type INTER MBs (Left, Top, 
+ *            Top-Left, Top-Right). Any pointer in the array may be set equal 
+ *            to NULL if the corresponding MB doesn t exist or is not of type 
+ *            INTER. 
+ *            -  pMBInter[0] - Pointer to left MB information 
+ *            -  pMBInter[1] - Pointer to top MB information 
+ *            -  pMBInter[2] - Pointer to top-left MB information 
+ *            -  pMBInter[3] - Pointer to top-right MB information 
+ *   pMBIntra - Array, of dimension four, containing pointers to information 
+ *            associated with four adjacent type INTRA MBs (Left, Top, 
+ *            Top-Left, Top-Right). Any pointer in the array may be set equal 
+ *            to NULL if the corresponding MB doesn t exist or is not of type 
+ *            INTRA. 
+ *            -  pMBIntra[0] - Pointer to left MB information 
+ *            -  pMBIntra[1] - Pointer to top MB information 
+ *            -  pMBIntra[2] - Pointer to top-left MB information 
+ *            -  pMBIntra[3] - Pointer to top-right MB information 
+ *   pSrcDstMBCurr - Pointer to information structure for the current MB.  
+ *            The following entries should be set prior to calling the 
+ *            function:  sliceID - the number of the slice the to which the 
+ *            current MB belongs. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstCost - Pointer to the minimum motion cost for the current MB. 
+ *   pDstBlockSAD - Pointer to the array of SADs for each of the sixteen luma 
+ *            4x4 blocks in each MB.  The block SADs are in scan order for 
+ *            each MB.  For implementations that cannot compute the SAD values 
+ *            individually, the maximum possible value (0xffff) is returned 
+ *            for each of the 16 block SAD entries. 
+ *   pSrcDstMBCurr - Pointer to updated information structure for the current 
+ *            MB after MB-level motion estimation has been completed.  The 
+ *            following fields are updated by the ME function.   The following 
+ *            parameter set quantifies the MB-level ME search results: 
+ *            -  MbType 
+ *            -  subMBType[4] 
+ *            -  pMV0[4][4] 
+ *            -  pMVPred[4][4] 
+ *            -  pRefL0Idx[4] 
+ *            -  Intra16x16PredMode 
+ *            -  pIntra4x4PredMode[4][4] 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -   One or more of the following pointers is NULL: pSrcCurrBuf, 
+ *           pSrcRefBufList, pSrcRecBuf, pRefRect, pCurrPointPos, pMESpec, 
+ *           pMBInter, pMBIntra,pSrcDstMBCurr, pDstCost, pSrcRefBufList[0] 
+ *    -    SrcRefStep, SrcRecStep are not multiples of 16 
+ *    -    iBlockWidth or iBlockHeight are values other than 4, 8, or 16. 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_MotionEstimationMB (
+    const OMX_U8 *pSrcCurrBuf,
+    OMX_S32 SrcCurrStep,
+    const OMX_U8 *pSrcRefBufList[15],
+    OMX_S32 SrcRefStep,
+    const OMX_U8 *pSrcRecBuf,
+    OMX_S32 SrcRecStep,
+    const OMXRect *pRefRect,
+    const OMXVCM4P2Coordinate *pCurrPointPos,
+    OMX_U32 Lambda,
+    void *pMESpec,
+    const OMXVCM4P10MBInfoPtr *pMBInter,
+    const OMXVCM4P10MBInfoPtr *pMBIntra,
+    OMXVCM4P10MBInfoPtr pSrcDstMBCurr,
+    OMX_INT *pDstCost,
+    OMX_U16 *pDstBlockSAD
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_SAD_4x   (6.3.5.4.1)
+ *
+ * Description:
+ * This function calculates the SAD for 4x8 and 4x4 blocks. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrg -Pointer to the original block; must be aligned on a 4-byte 
+ *            boundary. 
+ *   iStepOrg -Step of the original block buffer; must be a multiple of 4. 
+ *   pSrcRef -Pointer to the reference block 
+ *   iStepRef -Step of the reference block buffer 
+ *   iHeight -Height of the block; must be equal to either 4 or 8. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstSAD -Pointer of result SAD 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    One or more of the following pointers is NULL: 
+ *         pSrcOrg, pSrcRef, or pDstSAD 
+ *    -    iHeight is not equal to either 4 or 8. 
+ *    -    iStepOrg is not a multiple of 4 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_SAD_4x (
+    const OMX_U8 *pSrcOrg,
+    OMX_U32 iStepOrg,
+    const OMX_U8 *pSrcRef,
+    OMX_U32 iStepRef,
+    OMX_S32 *pDstSAD,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_SADQuar_4x   (6.3.5.4.2)
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the average 
+ * of the other two (pSrcRef0 and pSrcRef1) for 4x8 or 4x4 blocks.  Rounding 
+ * is applied according to the convention (a+b+1)>>1. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to the original block; must be aligned on a 4-byte 
+ *            boundary. 
+ *   pSrcRef0 - Pointer to reference block 0 
+ *   pSrcRef1 - Pointer to reference block 1 
+ *   iSrcStep - Step of the original block buffer; must be a multiple of 4. 
+ *   iRefStep0 - Step of reference block 0 
+ *   iRefStep1 - Step of reference block 1 
+ *   iHeight - Height of the block; must be equal to either 4 or 8. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstSAD - Pointer of result SAD 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    iHeight is not equal to either 4 or 8. 
+ *    -    One or more of the following pointers is NULL: pSrc, pSrcRef0, 
+ *              pSrcRef1, pDstSAD. 
+ *    -    iSrcStep is not a multiple of 4 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_SADQuar_4x (
+    const OMX_U8 *pSrc,
+    const OMX_U8 *pSrcRef0,
+    const OMX_U8 *pSrcRef1,
+    OMX_U32 iSrcStep,
+    OMX_U32 iRefStep0,
+    OMX_U32 iRefStep1,
+    OMX_U32 *pDstSAD,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_SADQuar_8x   (6.3.5.4.3)
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the average 
+ * of the other two (pSrcRef0 and pSrcRef1) for 8x16, 8x8, or 8x4 blocks.  
+ * Rounding is applied according to the convention (a+b+1)>>1. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to the original block; must be aligned on an 8-byte 
+ *            boundary. 
+ *   pSrcRef0 - Pointer to reference block 0 
+ *   pSrcRef1 - Pointer to reference block 1 
+ *   iSrcStep - Step of the original block buffer; must be a multiple of 8. 
+ *   iRefStep0 - Step of reference block 0 
+ *   iRefStep1 - Step of reference block 1 
+ *   iHeight - Height of the block; must be equal either 4, 8, or 16. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstSAD - Pointer of result SAD 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    iHeight is not equal to either 4, 8, or 16. 
+ *    -    One or more of the following pointers is NULL: pSrc, pSrcRef0, 
+ *              pSrcRef1, pDstSAD. 
+ *    -    iSrcStep is not a multiple of 8 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_SADQuar_8x (
+    const OMX_U8 *pSrc,
+    const OMX_U8 *pSrcRef0,
+    const OMX_U8 *pSrcRef1,
+    OMX_U32 iSrcStep,
+    OMX_U32 iRefStep0,
+    OMX_U32 iRefStep1,
+    OMX_U32 *pDstSAD,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_SADQuar_16x   (6.3.5.4.4)
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the average 
+ * of the other two (pSrcRef0 and pSrcRef1) for 16x16 or 16x8 blocks.  
+ * Rounding is applied according to the convention (a+b+1)>>1. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to the original block; must be aligned on a 16-byte 
+ *            boundary. 
+ *   pSrcRef0 - Pointer to reference block 0 
+ *   pSrcRef1 - Pointer to reference block 1 
+ *   iSrcStep - Step of the original block buffer; must be a multiple of 16 
+ *   iRefStep0 - Step of reference block 0 
+ *   iRefStep1 - Step of reference block 1 
+ *   iHeight - Height of the block; must be equal to either 8 or 16 
+ *
+ * Output Arguments:
+ *   
+ *   pDstSAD -Pointer of result SAD 
+ *
+ * Return Value:
+ *    OMX_Sts_NoErr, if the function runs without error.
+ *    OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs: 
+ *    -    iHeight is not equal to either 8 or 16. 
+ *    -    One or more of the following pointers is NULL: pSrc, pSrcRef0, 
+ *              pSrcRef1, pDstSAD. 
+ *    -    iSrcStep is not a multiple of 16 
+ *    -    Any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_SADQuar_16x (
+    const OMX_U8 *pSrc,
+    const OMX_U8 *pSrcRef0,
+    const OMX_U8 *pSrcRef1,
+    OMX_U32 iSrcStep,
+    OMX_U32 iRefStep0,
+    OMX_U32 iRefStep1,
+    OMX_U32 *pDstSAD,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_SATD_4x4   (6.3.5.4.5)
+ *
+ * Description:
+ * This function calculates the sum of absolute transform differences (SATD) 
+ * for a 4x4 block by applying a Hadamard transform to the difference block 
+ * and then calculating the sum of absolute coefficient values. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrg - Pointer to the original block; must be aligned on a 4-byte 
+ *            boundary 
+ *   iStepOrg - Step of the original block buffer; must be a multiple of 4 
+ *   pSrcRef - Pointer to the reference block; must be aligned on a 4-byte 
+ *            boundary 
+ *   iStepRef - Step of the reference block buffer; must be a multiple of 4 
+ *
+ * Output Arguments:
+ *   
+ *   pDstSAD - pointer to the resulting SAD 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *         pSrcOrg, pSrcRef, or pDstSAD either pSrcOrg 
+ *    -    pSrcRef is not aligned on a 4-byte boundary 
+ *    -    iStepOrg <= 0 or iStepOrg is not a multiple of 4 
+ *    -    iStepRef <= 0 or iStepRef is not a multiple of 4 
+ *
+ */
+OMXResult omxVCM4P10_SATD_4x4 (
+    const OMX_U8 *pSrcOrg,
+    OMX_U32 iStepOrg,
+    const OMX_U8 *pSrcRef,
+    OMX_U32 iStepRef,
+    OMX_U32 *pDstSAD
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_InterpolateHalfHor_Luma   (6.3.5.5.1)
+ *
+ * Description:
+ * This function performs interpolation for two horizontal 1/2-pel positions 
+ * (-1/2,0) and (1/2, 0) - around a full-pel position. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to the top-left corner of the block used to interpolate in 
+ *            the reconstruction frame plane. 
+ *   iSrcStep - Step of the source buffer. 
+ *   iDstStep - Step of the destination(interpolation) buffer; must be a 
+ *            multiple of iWidth. 
+ *   iWidth - Width of the current block; must be equal to either 4, 8, or 16 
+ *   iHeight - Height of the current block; must be equal to 4, 8, or 16 
+ *
+ * Output Arguments:
+ *   
+ *   pDstLeft -Pointer to the interpolation buffer of the left -pel position 
+ *            (-1/2, 0) 
+ *                 If iWidth==4,  4-byte alignment required. 
+ *                 If iWidth==8,  8-byte alignment required. 
+ *                 If iWidth==16, 16-byte alignment required. 
+ *   pDstRight -Pointer to the interpolation buffer of the right -pel 
+ *            position (1/2, 0) 
+ *                 If iWidth==4,  4-byte alignment required. 
+ *                 If iWidth==8,  8-byte alignment required. 
+ *                 If iWidth==16, 16-byte alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *             pSrc, pDstLeft, or pDstRight 
+ *    -    iWidth or iHeight have values other than 4, 8, or 16 
+ *    -    iWidth==4 but pDstLeft and/or pDstRight is/are not aligned on a 4-byte boundary 
+ *    -    iWidth==8 but pDstLeft and/or pDstRight is/are not aligned on a 8-byte boundary 
+ *    -    iWidth==16 but pDstLeft and/or pDstRight is/are not aligned on a 16-byte boundary 
+ *    -    any alignment restrictions are violated 
+ *
+ */
+OMXResult omxVCM4P10_InterpolateHalfHor_Luma (
+    const OMX_U8 *pSrc,
+    OMX_U32 iSrcStep,
+    OMX_U8 *pDstLeft,
+    OMX_U8 *pDstRight,
+    OMX_U32 iDstStep,
+    OMX_U32 iWidth,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_InterpolateHalfVer_Luma   (6.3.5.5.2)
+ *
+ * Description:
+ * This function performs interpolation for two vertical 1/2-pel positions - 
+ * (0, -1/2) and (0, 1/2) - around a full-pel position. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to top-left corner of block used to interpolate in the 
+ *            reconstructed frame plane 
+ *   iSrcStep - Step of the source buffer. 
+ *   iDstStep - Step of the destination (interpolation) buffer; must be a 
+ *            multiple of iWidth. 
+ *   iWidth - Width of the current block; must be equal to either 4, 8, or 16 
+ *   iHeight - Height of the current block; must be equal to either 4, 8, or 16 
+ *
+ * Output Arguments:
+ *   
+ *   pDstUp -Pointer to the interpolation buffer of the -pel position above 
+ *            the current full-pel position (0, -1/2) 
+ *                If iWidth==4, 4-byte alignment required. 
+ *                If iWidth==8, 8-byte alignment required. 
+ *                If iWidth==16, 16-byte alignment required. 
+ *   pDstDown -Pointer to the interpolation buffer of the -pel position below 
+ *            the current full-pel position (0, 1/2) 
+ *                If iWidth==4, 4-byte alignment required. 
+ *                If iWidth==8, 8-byte alignment required. 
+ *                If iWidth==16, 16-byte alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *            pSrc, pDstUp, or pDstDown 
+ *    -    iWidth or iHeight have values other than 4, 8, or 16 
+ *    -    iWidth==4 but pDstUp and/or pDstDown is/are not aligned on a 4-byte boundary 
+ *    -    iWidth==8 but pDstUp and/or pDstDown is/are not aligned on a 8-byte boundary 
+ *    -    iWidth==16 but pDstUp and/or pDstDown is/are not aligned on a 16-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_InterpolateHalfVer_Luma (
+    const OMX_U8 *pSrc,
+    OMX_U32 iSrcStep,
+    OMX_U8 *pDstUp,
+    OMX_U8 *pDstDown,
+    OMX_U32 iDstStep,
+    OMX_U32 iWidth,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_Average_4x   (6.3.5.5.3)
+ *
+ * Description:
+ * This function calculates the average of two 4x4, 4x8 blocks.  The result 
+ * is rounded according to (a+b+1)/2. 
+ *
+ * Input Arguments:
+ *   
+ *   pPred0 - Pointer to the top-left corner of reference block 0 
+ *   pPred1 - Pointer to the top-left corner of reference block 1 
+ *   iPredStep0 - Step of reference block 0; must be a multiple of 4. 
+ *   iPredStep1 - Step of reference block 1; must be a multiple of 4. 
+ *   iDstStep - Step of the destination buffer; must be a multiple of 4. 
+ *   iHeight - Height of the blocks; must be either 4 or 8. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstPred - Pointer to the destination buffer. 4-byte alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *           pPred0, pPred1, or pDstPred 
+ *    -    pDstPred is not aligned on a 4-byte boundary 
+ *    -    iPredStep0 <= 0 or iPredStep0 is not a multiple of 4 
+ *    -    iPredStep1 <= 0 or iPredStep1 is not a multiple of 4 
+ *    -    iDstStep <= 0 or iDstStep is not a multiple of 4 
+ *    -    iHeight is not equal to either 4 or 8 
+ *
+ */
+OMXResult omxVCM4P10_Average_4x (
+    const OMX_U8 *pPred0,
+    const OMX_U8 *pPred1,
+    OMX_U32 iPredStep0,
+    OMX_U32 iPredStep1,
+    OMX_U8 *pDstPred,
+    OMX_U32 iDstStep,
+    OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_TransformQuant_ChromaDC   (6.3.5.6.1)
+ *
+ * Description:
+ * This function performs 2x2 Hadamard transform of chroma DC coefficients 
+ * and then quantizes the coefficients. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - Pointer to the 2x2 array of chroma DC coefficients.  8-byte 
+ *            alignment required. 
+ *   iQP - Quantization parameter; must be in the range [0,51]. 
+ *   bIntra - Indicate whether this is an INTRA block. 1-INTRA, 0-INTER 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - Pointer to transformed and quantized coefficients.  8-byte 
+ *            alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *             pSrcDst 
+ *    -    pSrcDst is not aligned on an 8-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_TransformQuant_ChromaDC (
+    OMX_S16 *pSrcDst,
+    OMX_U32 iQP,
+    OMX_U8 bIntra
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_TransformQuant_LumaDC   (6.3.5.6.2)
+ *
+ * Description:
+ * This function performs a 4x4 Hadamard transform of luma DC coefficients 
+ * and then quantizes the coefficients. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcDst - Pointer to the 4x4 array of luma DC coefficients.  16-byte 
+ *            alignment required. 
+ *   iQP - Quantization parameter; must be in the range [0,51]. 
+ *
+ * Output Arguments:
+ *   
+ *   pSrcDst - Pointer to transformed and quantized coefficients.  16-byte 
+ *             alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: pSrcDst 
+ *    -    pSrcDst is not aligned on an 16-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_TransformQuant_LumaDC (
+    OMX_S16 *pSrcDst,
+    OMX_U32 iQP
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_InvTransformDequant_LumaDC   (6.3.5.6.3)
+ *
+ * Description:
+ * This function performs inverse 4x4 Hadamard transform and then dequantizes 
+ * the coefficients. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to the 4x4 array of the 4x4 Hadamard-transformed and 
+ *            quantized coefficients.  16 byte alignment required. 
+ *   iQP - Quantization parameter; must be in the range [0,51]. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - Pointer to inverse-transformed and dequantized coefficients.  
+ *            16-byte alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: pSrc 
+ *    -    pSrc or pDst is not aligned on a 16-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_InvTransformDequant_LumaDC (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_U32 iQP
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_InvTransformDequant_ChromaDC   (6.3.5.6.4)
+ *
+ * Description:
+ * This function performs inverse 2x2 Hadamard transform and then dequantizes 
+ * the coefficients. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrc - Pointer to the 2x2 array of the 2x2 Hadamard-transformed and 
+ *            quantized coefficients.  8 byte alignment required. 
+ *   iQP - Quantization parameter; must be in the range [0,51]. 
+ *
+ * Output Arguments:
+ *   
+ *   pDst - Pointer to inverse-transformed and dequantized coefficients.  
+ *            8-byte alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: pSrc 
+ *    -    pSrc or pDst is not aligned on an 8-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_InvTransformDequant_ChromaDC (
+    const OMX_S16 *pSrc,
+    OMX_S16 *pDst,
+    OMX_U32 iQP
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_InvTransformResidualAndAdd   (6.3.5.7.1)
+ *
+ * Description:
+ * This function performs inverse an 4x4 integer transformation to produce 
+ * the difference signal and then adds the difference to the prediction to get 
+ * the reconstructed signal. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcPred - Pointer to prediction signal.  4-byte alignment required. 
+ *   pDequantCoeff - Pointer to the transformed coefficients.  8-byte 
+ *            alignment required. 
+ *   iSrcPredStep - Step of the prediction buffer; must be a multiple of 4. 
+ *   iDstReconStep - Step of the destination reconstruction buffer; must be a 
+ *            multiple of 4. 
+ *   bAC - Indicate whether there is AC coefficients in the coefficients 
+ *            matrix. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstRecon -Pointer to the destination reconstruction buffer.  4-byte 
+ *            alignment required. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *            pSrcPred, pDequantCoeff, pDstRecon 
+ *    -    pSrcPred is not aligned on a 4-byte boundary 
+ *    -    iSrcPredStep or iDstReconStep is not a multiple of 4. 
+ *    -    pDequantCoeff is not aligned on an 8-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_InvTransformResidualAndAdd (
+    const OMX_U8 *pSrcPred,
+    const OMX_S16 *pDequantCoeff,
+    OMX_U8 *pDstRecon,
+    OMX_U32 iSrcPredStep,
+    OMX_U32 iDstReconStep,
+    OMX_U8 bAC
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_SubAndTransformQDQResidual   (6.3.5.8.1)
+ *
+ * Description:
+ * This function subtracts the prediction signal from the original signal to 
+ * produce the difference signal and then performs a 4x4 integer transform and 
+ * quantization. The quantized transformed coefficients are stored as 
+ * pDstQuantCoeff. This function can also output dequantized coefficients or 
+ * unquantized DC coefficients optionally by setting the pointers 
+ * pDstDeQuantCoeff, pDCCoeff. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcOrg - Pointer to original signal. 4-byte alignment required. 
+ *   pSrcPred - Pointer to prediction signal. 4-byte alignment required. 
+ *   iSrcOrgStep - Step of the original signal buffer; must be a multiple of 
+ *            4. 
+ *   iSrcPredStep - Step of the prediction signal buffer; must be a multiple 
+ *            of 4. 
+ *   pNumCoeff -Number of non-zero coefficients after quantization. If this 
+ *            parameter is not required, it is set to NULL. 
+ *   nThreshSAD - Zero-block early detection threshold. If this parameter is 
+ *            not required, it is set to 0. 
+ *   iQP - Quantization parameter; must be in the range [0,51]. 
+ *   bIntra - Indicates whether this is an INTRA block, either 1-INTRA or 
+ *            0-INTER 
+ *
+ * Output Arguments:
+ *   
+ *   pDstQuantCoeff - Pointer to the quantized transformed coefficients.  
+ *            8-byte alignment required. 
+ *   pDstDeQuantCoeff - Pointer to the dequantized transformed coefficients 
+ *            if this parameter is not equal to NULL.  8-byte alignment 
+ *            required. 
+ *   pDCCoeff - Pointer to the unquantized DC coefficient if this parameter 
+ *            is not equal to NULL. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *            pSrcOrg, pSrcPred, pNumCoeff, pDstQuantCoeff, 
+ *            pDstDeQuantCoeff, pDCCoeff 
+ *    -    pSrcOrg is not aligned on a 4-byte boundary 
+ *    -    pSrcPred is not aligned on a 4-byte boundary 
+ *    -    iSrcOrgStep is not a multiple of 4 
+ *    -    iSrcPredStep is not a multiple of 4 
+ *    -    pDstQuantCoeff or pDstDeQuantCoeff is not aligned on an 8-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_SubAndTransformQDQResidual (
+    const OMX_U8 *pSrcOrg,
+    const OMX_U8 *pSrcPred,
+    OMX_U32 iSrcOrgStep,
+    OMX_U32 iSrcPredStep,
+    OMX_S16 *pDstQuantCoeff,
+    OMX_S16 *pDstDeQuantCoeff,
+    OMX_S16 *pDCCoeff,
+    OMX_S8 *pNumCoeff,
+    OMX_U32 nThreshSAD,
+    OMX_U32 iQP,
+    OMX_U8 bIntra
+);
+
+
+
+/**
+ * Function:  omxVCM4P10_GetVLCInfo   (6.3.5.9.1)
+ *
+ * Description:
+ * This function extracts run-length encoding (RLE) information from the 
+ * coefficient matrix.  The results are returned in an OMXVCM4P10VLCInfo 
+ * structure. 
+ *
+ * Input Arguments:
+ *   
+ *   pSrcCoeff - pointer to the transform coefficient matrix.  8-byte 
+ *            alignment required. 
+ *   pScanMatrix - pointer to the scan order definition matrix.  For a luma 
+ *            block the scan matrix should follow [ISO14496-10] section 8.5.4, 
+ *            and should contain the values 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 
+ *            10, 7, 11, 14, 15.  For a chroma block, the scan matrix should 
+ *            contain the values 0, 1, 2, 3. 
+ *   bAC - indicates presence of a DC coefficient; 0 = DC coefficient 
+ *            present, 1= DC coefficient absent. 
+ *   MaxNumCoef - specifies the number of coefficients contained in the 
+ *            transform coefficient matrix, pSrcCoeff. The value should be 16 
+ *            for blocks of type LUMADC, LUMAAC, LUMALEVEL, and CHROMAAC. The 
+ *            value should be 4 for blocks of type CHROMADC. 
+ *
+ * Output Arguments:
+ *   
+ *   pDstVLCInfo - pointer to structure that stores information for 
+ *            run-length coding. 
+ *
+ * Return Value:
+ *    
+ *    OMX_Sts_NoErr - no error 
+ *    OMX_Sts_BadArgErr - bad arguments; returned if any of the following 
+ *              conditions are true: 
+ *    -    at least one of the following pointers is NULL: 
+ *            pSrcCoeff, pScanMatrix, pDstVLCInfo 
+ *    -    pSrcCoeff is not aligned on an 8-byte boundary 
+ *
+ */
+OMXResult omxVCM4P10_GetVLCInfo (
+    const OMX_S16 *pSrcCoeff,
+    const OMX_U8 *pScanMatrix,
+    OMX_U8 bAC,
+    OMX_U32 MaxNumCoef,
+    OMXVCM4P10VLCInfo*pDstVLCInfo
+);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /** end of #define _OMXVC_H_ */
+
+/** EOF */
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h
new file mode 100755
index 0000000..89f3040
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h
@@ -0,0 +1,129 @@
+;/******************************************************************************
+;// Copyright (c) 1999-2005 The Khronos Group Inc. All Rights Reserved
+;//
+;//
+;//
+;//
+;//
+;//
+;//
+;//
+;******************************************************************************/
+
+;/** =============== Structure Definition for Sample Generation ============== */
+;/** transparent status */
+
+;enum {
+OMX_VIDEO_TRANSPARENT	EQU 0;	/** Wholly transparent */
+OMX_VIDEO_PARTIAL		EQU 1;	/** Partially transparent */
+OMX_VIDEO_OPAQUE		EQU 2;	/** Opaque */
+;}
+
+;/** direction */
+;enum {
+OMX_VIDEO_NONE			EQU 0;
+OMX_VIDEO_HORIZONTAL	EQU 1;
+OMX_VIDEO_VERTICAL		EQU 2;
+;}
+
+;/** bilinear interpolation type */
+;enum {
+OMX_VIDEO_INTEGER_PIXEL EQU 0;	/** case a */
+OMX_VIDEO_HALF_PIXEL_X  EQU 1;	/** case b */
+OMX_VIDEO_HALF_PIXEL_Y  EQU 2;	/** case c */
+OMX_VIDEO_HALF_PIXEL_XY EQU 3;	/** case d */
+;}
+
+;enum {
+OMX_UPPER  				EQU 1;		/** set if the above macroblock is available */
+OMX_LEFT   				EQU 2;		/** set if the left macroblock is available */
+OMX_CENTER 				EQU 4;
+OMX_RIGHT				EQU 8;
+OMX_LOWER  				EQU	16;
+OMX_UPPER_LEFT  		EQU 32;		/** set if the above-left macroblock is available */
+OMX_UPPER_RIGHT 		EQU 64;		/** set if the above-right macroblock is available */
+OMX_LOWER_LEFT  		EQU 128;
+OMX_LOWER_RIGHT 		EQU 256
+;}
+
+;enum {
+OMX_VIDEO_LUMINANCE  	EQU 0;	/** Luminance component */
+OMX_VIDEO_CHROMINANCE  	EQU 1;	/** chrominance component */
+OMX_VIDEO_ALPHA  		EQU 2;	/** Alpha component */
+;}
+
+;enum {
+OMX_VIDEO_INTER			EQU 0;	/** P picture or P-VOP */
+OMX_VIDEO_INTER_Q		EQU 1;	/** P picture or P-VOP */
+OMX_VIDEO_INTER4V		EQU 2;	/** P picture or P-VOP */
+OMX_VIDEO_INTRA			EQU 3;	/** I and P picture; I- and P-VOP */
+OMX_VIDEO_INTRA_Q		EQU 4;	/** I and P picture; I- and P-VOP */
+OMX_VIDEO_INTER4V_Q		EQU 5;	/** P picture or P-VOP (H.263)*/
+OMX_VIDEO_DIRECT		EQU 6;	/** B picture or B-VOP (MPEG-4 only) */
+OMX_VIDEO_INTERPOLATE	EQU 7;	/** B picture or B-VOP */
+OMX_VIDEO_BACKWARD		EQU 8;	/** B picture or B-VOP */
+OMX_VIDEO_FORWARD		EQU 9;	/** B picture or B-VOP */
+OMX_VIDEO_NOTCODED		EQU 10;	/** B picture or B-VOP */
+;}
+
+;enum {
+OMX_16X16_VERT 			EQU 0;		/** Intra_16x16_Vertical (prediction mode) */
+OMX_16X16_HOR 			EQU 1;		/** Intra_16x16_Horizontal (prediction mode) */
+OMX_16X16_DC 			EQU 2;		/** Intra_16x16_DC (prediction mode) */
+OMX_16X16_PLANE 		EQU 3;	/** Intra_16x16_Plane (prediction mode) */
+;}
+
+;enum {
+OMX_4x4_VERT 			EQU 0;		/** Intra_4x4_Vertical (prediction mode) */
+OMX_4x4_HOR  			EQU 1;		/** Intra_4x4_Horizontal (prediction mode) */
+OMX_4x4_DC   			EQU 2;		/** Intra_4x4_DC (prediction mode) */
+OMX_4x4_DIAG_DL 		EQU 3;	/** Intra_4x4_Diagonal_Down_Left (prediction mode) */
+OMX_4x4_DIAG_DR 		EQU 4;	/** Intra_4x4_Diagonal_Down_Right (prediction mode) */
+OMX_4x4_VR 				EQU 5;			/** Intra_4x4_Vertical_Right (prediction mode) */
+OMX_4x4_HD 				EQU 6;			/** Intra_4x4_Horizontal_Down (prediction mode) */
+OMX_4x4_VL 				EQU 7;			/** Intra_4x4_Vertical_Left (prediction mode) */
+OMX_4x4_HU 				EQU 8;			/** Intra_4x4_Horizontal_Up (prediction mode) */
+;}
+
+;enum {
+OMX_CHROMA_DC 			EQU 0;		/** Intra_Chroma_DC (prediction mode) */
+OMX_CHROMA_HOR 			EQU 1;		/** Intra_Chroma_Horizontal (prediction mode) */
+OMX_CHROMA_VERT 		EQU 2;	/** Intra_Chroma_Vertical (prediction mode) */
+OMX_CHROMA_PLANE 		EQU 3;	/** Intra_Chroma_Plane (prediction mode) */
+;}
+
+;typedef	struct {	
+x	EQU	0;
+y	EQU	4;
+;}OMXCoordinate;
+
+;typedef struct {
+dx	EQU	0;
+dy	EQU	2;
+;}OMXMotionVector;
+
+;typedef struct {
+xx		EQU	0;
+yy		EQU	4;
+width	EQU	8;
+height	EQU	12;
+;}OMXiRect;
+
+;typedef enum {
+OMX_VC_INTER         EQU 0;        /** P picture or P-VOP */
+OMX_VC_INTER_Q       EQU 1;       /** P picture or P-VOP */
+OMX_VC_INTER4V       EQU 2;       /** P picture or P-VOP */
+OMX_VC_INTRA         EQU 3;        /** I and P picture, I- and P-VOP */
+OMX_VC_INTRA_Q       EQU 4;       /** I and P picture, I- and P-VOP */
+OMX_VC_INTER4V_Q     EQU 5;    /** P picture or P-VOP (H.263)*/
+;} OMXVCM4P2MacroblockType;
+
+;enum {
+OMX_VC_NONE          EQU 0
+OMX_VC_HORIZONTAL    EQU 1
+OMX_VC_VERTICAL      EQU 2 
+;};
+
+
+	END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s
new file mode 100755
index 0000000..296d59d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s
@@ -0,0 +1,95 @@
+ ;/**
+ ; * Function: omxVCCOMM_Copy16x16
+ ; *
+ ; * Description:
+ ; * Copies the reference 16x16 block to the current block.
+ ; * Parameters:
+ ; * [in] pSrc         - pointer to the reference block in the source frame; must be aligned on an 16-byte boundary.
+ ; * [in] step         - distance between the starts of consecutive lines in the reference frame, in bytes;
+ ; *                     must be a multiple of 16 and must be larger than or equal to 16.
+ ; * [out] pDst        - pointer to the destination block; must be aligned on an 8-byte boundary.
+ ; * Return Value:
+ ; * OMX_Sts_NoErr     - no error
+ ; * OMX_Sts_BadArgErr - bad arguments; returned under any of the following conditions:
+ ; *                   - one or more of the following pointers is NULL:  pSrc, pDst
+ ; *                   - one or more of the following pointers is not aligned on an 16-byte boundary:  pSrc, pDst
+ ; *                   - step <16 or step is not a multiple of 16.  
+ ; */
+
+   INCLUDE omxtypes_s.h
+   
+     
+     M_VARIANTS CortexA8
+     
+     IF CortexA8
+     
+     
+ ;//Input Arguments
+pSrc    RN 0        
+pDst    RN 1        
+step    RN 2
+
+;//Local Variables
+Return  RN 0
+;// Neon Registers
+
+X0      DN D0.S8 
+X1      DN D1.S8 
+X2      DN D2.S8
+X3      DN D3.S8
+X4      DN D4.S8
+X5      DN D5.S8
+X6      DN D6.S8
+X7      DN D7.S8 
+ 
+     M_START omxVCCOMM_Copy16x16
+         
+        
+        VLD1  {X0,X1},[pSrc@128],step       ;// Load 16 bytes from 16 byte aligned pSrc and pSrc=pSrc + step after loading
+        VLD1  {X2,X3},[pSrc@128],step
+        VLD1  {X4,X5},[pSrc@128],step
+        VLD1  {X6,X7},[pSrc@128],step
+        
+        VST1  {X0,X1,X2,X3},[pDst@128]!     ;// Store 32 bytes to 16 byte aligned pDst   
+        VST1  {X4,X5,X6,X7},[pDst@128]!        
+               
+         
+        VLD1  {X0,X1},[pSrc@128],step
+        VLD1  {X2,X3},[pSrc@128],step
+        VLD1  {X4,X5},[pSrc@128],step
+        VLD1  {X6,X7},[pSrc@128],step
+        
+        VST1  {X0,X1,X2,X3},[pDst@128]!
+        VST1  {X4,X5,X6,X7},[pDst@128]!
+         
+      
+        VLD1  {X0,X1},[pSrc@128],step
+        VLD1  {X2,X3},[pSrc@128],step
+        VLD1  {X4,X5},[pSrc@128],step
+        VLD1  {X6,X7},[pSrc@128],step
+        
+        VST1  {X0,X1,X2,X3},[pDst@128]!              
+        VST1  {X4,X5,X6,X7},[pDst@128]!        
+        
+        
+        VLD1  {X0,X1},[pSrc@128],step
+        VLD1  {X2,X3},[pSrc@128],step
+        VLD1  {X4,X5},[pSrc@128],step
+        VLD1  {X6,X7},[pSrc@128],step
+        
+        VST1  {X0,X1,X2,X3},[pDst@128]!
+        VST1  {X4,X5,X6,X7},[pDst@128]!
+
+        
+        MOV   Return,#OMX_Sts_NoErr
+
+     
+        
+        M_END
+        ENDIF
+
+
+
+        
+        END
+       
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s
new file mode 100755
index 0000000..db9e5ef
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s
@@ -0,0 +1,70 @@
+ ;/**
+ ; * Function: omxVCCOMM_Copy8x8
+ ; *
+ ; * Description:
+ ; * Copies the reference 8x8 block to the current block.
+ ; * Parameters:
+ ; * [in] pSrc         - pointer to the reference block in the source frame; must be aligned on an 8-byte boundary.
+ ; * [in] step         - distance between the starts of consecutive lines in the reference frame, in bytes;
+ ; *                     must be a multiple of 8 and must be larger than or equal to 8.
+ ; * [out] pDst        - pointer to the destination block; must be aligned on an 8-byte boundary.
+ ; * Return Value:
+ ; * OMX_Sts_NoErr     - no error
+ ; * OMX_Sts_BadArgErr - bad arguments; returned under any of the following conditions:
+ ; *                   - one or more of the following pointers is NULL:  pSrc, pDst
+ ; *                   - one or more of the following pointers is not aligned on an 8-byte boundary:  pSrc, pDst
+ ; *                   - step <8 or step is not a multiple of 8.  
+ ; */
+
+   INCLUDE omxtypes_s.h
+   
+     
+     M_VARIANTS CortexA8
+     
+     IF CortexA8
+     
+     
+ ;//Input Arguments
+pSrc    RN 0        
+pDst    RN 1        
+step    RN 2
+
+;//Local Variables
+Count   RN 3
+Return  RN 0
+;// Neon Registers
+
+X0      DN D0.S8 
+X1      DN D1.S8
+X2      DN D2.S8
+X3      DN D3.S8
+     M_START omxVCCOMM_Copy8x8
+        
+            
+        
+        VLD1  {X0},[pSrc],step            ;// Load 8 bytes from 8 byte aligned pSrc, pSrc=pSrc+step after load
+        VLD1  {X1},[pSrc],step
+        VLD1  {X2},[pSrc],step
+        VLD1  {X3},[pSrc],step
+        
+        VST1  {X0,X1},[pDst]!            ;// Store 16 bytes to 8 byte aligned pDst  
+        VST1  {X2,X3},[pDst]!              
+        
+        VLD1  {X0},[pSrc],step
+        VLD1  {X1},[pSrc],step
+        VLD1  {X2},[pSrc],step
+        VLD1  {X3},[pSrc],step
+        
+        VST1  {X0,X1},[pDst]!              
+        VST1  {X2,X3},[pDst]!             
+                
+        MOV   Return,#OMX_Sts_NoErr
+             
+        M_END
+        ENDIF
+
+
+
+        
+        END
+        
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
new file mode 100755
index 0000000..5c5b7d8
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
@@ -0,0 +1,236 @@
+;//
+;// 
+;// File Name:  omxVCCOMM_ExpandFrame_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// This function will Expand Frame boundary pixels into Plane
+;// 
+;// 
+
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+    
+  
+;// Set debugging level        
+DEBUG_ON    SETL {FALSE}
+
+
+    
+    IF CortexA8
+    
+        M_START omxVCCOMM_ExpandFrame_I,r11
+
+;//Input registers
+
+pSrcDstPlane    RN  0
+iFrameWidth     RN  1
+iFrameHeight    RN  2    
+iExpandPels     RN  3
+iPlaneStep      RN  4
+pTop            RN  5
+pBot            RN  6
+pDstTop         RN  7
+pDstBot         RN  8
+pLeft           RN  5
+pRight          RN  6
+pDstLeft        RN  9
+pDstRight       RN  10
+Offset          RN  11
+Temp            RN  14
+Counter         RN  12
+Tmp             RN  7
+;//Output registers
+
+result          RN  0
+;// Neon registers
+qData0          QN  0.U8
+qData1          QN  1.U8
+dData0          DN  0.U8
+dData1          DN  1.U8
+dData2          DN  2.U8
+dData3          DN  3.U8
+
+        ;// Define stack arguments
+        M_ARG       pPlaneStep, 4
+        
+        ;// Load argument from the stack
+        M_LDR       iPlaneStep, pPlaneStep
+        
+        SUB         pTop, pSrcDstPlane, #0              ;// Top row pointer of the frame
+        MUL         Offset, iExpandPels, iPlaneStep     ;// E*Step        
+        SUB         Temp, iFrameHeight, #1              ;// H-1
+        MUL         Temp, iPlaneStep, Temp              ;// (H-1)*Step
+        ADD         pBot, Temp, pSrcDstPlane            ;// BPtr = TPtr + (H-1)*Step
+        MOV         Temp, iFrameWidth                   ;// Outer loop counter
+        
+        ;// Check if pSrcDstPlane and iPlaneStep are 16 byte aligned
+        TST         pSrcDstPlane, #0xf
+        TSTEQ       iPlaneStep, #0xf        
+        BNE         Hor8Loop00
+        
+        ;//
+        ;// Copy top and bottom region of the plane as follows
+        ;// top region = top row elements from the frame
+        ;// bottom region = last row elements from the frame
+        ;//
+
+        ;// Case for 16 byte alignment
+Hor16Loop00
+        SUB         pDstTop, pTop, Offset
+        VLD1        qData0, [pTop @128]!
+        MOV         Counter, iExpandPels                ;// Inner loop counter
+        ADD         pDstBot, pBot, iPlaneStep
+        VLD1        qData1, [pBot @128]!
+Ver16Loop0
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        VST1        qData0, [pDstTop @128], iPlaneStep
+        SUBS        Counter, Counter, #8
+        VST1        qData1, [pDstBot @128], iPlaneStep
+        VST1        qData1, [pDstBot @128], iPlaneStep
+        VST1        qData1, [pDstBot @128], iPlaneStep
+        VST1        qData1, [pDstBot @128], iPlaneStep
+        VST1        qData1, [pDstBot @128], iPlaneStep
+        VST1        qData1, [pDstBot @128], iPlaneStep
+        VST1        qData1, [pDstBot @128], iPlaneStep
+        VST1        qData1, [pDstBot @128], iPlaneStep        
+        BGT         Ver16Loop0
+
+        SUBS        Temp, Temp, #16
+        BGT         Hor16Loop00
+        B           EndAlignedLoop
+        
+        ;// Case for 8 byte alignment
+Hor8Loop00
+        SUB         pDstTop, pTop, Offset
+        VLD1        qData0, [pTop @64]!
+        MOV         Counter, iExpandPels                ;// Inner loop counter
+        ADD         pDstBot, pBot, iPlaneStep
+        VLD1        qData1, [pBot @64]!
+Ver8Loop0
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        VST1        qData0, [pDstTop @64], iPlaneStep
+        SUBS        Counter, Counter, #8
+        VST1        qData1, [pDstBot @64], iPlaneStep
+        VST1        qData1, [pDstBot @64], iPlaneStep
+        VST1        qData1, [pDstBot @64], iPlaneStep
+        VST1        qData1, [pDstBot @64], iPlaneStep
+        VST1        qData1, [pDstBot @64], iPlaneStep
+        VST1        qData1, [pDstBot @64], iPlaneStep
+        VST1        qData1, [pDstBot @64], iPlaneStep
+        VST1        qData1, [pDstBot @64], iPlaneStep        
+        BGT         Ver8Loop0
+
+        SUBS        Temp, Temp, #16
+        BGT         Hor8Loop00
+
+EndAlignedLoop
+        ADD         Temp, pSrcDstPlane, iFrameWidth
+        SUB         pDstRight, Temp, Offset
+        SUB         pRight, Temp, #1
+        SUB         pDstLeft, pSrcDstPlane, Offset    
+        SUB         pDstLeft, pDstLeft, iExpandPels    
+        ADD         pLeft, pSrcDstPlane, #0
+        
+        VLD1        {dData0 []}, [pLeft], iPlaneStep        ;// Top-Left corner pixel from frame duplicated in dData0
+        SUB         Offset, iPlaneStep, iExpandPels
+        VLD1        {dData1 []}, [pRight], iPlaneStep       ;// Top-Right corner pixel from frame duplicated in dData1
+        MOV         Temp, iExpandPels
+
+        ;//
+        ;// Copy top-left and top-right region of the plane as follows
+        ;// top-left region = top-left corner pixel from the frame
+        ;// top-right region = top-right corner pixel from the frame
+        ;//
+HorLoop11
+        MOV         Counter, iExpandPels
+VerLoop1
+        VST1        dData0, [pDstLeft], #8
+        SUBS        Counter, Counter, #8
+        VST1        dData1, [pDstRight], #8        
+        BGT         VerLoop1
+
+        SUBS        Temp, Temp, #1
+        ADD         pDstLeft, pDstLeft, Offset
+        ADD         pDstRight, pDstRight, Offset
+        BPL         HorLoop11
+
+        SUB         iFrameHeight, iFrameHeight, #1
+        ;//
+        ;// Copy left and right region of the plane as follows
+        ;// Left region = copy the row with left start pixel from the frame
+        ;// Right region = copy the row with right end pixel from the frame
+        ;//
+HorLoop22
+        VLD1        {dData0 []}, [pLeft], iPlaneStep
+        MOV         Counter, iExpandPels
+        VLD1        {dData1 []}, [pRight], iPlaneStep
+VerLoop2
+        VST1        dData0, [pDstLeft], #8
+        SUBS        Counter, Counter, #8
+        VST1        dData1, [pDstRight], #8        
+        BGT         VerLoop2
+
+        SUBS        iFrameHeight, iFrameHeight, #1
+        ADD         pDstLeft, pDstLeft, Offset
+        ADD         pDstRight, pDstRight, Offset
+        BGT         HorLoop22
+                
+        MOV         Temp, iExpandPels
+        ;//
+        ;// Copy bottom-left and bottom-right region of the plane as follows
+        ;// bottom-left region = bottom-left corner pixel from the frame
+        ;// bottom-right region = bottom-right corner pixel from the frame
+        ;//
+HorLoop33
+        MOV         Counter, iExpandPels
+VerLoop3
+        VST1        dData0, [pDstLeft], #8
+        SUBS        Counter, Counter, #8
+        VST1        dData1, [pDstRight], #8        
+        BGT         VerLoop3
+
+        SUBS        Temp, Temp, #1
+        ADD         pDstLeft, pDstLeft, Offset
+        ADD         pDstRight, pDstRight, Offset
+        BGT         HorLoop33
+End
+        MOV         r0, #OMX_Sts_NoErr
+        
+        M_END    
+    
+    ENDIF
+
+
+
+        
+;// Guarding implementation by the processor name
+    
+ 
+            
+    END
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h
new file mode 100755
index 0000000..547a2d9
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------
+ * 
+ * 
+ * File Name:  armVCM4P10_CAVLCTables.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * 
+ * Header file for optimized H.264 CALVC tables
+ * 
+ */
+ 
+#ifndef ARMVCM4P10_CAVLCTABLES_H
+#define ARMVCM4P10_CAVLCTABLES_H
+  
+/* CAVLC tables */
+
+extern const OMX_U16 *armVCM4P10_CAVLCCoeffTokenTables[18];
+extern const OMX_U16 *armVCM4P10_CAVLCTotalZeroTables[15];
+extern const OMX_U16 *armVCM4P10_CAVLCTotalZeros2x2Tables[3];
+extern const OMX_U16 *armVCM4P10_CAVLCRunBeforeTables[15];
+extern const OMX_U8 armVCM4P10_ZigZag_4x4[16];
+extern const OMX_U8 armVCM4P10_ZigZag_2x2[4];
+extern const OMX_S8 armVCM4P10_SuffixToLevel[7];
+
+#endif
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
new file mode 100755
index 0000000..4f0892d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
@@ -0,0 +1,222 @@
+;//
+;// 
+;// File Name:  armVCM4P10_Average_4x_Align_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+
+;// Functions:
+;//     armVCM4P10_Average_4x4_Align<ALIGNMENT>_unsafe  
+;//
+;// Implements Average of 4x4 with equation c = (a+b+1)>>1.
+;// First operand will be at offset ALIGNMENT from aligned address
+;// Second operand will be at aligned location and will be used as output.
+;// destination pointed by (pDst) for vertical interpolation.
+;// This function needs to copy 4 bytes in horizontal direction 
+;//
+;// Registers used as input for this function
+;// r0,r1,r2,r3 where r2 containings aligned memory pointer and r3 step size
+;//
+;// Registers preserved for top level function
+;// r4,r5,r6,r8,r9,r14
+;//
+;// Registers modified by the function
+;// r7,r10,r11,r12
+;//
+;// Output registers
+;// r2 - pointer to the aligned location
+;// r3 - step size to this aligned location
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS ARM1136JS
+
+        EXPORT armVCM4P10_Average_4x4_Align0_unsafe
+        EXPORT armVCM4P10_Average_4x4_Align2_unsafe
+        EXPORT armVCM4P10_Average_4x4_Align3_unsafe
+
+DEBUG_ON    SETL {FALSE}
+
+;// Declare input registers
+pPred0          RN 0
+iPredStep0      RN 1
+pPred1          RN 2
+iPredStep1      RN 3
+pDstPred        RN 2
+iDstStep        RN 3
+
+;// Declare other intermediate registers
+iPredA0         RN 10
+iPredA1         RN 11
+iPredB0         RN 12
+iPredB1         RN 14
+Temp1           RN 4
+Temp2           RN 5
+ResultA         RN 5
+ResultB         RN 4
+r0x80808080     RN 7
+
+    IF ARM1136JS
+        
+        ;// This function calculates average of 4x4 block 
+        ;// pPred0 is at alignment offset 0 and pPred1 is alignment 4
+
+        ;// Function header
+        M_START armVCM4P10_Average_4x4_Align0_unsafe, r6
+
+        ;// Code start        
+        LDR         r0x80808080, =0x80808080
+
+        ;// 1st load
+        M_LDR       iPredB0, [pPred1]
+        M_LDR       iPredA0, [pPred0], iPredStep0        
+        M_LDR       iPredB1, [pPred1, iPredStep1]
+        M_LDR       iPredA1, [pPred0], iPredStep0
+
+        ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
+        MVN         iPredB0, iPredB0
+        MVN         iPredB1, iPredB1
+        UHSUB8      ResultA, iPredA0, iPredB0
+        UHSUB8      ResultB, iPredA1, iPredB1
+        EOR         ResultA, ResultA, r0x80808080
+        M_STR       ResultA, [pDstPred], iDstStep        
+        EOR         ResultB, ResultB, r0x80808080
+        M_STR       ResultB, [pDstPred], iDstStep        
+        
+        ;// 2nd load
+        M_LDR       iPredA0, [pPred0], iPredStep0        
+        M_LDR       iPredB0, [pPred1]
+        M_LDR       iPredA1, [pPred0], iPredStep0
+        M_LDR       iPredB1, [pPred1, iPredStep1]
+
+        MVN         iPredB0, iPredB0
+        UHSUB8      ResultA, iPredA0, iPredB0
+        MVN         iPredB1, iPredB1
+        UHSUB8      ResultB, iPredA1, iPredB1
+        EOR         ResultA, ResultA, r0x80808080        
+        M_STR       ResultA, [pDstPred], iDstStep        
+        EOR         ResultB, ResultB, r0x80808080
+        M_STR       ResultB, [pDstPred], iDstStep                
+End0
+        M_END
+
+        ;// This function calculates average of 4x4 block 
+        ;// pPred0 is at alignment offset 2 and pPred1 is alignment 4
+
+        ;// Function header
+        M_START armVCM4P10_Average_4x4_Align2_unsafe, r6
+
+        ;// Code start        
+        LDR         r0x80808080, =0x80808080
+
+        ;// 1st load
+        LDR         Temp1, [pPred0, #4]
+        M_LDR       iPredA0, [pPred0], iPredStep0        
+        M_LDR       iPredB0, [pPred1]
+        M_LDR       iPredB1, [pPred1, iPredStep1]
+        M_LDR       Temp2, [pPred0, #4]
+        M_LDR       iPredA1, [pPred0], iPredStep0
+        MVN         iPredB0, iPredB0
+        MVN         iPredB1, iPredB1        
+        MOV         iPredA0, iPredA0, LSR #16
+        ORR         iPredA0, iPredA0, Temp1, LSL #16        
+        MOV         iPredA1, iPredA1, LSR #16
+        ORR         iPredA1, iPredA1, Temp2, LSL #16
+
+        ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
+        UHSUB8      ResultA, iPredA0, iPredB0
+        UHSUB8      ResultB, iPredA1, iPredB1
+        EOR         ResultA, ResultA, r0x80808080
+        M_STR       ResultA, [pDstPred], iDstStep        
+        EOR         ResultB, ResultB, r0x80808080
+        M_STR       ResultB, [pDstPred], iDstStep        
+        
+        ;// 2nd load
+        LDR         Temp1, [pPred0, #4]
+        M_LDR         iPredA0, [pPred0], iPredStep0        
+        LDR         iPredB0, [pPred1]
+        LDR         iPredB1, [pPred1, iPredStep1]
+        LDR         Temp2, [pPred0, #4]
+        M_LDR         iPredA1, [pPred0], iPredStep0
+        MVN         iPredB0, iPredB0
+        MVN         iPredB1, iPredB1
+        MOV         iPredA0, iPredA0, LSR #16
+        ORR         iPredA0, iPredA0, Temp1, LSL #16        
+        MOV         iPredA1, iPredA1, LSR #16
+        ORR         iPredA1, iPredA1, Temp2, LSL #16
+
+        UHSUB8      ResultA, iPredA0, iPredB0
+        UHSUB8      ResultB, iPredA1, iPredB1
+        EOR         ResultA, ResultA, r0x80808080        
+        M_STR       ResultA, [pDstPred], iDstStep        
+        EOR         ResultB, ResultB, r0x80808080
+        M_STR       ResultB, [pDstPred], iDstStep                
+End2
+        M_END
+
+
+        ;// This function calculates average of 4x4 block 
+        ;// pPred0 is at alignment offset 3 and pPred1 is alignment 4
+
+        ;// Function header
+        M_START armVCM4P10_Average_4x4_Align3_unsafe, r6
+
+        ;// Code start        
+        LDR         r0x80808080, =0x80808080
+
+        ;// 1st load
+        LDR         Temp1, [pPred0, #4]
+        M_LDR       iPredA0, [pPred0], iPredStep0        
+        LDR         iPredB0, [pPred1]
+        LDR         iPredB1, [pPred1, iPredStep1]
+        LDR         Temp2, [pPred0, #4]
+        M_LDR       iPredA1, [pPred0], iPredStep0
+
+        MVN         iPredB0, iPredB0
+        MVN         iPredB1, iPredB1
+        MOV         iPredA0, iPredA0, LSR #24
+        ORR         iPredA0, iPredA0, Temp1, LSL #8                
+        MOV         iPredA1, iPredA1, LSR #24
+        ORR         iPredA1, iPredA1, Temp2, LSL #8
+        UHSUB8      ResultA, iPredA0, iPredB0
+        UHSUB8      ResultB, iPredA1, iPredB1
+        EOR         ResultA, ResultA, r0x80808080
+        M_STR       ResultA, [pDstPred], iDstStep        
+        EOR         ResultB, ResultB, r0x80808080
+        M_STR       ResultB, [pDstPred], iDstStep        
+        
+        ;// 2nd load
+        LDR         Temp1, [pPred0, #4]
+        M_LDR       iPredA0, [pPred0], iPredStep0        
+        LDR         iPredB0, [pPred1]
+        LDR         iPredB1, [pPred1, iPredStep1]
+        LDR         Temp2, [pPred0, #4]
+        M_LDR       iPredA1, [pPred0], iPredStep0
+
+        MVN         iPredB0, iPredB0
+        MVN         iPredB1, iPredB1
+        MOV         iPredA0, iPredA0, LSR #24
+        ORR         iPredA0, iPredA0, Temp1, LSL #8        
+        MOV         iPredA1, iPredA1, LSR #24
+        ORR         iPredA1, iPredA1, Temp2, LSL #8
+
+        UHSUB8      ResultA, iPredA0, iPredB0
+        UHSUB8      ResultB, iPredA1, iPredB1
+        EOR         ResultA, ResultA, r0x80808080        
+        M_STR       ResultA, [pDstPred], iDstStep        
+        EOR         ResultB, ResultB, r0x80808080
+        M_STR       ResultB, [pDstPred], iDstStep                
+End3
+        M_END
+
+    ENDIF
+    
+    END
+    
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c
new file mode 100755
index 0000000..137495d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c
@@ -0,0 +1,327 @@
+/* ----------------------------------------------------------------
+ *
+ * 
+ * File Name:  armVCM4P10_CAVLCTables.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * 
+ * Optimized CAVLC tables for H.264
+ * 
+ */
+ 
+#include "omxtypes.h"
+#include "armOMX.h"
+
+#include "armVCM4P10_CAVLCTables.h"
+
+/* 4x4 DeZigZag table */
+
+const OMX_U8 armVCM4P10_ZigZag_4x4[16] =
+{
+    0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+/* 2x2 DeZigZag table */
+
+const OMX_U8 armVCM4P10_ZigZag_2x2[4] =
+{
+    0, 1, 2, 3
+};
+
+
+/*
+ * Suffix To Level table
+ * We increment the suffix length if 
+ * ((LevelCode>>1)+1)>(3<<(SuffixLength-1)) && SuffixLength<6
+ * (LevelCode>>1)>=(3<<(SuffixLength-1))    && SuffixLength<6
+ *  LevelCode    >= 3<<SuffixLength         && SuffixLength<6
+ * (LevelCode+2) >= (3<<SuffixLength)+2     && SuffixLength<6
+ */
+const OMX_S8 armVCM4P10_SuffixToLevel[7] =
+{
+    (3<<1)+2,       /* SuffixLength=1 */
+    (3<<1)+2,       /* SuffixLength=1 */
+    (3<<2)+2,       /* SuffixLength=2 */
+    (3<<3)+2,       /* SuffixLength=3 */
+    (3<<4)+2,       /* SuffixLength=4 */
+    (3<<5)+2,       /* SuffixLength=5 */
+    -1              /* SuffixLength=6 - never increment */
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_0[132] = {
+    0x0020, 0x0100, 0x2015, 0x2015, 0x400b, 0x400b, 0x400b, 0x400b,
+    0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001,
+    0x0028, 0x00f0, 0x00f8, 0x0027, 0x0030, 0x00d8, 0x00e0, 0x00e8,
+    0x0038, 0x00a0, 0x00c8, 0x00d0, 0x0040, 0x0068, 0x0090, 0x0098,
+    0x0048, 0x0050, 0x0058, 0x0060, 0x27ff, 0x27ff, 0x206b, 0x206b,
+    0x0081, 0x0085, 0x0083, 0x0079, 0x0087, 0x007d, 0x007b, 0x0071,
+    0x007f, 0x0075, 0x0073, 0x0069, 0x0070, 0x0078, 0x0080, 0x0088,
+    0x2077, 0x2077, 0x206d, 0x206d, 0x2063, 0x2063, 0x2061, 0x2061,
+    0x206f, 0x206f, 0x2065, 0x2065, 0x205b, 0x205b, 0x2059, 0x2059,
+    0x0067, 0x005d, 0x0053, 0x0051, 0x005f, 0x0055, 0x004b, 0x0049,
+    0x00a8, 0x00b0, 0x00b8, 0x00c0, 0x2041, 0x2041, 0x204d, 0x204d,
+    0x2043, 0x2043, 0x2039, 0x2039, 0x2057, 0x2057, 0x2045, 0x2045,
+    0x203b, 0x203b, 0x2031, 0x2031, 0x204f, 0x204f, 0x203d, 0x203d,
+    0x2033, 0x2033, 0x2029, 0x2029, 0x0047, 0x0035, 0x002b, 0x0021,
+    0x203f, 0x203f, 0x202d, 0x202d, 0x2023, 0x2023, 0x2019, 0x2019,
+    0x0037, 0x0025, 0x001b, 0x0011, 0x202f, 0x202f, 0x201d, 0x201d,
+    0x0013, 0x0009, 0x201f, 0x201f
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_1[128] = {
+    0x0020, 0x00e8, 0x00f0, 0x00f8, 0x0027, 0x001f, 0x2015, 0x2015,
+    0x400b, 0x400b, 0x400b, 0x400b, 0x4001, 0x4001, 0x4001, 0x4001,
+    0x0028, 0x00d0, 0x00d8, 0x00e0, 0x0030, 0x0098, 0x00c0, 0x00c8,
+    0x0038, 0x0060, 0x0088, 0x0090, 0x0040, 0x0048, 0x0050, 0x0058,
+    0x27ff, 0x27ff, 0x207f, 0x207f, 0x0087, 0x0085, 0x0083, 0x0081,
+    0x007b, 0x0079, 0x007d, 0x0073, 0x2075, 0x2075, 0x2071, 0x2071,
+    0x0068, 0x0070, 0x0078, 0x0080, 0x2077, 0x2077, 0x206d, 0x206d,
+    0x206b, 0x206b, 0x2069, 0x2069, 0x206f, 0x206f, 0x2065, 0x2065,
+    0x2063, 0x2063, 0x2061, 0x2061, 0x0059, 0x005d, 0x005b, 0x0051,
+    0x0067, 0x0055, 0x0053, 0x0049, 0x00a0, 0x00a8, 0x00b0, 0x00b8,
+    0x205f, 0x205f, 0x204d, 0x204d, 0x204b, 0x204b, 0x2041, 0x2041,
+    0x2057, 0x2057, 0x2045, 0x2045, 0x2043, 0x2043, 0x2039, 0x2039,
+    0x204f, 0x204f, 0x203d, 0x203d, 0x203b, 0x203b, 0x2031, 0x2031,
+    0x0029, 0x0035, 0x0033, 0x0021, 0x2047, 0x2047, 0x202d, 0x202d,
+    0x202b, 0x202b, 0x2019, 0x2019, 0x003f, 0x0025, 0x0023, 0x0011,
+    0x0037, 0x001d, 0x001b, 0x0009, 0x202f, 0x202f, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_2[112] = {
+    0x0020, 0x0088, 0x00b0, 0x00b8, 0x00c0, 0x00c8, 0x00d0, 0x00d8,
+    0x003f, 0x0037, 0x002f, 0x0027, 0x001f, 0x0015, 0x000b, 0x0001,
+    0x0028, 0x0050, 0x0078, 0x0080, 0x0030, 0x0038, 0x0040, 0x0048,
+    0x07ff, 0x0081, 0x0087, 0x0085, 0x0083, 0x0079, 0x007f, 0x007d,
+    0x007b, 0x0071, 0x0077, 0x0075, 0x0073, 0x0069, 0x206b, 0x206b,
+    0x0058, 0x0060, 0x0068, 0x0070, 0x2061, 0x2061, 0x206d, 0x206d,
+    0x2063, 0x2063, 0x2059, 0x2059, 0x206f, 0x206f, 0x2065, 0x2065,
+    0x205b, 0x205b, 0x2051, 0x2051, 0x0067, 0x005d, 0x0053, 0x0049,
+    0x005f, 0x0055, 0x004b, 0x0041, 0x0090, 0x0098, 0x00a0, 0x00a8,
+    0x2039, 0x2039, 0x2031, 0x2031, 0x204d, 0x204d, 0x2029, 0x2029,
+    0x2057, 0x2057, 0x2045, 0x2045, 0x2043, 0x2043, 0x2021, 0x2021,
+    0x0019, 0x003d, 0x003b, 0x0011, 0x004f, 0x0035, 0x0033, 0x0009,
+    0x202b, 0x202b, 0x202d, 0x202d, 0x2023, 0x2023, 0x2025, 0x2025,
+    0x201b, 0x201b, 0x2047, 0x2047, 0x201d, 0x201d, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_3[80] = {
+    0x0020, 0x0028, 0x0030, 0x0038, 0x0040, 0x0048, 0x0050, 0x0058,
+    0x0060, 0x0068, 0x0070, 0x0078, 0x0080, 0x0088, 0x0090, 0x0098,
+    0x0009, 0x000b, 0x07ff, 0x0001, 0x0011, 0x0013, 0x0015, 0x07ff,
+    0x0019, 0x001b, 0x001d, 0x001f, 0x0021, 0x0023, 0x0025, 0x0027,
+    0x0029, 0x002b, 0x002d, 0x002f, 0x0031, 0x0033, 0x0035, 0x0037,
+    0x0039, 0x003b, 0x003d, 0x003f, 0x0041, 0x0043, 0x0045, 0x0047,
+    0x0049, 0x004b, 0x004d, 0x004f, 0x0051, 0x0053, 0x0055, 0x0057,
+    0x0059, 0x005b, 0x005d, 0x005f, 0x0061, 0x0063, 0x0065, 0x0067,
+    0x0069, 0x006b, 0x006d, 0x006f, 0x0071, 0x0073, 0x0075, 0x0077,
+    0x0079, 0x007b, 0x007d, 0x007f, 0x0081, 0x0083, 0x0085, 0x0087
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_4[32] = {
+    0x0020, 0x0038, 0x2015, 0x2015, 0x4001, 0x4001, 0x4001, 0x4001,
+    0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b,
+    0x0028, 0x0030, 0x0021, 0x0019, 0x2027, 0x2027, 0x0025, 0x0023,
+    0x201d, 0x201d, 0x201b, 0x201b, 0x0011, 0x001f, 0x0013, 0x0009
+};
+
+const OMX_U16 * armVCM4P10_CAVLCCoeffTokenTables[18] = {
+    armVCM4P10_CAVLCCoeffTokenTables_0, /* nC=0 */
+    armVCM4P10_CAVLCCoeffTokenTables_0, /* nC=1 */
+    armVCM4P10_CAVLCCoeffTokenTables_1, /* nC=2 */
+    armVCM4P10_CAVLCCoeffTokenTables_1, /* nC=3 */
+    armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=4 */
+    armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=5 */
+    armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=6 */
+    armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=7 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=8 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=9 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=10 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=11 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=12 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=13 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=14 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=15 */
+    armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=16 */
+    armVCM4P10_CAVLCCoeffTokenTables_4  /* nC=-1 */
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_0[40] = {
+    0x0020, 0x0048, 0x0009, 0x0007, 0x2005, 0x2005, 0x2003, 0x2003,
+    0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001,
+    0x0028, 0x0040, 0x0011, 0x000f, 0x0030, 0x0038, 0x0019, 0x0017,
+    0x27ff, 0x27ff, 0x201f, 0x201f, 0x201d, 0x201d, 0x201b, 0x201b,
+    0x2015, 0x2015, 0x2013, 0x2013, 0x200d, 0x200d, 0x200b, 0x200b
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_1[24] = {
+    0x0020, 0x0028, 0x0011, 0x000f, 0x000d, 0x000b, 0x2009, 0x2009,
+    0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001,
+    0x001d, 0x001b, 0x0019, 0x0017, 0x2015, 0x2015, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_2[24] = {
+    0x0020, 0x0028, 0x0011, 0x000b, 0x0009, 0x0001, 0x200f, 0x200f,
+    0x200d, 0x200d, 0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003,
+    0x001b, 0x0017, 0x2019, 0x2019, 0x2015, 0x2015, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_3[24] = {
+    0x0020, 0x0028, 0x0013, 0x000f, 0x0007, 0x0005, 0x2011, 0x2011,
+    0x200d, 0x200d, 0x200b, 0x200b, 0x2009, 0x2009, 0x2003, 0x2003,
+    0x2019, 0x2019, 0x2017, 0x2017, 0x2015, 0x2015, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_4[20] = {
+    0x0020, 0x0015, 0x0011, 0x0005, 0x0003, 0x0001, 0x200f, 0x200f,
+    0x200d, 0x200d, 0x200b, 0x200b, 0x2009, 0x2009, 0x2007, 0x2007,
+    0x2017, 0x2017, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_5[20] = {
+    0x0020, 0x0011, 0x2013, 0x2013, 0x200f, 0x200f, 0x200d, 0x200d,
+    0x200b, 0x200b, 0x2009, 0x2009, 0x2007, 0x2007, 0x2005, 0x2005,
+    0x0015, 0x0001, 0x2003, 0x2003
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_6[20] = {
+    0x0020, 0x000f, 0x2011, 0x2011, 0x200d, 0x200d, 0x2009, 0x2009,
+    0x2007, 0x2007, 0x2005, 0x2005, 0x400b, 0x400b, 0x400b, 0x400b,
+    0x0013, 0x0001, 0x2003, 0x2003
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_7[20] = {
+    0x0020, 0x0003, 0x200f, 0x200f, 0x200d, 0x200d, 0x2007, 0x2007,
+    0x400b, 0x400b, 0x400b, 0x400b, 0x4009, 0x4009, 0x4009, 0x4009,
+    0x0011, 0x0001, 0x2005, 0x2005
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_8[20] = {
+    0x0020, 0x0005, 0x200b, 0x200b, 0x400d, 0x400d, 0x400d, 0x400d,
+    0x4009, 0x4009, 0x4009, 0x4009, 0x4007, 0x4007, 0x4007, 0x4007,
+    0x0003, 0x0001, 0x200f, 0x200f
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_9[20] = {
+    0x0020, 0x000d, 0x2005, 0x2005, 0x400b, 0x400b, 0x400b, 0x400b,
+    0x4009, 0x4009, 0x4009, 0x4009, 0x4007, 0x4007, 0x4007, 0x4007,
+    0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_10[16] = {
+    0x0001, 0x0003, 0x2005, 0x2005, 0x2007, 0x2007, 0x200b, 0x200b,
+    0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_11[16] = {
+    0x0001, 0x0003, 0x2009, 0x2009, 0x4005, 0x4005, 0x4005, 0x4005,
+    0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_12[16] = {
+    0x2001, 0x2001, 0x2003, 0x2003, 0x4007, 0x4007, 0x4007, 0x4007,
+    0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_13[16] = {
+    0x4001, 0x4001, 0x4001, 0x4001, 0x4003, 0x4003, 0x4003, 0x4003,
+    0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_14[16] = {
+    0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001,
+    0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003
+};
+
+const OMX_U16 * armVCM4P10_CAVLCTotalZeroTables[15] = {
+    armVCM4P10_CAVLCTotalZeroTables_0,
+    armVCM4P10_CAVLCTotalZeroTables_1,
+    armVCM4P10_CAVLCTotalZeroTables_2,
+    armVCM4P10_CAVLCTotalZeroTables_3,
+    armVCM4P10_CAVLCTotalZeroTables_4,
+    armVCM4P10_CAVLCTotalZeroTables_5,
+    armVCM4P10_CAVLCTotalZeroTables_6,
+    armVCM4P10_CAVLCTotalZeroTables_7,
+    armVCM4P10_CAVLCTotalZeroTables_8,
+    armVCM4P10_CAVLCTotalZeroTables_9,
+    armVCM4P10_CAVLCTotalZeroTables_10,
+    armVCM4P10_CAVLCTotalZeroTables_11,
+    armVCM4P10_CAVLCTotalZeroTables_12,
+    armVCM4P10_CAVLCTotalZeroTables_13,
+    armVCM4P10_CAVLCTotalZeroTables_14
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_0[16] = {
+    0x2007, 0x2007, 0x2005, 0x2005, 0x4003, 0x4003, 0x4003, 0x4003,
+    0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_1[16] = {
+    0x4005, 0x4005, 0x4005, 0x4005, 0x4003, 0x4003, 0x4003, 0x4003,
+    0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_2[16] = {
+    0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003,
+    0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001
+};
+
+const OMX_U16 * armVCM4P10_CAVLCTotalZeros2x2Tables[3] = {
+    armVCM4P10_CAVLCTotalZeros2x2Tables_0,
+    armVCM4P10_CAVLCTotalZeros2x2Tables_1,
+    armVCM4P10_CAVLCTotalZeros2x2Tables_2
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_0[8] = {
+    0x4003, 0x4003, 0x4003, 0x4003, 0x4001, 0x4001, 0x4001, 0x4001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_1[8] = {
+    0x2005, 0x2005, 0x2003, 0x2003, 0x4001, 0x4001, 0x4001, 0x4001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_2[8] = {
+    0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_3[8] = {
+    0x0009, 0x0007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_4[8] = {
+    0x000b, 0x0009, 0x0007, 0x0005, 0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_5[8] = {
+    0x0003, 0x0005, 0x0009, 0x0007, 0x000d, 0x000b, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_6[24] = {
+    0x0010, 0x000d, 0x000b, 0x0009, 0x0007, 0x0005, 0x0003, 0x0001,
+    0x0018, 0x0011, 0x200f, 0x200f, 0x0020, 0x0015, 0x2013, 0x2013,
+    0x0028, 0x0019, 0x2017, 0x2017, 0x07ff, 0x001d, 0x201b, 0x201b
+};
+
+/* Tables 7 to 14 are duplicates of table 6 */
+
+const OMX_U16 * armVCM4P10_CAVLCRunBeforeTables[15] = {
+    armVCM4P10_CAVLCRunBeforeTables_0,  /* ZerosLeft=1 */
+    armVCM4P10_CAVLCRunBeforeTables_1,
+    armVCM4P10_CAVLCRunBeforeTables_2,
+    armVCM4P10_CAVLCRunBeforeTables_3,
+    armVCM4P10_CAVLCRunBeforeTables_4,
+    armVCM4P10_CAVLCRunBeforeTables_5,  /* ZerosLeft=6 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=7 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=8 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=9 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=10 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=11 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=12 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=13 */
+    armVCM4P10_CAVLCRunBeforeTables_6,  /* ZerosLeft=14 */
+    armVCM4P10_CAVLCRunBeforeTables_6   /* ZerosLeft=15 */
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
new file mode 100755
index 0000000..4c3a77c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
@@ -0,0 +1,198 @@
+;//
+;// 
+;// File Name:  armVCM4P10_DeblockingChroma_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+
+
+    IF  CortexA8
+        
+pAlpha      RN 2
+pBeta       RN 3
+
+pThresholds RN 5
+pBS         RN 4
+bS3210      RN 6
+
+;// Pixels
+dP_0        DN D4.U8
+dP_1        DN D5.U8  
+dP_2        DN D6.U8  
+dP_3        DN D7.U8  
+dQ_0        DN D8.U8  
+dQ_1        DN D9.U8  
+dQ_2        DN D10.U8 
+dQ_3        DN D11.U8 
+
+
+;// Filtering Decision
+dAlpha      DN D0.U8
+dBeta       DN D2.U8
+
+dFilt       DN D16.U8
+dAqflg      DN D12.U8
+dApflg      DN D17.U8 
+
+dAp0q0      DN D13.U8
+
+;// bSLT4
+dTC3210     DN D18.U8   
+dTCs        DN D31.S8
+dTC         DN D31.U8
+
+dMask_0     DN D14.U8
+dMask_1     DN D15.U8    
+dMask_4     DN D26.U16
+
+dTemp       DN D28.U8
+dDummy      DN D17.U8
+
+;// Computing P0,Q0
+qDq0p0      QN Q10.S16
+qDp1q1      QN Q11.S16
+qDelta      QN Q10.S16  ; reuse qDq0p0
+dDelta      DN D20.S8
+
+
+;// Computing P1,Q1
+qP_0n       QN Q14.S16
+qQ_0n       QN Q12.S16
+
+dQ_0n       DN D24.U8
+dP_0n       DN D29.U8
+
+;// bSGE4
+
+dHSp0q1     DN D13.U8
+dHSq0p1     DN D31.U8   
+
+dBS3210     DN D28.U16
+
+dP_0t       DN D13.U8   ;dHSp0q1        
+dQ_0t       DN D31.U8   ;Temp1        
+
+dP_0n       DN D29.U8
+dQ_0n       DN D24.U8   ;Temp2        
+
+;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe
+;//
+;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
+;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
+;//        - Additional Params  - pThresholds: r5
+;//         
+;// Outputs - Pixels            - P0-P1: D29-D30, Q0-Q1: D24-D25
+;//         - Additional Params - pThresholds: r5
+
+;// Registers Corrupted         - D18-D31
+
+
+        M_START armVCM4P10_DeblockingChromabSLT4_unsafe
+
+        
+        ;dTC3210 -18
+        ;dTemp-28
+
+        VLD1        d18.U32[0], [pThresholds]! ;here
+
+        ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+        ;// dDelta = (qDp1q1 >> 2 + qDq0p0 + 1)>> 1
+
+        ;// qDp1q1-11
+        ;// qDq0p0-10
+        VSUBL       qDp1q1, dP_1, dQ_1      
+        VMOV        dTemp, dTC3210
+        VSUBL       qDq0p0, dQ_0, dP_0      
+        VSHR        qDp1q1, qDp1q1, #2      
+        VZIP.8      dTC3210, dTemp
+    
+        ;// qDelta-qDq0p0-10
+
+        ;// dTC = dTC01 + (dAplg & 1) + (dAqflg & 1)
+
+        ;// dTC3210-18
+        ;// dTemp-28
+        ;// dTC-31
+        VBIF        dTC3210, dMask_0, dFilt
+        VRHADD      qDelta, qDp1q1, qDq0p0  
+        VADD        dTC, dTC3210, dMask_1
+        VQMOVN      dDelta, qDelta
+        ;// dDelta-d20
+
+        ;// dDelta = (OMX_U8)armClip(0, 255, q0 - delta);
+        VLD1        {dAlpha[]}, [pAlpha]
+        VMIN        dDelta, dDelta, dTCs
+        VNEG        dTCs, dTCs
+        VLD1        {dBeta[]}, [pBeta]
+        ;1
+        VMAX        dDelta, dDelta, dTCs
+
+        ;// dP_0n - 29
+        ;// dQ_0n - 24
+
+        ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 - delta);
+        ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta);
+
+        ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 - dDelta);
+        ;// dQ_0n = (OMX_U8)armClip(0, 255, dP_0 - dDelta);
+        
+        ;// qP_0n - 14
+        ;// qQ_0n - 12
+        
+        VMOVL       qP_0n, dP_0
+        VMOVL       qQ_0n, dQ_0
+
+        ;1
+        VADDW       qP_0n, qP_0n, dDelta
+        VSUBW       qQ_0n, qQ_0n, dDelta
+        
+        VQMOVUN     dP_0n, qP_0n
+        VQMOVUN     dQ_0n, qQ_0n
+
+        M_END
+
+;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
+;//
+;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
+;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
+;//        - Additional Params  - alpha: D0, dMask_1: D15
+;//         
+;// Outputs - Pixels            - P0-P2: D29-D31, Q0-Q2: D24,D25,D28
+
+;// Registers Corrupted         - D18-D31
+
+        M_START armVCM4P10_DeblockingChromabSGE4_unsafe
+    
+        ;dHSq0p1 - 31
+        ;dHSp0q1 - 13
+        VHADD       dHSp0q1, dP_0, dQ_1     
+        VHADD       dHSq0p1, dQ_0, dP_1         
+
+        ;// Prepare the bS mask
+
+        ;// dHSp0q1-13
+        ;// dP_0t-dHSp0q1-13
+        ;// dHSq0p1-31
+        ;// dQ_0t-Temp1-31
+        VLD1        {dAlpha[]}, [pAlpha]
+        ADD         pThresholds, pThresholds, #4
+        VLD1        {dBeta[]}, [pBeta]
+
+        VRHADD      dP_0t, dHSp0q1, dP_1    
+        VRHADD      dQ_0t, dHSq0p1, dQ_1
+        
+        M_END
+        
+        ENDIF  
+
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
new file mode 100755
index 0000000..0afe4fd
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
@@ -0,0 +1,396 @@
+;//
+;// 
+;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+
+
+    IF  CortexA8
+        
+pThresholds RN 5
+
+;// Pixels
+dP_0        DN D4.U8
+dP_1        DN D5.U8  
+dP_2        DN D6.U8  
+dP_3        DN D7.U8  
+dQ_0        DN D8.U8  
+dQ_1        DN D9.U8  
+dQ_2        DN D10.U8 
+dQ_3        DN D11.U8 
+
+
+;// Filtering Decision
+dAlpha      DN D0.U8
+
+dFilt       DN D16.U8
+dAqflg      DN D12.U8
+dApflg      DN D17.U8 
+
+dAp0q0      DN D13.U8
+
+;// bSLT4
+dTC0        DN D18.U8   
+dTC1        DN D19.U8   
+dTC01       DN D18.U8   
+
+dTCs        DN D31.S8
+dTC         DN D31.U8
+
+dMask_0     DN D14.U8
+dMask_1     DN D15.U8    
+
+dTemp       DN D19.U8
+
+;// Computing P0,Q0
+qDq0p0      QN Q10.S16
+qDp1q1      QN Q11.S16
+qDelta      QN Q10.S16  ; reuse qDq0p0
+dDelta      DN D20.S8
+
+
+;// Computing P1,Q1
+dRp0q0      DN D24.U8
+
+dMaxP       DN D23.U8
+dMinP       DN D22.U8
+
+dMaxQ       DN D19.U8
+dMinQ       DN D21.U8
+
+dDeltaP     DN D26.U8
+dDeltaQ     DN D27.U8
+
+qP_0n       QN Q14.S16
+qQ_0n       QN Q12.S16
+
+dQ_0n       DN D24.U8
+dQ_1n       DN D25.U8
+dP_0n       DN D29.U8
+dP_1n       DN D30.U8
+
+;// bSGE4
+
+qSp0q0      QN Q10.U16
+
+qSp2q1      QN Q11.U16
+qSp0q0p1    QN Q12.U16
+qSp3p2      QN Q13.U16
+dHSp0q1     DN D28.U8
+
+qSq2p1      QN Q11.U16
+qSp0q0q1    QN Q12.U16
+qSq3q2      QN Q13.U16  ;!!
+dHSq0p1     DN D28.U8   ;!!
+
+qTemp1      QN Q11.U16  ;!!;qSp2q1 
+qTemp2      QN Q12.U16  ;!!;qSp0q0p1        
+
+dP_0t       DN D28.U8   ;!!;dHSp0q1        
+dQ_0t       DN D22.U8   ;!!;Temp1        
+
+dP_0n       DN D29.U8
+dP_1n       DN D30.U8
+dP_2n       DN D31.U8
+
+dQ_0n       DN D24.U8   ;!!;Temp2        
+dQ_1n       DN D25.U8   ;!!;Temp2        
+dQ_2n       DN D28.U8   ;!!;dQ_0t        
+
+;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe
+;//
+;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
+;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
+;//        - Additional Params  - pThresholds: r5
+;//         
+;// Outputs - Pixels            - P0-P1: D29-D30, Q0-Q1: D24-D25
+;//         - Additional Params - pThresholds: r5
+
+;// Registers Corrupted         - D18-D31
+
+
+        M_START armVCM4P10_DeblockingLumabSLT4_unsafe
+
+        
+        ;// qDq0p0-10
+        VSUBL       qDp1q1, dP_1, dQ_1      
+        VLD1        {dTC0[]}, [pThresholds]!
+        ;// qDp1q1-11
+        VSUBL       qDq0p0, dQ_0, dP_0      
+        VLD1        {dTC1[]}, [pThresholds]!
+
+        ;// dRp0q0-24
+        VSHR        qDp1q1, qDp1q1, #2      
+    
+        ;// dTC01 = (dTC1 << 4) | dTC0
+        ;// dTC01-18
+        VEXT        dTC01, dTC0, dTC1, #4
+        ;// dTemp-19
+        VAND        dTemp, dApflg, dMask_1
+        
+        VBIF        dTC01, dMask_0, dFilt
+    
+
+        ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+        ;// dDelta = (qDp1q1 >> 2 + qDq0p0 + 1)>> 1
+
+        ;// qDelta-qDq0p0-10
+        VRHADD      qDelta, qDp1q1, qDq0p0  
+        VRHADD      dRp0q0, dP_0, dQ_0      
+        VADD        dTC, dTC01, dTemp
+
+        ;// dTC = dTC01 + (dAplg & 1) + (dAqflg & 1)
+        
+        VAND        dTemp, dAqflg, dMask_1
+        VQADD       dMaxP, dP_1, dTC01      
+        VQMOVN      dDelta, qDelta
+        VADD        dTC, dTC, dTemp
+
+        ;// dMaxP = QADD(dP_1, dTC01)
+        ;// dMinP = QSUB(dP_1, dTC01)
+ 
+        ;// dMaxP-d23
+        ;// dMinP-d22
+        VQSUB       dMinP, dP_1, dTC01      
+
+        ;// dDelta-d20
+
+        ;// dMaxQ = QADD(dQ_1, dTC01)
+        ;// dMinQ = QSUB(dQ_1, dTC01)
+ 
+        ;// dMaxQ-19
+        ;// dMinQ-21
+        VQADD       dMaxQ, dQ_1, dTC01
+        VHADD       dDeltaP, dRp0q0, dP_2   
+        VMIN        dDelta, dDelta, dTCs
+
+        ;// dDelta = (OMX_U8)armClip(0, 255, q0 - delta);
+        VNEG        dTCs, dTCs
+        
+        VQSUB       dMinQ, dQ_1, dTC01
+
+        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
+        ;// delta = armClip(-tC0, tC0, delta);
+        ;// pQ0[-2*Step] = (OMX_U8)(p1 + delta);
+
+        ;// dDeltaP = (dP_2 + dRp0q0)>>1;
+        ;// dP_1n = armClip(dP_1 - dTC01, dP_1 + dTC01, dDeltaP);
+        ;// dP_1n = armClip(MinP, MaxP, dDeltaP);
+        
+        ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
+        ;// delta = armClip(-tC0, tC0, delta);
+        ;// pQ0[1*Step] = (OMX_U8)(q1 + delta);
+
+        ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;
+        ;// dQ_1n = armClip(dQ_1 - dTC01, dQ_1 + dTC01, dDeltaQ);
+        ;// dQ_1n = armClip(MinQ, MaxQ, dDeltaQ);
+        
+        ;// dDeltaP-26
+        VHADD       dDeltaQ, dRp0q0, dQ_2   
+
+        ;// dDeltaQ-27
+        
+        ;// dP_0n - 29
+        ;// dP_1n - 30
+        ;// dQ_0n - 24
+        ;// dQ_1n - 25
+        
+        ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
+        ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;
+
+        VMAX        dP_1n, dDeltaP, dMinP   
+        VMAX        dDelta, dDelta, dTCs
+
+        ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 - delta);
+        ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta);
+
+        ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 - dDelta);
+        ;// dQ_0n = (OMX_U8)armClip(0, 255, dP_0 - dDelta);
+        
+        ;// qP_0n - 14
+        ;// qQ_0n - 12
+        
+        VMOVL       qP_0n, dP_0
+        VMOVL       qQ_0n, dQ_0
+
+        VADDW       qP_0n, qP_0n, dDelta
+        VSUBW       qQ_0n, qQ_0n, dDelta
+        
+        VQMOVUN     dP_0n, qP_0n
+        VQMOVUN     dQ_0n, qQ_0n
+        
+        VMAX        dQ_1n, dDeltaQ, dMinQ
+
+        VMIN        dP_1n, dP_1n, dMaxP
+        VMIN        dQ_1n, dQ_1n, dMaxQ
+        VBIF        dP_0n, dP_0, dFilt      
+
+        VBIF        dP_1n, dP_1, dApflg
+        VBIF        dQ_0n, dQ_0, dFilt  
+        VBIF        dQ_1n, dQ_1, dAqflg
+
+        M_END
+
+;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
+;//
+;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
+;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
+;//        - Additional Params  - alpha: D0, dMask_1: D15
+;//         
+;// Outputs - Pixels            - P0-P2: D29-D31, Q0-Q2: D24,D25,D28
+
+;// Registers Corrupted         - D18-D31
+
+        M_START armVCM4P10_DeblockingLumabSGE4_unsafe
+    
+
+        ;// ap<beta && armAbs(p0-q0)<((alpha>>2)+2)        
+        ;// aq<beta && armAbs(p0-q0)<((alpha>>2)+2)        
+
+        ;// ( dApflg & dAp0q0 < (dAlpha >> 2 + 2) )
+        ;// ( dAqflg & dAp0q0 < (dAlpha >> 2 + 2) )
+
+        ;// ( dApflg = dApflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
+        ;// ( dAqflg = dAqflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
+
+        ;// P Filter
+
+        VSHR        dTemp, dAlpha, #2
+        VADD        dTemp, dTemp, dMask_1
+
+        ;// qSp0q0-10
+        VADDL       qSp0q0, dQ_0, dP_0      
+        VADD        dTemp, dTemp, dMask_1
+
+        ;// qSp2q1-11
+        ;// qSp0q0p1-12
+        VADDL       qSp2q1, dP_2, dQ_1      
+        VADDW       qSp0q0p1, qSp0q0, dP_1  
+
+        VCGT        dTemp, dTemp, dAp0q0
+        VSHR        qSp2q1, #1              
+
+        ;// pQ0[-1*Step] = (OMX_U8)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3);
+        ;// pQ0[-1*Step] = ( ( (p0 + q0 + p1) + (p2 + q1)>>1 ) >> 1 + 1 ) >> 1
+
+        ;// dP_0n = ( ( (qSp0q0 + dP_1) + qSp2q1>>1 ) >> 1 + 1 ) >> 1
+        ;// dP_0n = ( ( qSp0q0p1 + qSp2q1>>1 ) >> 1 + 1 ) >> 1
+        ;// dP_0n = ( qTemp1 + 1 ) >> 1
+        
+        ;// pQ0[-2*Step] = (OMX_U8)((p2 + p1 + p0 + q0 + 2)>>2);
+        
+        ;// dP_1n = (OMX_U8)((dP_2 + qSp0q0p1 + 2)>>2);
+        ;// dP_1n = (OMX_U8)((qTemp2 + 2)>>2);
+        
+        ;// pQ0[-3*Step] = (OMX_U8)((2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
+        ;// pQ0[-3*Step] = (OMX_U8)(( (p3 + p2) + (p1 + p0 + q0 + p2) >> 1 + 2)>>2);
+
+        ;// dP_2n = (OMX_U8)(( qSp3p2 + (dP_2 + qSp0q0p1) >> 1 + 2) >> 2);
+        ;// dP_2n = (OMX_U8)(( qSp3p2 + qTemp2 >> 1 + 2) >> 2);
+
+        ;// qTemp1-qSp2q1-11
+        ;// qTemp2-qSp0q0p1-12
+        VHADD       qTemp1, qSp0q0p1, qSp2q1
+        VADDW       qTemp2, qSp0q0p1, dP_2  
+        
+        ;// qSp3p2-13
+        VADDL       qSp3p2, dP_3, dP_2      
+
+        VAND        dApflg, dApflg, dTemp
+        VHADD       dHSp0q1, dP_0, dQ_1     
+        VSRA        qSp3p2, qTemp2, #1      
+        ;// dHSp0q1-28
+        VAND        dAqflg, dAqflg, dTemp
+
+        ;// dP_0n-29
+        ;// dP_0t-dHSp0q1-28
+        VQRSHRN     dP_0n, qTemp1, #1
+        VRHADD      dP_0t, dHSp0q1, dP_1    
+
+        ;// dP_1n-30
+        VQRSHRN     dP_1n, qTemp2, #2
+        
+        VADDL       qSq2p1, dQ_2, dP_1          
+        VADDW       qSp0q0q1, qSp0q0, dQ_1      
+        
+        VBIF        dP_0n, dP_0t, dApflg    
+
+        ;// Q Filter
+
+        ;// pQ0[0*Step] = (OMX_U8)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3);
+        ;// pQ0[0*Step] = ( ( (p0 + q0 + q1) + (q2 + p1)>>1 ) >> 1 + 1 ) >> 1
+
+        ;// dQ_0n = ( ( (qSp0q0 + dQ_1) + qSq2p1>>1 ) >> 1 + 1 ) >> 1
+        ;// dQ_0n = ( ( qSp0q0q1 + qSq2p1>>1 ) >> 1 + 1 ) >> 1
+        ;// dQ_0n = ( qTemp1 + 1 ) >> 1
+        
+        ;// pQ0[1*Step] = (OMX_U8)((q2 + q1 + q0 + q0 + 2)>>2);
+        
+        ;// dQ_1n = (OMX_U8)((dQ_2 + qSp0q0q1 + 2)>>2);
+        ;// dQ_1n = (OMX_U8)((qTemp2 + 2)>>2);
+        
+        ;// pQ0[2*Step] = (OMX_U8)((2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3);
+        ;// pQ0[2*Step] = (OMX_U8)(( (q3 + q2) + (q1 + p0 + q0 + q2) >> 1 + 2)>>2);
+
+        ;// dQ_2n = (OMX_U8)(( qSq3q2 + (dQ_2 + qSp0q0q1) >> 1 + 2) >> 2);
+        ;// dQ_2n = (OMX_U8)(( qSq3q2 + qTemp2 >> 1 + 2) >> 2);
+
+        ;// qTemp1-qSp2q1-11
+        ;// qTemp2-qSp0q0p1-12
+        ;// qSq2p1-11
+        ;// qSp0q0q1-12
+
+
+        ;// qTemp2-qSp0q0p1-12
+        ;// qTemp1-qSq2p1-11
+        ;// qSq3q2-13
+        ;// dP_2n-31
+        
+        VQRSHRN     dP_2n, qSp3p2, #2
+        VADDL       qSq3q2, dQ_3, dQ_2          
+
+        VSHR        qSq2p1, #1                  
+
+        VHADD       qTemp1, qSp0q0q1, qSq2p1
+        VADDW       qTemp2, qSp0q0q1, dQ_2      
+
+        ;// dHSq0p1-28
+        VHADD       dHSq0p1, dQ_0, dP_1         
+
+        VBIF        dP_0n, dP_0, dFilt
+        VBIF        dP_1n, dP_1, dApflg
+
+        VSRA        qSq3q2, qTemp2, #1          
+
+        ;// dQ_1-Temp2-25
+        ;// dQ_0-Temp2-24
+        VQRSHRN     dQ_1n, qTemp2, #2
+        VQRSHRN     dQ_0n, qTemp1, #1
+
+        ;// dQ_0t-Temp1-22
+        VRHADD      dQ_0t, dHSq0p1, dQ_1
+        VBIF        dQ_1n, dQ_1, dAqflg         
+
+        VBIF        dP_2n, dP_2, dApflg        
+        VBIF        dQ_0n, dQ_0t, dAqflg        
+        VQRSHRN     dQ_2n, qSq3q2, #2
+        VBIF        dQ_0n, dQ_0, dFilt
+        VBIF        dQ_2n, dQ_2, dAqflg       
+
+        M_END
+        
+    ENDIF  
+
+
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
new file mode 100755
index 0000000..10a89e9
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
@@ -0,0 +1,325 @@
+;//
+;// 
+;// File Name:  armVCM4P10_DecodeCoeffsToPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        INCLUDE armCOMM_BitDec_s.h
+        
+        IMPORT armVCM4P10_CAVLCCoeffTokenTables
+        IMPORT armVCM4P10_CAVLCTotalZeroTables
+        IMPORT armVCM4P10_CAVLCTotalZeros2x2Tables
+        IMPORT armVCM4P10_CAVLCRunBeforeTables
+        IMPORT armVCM4P10_SuffixToLevel
+        IMPORT armVCM4P10_ZigZag_4x4
+        IMPORT armVCM4P10_ZigZag_2x2
+        
+        M_VARIANTS ARM1136JS
+        
+;//DEBUG_ON    SETL {TRUE}
+        
+LAST_COEFF               EQU 0x20        ;// End of block flag
+TWO_BYTE_COEFF           EQU 0x10
+
+;// Declare input registers
+
+ppBitStream     RN 0
+pOffset         RN 1
+pNumCoeff       RN 2
+ppPosCoefbuf    RN 3
+nC              RN 4 ;// number of coeffs or 17 for chroma
+sMaxNumCoeff    RN 5
+
+;// Declare inner loop registers
+
+;// Level loop
+Count           RN 0
+TrailingOnes    RN 1
+pLevel          RN 2
+LevelSuffix     RN 3
+SuffixLength    RN 4
+TotalCoeff      RN 5
+
+pVLDTable       RN 6
+Symbol          RN 7
+T1              RN 8
+T2              RN 9
+RBitStream      RN 10
+RBitBuffer      RN 11
+RBitCount       RN 12
+lr              RN 14
+
+;// Run loop
+Count           RN 0
+ZerosLeft       RN 1
+pLevel          RN 2
+ppRunTable      RN 3
+pRun            RN 4
+TotalCoeff      RN 5
+
+pVLDTable       RN 6
+Symbol          RN 7
+T1              RN 8
+T2              RN 9
+RBitStream      RN 10
+RBitBuffer      RN 11
+RBitCount       RN 12
+lr              RN 14
+
+;// Fill in coefficients loop
+pPosCoefbuf     RN 0
+temp            RN 1
+pLevel          RN 2
+ppPosCoefbuf    RN 3
+pRun            RN 4
+TotalCoeff      RN 5
+pZigZag         RN 6
+
+T1              RN 8
+T2              RN 9
+RBitStream      RN 10
+RBitBuffer      RN 11
+RBitCount       RN 12
+CoeffNum        RN 14
+
+
+
+    IF ARM1136JS
+        
+        ;// Allocate stack memory required by the function
+        M_ALLOC4 pppBitStream, 4
+        M_ALLOC4 ppOffset, 4
+        M_ALLOC4 pppPosCoefbuf, 4
+        M_ALLOC4 ppLevel, 16*2
+        M_ALLOC4 ppRun, 16
+        
+        ;// Write function header
+        M_START armVCM4P10_DecodeCoeffsToPair, r11
+        
+        ;// Define stack arguments
+        M_ARG   pNC, 4
+        M_ARG   pSMaxNumCoeff,4
+        
+        ;// Code start        
+        M_BD_INIT0 ppBitStream, pOffset, RBitStream, RBitBuffer, RBitCount
+        LDR        pVLDTable, =armVCM4P10_CAVLCCoeffTokenTables
+        M_LDR      nC, pNC
+        
+        M_BD_INIT1 T1, T2, lr
+        LDR     pVLDTable, [pVLDTable, nC, LSL #2]  ;// Find VLD table    
+        
+        M_BD_INIT2 T1, T2, lr
+
+        ;// Decode Symbol = TotalCoeff*4 + TrailingOnes
+        M_BD_VLD  Symbol, T1, T2, pVLDTable, 4, 2
+    
+        MOVS    TotalCoeff, Symbol, LSR #2    
+        STRB    TotalCoeff, [pNumCoeff]    
+        M_PRINTF "TotalCoeff=%d\n", TotalCoeff
+        BEQ.W   EndNoError                  ;// Finished if no coefficients
+
+        CMP     Symbol, #17*4
+        BGE.W   EndBadSymbol                ;// Error if bad symbol
+        
+        ;// Save bitstream pointers
+        M_STR   ppBitStream,  pppBitStream
+        M_STR   pOffset,      ppOffset
+        M_STR   ppPosCoefbuf, pppPosCoefbuf                
+        
+        ;// Decode Trailing Ones
+        ANDS    TrailingOnes, Symbol, #3
+        M_ADR   pLevel, ppLevel            
+        M_PRINTF "TrailingOnes=%d\n", TrailingOnes
+        BEQ     TrailingOnesDone    
+        MOV     Count, TrailingOnes
+TrailingOnesLoop    
+        M_BD_READ8 Symbol, 1, T1
+        SUBS    Count, Count, #1
+        MOV     T1, #1
+        SUB     T1, T1, Symbol, LSL #1
+        M_PRINTF "Level=%d\n", T1
+        STRH    T1, [pLevel], #2
+        BGT     TrailingOnesLoop
+TrailingOnesDone    
+    
+        ;// Decode level values    
+        SUBS    Count, TotalCoeff, TrailingOnes     ;// Number of levels to read
+        BEQ     DecodeRuns                          ;// None left
+        
+        MOV     SuffixLength, #1
+        CMP     TotalCoeff, #10
+        MOVLE   SuffixLength, #0
+        CMP     TrailingOnes, #3    ;// if (TrailingOnes<3)
+        MOVLT   TrailingOnes, #4    ;// then TrailingOnes = +4
+        MOVGE   TrailingOnes, #2    ;// else TrailingOnes = +2
+        MOVGE   SuffixLength, #0    ;//      SuffixLength = 0
+        
+LevelLoop
+        M_BD_CLZ16 Symbol, T1, T2   ;// Symbol=LevelPrefix
+        CMP     Symbol,#16
+        BGE     EndBadSymbol
+        
+        MOVS    lr, SuffixLength    ;// if LevelSuffixSize==0
+        TEQEQ   Symbol, #14         ;//   and  LevelPrefix==14
+        MOVEQ   lr, #4              ;//   then LevelSuffixSize=4
+        TEQ     Symbol, #15         ;// if LevelSuffixSize==15
+        MOVEQ   lr, #12             ;//   then LevelSuffixSize=12
+        
+        TEQEQ   SuffixLength,#0
+        ADDEQ   Symbol,Symbol,#15
+        
+        TEQ     lr, #0              ;// if LevelSuffixSize==0
+        BEQ     LevelCodeRead       ;// LevelCode = LevelPrefix
+        
+        M_BD_VREAD16 LevelSuffix, lr, T1, T2  ;// Read Level Suffix
+        
+        MOV     Symbol, Symbol, LSL SuffixLength
+        ADD     Symbol, LevelSuffix, Symbol
+             
+LevelCodeRead        
+        ;// Symbol = LevelCode
+        ADD     Symbol, Symbol, TrailingOnes ;// +4 if level cannot be +/-1, +2 o/w
+        MOV     TrailingOnes, #2
+        MOVS    T1, Symbol, LSR #1
+        RSBCS   T1, T1, #0                  ;// If Symbol odd then negate
+        M_PRINTF "Level=%d\n", T1
+        STRH    T1, [pLevel], #2            ;// Store level.
+        
+        LDR     T2, =armVCM4P10_SuffixToLevel
+        LDRSB   T1, [T2, SuffixLength]      ;// Find increment level        
+        TEQ     SuffixLength, #0
+        MOVEQ   SuffixLength, #1
+        CMP     Symbol, T1
+        ADDCS   SuffixLength, SuffixLength, #1        
+        SUBS    Count, Count, #1        
+        BGT     LevelLoop
+        
+DecodeRuns        
+        ;// Find number of zeros
+        M_LDR   T1, pSMaxNumCoeff           ;// sMaxNumCoeff
+        SUB     Count, TotalCoeff, #1       ;// Number of runs excluding last
+        SUBS    ZerosLeft, T1, TotalCoeff   ;// Maximum number of zeros there could be
+        M_ADR   pRun, ppRun
+        MOV     CoeffNum,TotalCoeff
+        SUB     CoeffNum,CoeffNum,#1
+        BEQ     NoZerosLeft
+        
+        ;// Unpack number of zeros from bitstream
+        TEQ     T1, #4        
+        LDREQ   pVLDTable, =(armVCM4P10_CAVLCTotalZeros2x2Tables-4)
+        LDRNE   pVLDTable, =(armVCM4P10_CAVLCTotalZeroTables-4)
+        LDR     pVLDTable, [pVLDTable, TotalCoeff, LSL #2]
+        
+        M_BD_VLD  Symbol, T1, T2, pVLDTable, 4, 2 ;// Symbol = ZerosLeft
+        CMP     Symbol,#16
+        BGE     EndBadSymbol
+
+        LDR     ppRunTable, =(armVCM4P10_CAVLCRunBeforeTables-4)
+        M_ADR   pRun, ppRun
+        MOVS    ZerosLeft, Symbol
+
+        ADD     CoeffNum,CoeffNum,ZerosLeft        
+
+        BEQ     NoZerosLeft
+        
+        ;// Decode runs while zeros are left and more than one coefficient
+RunLoop 
+        SUBS    Count, Count, #1
+        LDR     pVLDTable, [ppRunTable, ZerosLeft, LSL#2]
+        BLT     LastRun
+        M_BD_VLD  Symbol, T1, T2, pVLDTable, 3, 2 ;// Symbol = Run
+        CMP     Symbol,#15         
+        BGE     EndBadSymbol        
+
+        SUBS    ZerosLeft, ZerosLeft, Symbol
+        M_PRINTF "Run=%d\n", Symbol
+        STRB    Symbol, [pRun], #1
+        BGT     RunLoop
+        
+        ;// Decode runs while no zeros are left
+NoZerosLeft 
+        SUBS    Count, Count, #1
+        M_PRINTF "Run=%d\n", ZerosLeft
+        STRGEB  ZerosLeft, [pRun], #1
+        BGT     NoZerosLeft
+
+LastRun        
+        ;// Final run length is remaining zeros
+        M_PRINTF "LastRun=%d\n", ZerosLeft
+        STRB    ZerosLeft, [pRun], #1        
+        
+        ;// Write coefficients to output array
+        M_LDR   T1, pSMaxNumCoeff                    ;// sMaxNumCoeff
+        TEQ     T1, #15
+        ADDEQ   CoeffNum,CoeffNum,#1
+        
+
+        SUB     pRun,pRun,TotalCoeff
+        SUB     pLevel,pLevel,TotalCoeff  
+        SUB     pLevel,pLevel,TotalCoeff   
+
+        M_LDR   ppPosCoefbuf, pppPosCoefbuf
+        LDR     pPosCoefbuf, [ppPosCoefbuf]
+        TEQ     T1, #4
+        LDREQ   pZigZag, =armVCM4P10_ZigZag_2x2
+        LDRNE   pZigZag, =armVCM4P10_ZigZag_4x4
+
+        
+        
+OutputLoop
+        
+        LDRB    T2, [pRun],#1
+        LDRB    T1, [pZigZag, CoeffNum]
+        SUB     CoeffNum, CoeffNum, #1      ;// Skip Non zero
+        SUB     CoeffNum, CoeffNum, T2      ;// Skip Zero run
+        
+        LDRSH   T2, [pLevel],#2
+        
+        SUBS    TotalCoeff, TotalCoeff, #1       
+        ORREQ   T1, T1, #LAST_COEFF
+        
+        ADD     temp, T2, #128
+        CMP     temp, #256
+        ORRCS   T1, T1, #TWO_BYTE_COEFF
+
+        
+        TEQ     TotalCoeff, #0              ;// Preserves carry        
+        
+        M_PRINTF "Output=%02x %04x\n", T1, T2
+        STRB    T1, [pPosCoefbuf], #1
+        STRB    T2, [pPosCoefbuf], #1
+        MOV     T2, T2, LSR #8
+        STRCSB  T2, [pPosCoefbuf], #1                
+        BNE     OutputLoop
+        
+        ;// Finished
+        STR     pPosCoefbuf, [ppPosCoefbuf]
+        M_LDR   ppBitStream, pppBitStream
+        M_LDR   pOffset, ppOffset
+        B       EndNoError
+            
+EndBadSymbol
+        MOV     r0, #OMX_Sts_Err
+        B       End    
+        
+EndNoError
+        ;// Finished reading from the bitstream                
+        M_BD_FINI ppBitStream, pOffset
+        
+        ;// Set return value
+        MOV     r0, #OMX_Sts_NoErr    
+End
+        M_END
+    
+    ENDIF
+    
+    END
+    
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s
new file mode 100755
index 0000000..2761600
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s
@@ -0,0 +1,123 @@
+;//
+;// 
+;// File Name:  armVCM4P10_DequantTables_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        
+
+         INCLUDE omxtypes_s.h
+         INCLUDE armCOMM_s.h
+     
+         EXPORT armVCM4P10_QPDivTable
+         EXPORT armVCM4P10_VMatrixQPModTable
+         EXPORT armVCM4P10_PosToVCol4x4
+         EXPORT armVCM4P10_PosToVCol2x2
+         EXPORT armVCM4P10_VMatrix 
+         EXPORT armVCM4P10_QPModuloTable
+         EXPORT armVCM4P10_VMatrixU16
+         
+;// Define the processor variants supported by this file
+         
+         M_VARIANTS CortexA8
+           
+         
+;// Guarding implementation by the processor name
+
+    
+    IF CortexA8
+           
+ 
+         M_TABLE armVCM4P10_PosToVCol4x4
+         DCB  0, 2, 0, 2
+         DCB  2, 1, 2, 1
+         DCB  0, 2, 0, 2
+         DCB  2, 1, 2, 1
+
+
+         M_TABLE armVCM4P10_PosToVCol2x2
+         DCB  0, 2
+         DCB  2, 1
+
+
+         M_TABLE armVCM4P10_VMatrix
+         DCB  10, 16, 13
+         DCB  11, 18, 14
+         DCB  13, 20, 16
+         DCB  14, 23, 18
+         DCB  16, 25, 20
+         DCB  18, 29, 23
+
+;//-------------------------------------------------------
+;// This table evaluates the expression [(INT)(QP/6)],
+;// for values of QP from 0 to 51 (inclusive). 
+;//-------------------------------------------------------
+
+         M_TABLE armVCM4P10_QPDivTable
+         DCB  0,  0,  0,  0,  0,  0
+         DCB  1,  1,  1,  1,  1,  1
+         DCB  2,  2,  2,  2,  2,  2
+         DCB  3,  3,  3,  3,  3,  3
+         DCB  4,  4,  4,  4,  4,  4
+         DCB  5,  5,  5,  5,  5,  5
+         DCB  6,  6,  6,  6,  6,  6
+         DCB  7,  7,  7,  7,  7,  7
+         DCB  8,  8,  8,  8,  8,  8
+    
+;//----------------------------------------------------
+;// This table contains armVCM4P10_VMatrix[QP%6][0] entires,
+;// for values of QP from 0 to 51 (inclusive). 
+;//----------------------------------------------------
+
+         M_TABLE armVCM4P10_VMatrixQPModTable
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+         DCB 10, 11, 13, 14, 16, 18
+    
+;//-------------------------------------------------------
+;// This table evaluates the modulus expression [QP%6]*6,
+;// for values of QP from 0 to 51 (inclusive). 
+;//-------------------------------------------------------
+
+         M_TABLE armVCM4P10_QPModuloTable
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+         DCB 0, 6, 12, 18, 24, 30
+        
+;//-------------------------------------------------------
+;// This table contains the invidual byte values stored as
+;// halfwords. This avoids unpacking inside the function
+;//-------------------------------------------------------
+        
+         M_TABLE armVCM4P10_VMatrixU16
+         DCW 10, 16, 13 
+         DCW 11, 18, 14
+         DCW 13, 20, 16
+         DCW 14, 23, 18
+         DCW 16, 25, 20
+         DCW 18, 29, 23 
+         
+    ENDIF                                                           ;//ARM1136JS            
+
+
+                           
+    
+         END
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
new file mode 100755
index 0000000..6e912d7
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
@@ -0,0 +1,236 @@
+;//
+;// 
+;// File Name:  armVCM4P10_InterpolateLuma_Align_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS ARM1136JS
+
+        EXPORT armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+        EXPORT armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+
+DEBUG_ON    SETL {FALSE}
+
+    IF ARM1136JS 
+
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 8
+iHeight         RN 9
+
+;// Declare inner loop registers
+x               RN 7
+x0              RN 7
+x1              RN 10
+x2              RN 11
+Scratch         RN 12
+
+;// Function: 
+;//     armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+;//
+;// Implements copy from an arbitrary aligned source memory location (pSrc) to a 4 byte aligned
+;// destination pointed by (pDst) for horizontal interpolation.
+;// This function needs to copy 9 bytes in horizontal direction. 
+;//
+;// Registers used as input for this function
+;// r0,r1,r8,r9 where r8 containings aligned memory pointer and r9 no rows to copy
+;//
+;// Registers preserved for top level function
+;// r2,r3,r4,r5,r6
+;//
+;// Registers modified by the function
+;// r7,r8,r9,r10,r11,r12
+;//
+;// Output registers
+;// r0 - pointer to the new aligned location which will be used as pSrc
+;// r1 - step size to this aligned location
+
+        ;// Function header
+        M_START armVCM4P10_InterpolateLuma_HorAlign9x_unsafe     
+        
+        ;// Copy pDst to scratch
+        MOV     Scratch, pDst
+
+StartAlignedStackCopy
+        AND     x, pSrc, #3
+        BIC     pSrc, pSrc, #3
+        
+        M_SWITCH x
+        M_CASE   Copy0toAligned
+        M_CASE   Copy1toAligned
+        M_CASE   Copy2toAligned
+        M_CASE   Copy3toAligned
+        M_ENDSWITCH
+
+Copy0toAligned  
+        LDM     pSrc, {x0, x1, x2}
+        SUBS    iHeight, iHeight, #1
+        ADD     pSrc, pSrc, srcStep
+        
+        ;// One cycle stall
+
+        STM     pDst!, {x0, x1, x2}                     ;// Store aligned output row
+        BGT     Copy0toAligned
+        B       CopyEnd  
+      
+Copy1toAligned        
+        LDM     pSrc, {x0, x1, x2}
+        SUBS    iHeight, iHeight, #1
+        ADD     pSrc, pSrc, srcStep
+        
+        ;// One cycle stall
+
+        MOV     x0, x0, LSR #8
+        ORR     x0, x0, x1, LSL #24
+        MOV     x1, x1, LSR #8
+        ORR     x1, x1, x2, LSL #24
+        MOV     x2, x2, LSR #8
+        STM     pDst!, {x0, x1, x2}                     ;// Store aligned output row
+        BGT     Copy1toAligned
+        B       CopyEnd  
+
+Copy2toAligned        
+        LDM     pSrc, {x0, x1, x2}
+        SUBS    iHeight, iHeight, #1
+        ADD     pSrc, pSrc, srcStep
+        
+        ;// One cycle stall
+
+        MOV     x0, x0, LSR #16
+        ORR     x0, x0, x1, LSL #16
+        MOV     x1, x1, LSR #16
+        ORR     x1, x1, x2, LSL #16
+        MOV     x2, x2, LSR #16
+        STM     pDst!, {x0, x1, x2}                     ;// Store aligned output row
+        BGT     Copy2toAligned
+        B       CopyEnd  
+
+Copy3toAligned        
+        LDM     pSrc, {x0, x1, x2}
+        SUBS    iHeight, iHeight, #1
+        ADD     pSrc, pSrc, srcStep
+        
+        ;// One cycle stall
+
+        MOV     x0, x0, LSR #24
+        ORR     x0, x0, x1, LSL #8
+        MOV     x1, x1, LSR #24
+        ORR     x1, x1, x2, LSL #8
+        MOV     x2, x2, LSR #24
+        STM     pDst!, {x0, x1, x2}                     ;// Store aligned output row
+        BGT     Copy3toAligned
+
+CopyEnd  
+        
+        MOV     pSrc, Scratch
+        MOV     srcStep, #12
+
+        M_END
+    
+
+;// Function:
+;//     armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+;//
+;// Implements copy from an arbitrary aligned source memory location (pSrc) to an aligned
+;// destination pointed by (pDst) for vertical interpolation.
+;// This function needs to copy 4 bytes in horizontal direction 
+;//
+;// Registers used as input for this function
+;// r0,r1,r8,r9 where r8 containings aligned memory pointer and r9 no of rows to copy
+;//
+;// Registers preserved for top level function
+;// r2,r3,r4,r5,r6
+;//
+;// Registers modified by the function
+;// r7,r8,r9,r10,r11,r12
+;//
+;// Output registers
+;// r0 - pointer to the new aligned location which will be used as pSrc
+;// r1 - step size to this aligned location
+
+        ;// Function header
+        M_START armVCM4P10_InterpolateLuma_VerAlign4x_unsafe     
+        
+        ;// Copy pSrc to stack
+StartVAlignedStackCopy
+        AND     x, pSrc, #3
+        BIC     pSrc, pSrc, #3                        
+        
+        
+        M_SWITCH x
+        M_CASE   Copy0toVAligned
+        M_CASE   Copy1toVAligned
+        M_CASE   Copy2toVAligned
+        M_CASE   Copy3toVAligned
+        M_ENDSWITCH
+        
+Copy0toVAligned  
+        M_LDR   x0, [pSrc], srcStep
+        SUBS    iHeight, iHeight, #1
+        
+        ;// One cycle stall
+
+        STR     x0, [pDst], #4                              ;// Store aligned output row
+        BGT     Copy0toVAligned
+        B       CopyVEnd  
+      
+Copy1toVAligned        
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        SUBS    iHeight, iHeight, #1        
+        
+        ;// One cycle stall
+
+        MOV     x1, x1, LSL #24
+        ORR     x0, x1, x0, LSR #8
+        STR     x0, [pDst], #4                              ;// Store aligned output row
+        BGT     Copy1toVAligned
+        B       CopyVEnd  
+
+Copy2toVAligned        
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        SUBS    iHeight, iHeight, #1        
+        
+        ;// One cycle stall
+
+        MOV     x1, x1, LSL #16
+        ORR     x0, x1, x0, LSR #16
+        STR     x0, [pDst], #4                              ;// Store aligned output row
+        BGT     Copy2toVAligned
+        B       CopyVEnd  
+
+Copy3toVAligned        
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        SUBS    iHeight, iHeight, #1        
+        
+        ;// One cycle stall
+
+        MOV     x1, x1, LSL #8
+        ORR     x0, x1, x0, LSR #24
+        STR     x0, [pDst], #4                              ;// Store aligned output row
+        BGT     Copy3toVAligned
+
+CopyVEnd  
+
+        SUB     pSrc, pDst, #28
+        MOV     srcStep, #4
+
+        M_END
+
+
+    ENDIF
+
+    END
+    
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
new file mode 100755
index 0000000..d275891
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
@@ -0,0 +1,149 @@
+;//
+;// 
+;// File Name:  armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+;// Function:
+;//     armVCM4P10_InterpolateLuma_Copy4x4_unsafe 
+;//
+;// Implements copy from an arbitrary aligned source memory location (pSrc) to an aligned
+;// destination pointed by (pDst)
+;//
+;// Registers preserved for top level function
+;// r1,r3,r4,r5,r6,r7,r10,r11,r14
+;//
+;// Registers modified by the function
+;// r0,r2,r8,r9,r12
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS ARM1136JS
+
+        EXPORT armVCM4P10_InterpolateLuma_Copy4x4_unsafe
+        
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 2
+dstStep         RN 3
+
+;// Declare other intermediate registers
+x0              RN 4
+x1              RN 5
+x2              RN 8
+x3              RN 9
+Temp            RN 12
+
+    IF ARM1136JS
+
+        M_START armVCM4P10_InterpolateLuma_Copy4x4_unsafe, r6
+
+Copy4x4Start
+        ;// Do Copy and branch to EndOfInterpolation
+        AND     Temp, pSrc, #3
+        BIC     pSrc, pSrc, #3                        
+
+        M_SWITCH Temp
+        M_CASE  Copy4x4Align0
+        M_CASE  Copy4x4Align1
+        M_CASE  Copy4x4Align2
+        M_CASE  Copy4x4Align3
+        M_ENDSWITCH
+
+Copy4x4Align0
+        M_LDR   x0, [pSrc], srcStep
+        M_LDR   x1, [pSrc], srcStep
+        M_STR   x0, [pDst], dstStep
+        M_LDR   x2, [pSrc], srcStep
+        M_STR   x1, [pDst], dstStep
+        M_LDR   x3, [pSrc], srcStep
+        M_STR   x2, [pDst], dstStep
+        M_STR   x3, [pDst], dstStep
+        B       Copy4x4End  
+
+Copy4x4Align1
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        LDR     x3, [pSrc, #4]
+        M_LDR   x2, [pSrc], srcStep
+        MOV     x0, x0, LSR #8
+        ORR     x0, x0, x1, LSL #24
+        M_STR   x0, [pDst], dstStep
+        MOV     x2, x2, LSR #8
+        ORR     x2, x2, x3, LSL #24
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        M_STR   x2, [pDst], dstStep
+        LDR     x3, [pSrc, #4]
+        M_LDR   x2, [pSrc], srcStep
+        MOV     x0, x0, LSR #8
+        ORR     x0, x0, x1, LSL #24
+        M_STR   x0, [pDst], dstStep
+        MOV     x2, x2, LSR #8
+        ORR     x2, x2, x3, LSL #24
+        M_STR   x2, [pDst], dstStep
+        B       Copy4x4End  
+      
+Copy4x4Align2
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        LDR     x3, [pSrc, #4]
+        M_LDR   x2, [pSrc], srcStep
+        MOV     x0, x0, LSR #16
+        ORR     x0, x0, x1, LSL #16
+        M_STR   x0, [pDst], dstStep
+        MOV     x2, x2, LSR #16
+        ORR     x2, x2, x3, LSL #16
+        M_STR   x2, [pDst], dstStep        
+
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        LDR     x3, [pSrc, #4]
+        M_LDR   x2, [pSrc], srcStep
+        MOV     x0, x0, LSR #16
+        ORR     x0, x0, x1, LSL #16
+        M_STR   x0, [pDst], dstStep
+        MOV     x2, x2, LSR #16
+        ORR     x2, x2, x3, LSL #16
+        M_STR   x2, [pDst], dstStep        
+        B       Copy4x4End  
+
+Copy4x4Align3 
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        LDR     x3, [pSrc, #4]
+        M_LDR   x2, [pSrc], srcStep
+        MOV     x0, x0, LSR #24
+        ORR     x0, x0, x1, LSL #8
+        M_STR   x0, [pDst], dstStep
+        MOV     x2, x2, LSR #24
+        ORR     x2, x2, x3, LSL #8
+        M_STR   x2, [pDst], dstStep
+
+        LDR     x1, [pSrc, #4]
+        M_LDR   x0, [pSrc], srcStep
+        LDR     x3, [pSrc, #4]
+        M_LDR   x2, [pSrc], srcStep
+        MOV     x0, x0, LSR #24
+        ORR     x0, x0, x1, LSL #8
+        M_STR   x0, [pDst], dstStep
+        MOV     x2, x2, LSR #24
+        ORR     x2, x2, x3, LSL #8
+        M_STR   x2, [pDst], dstStep
+        B       Copy4x4End  
+
+Copy4x4End
+        M_END
+
+    ENDIF
+
+    END
+    
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
new file mode 100755
index 0000000..4e5a39d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
@@ -0,0 +1,178 @@
+;//
+;// 
+;// File Name:  armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS ARM1136JS
+
+        EXPORT armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe
+        EXPORT armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe
+
+;// Functions: 
+;//     armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe and
+;//     armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe 
+;//
+;// Implements re-arrangement of data from temporary buffer to a buffer pointed by pBuf.
+;// This will do the convertion of data from 16 bit to 8 bit and it also
+;// remove offset and check for saturation.
+;//
+;// Registers used as input for this function
+;// r0,r1,r7 where r0 is input pointer and r2 its step size, r7 is output pointer
+;//
+;// Registers preserved for top level function
+;// r4,r5,r6,r8,r9,r14
+;//
+;// Registers modified by the function
+;// r7,r10,r11,r12
+;//
+;// Output registers
+;// r0 - pointer to the destination location
+;// r1 - step size to this destination location
+
+
+DEBUG_ON    SETL {FALSE}
+        
+MASK            EQU 0x80808080  ;// Mask is used to implement (a+b+1)/2
+
+;// Declare input registers
+
+pSrc0           RN 0
+srcStep0        RN 1
+
+;// Declare other intermediate registers
+Temp1           RN 4
+Temp2           RN 5
+Temp3           RN 10
+Temp4           RN 11
+pBuf            RN 7
+r0x0fe00fe0     RN 6
+r0x00ff00ff     RN 12
+Count           RN 14
+ValueA0         RN 10
+ValueA1         RN 11
+
+    IF ARM1136JS
+
+
+        ;// Function header
+        M_START armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe, r6
+
+        ;// Code start     
+        MOV         Count, #4   
+        LDR         r0x0fe00fe0, =0x0fe00fe0
+        LDR         r0x00ff00ff, =0x00ff00ff        
+LoopStart1
+        LDR         Temp4, [pSrc0, #12]
+        LDR         Temp3, [pSrc0, #8]        
+        LDR         Temp2, [pSrc0, #4]
+        M_LDR       Temp1, [pSrc0], srcStep0              
+        UQSUB16     Temp4, Temp4, r0x0fe00fe0        
+        UQSUB16     Temp3, Temp3, r0x0fe00fe0                 
+        UQSUB16     Temp2, Temp2, r0x0fe00fe0        
+        UQSUB16     Temp1, Temp1, r0x0fe00fe0                 
+        USAT16      Temp4, #13, Temp4
+        USAT16      Temp3, #13, Temp3                          
+        USAT16      Temp2, #13, Temp2
+        USAT16      Temp1, #13, Temp1                                  
+        AND         Temp4, r0x00ff00ff, Temp4, LSR #5         
+        AND         Temp3, r0x00ff00ff, Temp3, LSR #5         
+        AND         Temp2, r0x00ff00ff, Temp2, LSR #5         
+        AND         Temp1, r0x00ff00ff, Temp1, LSR #5         
+        ORR         ValueA1, Temp3, Temp4, LSL #8             
+        ORR         ValueA0, Temp1, Temp2, LSL #8             
+        SUBS        Count, Count, #1                   
+        STRD        ValueA0, [pBuf], #8 
+        BGT         LoopStart1
+End1
+        SUB        pSrc0, pBuf, #32
+        MOV        srcStep0, #8
+
+        M_END
+
+
+        ;// Function header
+        M_START armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe, r6
+        
+        ;// Code start        
+        LDR         r0x0fe00fe0, =0x0fe00fe0
+        LDR         r0x00ff00ff, =0x00ff00ff
+        MOV         Count, #2
+
+LoopStart    
+        LDR         Temp4, [pSrc0, #12]
+        LDR         Temp3, [pSrc0, #8]        
+        LDR         Temp2, [pSrc0, #4]
+        M_LDR       Temp1, [pSrc0], srcStep0
+        
+        UQSUB16     Temp4, Temp4, r0x0fe00fe0        
+        UQSUB16     Temp3, Temp3, r0x0fe00fe0                 
+        UQSUB16     Temp2, Temp2, r0x0fe00fe0        
+        UQSUB16     Temp1, Temp1, r0x0fe00fe0                 
+        
+        USAT16      Temp4, #13, Temp4
+        USAT16      Temp3, #13, Temp3                          
+        USAT16      Temp2, #13, Temp2
+        USAT16      Temp1, #13, Temp1
+                                  
+        AND         Temp4, r0x00ff00ff, Temp4, LSR #5         
+        AND         Temp3, r0x00ff00ff, Temp3, LSR #5         
+        AND         Temp2, r0x00ff00ff, Temp2, LSR #5         
+        AND         Temp1, r0x00ff00ff, Temp1, LSR #5         
+        ORR         ValueA1, Temp3, Temp4, LSL #8        ;// [d2 c2 d0 c0]             
+        ORR         ValueA0, Temp1, Temp2, LSL #8        ;// [b2 a2 b0 a0]         
+                    
+        PKHBT       Temp1, ValueA0, ValueA1, LSL #16     ;// [d0 c0 b0 a0]
+
+        STR         Temp1, [pBuf], #8 
+        PKHTB       Temp2, ValueA1, ValueA0, ASR #16     ;// [d2 c2 b2 a2]
+        STR         Temp2, [pBuf], #-4  
+
+        LDR         Temp4, [pSrc0, #12]
+        LDR         Temp3, [pSrc0, #8]        
+        LDR         Temp2, [pSrc0, #4]
+        M_LDR       Temp1, [pSrc0], srcStep0
+        
+        UQSUB16     Temp4, Temp4, r0x0fe00fe0        
+        UQSUB16     Temp3, Temp3, r0x0fe00fe0                 
+        UQSUB16     Temp2, Temp2, r0x0fe00fe0        
+        UQSUB16     Temp1, Temp1, r0x0fe00fe0                 
+        
+        USAT16      Temp4, #13, Temp4
+        USAT16      Temp3, #13, Temp3                          
+        USAT16      Temp2, #13, Temp2
+        USAT16      Temp1, #13, Temp1
+                                  
+        AND         Temp4, r0x00ff00ff, Temp4, LSR #5         
+        AND         Temp3, r0x00ff00ff, Temp3, LSR #5         
+        AND         Temp2, r0x00ff00ff, Temp2, LSR #5         
+        AND         Temp1, r0x00ff00ff, Temp1, LSR #5         
+        ORR         ValueA1, Temp3, Temp4, LSL #8        ;// [d2 c2 d0 c0]             
+        ORR         ValueA0, Temp1, Temp2, LSL #8        ;// [b2 a2 b0 a0]         
+                    
+        PKHBT       Temp1, ValueA0, ValueA1, LSL #16     ;// [d0 c0 b0 a0]
+        SUBS        Count, Count, #1
+        STR         Temp1, [pBuf], #8 
+        PKHTB       Temp2, ValueA1, ValueA0, ASR #16     ;// [d2 c2 b2 a2]
+        STR         Temp2, [pBuf], #4  
+        
+        BGT         LoopStart
+End2
+        SUB         pSrc0, pBuf, #32-8
+        MOV         srcStep0, #4
+
+        M_END
+
+    ENDIF
+    
+    END
+    
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
new file mode 100755
index 0000000..d1684cb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
@@ -0,0 +1,313 @@
+;//
+;// 
+;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+
+        M_VARIANTS CortexA8
+
+    IF CortexA8
+
+        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11
+
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 2
+dstStep         RN 3
+
+;// Declare Neon registers
+dCoeff5         DN 30.S16
+dCoeff20        DN 31.S16
+qCoeff5         QN 14.S32
+qCoeff20        QN 15.S32
+        
+qSrc01          QN 0.U8
+dSrc0           DN 0.U8
+dSrc1           DN 1.U8                
+                
+dSrcb           DN 4.U8
+dSrcc           DN 2.U8
+dSrcd           DN 3.U8
+dSrce           DN 5.U8
+dSrcf           DN 1.U8
+
+qSrcb           QN 2.S16
+qSrcc           QN 1.S16
+dSrcB           DN 4.S16
+dSrcC           DN 2.S16
+
+qRes0           QN 5.S16
+qRes1           QN 6.S16
+qRes2           QN 7.S16
+qRes3           QN 8.S16
+qRes4           QN 9.S16
+qRes5           QN 10.S16
+qRes6           QN 11.S16
+qRes7           QN 12.S16
+qRes8           QN 13.S16
+    
+dRes0           DN 10.S16
+dRes1           DN 12.S16
+dRes2           DN 14.S16
+dRes3           DN 16.S16
+dRes4           DN 18.S16
+dRes5           DN 20.S16
+dRes6           DN 22.S16
+dRes7           DN 24.S16
+dRes8           DN 26.S16
+    
+qAcc01          QN 5.S32
+qAcc23          QN 6.S32
+qAcc45          QN 2.S32
+qAcc67          QN 3.S32
+qSumBE          QN 0.S32
+qSumCD          QN 1.S32
+
+dTempAcc0       DN 0.U16
+dTempAcc1       DN 2.U16
+dTempAcc2       DN 4.U16
+dTempAcc3       DN 6.U16
+
+qTAcc0          QN 0.U16
+qTAcc1          QN 1.U16
+qTAcc2          QN 2.U16
+qTAcc3          QN 3.U16
+
+dAcc0           DN 0.U8
+dAcc1           DN 2.U8
+dAcc2           DN 4.U8
+dAcc3           DN 6.U8
+
+dTmp0           DN 8.S16
+dTmp1           DN 9.S16
+qTmp0           QN 4.S32
+
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+        VMOV        dCoeff20, #20
+        VMOV        dCoeff5, #5
+
+        ;// Row0
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes0, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        
+        ;// Row1
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes1, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+        
+        VSUB        dRes0, dRes0, dTmp0 ;// TeRi
+        
+        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row2
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes2, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+        
+        VSUB        dRes1, dRes1, dTmp0
+
+        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row3
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes3, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes2, dRes2, dTmp0
+
+        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row4
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes4, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes3, dRes3, dTmp0
+
+        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row5
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes5, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes4, dRes4, dTmp0
+
+        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row6
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes6, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes5, dRes5, dTmp0
+
+        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row7
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes7, dSrc0, dSrcf         ;// Acc=a+f
+        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
+
+        VSUB        dRes6, dRes6, dTmp0
+
+        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        ;// Row8
+        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrcc, dSrc0, dSrc1, #2
+        VEXT        dSrcd, dSrc0, dSrc1, #3
+        VEXT        dSrce, dSrc0, dSrc1, #4
+        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
+        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d                
+        VADDL       qSrcb, dSrcb, dSrce         ;// b+e        
+        VADDL       qRes8, dSrc0, dSrcf         ;// Acc=a+f
+
+        VSUB        dRes7, dRes7, dTmp0
+
+        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
+;        VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+        VMUL        dTmp0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
+
+        VMOV        qCoeff20, #20
+        VMOV        qCoeff5, #5
+
+        ;// Col0
+        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
+        VADDL       qSumCD, dRes2, dRes3        ;// c+d
+        VADDL       qSumBE, dRes1, dRes4        ;// b+e
+
+        VSUB        dRes8, dRes8, dTmp0
+
+        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+;        VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 20*(b+e)        
+        VMUL        qTmp0, qSumBE, qCoeff5     ;// Acc -= 20*(b+e)        
+
+        ;// Col1
+        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
+        VADDL       qSumCD, dRes3, dRes4        ;// c+d
+        VADDL       qSumBE, dRes2, dRes5        ;// b+e
+        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+
+        VSUB        qAcc01, qAcc01, qTmp0
+
+;        VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 20*(b+e)        
+        VMUL        qTmp0, qSumBE, qCoeff5     ;// Acc -= 20*(b+e)        
+
+        ;// Col2
+        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
+        VADDL       qSumCD, dRes4, dRes5        ;// c+d
+        VADDL       qSumBE, dRes3, dRes6        ;// b+e
+        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+
+        VSUB        qAcc23, qAcc23, qTmp0
+
+;        VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 20*(b+e)        
+        VMUL        qTmp0, qSumBE, qCoeff5     ;// Acc -= 20*(b+e)        
+        
+        ;// Col3
+        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
+        VADDL       qSumCD, dRes5, dRes6        ;// c+d
+        VADDL       qSumBE, dRes4, dRes7        ;// b+e
+        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
+
+        VSUB        qAcc45, qAcc45, qTmp0
+
+        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 20*(b+e)        
+
+        VQRSHRUN    dTempAcc0, qAcc01, #10
+        VQRSHRUN    dTempAcc1, qAcc23, #10
+        VQRSHRUN    dTempAcc2, qAcc45, #10
+        VQRSHRUN    dTempAcc3, qAcc67, #10
+        
+        VQMOVN      dAcc0, qTAcc0
+        VQMOVN      dAcc1, qTAcc1
+        VQMOVN      dAcc2, qTAcc2
+        VQMOVN      dAcc3, qTAcc3
+                
+        M_END
+    
+    ENDIF
+
+
+    
+    END
+    
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
new file mode 100755
index 0000000..7bc091f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
@@ -0,0 +1,266 @@
+;//
+;// 
+;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+
+        M_VARIANTS CortexA8
+
+    IF CortexA8
+        M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r11
+
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 2
+dstStep         RN 3
+
+;// Declare Neon registers
+dTCoeff5        DN 30.U8
+dTCoeff20       DN 31.U8
+dCoeff5         DN 30.S16
+dCoeff20        DN 31.S16
+
+qSrcA01         QN 0.U8
+qSrcB23         QN 1.U8
+qSrcC45         QN 2.U8
+qSrcD67         QN 3.U8
+qSrcE89         QN 4.U8
+qSrcF1011       QN 5.U8
+qSrcG1213       QN 6.U8
+qSrcH1415       QN 7.U8
+qSrcI1617       QN 8.U8
+
+dSrcA0          DN 0.U8
+dSrcB2          DN 2.U8
+dSrcC4          DN 4.U8
+dSrcD6          DN 6.U8
+dSrcE8          DN 8.U8
+dSrcF10         DN 10.U8
+dSrcG12         DN 12.U8
+dSrcH14         DN 14.U8
+dSrcI16         DN 16.U8
+
+dSrcA1          DN 1.U8
+dSrcB3          DN 3.U8
+dSrcC5          DN 5.U8
+dSrcD7          DN 7.U8
+dSrcE9          DN 9.U8
+dSrcF11         DN 11.U8
+dSrcG13         DN 13.U8
+dSrcH15         DN 15.U8
+dSrcI17         DN 17.U8
+
+qTempP01        QN 9.S16
+qTempQ01        QN 10.S16
+qTempR01        QN 11.S16
+qTempS01        QN 12.S16
+
+qTempP23        QN 0.S16
+qTempQ23        QN 1.S16
+qTempR23        QN 2.S16
+qTempS23        QN 3.S16
+
+dTempP0         DN 18.S16
+dTempP1         DN 19.S16
+dTempP2         DN 0.S16
+
+dTempQ0         DN 20.S16
+dTempQ1         DN 21.S16
+dTempQ2         DN 2.S16
+
+dTempR0         DN 22.S16
+dTempR1         DN 23.S16
+dTempR2         DN 4.S16
+
+dTempS0         DN 24.S16
+dTempS1         DN 25.S16
+dTempS2         DN 6.S16
+ 
+dTempB0         DN 26.S16
+dTempC0         DN 27.S16
+dTempD0         DN 28.S16
+dTempF0         DN 29.S16
+
+dTempAcc0       DN 0.U16
+dTempAcc1       DN 2.U16
+dTempAcc2       DN 4.U16
+dTempAcc3       DN 6.U16
+
+dAcc0           DN 0.U8
+dAcc1           DN 2.U8
+dAcc2           DN 4.U8
+dAcc3           DN 6.U8
+
+qAcc0           QN 0.S32
+qAcc1           QN 1.S32
+qAcc2           QN 2.S32
+qAcc3           QN 3.S32
+
+qTAcc0          QN 0.U16
+qTAcc1          QN 1.U16
+qTAcc2          QN 2.U16
+qTAcc3          QN 3.U16                
+
+qTmp            QN 4.S16
+dTmp            DN 8.S16
+
+        VLD1        qSrcA01, [pSrc], srcStep                 ;// [a0 a1 a2 a3 .. a15]   
+        ADD         r12, pSrc, srcStep, LSL #2
+        VMOV        dTCoeff5, #5
+        VMOV        dTCoeff20, #20
+        VLD1        qSrcF1011, [r12], srcStep
+        VLD1        qSrcB23, [pSrc], srcStep                 ;// [b0 b1 b2 b3 .. b15]
+        
+        VLD1        qSrcG1213, [r12], srcStep
+        VADDL       qTempP01, dSrcA0, dSrcF10           
+        VLD1        qSrcC45, [pSrc], srcStep                 ;// [c0 c1 c2 c3 .. c15]
+        VADDL       qTempP23, dSrcA1, dSrcF11   
+        VLD1        qSrcD67, [pSrc], srcStep
+        VADDL       qTempQ01, dSrcB2, dSrcG12                   
+        VLD1        qSrcE89, [pSrc], srcStep
+        
+        ;//t0
+        VMLAL       qTempP01, dSrcC4, dTCoeff20
+        
+        VLD1        qSrcH1415, [r12], srcStep
+
+        VMLAL       qTempP23, dSrcC5, dTCoeff20
+        
+        VLD1        qSrcI1617, [r12], srcStep                 ;// [i0 i1 i2 i3 .. ]
+        
+        VMLAL       qTempP01, dSrcD6, dTCoeff20
+        VMLAL       qTempQ01, dSrcD6, dTCoeff20
+        VMLSL       qTempP23, dSrcB3, dTCoeff5
+        
+        VADDL       qTempR01, dSrcC4, dSrcH14   
+        
+        VMLSL       qTempP01, dSrcB2, dTCoeff5
+
+        VADDL       qTempQ23, dSrcB3, dSrcG13   
+
+        VMLAL       qTempP23, dSrcD7, dTCoeff20
+        VMLAL       qTempQ01, dSrcE8, dTCoeff20
+
+        VMLSL       qTempP01, dSrcE8, dTCoeff5
+        VMLAL       qTempQ23, dSrcD7, dTCoeff20
+
+        VMLSL       qTempP23, dSrcE9, dTCoeff5
+
+        ;//t1
+
+        VMLAL       qTempR01, dSrcE8, dTCoeff20
+        VMLSL       qTempQ01, dSrcC4, dTCoeff5
+        VMLSL       qTempQ23, dSrcC5, dTCoeff5
+        VADDL       qTempR23, dSrcC5, dSrcH15   
+
+        VMLAL       qTempR01, dSrcF10, dTCoeff20
+        VMLSL       qTempQ01, dSrcF10, dTCoeff5
+        VMLAL       qTempQ23, dSrcE9, dTCoeff20
+        VMLAL       qTempR23, dSrcE9, dTCoeff20
+        VADDL       qTempS01, dSrcD6, dSrcI16   
+
+
+        VMLSL       qTempR01, dSrcD6, dTCoeff5
+        VMLSL       qTempQ23, dSrcF11, dTCoeff5
+        VMLSL       qTempR23, dSrcD7, dTCoeff5
+
+        ;//t2
+        VADDL       qTempS23, dSrcD7, dSrcI17   
+        VMLAL       qTempS01, dSrcF10, dTCoeff20
+        VMLSL       qTempR01, dSrcG12, dTCoeff5
+        VMLSL       qTempR23, dSrcG13, dTCoeff5
+
+        VMLAL       qTempS23, dSrcF11, dTCoeff20
+        VMLAL       qTempS01, dSrcG12, dTCoeff20
+        VEXT        dTempB0, dTempP0, dTempP1, #1
+        VMLAL       qTempR23, dSrcF11, dTCoeff20
+
+
+        ;//t3
+        VMLAL       qTempS23, dSrcG13, dTCoeff20
+        VMLSL       qTempS01, dSrcE8, dTCoeff5
+        VEXT        dTempC0, dTempP0, dTempP1, #2
+        VMOV        dCoeff20, #20
+        VMLSL       qTempS23, dSrcE9, dTCoeff5
+        VMLSL       qTempS01, dSrcH14, dTCoeff5
+        VEXT        dTempF0, dTempP1, dTempP2, #1
+        VEXT        dTempD0, dTempP0, dTempP1, #3
+        VMLSL       qTempS23, dSrcH15, dTCoeff5
+        
+        VADDL       qAcc0, dTempP0, dTempF0
+        VADD        dTempC0, dTempC0, dTempD0
+        ;//h 
+        VMOV        dCoeff5, #5
+        
+        ;// res0
+        VADD        dTempB0, dTempB0, dTempP1
+        VMLAL       qAcc0, dTempC0, dCoeff20
+        VEXT        dTempC0, dTempQ0, dTempQ1, #2
+        VEXT        dTempD0, dTempQ0, dTempQ1, #3
+        VEXT        dTempF0, dTempQ1, dTempQ2, #1
+        VMLSL       qAcc0, dTempB0, dCoeff5
+
+        ;// res1
+        VEXT        dTempB0, dTempQ0, dTempQ1, #1
+        VADDL       qAcc1, dTempQ0, dTempF0
+        VADD        dTempC0, dTempC0, dTempD0
+        VADD        dTempB0, dTempB0, dTempQ1
+        VEXT        dTempD0, dTempR0, dTempR1, #3
+        VMLAL       qAcc1, dTempC0, dCoeff20
+        VEXT        dTempF0, dTempR1, dTempR2, #1
+        VEXT        dTempC0, dTempR0, dTempR1, #2
+        VEXT        dTmp, dTempR0, dTempR1, #1
+        VADDL       qAcc2, dTempR0, dTempF0
+        VMLSL       qAcc1, dTempB0, dCoeff5
+;        VEXT        dTempB0, dTempR0, dTempR1, #1
+        VADD        dTempC0, dTempC0, dTempD0
+        
+        ;// res2
+        VADD        dTempB0, dTmp, dTempR1
+        VEXT        dTempD0, dTempS0, dTempS1, #3
+        VMLAL       qAcc2, dTempC0, dCoeff20
+;        VADD        dTempB0, dTempB0, dTempR1
+        
+        ;// res3
+        VEXT        dTempC0, dTempS0, dTempS1, #2
+        VEXT        dTempF0, dTempS1, dTempS2, #1
+        VADD        dTempC0, dTempC0, dTempD0
+        VEXT        dTmp, dTempS0, dTempS1, #1
+        VADDL       qAcc3, dTempS0, dTempF0
+        VMLSL       qAcc2, dTempB0, dCoeff5
+        VMLAL       qAcc3, dTempC0, dCoeff20
+        VADD        dTmp, dTmp, dTempS1
+        VMLSL       qAcc3, dTmp, dCoeff5
+                
+        VQRSHRUN    dTempAcc0, qAcc0, #10
+        VQRSHRUN    dTempAcc1, qAcc1, #10
+        VQRSHRUN    dTempAcc2, qAcc2, #10
+        VQRSHRUN    dTempAcc3, qAcc3, #10
+
+        VQMOVN      dAcc0, qTAcc0
+        VQMOVN      dAcc1, qTAcc1
+        VQMOVN      dAcc2, qTAcc2
+        VQMOVN      dAcc3, qTAcc3
+        
+        M_END
+    
+    ENDIF
+    
+    
+    
+
+    
+    END
+    
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
new file mode 100755
index 0000000..babe8ad
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
@@ -0,0 +1,228 @@
+;//
+;// 
+;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        M_VARIANTS CortexA8
+        
+        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+
+DEBUG_ON    SETL {FALSE}
+
+    IF CortexA8
+        
+        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11
+
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 2
+dstStep         RN 3
+
+;// Declare Neon registers
+dCoeff5         DN 30.S16
+dCoeff20        DN 31.S16
+
+qSrcA01         QN 11.U8
+qSrcB01         QN 12.U8
+qSrcC01         QN 13.U8
+qSrcD01         QN 14.U8
+
+dSrcA0          DN 22.U8
+dSrcA1          DN 23.U8
+dSrcB0          DN 24.U8
+dSrcB1          DN 25.U8
+dSrcC0          DN 26.U8
+dSrcC1          DN 27.U8
+dSrcD0          DN 28.U8
+dSrcD1          DN 29.U8
+
+dSrcb           DN 12.U8
+dSrce           DN 13.U8
+dSrcf           DN 10.U8
+
+dSrc0c          DN 14.U8
+dSrc1c          DN 16.U8
+dSrc2c          DN 18.U8
+dSrc3c          DN 20.U8
+                   
+dSrc0d          DN 15.U8
+dSrc1d          DN 17.U8
+dSrc2d          DN 19.U8
+dSrc3d          DN 21.U8
+
+qTemp01         QN 4.S16
+qTemp23         QN 6.S16
+dTemp0          DN 8.S16
+dTemp2          DN 12.S16
+
+qRes01          QN 11.S16
+qRes23          QN 12.S16
+qRes45          QN 13.S16
+qRes67          QN 14.S16
+
+dRes0           DN 22.S16
+dRes2           DN 24.S16
+dRes4           DN 26.S16
+dRes6           DN 28.S16
+
+dAcc0           DN 22.U8
+dAcc2           DN 24.U8
+dAcc4           DN 26.U8
+dAcc6           DN 28.U8
+
+dResult0        DN 22.U32
+dResult2        DN 24.U32
+dResult4        DN 26.U32
+dResult6        DN 28.U32
+
+        VLD1        qSrcA01, [pSrc], srcStep    ;// Load A register [a0 a1 a2 a3 ..]
+        ;// One cycle stall
+        VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..]
+        VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..]
+;        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
+        VEXT        dSrc0c, dSrcA0, dSrcA1, #2
+        VEXT        dSrc0d, dSrcA0, dSrcA1, #3
+        VEXT        dSrce, dSrcA0, dSrcA1, #4
+        VADDL       qRes01, dSrcA0, dSrcf       ;// Acc=a+f
+        VADDL       qTemp01, dSrc0c, dSrc0d     ;// c+d                
+        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
+        
+        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
+;        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]           
+        VMLA        dRes0, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
+;        VMLS        dRes0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
+        VMUL        dTemp0, dTemp2, dCoeff5 ;// TeRi
+        
+        VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
+        VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrc1c, dSrcB0, dSrcB1, #2
+        VEXT        dSrc1d, dSrcB0, dSrcB1, #3
+        VEXT        dSrce, dSrcB0, dSrcB1, #4
+        VADDL       qRes23, dSrcB0, dSrcf       ;// Acc=a+f
+
+        VSUB        dRes0, dRes0, dTemp0    ;// TeRi
+
+        VADDL       qTemp01, dSrc1c, dSrc1d     ;// c+d                
+        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
+        
+        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]           
+;        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]  
+        
+        VMLA        dRes2, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
+;        VMLS        dRes2, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
+        VMUL        dTemp0, dTemp2, dCoeff5 ;// TeRi
+
+        VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
+        VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrc2c, dSrcC0, dSrcC1, #2
+        VEXT        dSrc2d, dSrcC0, dSrcC1, #3
+        VEXT        dSrce, dSrcC0, dSrcC1, #4
+        VADDL       qRes45, dSrcC0, dSrcf       ;// Acc=a+f
+        
+        VSUB        dRes2, dRes2, dTemp0  ;// TeRi
+        
+        VADDL       qTemp01, dSrc2c, dSrc2d     ;// c+d                
+        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
+
+        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]  
+
+        VMLA        dRes4, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
+;        VMLS        dRes4, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
+        VMUL        dTemp0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e) TeRi
+        
+
+        VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
+        VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
+        VEXT        dSrc3c, dSrcD0, dSrcD1, #2
+        VEXT        dSrc3d, dSrcD0, dSrcD1, #3
+        VEXT        dSrce, dSrcD0, dSrcD1, #4
+        VADDL       qRes67, dSrcD0, dSrcf       ;// Acc=a+f
+
+        VSUB        dRes4, dRes4, dTemp0 ;// TeRi
+
+        VADDL       qTemp01, dSrc3c, dSrc3d     ;// c+d                
+        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
+        VMLA        dRes6, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
+        VMLS        dRes6, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
+
+        VQRSHRUN    dAcc0, qRes01, #5           ;// Acc = Sat ((Acc + 16) / 32)
+        VQRSHRUN    dAcc2, qRes23, #5           ;// Acc = Sat ((Acc + 16) / 32)
+        VQRSHRUN    dAcc4, qRes45, #5           ;// Acc = Sat ((Acc + 16) / 32)
+        VQRSHRUN    dAcc6, qRes67, #5           ;// Acc = Sat ((Acc + 16) / 32)
+        
+        M_END
+    
+    ENDIF
+
+
+    END
+    
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
new file mode 100755
index 0000000..89c90aa
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
@@ -0,0 +1,134 @@
+;//
+;// 
+;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+       
+        M_VARIANTS CortexA8
+       
+        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+
+    IF CortexA8
+        
+        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11
+
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 2
+dstStep         RN 3
+
+Temp            RN 12
+
+;// Declare Neon registers
+dCoeff5         DN 30.S16
+dCoeff20        DN 31.S16
+
+dSrc0           DN 7.U8
+dSrc1           DN 8.U8
+dSrc2           DN 9.U8
+dSrc3           DN 10.U8
+dSrc4           DN 11.U8
+dSrc5           DN 12.U8
+dSrc6           DN 13.U8
+dSrc7           DN 14.U8
+dSrc8           DN 15.U8
+
+qSumBE01        QN 8.S16
+qSumCD01        QN 9.S16
+dSumBE0         DN 16.S16
+dSumCD0         DN 18.S16
+
+qAcc01          QN 0.S16
+qAcc23          QN 1.S16
+qAcc45          QN 2.S16
+qAcc67          QN 3.S16
+
+dRes0           DN 0.S16
+dRes1           DN 2.S16
+dRes2           DN 4.S16
+dRes3           DN 6.S16
+
+dAcc0           DN 0.U8
+dAcc1           DN 2.U8
+dAcc2           DN 4.U8
+dAcc3           DN 6.U8        
+        
+
+dTmp0           DN 20.S16
+dTmp1           DN 21.S16
+dTmp2           DN 22.S16
+dTmp3           DN 23.S16
+
+
+        VLD1        dSrc0, [pSrc], srcStep     ;// [a0 a1 a2 a3 .. ] 
+        ADD         Temp, pSrc, srcStep, LSL #2
+        VLD1        dSrc1, [pSrc], srcStep     ;// [b0 b1 b2 b3 .. ]
+        ;// One cycle stall
+        VLD1        dSrc5, [Temp], srcStep        
+        ;// One cycle stall
+        VLD1        dSrc2, [pSrc], srcStep     ;// [c0 c1 c2 c3 .. ]
+        VADDL       qAcc01, dSrc0, dSrc5       ;// Acc = a+f
+        VLD1        dSrc3, [pSrc], srcStep
+        ;// One cycle stall
+        VLD1        dSrc6, [Temp], srcStep ;// TeRi
+        
+        VLD1        dSrc4, [pSrc], srcStep
+        VLD1        dSrc7, [Temp], srcStep ;// TeRi
+        VADDL       qSumBE01, dSrc1, dSrc4     ;// b+e
+        VADDL       qSumCD01, dSrc2, dSrc3     ;// c+d        
+        VLD1        dSrc8, [Temp], srcStep ;// TeRi
+        VMLS        dRes0, dSumBE0, dCoeff5    ;// Acc -= 20*(b+e)        
+;        VMLA        dRes0, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+        VMUL        dTmp0, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+        
+;        VLD1        dSrc6, [Temp], srcStep
+        VADDL       qSumBE01, dSrc2, dSrc5     ;// b+e
+        VADDL       qSumCD01, dSrc3, dSrc4     ;// c+d
+        VADDL       qAcc23, dSrc1, dSrc6       ;// Acc = a+f
+        VMLS        dRes1, dSumBE0, dCoeff5    ;// Acc -= 20*(b+e)
+;        VMLA        dRes1, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+        VMUL        dTmp1, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+
+;        VLD1        dSrc7, [Temp], srcStep
+        VADDL       qSumBE01, dSrc3, dSrc6     ;// b+e
+        VADDL       qSumCD01, dSrc4, dSrc5     ;// c+d
+        VADDL       qAcc45, dSrc2, dSrc7       ;// Acc = a+f
+        VMLS        dRes2, dSumBE0, dCoeff5    ;// Acc -= 20*(b+e)        
+;        VMLA        dRes2, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+        VMUL        dTmp2, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+
+;        VLD1        dSrc8, [Temp], srcStep     ;// [i0 i1 i2 i3 .. ]        
+        VADDL       qSumBE01, dSrc4, dSrc7     ;// b+e
+        VADDL       qAcc67, dSrc3, dSrc8       ;// Acc = a+f
+        VADDL       qSumCD01, dSrc5, dSrc6     ;// c+d
+        VMLS        dRes3, dSumBE0, dCoeff5    ;// Acc -= 20*(b+e)        
+        VADD        dRes0, dRes0, dTmp0
+        VADD        dRes1, dRes1, dTmp1
+        VADD        dRes2, dRes2, dTmp2
+        VMLA        dRes3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+;        VMUL        dTmp3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
+;        VADD        dRes3, dRes3, dTmp3
+
+        VQRSHRUN    dAcc0, qAcc01, #5        
+        VQRSHRUN    dAcc1, qAcc23, #5        
+        VQRSHRUN    dAcc2, qAcc45, #5        
+        VQRSHRUN    dAcc3, qAcc67, #5        
+
+        M_END
+    
+    ENDIF
+
+    
+    
+    END
+    
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
new file mode 100755
index 0000000..0f0ec78
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
@@ -0,0 +1,318 @@
+;//
+;// 
+;// File Name:  armVCM4P10_Interpolate_Chroma_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   9641
+;// Date:       Thursday, February 7, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+
+    IF CortexA8
+
+    M_TABLE armVCM4P10_WidthBranchTableMVIsNotZero      
+    
+    DCD   WidthIs2MVIsNotZero, WidthIs2MVIsNotZero
+    DCD   WidthIs4MVIsNotZero, WidthIs4MVIsNotZero
+    DCD   WidthIs8MVIsNotZero
+    
+    M_TABLE armVCM4P10_WidthBranchTableMVIsZero      
+    
+    DCD   WidthIs2MVIsZero, WidthIs2MVIsZero
+    DCD   WidthIs4MVIsZero, WidthIs4MVIsZero
+    DCD   WidthIs8MVIsZero
+    
+    
+;// input registers
+
+pSrc                 RN 0
+iSrcStep             RN 1
+pDst                 RN 2
+iDstStep             RN 3
+iWidth               RN 4
+iHeight              RN 5
+dx                   RN 6
+dy                   RN 7
+
+;// local variable registers
+pc                   RN 15
+return               RN 0
+EightMinusdx         RN 8 
+EightMinusdy         RN 9
+
+ACoeff               RN 12
+BCoeff               RN 9
+CCoeff               RN 8
+DCoeff               RN 6
+
+pTable               RN 11
+
+Step1                RN 10
+SrcStepMinus1        RN 14
+
+dACoeff              DN D12.U8
+dBCoeff              DN D13.U8
+dCCoeff              DN D14.U8
+dDCoeff              DN D15.U8
+
+dRow0a               DN D0.U8
+dRow0b               DN D1.U8
+dRow1a               DN D2.U8
+dRow1b               DN D3.U8
+
+qRow0a               QN Q2.S16
+qRow0b               QN Q3.S16
+
+;//dIndex               DN    D16.U8                 
+qRow1a               QN Q11.S16
+qRow1b               QN Q12.S16
+
+dRow2a               DN D16.U8
+dRow2b               DN D17.U8
+dRow3a               DN D18.U8
+dRow3b               DN D19.U8
+
+qOutRow2             QN Q11.U16
+qOutRow3             QN Q12.U16
+dOutRow2             DN D20.U8
+dOutRow3             DN D21.U8
+dOutRow2U64          DN D20.U64
+dOutRow3U64          DN D21.U64
+
+qOutRow0             QN Q2.U16
+qOutRow1             QN Q3.U16
+dOutRow0             DN D8.U8
+dOutRow1             DN D9.U8
+
+dOutRow0U64          DN D8.U64
+dOutRow1U64          DN D9.U64
+
+dOutRow0U32          DN D8.U32
+dOutRow1U32          DN D9.U32
+
+dOutRow0U16          DN D8.U16
+dOutRow1U16          DN D9.U16
+
+
+dOut0U64             DN D0.U64
+dOut1U64             DN D1.U64
+
+dOut00U32            DN D0.U32
+dOut01U32            DN D1.U32
+dOut10U32            DN D2.U32
+dOut11U32            DN D3.U32
+
+dOut0U16             DN D0.U16
+dOut1U16             DN D1.U16
+
+;//-----------------------------------------------------------------------------------------------
+;// armVCM4P10_Interpolate_Chroma_asm starts
+;//-----------------------------------------------------------------------------------------------
+        
+        ;// Write function header
+        M_START armVCM4P10_Interpolate_Chroma, r11, d15
+        
+        ;// Define stack arguments
+        M_ARG   Width,      4
+        M_ARG   Height,     4
+        M_ARG   Dx,         4
+        M_ARG   Dy,         4
+        
+        ;// Load argument from the stack
+        ;// M_STALL ARM1136JS=4
+        
+        M_LDRD   dx, dy, Dx
+        M_LDRD   iWidth, iHeight, Width
+        
+        ;// EightMinusdx = 8 - dx
+        ;// EightMinusdy = 8 - dy
+        
+        ;// ACoeff = EightMinusdx * EightMinusdy
+        ;// BCoeff = dx * EightMinusdy
+        ;// CCoeff = EightMinusdx * dy
+        ;// DCoeff = dx * dy
+        
+        RSB     EightMinusdx, dx, #8 
+        RSB     EightMinusdy, dy, #8
+        CMN     dx,dy
+        MOV     Step1, #1
+        LDREQ   pTable, =armVCM4P10_WidthBranchTableMVIsZero
+        SUB     SrcStepMinus1, iSrcStep, Step1
+        LDRNE   pTable, =armVCM4P10_WidthBranchTableMVIsNotZero
+        
+        VLD1    dRow0a, [pSrc], Step1                   ;// 0a
+        
+        SMULBB  ACoeff, EightMinusdx, EightMinusdy
+        SMULBB  BCoeff, dx, EightMinusdy
+        VLD1    dRow0b, [pSrc], SrcStepMinus1           ;// 0b
+        SMULBB  CCoeff, EightMinusdx, dy
+        SMULBB  DCoeff, dx, dy
+        
+        VDUP    dACoeff, ACoeff
+        VDUP    dBCoeff, BCoeff
+        VDUP    dCCoeff, CCoeff
+        VDUP    dDCoeff, DCoeff
+        
+        LDR     pc, [pTable, iWidth, LSL #1]      ;// Branch to the case based on iWidth
+        
+;// Pixel layout:
+;//
+;//   x00 x01 x02
+;//   x10 x11 x12
+;//   x20 x21 x22
+
+;// If fractionl mv is not (0, 0)
+WidthIs8MVIsNotZero
+
+                VLD1   dRow1a, [pSrc], Step1            ;// 1a
+                VMULL  qRow0a, dRow0a, dACoeff
+                VLD1   dRow1b, [pSrc], SrcStepMinus1    ;// 1b
+                VMULL  qRow0b, dRow1a, dACoeff
+                VLD1   dRow2a, [pSrc], Step1            ;// 2a
+                VMLAL  qRow0a, dRow0b, dBCoeff
+                VLD1   dRow2b, [pSrc], SrcStepMinus1    ;// 2b
+                VMULL  qRow1a, dRow2a, dACoeff
+                VMLAL  qRow0b, dRow1b, dBCoeff
+                VLD1   dRow3a, [pSrc], Step1            ;// 3a
+                VMLAL  qRow0a, dRow1a, dCCoeff
+                VMLAL  qRow1a, dRow2b, dBCoeff
+                VMULL  qRow1b, dRow3a, dACoeff
+                VLD1   dRow3b, [pSrc], SrcStepMinus1    ;// 3b
+                VMLAL  qRow0b, dRow2a, dCCoeff
+                VLD1   dRow0a, [pSrc], Step1            ;// 0a
+                VMLAL  qRow1b, dRow3b, dBCoeff
+                VMLAL  qRow1a, dRow3a, dCCoeff
+                VMLAL  qRow0a, dRow1b, dDCoeff
+                VLD1   dRow0b, [pSrc], SrcStepMinus1    ;// 0b
+                VMLAL  qRow1b, dRow0a, dCCoeff
+                VMLAL  qRow0b, dRow2b, dDCoeff
+                VMLAL  qRow1a, dRow3b, dDCoeff
+                
+                
+                SUBS   iHeight, iHeight, #4
+                VMLAL  qRow1b, dRow0b, dDCoeff
+
+                VQRSHRN dOutRow0, qOutRow0, #6
+                VQRSHRN dOutRow1, qOutRow1, #6
+                VQRSHRN dOutRow2, qOutRow2, #6
+                VST1   dOutRow0U64, [pDst], iDstStep
+                VQRSHRN dOutRow3, qOutRow3, #6
+                
+                VST1   dOutRow1U64, [pDst], iDstStep  
+                VST1   dOutRow2U64, [pDst], iDstStep
+                VST1   dOutRow3U64, [pDst], iDstStep  
+                
+
+                BGT     WidthIs8MVIsNotZero
+                MOV     return,  #OMX_Sts_NoErr
+                M_EXIT
+
+WidthIs4MVIsNotZero
+
+                VLD1   dRow1a, [pSrc], Step1
+                VMULL  qRow0a, dRow0a, dACoeff
+                VMULL  qRow0b, dRow1a, dACoeff
+                VLD1   dRow1b, [pSrc], SrcStepMinus1
+                VMLAL  qRow0a, dRow0b, dBCoeff
+                VMLAL  qRow0b, dRow1b, dBCoeff
+                VLD1   dRow0a, [pSrc], Step1
+                VMLAL  qRow0a, dRow1a, dCCoeff
+                VMLAL  qRow0b, dRow0a, dCCoeff
+                VLD1   dRow0b, [pSrc], SrcStepMinus1
+                SUBS   iHeight, iHeight, #2
+                VMLAL  qRow0b, dRow0b, dDCoeff
+                VMLAL  qRow0a, dRow1b, dDCoeff
+                
+                VQRSHRN dOutRow1, qOutRow1, #6
+                VQRSHRN dOutRow0, qOutRow0, #6
+                
+                VST1   dOutRow0U32[0], [pDst], iDstStep
+                VST1   dOutRow1U32[0], [pDst], iDstStep  
+                
+                BGT     WidthIs4MVIsNotZero
+                MOV     return,  #OMX_Sts_NoErr
+                M_EXIT
+
+WidthIs2MVIsNotZero
+
+                VLD1   dRow1a, [pSrc], Step1
+                VMULL  qRow0a, dRow0a, dACoeff
+                VMULL  qRow0b, dRow1a, dACoeff
+                VLD1   dRow1b, [pSrc], SrcStepMinus1
+                VMLAL  qRow0a, dRow0b, dBCoeff
+                VMLAL  qRow0b, dRow1b, dBCoeff
+                VLD1   dRow0a, [pSrc], Step1
+                VMLAL  qRow0a, dRow1a, dCCoeff
+                VMLAL  qRow0b, dRow0a, dCCoeff
+                VLD1   dRow0b, [pSrc], SrcStepMinus1
+                SUBS   iHeight, iHeight, #2
+                VMLAL  qRow0b, dRow0b, dDCoeff
+                VMLAL  qRow0a, dRow1b, dDCoeff
+                
+                VQRSHRN dOutRow1, qOutRow1, #6
+                VQRSHRN dOutRow0, qOutRow0, #6
+                
+                VST1   dOutRow0U16[0], [pDst], iDstStep
+                VST1   dOutRow1U16[0], [pDst], iDstStep  
+
+                BGT     WidthIs2MVIsNotZero 
+                MOV     return,  #OMX_Sts_NoErr
+                M_EXIT                
+                
+;// If fractionl mv is (0, 0)
+WidthIs8MVIsZero
+                SUB     pSrc, pSrc, iSrcStep
+
+WidthIs8LoopMVIsZero
+                VLD1    dRow0a, [pSrc], iSrcStep
+                SUBS    iHeight, iHeight, #2
+                VLD1    dRow0b, [pSrc], iSrcStep
+                VST1    dOut0U64, [pDst], iDstStep
+                VST1    dOut1U64, [pDst], iDstStep
+                BGT     WidthIs8LoopMVIsZero
+
+                MOV     return,  #OMX_Sts_NoErr
+                M_EXIT
+
+WidthIs4MVIsZero                
+                VLD1    dRow0b, [pSrc], iSrcStep
+                
+                SUBS    iHeight, iHeight, #2
+                
+                VST1    dOut00U32[0], [pDst], iDstStep
+                VLD1    dRow0a, [pSrc], iSrcStep
+                VST1    dOut01U32[0], [pDst], iDstStep
+                
+                BGT     WidthIs4MVIsZero 
+                MOV     return,  #OMX_Sts_NoErr
+                M_EXIT
+                
+WidthIs2MVIsZero                
+                VLD1    dRow0b, [pSrc], iSrcStep
+                SUBS    iHeight, iHeight, #2
+                
+                VST1    dOut0U16[0], [pDst], iDstStep
+                VLD1    dRow0a, [pSrc], iSrcStep
+                VST1    dOut1U16[0], [pDst], iDstStep
+                
+                BGT     WidthIs2MVIsZero 
+                MOV     return,  #OMX_Sts_NoErr                                
+                M_END
+                    
+        ENDIF ;// CortexA8
+        
+        END
+
+;//-----------------------------------------------------------------------------------------------
+;// armVCM4P10_Interpolate_Chroma_asm ends
+;//-----------------------------------------------------------------------------------------------
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s
new file mode 100755
index 0000000..7e2642b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s
@@ -0,0 +1,74 @@
+;//
+;// 
+;// File Name:  armVCM4P10_QuantTables_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;// Description:
+;// This file contains quantization tables
+;// 
+;// 
+
+         INCLUDE omxtypes_s.h
+         INCLUDE armCOMM_s.h
+     
+         
+         EXPORT armVCM4P10_MFMatrixQPModTable
+         EXPORT armVCM4P10_QPDivIntraTable
+         EXPORT armVCM4P10_QPDivPlusOneTable  
+         
+;//--------------------------------------------------------------
+;// This table contains armVCM4P10_MFMatrix [iQP % 6][0] entires,
+;// for values of iQP from 0 to 51 (inclusive). 
+;//--------------------------------------------------------------
+
+         M_TABLE armVCM4P10_MFMatrixQPModTable
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         DCW 13107, 11916, 10082, 9362, 8192, 7282
+         
+;//---------------------------------------------------------------
+;// This table contains ARM_M4P10_Q_OFFSET + 1 + (iQP / 6) values,
+;// for values of iQP from 0 to 51 (inclusive). 
+;//---------------------------------------------------------------
+
+         M_TABLE armVCM4P10_QPDivPlusOneTable
+         DCB 16, 16, 16, 16, 16, 16
+         DCB 17, 17, 17, 17, 17, 17
+         DCB 18, 18, 18, 18, 18, 18
+         DCB 19, 19, 19, 19, 19, 19
+         DCB 20, 20, 20, 20, 20, 20
+         DCB 21, 21, 21, 21, 21, 21
+         DCB 22, 22, 22, 22, 22, 22
+         DCB 23, 23, 23, 23, 23, 23
+         DCB 24, 24, 24, 24, 24, 24
+
+;//------------------------------------------------------------------
+;// This table contains (1 << QbitsPlusOne) / 3 Values (Intra case) ,
+;// for values of iQP from 0 to 51 (inclusive). 
+;//------------------------------------------------------------------
+    
+         M_TABLE armVCM4P10_QPDivIntraTable, 2
+         DCD 21845, 21845, 21845, 21845, 21845, 21845
+         DCD 43690, 43690, 43690, 43690, 43690, 43690
+         DCD 87381, 87381, 87381, 87381, 87381, 87381
+         DCD 174762, 174762, 174762, 174762, 174762, 174762
+         DCD 349525, 349525, 349525, 349525, 349525, 349525
+         DCD 699050, 699050, 699050, 699050, 699050, 699050
+         DCD 1398101, 1398101, 1398101, 1398101, 1398101, 1398101
+         DCD 2796202, 2796202, 2796202, 2796202, 2796202, 2796202
+         DCD 5592405, 5592405, 5592405, 5592405, 5592405, 5592405                
+         
+         
+         END
+         
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
new file mode 100755
index 0000000..ee9c339
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
@@ -0,0 +1,186 @@
+;//
+;// 
+;// File Name:  armVCM4P10_TransformResidual4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// Transform Residual 4x4 Coefficients
+;// 
+;// 
+
+        
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        
+        
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+    
+    
+    
+
+
+
+
+
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+
+;// ARM Registers
+    
+;//Input Registers
+pDst                RN  0
+pSrc                RN  1
+
+
+;// Neon Registers
+      
+;// Packed Input pixels
+dIn0                DN  D0.S16       
+dIn1                DN  D1.S16       
+dIn2                DN  D2.S16       
+dIn3                DN  D3.S16
+
+;// Intermediate calculations       
+dZero               DN  D4.S16
+de0                 DN  D5.S16
+de1                 DN  D6.S16
+de2                 DN  D7.S16
+de3                 DN  D8.S16
+dIn1RS              DN  D7.S16
+dIn3RS              DN  D8.S16
+df0                 DN  D0.S16
+df1                 DN  D1.S16
+df2                 DN  D2.S16
+df3                 DN  D3.S16
+qf01                QN  Q0.32
+qf23                QN  Q1.32
+dg0                 DN  D5.S16
+dg1                 DN  D6.S16
+dg2                 DN  D7.S16
+dg3                 DN  D8.S16
+df1RS               DN  D7.S16
+df3RS               DN  D8.S16
+
+;// Output pixels
+dh0                 DN  D0.S16
+dh1                 DN  D1.S16
+dh2                 DN  D2.S16
+dh3                 DN  D3.S16
+
+       
+    ;// Allocate stack memory required by the function
+        
+
+    ;// Write function header
+        M_START armVCM4P10_TransformResidual4x4, ,d8
+        
+        ;******************************************************************
+        ;// The strategy used in implementing the transform is as follows:*
+        ;// Load the 4x4 block into 8 registers                           *  
+        ;// Transpose the 4x4 matrix                                      *  
+        ;// Perform the row operations (on columns) using SIMD            *  
+        ;// Transpose the 4x4 result matrix                               *  
+        ;// Perform the coloumn operations                                *
+        ;// Store the 4x4 block at one go                                 *  
+        ;******************************************************************
+
+        ;// Load all the 4x4 pixels in transposed form
+        
+        VLD4    {dIn0,dIn1,dIn2,dIn3},[pSrc]
+        
+        VMOV    dZero,#0                                    ;// Used to right shift by 1 
+        
+        
+        ;**************************************** 
+        ;// Row Operations (Performed on columns)
+        ;**************************************** 
+        
+        
+        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2 
+        VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2 
+        VHADD       dIn1RS,dIn1,dZero                   ;// (f1>>1) constZero is a register holding 0
+        VHADD       dIn3RS,dIn3,dZero
+        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3 
+        VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1) 
+        VADD        df0,de0,de3                         ;//  f0 = e0 + e3
+        VADD        df1,de1,de2                            ;//  f1 = e1 + e2
+        VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
+        VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
+        
+        
+        
+        ;*****************************************************************
+        ;// Transpose the resultant matrix
+        ;*****************************************************************
+        
+        VTRN    df0,df1
+        VTRN    df2,df3
+        VTRN    qf01,qf23 
+        
+        
+        ;******************************* 
+        ;// Coloumn Operations 
+        ;******************************* 
+        
+        
+        VADD        dg0,df0,df2                         ;//  e0 = d0 + d2 
+        VSUB        dg1,df0,df2                            ;//  e1 = d0 - d2 
+        VHADD       df1RS,df1,dZero                     ;// (f1>>1) constZero is a register holding 0
+        VHADD       df3RS,df3,dZero
+        VSUB        dg2,df1RS,df3                       ;//  e2 = (d1>>1) - d3 
+        VADD        dg3,df1,df3RS                        ;//  e3 = d1 + (d3>>1) 
+        VADD        dh0,dg0,dg3                         ;//  f0 = e0 + e3
+        VADD        dh1,dg1,dg2                            ;//  f1 = e1 + e2
+        VSUB        dh2,dg1,dg2                            ;//  f2 = e1 - e2
+        VSUB        dh3,dg0,dg3                            ;//  f3 = e0 - e3
+        
+             
+        ;************************************************
+        ;// Calculate final value (colOp[i][j] + 32)>>6
+        ;************************************************
+        
+        VRSHR       dh0,#6
+        VRSHR       dh1,#6
+        VRSHR       dh2,#6
+        VRSHR       dh3,#6
+        
+                
+        ;***************************
+        ;// Store all the 4x4 pixels
+        ;***************************
+        
+        VST1   {dh0,dh1,dh2,dh3},[pDst]
+            
+        
+        ;// Set return value
+        
+End                
+
+        
+        ;// Write function tail
+        M_END
+        
+    ENDIF                                                           ;//CortexA8            
+            
+    END
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
new file mode 100755
index 0000000..4c52e22
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
@@ -0,0 +1,92 @@
+;//
+;// 
+;// File Name:  armVCM4P10_UnpackBlock4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+;// Define the processor variants supported by this file
+
+        M_VARIANTS ARM1136JS
+        
+                       
+        IF ARM1136JS
+        
+;//--------------------------------------
+;// Input Arguments and their scope/usage
+;//--------------------------------------
+ppSrc           RN 0    ;// Persistent variable
+pDst            RN 1    ;// Persistent variable
+
+;//--------------------------------
+;// Variables and their scope/usage
+;//--------------------------------
+pSrc            RN 2    ;// Persistent variables
+Flag            RN 3    
+Value           RN 4    
+Value2          RN 5    
+strOffset       RN 6    
+cstOffset       RN 7    
+
+        
+        M_START armVCM4P10_UnpackBlock4x4, r7
+        
+        LDR     pSrc, [ppSrc]                       ;// Load pSrc
+        MOV     cstOffset, #31                      ;// To be used in the loop, to compute offset
+        
+        ;//-----------------------------------------------------------------------
+        ; Firstly, fill all the coefficient values on the <pDst> buffer by zero
+        ;//-----------------------------------------------------------------------
+        
+        MOV      Value,  #0                         ;// Initialize the zero value
+        MOV      Value2, #0                         ;// Initialize the zero value
+        LDRB     Flag,  [pSrc], #1                  ;// Preload <Flag> before <unpackLoop>
+        
+        STRD     Value, [pDst, #0]                  ;// pDst[0]  = pDst[1]  = pDst[2]  = pDst[3]  = 0
+        STRD     Value, [pDst, #8]                  ;// pDst[4]  = pDst[5]  = pDst[6]  = pDst[7]  = 0
+        STRD     Value, [pDst, #16]                 ;// pDst[8]  = pDst[9]  = pDst[10] = pDst[11] = 0
+        STRD     Value, [pDst, #24]                 ;// pDst[12] = pDst[13] = pDst[14] = pDst[15] = 0
+        
+        ;//----------------------------------------------------------------------------
+        ;// The loop below parses and unpacks the input stream. The C-model has 
+        ;// a somewhat complicated logic for sign extension.  But in the v6 version,
+        ;// that can be easily taken care by loading the data from <pSrc> stream as 
+        ;// SIGNED byte/halfword. So, based on the first TST instruction, 8-bits or 
+        ;// 16-bits are read.
+        ;//
+        ;// Next, to compute the offset, where the unpacked value needs to be stored,
+        ;// we modify the computation to perform [(Flag & 15) < 1] as [(Flag < 1) & 31]
+        ;// This results in a saving of one cycle.
+        ;//----------------------------------------------------------------------------
+        
+unpackLoop
+        TST      Flag,  #0x10                        ;// Computing (Flag & 0x10)
+        LDRSBNE  Value2,[pSrc,#1]                    ;// Load byte wise to avoid unaligned access   
+        LDRBNE   Value, [pSrc], #2                   
+        AND      strOffset, cstOffset, Flag, LSL #1  ;// strOffset = (Flag & 15) < 1;
+        LDRSBEQ  Value, [pSrc], #1                   ;// Value = (OMX_U8)  *pSrc++
+        ORRNE    Value,Value,Value2, LSL #8          ;// Value = (OMX_U16) *pSrc++
+        
+        TST      Flag,  #0x20                        ;// Computing (Flag & 0x20) to check, if we're done
+        LDRBEQ   Flag,  [pSrc], #1                   ;// Flag  = (OMX_U8) *pSrc++, for next iteration
+        STRH     Value, [pDst, strOffset]            ;// Store <Value> at offset <strOffset>
+        BEQ      unpackLoop                          ;// Branch to the loop beginning
+        
+        STR      pSrc, [ppSrc]                       ;// Update the bitstream pointer
+        M_END
+    
+    ENDIF
+    
+    
+    
+    END
+    
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
new file mode 100755
index 0000000..40d4d5e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
@@ -0,0 +1,88 @@
+/* ----------------------------------------------------------------
+ *
+ * 
+ * File Name:  omxVCM4P10_DeblockChroma_I.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * H.264 intra chroma deblock
+ * 
+ */
+ 
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/**
+ * Function: omxVCM4P10_DeblockChroma_I
+ *
+ * Description:
+ * Performs deblocking filtering on all edges of the chroma macroblock (16x16).
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	pSrcDst         pointer to the input macroblock. Must be 8-byte aligned.
+ * [in]	srcdstStep      Step of the arrays
+ * [in]	pAlpha          pointer to a 2x2 array of alpha thresholds, organized as follows: { external
+ *                          vertical edge, internal  vertical edge, external
+ *                         horizontal edge, internal horizontal edge }
+ * [in]	pBeta			pointer to a 2x2 array of beta thresholds, organized as follows: { external
+ *                              vertical edge, internal vertical edge, external  horizontal edge,
+ *                              internal  horizontal edge }
+ * [in]	pThresholds		AArray of size  8x2 of Thresholds (TC0) (values for the left or
+ *                               above edge of each 4x2 or 2x4 block, arranged in  vertical block order
+ *                               and then in  horizontal block order)
+ * [in]	pBS				array of size 16x2 of BS parameters (arranged in scan block order for vertical edges and then horizontal edges);
+ *                         valid in the range [0,4] with the following restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^1]== 4.  Must be 4-byte aligned.
+ * [out]	pSrcDst		pointer to filtered output macroblock
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ *   - Either of the pointers in pSrcDst, pAlpha, pBeta, pTresholds, or pBS is NULL.
+ *   - pSrcDst is not 8-byte aligned.
+ *   - either pThresholds or pBS is not 4-byte aligned.
+ *   - pBS is out of range, i.e., one of the following conditions is true: pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && pBS[i^1]!=4) for 0<=i<=3.
+ *   - srcdstStep is not a multiple of 8.
+ *
+ */
+OMXResult omxVCM4P10_DeblockChroma_I(
+	OMX_U8* pSrcDst, 
+	OMX_S32 srcdstStep, 
+	const OMX_U8* pAlpha, 
+	const OMX_U8* pBeta, 
+	const OMX_U8* pThresholds,
+    const OMX_U8 *pBS
+)
+{
+    OMXResult errorCode;
+    
+    armRetArgErrIf(pSrcDst == NULL,                 OMX_Sts_BadArgErr);
+    armRetArgErrIf(armNot8ByteAligned(pSrcDst),     OMX_Sts_BadArgErr);
+    armRetArgErrIf(srcdstStep & 7,                  OMX_Sts_BadArgErr);
+    armRetArgErrIf(pAlpha == NULL,                  OMX_Sts_BadArgErr);
+    armRetArgErrIf(pBeta == NULL,                   OMX_Sts_BadArgErr);
+    armRetArgErrIf(pThresholds == NULL,             OMX_Sts_BadArgErr);
+    armRetArgErrIf(armNot4ByteAligned(pThresholds), OMX_Sts_BadArgErr);
+    armRetArgErrIf(pBS == NULL,                     OMX_Sts_BadArgErr);
+    armRetArgErrIf(armNot4ByteAligned(pBS),         OMX_Sts_BadArgErr);
+
+    errorCode = omxVCM4P10_FilterDeblockingChroma_VerEdge_I(
+        pSrcDst, srcdstStep, pAlpha, pBeta, pThresholds, pBS);
+
+    armRetArgErrIf(errorCode != OMX_Sts_NoErr, errorCode)
+    
+    errorCode = omxVCM4P10_FilterDeblockingChroma_HorEdge_I(
+        pSrcDst, srcdstStep, pAlpha+2, pBeta+2, pThresholds+8, pBS+16);
+
+    return errorCode;
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
new file mode 100755
index 0000000..619365f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
@@ -0,0 +1,91 @@
+/* ----------------------------------------------------------------
+ *
+ * 
+ * File Name:  omxVCM4P10_DeblockLuma_I.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * H.264 luma deblock
+ * 
+ */
+ 
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+ 
+
+/**
+ * Function: omxVCM4P10_DeblockLuma_I
+ *
+ * Description:
+ * This function performs deblock filtering the horizontal and vertical edges of a luma macroblock
+ *(16x16).
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	pSrcDst         pointer to the input macroblock. Must be 8-byte aligned.
+ * [in]	srcdstStep      image width
+ * [in]	pAlpha          pointer to a 2x2 table of alpha thresholds, organized as follows: { external
+ *                             vertical edge, internal vertical edge, external horizontal
+ *                             edge, internal horizontal edge }
+ * [in]	pBeta			pointer to a 2x2 table of beta thresholds, organized as follows: { external
+ *                              vertical edge, internal vertical edge, external  horizontal edge,
+ *                              internal  horizontal edge }
+ * [in]	pThresholds		pointer to a 16x2 table of threshold (TC0), organized as follows: { values for
+ *                              the  left or above edge of each 4x4 block, arranged in  vertical block order
+ *                              and then in horizontal block order)
+ * [in]	pBS				 pointer to a 16x2 table of BS parameters arranged in scan block order for vertical edges and then horizontal edges;
+ *                               valid in the range [0,4] with the following restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^1]== 4.  Must be 4-byte aligned.
+ * [out]	pSrcDst		pointer to filtered output macroblock.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ *    - Either of the pointers in pSrcDst, pAlpha, pBeta, pTresholds or pBS is NULL.
+ *    - pSrcDst is not 8-byte aligned.
+ *    - srcdstStep is not a multiple of 8
+ *    - pBS is out of range, i.e., one of the following conditions is true: pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && pBS[i^1]!=4) for 0<=i<=3.
+.
+ *
+ */
+
+OMXResult omxVCM4P10_DeblockLuma_I(
+	OMX_U8* pSrcDst, 
+	OMX_S32 srcdstStep, 
+	const OMX_U8* pAlpha, 
+	const OMX_U8* pBeta, 
+	const OMX_U8* pThresholds, 
+	const OMX_U8 *pBS
+)
+{
+    OMXResult errorCode;
+    
+    armRetArgErrIf(pSrcDst == NULL,             OMX_Sts_BadArgErr);
+    armRetArgErrIf(armNot8ByteAligned(pSrcDst), OMX_Sts_BadArgErr);
+    armRetArgErrIf(srcdstStep & 7,              OMX_Sts_BadArgErr);    
+    armRetArgErrIf(pAlpha == NULL,              OMX_Sts_BadArgErr);
+    armRetArgErrIf(pBeta == NULL,               OMX_Sts_BadArgErr);
+    armRetArgErrIf(pThresholds == NULL,         OMX_Sts_BadArgErr);
+    armRetArgErrIf(armNot4ByteAligned(pThresholds), OMX_Sts_BadArgErr);
+    armRetArgErrIf(pBS == NULL,                     OMX_Sts_BadArgErr);
+    armRetArgErrIf(armNot4ByteAligned(pBS),         OMX_Sts_BadArgErr);
+
+    errorCode = omxVCM4P10_FilterDeblockingLuma_VerEdge_I(
+        pSrcDst, srcdstStep, pAlpha, pBeta, pThresholds, pBS);
+
+    armRetArgErrIf(errorCode != OMX_Sts_NoErr, errorCode)
+    
+    errorCode = omxVCM4P10_FilterDeblockingLuma_HorEdge_I(
+        pSrcDst, srcdstStep, pAlpha+2, pBeta+2, pThresholds+16, pBS+16);
+
+    return errorCode;
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
new file mode 100755
index 0000000..4e871bf
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------
+ *
+ * 
+ * File Name:  omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * H.264 decode coefficients module
+ * 
+ */
+ 
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/**
+ * Function: omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC
+ *
+ * Description:
+ * Performs CAVLC decoding and inverse raster scan for 2x2 block of 
+ * ChromaDCLevel. The decoded coefficients in packed position-coefficient 
+ * buffer are stored in increasing raster scan order, namely position order.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		Double pointer to current byte in bit stream
+ *								buffer
+ * [in]	pOffset			Pointer to current bit position in the byte 
+ *								pointed to by *ppBitStream
+ * [out]	ppBitStream		*ppBitStream is updated after each block is decoded
+ * [out]	pOffset			*pOffset is updated after each block is decoded
+ * [out]	pNumCoeff		Pointer to the number of nonzero coefficients
+ *								in this block
+ * [out]	ppPosCoefbuf	Double pointer to destination residual
+ *								coefficient-position pair buffer
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC (
+     const OMX_U8** ppBitStream,
+     OMX_S32* pOffset,
+     OMX_U8* pNumCoeff,
+     OMX_U8** ppPosCoefbuf        
+ )
+
+{
+    return armVCM4P10_DecodeCoeffsToPair(ppBitStream, pOffset, pNumCoeff,
+                                         ppPosCoefbuf, 17, 4);
+
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
new file mode 100755
index 0000000..b29e576
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------
+ *
+ * 
+ * File Name:  omxVCM4P10_DecodeCoeffsToPairCAVLC.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * H.264 decode coefficients module
+ * 
+ */
+ 
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/**
+ * Function: omxVCM4P10_DecodeCoeffsToPairCAVLC
+ *
+ * Description:
+ * Performs CAVLC decoding and inverse zigzag scan for 4x4 block of 
+ * Intra16x16DCLevel, Intra16x16ACLevel,LumaLevel, and ChromaACLevel. 
+ * Inverse field scan is not supported. The decoded coefficients in packed 
+ * position-coefficient buffer are stored in increasing zigzag order instead 
+ * of position order.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		Double pointer to current byte in bit stream buffer
+ * [in]	pOffset			Pointer to current bit position in the byte pointed
+ *								to by *ppBitStream
+ * [in]	sMaxNumCoeff	Maximum number of non-zero coefficients in current
+ *								block
+ * [in]	sVLCSelect		VLC table selector, obtained from number of non-zero
+ *								AC coefficients of above and left 4x4 blocks. It is 
+ *								equivalent to the variable nC described in H.264 standard 
+ *								table 9-5, except its value can��t be less than zero.
+ * [out]	ppBitStream		*ppBitStream is updated after each block is decoded
+ * [out]	pOffset			*pOffset is updated after each block is decoded
+ * [out]	pNumCoeff		Pointer to the number of nonzero coefficients in
+ *								this block
+ * [out]	ppPosCoefbuf	Double pointer to destination residual
+ *								coefficient-position pair buffer
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxVCM4P10_DecodeCoeffsToPairCAVLC(
+     const OMX_U8** ppBitStream,
+     OMX_S32* pOffset,
+     OMX_U8* pNumCoeff,
+     OMX_U8**ppPosCoefbuf,
+     OMX_INT sVLCSelect,
+     OMX_INT sMaxNumCoeff        
+ )
+{
+    return armVCM4P10_DecodeCoeffsToPair(ppBitStream, pOffset, pNumCoeff,
+                                         ppPosCoefbuf, sVLCSelect, sMaxNumCoeff);
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
new file mode 100755
index 0000000..485a488
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
@@ -0,0 +1,396 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;// 
+;// 
+
+        
+
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        IMPORT armVCM4P10_UnpackBlock4x4
+        IMPORT armVCM4P10_TransformResidual4x4
+        IMPORT armVCM4P10_QPDivTable
+        IMPORT armVCM4P10_VMatrixU16
+        IMPORT armVCM4P10_QPModuloTable 
+        
+        M_VARIANTS CortexA8
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_DequantLumaAC4x4
+
+;// Guarding implementation by the processor name
+    
+ 
+
+;// Guarding implementation by the processor name
+    
+
+
+
+
+
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
+    
+;// Guarding implementation by the processor name
+    
+    
+    
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
+    
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+
+;// ARM Registers
+
+;//Input Registers
+ppSrc       RN  0
+pPred       RN  1
+pDC         RN  2
+pDst        RN  3
+   
+
+;//Output Registers
+result      RN  0
+
+;//Local Scratch Registers
+
+;//Registers used in armVCM4P10_DequantLumaAC4x4
+pQPdiv      RN  10
+pQPmod      RN  11
+pVRow       RN  2
+QPmod       RN  12
+shift       RN  14
+index0      RN  1 
+index1      RN  10 
+
+;//Registers used in DequantTransformResidualFromPairAndAdd
+pDelta      RN  4
+pDeltaTmp   RN  6
+AC          RN  5                   ;//Load from stack
+pPredTemp   RN  7
+pDCTemp     RN  8
+pDstTemp    RN  9
+pDeltaArg1  RN  1
+pDeltaArg0  RN  0
+QP          RN  1                   ;//Load from stack
+DCval       RN  10  
+predstep    RN  1
+dstStep     RN  10
+PredVal1    RN  3
+PredVal2    RN  5
+
+
+
+
+;// Neon Registers
+
+;// Registers used in armVCM4P10_DequantLumaAC4x4
+
+dVmatrix            DN  D6.8  
+dindexRow0          DN  D7.32 
+dindexRow1          DN  D9.32 
+dByteIndexRow0      DN  D7.8
+dByteIndexRow1      DN  D9.8
+dVRow0              DN  D8.8  
+dVRow1              DN  D4.8
+dVRow0U16           DN  D8.U16
+dVRow1U16           DN  D4.U16
+dVRow2U16           DN  D8.U16
+dVRow3U16           DN  D4.U16
+
+dShift              DN  D5.U16
+dSrcRow0            DN  D0.I16
+dSrcRow1            DN  D1.I16
+dSrcRow2            DN  D2.I16    
+dSrcRow3            DN  D3.I16
+dDqntRow0           DN  D0.I16  
+dDqntRow1           DN  D1.I16 
+dDqntRow2           DN  D2.I16 
+dDqntRow3           DN  D3.I16  
+
+;// Registers used in TransformResidual4x4
+
+;// Packed Input pixels
+dIn0                DN  D0.S16       
+dIn1                DN  D1.S16       
+dIn2                DN  D2.S16       
+dIn3                DN  D3.S16
+qIn01               QN  Q0.32
+qIn23               QN  Q1.32
+
+;// Intermediate calculations       
+dZero               DN  D4.S16
+de0                 DN  D5.S16
+de1                 DN  D6.S16
+de2                 DN  D7.S16
+de3                 DN  D8.S16
+dIn1RS              DN  D7.S16
+dIn3RS              DN  D8.S16
+df0                 DN  D0.S16
+df1                 DN  D1.S16
+df2                 DN  D2.S16
+df3                 DN  D3.S16
+qf01                QN  Q0.32
+qf23                QN  Q1.32
+dg0                 DN  D5.S16
+dg1                 DN  D6.S16
+dg2                 DN  D7.S16
+dg3                 DN  D8.S16
+df1RS               DN  D7.S16
+df3RS               DN  D8.S16
+
+;// Output pixels
+dh0                 DN  D0.S16
+dh1                 DN  D1.S16
+dh2                 DN  D2.S16
+dh3                 DN  D3.S16 
+
+;// Registers used in DequantTransformResidualFromPairAndAdd
+
+dDeltaRow0          DN  D0.S16
+dDeltaRow1          DN  D1.S16
+dDeltaRow2          DN  D2.S16
+dDeltaRow3          DN  D3.S16
+qDeltaRow01         QN  Q0.S16
+qDeltaRow23         QN  Q1.S16
+
+dPredValRow01       DN  D4.U8
+dPredValRow23       DN  D5.U8
+
+qSumRow01           QN  Q3.S16
+qSumRow23           QN  Q4.S16
+dDstRow01           DN  D0.U8
+dDstRow23           DN  D1.U8
+dDstRow0            DN  D0.32[0]
+dDstRow1            DN  D0.32[1]
+dDstRow2            DN  D1.32[0]
+dDstRow3            DN  D1.32[1]
+    
+           
+    ;// Allocate stack memory required by the function
+        M_ALLOC8 pBuffer, 32
+               
+
+    ;// Write function header
+        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9
+        
+        ;// Define stack arguments
+        M_ARG   predStepOnStack, 4
+        M_ARG   dstStepOnStack,4
+        M_ARG   QPOnStack, 4
+        M_ARG   ACOnStack,4
+  
+        
+        M_ADR   pDelta,pBuffer 
+        M_LDR   AC,ACOnStack 
+        
+         
+        ;// Save registers r1,r2,r3 before function call    
+        MOV     pPredTemp,pPred
+        MOV     pDCTemp,pDC
+        MOV     pDstTemp,pDst
+        
+        CMP     AC,#0
+        BEQ     DCcase
+        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
+    
+        BL      armVCM4P10_UnpackBlock4x4
+    
+        ;//--------------------------------------------------------
+        ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
+        ;//--------------------------------------------------------
+        
+        ;//BL      armVCM4P10_DequantLumaAC4x4
+        M_LDR   QP,QPOnStack                                ;// Set up r1 for armVCM4P10_DequantLumaAC4x4
+                
+        LDR    pQPmod,=armVCM4P10_QPModuloTable
+        LDR    pQPdiv,=armVCM4P10_QPDivTable        
+        LDR    pVRow,=armVCM4P10_VMatrixU16
+        
+        
+        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
+        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
+                
+        LDR    index1,=0x03020504 
+        LDR    index0,=0x05040100                   ;// Indexes into dVmatrix
+        ADD    pVRow,pVRow,QPmod
+        VDUP   dindexRow0,index0 
+        VDUP   dindexRow1,index1
+        VDUP   dShift,shift 
+        
+        ;// Load all 4x4 pVRow[] values
+        VLD1   dVmatrix,[pVRow]                     ;// dVmatrix = [0d|0c|0b|0a]
+        
+        
+        VTBL   dVRow0,dVmatrix,dByteIndexRow0       ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
+        VTBL   dVRow1,dVmatrix,dByteIndexRow1       ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
+        CMP     pDCTemp,#0
+        ;// Load all the 4x4 'src' values  
+        VLD1   { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta] 
+        
+        VSHL   dVRow0U16,dVRow0U16,dShift 
+        VSHL   dVRow1U16,dVRow1U16,dShift 
+        LDRSHNE DCval,[pDCTemp]
+        
+        
+        ;// Multiply src[] with pVRow[]
+        VMUL    dDqntRow0,dSrcRow0,dVRow0U16
+        VMUL    dDqntRow1,dSrcRow1,dVRow1U16
+        VMUL    dDqntRow2,dSrcRow2,dVRow2U16
+        VMUL    dDqntRow3,dSrcRow3,dVRow3U16
+        
+        
+        
+        ;//-------------------------------------------------------------
+        ;// TransformResidual4x4 : Inlined to avoid Load/Stores
+        ;//-------------------------------------------------------------
+        
+        
+        ;//BL      armVCM4P10_TransformResidual4x4
+        ;//STRHNE  DCval,[pDelta]
+        VMOVNE    dIn0[0],DCval
+        
+        
+        
+        ;//*****************************************************************
+        ;// Transpose the input pixels : perform Row ops as Col ops
+        ;//*****************************************************************
+        
+        VTRN    dIn0,dIn1
+        VTRN    dIn2,dIn3
+        VTRN    qIn01,qIn23 
+         
+        
+        VMOV    dZero,#0                                    ;// Used to right shift by 1 
+        
+        
+        ;//**************************************** 
+        ;// Row Operations (Performed on columns)
+        ;//**************************************** 
+        
+        
+        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2 
+        VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2 
+        VHADD       dIn1RS,dIn1,dZero                   ;// (f1>>1) constZero is a register holding 0
+        VHADD       dIn3RS,dIn3,dZero
+        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3 
+        VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1) 
+        VADD        df0,de0,de3                         ;//  f0 = e0 + e3
+        VADD        df1,de1,de2                            ;//  f1 = e1 + e2
+        VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
+        VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
+        
+        
+        
+        ;//*****************************************************************
+        ;// Transpose the resultant matrix
+        ;//*****************************************************************
+        
+        VTRN    df0,df1
+        VTRN    df2,df3
+        VTRN    qf01,qf23 
+        
+        
+        ;//******************************* 
+        ;// Coloumn Operations 
+        ;//******************************* 
+        
+        
+        VADD        dg0,df0,df2                         ;//  e0 = d0 + d2 
+        VSUB        dg1,df0,df2                            ;//  e1 = d0 - d2 
+        VHADD       df1RS,df1,dZero                     ;// (f1>>1) constZero is a register holding 0
+        VHADD       df3RS,df3,dZero
+        VSUB        dg2,df1RS,df3                       ;//  e2 = (d1>>1) - d3 
+        VADD        dg3,df1,df3RS                        ;//  e3 = d1 + (d3>>1) 
+        VADD        dh0,dg0,dg3                         ;//  f0 = e0 + e3
+        VADD        dh1,dg1,dg2                            ;//  f1 = e1 + e2
+        VSUB        dh2,dg1,dg2                            ;//  f2 = e1 - e2
+        VSUB        dh3,dg0,dg3                            ;//  f3 = e0 - e3
+        
+             
+        ;//************************************************
+        ;// Calculate final value (colOp[i][j] + 32)>>6
+        ;//************************************************
+        
+        VRSHR       dh0,#6
+        VRSHR       dh1,#6
+        VRSHR       dh2,#6
+        VRSHR       dh3,#6
+        
+               
+        B       OutDCcase 
+        
+
+DCcase
+        ;// Calculate the Transformed DCvalue : (DCval+32)>>6
+        LDRSH   DCval,[pDCTemp] 
+        ADD     DCval,DCval,#32 
+        ASR     DCval,DCval,#6
+        
+        VDUP    dDeltaRow0, DCval                       ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
+        VDUP    dDeltaRow1, DCval                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
+        VDUP    dDeltaRow2, DCval                        ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
+        VDUP    dDeltaRow3, DCval
+            
+                
+OutDCcase      
+        M_LDR   predstep,predStepOnStack
+        M_LDR   dstStep,dstStepOnStack
+        
+        LDR     PredVal1,[pPredTemp],predstep
+        LDR     PredVal2,[pPredTemp],predstep
+        VMOV    dPredValRow01,PredVal1,PredVal2
+        
+        LDR     PredVal1,[pPredTemp],predstep
+        LDR     PredVal2,[pPredTemp]
+        VMOV    dPredValRow23,PredVal1,PredVal2
+ 
+        
+        VADDW   qSumRow01,qDeltaRow01,dPredValRow01
+        VADDW   qSumRow23,qDeltaRow23,dPredValRow23
+        VQMOVUN dDstRow01,qSumRow01
+        VQMOVUN dDstRow23,qSumRow23
+        
+ 
+        VST1    dDstRow0,[pDstTemp],dstStep
+        VST1    dDstRow1,[pDstTemp],dstStep
+        VST1    dDstRow2,[pDstTemp],dstStep
+        VST1    dDstRow3,[pDstTemp]
+        
+        ;// Set return value
+        MOV     result,#OMX_Sts_NoErr
+        
+End                
+
+        
+        ;// Write function tail
+        
+        M_END
+        
+    ENDIF                                                    ;//CORTEXA8   
+    
+         
+            
+    END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
new file mode 100644
index 0000000..4606197
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
@@ -0,0 +1,202 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        M_VARIANTS CortexA8
+
+        IF CortexA8
+
+        IMPORT  armVCM4P10_DeblockingChromabSGE4_unsafe
+        IMPORT  armVCM4P10_DeblockingChromabSLT4_unsafe
+        
+LOOP_COUNT  EQU 0x40000000
+MASK_3      EQU 0x03030303
+MASK_4      EQU 0x04040404
+
+;// Function arguments
+
+pSrcDst     RN 0
+srcdstStep  RN 1
+pAlpha      RN 2
+pBeta       RN 3
+
+pThresholds RN 5
+pBS         RN 4
+bS3210      RN 6
+
+;// Loop 
+
+XY          RN 7
+
+;// Pixels
+dP_0        DN D4.U8
+dP_1        DN D5.U8  
+dP_2        DN D6.U8  
+dQ_0        DN D8.U8  
+dQ_1        DN D9.U8  
+dQ_2        DN D10.U8 
+
+;// Filtering Decision
+dAlpha      DN D0.U8
+dBeta       DN D2.U8
+
+dFilt       DN D16.U8
+dAqflg      DN D12.U8
+dApflg      DN D17.U8 
+
+dAp0q0      DN D13.U8
+dAp1p0      DN D12.U8
+dAq1q0      DN D18.U8
+dAp2p0      DN D19.U8
+dAq2q0      DN D17.U8
+
+qBS3210     QN Q13.U16
+dBS3210     DN D26
+dMask_bs    DN D27
+dFilt_bs    DN D26.U16
+
+;// bSLT4
+dMask_0     DN D14.U8
+dMask_1     DN D15.U8    
+dMask_4     DN D1.U16
+
+Mask_4      RN 8
+Mask_3      RN 9
+
+dTemp       DN D19.U8
+
+;// Result
+dP_0t       DN D13.U8   
+dQ_0t       DN D31.U8   
+
+dP_0n       DN D29.U8
+dQ_0n       DN D24.U8
+
+        
+        ;// Function header
+        M_START omxVCM4P10_FilterDeblockingChroma_HorEdge_I, r9, d15
+        
+        ;//Arguments on the stack
+        M_ARG   ppThresholds, 4
+        M_ARG   ppBS, 4
+        
+        ;// d0-dAlpha_0
+        ;// d2-dBeta_0
+
+        ;load alpha1,beta1 somewhere to avoid more loads
+        VLD1        {dAlpha[]}, [pAlpha]!
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #1 ;?
+        SUB         pSrcDst, pSrcDst, srcdstStep
+        VLD1        {dBeta[]}, [pBeta]! 
+        
+        M_LDR       pBS, ppBS
+        M_LDR       pThresholds, ppThresholds 
+
+        LDR         Mask_3, =MASK_3
+        LDR         Mask_4, =MASK_4
+
+        VMOV        dMask_0, #0     
+        VMOV        dMask_1, #1     
+        VMOV        dMask_4, #4     
+        
+        LDR         XY, =LOOP_COUNT
+
+        ;// p0-p3 - d4-d7
+        ;// q0-q3 - d8-d11
+LoopY        
+        LDR         bS3210, [pBS], #8
+        
+        VLD1        dP_2, [pSrcDst], srcdstStep
+        ;1
+        VLD1        dP_1, [pSrcDst], srcdstStep
+        CMP         bS3210, #0
+        VLD1        dP_0, [pSrcDst], srcdstStep
+        ;1
+        VLD1        dQ_0, [pSrcDst], srcdstStep
+        VABD        dAp2p0, dP_2, dP_0
+        VLD1        dQ_1, [pSrcDst], srcdstStep
+        VABD        dAp0q0, dP_0, dQ_0
+        VLD1        dQ_2, [pSrcDst], srcdstStep
+        BEQ         NoFilterBS0
+
+        VABD        dAp1p0, dP_1, dP_0
+        VABD        dAq1q0, dQ_1, dQ_0
+
+        VCGT        dFilt, dAlpha, dAp0q0
+        VMOV.U32    dBS3210[0], bS3210
+        VMAX        dAp1p0, dAq1q0, dAp1p0
+        VMOVL       qBS3210, dBS3210.U8
+        VABD        dAq2q0, dQ_2, dQ_0
+        VCGT        dMask_bs.S16, dBS3210.S16, #0
+
+        VCGT        dAp1p0, dBeta, dAp1p0 
+        VCGT        dAp2p0, dBeta, dAp2p0
+        
+        VAND        dFilt, dMask_bs.U8
+
+        TST         bS3210, Mask_3
+
+        VCGT        dAq2q0, dBeta, dAq2q0
+        VAND        dFilt, dFilt, dAp1p0
+
+        VAND        dAqflg, dFilt, dAq2q0
+        VAND        dApflg, dFilt, dAp2p0
+        
+        ;// bS < 4 Filtering
+        BLNE        armVCM4P10_DeblockingChromabSLT4_unsafe
+
+        TST         bS3210, Mask_4        
+
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
+        VTST        dFilt_bs, dFilt_bs, dMask_4
+
+        ;// bS == 4 Filtering
+        BLNE        armVCM4P10_DeblockingChromabSGE4_unsafe
+                    
+        VBIT        dP_0n, dP_0t, dFilt_bs
+        VBIT        dQ_0n, dQ_0t, dFilt_bs
+        
+        VBIF        dP_0n, dP_0, dFilt      
+        VBIF        dQ_0n, dQ_0, dFilt  
+
+        ;// Result Storage
+        VST1        dP_0n, [pSrcDst], srcdstStep
+        ADDS        XY, XY, XY
+        VST1        dQ_0n, [pSrcDst], srcdstStep
+
+        BNE         LoopY        
+        
+        MOV         r0, #OMX_Sts_NoErr
+
+        M_EXIT
+        
+NoFilterBS0
+
+        VLD1        {dAlpha[]}, [pAlpha]
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #1
+        ADDS        XY, XY, XY
+        VLD1        {dBeta[]}, [pBeta]
+        ADD         pThresholds, pThresholds, #4
+        BNE         LoopY        
+
+        MOV         r0, #OMX_Sts_NoErr
+        M_END
+        
+        ENDIF
+        
+
+        END
+        
+        
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
new file mode 100644
index 0000000..18e6c1d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
@@ -0,0 +1,282 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        M_VARIANTS CortexA8
+
+        IF CortexA8
+
+        IMPORT  armVCM4P10_DeblockingChromabSGE4_unsafe
+        IMPORT  armVCM4P10_DeblockingChromabSLT4_unsafe
+        
+LOOP_COUNT  EQU 0x40000000
+MASK_3      EQU 0x03030303
+MASK_4      EQU 0x04040404
+
+;// Function arguments
+
+pSrcDst     RN 0
+srcdstStep  RN 1
+pAlpha      RN 2
+pBeta       RN 3
+
+pThresholds RN 5
+pBS         RN 4
+bS3210      RN 6
+pSrcDst_P   RN 10
+pSrcDst_Q   RN 12
+
+pTmp        RN 10
+pTmp2       RN 12
+step        RN 14
+
+;// Loop 
+
+XY          RN 7
+
+;// Rows input
+dRow0       DN D7.U8
+dRow1       DN D8.U8  
+dRow2       DN D5.U8  
+dRow3       DN D10.U8  
+dRow4       DN D6.U8  
+dRow5       DN D9.U8  
+dRow6       DN D4.U8 
+dRow7       DN D11.U8 
+
+
+;// Pixels
+dP_0        DN D4.U8
+dP_1        DN D5.U8  
+dP_2        DN D6.U8  
+dQ_0        DN D8.U8  
+dQ_1        DN D9.U8  
+dQ_2        DN D10.U8 
+
+;// Filtering Decision
+dAlpha      DN D0.U8
+dBeta       DN D2.U8
+
+dFilt       DN D16.U8
+dAqflg      DN D12.U8
+dApflg      DN D17.U8 
+
+dAp0q0      DN D13.U8
+dAp1p0      DN D12.U8
+dAq1q0      DN D18.U8
+dAp2p0      DN D19.U8
+dAq2q0      DN D17.U8
+
+qBS3210     QN Q13.U16
+dBS3210     DN D26
+dMask_bs    DN D27
+dFilt_bs    DN D26.U16
+
+;// bSLT4
+dMask_0     DN D14.U8
+dMask_1     DN D15.U8    
+dMask_4     DN D1.U16
+
+Mask_4      RN 8
+Mask_3      RN 9
+
+dTemp       DN D19.U8
+
+;// Result
+dP_0t       DN D13.U8   
+dQ_0t       DN D31.U8   
+
+dP_0n       DN D29.U8
+dQ_0n       DN D24.U8
+
+        
+        ;// Function header
+        M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15
+        
+        ;//Arguments on the stack
+        M_ARG   ppThresholds, 4
+        M_ARG   ppBS, 4
+        
+        ;// d0-dAlpha_0
+        ;// d2-dBeta_0
+
+        ;load alpha1,beta1 somewhere to avoid more loads
+        VLD1        {dAlpha[]}, [pAlpha]!
+        SUB         pSrcDst, pSrcDst, #4
+        VLD1        {dBeta[]}, [pBeta]! 
+        
+        M_LDR       pBS, ppBS
+        M_LDR       pThresholds, ppThresholds 
+
+        LDR         Mask_4, =MASK_4
+        LDR         Mask_3, =MASK_3
+
+        ;dMask_0-14
+        ;dMask_1-15
+        ;dMask_4-19
+
+        VMOV        dMask_0, #0     
+        VMOV        dMask_1, #1     
+        VMOV        dMask_4, #4     
+        
+        LDR         XY, =LOOP_COUNT
+
+        ;// p0-p3 - d4-d7
+        ;// q0-q3 - d8-d11
+
+
+LoopY        
+        LDR         bS3210, [pBS], #8
+        ADD         pTmp, pSrcDst, srcdstStep
+        ADD         step, srcdstStep, srcdstStep
+        
+        ;1
+        VLD1        dRow0, [pSrcDst], step
+        ;1
+        VLD1        dRow1, [pTmp], step
+        VLD1        dRow2, [pSrcDst], step
+        VLD1        dRow3, [pTmp], step
+        VLD1        dRow4, [pSrcDst], step
+        VLD1        dRow5, [pTmp], step
+        VLD1        dRow6, [pSrcDst], step
+        VLD1        dRow7, [pTmp], step
+        
+        
+        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
+        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
+        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
+        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
+        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
+        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
+        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
+        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
+
+        ;// 8x8 Transpose
+        VZIP.8      dRow0, dRow1
+        VZIP.8      dRow2, dRow3
+        VZIP.8      dRow4, dRow5
+        VZIP.8      dRow6, dRow7
+
+        VZIP.16     dRow0, dRow2
+        VZIP.16     dRow1, dRow3
+        VZIP.16     dRow4, dRow6
+        VZIP.16     dRow5, dRow7
+
+        VZIP.32     dRow0, dRow4
+        VZIP.32     dRow2, dRow6
+        VZIP.32     dRow3, dRow7
+        VZIP.32     dRow1, dRow5
+
+
+        ;Realign the pointers
+
+        CMP         bS3210, #0
+        VABD        dAp2p0, dP_2, dP_0
+        VABD        dAp0q0, dP_0, dQ_0
+        BEQ         NoFilterBS0
+
+        VABD        dAp1p0, dP_1, dP_0
+        VABD        dAq1q0, dQ_1, dQ_0
+
+        VMOV.U32    dBS3210[0], bS3210
+        VCGT        dFilt, dAlpha, dAp0q0
+        VMAX        dAp1p0, dAq1q0, dAp1p0
+        VMOVL       qBS3210, dBS3210.U8
+        VABD        dAq2q0, dQ_2, dQ_0
+        VCGT        dMask_bs.S16, dBS3210.S16, #0
+
+        VCGT        dAp1p0, dBeta, dAp1p0
+        VCGT        dAp2p0, dBeta, dAp2p0
+        VAND        dFilt, dMask_bs.U8
+
+        TST         bS3210, Mask_3
+
+        VCGT        dAq2q0, dBeta, dAq2q0
+        VAND        dFilt, dFilt, dAp1p0
+
+        VAND        dAqflg, dFilt, dAq2q0
+        VAND        dApflg, dFilt, dAp2p0
+
+        ;// bS < 4 Filtering
+        BLNE        armVCM4P10_DeblockingChromabSLT4_unsafe
+
+        TST         bS3210, Mask_4        
+
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
+        VTST        dFilt_bs, dFilt_bs, dMask_4
+
+        ;// bS == 4 Filtering
+        BLNE        armVCM4P10_DeblockingChromabSGE4_unsafe
+
+        VBIT        dP_0n, dP_0t, dFilt_bs
+        VBIT        dQ_0n, dQ_0t, dFilt_bs
+
+        ;// Result Storage
+        ADD         pSrcDst_P, pSrcDst, #3
+        VBIF        dP_0n, dP_0, dFilt      
+        
+        ADD         pTmp2, pSrcDst_P, srcdstStep
+        ADD         step, srcdstStep, srcdstStep
+        VBIF        dQ_0n, dQ_0, dFilt  
+
+        ADDS        XY, XY, XY
+        
+        VST1        {dP_0n[0]}, [pSrcDst_P], step
+        VST1        {dP_0n[1]}, [pTmp2], step
+        VST1        {dP_0n[2]}, [pSrcDst_P], step
+        VST1        {dP_0n[3]}, [pTmp2], step
+        VST1        {dP_0n[4]}, [pSrcDst_P], step
+        VST1        {dP_0n[5]}, [pTmp2], step
+        VST1        {dP_0n[6]}, [pSrcDst_P], step
+        VST1        {dP_0n[7]}, [pTmp2], step
+        
+        ADD         pSrcDst_Q, pSrcDst, #4
+        ADD         pTmp, pSrcDst_Q, srcdstStep
+        
+        VST1        {dQ_0n[0]}, [pSrcDst_Q], step
+        VST1        {dQ_0n[1]}, [pTmp], step
+        VST1        {dQ_0n[2]}, [pSrcDst_Q], step
+        VST1        {dQ_0n[3]}, [pTmp], step
+        VST1        {dQ_0n[4]}, [pSrcDst_Q], step
+        VST1        {dQ_0n[5]}, [pTmp], step
+        VST1        {dQ_0n[6]}, [pSrcDst_Q], step
+        VST1        {dQ_0n[7]}, [pTmp], step
+        
+        ADD         pSrcDst, pSrcDst, #4
+
+        BNE         LoopY        
+        
+        MOV         r0, #OMX_Sts_NoErr
+        
+        M_EXIT
+        
+NoFilterBS0
+        VLD1        {dAlpha[]}, [pAlpha]
+        ADD         pSrcDst, pSrcDst, #4
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
+        ADDS        XY, XY, XY
+        VLD1        {dBeta[]}, [pBeta]
+        ADD         pThresholds, pThresholds, #4
+        BNE         LoopY        
+
+        MOV         r0, #OMX_Sts_NoErr
+
+        M_END
+        
+        ENDIF
+
+
+        END
+        
+        
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
new file mode 100755
index 0000000..0c3f4f2
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
@@ -0,0 +1,288 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        M_VARIANTS CortexA8
+
+        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
+        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
+
+        IF CortexA8
+
+LOOP_COUNT  EQU 0x55000000
+
+
+;// Function arguments
+
+pSrcDst     RN 0
+srcdstStep  RN 1
+pAlpha      RN 2
+pBeta       RN 3
+
+pThresholds RN 5
+pBS         RN 4
+bS10        RN 12
+
+pAlpha_0    RN 2
+pBeta_0     RN 3
+
+pAlpha_1    RN 7
+pBeta_1     RN 8
+
+
+
+;// Loop 
+
+XY          RN 9
+
+pTmp        RN 6
+step        RN 10
+
+;// Pixels
+dP_0        DN D4.U8
+dP_1        DN D5.U8  
+dP_2        DN D6.U8  
+dP_3        DN D7.U8  
+dQ_0        DN D8.U8  
+dQ_1        DN D9.U8  
+dQ_2        DN D10.U8 
+dQ_3        DN D11.U8 
+
+
+;// Filtering Decision
+dAlpha      DN D0.U8
+dBeta       DN D2.U8
+
+dFilt       DN D16.U8
+dAqflg      DN D12.U8
+dApflg      DN D17.U8 
+
+dAp0q0      DN D13.U8
+dAp1p0      DN D12.U8
+dAq1q0      DN D18.U8
+dAp2p0      DN D19.U8
+dAq2q0      DN D17.U8
+
+;// bSLT4
+dTC0        DN D18.U8   
+dTC1        DN D19.U8   
+dTC01       DN D18.U8   
+
+dTCs        DN D31.S8
+dTC         DN D31.U8
+
+dMask_0     DN D14.U8
+dMask_1     DN D15.U8    
+
+Mask_0      RN 11
+
+dTemp       DN D19.U8
+
+;// Computing P0,Q0
+qDq0p0      QN Q10.S16
+qDp1q1      QN Q11.S16
+qDelta      QN Q10.S16  ; reuse qDq0p0
+dDelta      DN D20.S8
+
+
+;// Computing P1,Q1
+dRp0q0      DN D24.U8
+
+dMaxP       DN D23.U8
+dMinP       DN D22.U8
+
+dMaxQ       DN D19.U8
+dMinQ       DN D21.U8
+
+dDeltaP     DN D26.U8
+dDeltaQ     DN D27.U8
+
+qP_0n       QN Q14.S16
+qQ_0n       QN Q12.S16
+
+dQ_0n       DN D24.U8
+dQ_1n       DN D25.U8
+dP_0n       DN D29.U8
+dP_1n       DN D30.U8
+
+;// bSGE4
+
+qSp0q0      QN Q10.U16
+
+qSp2q1      QN Q11.U16
+qSp0q0p1    QN Q12.U16
+qSp3p2      QN Q13.U16
+dHSp0q1     DN D28.U8
+
+qSq2p1      QN Q11.U16
+qSp0q0q1    QN Q12.U16
+qSq3q2      QN Q13.U16  ;!!
+dHSq0p1     DN D28.U8   ;!!
+
+qTemp1      QN Q11.U16  ;!!;qSp2q1 
+qTemp2      QN Q12.U16  ;!!;qSp0q0p1        
+
+dP_0t       DN D28.U8   ;!!;dHSp0q1        
+dQ_0t       DN D22.U8   ;!!;Temp1        
+
+dP_0n       DN D29.U8
+dP_1n       DN D30.U8
+dP_2n       DN D31.U8
+
+dQ_0n       DN D24.U8   ;!!;Temp2        
+dQ_1n       DN D25.U8   ;!!;Temp2        
+dQ_2n       DN D28.U8   ;!!;dQ_0t        
+
+        
+        ;// Function header
+        M_START omxVCM4P10_FilterDeblockingLuma_HorEdge_I, r11, d15
+        
+        ;//Arguments on the stack
+        M_ARG   ppThresholds, 4
+        M_ARG   ppBS, 4
+        
+        ;// d0-dAlpha_0
+        ;// d2-dBeta_0
+
+        ADD         pAlpha_1, pAlpha_0, #1
+        ADD         pBeta_1, pBeta_0, #1
+        
+        VLD1        {dAlpha[]}, [pAlpha_0]
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
+        VLD1        {dBeta[]}, [pBeta_0] 
+        
+        M_LDR       pBS, ppBS
+        M_LDR       pThresholds, ppThresholds 
+
+        MOV         Mask_0,#0
+
+        ;dMask_0-14
+        ;dMask_1-15
+
+        VMOV        dMask_0, #0     
+        VMOV        dMask_1, #1     
+        
+        ADD         step, srcdstStep, srcdstStep
+
+        LDR         XY,=LOOP_COUNT
+
+        ;// p0-p3 - d4-d7
+        ;// q0-q3 - d8-d11
+LoopY        
+LoopX        
+        LDRH        bS10, [pBS], #2
+        ADD         pTmp, pSrcDst, srcdstStep
+        CMP         bS10, #0
+        BEQ         NoFilterBS0
+
+        VLD1        dP_3, [pSrcDst], step
+        VLD1        dP_2, [pTmp], step
+        VLD1        dP_1, [pSrcDst], step
+        VLD1        dP_0, [pTmp], step
+        VLD1        dQ_0, [pSrcDst], step
+        VABD        dAp1p0, dP_0, dP_1
+        VLD1        dQ_1, [pTmp]
+        VABD        dAp0q0, dQ_0, dP_0
+        VLD1        dQ_2, [pSrcDst], srcdstStep
+        
+        VABD        dAq1q0, dQ_1, dQ_0
+        VABD        dAp2p0, dP_2, dP_0
+        VCGT        dFilt, dAlpha, dAp0q0
+
+        TST         bS10, #0xff
+        VMAX        dAp1p0, dAq1q0, dAp1p0
+        VABD        dAq2q0, dQ_2, dQ_0
+
+        VMOVEQ.U32  dFilt[0], Mask_0
+        TST         bS10, #0xff00
+
+        VCGT        dAp2p0, dBeta, dAp2p0
+        VCGT        dAp1p0, dBeta, dAp1p0
+
+        VMOVEQ.U32  dFilt[1], Mask_0
+
+        VCGT        dAq2q0, dBeta, dAq2q0
+        VLD1        dQ_3, [pSrcDst]
+        VAND        dFilt, dFilt, dAp1p0
+        TST         bS10, #4 
+
+        VAND        dAqflg, dFilt, dAq2q0
+        VAND        dApflg, dFilt, dAp2p0
+    
+        BNE         bSGE4        
+bSLT4
+        ;// bS < 4 Filtering
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
+        SUB         pSrcDst, pSrcDst, srcdstStep
+
+        BL          armVCM4P10_DeblockingLumabSLT4_unsafe
+
+        ;// Result Storage
+        VST1        dP_1n, [pSrcDst], srcdstStep
+        VST1        dP_0n, [pSrcDst], srcdstStep
+        SUB         pTmp, pSrcDst, srcdstStep, LSL #2
+        VST1        dQ_0n, [pSrcDst], srcdstStep
+        ADDS        XY, XY, XY
+        VST1        dQ_1n, [pSrcDst]
+        ADD         pSrcDst, pTmp, #8
+
+        BCC         LoopX
+        B           ExitLoopY        
+
+NoFilterBS0
+        ADD         pSrcDst, pSrcDst, #8
+        ADDS        XY, XY, XY
+        ADD         pThresholds, pThresholds, #2
+        BCC         LoopX
+        B           ExitLoopY        
+bSGE4        
+        ;// bS >= 4 Filtering
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #1
+        BL          armVCM4P10_DeblockingLumabSGE4_unsafe
+
+        ;// Result Storage
+        VST1        dP_2n, [pSrcDst], srcdstStep
+        VST1        dP_1n, [pSrcDst], srcdstStep
+        VST1        dP_0n, [pSrcDst], srcdstStep
+        SUB         pTmp, pSrcDst, srcdstStep, LSL #2
+        VST1        dQ_0n, [pSrcDst], srcdstStep
+        ADDS        XY,XY,XY
+        VST1        dQ_1n, [pSrcDst], srcdstStep
+        ADD         pThresholds, pThresholds, #2
+        VST1        dQ_2n, [pSrcDst]
+        
+        ADD         pSrcDst, pTmp, #8
+        BCC         LoopX
+
+ExitLoopY        
+
+        SUB         pSrcDst, pSrcDst, #16
+        VLD1        {dAlpha[]}, [pAlpha_1]
+        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #2 
+        VLD1        {dBeta[]}, [pBeta_1]
+        BNE         LoopY
+
+        MOV         r0, #OMX_Sts_NoErr
+
+        M_END
+        
+    ENDIF
+    
+
+        
+
+        END
+        
+        
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
new file mode 100755
index 0000000..e6fbb34
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
@@ -0,0 +1,436 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        M_VARIANTS CortexA8
+
+        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
+        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
+        
+        IF CortexA8
+
+LOOP_COUNT  EQU 0x11000000
+
+
+;// Function arguments
+
+pSrcDst     RN 0
+srcdstStep  RN 1
+pAlpha      RN 2
+pBeta       RN 3
+
+pThresholds RN 5
+pBS         RN 4
+bS10        RN 12
+
+pAlpha_0    RN 2
+pBeta_0     RN 3
+
+pAlpha_1    RN 7
+pBeta_1     RN 8
+
+pTmp        RN 10
+pTmpStep    RN 11
+
+;// Loop 
+
+XY          RN 9
+
+;// Rows input
+dRow0       DN D7.U8
+dRow1       DN D8.U8  
+dRow2       DN D5.U8  
+dRow3       DN D10.U8  
+dRow4       DN D6.U8  
+dRow5       DN D9.U8  
+dRow6       DN D4.U8 
+dRow7       DN D11.U8 
+
+;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
+;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
+
+;// Rows output
+dRown0      DN D7.U8
+dRown1      DN D24.U8
+dRown2      DN D30.U8
+dRown3      DN D10.U8
+dRown4      DN D6.U8
+dRown5      DN D25.U8
+dRown6      DN D29.U8
+dRown7      DN D11.U8
+
+;// dP_0n       DN D29.U8
+;// dP_1n       DN D30.U8
+;// dP_2n       DN D31.U8
+;// 
+;// dQ_0n       DN D24.U8   ;!!;Temp2        
+;// dQ_1n       DN D25.U8   ;!!;Temp2        
+;// dQ_2n       DN D28.U8   ;!!;dQ_0t        
+;// 
+;// dRown0 - dP_3,  dRown1 - dQ_0n
+;// dRown2 - dP_1n, dRown3 - dQ_2
+;// dRown4 - dP_2,  dRown5 - dQ_1n
+;// dRown6 - dP_0n, dRown7 - dQ_3
+
+dRow0n      DN D7.U8
+dRow1n      DN D24.U8
+dRow2n      DN D30.U8
+dRow3n      DN D28.U8
+dRow4n      DN D31.U8
+dRow5n      DN D25.U8
+dRow6n      DN D29.U8
+dRow7n      DN D11.U8
+
+;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
+;// dRow4n - dP_2, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
+
+;// Pixels
+dP_0        DN D4.U8
+dP_1        DN D5.U8  
+dP_2        DN D6.U8  
+dP_3        DN D7.U8  
+dQ_0        DN D8.U8  
+dQ_1        DN D9.U8  
+dQ_2        DN D10.U8 
+dQ_3        DN D11.U8 
+
+
+;// Filtering Decision
+dAlpha      DN D0.U8
+dBeta       DN D2.U8
+
+dFilt       DN D16.U8
+dAqflg      DN D12.U8
+dApflg      DN D17.U8 
+
+dAp0q0      DN D13.U8
+dAp1p0      DN D12.U8
+dAq1q0      DN D18.U8
+dAp2p0      DN D19.U8
+dAq2q0      DN D17.U8
+
+;// bSLT4
+dTC0        DN D18.U8   
+dTC1        DN D19.U8   
+dTC01       DN D18.U8   
+
+dTCs        DN D31.S8
+dTC         DN D31.U8
+
+dMask_0     DN D14.U8
+dMask_1     DN D15.U8    
+
+Mask_0      RN 6
+
+dTemp       DN D19.U8
+
+;// Computing P0,Q0
+qDq0p0      QN Q10.S16
+qDp1q1      QN Q11.S16
+qDelta      QN Q10.S16  ; reuse qDq0p0
+dDelta      DN D20.S8
+
+
+;// Computing P1,Q1
+dRp0q0      DN D24.U8
+
+dMaxP       DN D23.U8
+dMinP       DN D22.U8
+
+dMaxQ       DN D19.U8
+dMinQ       DN D21.U8
+
+dDeltaP     DN D26.U8
+dDeltaQ     DN D27.U8
+
+qP_0n       QN Q14.S16
+qQ_0n       QN Q12.S16
+
+dQ_0n       DN D24.U8
+dQ_1n       DN D25.U8
+dP_0n       DN D29.U8
+dP_1n       DN D30.U8
+
+;// bSGE4
+
+qSp0q0      QN Q10.U16
+
+qSp2q1      QN Q11.U16
+qSp0q0p1    QN Q12.U16
+qSp3p2      QN Q13.U16
+dHSp0q1     DN D28.U8
+
+qSq2p1      QN Q11.U16
+qSp0q0q1    QN Q12.U16
+qSq3q2      QN Q13.U16  ;!!
+dHSq0p1     DN D28.U8   ;!!
+
+qTemp1      QN Q11.U16  ;!!;qSp2q1 
+qTemp2      QN Q12.U16  ;!!;qSp0q0p1        
+
+dP_0t       DN D28.U8   ;!!;dHSp0q1        
+dQ_0t       DN D22.U8   ;!!;Temp1        
+
+dP_0n       DN D29.U8
+dP_1n       DN D30.U8
+dP_2n       DN D31.U8
+
+dQ_0n       DN D24.U8   ;!!;Temp2        
+dQ_1n       DN D25.U8   ;!!;Temp2        
+dQ_2n       DN D28.U8   ;!!;dQ_0t        
+
+        
+        ;// Function header
+        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15
+        
+        ;//Arguments on the stack
+        M_ARG   ppThresholds, 4
+        M_ARG   ppBS, 4
+        
+        ;// d0-dAlpha_0
+        ;// d2-dBeta_0
+
+        ADD         pAlpha_1, pAlpha_0, #1
+        ADD         pBeta_1, pBeta_0, #1
+        
+        VLD1        {dAlpha[]}, [pAlpha_0]
+        SUB         pSrcDst, pSrcDst, #4
+        VLD1        {dBeta[]}, [pBeta_0] 
+        
+        M_LDR       pBS, ppBS
+        M_LDR       pThresholds, ppThresholds 
+
+        MOV         Mask_0,#0
+
+        ;dMask_0-14
+        ;dMask_1-15
+
+        VMOV        dMask_0, #0     
+        VMOV        dMask_1, #1     
+
+        LDR         XY,=LOOP_COUNT
+    
+        ADD         pTmpStep, srcdstStep, srcdstStep
+
+        ;// p0-p3 - d4-d7
+        ;// q0-q3 - d8-d11
+LoopY        
+LoopX        
+        LDRH        bS10, [pBS], #4
+
+        CMP         bS10, #0
+        BEQ         NoFilterBS0
+
+        ;// Load 8 rows of data
+        ADD         pTmp, pSrcDst, srcdstStep
+        VLD1        dRow0, [pSrcDst], pTmpStep
+        VLD1        dRow1, [pTmp], pTmpStep
+        VLD1        dRow2, [pSrcDst], pTmpStep
+        VZIP.8      dRow0, dRow1
+        VLD1        dRow3, [pTmp], pTmpStep
+        VLD1        dRow4, [pSrcDst], pTmpStep
+        VZIP.8      dRow2, dRow3
+        VLD1        dRow5, [pTmp], pTmpStep
+        VLD1        dRow6, [pSrcDst], pTmpStep
+        VLD1        dRow7, [pTmp], pTmpStep
+        VZIP.8      dRow4, dRow5
+        VZIP.16     dRow1, dRow3
+    
+
+        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
+        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
+        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
+        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
+        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
+        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
+        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
+        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
+
+        ;// 8x8 Transpose
+
+        VZIP.8      dRow6, dRow7
+
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
+        VZIP.16     dRow0, dRow2
+        VZIP.16     dRow5, dRow7
+        
+
+        VZIP.16     dRow4, dRow6
+        VZIP.32     dRow1, dRow5
+        VZIP.32     dRow2, dRow6
+        VZIP.32     dRow3, dRow7
+        VZIP.32     dRow0, dRow4
+        
+
+        ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
+        ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
+
+        ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
+        ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
+        ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
+        ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]
+
+        ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
+        ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
+        ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
+        ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]
+
+        VABD        dAp0q0, dP_0, dQ_0
+        VABD        dAp1p0, dP_1, dP_0
+
+        VABD        dAq1q0, dQ_1, dQ_0
+        VABD        dAp2p0, dP_2, dP_0
+        
+        TST         bS10, #0xff
+        VCGT        dFilt, dAlpha, dAp0q0
+
+        VMAX        dAp1p0, dAq1q0, dAp1p0
+        VABD        dAq2q0, dQ_2, dQ_0
+
+        VMOVEQ.U32  dFilt[0], Mask_0
+        TST         bS10, #0xff00
+
+        VCGT        dAp2p0, dBeta, dAp2p0
+        VCGT        dAp1p0, dBeta, dAp1p0
+
+        VMOVEQ.U32  dFilt[1], Mask_0
+
+        VCGT        dAq2q0, dBeta, dAq2q0
+        VAND        dFilt, dFilt, dAp1p0
+        TST         bS10, #4 
+
+        VAND        dAqflg, dFilt, dAq2q0
+        VAND        dApflg, dFilt, dAp2p0
+    
+        BNE         bSGE4        
+bSLT4
+        ;// bS < 4 Filtering
+
+        BL          armVCM4P10_DeblockingLumabSLT4_unsafe
+
+        ;// Transpose
+
+        VZIP.8      dP_3,  dP_2  
+        VZIP.8      dP_1n, dP_0n
+        VZIP.8      dQ_0n, dQ_1n
+        VZIP.8      dQ_2,  dQ_3
+
+        
+        VZIP.16     dP_3,  dP_1n
+        ADD         pTmp, pSrcDst, srcdstStep
+        VZIP.16     dQ_0n, dQ_2
+        VZIP.16     dQ_1n, dQ_3
+        VZIP.16     dP_2,  dP_0n
+
+        VZIP.32     dP_3,  dQ_0n
+        VZIP.32     dP_1n, dQ_2
+        VZIP.32     dP_2,  dQ_1n
+        VZIP.32     dP_0n, dQ_3
+
+        ;// dRown0 - dP_3,  dRown1 - dQ_0n
+        ;// dRown2 - dP_1n, dRown3 - dQ_2
+        ;// dRown4 - dP_2,  dRown5 - dQ_1n
+        ;// dRown6 - dP_0n, dRown7 - dQ_3
+
+        VST1        dRown0, [pSrcDst], pTmpStep
+        VST1        dRown1, [pTmp], pTmpStep
+        VST1        dRown2, [pSrcDst], pTmpStep
+        VST1        dRown3, [pTmp], pTmpStep
+        ;1
+        VST1        dRown4, [pSrcDst], pTmpStep
+        VST1        dRown5, [pTmp], pTmpStep
+        ADDS        XY, XY, XY
+        VST1        dRown6, [pSrcDst], pTmpStep
+        ADD         pThresholds, pThresholds, #2
+        VST1        dRown7, [pTmp], srcdstStep
+
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
+        VLD1        {dAlpha[]}, [pAlpha_1]
+        ADD         pSrcDst, pSrcDst, #4
+        VLD1        {dBeta[]}, [pBeta_1]
+
+        BCC         LoopX
+        B           ExitLoopY        
+
+NoFilterBS0
+        ADD         pSrcDst, pSrcDst, #4
+        ADDS        XY, XY, XY
+        VLD1        {dAlpha[]}, [pAlpha_1]
+        ADD         pThresholds, pThresholds, #4
+        VLD1        {dBeta[]}, [pBeta_1]
+        BCC         LoopX
+        B           ExitLoopY        
+bSGE4        
+        ;// bS >= 4 Filtering
+        
+        BL          armVCM4P10_DeblockingLumabSGE4_unsafe
+
+        ;// Transpose
+
+        VZIP.8      dP_3,  dP_2n   
+        VZIP.8      dP_1n, dP_0n
+        VZIP.8      dQ_0n, dQ_1n
+        VZIP.8      dQ_2n, dQ_3
+
+        VZIP.16     dP_3,  dP_1n
+        ADD         pTmp, pSrcDst, srcdstStep
+        VZIP.16     dQ_0n, dQ_2n
+        VZIP.16     dQ_1n, dQ_3
+        VZIP.16     dP_2n, dP_0n
+
+        VZIP.32     dP_3,  dQ_0n
+        VZIP.32     dP_1n, dQ_2n
+        VZIP.32     dP_2n, dQ_1n
+        VZIP.32     dP_0n, dQ_3
+
+        ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
+        ;// dRow4n - dP_2, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
+        
+        VST1        dRow0n, [pSrcDst], pTmpStep
+        VST1        dRow1n, [pTmp], pTmpStep
+        VST1        dRow2n, [pSrcDst], pTmpStep
+        VST1        dRow3n, [pTmp], pTmpStep
+        VST1        dRow4n, [pSrcDst], pTmpStep
+        VST1        dRow5n, [pTmp], pTmpStep
+        ADDS        XY,XY,XY
+        VST1        dRow6n, [pSrcDst], pTmpStep
+        ADD         pThresholds, pThresholds, #4
+        VST1        dRow7n, [pTmp], pTmpStep
+
+        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
+        VLD1        {dAlpha[]}, [pAlpha_1]
+        ADD         pSrcDst, pSrcDst, #4
+        VLD1        {dBeta[]}, [pBeta_1]
+
+        BCC         LoopX
+
+ExitLoopY        
+        SUB         pBS, pBS, #14
+        SUB         pThresholds, pThresholds, #14
+        SUB         pSrcDst, pSrcDst, #16
+        VLD1        {dAlpha[]}, [pAlpha_0]
+        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3 
+        VLD1        {dBeta[]}, [pBeta_0]
+        BNE         LoopY
+
+        MOV         r0, #OMX_Sts_NoErr
+
+        M_END
+        
+    ENDIF
+    
+        
+        END
+        
+        
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
new file mode 100755
index 0000000..3ce41be
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
@@ -0,0 +1,79 @@
+/**
+ * 
+ * File Name:  omxVCM4P10_InterpolateChroma.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description:
+ * This function will calculate 1/8 Pixel interpolation for Chroma Block
+ * 
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armVC.h"
+#include "armCOMM.h"
+
+
+/**
+ * Function: omxVCM4P10_InterpolateChroma,
+ *
+ * Description:
+ * Performs 1/8-pixel interpolation for inter chroma MB.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	pSrc	Pointer to the source reference frame buffer
+ * [in]	srcStep Reference frame step in byte
+ * [in]	dstStep Destination frame step in byte. Must be multiple of roi.width.
+ * [in]	dx		Fractional part of horizontal motion vector component
+ *						in 1/8 pixel unit;valid in the range [0,7]
+ * [in]	dy		Fractional part of vertical motion vector component
+ *						in 1/8 pixel unit;valid in the range [0,7]
+ * [in]	roi		Dimension of the interpolation region;the parameters roi.width and roi.height must
+ *                      be equal to either 2, 4, or 8.
+ * [out]	pDst	Pointer to the destination frame buffer.
+ *                   if roi.width==2,  2-byte alignment required
+ *                   if roi.width==4,  4-byte alignment required
+ *                   if roi.width==8,  8-byte alignment required
+ *
+ * Return Value:
+ * If the function runs without error, it returns OMX_Sts_NoErr.
+ * If one of the following cases occurs, the function returns OMX_Sts_BadArgErr:
+ *	pSrc or pDst is NULL.
+ *	srcStep or dstStep < 8.
+ *	dx or dy is out of range [0-7].
+ *	roi.width or roi.height is out of range {2,4,8}.
+ *	roi.width is equal to 2, but pDst is not 2-byte aligned.
+ *	roi.width is equal to 4, but pDst is not 4-byte aligned.
+ *	roi.width is equal to 8, but pDst is not 8 byte aligned.
+ *	srcStep or dstStep is not a multiple of 8.
+ *
+ */
+
+OMXResult omxVCM4P10_InterpolateChroma (
+     const OMX_U8* pSrc,
+     OMX_S32 srcStep,
+     OMX_U8* pDst,
+     OMX_S32 dstStep,
+     OMX_S32 dx,
+     OMX_S32 dy,
+     OMXSize roi
+ )
+{
+    return armVCM4P10_Interpolate_Chroma 
+        ((OMX_U8*)pSrc, srcStep, pDst, dstStep, roi.width, roi.height, dx, dy);
+}
+
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
new file mode 100755
index 0000000..942ebc6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
@@ -0,0 +1,553 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_InterpolateLuma_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+;// Function:
+;//     omxVCM4P10_InterpolateLuma
+;//
+;// This function implements omxVCM4P10_InterpolateLuma in v6 assembly.
+;// Performs quarter pel interpolation of inter luma MB.
+;// It's assumed that the frame is already padded when calling this function.
+;// Parameters:
+;// [in]    pSrc        Pointer to the source reference frame buffer
+;// [in]    srcStep     Reference frame step in byte
+;// [in]    dstStep     Destination frame step in byte. Must be multiple of roi.width
+;// [in]    dx          Fractional part of horizontal motion vector
+;//                         component in 1/4 pixel unit; valid in the range [0,3]
+;// [in]    dy          Fractional part of vertical motion vector
+;//                         component in 1/4 pixel unit; valid in the range [0,3]
+;// [in]    roi         Dimension of the interpolation region;the parameters roi.width and roi.height must
+;//                         be equal to either 4, 8, or 16.
+;// [out]   pDst        Pointer to the destination frame buffer.
+;//                   if roi.width==4,  4-byte alignment required
+;//                   if roi.width==8,  8-byte alignment required
+;//                   if roi.width==16, 16-byte alignment required
+;//
+;// Return Value:
+;// If the function runs without error, it returns OMX_Sts_NoErr.
+;// It is assued that following cases are satisfied before calling this function:
+;//  pSrc or pDst is not NULL.
+;//  srcStep or dstStep >= roi.width.
+;//     dx or dy is in the range [0-3].
+;//     roi.width or roi.height is not out of range {4, 8, 16}.
+;//     If roi.width is equal to 4, Dst is 4 byte aligned.
+;//     If roi.width is equal to 8, pDst is 8 byte aligned.
+;//     If roi.width is equal to 16, pDst is 16 byte aligned.
+;//     srcStep and dstStep is multiple of 8.
+;//
+;//
+
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        M_VARIANTS CortexA8
+
+        EXPORT omxVCM4P10_InterpolateLuma
+        
+
+    IF CortexA8
+        IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+        IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+        IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+        IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+    ENDIF
+    
+    
+
+;// Declare input registers
+pSrc            RN 0
+srcStep         RN 1
+pDst            RN 2
+dstStep         RN 3
+iHeight         RN 4
+iWidth          RN 5
+
+;// Declare other intermediate registers
+idx             RN 6
+idy             RN 7
+index           RN 6
+Temp            RN 12
+pArgs           RN 11
+
+
+    IF CortexA8
+
+        ;//
+        ;// Interpolation of luma is implemented by processing block of pixels, size 4x4 at a time.
+        ;//
+        M_ALLOC4    ppArgs, 16
+        
+        ;// Function header
+        M_START omxVCM4P10_InterpolateLuma, r11, d15
+
+pSrcBK          RN 8
+
+;// Declare Neon registers
+dCoeff5         DN 30.S16
+dCoeff20        DN 31.S16
+
+;// Registers used for implementing Horizontal interpolation
+dSrc0c          DN 14.U8
+dSrc1c          DN 16.U8
+dSrc2c          DN 18.U8
+dSrc3c          DN 20.U8                   
+dSrc0d          DN 15.U8
+dSrc1d          DN 17.U8
+dSrc2d          DN 19.U8
+dSrc3d          DN 21.U8
+dAccH0          DN 22.U8
+dAccH1          DN 24.U8
+dAccH2          DN 26.U8
+dAccH3          DN 28.U8
+dResultH0       DN 22.U32
+dResultH1       DN 24.U32
+dResultH2       DN 26.U32
+dResultH3       DN 28.U32
+
+;// Registers used for implementing Vertical interpolation
+dSrc0           DN 9.U8
+dSrc1           DN 10.U8
+dSrc2           DN 11.U8
+dSrc3           DN 12.U8
+dSrc4           DN 13.U8
+dAccV0          DN 0.U8
+dAccV1          DN 2.U8
+dAccV2          DN 4.U8
+dAccV3          DN 6.U8
+dResultV0       DN 0.U32
+dResultV1       DN 2.U32
+dResultV2       DN 4.U32
+dResultV3       DN 6.U32
+        
+;// Registers used for implementing Diagonal interpolation
+dTAcc0          DN 0.U8
+dTAcc1          DN 2.U8
+dTAcc2          DN 4.U8
+dTAcc3          DN 6.U8
+dTRes0          DN 0.32
+dTRes1          DN 2.32
+dTRes2          DN 4.32
+dTRes3          DN 6.32
+dTResult0       DN 14.U8
+dTResult1       DN 16.U8
+dTResult2       DN 18.U8
+dTResult3       DN 20.U8
+dTempP0         DN 18.S16
+dTempP1         DN 19.S16
+dTempQ0         DN 20.S16
+dTempQ1         DN 21.S16
+dTempR0         DN 22.S16
+dTempR1         DN 23.S16
+dTempS0         DN 24.S16
+dTempS1         DN 25.S16
+qTempP01        QN 9.S16
+qTempQ01        QN 10.S16
+qTempR01        QN 11.S16
+qTempS01        QN 12.S16
+
+;// Intermediate values for averaging
+qRes2           QN 7.S16
+qRes3           QN 8.S16
+qRes4           QN 9.S16
+qRes5           QN 10.S16
+qRes6           QN 11.S16
+       
+;// For implementing copy
+dDst0            DN 9.32
+dDst1            DN 10.32
+dDst2            DN 11.32
+dDst3            DN 12.32
+
+        ;// Define stack arguments
+        M_ARG       ptridx, 4
+        M_ARG       ptridy, 4        
+        M_ARG       ptrWidth, 4
+        M_ARG       ptrHeight, 4        
+
+        ;// Load structure elements of roi 
+        M_LDR       idx, ptridx
+        M_LDR       idy, ptridy
+        M_LDR       iWidth, ptrWidth
+        M_LDR       iHeight, ptrHeight
+        
+        ADD         index, idx, idy, LSL #2                 ;//  [index] = [idy][idx]
+        M_ADR       pArgs, ppArgs
+                    
+        ;// Move coefficients Neon registers
+        VMOV        dCoeff20, #20
+        VMOV        dCoeff5, #5
+                                        
+Block4x4WidthLoop
+Block4x4HeightLoop
+
+        STM         pArgs, {pSrc,srcStep,pDst,dstStep} 
+                                                            
+        ;// switch table using motion vector as index
+        ADD         pc, pc, index, LSL #2
+        B           Case_f
+        B           Case_0        
+        B           Case_1        
+        B           Case_2        
+        B           Case_3        
+        B           Case_4        
+        B           Case_5        
+        B           Case_6        
+        B           Case_7        
+        B           Case_8        
+        B           Case_9        
+        B           Case_a        
+        B           Case_b        
+        B           Case_c        
+        B           Case_d
+        B           Case_e        
+        B           Case_f
+                    
+Case_0                
+        ;// Case G
+        M_PRINTF "Case 0 \n"
+        
+        ;// Loads a 4x4 block of .8 and stores as .32
+        ADD         Temp, pSrc, srcStep, LSL #1 
+        VLD1        dSrc0, [pSrc], srcStep
+        VLD1        dSrc2, [Temp], srcStep
+        VLD1        dSrc1, [pSrc]
+        VLD1        dSrc3, [Temp]
+        
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dDst0[0], [pDst], dstStep
+        VST1        dDst2[0], [Temp], dstStep
+        VST1        dDst1[0], [pDst]
+        VST1        dDst3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B           Block4x4LoopEnd
+Case_1
+        ;// Case a
+        M_PRINTF "Case 1 \n"
+
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe        
+        VRHADD      dAccH0, dAccH0, dSrc0c
+        VRHADD      dAccH2, dAccH2, dSrc2c
+        VRHADD      dAccH1, dAccH1, dSrc1c
+        VRHADD      dAccH3, dAccH3, dSrc3c
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultH0[0], [pDst], dstStep
+        VST1        dResultH2[0], [Temp], dstStep
+        VST1        dResultH1[0], [pDst]
+        VST1        dResultH3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B           Block4x4LoopEnd        
+Case_2
+        ;// Case b
+        M_PRINTF "Case 2 \n"
+
+        SUB         pSrc, pSrc, #2        
+        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultH0[0], [pDst], dstStep
+        VST1        dResultH2[0], [Temp], dstStep
+        VST1        dResultH1[0], [pDst]
+        VST1        dResultH3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B           Block4x4LoopEnd        
+Case_3
+        ;// Case c
+        M_PRINTF "Case 3 \n"
+
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe        
+        VRHADD      dAccH0, dAccH0, dSrc0d
+        VRHADD      dAccH2, dAccH2, dSrc2d
+        VRHADD      dAccH1, dAccH1, dSrc1d
+        VRHADD      dAccH3, dAccH3, dSrc3d
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultH0[0], [pDst], dstStep
+        VST1        dResultH2[0], [Temp], dstStep
+        VST1        dResultH1[0], [pDst]
+        VST1        dResultH3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B           Block4x4LoopEnd        
+Case_4
+        ;// Case d
+        M_PRINTF "Case 4 \n"
+
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe        
+        VRHADD      dAccV0, dAccV0, dSrc0
+        VRHADD      dAccV2, dAccV2, dSrc2
+        VRHADD      dAccV1, dAccV1, dSrc1
+        VRHADD      dAccV3, dAccV3, dSrc3
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultV0[0], [pDst], dstStep
+        VST1        dResultV2[0], [Temp], dstStep
+        VST1        dResultV1[0], [pDst]
+        VST1        dResultV3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B           Block4x4LoopEnd        
+Case_5
+        ;// Case e
+        M_PRINTF "Case 5 \n"
+        
+        MOV         pSrcBK, pSrc
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe        
+        SUB         pSrc, pSrcBK, #2
+        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe        
+        VRHADD      dAccH0, dAccH0, dAccV0
+        VRHADD      dAccH2, dAccH2, dAccV2
+        VRHADD      dAccH1, dAccH1, dAccV1
+        VRHADD      dAccH3, dAccH3, dAccV3        
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultH0[0], [pDst], dstStep
+        VST1        dResultH2[0], [Temp], dstStep
+        VST1        dResultH1[0], [pDst]
+        VST1        dResultH3[0], [Temp]
+        
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd        
+Case_6
+        ;// Case f
+        M_PRINTF "Case 6 \n"
+
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+        VQRSHRUN    dTResult0, qRes2, #5        
+        VQRSHRUN    dTResult1, qRes3, #5        
+        VQRSHRUN    dTResult2, qRes4, #5        
+        VQRSHRUN    dTResult3, qRes5, #5        
+        VRHADD      dTAcc0, dTAcc0, dTResult0
+        VRHADD      dTAcc2, dTAcc2, dTResult2
+        VRHADD      dTAcc1, dTAcc1, dTResult1
+        VRHADD      dTAcc3, dTAcc3, dTResult3
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dTRes0[0], [pDst], dstStep
+        VST1        dTRes2[0], [Temp], dstStep
+        VST1        dTRes1[0], [pDst]
+        VST1        dTRes3[0], [Temp]
+        
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd        
+Case_7
+        ;// Case g
+        M_PRINTF "Case 7 \n"
+        MOV         pSrcBK, pSrc
+        ADD         pSrc, pSrc, #1
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe        
+        SUB         pSrc, pSrcBK, #2
+        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe        
+        VRHADD      dAccH0, dAccH0, dAccV0
+        VRHADD      dAccH2, dAccH2, dAccV2
+        VRHADD      dAccH1, dAccH1, dAccV1
+        VRHADD      dAccH3, dAccH3, dAccV3        
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultH0[0], [pDst], dstStep
+        VST1        dResultH2[0], [Temp], dstStep
+        VST1        dResultH1[0], [pDst]
+        VST1        dResultH3[0], [Temp]
+        
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd
+Case_8
+        ;// Case h
+        M_PRINTF "Case 8 \n"
+
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe        
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultV0[0], [pDst], dstStep
+        VST1        dResultV2[0], [Temp], dstStep
+        VST1        dResultV1[0], [pDst]
+        VST1        dResultV3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B           Block4x4LoopEnd
+Case_9
+        ;// Case i
+        M_PRINTF "Case 9 \n"
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+        VEXT        dTempP0, dTempP0, dTempP1, #2
+        VEXT        dTempQ0, dTempQ0, dTempQ1, #2
+        VEXT        dTempR0, dTempR0, dTempR1, #2
+        VEXT        dTempS0, dTempS0, dTempS1, #2
+        
+        VQRSHRUN    dTResult0, qTempP01, #5        
+        VQRSHRUN    dTResult1, qTempQ01, #5        
+        VQRSHRUN    dTResult2, qTempR01, #5        
+        VQRSHRUN    dTResult3, qTempS01, #5        
+
+        VRHADD      dTAcc0, dTAcc0, dTResult0
+        VRHADD      dTAcc2, dTAcc2, dTResult2
+        VRHADD      dTAcc1, dTAcc1, dTResult1
+        VRHADD      dTAcc3, dTAcc3, dTResult3
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dTRes0[0], [pDst], dstStep
+        VST1        dTRes2[0], [Temp], dstStep
+        VST1        dTRes1[0], [pDst]
+        VST1        dTRes3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd
+Case_a
+        ;// Case j
+        M_PRINTF "Case a \n"
+
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dTRes0[0], [pDst], dstStep
+        VST1        dTRes2[0], [Temp], dstStep
+        VST1        dTRes1[0], [pDst]
+        VST1        dTRes3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd
+Case_b
+        ;// Case k
+        M_PRINTF "Case b \n"
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+        VEXT        dTempP0, dTempP0, dTempP1, #3
+        VEXT        dTempQ0, dTempQ0, dTempQ1, #3
+        VEXT        dTempR0, dTempR0, dTempR1, #3
+        VEXT        dTempS0, dTempS0, dTempS1, #3
+        
+        VQRSHRUN    dTResult0, qTempP01, #5        
+        VQRSHRUN    dTResult1, qTempQ01, #5        
+        VQRSHRUN    dTResult2, qTempR01, #5        
+        VQRSHRUN    dTResult3, qTempS01, #5        
+
+        VRHADD      dTAcc0, dTAcc0, dTResult0
+        VRHADD      dTAcc2, dTAcc2, dTResult2
+        VRHADD      dTAcc1, dTAcc1, dTResult1
+        VRHADD      dTAcc3, dTAcc3, dTResult3
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dTRes0[0], [pDst], dstStep
+        VST1        dTRes2[0], [Temp], dstStep
+        VST1        dTRes1[0], [pDst]
+        VST1        dTRes3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd
+Case_c
+        ;// Case n
+        M_PRINTF "Case c \n"
+
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe        
+        VRHADD      dAccV0, dAccV0, dSrc1
+        VRHADD      dAccV2, dAccV2, dSrc3
+        VRHADD      dAccV1, dAccV1, dSrc2
+        VRHADD      dAccV3, dAccV3, dSrc4
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultV0[0], [pDst], dstStep
+        VST1        dResultV2[0], [Temp], dstStep
+        VST1        dResultV1[0], [pDst]
+        VST1        dResultV3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B           Block4x4LoopEnd
+Case_d
+        ;// Case p
+        M_PRINTF "Case d \n"
+        
+        MOV         pSrcBK, pSrc
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe        
+        ADD         pSrc, pSrcBK, srcStep
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe        
+        VRHADD      dAccH0, dAccH0, dAccV0
+        VRHADD      dAccH2, dAccH2, dAccV2
+        VRHADD      dAccH1, dAccH1, dAccV1
+        VRHADD      dAccH3, dAccH3, dAccV3        
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultH0[0], [pDst], dstStep
+        VST1        dResultH2[0], [Temp], dstStep
+        VST1        dResultH1[0], [pDst]
+        VST1        dResultH3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd
+Case_e
+        ;// Case q
+        M_PRINTF "Case e \n"
+        
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+        VQRSHRUN    dTResult0, qRes3, #5        
+        VQRSHRUN    dTResult1, qRes4, #5        
+        VQRSHRUN    dTResult2, qRes5, #5        
+        VQRSHRUN    dTResult3, qRes6, #5        
+
+        VRHADD      dTAcc0, dTAcc0, dTResult0
+        VRHADD      dTAcc2, dTAcc2, dTResult2
+        VRHADD      dTAcc1, dTAcc1, dTResult1
+        VRHADD      dTAcc3, dTAcc3, dTResult3
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dTRes0[0], [pDst], dstStep
+        VST1        dTRes2[0], [Temp], dstStep
+        VST1        dTRes1[0], [pDst]
+        VST1        dTRes3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+        B       Block4x4LoopEnd
+Case_f
+        ;// Case r
+        M_PRINTF "Case f \n"
+        MOV         pSrcBK, pSrc
+        ADD         pSrc, pSrc, #1
+        SUB         pSrc, pSrc, srcStep, LSL #1
+        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe        
+        ADD         pSrc, pSrcBK, srcStep
+        SUB         pSrc, pSrc, #2
+        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe        
+        VRHADD      dAccH0, dAccH0, dAccV0
+        VRHADD      dAccH2, dAccH2, dAccV2
+        VRHADD      dAccH1, dAccH1, dAccV1
+        VRHADD      dAccH3, dAccH3, dAccV3        
+        ADD         Temp, pDst, dstStep, LSL #1 
+        VST1        dResultH0[0], [pDst], dstStep
+        VST1        dResultH2[0], [Temp], dstStep
+        VST1        dResultH1[0], [pDst]
+        VST1        dResultH3[0], [Temp]
+        M_ADR       pArgs, ppArgs
+
+
+Block4x4LoopEnd
+
+        ;// Width Loop
+        ;//M_ADR       pArgs, ppArgs
+        LDM         pArgs, {pSrc,srcStep,pDst,dstStep}  ;// Load arguments
+        SUBS        iWidth, iWidth, #4
+        ADD         pSrc, pSrc, #4      
+        ADD         pDst, pDst, #4
+        BGT         Block4x4WidthLoop
+                    
+        ;// Height Loop
+        SUBS        iHeight, iHeight, #4
+        M_LDR       iWidth, ptrWidth
+        M_ADR       pArgs, ppArgs
+        ADD         pSrc, pSrc, srcStep, LSL #2      
+        ADD         pDst, pDst, dstStep, LSL #2
+        SUB         pSrc, pSrc, iWidth
+        SUB         pDst, pDst, iWidth
+        BGT         Block4x4HeightLoop
+
+EndOfInterpolation
+        MOV         r0, #0
+        M_END       
+
+    ENDIF  
+        ;// End of CortexA8
+                    
+    END
+    
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
new file mode 100755
index 0000000..3a60705
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
@@ -0,0 +1,436 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+  
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        EXPORT armVCM4P10_pIndexTable8x8
+        
+;// Define the processor variants supported by this file
+         
+         M_VARIANTS CortexA8
+     
+     AREA table, DATA    
+;//-------------------------------------------------------
+;// This table for implementing switch case of C in asm by
+;// the mehtod of two levels of indexing.
+;//-------------------------------------------------------
+
+    M_TABLE armVCM4P10_pIndexTable8x8
+    DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR 
+    DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE  
+    
+    M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
+    DCW   3, 2, 1,4 
+    DCW  -3,-2,-1,0
+    DCW   1, 2, 3,4
+    
+        
+        
+    IF CortexA8
+
+;//--------------------------------------------
+;// Scratch variable
+;//--------------------------------------------
+ 
+pc              RN 15   
+return          RN 0  
+pTable          RN 8 
+  
+;//--------------------------------------------
+;// Input Arguments
+;//--------------------------------------------
+pSrcLeft        RN 0    ;// input pointer
+pSrcAbove       RN 1    ;// input pointer
+pSrcAboveLeft   RN 2    ;// input pointer
+pDst            RN 3    ;// output pointer
+leftStep        RN 4    ;// input variable
+dstStep         RN 5    ;// input variable
+predMode        RN 6    ;// input variable
+availability    RN 7    ;// input variable
+pMultiplierTable    RN  2     
+
+pTmp            RN 9
+step            RN 10
+
+;//---------------------
+;// Neon Registers
+;//---------------------
+
+;// OMX_VC_CHROMA_HOR
+
+dLeftVal0       DN  D0.8
+dLeftVal1       DN  D1.8
+dLeftVal2       DN  D2.8
+dLeftVal3       DN  D3.8
+dLeftVal4       DN  D4.8
+dLeftVal5       DN  D5.8
+dLeftVal6       DN  D6.8
+dLeftVal7       DN  D7.8
+
+;// OMX_VC_CHROMA_VERT
+
+dAboveVal       DN  D0.U8
+
+;// OMX_VC_CHROMA_DC
+
+dLeftVal        DN  D1.U8
+dSumAboveValU16 DN  D2.U16
+dSumAboveValU32 DN  D3.U32
+dSumAboveValU8  DN  D3.U8
+dSumLeftValU16  DN  D2.U16
+dSumLeftValU32  DN  D1.U32
+dSumLeftValU8   DN  D1.U8
+dSumAboveLeft   DN  D2.U32
+dSumAboveLeftU8 DN  D2.U8
+dIndexRow0U8    DN  D5.U8
+dIndexRow0      DN  D5.U64
+dIndexRow4U8    DN  D6.U8
+dIndexRow4      DN  D6.U64
+dDstRow0        DN  D0.U8
+dDstRow4        DN  D4.U8
+dConst128U8     DN  D0.U8
+
+;// OMX_VC_CHROMA_PLANE
+
+dRevAboveVal    DN  D3.U8  
+dRevAboveValU64 DN  D3.U64  
+dAboveLeftVal   DN  D2.U8
+qAbove7minus0   QN  Q3.S16 
+qAboveDiff      QN  Q2.S16 
+dIndex          DN  D8.U8  
+dDiffAboveU8    DN  D9.U8  
+dDiffAboveS16   DN  D9.S16 
+dAboveDiff0U8   DN  D4.U8  
+dAboveDiff0U64  DN  D4.U64
+dAbove7minus0U8 DN  D6.U8  
+dMultiplier     DN  D10.S16 
+dHorPred        DN  D11.S16 
+dRevLeftVal     DN  D3.U8
+dRevLeftValU64  DN  D3.U64
+qLeft7minus0    QN  Q7.S16
+qLeftDiff       QN  Q6.S16
+dDiffLeftU8     DN  D16.U8
+dDiffLeftS16    DN  D16.S16
+dLeftDiff0U8    DN  D12.U8
+dLeftDiff0U64   DN  D12.U64
+dLeft7minus0U8  DN  D14.U8
+dVerPred        DN  D3.S16 
+dHVValS16       DN  D3.S16
+dHVValS32       DN  D3.S32
+dHVTempS32      DN  D2.S32
+qA              QN  Q0.S16
+qB              QN  Q2.S16
+qC              QN  Q3.S16
+qMultiplier     QN  Q5.S16
+dMultiplier0    DN  D10.S16
+dMultiplier1    DN  D11.S16
+qC0             QN  Q0.S16
+qC1             QN  Q1.S16
+qC2             QN  Q4.S16
+qC3             QN  Q5.S16
+qC4             QN  Q6.S16
+qC5             QN  Q7.S16
+qC6             QN  Q8.S16
+qC7             QN  Q9.S16
+qSum0           QN  Q0.S16
+qSum1           QN  Q1.S16
+qSum2           QN  Q4.S16
+qSum3           QN  Q5.S16
+qSum4           QN  Q6.S16
+qSum5           QN  Q7.S16
+qSum6           QN  Q8.S16
+qSum7           QN  Q9.S16
+dSum0           DN  D0.U8
+dSum1           DN  D1.U8
+dSum2           DN  D2.U8
+dSum3           DN  D3.U8
+dSum4           DN  D4.U8
+dSum5           DN  D5.U8
+dSum6           DN  D6.U8
+dSum7           DN  D7.U8
+
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntraChroma_8x8 starts
+;//-----------------------------------------------------------------------------------------------
+        
+        ;// Write function header
+        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15
+        
+        ;// Define stack arguments
+        M_ARG    LeftStep,     4
+        M_ARG    DstStep,      4
+        M_ARG    PredMode,     4
+        M_ARG    Availability, 4
+        
+        LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case
+        
+        ;// Load argument from the stack
+        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg 
+        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg 
+        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg         
+        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg 
+        
+        
+        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on preMode
+
+OMX_VC_CHROMA_DC
+        
+        TST     availability, #OMX_VC_LEFT
+        BEQ     DCChroma8x8LeftNotAvailable
+
+        ADD     pTmp, pSrcLeft, leftStep
+        ADD     step, leftStep, leftStep
+
+        ;// Load Left Edge
+        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
+        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
+        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
+        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
+        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
+        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
+        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]      
+        
+        TST     availability, #OMX_VC_UPPER
+        BEQ     DCChroma8x8LeftOnlyAvailable
+
+        ;// Load Upper Edge also
+        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]  
+        
+        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
+        
+        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]             
+        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ] 
+                
+        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]             
+        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]             
+        
+        VADD     dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
+        VRSHR    dSumAboveLeft,dSumAboveLeft,#3             ;// Sum = (Sum + 4) >> 3
+        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
+        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
+        
+        VMOV     dIndexRow0U8,#0x0c                         
+        VMOV     dIndexRow4U8,#0x04
+        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index0 = 0x0c0c0c0c00000000 
+        VSHR     dIndexRow4,dIndexRow4,#32                  ;// index4 = 0x0000000004040404
+        VADD     dIndexRow4U8,dIndexRow4U8,dIndexRow0U8     ;// index4 = 0x0c0c0c0c04040404
+        VTBL     dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8
+        VTBL     dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8
+ 
+DCChroma8x8LeftStore       
+        ADD     pTmp, pDst, dstStep
+        ADD     step, dstStep, dstStep
+        
+        VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
+        VST1     dDstRow4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
+        VST1     dDstRow4,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
+        VST1     dDstRow4,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
+        VST1     dDstRow4,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
+
+        M_EXIT
+        
+
+DCChroma8x8LeftOnlyAvailable
+
+        MOV      return, #OMX_Sts_NoErr
+        
+        VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]             
+        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]   
+        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
+        
+        VDUP     dDstRow0,dSumLeftValU8[0]
+        VDUP     dDstRow4,dSumLeftValU8[4]
+        
+        B        DCChroma8x8LeftStore  
+        
+
+DCChroma8x8LeftNotAvailable
+                 
+        TST     availability, #OMX_VC_UPPER
+        BEQ     DCChroma8x8NoneAvailable
+
+        ;// Load Upper Edge 
+        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]  
+        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
+        
+        VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]             
+        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ] 
+        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
+        VMOV     dIndexRow0U8,#0x04
+        VSHL     dIndexRow0,dIndexRow0,#32                  ;// index = 0x0404040400000000
+        VTBL     dDstRow0,{dSumAboveValU8},dIndexRow0U8 
+        
+        B        DCChroma8x8UpperStore
+        
+
+DCChroma8x8NoneAvailable        
+        
+        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
+        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
+
+DCChroma8x8UpperStore        
+        
+        ADD     pTmp, pDst, dstStep
+        ADD     step, dstStep, dstStep
+        
+        VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
+        VST1     dDstRow0,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
+        
+        M_EXIT
+
+
+OMX_VC_CHROMA_VERT
+        
+        VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[x]      :0<= x <= 7   
+        MOV      return, #OMX_Sts_NoErr
+        
+        B        DCChroma8x8UpperStore
+        
+
+OMX_VC_CHROMA_HOR
+        
+        ADD     pTmp, pSrcLeft, leftStep
+        ADD     step, leftStep, leftStep
+        
+        VLD1    {dLeftVal0[]},[pSrcLeft],step           ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeftVal1[]},[pTmp],step               ;// pSrcLeft[1*leftStep]
+        VLD1    {dLeftVal2[]},[pSrcLeft],step           ;// pSrcLeft[2*leftStep]
+        VLD1    {dLeftVal3[]},[pTmp],step               ;// pSrcLeft[3*leftStep]
+        VLD1    {dLeftVal4[]},[pSrcLeft],step           ;// pSrcLeft[4*leftStep]
+        VLD1    {dLeftVal5[]},[pTmp],step               ;// pSrcLeft[5*leftStep]
+        VLD1    {dLeftVal6[]},[pSrcLeft],step           ;// pSrcLeft[6*leftStep]
+        VLD1    {dLeftVal7[]},[pTmp]                    ;// pSrcLeft[7*leftStep]
+        
+        B        DCChroma8x8PlaneStore
+        
+        
+OMX_VC_CHROMA_PLANE
+        ADD     pTmp, pSrcLeft, leftStep
+        ADD     step, leftStep, leftStep
+        
+        VLD1    dAboveVal,[pSrcAbove]                       ;// pSrcAbove[x]      :0<= x <= 7   
+        VLD1    dAboveLeftVal[0],[pSrcAboveLeft]
+        
+        VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
+        VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
+        VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
+        VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
+        VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
+        VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
+        VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep] 
+        
+        
+        VREV64  dRevAboveVal,dAboveVal                      ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
+        VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
+        VSHR    dRevAboveValU64,dRevAboveValU64,#8          ;// pSrcAbove[X:0:1:2:3:4:5:6]
+        VSUBL   qAboveDiff,dRevAboveVal,dAboveVal           ;// pSrcAbove[6] - pSrcAbove[0]
+                                                            ;// pSrcAbove[5] - pSrcAbove[1]
+                                                            ;// pSrcAbove[4] - pSrcAbove[2]
+        
+        VREV64  dRevLeftVal,dLeftVal                        ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
+        VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qAbove7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
+        VSHR    dRevLeftValU64,dRevLeftValU64,#8            ;// pSrcLeft[X:0:1:2:3:4:5:6]
+        VSUBL   qLeftDiff,dRevLeftVal,dLeftVal              ;// pSrcLeft[6] - pSrcLeft[0]
+                                                            ;// pSrcLeft[5] - pSrcLeft[1]
+                                                            ;// pSrcLeft[4] - pSrcLeft[2]
+        
+        LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval  
+        VSHL    dAboveDiff0U64,dAboveDiff0U64,#16  
+        VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2           ;// pSrcAbove[ 7-0 | 4-2 | 5-1 | 6-0 ]
+        VLD1    dMultiplier,[pMultiplierTable]! 
+        VSHL    dLeftDiff0U64,dLeftDiff0U64,#16  
+        VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2              ;// pSrcLeft[ 7-0 | 4-2 | 5-1 | 6-0 ]                                                   
+                                                                    
+        
+        VMUL    dHorPred,dDiffAboveS16,dMultiplier                      ;// pSrcAbove[ 4*(7-0) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
+        VMUL    dVerPred,dDiffLeftS16,dMultiplier
+        VPADD   dHVValS16,dHorPred,dVerPred
+        
+        
+        VPADDL  dHVValS32,dHVValS16                                     ;// [V|H] in 32 bits each
+        VSHL    dHVTempS32,dHVValS32,#4                                 ;// 17*H = 16*H + H = (H<<4)+H
+        VADD    dHVValS32,dHVValS32,dHVTempS32                          ;// [ 17*V  | 17*H ]in 32 bits each
+        VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]          ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]  
+        VRSHR   dHVValS32,dHVValS32,#5                                  ;// [c|b] in 16bits each
+        VADDL   qA,dAboveVal,dLeftVal
+        VDUP    qA,qA[7]
+        VSHL    qA,qA,#4                                                ;// [a|a|a|a|a|a|a|a]
+        VDUP    qB,dHVValS16[0]                                         ;// [b|b|b|b|b|b|b|b]
+        VDUP    qC,dHVValS16[2]                                         ;// [c|c|c|c|c|c|c|c]
+        
+        
+        VMUL    qB,qB,qMultiplier
+        VMUL    qC,qC,qMultiplier
+        VADD    qB,qB,qA 
+        
+        VDUP    qC0,qC[0]
+        VDUP    qC1,qC[1]
+        VDUP    qC2,qC[2]
+        VDUP    qC3,qC[3]
+        VDUP    qC4,qC[4]
+        VDUP    qC5,qC[5]
+        VDUP    qC6,qC[6]
+        VDUP    qC7,qC[7]
+        
+        VADD    qSum0,qB,qC0
+        VADD    qSum1,qB,qC1
+        VADD    qSum2,qB,qC2
+        VADD    qSum3,qB,qC3
+        VADD    qSum4,qB,qC4
+        VADD    qSum5,qB,qC5
+        VADD    qSum6,qB,qC6
+        VADD    qSum7,qB,qC7
+        
+        VQRSHRUN dSum0,qSum0,#5                         ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
+        VQRSHRUN dSum1,qSum1,#5
+        VQRSHRUN dSum2,qSum2,#5
+        VQRSHRUN dSum3,qSum3,#5
+        VQRSHRUN dSum4,qSum4,#5
+        VQRSHRUN dSum5,qSum5,#5
+        VQRSHRUN dSum6,qSum6,#5
+        VQRSHRUN dSum7,qSum7,#5      
+
+DCChroma8x8PlaneStore        
+        ADD     pTmp, pDst, dstStep
+        ADD     step, dstStep, dstStep
+        
+        VST1    dSum0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
+        VST1    dSum1,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
+        VST1    dSum2,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
+        VST1    dSum3,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
+        VST1    dSum4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
+        VST1    dSum5,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
+        VST1    dSum6,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
+        VST1    dSum7,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7       
+        
+        MOV     return, #OMX_Sts_NoErr
+        M_END
+        
+        ENDIF ;// CortexA8
+        
+        END
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntraChroma_8x8 ends
+;//-----------------------------------------------------------------------------------------------
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
new file mode 100755
index 0000000..e9c0eee
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
@@ -0,0 +1,424 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        M_VARIANTS CortexA8
+     
+  
+;//-------------------------------------------------------
+;// This table for implementing switch case of C in asm by
+;// the mehtod of two levels of indexing.
+;//-------------------------------------------------------
+
+    M_TABLE armVCM4P10_pIndexTable16x16
+    DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR 
+    DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE
+    
+
+    IF CortexA8
+
+    M_TABLE armVCM4P10_MultiplierTable16x16,1
+    DCW   7,  6,  5,  4,  3,  2,  1,  8 
+    DCW   0,  1,  2,  3,  4,  5,  6,  7
+    DCW   8,  9, 10, 11, 12, 13, 14, 15
+        
+;//--------------------------------------------
+;// Constants 
+;//--------------------------------------------  
+BLK_SIZE        EQU 0x10
+MUL_CONST0      EQU 0x01010101
+MUL_CONST1      EQU 0x00060004
+MUL_CONST2      EQU 0x00070005
+MUL_CONST3      EQU 0x00030001
+MASK_CONST      EQU 0x00FF00FF
+
+;//--------------------------------------------
+;// Scratch variable
+;//--------------------------------------------
+y               RN 12   
+pc              RN 15   
+
+return          RN 0    
+pTable          RN 9    
+count           RN 11   
+pMultTable      RN 9
+; ----------------------------------------------
+; Neon registers
+; ----------------------------------------------
+qAbove          QN Q0.U8
+qLeft           QN Q1.U8
+qSum8           QN Q0.U16
+dSum80          DN D0.U16
+dSum81          DN D1.U16
+dSum4           DN D0.U16
+dSum2           DN D0.U32
+dSum1           DN D0.U64
+qOut            QN Q3.U8
+dSumLeft        DN D6.U64
+dSumAbove       DN D7.U64
+dSum            DN D8.U64
+dSum0           DN D8.U8[0]
+
+qH              QN Q11.S32
+qV              QN Q12.S32
+qA              QN Q11.S16
+qB              QN Q6.S16
+qC              QN Q7.S16
+
+qB0             QN Q5.S16
+qB1             QN Q6.S16
+dA1             DN D23.S16
+
+dH0             DN D22.S32
+dH1             DN D23.S32
+dV0             DN D24.S32
+dV1             DN D25.S32
+
+qHV             QN Q11.S64
+qHV0            QN Q11.S32
+qHV1            QN Q12.S64
+
+dHV00           DN D22.S32
+dHV01           DN D23.S32
+
+dHV0            DN D22.S16[0]
+dHV1            DN D23.S16[0]
+dHV10           DN D24.S64
+dHV11           DN D25.S64
+
+qSum0           QN Q0.S16
+qSum1           QN Q1.S16
+
+dOut0           DN D6.U8
+dOut1           DN D7.U8
+
+dLeft0          DN D2.U8
+dLeft1          DN D3.U8
+qConst          QN Q13.S16
+
+dAbove0         DN D0.U8
+dAbove1         DN D1.U8
+
+dRevLeft64      DN D12.U64
+dRevLeft        DN D12.U8
+dRevAbove64     DN D5.U64
+dRevAbove       DN D5.U8
+qLeftDiff       QN Q8.S16
+dLeftDiff1      DN D17.S16
+dLeftDiff64     DN D17.S64
+qDiffLeft       QN Q8.S16
+qDiffAbove      QN Q4.S16
+dAboveDiff1     DN D9.S16
+dAboveDiff64    DN D9.S64
+qAboveDiff      QN Q4.S16
+
+dAboveLeft      DN D4.U8
+
+dDiffLeft0      DN D16.S16
+dDiffLeft1      DN D17.S16
+dDiffAbove0     DN D8.S16
+dDiffAbove1     DN D9.S16
+
+qLeft15minus0   QN Q7.S16
+dLeft15minus0   DN D14.S16
+qAbove15minus0  QN Q3.S16
+dAbove15minus0  DN D6.S16
+
+qMultiplier     QN Q10.S16
+qMultiplier0    QN Q10.S16
+qMultiplier1    QN Q12.S16
+dMultiplier0    DN D20.S16
+dMultiplier1    DN D21.S16
+
+dBPlusCMult7    DN D1.S64
+dBPlusCMult7S16 DN D1.S16
+
+qTmp            QN Q0.U8
+
+;//--------------------------------------------
+;// Declare input registers
+;//--------------------------------------------
+pSrcLeft        RN 0    ;// input pointer
+pSrcAbove       RN 1    ;// input pointer
+pSrcAboveLeft   RN 2    ;// input pointer
+pDst            RN 3    ;// output pointer
+leftStep        RN 4    ;// input variable
+dstStep         RN 5    ;// input variable
+predMode        RN 6    ;// input variable
+availability    RN 7    ;// input variable
+
+pTmp            RN 8
+step            RN 10
+pTmp2           RN 11
+
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_16x16 starts
+;//-----------------------------------------------------------------------------------------------
+        
+        ;// Write function header
+        M_START omxVCM4P10_PredictIntra_16x16, r11, d15
+        
+        ;// Define stack arguments
+        M_ARG    LeftStep,     4
+        M_ARG    DstStep,      4
+        M_ARG    PredMode,     4
+        M_ARG    Availability, 4
+        
+        ;// M_STALL ARM1136JS=4
+        
+        LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
+        
+        ;// Load argument from the stack
+        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg 
+        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg 
+        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg         
+        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
+        
+        MOV      y, #BLK_SIZE                        ;// Outer Loop Count
+        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on preMode
+        
+OMX_VC_16X16_VERT
+        VLD1    qAbove,  [pSrcAbove]
+        ADD     pTmp, pDst, dstStep
+        ADD     step, dstStep, dstStep
+        VST1    qAbove, [pDst], step
+        VST1    qAbove, [pTmp], step
+        VST1    qAbove, [pDst], step
+        VST1    qAbove, [pTmp], step
+        VST1    qAbove, [pDst], step
+        VST1    qAbove, [pTmp], step
+        VST1    qAbove, [pDst], step
+        VST1    qAbove, [pTmp], step
+        VST1    qAbove, [pDst], step
+        VST1    qAbove, [pTmp], step
+        VST1    qAbove, [pDst], step
+        VST1    qAbove, [pTmp], step
+        VST1    qAbove, [pDst], step
+        VST1    qAbove, [pTmp], step
+        VST1    qAbove, [pDst]
+        VST1    qAbove, [pTmp]
+        MOV     return, #OMX_Sts_NoErr               ;// returnNoError
+        M_EXIT
+        
+OMX_VC_16X16_HOR
+        ADD     pTmp, pSrcLeft, leftStep
+        ADD     leftStep, leftStep, leftStep
+        ADD     pTmp2, pDst, dstStep
+        ADD     dstStep, dstStep, dstStep
+LoopHor 
+        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
+        VLD1     {qTmp[]}, [pTmp], leftStep       
+        SUBS     y, y, #8
+        VST1     qLeft, [pDst], dstStep
+        VST1     qTmp, [pTmp2], dstStep
+        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
+        VLD1     {qTmp[]}, [pTmp], leftStep       
+        VST1     qLeft, [pDst], dstStep
+        VST1     qTmp, [pTmp2], dstStep
+        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
+        VLD1     {qTmp[]}, [pTmp], leftStep       
+        VST1     qLeft, [pDst], dstStep
+        VST1     qTmp, [pTmp2], dstStep
+        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
+        VLD1     {qTmp[]}, [pTmp], leftStep       
+        VST1     qLeft, [pDst], dstStep
+        VST1     qTmp, [pTmp2], dstStep
+        
+        BNE      LoopHor                                  ;// Loop for 16 times
+        MOV      return, #OMX_Sts_NoErr
+        M_EXIT
+        
+OMX_VC_16X16_DC
+        MOV      count, #0                                 ;// count = 0
+        TST      availability, #OMX_VC_LEFT
+        BEQ      UpperOrNoneAvailable                      ;// Jump to Upper if not left
+
+        ADD     pTmp, pSrcLeft, leftStep
+        ADD     step, leftStep, leftStep
+
+        VLD1    {qLeft[0]}, [pSrcLeft],step    
+        VLD1    {qLeft[1]}, [pTmp],step   
+        VLD1    {qLeft[2]}, [pSrcLeft],step   
+        VLD1    {qLeft[3]}, [pTmp],step
+        VLD1    {qLeft[4]}, [pSrcLeft],step   
+        VLD1    {qLeft[5]}, [pTmp],step   
+        VLD1    {qLeft[6]}, [pSrcLeft],step    
+        VLD1    {qLeft[7]}, [pTmp],step
+        VLD1    {qLeft[8]}, [pSrcLeft],step    
+        VLD1    {qLeft[9]}, [pTmp],step   
+        VLD1    {qLeft[10]},[pSrcLeft],step   
+        VLD1    {qLeft[11]},[pTmp],step    
+        VLD1    {qLeft[12]},[pSrcLeft],step   
+        VLD1    {qLeft[13]},[pTmp],step   
+        VLD1    {qLeft[14]},[pSrcLeft],step    
+        VLD1    {qLeft[15]},[pTmp] 
+        
+        VPADDL   qSum8, qLeft
+        ADD     count, count, #1    
+        VPADD    dSum4, dSum80, dSum81
+        VPADDL   dSum2, dSum4
+        VPADDL   dSumLeft, dSum2
+        VRSHR    dSum, dSumLeft, #4
+        
+UpperOrNoneAvailable
+        TST      availability,  #OMX_VC_UPPER              ;// if(availability & #OMX_VC_UPPER)
+        BEQ      BothOrNoneAvailable                       ;// Jump to Left if not upper
+        VLD1     qAbove, [pSrcAbove]
+        ADD      count, count, #1                          ;// if upper inc count by 1
+        VPADDL   qSum8, qAbove
+        VPADD    dSum4, dSum80, dSum81
+        VPADDL   dSum2, dSum4
+        VPADDL   dSumAbove, dSum2
+        VRSHR    dSum, dSumAbove, #4
+        
+BothOrNoneAvailable
+        CMP      count, #2                                  ;// check if both available
+        BNE      NoneAvailable
+        VADD     dSum, dSumAbove, dSumLeft
+        VRSHR    dSum, dSum, #5
+        
+
+NoneAvailable
+        VDUP     qOut, dSum0        
+        CMP      count, #0                                  ;// check if none available
+        ADD      pTmp, pDst, dstStep
+        ADD      step, dstStep, dstStep
+        BNE      LoopDC
+        VMOV     qOut, #128
+LoopDC        
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        VST1    qOut, [pDst], step
+        VST1    qOut, [pTmp], step
+        MOV     return, #OMX_Sts_NoErr
+        M_EXIT
+
+OMX_VC_16X16_PLANE
+        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
+        VLD1    qAbove, [pSrcAbove]                         ;// pSrcAbove[x]      :0<= x <= 7    
+        VLD1    dAboveLeft[0],[pSrcAboveLeft]                                               
+        ADD     pTmp, pSrcLeft, leftStep
+        ADD     step, leftStep, leftStep
+        VLD1    {qLeft[0]},  [pSrcLeft],step                                             
+        VLD1    {qLeft[1]},  [pTmp],step      
+        VLD1    {qLeft[2]},  [pSrcLeft],step  
+        VLD1    {qLeft[3]},  [pTmp],step       
+        VLD1    {qLeft[4]},  [pSrcLeft],step  
+        VLD1    {qLeft[5]},  [pTmp],step      
+        VLD1    {qLeft[6]},  [pSrcLeft],step   
+        VLD1    {qLeft[7]},  [pTmp],step
+        VLD1    {qLeft[8]},  [pSrcLeft],step   
+        VLD1    {qLeft[9]},  [pTmp],step      
+        VLD1    {qLeft[10]}, [pSrcLeft],step  
+        VLD1    {qLeft[11]}, [pTmp],step       
+        VLD1    {qLeft[12]}, [pSrcLeft],step  
+        VLD1    {qLeft[13]}, [pTmp],step      
+        VLD1    {qLeft[14]}, [pSrcLeft],step   
+        VLD1    {qLeft[15]}, [pTmp]   
+
+        VREV64  dRevAbove, dAbove1                          ;// pSrcAbove[15:14:13:12:11:10:9:8] 
+        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft       ;// qAbove7minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0] 
+        VSHR    dRevAbove64, dRevAbove64, #8                ;// pSrcAbove[14:13:12:11:10:9:8:X] 
+        VSUBL   qAboveDiff, dRevAbove, dAbove0              
+        
+        VSHL    dAboveDiff64, dAboveDiff64, #16 
+        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1  
+
+        VREV64  dRevLeft,dLeft1                             ;// pSrcLeft[15:14:13:12:11:10:9:8] 
+        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft          ;// qAbove7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0] 
+        VSHR    dRevLeft64, dRevLeft64, #8                  ;// pSrcLeft[14:13:12:11:10:9:8:X] 
+        VSUBL   qLeftDiff,dRevLeft, dLeft0                  
+        
+        ;// Multiplier = [8|1|2|...|6|7]
+        VLD1    qMultiplier, [pMultTable]!                  
+        
+        VSHL    dLeftDiff64, dLeftDiff64, #16
+        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1     
+        
+        VMULL   qH,dDiffAbove0, dMultiplier0                
+        VMULL   qV,dDiffLeft0,  dMultiplier0                
+        VMLAL   qH,dDiffAbove1, dMultiplier1 
+        VMLAL   qV,dDiffLeft1,  dMultiplier1
+        
+        VPADD   dHV00,dH1,dH0                                 
+        VPADD   dHV01,dV1,dV0                                 
+        VPADDL  qHV, qHV0
+        VSHL    qHV1,qHV,#2
+        VADD    qHV,qHV,qHV1 
+        
+        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
+        VRSHR   qHV,qHV,#6
+        
+        ;// HV1 = [c*7|b*7]
+        VSHL    qHV1,qHV,#3
+        VSUB    qHV1,qHV1,qHV                             
+        
+        ;// Multiplier1 = [0|1|2|...|7]
+        VLD1    qMultiplier0, [pMultTable]!    
+        VDUP    qB, dHV0                                  
+        VDUP    qC, dHV1 
+        
+        VADDL   qA,dAbove1,dLeft1
+        VSHL    qA,qA, #4
+        VDUP    qA,dA1[3]  
+        VADD    dBPlusCMult7, dHV10, dHV11
+        
+        ;// Multiplier1 = [8|9|10|...|15]
+        VLD1    qMultiplier1, [pMultTable]
+        ;// Const = a - 7*(b+c)
+        VDUP    qConst, dBPlusCMult7S16[0]
+        VSUB    qConst, qA, qConst
+        
+        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
+        VMUL    qB0,qB,qMultiplier0
+        
+        ;// B0 = [8*b|9*b|10*b|11*b|....|15*b]
+        VMUL    qB1,qB,qMultiplier1
+        
+        VADD    qSum0, qB0, qConst
+        VADD    qSum1, qB1, qConst  
+        
+        ;// Loops for 16 times
+LoopPlane       
+        ;// (b*x + c*y + C)>>5
+        VQRSHRUN dOut0, qSum0,#5
+        VQRSHRUN dOut1, qSum1,#5      
+        SUBS     y, y, #1
+        VST1     qOut,[pDst],dstStep
+        VADD     qSum0,qSum0,qC 
+        VADD     qSum1,qSum1,qC 
+        BNE      LoopPlane
+        
+        MOV      return, #OMX_Sts_NoErr
+
+        M_END
+        
+        ENDIF ;// CortexA8
+            
+        END
+;-----------------------------------------------------------------------------------------------
+; omxVCM4P10_PredictIntra_16x16 ends
+;-----------------------------------------------------------------------------------------------
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
new file mode 100755
index 0000000..39eb8a4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
@@ -0,0 +1,531 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_PredictIntra_4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+;// Define the processor variants supported by this file
+         
+         M_VARIANTS CortexA8
+        
+;//-------------------------------------------------------
+;// This table for implementing switch case of C in asm by
+;// the mehtod of two levels of indexing.
+;//-------------------------------------------------------
+
+    M_TABLE armVCM4P10_pSwitchTable4x4
+    DCD  OMX_VC_4x4_VERT,     OMX_VC_4x4_HOR 
+    DCD  OMX_VC_4x4_DC,       OMX_VC_4x4_DIAG_DL
+    DCD  OMX_VC_4x4_DIAG_DR,  OMX_VC_4x4_VR
+    DCD  OMX_VC_4x4_HD,       OMX_VC_4x4_VL
+    DCD  OMX_VC_4x4_HU   
+    
+        
+        IF CortexA8
+        
+;//--------------------------------------------
+;// Scratch variable
+;//--------------------------------------------
+return          RN 0
+pTable          RN 8
+pc              RN 15
+
+;//--------------------------------------------
+;// Declare input registers
+;//--------------------------------------------
+pSrcLeft        RN 0    ;// input pointer
+pSrcAbove       RN 1    ;// input pointer
+pSrcAboveLeft   RN 2    ;// input pointer
+pDst            RN 3    ;// output pointer
+leftStep        RN 4    ;// input variable
+dstStep         RN 5    ;// input variable
+predMode        RN 6    ;// input variable
+availability    RN 7    ;// input variable
+pDst1           RN 1 
+pDst2           RN 4 
+pDst3           RN 6 
+
+pSrcTmp         RN 9
+srcStep         RN 10
+pDstTmp         RN 11
+dstep           RN 12
+
+;//-------------------
+;// Neon registers
+;//-------------------
+
+;// OMX_VC_CHROMA_VERT
+dAboveU32       DN  D0.U32
+
+;// OMX_VC_CHROMA_HOR
+dLeftVal0       DN  D0.8
+dLeftVal1       DN  D1.8
+dLeftVal2       DN  D2.8
+dLeftVal3       DN  D3.8
+dLeftVal0U32    DN  D0.U32
+dLeftVal1U32    DN  D1.U32
+dLeftVal2U32    DN  D2.U32
+dLeftVal3U32    DN  D3.U32
+
+;// OMX_VC_4x4_DC
+dLeftVal        DN  D0.U8
+dLeftValU32     DN  D0.U32
+dSumAboveLeftU16  DN  D1.U16
+dSumAboveLeftU32  DN  D1.U32
+dSumAboveLeftU64  DN  D1.U64
+dSumAboveLeftU8 DN  D1.U8
+dSum            DN  D0.U8
+
+dSumLeftValU16  DN  D1.U16
+dSumLeftValU32  DN  D1.U32
+dSumLeftValU64  DN  D1.U64
+dSumLeftValU8   DN  D1.U8
+
+dAboveVal       DN  D0.U8
+dSumAboveValU16  DN  D1.U16
+dSumAboveValU32  DN  D1.U32
+dSumAboveValU64  DN  D1.U64
+dSumAboveValU8   DN  D1.U8
+dConst128U8     DN  D0.U8
+
+
+;//OMX_VC_4x4_DIAG_DL
+
+dAbove          DN  D0.U8
+dU7             DN  D2.U8
+dU3             DN  D2.U8
+dAbove0         DN  D3.U8
+dAbove1         DN  D4.U8
+dAbove2         DN  D5.U8
+dTmp            DN  D6.U8
+dTmp0           DN  D7.U8
+dTmp1           DN  D8.U8
+dTmp2            DN  D9.U8
+dTmp3            DN  D10.U8
+dTmpU32         DN  D6.U32
+
+
+;//OMX_VC_4x4_DIAG_DR
+dLeft           DN  D1.U8
+dUL             DN  D2.U8
+
+;//OMX_VC_4x4_VR
+dLeft0          DN  D1.U8
+dLeft1          DN  D2.U8
+dEven0          DN  D3.U8
+dEven1          DN  D4.U8
+dEven2          DN  D5.U8
+dOdd0           DN  D6.U8
+dOdd1           DN  D11.U8
+dOdd2           DN  D12.U8
+dTmp3U32        DN  D10.U32    
+dTmp2U32        DN  D9.U32
+
+
+;//OMX_VC_4x4_HD
+dTmp1U64        DN  D8.U64
+dTmp0U64        DN  D7.U64
+dTmpU64         DN  D6.U64
+dTmpU32         DN  D6.U32
+dTmp1U32        DN  D8.U32
+
+;//OMX_VC_4x4_HU
+dL3             DN  D2.U8
+dLeftHU0        DN  D3.U8
+dLeftHU1        DN  D4.U8
+dLeftHU2        DN  D5.U8
+dTmp0U32        DN  D7.U32
+
+
+
+
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_4x4 starts
+;//-----------------------------------------------------------------------------------------------
+        
+        ;// Write function header
+        M_START omxVCM4P10_PredictIntra_4x4, r12,d12
+        
+        ;// Define stack arguments
+        M_ARG    LeftStep,     4
+        M_ARG    DstStep,      4
+        M_ARG    PredMode,     4
+        M_ARG    Availability, 4
+        
+                
+        LDR      pTable,=armVCM4P10_pSwitchTable4x4  ;// Load index table for switch case
+        
+        ;// Load argument from the stack
+        M_LDRD   predMode,availability,PredMode     ;// Arg predMode & availability loaded from stack to reg 
+        M_LDRD   leftStep,dstStep,LeftStep          ;// Arg leftStep & dstStep loaded from stack to reg 
+        
+        
+        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on preMode
+
+
+OMX_VC_4x4_HOR
+        
+        ADD     pSrcTmp, pSrcLeft, leftStep
+        ADD     srcStep, leftStep, leftStep
+        ;// Load Left Edge
+        VLD1    {dLeftVal0[]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeftVal1[]},[pSrcTmp],srcStep            ;//    pSrcLeft[1*leftStep]
+        VLD1    {dLeftVal2[]},[pSrcLeft]                   ;//    pSrcLeft[2*leftStep]
+        VLD1    {dLeftVal3[]},[pSrcTmp]                    ;//    pSrcLeft[3*leftStep]
+        
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        
+        VST1    dLeftVal0U32[0],[pDst],dstep                ;// pDst[0*dstStep+x] :0<= x <= 7
+        VST1    dLeftVal1U32[0],[pDstTmp],dstep             ;// pDst[1*dstStep+x] :0<= x <= 7
+        VST1    dLeftVal2U32[0],[pDst]                      ;// pDst[2*dstStep+x] :0<= x <= 7
+        VST1    dLeftVal3U32[0],[pDstTmp]                   ;// pDst[3*dstStep+x] :0<= x <= 7
+        
+        B        ExitPredict4x4                             ;// Branch to exit code
+        
+OMX_VC_4x4_VERT
+        
+        ;// Load Upper Edge
+        VLD1     dAboveU32[0],[pSrcAbove]
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        
+DCPredict4x4VertStore         
+        
+        VST1     dAboveU32[0],[pDst],dstep
+        VST1     dAboveU32[0],[pDstTmp],dstep
+        VST1     dAboveU32[0],[pDst]
+        VST1     dAboveU32[0],[pDstTmp]
+
+        B        ExitPredict4x4                             ;// Branch to exit code
+
+OMX_VC_4x4_DC
+        
+        
+        TST     availability, #OMX_VC_LEFT
+        BEQ     DCPredict4x4LeftNotAvailable
+
+        ADD     pSrcTmp, pSrcLeft, leftStep
+        ADD     srcStep, leftStep, leftStep
+        ;// Load Left Edge
+        VLD1    {dLeftVal[0]},[pSrcLeft],srcStep            ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeftVal[1]},[pSrcTmp],srcStep             ;//    pSrcLeft[1*leftStep]
+        VLD1    {dLeftVal[2]},[pSrcLeft]                    ;//    pSrcLeft[2*leftStep]
+        VLD1    {dLeftVal[3]},[pSrcTmp]                     ;//    pSrcLeft[3*leftStep]
+        
+        TST     availability, #OMX_VC_UPPER
+        BEQ     DCPredict4x4LeftOnlyAvailable
+
+        ;// Load Upper Edge also
+        VLD1     dLeftValU32[1],[pSrcAbove]                 ;// pSrcAbove[0 to 3]
+        MOV      return, #OMX_Sts_NoErr
+        
+        VPADDL   dSumAboveLeftU16, dLeftVal                 ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]             
+        VPADDL   dSumAboveLeftU32, dSumAboveLeftU16         ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]] 
+        VPADDL   dSumAboveLeftU64, dSumAboveLeftU32         ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]                          
+        VRSHR    dSumAboveLeftU64,dSumAboveLeftU64,#3       ;// Sum = (Sum + 4) >> 3
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        VDUP     dSum,dSumAboveLeftU8[0]
+        
+        B        DCPredict4x4VertStore  
+        
+DCPredict4x4LeftOnlyAvailable
+
+        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
+        
+        VPADDL   dSumLeftValU16, dLeftVal                   ;// [ XX | pSrcLeft[2+3 | 0+1]]             
+        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// [ XXXX | pSrcLeft[2+3+0+1]] 
+        
+        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        VDUP     dSum,dSumLeftValU8[0]
+        
+        B        DCPredict4x4VertStore   
+        
+DCPredict4x4LeftNotAvailable
+                 
+        TST     availability, #OMX_VC_UPPER
+        BEQ     DCPredict4x4NoneAvailable
+
+        ;// Load Upper Edge 
+        VLD1     dAboveU32[0],[pSrcAbove]                   ;// pSrcAbove[0 to 3]  
+        MOV      return, #OMX_Sts_NoErr
+        
+        VPADDL   dSumAboveValU16, dAboveVal                 ;// [ XX | pSrcAbove[2+3 | 0+1]]             
+        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// [ XXXX | pSrcAbove[2+3+0+1]] 
+        
+        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        VDUP     dSum,dSumAboveValU8[0]
+        
+        B        DCPredict4x4VertStore   
+        
+DCPredict4x4NoneAvailable        
+        
+        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
+        MOV      return, #OMX_Sts_NoErr
+        
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        B        DCPredict4x4VertStore   
+        
+        
+        
+OMX_VC_4x4_DIAG_DL
+        
+        TST     availability, #OMX_VC_UPPER_RIGHT
+        BEQ     DiagDLUpperRightNotAvailable
+       
+        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0] 
+        VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
+        VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
+        VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2] 
+        B       DiagDLPredict4x4Store         
+       
+DiagDLUpperRightNotAvailable
+        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-] 
+        VDUP    dU3, dAbove[7]                          ;// [U3 U3 U3 U3 U3 U3 U3 U3]
+
+        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3 U3 U3 U3 U3 U2 U1 U0]
+        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3 U3 U3 U3 U3 U3 U2 U1]
+        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3 U3 U3 U3 U3 U3 U3 U2]
+       
+DiagDLPredict4x4Store  
+        
+        VHADD   dTmp, dAbove0, dAbove2
+        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
+        
+
+        VST1    dTmpU32[0],[pDst],dstStep
+        VEXT    dTmp,dTmp,dTmp,#1
+        VST1    dTmpU32[0],[pDst],dstStep
+        VEXT    dTmp,dTmp,dTmp,#1
+        VST1    dTmpU32[0],[pDst],dstStep
+        VEXT    dTmp,dTmp,dTmp,#1
+        VST1    dTmpU32[0],[pDst]
+        
+        B        ExitPredict4x4                         ;// Branch to exit code
+        
+
+OMX_VC_4x4_DIAG_DR
+        
+        
+        ;// Load U0,U1,U2,U3
+        
+        VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]
+                
+        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]    
+        VLD1    {dLeft[7]},[pSrcAboveLeft]              
+        ADD     pSrcTmp, pSrcLeft, leftStep
+        ADD     srcStep, leftStep, leftStep
+        ADD     pDst1,pDst,dstStep
+        
+        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
+        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
+        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
+        
+        
+        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]   
+        ADD     pDst2,pDst1,dstStep
+        VEXT    dAbove1,dLeft,dAbove,#4                 ;// [U3|U2|U1|U0|UL|L0|L1|L2]   
+        ADD     pDst3,pDst2,dstStep
+        VEXT    dAbove2,dLeft,dAbove,#5                 ;// [ X|U3|U2|U1|U0|UL|L0|L1]   
+        
+        VHADD   dTmp, dAbove0, dAbove2
+        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
+        
+        
+        VST1    dTmpU32[0],[pDst3]                      ;// Store pTmp[0],[1],[2],[3] @ pDst3
+        VEXT    dTmp,dTmp,dTmp,#1
+        VST1    dTmpU32[0],[pDst2]                      ;// Store pTmp[1],[2],[3],[4] @ pDst2
+        VEXT    dTmp,dTmp,dTmp,#1
+        VST1    dTmpU32[0],[pDst1]                      ;// Store pTmp[2],[3],[4],[5] @ pDst1
+        VEXT    dTmp,dTmp,dTmp,#1
+        VST1    dTmpU32[0],[pDst]                       ;// Store pTmp[3],[4],[5],[6] @ pDst
+        
+        B        ExitPredict4x4                         ;// Branch to exit code
+
+OMX_VC_4x4_VR
+
+        
+        ;// Load UL,U0,U1,U2,U3
+        VLD1    dAboveU32[0],[pSrcAbove]
+        VLD1    dAbove[7],[pSrcAboveLeft]               ;// [UL|X|X|X|U3|U2|U1|U0]
+        
+        ;// Load L0,L1,L2                               ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
+                                                        ;// dLeft1 = [L1| X|X|X|X|X|X|X]    
+        VLD1    {dLeft0[7]},[pSrcLeft],leftStep         ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeft1[7]},[pSrcLeft],leftStep         ;// pSrcLeft[1*leftStep]
+        VLD1    {dLeft0[6]},[pSrcLeft]                  ;// pSrcLeft[2*leftStep]
+        
+        
+        VEXT    dOdd2,dAbove,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 UL ]
+        VEXT    dEven0,dLeft0,dOdd2,#6                  ;// [ x x x U1 U0 UL L0 L2 ]
+        VEXT    dEven1,dLeft1,dOdd2,#7                  ;// [ x x x U2 U1 U0 UL L1 ]
+        VEXT    dEven2,dLeft0,dAbove,#7                 ;// [ x x x U3 U2 U1 U0 L0 ]
+        VEXT    dOdd0,dLeft1,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 L1 ]
+        VEXT    dOdd1,dLeft0,dOdd2,#7                   ;// [ x x x U2 U1 U0 UL L0 ]
+        
+        VHADD   dTmp1, dOdd0, dOdd2
+        VRHADD  dTmp1, dTmp1, dOdd1                     ;// Tmp[ x x x 9 7 5 3 1 ]
+        
+        VHADD   dTmp0, dEven0, dEven2
+        VRHADD  dTmp0, dTmp0, dEven1                    ;// Tmp[ x x x 8 6 4 2 0 ]
+        
+        
+        VEXT    dTmp3,dTmp1,dTmp1,#1                    ;// Tmp[ x x x x 9 7 5 3 ] 
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        VEXT    dTmp2,dTmp0,dTmp0,#1                    ;// Tmp[ x x x x 8 6 4 2 ]
+        
+        
+        VST1    dTmp3U32[0],[pDst],dstep                ;// Tmp[9],[7],[5],[3]
+        VST1    dTmp2U32[0],[pDstTmp],dstep             ;// Tmp[8],[6],[4],[2]
+        VST1    dTmp1U32[0],[pDst],dstep                ;// Tmp[7],[5],[3],[1]
+        VST1    dTmp0U32[0],[pDstTmp]                   ;// Tmp[6],[4],[2],[0]
+        
+        B        ExitPredict4x4                         ;// Branch to exit code
+        
+OMX_VC_4x4_HD
+        
+        
+        ;// Load U0,U1,U2,U3
+        VLD1    dAbove,[pSrcAbove]                      ;//dAboveLeftVal = [U7|U6|U5|U4|U3|U2|U1|U0]
+        
+        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X] 
+        VLD1    {dLeft[7]},[pSrcAboveLeft]   
+        ADD     pSrcTmp, pSrcLeft, leftStep
+        ADD     srcStep, leftStep, leftStep
+        
+        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
+        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
+        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
+        
+        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]  
+        VEXT    dAbove1,dLeft,dAbove,#2                 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]   
+        VEXT    dAbove2,dLeft,dAbove,#1                 ;// [ U0|UL|L0|L1|L2|L3|X|X ]     
+        
+        VHADD   dTmp0, dAbove0, dAbove2
+        VRHADD  dTmp0, dTmp0, dAbove1                   ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]
+        
+        
+        VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1
+        VSHL    dTmp1U64,dTmp1U64,#24                   ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]
+        
+        
+        VSHL    dTmpU64,dTmp0U64,#16                    ;// Tmp[ 2|4|6|8| X | X | X | X ]
+        VZIP    dTmp1,dTmp                              ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
+        VEXT    dTmp0,dTmp0,dTmp0,#6                    ;// Tmp[  X| X| X| X| X| X| 0 | 1 ]
+        VEXT    dTmp1,dTmp,dTmp0,#2                     ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]
+       
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        
+        VST1    dTmp1U32[1],[pDst],dstep                ;// Store pTmp[0|1|2|3]
+        VST1    dTmpU32[1],[pDstTmp],dstep              ;// Store pTmp[2|3|4|5]
+        VST1    dTmp1U32[0],[pDst]                      ;// Store pTmp[4|5|6|7]
+        VST1    dTmpU32[0],[pDstTmp]                    ;// Store pTmp[6|7|8|9]
+        
+        B        ExitPredict4x4                         ;// Branch to exit code
+        
+OMX_VC_4x4_VL
+
+        
+        TST     availability, #OMX_VC_UPPER_RIGHT
+        BEQ     DiagVLUpperRightNotAvailable
+       
+        VLD1    dAbove0,[pSrcAbove]                      ;// [U7|U6|U5|U4|U3|U2|U1|U0] 
+        VEXT    dAbove1,dAbove0,dAbove0,#1               ;// [ X|U7|U6|U5|U4|U3|U2|U1]
+        VEXT    dAbove2,dAbove1,dAbove1,#1               ;// [ X| X|U7|U6|U5|U4|U3|U2]
+        
+        B       DiagVLPredict4x4Store         
+       
+DiagVLUpperRightNotAvailable
+        VLD1    dAboveU32[1],[pSrcAbove]                 ;// [U3|U2|U1|U0|-|-|-|-] 
+        VDUP    dU3, dAbove[7]                           ;// [U3 U3 U3 U3 U3 U3 U3 U3]
+
+        VEXT    dAbove0, dAbove, dU3, #4                 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
+        VEXT    dAbove1, dAbove, dU3, #5                 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
+        VEXT    dAbove2, dAbove, dU3, #6                 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
+       
+DiagVLPredict4x4Store  
+        
+        VRHADD  dTmp0, dAbove1, dAbove0                 ;// (a+b+1)>>1
+                                                        ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]
+        
+        VHADD   dTmp3, dAbove0, dAbove2
+        VRHADD  dTmp3, dTmp3, dAbove1                   ;// (a+2*b+c+2)>>2
+                                                        ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
+                                                         
+        VEXT    dTmp1,dTmp0,dTmp0,#1                    ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
+        ADD     pDstTmp, pDst, dstStep
+        ADD     dstep, dstStep, dstStep
+        VEXT    dTmp2,dTmp3,dTmp1,#1                    ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]
+        
+        VST1    dTmp0U32[0],[pDst],dstep                ;// Tmp[6],[4],[2],[0]
+        VST1    dTmp3U32[0],[pDstTmp],dstep             ;// Tmp[7],[5],[3],[1]
+        VST1    dTmp1U32[0],[pDst]                      ;// Tmp[8],[6],[4],[2]
+        VST1    dTmp2U32[0],[pDstTmp]                   ;// Tmp[9],[7],[5],[3]
+        
+        B        ExitPredict4x4                         ;// Branch to exit code
+        
+OMX_VC_4x4_HU
+        ADD     pSrcTmp, pSrcLeft, leftStep
+        ADD     srcStep, leftStep, leftStep
+
+        ;// Load Left Edge                              ;// [L3|L2|L1|L0|X|X|X|X]
+        VLD1    {dLeft[4]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
+        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
+        VLD1    {dLeft[6]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
+        VLD1    {dLeft[7]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
+        
+        VDUP    dL3,dLeft[7]                            ;// [L3|L3|L3|L3|L3|L3|L3|L3]
+        
+        VEXT    dLeftHU0,dLeft,dL3,#4                   ;// [L3|L3|L3|L3|L3|L2|L1|L0]
+        VEXT    dLeftHU1,dLeft,dL3,#5                   ;// [L3|L3|L3|L3|L3|L3|L2|L1]
+        VEXT    dLeftHU2,dLeft,dL3,#6                   ;// [L3|L3|L3|L3|L3|L3|L3|L2]
+        
+        VHADD   dTmp0, dLeftHU0, dLeftHU2
+        VRHADD  dTmp0, dTmp0, dLeftHU1                  ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]
+        
+        VRHADD  dTmp1, dLeftHU1, dLeftHU0               ;// (a+b+1)>>1 
+                                                        ;//  Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
+                                                      
+        VZIP    dTmp1,dTmp0                             ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]  
+                                                        ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]
+                                                                                                                            
+        
+        VST1    dTmp1U32[0],[pDst],dstStep              ;// [3|2|1|0] 
+        VEXT    dTmp1,dTmp1,dTmp1,#2
+        VST1    dTmp1U32[0],[pDst],dstStep              ;// [5|4|3|2] 
+        VEXT    dTmp1,dTmp1,dTmp1,#2
+        VST1    dTmp1U32[0],[pDst],dstStep              ;// [7|6|5|4]  
+        VST1    dTmp0U32[0],[pDst]                      ;// [9|8|7|6] 
+        
+        
+ExitPredict4x4
+        
+        MOV      return,  #OMX_Sts_NoErr
+        M_END
+
+        ENDIF ;// CortexA8
+        
+        END
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_4x4 ends
+;//-----------------------------------------------------------------------------------------------
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
new file mode 100755
index 0000000..e394339
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
@@ -0,0 +1,140 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_TransformDequantChromaDCFromPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+        
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+        IMPORT armVCM4P10_QPDivTable
+        IMPORT armVCM4P10_VMatrixQPModTable
+            
+        M_VARIANTS CortexA8
+    
+
+    
+    
+    IF CortexA8
+
+;// ARM Registers
+;//--------------------------------------
+;// Declare input registers
+;//--------------------------------------
+ppSrc       RN 0
+pDst        RN 1
+QP          RN 2
+
+;//--------------------------------
+;// Scratch variable for Unpack2x2 
+;//--------------------------------
+pSrc        RN 9
+Value       RN 4
+Value2      RN 5
+Flag        RN 6
+strOffset   RN 7
+cstOffset   RN 8
+
+;//--------------------------------
+;// Scratch variable
+;//--------------------------------
+r0w0        RN  3
+r0w1        RN  4
+
+c0w0        RN  5
+c1w0        RN  6
+
+return      RN  0
+pQPDivTable RN  5
+pQPModTable    RN  6
+Shift        RN  9
+Scale        RN  2
+
+
+
+;// Neon Registers
+
+dZero       DN  D0.U16
+dInvTrCoeff DN  D0.S16
+dScale      DN  D1.S16
+qDqntCoeff  QN  Q1.S32
+dDqntCoeff  DN  D2.S16
+
+
+        ;// Write function header
+        M_START omxVCM4P10_TransformDequantChromaDCFromPair, r9
+        
+        LDR     pSrc, [ppSrc]                        ;// Load pSrc
+        VMOV    dZero, #0
+        MOV     cstOffset, #31                       ;// To be used in the loop, to compute offset
+        
+        ;//-----------------------------------------------------------------------
+        ;// Firstly, fill all the coefficient values on the <pDst> buffer by zero
+        ;//-----------------------------------------------------------------------
+        
+        VST1    dZero,[pDst]                         ;// pDst[0]  = pDst[1]  = pDst[2]  = pDst[3]  = 0   
+        LDRB     Flag,  [pSrc], #1                   ;// Preload <Flag> before <unpackLoop>
+
+
+unpackLoop
+        TST      Flag,  #0x10                        ;// Computing (Flag & 0x10)
+        LDRSBNE  Value2,[pSrc,#1]                  
+        LDRBNE   Value, [pSrc], #2                   ;// Load byte wise to avoid unaligned access
+        AND      strOffset, cstOffset, Flag, LSL #1  ;// strOffset = (Flag & 15) < 1;
+        LDRSBEQ  Value, [pSrc], #1                   ;// Value = (OMX_U8)  *pSrc++
+        ORRNE    Value,Value,Value2, LSL #8          ;// Value = (OMX_U16) *pSrc++
+        
+        TST      Flag,  #0x20                        ;// Computing (Flag & 0x20) to check, if we're done
+        LDRBEQ   Flag,  [pSrc], #1                   ;// Flag  = (OMX_U8) *pSrc++, for next iteration
+        STRH     Value, [pDst, strOffset]            ;// Store <Value> at offset <strOffset>
+        BEQ      unpackLoop                          ;// Branch to the loop beginning
+        
+        ;//--------------------------------------------------
+        ;//InvTransformDC2x2: Inlined (Implemented in ARM V6)
+        ;//--------------------------------------------------
+        
+        LDMIA    pDst, {r0w0, r0w1}                  ;// r0w0 = |c1|c0| & r0w1 = |c3|c2|
+
+        STR      pSrc, [ppSrc]                       ;// Update the bitstream pointer
+        
+        LDR      pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
+        LDR      pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
+        
+        SADDSUBX r0w0, r0w0,  r0w0                   ;// [ c00+c01, c00-c01 ]
+        SADDSUBX r0w1, r0w1,  r0w1                   ;// [ c10+c11, c10-c11 ]
+        
+        LDRSB    Shift, [pQPDivTable, QP]            ;// Shift = pQPDivTable[QP]
+        LDRSB    Scale, [pQPModTable, QP]            ;// Scale = pQPModTable[QP]
+        
+        SADD16   c0w0, r0w0, r0w1                    ;// [ d00+d10, d01+d11 ]
+        SSUB16   c1w0, r0w0, r0w1                    ;// [ d00-d10, d01-d11 ]
+        
+        ;//-------------------------------------------------
+        ;//DequantChromaDC2x2: Inlined (Neon Implementation)
+        ;//-------------------------------------------------
+        
+        LSL      Scale, Scale, Shift                 ;// Scale = Scale << Shift
+        VMOV     dInvTrCoeff, c0w0, c1w0  
+        VREV32   dInvTrCoeff,dInvTrCoeff  
+        VDUP     dScale,Scale 
+        
+        VMULL    qDqntCoeff,dInvTrCoeff,dScale
+        VSHRN    dDqntCoeff,qDqntCoeff,#1
+        
+        
+        VST1     dDqntCoeff,[pDst]                   ;// Storing all the coefficients at once
+        
+        MOV      return, #OMX_Sts_NoErr
+        M_END
+        
+    ENDIF ;// CortexA8
+    
+    
+    END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
new file mode 100755
index 0000000..2529959
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
@@ -0,0 +1,264 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;// 
+;// 
+
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+;// Import/Export symbols required from/to other files
+;// (For example tables)
+        
+        IMPORT armVCM4P10_UnpackBlock4x4 
+        IMPORT armVCM4P10_QPDivTable
+        IMPORT armVCM4P10_VMatrixQPModTable
+        
+        M_VARIANTS CortexA8
+
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
+    
+
+;// Guarding implementation by the processor name
+    
+    
+
+;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
+
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+;//Input Registers
+pData               RN  0
+QP                  RN  1    
+
+
+;//Local Scratch Registers
+
+;// ARM Registers
+
+pQPDivTable         RN  2
+pQPModTable         RN  3
+Shift               RN  4
+Scale               RN  5
+
+;// NEON Registers
+
+;// Packed Input pixels
+dIn0                DN  D0.S16
+dIn1                DN  D1.S16
+dIn2                DN  D2.S16
+dIn3                DN  D3.S16   
+
+;// Intermediate calculations
+dRowSum1            DN  D4.S16
+dRowSum2            DN  D5.S16
+dRowDiff1           DN  D6.S16
+dRowDiff2           DN  D7.S16
+
+;// Row operated pixels
+dRowOp0             DN  D0.S16
+dRowOp1                DN  D1.S16
+dRowOp2                DN  D2.S16
+dRowOp3                DN  D3.S16
+qRowOp01            QN  Q0.32
+qRowOp23            QN  Q1.32
+
+;// Intermediate calculations
+dColSum1            DN  D4.S16
+dColSum2            DN  D5.S16
+dColDiff1           DN  D6.S16
+dColDiff2           DN  D7.S16
+
+;// Coloumn operated pixels
+dColOp0             DN  D0.S16
+dColOp1                DN  D1.S16
+dColOp2                DN  D2.S16
+dColOp3                DN  D3.S16
+
+;// Temporary scratch varaibles
+
+dScale              DN  D5.S16
+qRound0             QN  Q3.S32
+qRound1             QN  Q4.S32
+qRound2             QN  Q5.S32
+qRound3             QN  Q6.S32
+
+;// InvTransformed and Dequantized pixels
+dOut0               DN  D0.S16
+dOut1                DN  D1.S16
+dOut2                DN  D2.S16
+dOut3                DN  D3.S16
+
+       
+    ;// Allocate stack memory required by the function
+        
+
+    ;// Write function header
+    M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
+    
+    ;******************************************************************
+    ;// The strategy used in implementing the transform is as follows:*
+    ;// Load the 4x4 block into 4 D-registers                         *  
+    ;// Transpose the 4x4 matrix                                      *  
+    ;// Perform the row operations (on columns) using SIMD            *  
+    ;// Transpose the 4x4 result matrix                               *  
+    ;// Perform the coloumn operations                                *
+    ;******************************************************************
+
+        ;// Load all the 4x4 pixels in Transposed form
+        
+        VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
+        LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
+        LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
+        
+        ;**************************************** 
+        ;// Row Operations (Performed on columns)
+        ;**************************************** 
+        ;// Scale factor calculation is done using ARM instructions
+        ;// Interleaved with NEON instructions inorder to Dual issue
+        
+        VADD    dRowSum1,dIn0,dIn1
+        VADD    dRowSum2,dIn2,dIn3
+        VSUB    dRowDiff1,dIn0,dIn1
+        LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = pQPDivTable[QP]
+        VSUB    dRowDiff2,dIn2,dIn3
+        LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = pQPModTable[QP] 
+        VADD    dRowOp0,dRowSum1,dRowSum2
+        VSUB    dRowOp1,dRowSum1,dRowSum2
+        VSUB    dRowOp2,dRowDiff1,dRowDiff2
+        LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift
+        VADD    dRowOp3,dRowDiff1,dRowDiff2
+        
+        ;****************************************
+        ;// Transpose the resultant matrix
+        ;****************************************
+        
+        VTRN    dRowOp0,dRowOp1
+        VTRN    dRowOp2,dRowOp3
+        VTRN    qRowOp01,qRowOp23 
+        
+        ;**************************************** 
+        ;// Coloumn Operations 
+        ;**************************************** 
+        
+        VADD    dColSum1,dRowOp0,dRowOp1
+        VADD    dColSum2,dRowOp2,dRowOp3
+        VSUB    dColDiff1,dRowOp0,dRowOp1
+        VSUB    dColDiff2,dRowOp2,dRowOp3
+        VADD    dColOp0,dColSum1,dColSum2
+        VSUB    dColOp1,dColSum1,dColSum2
+        VSUB    dColOp2,dColDiff1,dColDiff2
+        VADD    dColOp3,dColDiff1,dColDiff2
+        
+        ;//----------------------------------------------------------------------
+        ;//
+        ;// <Dequantize> improves on the c-reference code
+        ;// Both the  cases i.e., Shift>=0 and Shift<0 cases are covered together
+        ;// We do not subtract 2 from Shift as in C reference, instead perform a
+        ;// Scale << Shift once in the beginning and do a right shift by a 
+        ;// constant 2 after the Multiplication. The value of Round would be 2 
+        ;// 
+        ;// By doing this we aviod the Branches required and also 
+        ;// reduce the code size substantially
+        ;// 
+        ;//----------------------------------------------------------------------
+        
+        
+        VDUP    dScale, Scale                            ;// ARM -> NEON  copy 'scale' to vector
+               
+                
+        VMOV    qRound0,#2                               ;// Set the Round Value 
+        VMOV    qRound1,#2
+        VMOV    qRound2,#2
+        VMOV    qRound3,#2
+        
+        VMLAL   qRound0,dColOp0,dScale                   ;// pDst[i] * Scale + Round 
+        VMLAL   qRound1,dColOp1,dScale
+        VMLAL   qRound2,dColOp2,dScale
+        VMLAL   qRound3,dColOp3,dScale
+        
+        VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 & (OMX_S16)Value
+        VSHRN   dOut1,qRound1,#2
+        VSHRN   dOut2,qRound2,#2
+        VSHRN   dOut3,qRound3,#2
+        
+        ;***************************
+        ;// Store all the 4x4 pixels
+        ;***************************
+        
+        VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]
+
+        
+        ;// Set return value
+        
+        ;// Write function tail
+        M_END        
+        
+    ENDIF                                                           ;//CORTEXA8   
+        
+
+
+;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
+    
+;//Input Registers
+ppSrc               RN  0
+pDst                RN  1
+QPR2                RN  2
+
+;//Output Registers
+result              RN  0
+
+;//Local Scratch Registers
+pDstR4              RN  4
+pDstR0              RN  0
+QPR1                RN  1
+QPR5                RN  5
+
+;// Guarding implementation by the processor name
+    
+    IF CortexA8
+       
+    ;// Allocate stack memory required by the function
+        
+
+    ;// Write function header
+        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
+        
+        MOV     pDstR4,pDst                         ;// Saving register r1
+        MOV     QPR5,QPR2                           ;// Saving register r2
+        BL      armVCM4P10_UnpackBlock4x4
+        
+        MOV     pDstR0,pDstR4                       ;// Setting up register r0
+        MOV     QPR1,QPR5                           ;// Setting up register r1
+        BL      armVCM4P10_InvTransformDequantLumaDC4x4
+                               
+       
+        ;// Set return value
+        MOV     result,#OMX_Sts_NoErr        
+       
+        ;// Write function tail
+        M_END
+        
+            
+    ENDIF                                                           ;//ARM1136JS  
+    
+
+    END
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S
new file mode 100644
index 0000000..aca2df4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S
@@ -0,0 +1,134 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_Average_4x4_Align0_unsafe
+    .func   armVCM4P10_Average_4x4_Align0_unsafe
+armVCM4P10_Average_4x4_Align0_unsafe:
+    PUSH     {r4-r6,lr}
+    LDR      r7, =0x80808080
+    LDR      r12,[r2,#0]
+    LDR      r10,[r0],r1
+    LDR      lr,[r2,r3]
+    LDR      r11,[r0],r1
+    MVN      r12,r12
+    MVN      lr,lr
+    UHSUB8   r5,r10,r12
+    UHSUB8   r4,r11,lr
+    EOR      r5,r5,r7
+    STR      r5,[r2],r3
+    EOR      r4,r4,r7
+    STR      r4,[r2],r3
+    LDR      r10,[r0],r1
+    LDR      r12,[r2,#0]
+    LDR      r11,[r0],r1
+    LDR      lr,[r2,r3]
+    MVN      r12,r12
+    UHSUB8   r5,r10,r12
+    MVN      lr,lr
+    UHSUB8   r4,r11,lr
+    EOR      r5,r5,r7
+    STR      r5,[r2],r3
+    EOR      r4,r4,r7
+    STR      r4,[r2],r3
+    POP      {r4-r6,pc}
+    .endfunc
+
+    .global armVCM4P10_Average_4x4_Align2_unsafe
+    .func   armVCM4P10_Average_4x4_Align2_unsafe
+armVCM4P10_Average_4x4_Align2_unsafe:
+    PUSH     {r4-r6,lr}
+    LDR      r7, =0x80808080
+    LDR      r4,[r0,#4]
+    LDR      r10,[r0],r1
+    LDR      r12,[r2,#0]
+    LDR      lr,[r2,r3]
+    LDR      r5,[r0,#4]
+    LDR      r11,[r0],r1
+    MVN      r12,r12
+    MVN      lr,lr
+    LSR      r10,r10,#16
+    ORR      r10,r10,r4,LSL #16
+    LSR      r11,r11,#16
+    ORR      r11,r11,r5,LSL #16
+    UHSUB8   r5,r10,r12
+    UHSUB8   r4,r11,lr
+    EOR      r5,r5,r7
+    STR      r5,[r2],r3
+    EOR      r4,r4,r7
+    STR      r4,[r2],r3
+    LDR      r4,[r0,#4]
+    LDR      r10,[r0],r1
+    LDR      r12,[r2,#0]
+    LDR      lr,[r2,r3]
+    LDR      r5,[r0,#4]
+    LDR      r11,[r0],r1
+    MVN      r12,r12
+    MVN      lr,lr
+    LSR      r10,r10,#16
+    ORR      r10,r10,r4,LSL #16
+    LSR      r11,r11,#16
+    ORR      r11,r11,r5,LSL #16
+    UHSUB8   r5,r10,r12
+    UHSUB8   r4,r11,lr
+    EOR      r5,r5,r7
+    STR      r5,[r2],r3
+    EOR      r4,r4,r7
+    STR      r4,[r2],r3
+    POP      {r4-r6,pc}
+    .endfunc
+
+    .global armVCM4P10_Average_4x4_Align3_unsafe
+    .func   armVCM4P10_Average_4x4_Align3_unsafe
+armVCM4P10_Average_4x4_Align3_unsafe:
+    PUSH     {r4-r6,lr}
+    LDR      r7, =0x80808080
+    LDR      r4,[r0,#4]
+    LDR      r10,[r0],r1
+    LDR      r12,[r2,#0]
+    LDR      lr,[r2,r3]
+    LDR      r5,[r0,#4]
+    LDR      r11,[r0],r1
+    MVN      r12,r12
+    MVN      lr,lr
+    LSR      r10,r10,#24
+    ORR      r10,r10,r4,LSL #8
+    LSR      r11,r11,#24
+    ORR      r11,r11,r5,LSL #8
+    UHSUB8   r5,r10,r12
+    UHSUB8   r4,r11,lr
+    EOR      r5,r5,r7
+    STR      r5,[r2],r3
+    EOR      r4,r4,r7
+    STR      r4,[r2],r3
+    LDR      r4,[r0,#4]
+    LDR      r10,[r0],r1
+    LDR      r12,[r2,#0]
+    LDR      lr,[r2,r3]
+    LDR      r5,[r0,#4]
+    LDR      r11,[r0],r1
+    MVN      r12,r12
+    MVN      lr,lr
+    LSR      r10,r10,#24
+    ORR      r10,r10,r4,LSL #8
+    LSR      r11,r11,#24
+    ORR      r11,r11,r5,LSL #8
+    UHSUB8   r5,r10,r12
+    UHSUB8   r4,r11,lr
+    EOR      r5,r5,r7
+    STR      r5,[r2],r3
+    EOR      r4,r4,r7
+    STR      r4,[r2],r3
+    POP      {r4-r6,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S
new file mode 100644
index 0000000..b9ee221
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S
@@ -0,0 +1,54 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_DeblockingChromabSLT4_unsafe
+    .func   armVCM4P10_DeblockingChromabSLT4_unsafe
+armVCM4P10_DeblockingChromabSLT4_unsafe:
+    VLD1.32  {d18[0]},[r5]!
+    VSUBL.U8 q11,d5,d9
+    VMOV     d28,d18
+    VSUBL.U8 q10,d8,d4
+    VSHR.S16 q11,q11,#2
+    VZIP.8   d18,d28
+    VBIF     d18,d14,d16
+    VRHADD.S16 q10,q11,q10
+    VADD.I8  d31,d18,d15
+    VQMOVN.S16 d20,q10
+    VLD1.8   {d0[]},[r2]
+    VMIN.S8  d20,d20,d31
+    VNEG.S8  d31,d31
+    VLD1.8   {d2[]},[r3]
+    VMAX.S8  d20,d20,d31
+    VMOVL.U8 q14,d4
+    VMOVL.U8 q12,d8
+    VADDW.S8 q14,q14,d20
+    VSUBW.S8 q12,q12,d20
+    VQMOVUN.S16 d29,q14
+    VQMOVUN.S16 d24,q12
+    BX       lr
+    .endfunc
+
+    .global armVCM4P10_DeblockingChromabSGE4_unsafe
+    .func   armVCM4P10_DeblockingChromabSGE4_unsafe
+armVCM4P10_DeblockingChromabSGE4_unsafe:
+    VHADD.U8 d13,d4,d9
+    VHADD.U8 d31,d8,d5
+    VLD1.8   {d0[]},[r2]
+    ADD      r5,r5,#4
+    VLD1.8   {d2[]},[r3]
+    VRHADD.U8 d13,d13,d5
+    VRHADD.U8 d31,d31,d9
+    BX       lr
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S
new file mode 100644
index 0000000..47f3d44
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S
@@ -0,0 +1,102 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_DeblockingLumabSLT4_unsafe
+    .func   armVCM4P10_DeblockingLumabSLT4_unsafe
+armVCM4P10_DeblockingLumabSLT4_unsafe:
+    VSUBL.U8 q11,d5,d9
+    VLD1.8   {d18[]},[r5]!
+    VSUBL.U8 q10,d8,d4
+    VLD1.8   {d19[]},[r5]!
+    VSHR.S16 q11,q11,#2
+    VEXT.8   d18,d18,d19,#4
+    VAND     d19,d17,d15
+    VBIF     d18,d14,d16
+    VRHADD.S16 q10,q11,q10
+    VRHADD.U8 d24,d4,d8
+    VADD.I8  d31,d18,d19
+    VAND     d19,d12,d15
+    VQADD.U8 d23,d5,d18
+    VQMOVN.S16 d20,q10
+    VADD.I8  d31,d31,d19
+    VQSUB.U8 d22,d5,d18
+    VQADD.U8 d19,d9,d18
+    VHADD.U8 d26,d24,d6
+    VMIN.S8  d20,d20,d31
+    VNEG.S8  d31,d31
+    VQSUB.U8 d21,d9,d18
+    VHADD.U8 d27,d24,d10
+    VMAX.U8  d30,d26,d22
+    VMAX.S8  d20,d20,d31
+    VMOVL.U8 q14,d4
+    VMOVL.U8 q12,d8
+    VADDW.S8 q14,q14,d20
+    VSUBW.S8 q12,q12,d20
+    VQMOVUN.S16 d29,q14
+    VQMOVUN.S16 d24,q12
+    VMAX.U8  d25,d27,d21
+    VMIN.U8  d30,d30,d23
+    VMIN.U8  d25,d25,d19
+    VBIF     d29,d4,d16
+    VBIF     d30,d5,d17
+    VBIF     d24,d8,d16
+    VBIF     d25,d9,d12
+    BX       lr
+    .endfunc
+
+    .global armVCM4P10_DeblockingLumabSGE4_unsafe
+    .func   armVCM4P10_DeblockingLumabSGE4_unsafe
+armVCM4P10_DeblockingLumabSGE4_unsafe:
+    VSHR.U8  d19,d0,#2
+    VADD.I8  d19,d19,d15
+    VADDL.U8 q10,d8,d4
+    VADD.I8  d19,d19,d15
+    VADDL.U8 q11,d6,d9
+    VADDW.U8 q12,q10,d5
+    VCGT.U8  d19,d19,d13
+    VSHR.U16 q11,q11,#1
+    VHADD.U16 q11,q12,q11
+    VADDW.U8 q12,q12,d6
+    VADDL.U8 q13,d7,d6
+    VAND     d17,d17,d19
+    VHADD.U8 d28,d4,d9
+    VSRA.U16 q13,q12,#1
+    VAND     d12,d12,d19
+    VQRSHRN.U16 d29,q11,#1
+    VRHADD.U8 d28,d28,d5
+    VQRSHRN.U16 d30,q12,#2
+    VADDL.U8 q11,d10,d5
+    VADDW.U8 q12,q10,d9
+    VBIF     d29,d28,d17
+    VQRSHRN.U16 d31,q13,#2
+    VADDL.U8 q13,d11,d10
+    VSHR.U16 q11,q11,#1
+    VHADD.U16 q11,q12,q11
+    VADDW.U8 q12,q12,d10
+    VHADD.U8 d28,d8,d5
+    VBIF     d29,d4,d16
+    VBIF     d30,d5,d17
+    VSRA.U16 q13,q12,#1
+    VQRSHRN.U16 d25,q12,#2
+    VQRSHRN.U16 d24,q11,#1
+    VRHADD.U8 d22,d28,d9
+    VBIF     d25,d9,d12
+    VBIF     d31,d6,d17
+    VBIF     d24,d22,d12
+    VQRSHRN.U16 d28,q13,#2
+    VBIF     d24,d8,d16
+    VBIF     d28,d10,d12
+    BX       lr
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S
new file mode 100644
index 0000000..e68bd8e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S
@@ -0,0 +1,272 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_DecodeCoeffsToPair
+    .func   armVCM4P10_DecodeCoeffsToPair
+armVCM4P10_DecodeCoeffsToPair:
+    PUSH     {r4-r12,lr}
+    SUB      sp,sp,#0x40
+    LDR      r10,[r0,#0]
+    LDR      r12,[r1,#0]
+    LDR      r6, =armVCM4P10_CAVLCCoeffTokenTables
+    LDR      r4,[sp,#0x68]
+    LDRB     r9,[r10,#2]
+    LDRB     r8,[r10,#1]
+    LDRB     r11,[r10],#3
+    ADD      r12,r12,#8
+    LDR      r6,[r6,r4,LSL #2]
+    ORR      r9,r9,r8,LSL #8
+    ORR      r11,r9,r11,LSL #16
+    LSLS     r8,r11,r12
+    MOVS     r7,#0x1e
+    AND      r7,r7,r8,LSR #27
+    SUBS     r12,r12,#8
+L0x44:
+    BCC      L1
+    LDRB     r8,[r10],#1
+L1:
+    LDRH     r7,[r6,r7]
+    ADDCC    r12,r12,#8
+    ADD      r12,r12,#4
+    ORRCS    r11,r8,r11,LSL #8
+    LSRS     r8,r7,#1
+    BCS      L0x74
+    LSLS     r8,r11,r12
+    SUBS     r12,r12,#0xa
+    ADD      r7,r7,r8,LSR #29
+    BIC      r7,r7,#1
+    B        L0x44
+L0x74:
+    SUB      r12,r12,r7,LSR #13
+    BIC      r7,r8,#0xf000
+    LSRS     r5,r7,#2
+    STRB     r5,[r2,#0]
+    BEQ      L0x344
+    CMP      r7,#0x44
+    BGE      L0x33c
+    STR      r0,[sp,#0]
+    STR      r1,[sp,#4]
+    STR      r3,[sp,#8]
+    ANDS     r1,r7,#3
+    ADD      r2,sp,#0xc
+    BEQ      L0xd8
+    MOV      r0,r1
+L0xac:
+    LSLS     r7,r11,r12
+    SUBS     r12,r12,#7
+    BCC      L2
+    LDRB     r8,[r10],#1
+L2:
+    ADDCC    r12,r12,#8
+    LSR      r7,r7,#31
+    ORRCS    r11,r8,r11,LSL #8
+    SUBS     r0,r0,#1
+    MOV      r8,#1
+    SUB      r8,r8,r7,LSL #1
+    STRH     r8,[r2],#2
+    BGT      L0xac
+L0xd8:
+    SUBS     r0,r5,r1
+    BEQ      L0x1b8
+    MOV      r4,#1
+    CMP      r5,#0xa
+    MOVLE    r4,#0
+    CMP      r1,#3
+    MOVLT    r1,#4
+    MOVGE    r1,#2
+    MOVGE    r4,#0
+L0xfc:
+    LSLS     r7,r11,r12
+    CLZ      r7,r7
+    ADD      r12,r12,r7
+    SUBS     r12,r12,#7
+    BCC      L3
+    LDRB     r8,[r10],#1
+    ORR      r11,r8,r11,LSL #8
+    SUBS     r12,r12,#8
+    BCC      L3
+    LDRB     r8,[r10],#1
+L3:
+    ADDCC    r12,r12,#8
+    ORRCS    r11,r8,r11,LSL #8
+    CMP      r7,#0x10
+    BGE      L0x33c
+    MOVS     lr,r4
+    TEQEQ    r7,#0xe
+    MOVEQ    lr,#4
+    TEQ      r7,#0xf
+    MOVEQ    lr,#0xc
+    TEQEQ    r4,#0
+    ADDEQ    r7,r7,#0xf
+    TEQ      lr,#0
+    BEQ      L0x184
+    LSL      r3,r11,r12
+    ADD      r12,r12,lr
+    SUBS     r12,r12,#8
+    RSB      r9,lr,#0x20
+    BCC      L4
+    LDRB     r8,[r10],#1
+    ORR      r11,r8,r11,LSL #8
+    SUBS     r12,r12,#8
+    BCC      L4
+    LDRB     r8,[r10],#1
+L4:
+    ADDCC    r12,r12,#8
+    LSR      r3,r3,r9
+    ORRCS    r11,r8,r11,LSL #8
+    LSL      r7,r7,r4
+    ADD      r7,r3,r7
+L0x184:
+    ADD      r7,r7,r1
+    MOV      r1,#2
+    LSRS     r8,r7,#1
+    RSBCS    r8,r8,#0
+    STRH     r8,[r2],#2
+    LDR      r9, =armVCM4P10_SuffixToLevel
+    LDRSB    r8,[r9,r4]
+    TEQ      r4,#0
+    MOVEQ    r4,#1
+    CMP      r7,r8
+    ADDCS    r4,r4,#1
+    SUBS     r0,r0,#1
+    BGT      L0xfc
+L0x1b8:
+    LDR      r8,[sp,#0x6c]
+    SUB      r0,r5,#1
+    SUBS     r1,r8,r5
+    ADD      r4,sp,#0x2c
+    MOV      lr,r5
+    SUB      lr,lr,#1
+    BEQ      L0x2b0
+    TEQ      r8,#4
+    LDREQ    r6, =(armVCM4P10_CAVLCTotalZeros2x2Tables - 4)
+    LDRNE    r6, =(armVCM4P10_CAVLCTotalZeroTables - 4)
+    LDR      r6,[r6,r5,LSL #2]
+    LSLS     r8,r11,r12
+    MOVS     r7,#0x1e
+    AND      r7,r7,r8,LSR #27
+    SUBS     r12,r12,#8
+L0x1f4:
+    BCC      L5
+    LDRB     r8,[r10],#1
+L5:
+    LDRH     r7,[r6,r7]
+    ADDCC    r12,r12,#8
+    ADD      r12,r12,#4
+    ORRCS    r11,r8,r11,LSL #8
+    LSRS     r8,r7,#1
+    BCS      L0x224
+    LSLS     r8,r11,r12
+    SUBS     r12,r12,#0xa
+    ADD      r7,r7,r8,LSR #29
+    BIC      r7,r7,#1
+    B        L0x1f4
+L0x224:
+    SUB      r12,r12,r7,LSR #13
+    BIC      r7,r8,#0xf000
+    CMP      r7,#0x10
+    BGE      L0x33c
+    LDR      r3, =(armVCM4P10_CAVLCRunBeforeTables - 4)
+    ADD      r4,sp,#0x2c
+    MOVS     r1,r7
+    ADD      lr,lr,r1
+    BEQ      L0x2b0
+L0x248:
+    SUBS     r0,r0,#1
+    LDR      r6,[r3,r1,LSL #2]
+    BLT      L0x2bc
+    LSLS     r8,r11,r12
+    MOVS     r7,#0xe
+    AND      r7,r7,r8,LSR #28
+    SUBS     r12,r12,#8
+L0x264:
+    BCC      L6
+    LDRB     r8,[r10],#1
+L6:
+    LDRH     r7,[r6,r7]
+    ADDCC    r12,r12,#8
+    ADD      r12,r12,#3
+    ORRCS    r11,r8,r11,LSL #8
+    LSRS     r8,r7,#1
+    BCS      L0x294
+    LSLS     r8,r11,r12
+    SUBS     r12,r12,#9
+    ADD      r7,r7,r8,LSR #29
+    BIC      r7,r7,#1
+    B        L0x264
+L0x294:
+    SUB      r12,r12,r7,LSR #13
+    BIC      r7,r8,#0xf000
+    CMP      r7,#0xf
+    BGE      L0x33c
+    SUBS     r1,r1,r7
+    STRB     r7,[r4],#1
+    BGT      L0x248
+L0x2b0:
+    SUBS     r0,r0,#1
+    BLT      L7
+    STRB     r1,[r4],#1
+L7:
+    BGT      L0x2b0
+L0x2bc:
+    STRB     r1,[r4],#1
+    LDR      r8,[sp,#0x6c]
+    TEQ      r8,#0xf
+    ADDEQ    lr,lr,#1
+    SUB      r4,r4,r5
+    SUB      r2,r2,r5
+    SUB      r2,r2,r5
+    LDR      r3,[sp,#8]
+    LDR      r0,[r3,#0]
+    TEQ      r8,#4
+    LDREQ    r6, =armVCM4P10_ZigZag_2x2
+    LDRNE    r6, =armVCM4P10_ZigZag_4x4
+L0x2ec:
+    LDRB     r9,[r4],#1
+    LDRB     r8,[r6,lr]
+    SUB      lr,lr,#1
+    SUB      lr,lr,r9
+    LDRSH    r9,[r2],#2
+    SUBS     r5,r5,#1
+    ORREQ    r8,r8,#0x20
+    ADD      r1,r9,#0x80
+    CMP      r1,#0x100
+    ORRCS    r8,r8,#0x10
+    TEQ      r5,#0
+    STRB     r8,[r0],#1
+    STRB     r9,[r0],#1
+    LSR      r9,r9,#8
+    BCC      L8
+    STRB     r9,[r0],#1
+L8:
+    BNE      L0x2ec
+    STR      r0,[r3,#0]
+    LDR      r0,[sp,#0]
+    LDR      r1,[sp,#4]
+    B        L0x344
+L0x33c:
+    MVN      r0,#1
+    B        L0x35c
+L0x344:
+    ADD      r10,r10,r12,LSR #3
+    AND      r12,r12,#7
+    SUB      r10,r10,#4
+    STR      r12,[r1,#0]
+    STR      r10,[r0,#0]
+    MOV      r0,#0
+L0x35c:
+    ADD      sp,sp,#0x40
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S
new file mode 100644
index 0000000..44eb428
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S
@@ -0,0 +1,103 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .section .rodata
+    .align 4
+
+
+    .global armVCM4P10_QPDivTable
+    .global armVCM4P10_VMatrixQPModTable
+    .global armVCM4P10_PosToVCol4x4
+    .global armVCM4P10_PosToVCol2x2
+    .global armVCM4P10_VMatrix
+    .global armVCM4P10_QPModuloTable
+    .global armVCM4P10_VMatrixU16
+
+armVCM4P10_PosToVCol4x4:
+    .byte  0, 2, 0, 2
+    .byte  2, 1, 2, 1
+    .byte  0, 2, 0, 2
+    .byte  2, 1, 2, 1
+
+armVCM4P10_PosToVCol2x2:
+    .byte  0, 2
+    .byte  2, 1
+
+armVCM4P10_VMatrix:
+    .byte  10, 16, 13
+    .byte  11, 18, 14
+    .byte  13, 20, 16
+    .byte  14, 23, 18
+    .byte  16, 25, 20
+    .byte  18, 29, 23
+
+;//-------------------------------------------------------
+;// This table evaluates the expression [(INT)(QP/6)],
+;// for values of QP from 0 to 51 (inclusive).
+;//-------------------------------------------------------
+
+armVCM4P10_QPDivTable:
+    .byte  0,  0,  0,  0,  0,  0
+    .byte  1,  1,  1,  1,  1,  1
+    .byte  2,  2,  2,  2,  2,  2
+    .byte  3,  3,  3,  3,  3,  3
+    .byte  4,  4,  4,  4,  4,  4
+    .byte  5,  5,  5,  5,  5,  5
+    .byte  6,  6,  6,  6,  6,  6
+    .byte  7,  7,  7,  7,  7,  7
+    .byte  8,  8,  8,  8,  8,  8
+
+;//----------------------------------------------------
+;// This table contains armVCM4P10_VMatrix[QP%6][0] entires,
+;// for values of QP from 0 to 51 (inclusive).
+;//----------------------------------------------------
+
+armVCM4P10_VMatrixQPModTable:
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+    .byte 10, 11, 13, 14, 16, 18
+
+;//-------------------------------------------------------
+;// This table evaluates the modulus expression [QP%6]*6,
+;// for values of QP from 0 to 51 (inclusive).
+;//-------------------------------------------------------
+
+armVCM4P10_QPModuloTable:
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+    .byte 0, 6, 12, 18, 24, 30
+
+;//-------------------------------------------------------
+;// This table contains the invidual byte values stored as
+;// halfwords. This avoids unpacking inside the function
+;//-------------------------------------------------------
+
+armVCM4P10_VMatrixU16:
+    .hword 10, 16, 13
+    .hword 11, 18, 14
+    .hword 13, 20, 16
+    .hword 14, 23, 18
+    .hword 16, 25, 20
+    .hword 18, 29, 23
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S
new file mode 100644
index 0000000..37bc69b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S
@@ -0,0 +1,123 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+    .func   armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+armVCM4P10_InterpolateLuma_HorAlign9x_unsafe:
+    MOV      r12,r8
+    AND      r7,r0,#3
+    BIC      r0,r0,#3
+    ADD      pc,pc,r7,LSL #2
+    NOP
+    B        Copy0toAligned
+    B        Copy1toAligned
+    B        Copy2toAligned
+    B        Copy3toAligned
+Copy0toAligned:
+    LDM      r0,{r7,r10,r11}
+    SUBS     r9,r9,#1
+    ADD      r0,r0,r1
+    STM      r8!,{r7,r10,r11}
+    BGT      Copy0toAligned
+    B        CopyEnd
+Copy1toAligned:
+    LDM      r0,{r7,r10,r11}
+    SUBS     r9,r9,#1
+    ADD      r0,r0,r1
+    LSR      r7,r7,#8
+    ORR      r7,r7,r10,LSL #24
+    LSR      r10,r10,#8
+    ORR      r10,r10,r11,LSL #24
+    LSR      r11,r11,#8
+    STM      r8!,{r7,r10,r11}
+    BGT      Copy1toAligned
+    B        CopyEnd
+Copy2toAligned:
+    LDM      r0,{r7,r10,r11}
+    SUBS     r9,r9,#1
+    ADD      r0,r0,r1
+    LSR      r7,r7,#16
+    ORR      r7,r7,r10,LSL #16
+    LSR      r10,r10,#16
+    ORR      r10,r10,r11,LSL #16
+    LSR      r11,r11,#16
+    STM      r8!,{r7,r10,r11}
+    BGT      Copy2toAligned
+    B        CopyEnd
+Copy3toAligned:
+    LDM      r0,{r7,r10,r11}
+    SUBS     r9,r9,#1
+    ADD      r0,r0,r1
+    LSR      r7,r7,#24
+    ORR      r7,r7,r10,LSL #8
+    LSR      r10,r10,#24
+    ORR      r10,r10,r11,LSL #8
+    LSR      r11,r11,#24
+    STM      r8!,{r7,r10,r11}
+    BGT      Copy3toAligned
+CopyEnd:
+    MOV      r0,r12
+    MOV      r1,#0xc
+    BX       lr
+    .endfunc
+
+    .global armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+    .func   armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+armVCM4P10_InterpolateLuma_VerAlign4x_unsafe:
+    AND      r7,r0,#3
+    BIC      r0,r0,#3
+    ADD      pc,pc,r7,LSL #2
+    NOP
+    B        Copy0toVAligned
+    B        Copy1toVAligned
+    B        Copy2toVAligned
+    B        Copy3toVAligned
+Copy0toVAligned:
+    LDR      r7,[r0],r1
+    SUBS     r9,r9,#1
+    STR      r7,[r8],#4
+    BGT      Copy0toVAligned
+    B        CopyVEnd
+Copy1toVAligned:
+    LDR      r10,[r0,#4]
+    LDR      r7,[r0],r1
+    SUBS     r9,r9,#1
+    LSL      r10,r10,#24
+    ORR      r7,r10,r7,LSR #8
+    STR      r7,[r8],#4
+    BGT      Copy1toVAligned
+    B        CopyVEnd
+Copy2toVAligned:
+    LDR      r10,[r0,#4]
+    LDR      r7,[r0],r1
+    SUBS     r9,r9,#1
+    LSL      r10,r10,#16
+    ORR      r7,r10,r7,LSR #16
+    STR      r7,[r8],#4
+    BGT      Copy2toVAligned
+    B        CopyVEnd
+Copy3toVAligned:
+    LDR      r10,[r0,#4]
+    LDR      r7,[r0],r1
+    SUBS     r9,r9,#1
+    LSL      r10,r10,#8
+    ORR      r7,r10,r7,LSR #24
+    STR      r7,[r8],#4
+    BGT      Copy3toVAligned
+CopyVEnd:
+    SUB      r0,r8,#0x1c
+    MOV      r1,#4
+    BX       lr
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S
new file mode 100644
index 0000000..fe92201
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S
@@ -0,0 +1,105 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InterpolateLuma_Copy4x4_unsafe
+    .func   armVCM4P10_InterpolateLuma_Copy4x4_unsafe
+armVCM4P10_InterpolateLuma_Copy4x4_unsafe:
+    PUSH     {r4-r6,lr}
+    AND      r12,r0,#3
+    BIC      r0,r0,#3
+    ADD      pc,pc,r12,LSL #2
+    NOP
+    B        Copy4x4Align0
+    B        Copy4x4Align1
+    B        Copy4x4Align2
+    B        Copy4x4Align3
+Copy4x4Align0:
+    LDR      r4,[r0],r1
+    LDR      r5,[r0],r1
+    STR      r4,[r2],r3
+    LDR      r8,[r0],r1
+    STR      r5,[r2],r3
+    LDR      r9,[r0],r1
+    STR      r8,[r2],r3
+    STR      r9,[r2],r3
+    B        Copy4x4End
+Copy4x4Align1:
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    LDR      r9,[r0,#4]
+    LDR      r8,[r0],r1
+    LSR      r4,r4,#8
+    ORR      r4,r4,r5,LSL #24
+    STR      r4,[r2],r3
+    LSR      r8,r8,#8
+    ORR      r8,r8,r9,LSL #24
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    STR      r8,[r2],r3
+    LDR      r9,[r0,#4]
+    LDR      r8,[r0],r1
+    LSR      r4,r4,#8
+    ORR      r4,r4,r5,LSL #24
+    STR      r4,[r2],r3
+    LSR      r8,r8,#8
+    ORR      r8,r8,r9,LSL #24
+    STR      r8,[r2],r3
+    B        Copy4x4End
+Copy4x4Align2:
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    LDR      r9,[r0,#4]
+    LDR      r8,[r0],r1
+    LSR      r4,r4,#16
+    ORR      r4,r4,r5,LSL #16
+    STR      r4,[r2],r3
+    LSR      r8,r8,#16
+    ORR      r8,r8,r9,LSL #16
+    STR      r8,[r2],r3
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    LDR      r9,[r0,#4]
+    LDR      r8,[r0],r1
+    LSR      r4,r4,#16
+    ORR      r4,r4,r5,LSL #16
+    STR      r4,[r2],r3
+    LSR      r8,r8,#16
+    ORR      r8,r8,r9,LSL #16
+    STR      r8,[r2],r3
+    B        Copy4x4End
+Copy4x4Align3:
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    LDR      r9,[r0,#4]
+    LDR      r8,[r0],r1
+    LSR      r4,r4,#24
+    ORR      r4,r4,r5,LSL #8
+    STR      r4,[r2],r3
+    LSR      r8,r8,#24
+    ORR      r8,r8,r9,LSL #8
+    STR      r8,[r2],r3
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    LDR      r9,[r0,#4]
+    LDR      r8,[r0],r1
+    LSR      r4,r4,#24
+    ORR      r4,r4,r5,LSL #8
+    STR      r4,[r2],r3
+    LSR      r8,r8,#24
+    ORR      r8,r8,r9,LSL #8
+    STR      r8,[r2],r3
+Copy4x4End:
+    POP      {r4-r6,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S
new file mode 100644
index 0000000..544abe8
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S
@@ -0,0 +1,107 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe
+    .func   armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe
+armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe:
+    PUSH     {r4-r6,lr}
+    MOV      lr,#4
+    LDR      r6, =0xfe00fe0
+    LDR      r12, =0xff00ff
+LoopStart1:
+    LDR      r11,[r0,#0xc]
+    LDR      r10,[r0,#8]
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    UQSUB16  r11,r11,r6
+    UQSUB16  r10,r10,r6
+    UQSUB16  r5,r5,r6
+    UQSUB16  r4,r4,r6
+    USAT16   r11,#13,r11
+    USAT16   r10,#13,r10
+    USAT16   r5,#13,r5
+    USAT16   r4,#13,r4
+    AND      r11,r12,r11,LSR #5
+    AND      r10,r12,r10,LSR #5
+    AND      r5,r12,r5,LSR #5
+    AND      r4,r12,r4,LSR #5
+    ORR      r11,r10,r11,LSL #8
+    ORR      r10,r4,r5,LSL #8
+    SUBS     lr,lr,#1
+    STRD     r10,r11,[r7],#8
+    BGT      LoopStart1
+    SUB      r0,r7,#0x20
+    MOV      r1,#8
+    POP      {r4-r6,pc}
+    .endfunc
+
+    .global armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe
+    .func   armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe
+armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe:
+    PUSH     {r4-r6,lr}
+    LDR      r6, =0xfe00fe0
+    LDR      r12, =0xff00ff
+    MOV      lr,#2
+LoopStart:
+    LDR      r11,[r0,#0xc]
+    LDR      r10,[r0,#8]
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    UQSUB16  r11,r11,r6
+    UQSUB16  r10,r10,r6
+    UQSUB16  r5,r5,r6
+    UQSUB16  r4,r4,r6
+    USAT16   r11,#13,r11
+    USAT16   r10,#13,r10
+    USAT16   r5,#13,r5
+    USAT16   r4,#13,r4
+    AND      r11,r12,r11,LSR #5
+    AND      r10,r12,r10,LSR #5
+    AND      r5,r12,r5,LSR #5
+    AND      r4,r12,r4,LSR #5
+    ORR      r11,r10,r11,LSL #8
+    ORR      r10,r4,r5,LSL #8
+    PKHBT    r4,r10,r11,LSL #16
+    STR      r4,[r7],#8
+    PKHTB    r5,r11,r10,ASR #16
+    STR      r5,[r7],#-4
+    LDR      r11,[r0,#0xc]
+    LDR      r10,[r0,#8]
+    LDR      r5,[r0,#4]
+    LDR      r4,[r0],r1
+    UQSUB16  r11,r11,r6
+    UQSUB16  r10,r10,r6
+    UQSUB16  r5,r5,r6
+    UQSUB16  r4,r4,r6
+    USAT16   r11,#13,r11
+    USAT16   r10,#13,r10
+    USAT16   r5,#13,r5
+    USAT16   r4,#13,r4
+    AND      r11,r12,r11,LSR #5
+    AND      r10,r12,r10,LSR #5
+    AND      r5,r12,r5,LSR #5
+    AND      r4,r12,r4,LSR #5
+    ORR      r11,r10,r11,LSL #8
+    ORR      r10,r4,r5,LSL #8
+    PKHBT    r4,r10,r11,LSL #16
+    SUBS     lr,lr,#1
+    STR      r4,[r7],#8
+    PKHTB    r5,r11,r10,ASR #16
+    STR      r5,[r7],#4
+    BGT      LoopStart
+    SUB      r0,r7,#0x18
+    MOV      r1,#4
+    POP      {r4-r6,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S
new file mode 100644
index 0000000..a330972
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S
@@ -0,0 +1,164 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+    .func   armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe:
+    PUSH     {r4-r12,lr}
+    VLD1.8   {d0,d1},[r0],r1
+    VMOV.I16 d31,#0x14
+    VMOV.I16 d30,#0x5
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q5,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VMLA.I16 d10,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q6,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VSUB.I16 d10,d10,d8
+    VMLA.I16 d12,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q7,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VSUB.I16 d12,d12,d8
+    VMLA.I16 d14,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q8,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VSUB.I16 d14,d14,d8
+    VMLA.I16 d16,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q9,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VSUB.I16 d16,d16,d8
+    VMLA.I16 d18,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q10,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VSUB.I16 d18,d18,d8
+    VMLA.I16 d20,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q11,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VSUB.I16 d20,d20,d8
+    VMLA.I16 d22,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q12,d0,d1
+    VLD1.8   {d0,d1},[r0],r1
+    VSUB.I16 d22,d22,d8
+    VMLA.I16 d24,d2,d31
+    VMUL.I16 d8,d4,d30
+    VEXT.8   d4,d0,d1,#1
+    VEXT.8   d2,d0,d1,#2
+    VEXT.8   d3,d0,d1,#3
+    VEXT.8   d5,d0,d1,#4
+    VEXT.8   d1,d0,d1,#5
+    VADDL.U8 q1,d2,d3
+    VADDL.U8 q2,d4,d5
+    VADDL.U8 q13,d0,d1
+    VSUB.I16 d24,d24,d8
+    VMLA.I16 d26,d2,d31
+    VMUL.I16 d8,d4,d30
+    VMOV.I32 q15,#0x14
+    VMOV.I32 q14,#0x5
+    VADDL.S16 q5,d10,d20
+    VADDL.S16 q1,d14,d16
+    VADDL.S16 q0,d12,d18
+    VSUB.I16 d26,d26,d8
+    VMLA.I32 q5,q1,q15
+    VMUL.I32 q4,q0,q14
+    VADDL.S16 q6,d12,d22
+    VADDL.S16 q1,d16,d18
+    VADDL.S16 q0,d14,d20
+    VMLA.I32 q6,q1,q15
+    VSUB.I32 q5,q5,q4
+    VMUL.I32 q4,q0,q14
+    VADDL.S16 q2,d14,d24
+    VADDL.S16 q1,d18,d20
+    VADDL.S16 q0,d16,d22
+    VMLA.I32 q2,q1,q15
+    VSUB.I32 q6,q6,q4
+    VMUL.I32 q4,q0,q14
+    VADDL.S16 q3,d16,d26
+    VADDL.S16 q1,d20,d22
+    VADDL.S16 q0,d18,d24
+    VMLA.I32 q3,q1,q15
+    VSUB.I32 q2,q2,q4
+    VMLS.I32 q3,q0,q14
+    VQRSHRUN.S32 d0,q5,#10
+    VQRSHRUN.S32 d2,q6,#10
+    VQRSHRUN.S32 d4,q2,#10
+    VQRSHRUN.S32 d6,q3,#10
+    VQMOVN.U16 d0,q0
+    VQMOVN.U16 d2,q1
+    VQMOVN.U16 d4,q2
+    VQMOVN.U16 d6,q3
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S
new file mode 100644
index 0000000..991c33f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S
@@ -0,0 +1,119 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+    .func   armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe:
+    PUSH     {r4-r12,lr}
+    VLD1.8   {d0,d1},[r0],r1
+    ADD      r12,r0,r1,LSL #2
+    VMOV.I8  d30,#0x5
+    VMOV.I8  d31,#0x14
+    VLD1.8   {d10,d11},[r12],r1
+    VLD1.8   {d2,d3},[r0],r1
+    VLD1.8   {d12,d13},[r12],r1
+    VADDL.U8 q9,d0,d10
+    VLD1.8   {d4,d5},[r0],r1
+    VADDL.U8 q0,d1,d11
+    VLD1.8   {d6,d7},[r0],r1
+    VADDL.U8 q10,d2,d12
+    VLD1.8   {d8,d9},[r0],r1
+    VMLAL.U8 q9,d4,d31
+    VLD1.8   {d14,d15},[r12],r1
+    VMLAL.U8 q0,d5,d31
+    VLD1.8   {d16,d17},[r12],r1
+    VMLAL.U8 q9,d6,d31
+    VMLAL.U8 q10,d6,d31
+    VMLSL.U8 q0,d3,d30
+    VADDL.U8 q11,d4,d14
+    VMLSL.U8 q9,d2,d30
+    VADDL.U8 q1,d3,d13
+    VMLAL.U8 q0,d7,d31
+    VMLAL.U8 q10,d8,d31
+    VMLSL.U8 q9,d8,d30
+    VMLAL.U8 q1,d7,d31
+    VMLSL.U8 q0,d9,d30
+    VMLAL.U8 q11,d8,d31
+    VMLSL.U8 q10,d4,d30
+    VMLSL.U8 q1,d5,d30
+    VADDL.U8 q2,d5,d15
+    VMLAL.U8 q11,d10,d31
+    VMLSL.U8 q10,d10,d30
+    VMLAL.U8 q1,d9,d31
+    VMLAL.U8 q2,d9,d31
+    VADDL.U8 q12,d6,d16
+    VMLSL.U8 q11,d6,d30
+    VMLSL.U8 q1,d11,d30
+    VMLSL.U8 q2,d7,d30
+    VADDL.U8 q3,d7,d17
+    VMLAL.U8 q12,d10,d31
+    VMLSL.U8 q11,d12,d30
+    VMLSL.U8 q2,d13,d30
+    VMLAL.U8 q3,d11,d31
+    VMLAL.U8 q12,d12,d31
+    VEXT.8   d26,d18,d19,#2
+    VMLAL.U8 q2,d11,d31
+    VMLAL.U8 q3,d13,d31
+    VMLSL.U8 q12,d8,d30
+    VEXT.8   d27,d18,d19,#4
+    VMOV.I16 d31,#0x14
+    VMLSL.U8 q3,d9,d30
+    VMLSL.U8 q12,d14,d30
+    VEXT.8   d29,d19,d0,#2
+    VEXT.8   d28,d18,d19,#6
+    VMLSL.U8 q3,d15,d30
+    VADDL.S16 q0,d18,d29
+    VADD.I16 d27,d27,d28
+    VMOV.I16 d30,#0x5
+    VADD.I16 d26,d26,d19
+    VMLAL.S16 q0,d27,d31
+    VEXT.8   d27,d20,d21,#4
+    VEXT.8   d28,d20,d21,#6
+    VEXT.8   d29,d21,d2,#2
+    VMLSL.S16 q0,d26,d30
+    VEXT.8   d26,d20,d21,#2
+    VADDL.S16 q1,d20,d29
+    VADD.I16 d27,d27,d28
+    VADD.I16 d26,d26,d21
+    VEXT.8   d28,d22,d23,#6
+    VMLAL.S16 q1,d27,d31
+    VEXT.8   d29,d23,d4,#2
+    VEXT.8   d27,d22,d23,#4
+    VEXT.8   d8,d22,d23,#2
+    VADDL.S16 q2,d22,d29
+    VMLSL.S16 q1,d26,d30
+    VADD.I16 d27,d27,d28
+    VADD.I16 d26,d8,d23
+    VEXT.8   d28,d24,d25,#6
+    VMLAL.S16 q2,d27,d31
+    VEXT.8   d27,d24,d25,#4
+    VEXT.8   d29,d25,d6,#2
+    VADD.I16 d27,d27,d28
+    VEXT.8   d8,d24,d25,#2
+    VADDL.S16 q3,d24,d29
+    VMLSL.S16 q2,d26,d30
+    VMLAL.S16 q3,d27,d31
+    VADD.I16 d8,d8,d25
+    VMLSL.S16 q3,d8,d30
+    VQRSHRUN.S32 d0,q0,#10
+    VQRSHRUN.S32 d2,q1,#10
+    VQRSHRUN.S32 d4,q2,#10
+    VQRSHRUN.S32 d6,q3,#10
+    VQMOVN.U16 d0,q0
+    VQMOVN.U16 d2,q1
+    VQMOVN.U16 d4,q2
+    VQMOVN.U16 d6,q3
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S
new file mode 100644
index 0000000..40e141b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S
@@ -0,0 +1,72 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    .func   armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe:
+    PUSH     {r4-r12,lr}
+    VLD1.8   {d22,d23},[r0],r1
+    VEXT.8   d10,d22,d23,#5
+    VEXT.8   d12,d22,d23,#1
+    VEXT.8   d14,d22,d23,#2
+    VEXT.8   d15,d22,d23,#3
+    VEXT.8   d13,d22,d23,#4
+    VADDL.U8 q11,d22,d10
+    VADDL.U8 q4,d14,d15
+    VADDL.U8 q6,d12,d13
+    VLD1.8   {d24,d25},[r0],r1
+    VMLA.I16 d22,d8,d31
+    VMUL.I16 d8,d12,d30
+    VEXT.8   d10,d24,d25,#5
+    VEXT.8   d12,d24,d25,#1
+    VEXT.8   d16,d24,d25,#2
+    VEXT.8   d17,d24,d25,#3
+    VEXT.8   d13,d24,d25,#4
+    VADDL.U8 q12,d24,d10
+    VSUB.I16 d22,d22,d8
+    VADDL.U8 q4,d16,d17
+    VADDL.U8 q6,d12,d13
+    VLD1.8   {d26,d27},[r0],r1
+    VMLA.I16 d24,d8,d31
+    VMUL.I16 d8,d12,d30
+    VEXT.8   d10,d26,d27,#5
+    VEXT.8   d12,d26,d27,#1
+    VEXT.8   d18,d26,d27,#2
+    VEXT.8   d19,d26,d27,#3
+    VEXT.8   d13,d26,d27,#4
+    VADDL.U8 q13,d26,d10
+    VSUB.I16 d24,d24,d8
+    VADDL.U8 q4,d18,d19
+    VADDL.U8 q6,d12,d13
+    VLD1.8   {d28,d29},[r0],r1
+    VMLA.I16 d26,d8,d31
+    VMUL.I16 d8,d12,d30
+    VEXT.8   d10,d28,d29,#5
+    VEXT.8   d12,d28,d29,#1
+    VEXT.8   d20,d28,d29,#2
+    VEXT.8   d21,d28,d29,#3
+    VEXT.8   d13,d28,d29,#4
+    VADDL.U8 q14,d28,d10
+    VSUB.I16 d26,d26,d8
+    VADDL.U8 q4,d20,d21
+    VADDL.U8 q6,d12,d13
+    VMLA.I16 d28,d8,d31
+    VMLS.I16 d28,d12,d30
+    VQRSHRUN.S16 d22,q11,#5
+    VQRSHRUN.S16 d24,q12,#5
+    VQRSHRUN.S16 d26,q13,#5
+    VQRSHRUN.S16 d28,q14,#5
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S
new file mode 100644
index 0000000..955846f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S
@@ -0,0 +1,58 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    .func   armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe:
+    PUSH     {r4-r12,lr}
+    VLD1.8   {d7},[r0],r1
+    ADD      r12,r0,r1,LSL #2
+    VLD1.8   {d8},[r0],r1
+    VLD1.8   {d12},[r12],r1
+    VLD1.8   {d9},[r0],r1
+    VADDL.U8 q0,d7,d12
+    VLD1.8   {d10},[r0],r1
+    VLD1.8   {d13},[r12],r1
+    VLD1.8   {d11},[r0],r1
+    VLD1.8   {d14},[r12],r1
+    VADDL.U8 q8,d8,d11
+    VADDL.U8 q9,d9,d10
+    VLD1.8   {d15},[r12],r1
+    VMLS.I16 d0,d16,d30
+    VMUL.I16 d20,d18,d31
+    VADDL.U8 q8,d9,d12
+    VADDL.U8 q9,d10,d11
+    VADDL.U8 q1,d8,d13
+    VMLS.I16 d2,d16,d30
+    VMUL.I16 d21,d18,d31
+    VADDL.U8 q8,d10,d13
+    VADDL.U8 q9,d11,d12
+    VADDL.U8 q2,d9,d14
+    VMLS.I16 d4,d16,d30
+    VMUL.I16 d22,d18,d31
+    VADDL.U8 q8,d11,d14
+    VADDL.U8 q3,d10,d15
+    VADDL.U8 q9,d12,d13
+    VMLS.I16 d6,d16,d30
+    VADD.I16 d0,d0,d20
+    VADD.I16 d2,d2,d21
+    VADD.I16 d4,d4,d22
+    VMLA.I16 d6,d18,d31
+    VQRSHRUN.S16 d0,q0,#5
+    VQRSHRUN.S16 d2,q1,#5
+    VQRSHRUN.S16 d4,q2,#5
+    VQRSHRUN.S16 d6,q3,#5
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S
new file mode 100644
index 0000000..66520da
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S
@@ -0,0 +1,175 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+
+    .section .rodata
+    .align 4
+
+armVCM4P10_WidthBranchTableMVIsNotZero:
+    .word   WidthIs2MVIsNotZero, WidthIs2MVIsNotZero
+    .word   WidthIs4MVIsNotZero, WidthIs4MVIsNotZero
+    .word   WidthIs8MVIsNotZero
+
+armVCM4P10_WidthBranchTableMVIsZero:
+    .word   WidthIs2MVIsZero, WidthIs2MVIsZero
+    .word   WidthIs4MVIsZero, WidthIs4MVIsZero
+    .word   WidthIs8MVIsZero
+
+    .text
+
+    .global armVCM4P10_Interpolate_Chroma
+    .func   armVCM4P10_Interpolate_Chroma
+armVCM4P10_Interpolate_Chroma:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d15}
+    LDRD     r6,r7,[sp,#0x70]
+    LDRD     r4,r5,[sp,#0x68]
+    RSB      r8,r6,#8
+    RSB      r9,r7,#8
+    CMN      r6,r7
+    MOV      r10,#1
+    LDREQ    r11, =armVCM4P10_WidthBranchTableMVIsZero
+    SUB      lr,r1,r10
+    LDRNE    r11, =armVCM4P10_WidthBranchTableMVIsNotZero
+    VLD1.8   {d0},[r0],r10
+    SMULBB   r12,r8,r9
+    SMULBB   r9,r6,r9
+    VLD1.8   {d1},[r0],lr
+    SMULBB   r8,r8,r7
+    SMULBB   r6,r6,r7
+    VDUP.8   d12,r12
+    VDUP.8   d13,r9
+    VDUP.8   d14,r8
+    VDUP.8   d15,r6
+    LDR      pc,[r11,r4,LSL #1]
+
+WidthIs8MVIsNotZero:
+    VLD1.8   {d2},[r0],r10
+    VMULL.U8 q2,d0,d12
+    VLD1.8   {d3},[r0],lr
+    VMULL.U8 q3,d2,d12
+    VLD1.8   {d16},[r0],r10
+    VMLAL.U8 q2,d1,d13
+    VLD1.8   {d17},[r0],lr
+    VMULL.U8 q11,d16,d12
+    VMLAL.U8 q3,d3,d13
+    VLD1.8   {d18},[r0],r10
+    VMLAL.U8 q2,d2,d14
+    VMLAL.U8 q11,d17,d13
+    VMULL.U8 q12,d18,d12
+    VLD1.8   {d19},[r0],lr
+    VMLAL.U8 q3,d16,d14
+    VLD1.8   {d0},[r0],r10
+    VMLAL.U8 q12,d19,d13
+    VMLAL.U8 q11,d18,d14
+    VMLAL.U8 q2,d3,d15
+    VLD1.8   {d1},[r0],lr
+    VMLAL.U8 q12,d0,d14
+    VMLAL.U8 q3,d17,d15
+    VMLAL.U8 q11,d19,d15
+    SUBS     r5,r5,#4
+    VMLAL.U8 q12,d1,d15
+    VQRSHRN.U16 d8,q2,#6
+    VQRSHRN.U16 d9,q3,#6
+    VQRSHRN.U16 d20,q11,#6
+    VST1.64  {d8},[r2],r3
+    VQRSHRN.U16 d21,q12,#6
+    VST1.64  {d9},[r2],r3
+    VST1.64  {d20},[r2],r3
+    VST1.64  {d21},[r2],r3
+    BGT      WidthIs8MVIsNotZero
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+
+WidthIs4MVIsNotZero:
+    VLD1.8   {d2},[r0],r10
+    VMULL.U8 q2,d0,d12
+    VMULL.U8 q3,d2,d12
+    VLD1.8   {d3},[r0],lr
+    VMLAL.U8 q2,d1,d13
+    VMLAL.U8 q3,d3,d13
+    VLD1.8   {d0},[r0],r10
+    VMLAL.U8 q2,d2,d14
+    VMLAL.U8 q3,d0,d14
+    VLD1.8   {d1},[r0],lr
+    SUBS     r5,r5,#2
+    VMLAL.U8 q3,d1,d15
+    VMLAL.U8 q2,d3,d15
+    VQRSHRN.U16 d9,q3,#6
+    VQRSHRN.U16 d8,q2,#6
+    VST1.32  {d8[0]},[r2],r3
+    VST1.32  {d9[0]},[r2],r3
+    BGT      WidthIs4MVIsNotZero
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+
+WidthIs2MVIsNotZero:
+    VLD1.8   {d2},[r0],r10
+    VMULL.U8 q2,d0,d12
+    VMULL.U8 q3,d2,d12
+    VLD1.8   {d3},[r0],lr
+    VMLAL.U8 q2,d1,d13
+    VMLAL.U8 q3,d3,d13
+    VLD1.8   {d0},[r0],r10
+    VMLAL.U8 q2,d2,d14
+    VMLAL.U8 q3,d0,d14
+    VLD1.8   {d1},[r0],lr
+    SUBS     r5,r5,#2
+    VMLAL.U8 q3,d1,d15
+    VMLAL.U8 q2,d3,d15
+    VQRSHRN.U16 d9,q3,#6
+    VQRSHRN.U16 d8,q2,#6
+    VST1.16  {d8[0]},[r2],r3
+    VST1.16  {d9[0]},[r2],r3
+    BGT      WidthIs2MVIsNotZero
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+
+WidthIs8MVIsZero:
+    SUB      r0,r0,r1
+WidthIs8LoopMVIsZero:
+    VLD1.8   {d0},[r0],r1
+    SUBS     r5,r5,#2
+    VLD1.8   {d1},[r0],r1
+    VST1.64  {d0},[r2],r3
+    VST1.64  {d1},[r2],r3
+    BGT      WidthIs8LoopMVIsZero
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+
+WidthIs4MVIsZero:
+    VLD1.8   {d1},[r0],r1
+    SUBS     r5,r5,#2
+    VST1.32  {d0[0]},[r2],r3
+    VLD1.8   {d0},[r0],r1
+    VST1.32  {d1[0]},[r2],r3
+    BGT      WidthIs4MVIsZero
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+
+WidthIs2MVIsZero:
+    VLD1.8   {d1},[r0],r1
+    SUBS     r5,r5,#2
+    VST1.16  {d0[0]},[r2],r3
+    VLD1.8   {d0},[r0],r1
+    VST1.16  {d1[0]},[r2],r3
+    BGT      WidthIs2MVIsZero
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S
new file mode 100644
index 0000000..f5d6d1f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S
@@ -0,0 +1,68 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .section .rodata
+    .align 4
+
+    .global armVCM4P10_MFMatrixQPModTable
+    .global armVCM4P10_QPDivIntraTable
+    .global armVCM4P10_QPDivPlusOneTable
+
+;//------------------------------------------------------------------
+;// This table contains (1 << QbitsPlusOne) / 3 Values (Intra case) ,
+;// for values of iQP from 0 to 51 (inclusive).
+;//------------------------------------------------------------------
+
+
+armVCM4P10_QPDivIntraTable:
+    .word 21845, 21845, 21845, 21845, 21845, 21845
+    .word 43690, 43690, 43690, 43690, 43690, 43690
+    .word 87381, 87381, 87381, 87381, 87381, 87381
+    .word 174762, 174762, 174762, 174762, 174762, 174762
+    .word 349525, 349525, 349525, 349525, 349525, 349525
+    .word 699050, 699050, 699050, 699050, 699050, 699050
+    .word 1398101, 1398101, 1398101, 1398101, 1398101, 1398101
+    .word 2796202, 2796202, 2796202, 2796202, 2796202, 2796202
+
+
+;//--------------------------------------------------------------
+;// This table contains armVCM4P10_MFMatrix [iQP % 6][0] entires,
+;// for values of iQP from 0 to 51 (inclusive).
+;//--------------------------------------------------------------
+
+armVCM4P10_MFMatrixQPModTable:
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+    .hword 13107, 11916, 10082, 9362, 8192, 7282
+
+;//---------------------------------------------------------------
+;// This table contains ARM_M4P10_Q_OFFSET + 1 + (iQP / 6) values,
+;// for values of iQP from 0 to 51 (inclusive).
+;//---------------------------------------------------------------
+
+armVCM4P10_QPDivPlusOneTable:
+    .byte 16, 16, 16, 16, 16, 16
+    .byte 17, 17, 17, 17, 17, 17
+    .byte 18, 18, 18, 18, 18, 18
+    .byte 19, 19, 19, 19, 19, 19
+    .byte 20, 20, 20, 20, 20, 20
+    .byte 21, 21, 21, 21, 21, 21
+    .byte 22, 22, 22, 22, 22, 22
+    .byte 23, 23, 23, 23, 23, 23
+    .byte 24, 24, 24, 24, 24, 24
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S
new file mode 100644
index 0000000..c24d717
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S
@@ -0,0 +1,52 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_TransformResidual4x4
+    .func   armVCM4P10_TransformResidual4x4
+armVCM4P10_TransformResidual4x4:
+    VPUSH    {d8}
+    VLD4.16  {d0,d1,d2,d3},[r1]
+    VMOV.I16 d4,#0
+    VADD.I16 d5,d0,d2
+    VSUB.I16 d6,d0,d2
+    VHADD.S16 d7,d1,d4
+    VHADD.S16 d8,d3,d4
+    VSUB.I16 d7,d7,d3
+    VADD.I16 d8,d1,d8
+    VADD.I16 d0,d5,d8
+    VADD.I16 d1,d6,d7
+    VSUB.I16 d2,d6,d7
+    VSUB.I16 d3,d5,d8
+    VTRN.16  d0,d1
+    VTRN.16  d2,d3
+    VTRN.32  q0,q1
+    VADD.I16 d5,d0,d2
+    VSUB.I16 d6,d0,d2
+    VHADD.S16 d7,d1,d4
+    VHADD.S16 d8,d3,d4
+    VSUB.I16 d7,d7,d3
+    VADD.I16 d8,d1,d8
+    VADD.I16 d0,d5,d8
+    VADD.I16 d1,d6,d7
+    VSUB.I16 d2,d6,d7
+    VSUB.I16 d3,d5,d8
+    VRSHR.S16 d0,d0,#6
+    VRSHR.S16 d1,d1,#6
+    VRSHR.S16 d2,d2,#6
+    VRSHR.S16 d3,d3,#6
+    VST1.16  {d0,d1,d2,d3},[r0]
+    VPOP     {d8}
+    BX       lr
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S
new file mode 100644
index 0000000..c552f8d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S
@@ -0,0 +1,40 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_UnpackBlock4x4
+    .func   armVCM4P10_UnpackBlock4x4
+armVCM4P10_UnpackBlock4x4:
+    PUSH     {r4-r8,lr}
+    LDR      r2,[r0,#0]
+    MOV      r7,#0x1f
+    MOV      r4,#0
+    MOV      r5,#0
+    LDRB     r3,[r2],#1
+    STRD     r4,r5,[r1,#0]
+    STRD     r4,r5,[r1,#8]
+    STRD     r4,r5,[r1,#0x10]
+    STRD     r4,r5,[r1,#0x18]
+unpackLoop:
+    TST      r3,#0x10
+    LDRNESB  r5,[r2,#1]
+    LDRNEB   r4,[r2],#2
+    AND      r6,r7,r3,LSL #1
+    LDREQSB  r4,[r2],#1
+    ORRNE    r4,r4,r5,LSL #8
+    TST      r3,#0x20
+    LDREQB   r3,[r2],#1
+    STRH     r4,[r1,r6]
+    BEQ      unpackLoop
+    STR      r2,[r0,#0]
+    POP      {r4-r8,pc}
+    .endfunc
+    .end
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S
new file mode 100644
index 0000000..ba61059
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S
@@ -0,0 +1,67 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_DeblockLuma_I
+    .func   omxVCM4P10_DeblockLuma_I
+omxVCM4P10_DeblockLuma_I:
+    PUSH     {r4-r9,lr}
+    MOVS     r6,r0
+    SUB      sp,sp,#0xc
+    MOV      r9,r1
+    MOV      r7,r2
+    MOV      r8,r3
+    LDR      r4,[sp,#0x28]
+    LDR      r5,[sp,#0x2c]
+    BEQ      L0x58
+    TST      r6,#7
+    TSTEQ    r9,#7
+    BNE      L0x58
+    CMP      r7,#0
+    CMPNE    r8,#0
+    CMPNE    r4,#0
+    BEQ      L0x58
+    TST      r4,#3
+    BNE      L0x58
+    CMP      r5,#0
+    BEQ      L0x58
+    TST      r5,#3
+    BEQ      L0x64
+L0x58:
+    MVN      r0,#4
+L0x5c:
+    ADD      sp,sp,#0xc
+    POP      {r4-r9,pc}
+L0x64:
+    STR      r4,[sp,#0]
+    MOV      r3,r8
+    STR      r5,[sp,#4]
+    MOV      r2,r7
+    MOV      r1,r9
+    MOV      r0,r6
+    BL       omxVCM4P10_FilterDeblockingLuma_VerEdge_I
+    CMP      r0,#0
+    BNE      L0x5c
+    ADD      r3,r5,#0x10
+    ADD      r2,r4,#0x10
+    STR      r3,[sp,#4]
+    STR      r2,[sp,#0]
+    ADD      r3,r8,#2
+    ADD      r2,r7,#2
+    MOV      r1,r9
+    MOV      r0,r6
+    BL       omxVCM4P10_FilterDeblockingLuma_HorEdge_I
+    ADD      sp,sp,#0xc
+    POP      {r4-r9,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S
new file mode 100644
index 0000000..be21ee7
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S
@@ -0,0 +1,119 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_DequantTransformResidualFromPairAndAdd
+    .func   omxVCM4P10_DequantTransformResidualFromPairAndAdd
+omxVCM4P10_DequantTransformResidualFromPairAndAdd:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d9}
+    SUB      sp,sp,#0x20
+    ADD      r4,sp,#0
+    LDR      r5,[sp,#0x64]
+    MOV      r7,r1
+    MOV      r8,r2
+    MOV      r9,r3
+    CMP      r5,#0
+    BEQ      L0x114
+    MOV      r1,r4
+    BL       armVCM4P10_UnpackBlock4x4  ;//
+    LDR      r1,[sp,#0x60]
+    LDR      r11, =armVCM4P10_QPModuloTable
+    LDR      r10, =armVCM4P10_QPDivTable
+    LDR      r2,  =armVCM4P10_VMatrixU16
+    LDRSB    r12,[r11,r1]
+    LDRSB    lr,[r10,r1]
+    LDR      r10, =0x3020504
+    LDR      r1, =0x5040100
+    ADD      r2,r2,r12
+    VDUP.32  d7,r1
+    VDUP.32  d9,r10
+    VDUP.16  d5,lr
+    VLD1.8   {d6},[r2]
+    VTBL.8   d8,{d6},d7
+    VTBL.8   d4,{d6},d9
+    CMP      r8,#0
+    VLD1.16  {d0,d1,d2,d3},[r4]
+    VSHL.U16 d8,d8,d5
+    VSHL.U16 d4,d4,d5
+    BEQ      L1
+    LDRSH    r10,[r8,#0]
+L1:
+    VMUL.I16 d0,d0,d8
+    VMUL.I16 d1,d1,d4
+    VMUL.I16 d2,d2,d8
+    VMUL.I16 d3,d3,d4
+    VMOVNE.16 d0[0],r10
+    VTRN.16  d0,d1
+    VTRN.16  d2,d3
+    VTRN.32  q0,q1
+    VMOV.I16 d4,#0
+    VADD.I16 d5,d0,d2
+    VSUB.I16 d6,d0,d2
+    VHADD.S16 d7,d1,d4
+    VHADD.S16 d8,d3,d4
+    VSUB.I16 d7,d7,d3
+    VADD.I16 d8,d1,d8
+    VADD.I16 d0,d5,d8
+    VADD.I16 d1,d6,d7
+    VSUB.I16 d2,d6,d7
+    VSUB.I16 d3,d5,d8
+    VTRN.16  d0,d1
+    VTRN.16  d2,d3
+    VTRN.32  q0,q1
+    VADD.I16 d5,d0,d2
+    VSUB.I16 d6,d0,d2
+    VHADD.S16 d7,d1,d4
+    VHADD.S16 d8,d3,d4
+    VSUB.I16 d7,d7,d3
+    VADD.I16 d8,d1,d8
+    VADD.I16 d0,d5,d8
+    VADD.I16 d1,d6,d7
+    VSUB.I16 d2,d6,d7
+    VSUB.I16 d3,d5,d8
+    VRSHR.S16 d0,d0,#6
+    VRSHR.S16 d1,d1,#6
+    VRSHR.S16 d2,d2,#6
+    VRSHR.S16 d3,d3,#6
+    B        L0x130
+L0x114:
+    LDRSH    r10,[r8,#0]
+    ADD      r10,r10,#0x20
+    ASR      r10,r10,#6
+    VDUP.16  d0,r10
+    VDUP.16  d1,r10
+    VDUP.16  d2,r10
+    VDUP.16  d3,r10
+L0x130:
+    LDR      r1,[sp,#0x58]
+    LDR      r10,[sp,#0x5c]
+    LDR      r3,[r7],r1
+    LDR      r5,[r7],r1
+    VMOV     d4,r3,r5
+    LDR      r3,[r7],r1
+    LDR      r5,[r7,#0]
+    VMOV     d5,r3,r5
+    VADDW.U8 q3,q0,d4
+    VADDW.U8 q4,q1,d5
+    VQMOVUN.S16 d0,q3
+    VQMOVUN.S16 d1,q4
+    VST1.32  {d0[0]},[r9],r10
+    VST1.32  {d0[1]},[r9],r10
+    VST1.32  {d1[0]},[r9],r10
+    VST1.32  {d1[1]},[r9]
+    MOV      r0,#0
+    ADD      sp,sp,#0x20
+    VPOP     {d8-d9}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S
new file mode 100644
index 0000000..79ba538
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S
@@ -0,0 +1,87 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_FilterDeblockingChroma_HorEdge_I
+    .func   omxVCM4P10_FilterDeblockingChroma_HorEdge_I
+omxVCM4P10_FilterDeblockingChroma_HorEdge_I:
+    PUSH     {r4-r10,lr}
+    VPUSH    {d8-d15}
+    VLD1.8   {d0[]},[r2]!
+    SUB      r0,r0,r1,LSL #1
+    SUB      r0,r0,r1
+    VLD1.8   {d2[]},[r3]!
+    LDR      r4,[sp,#0x64]
+    LDR      r5,[sp,#0x60]
+    LDR      r9, =0x3030303
+    LDR      r8, =0x4040404
+    VMOV.I8  d14,#0
+    VMOV.I8  d15,#0x1
+    VMOV.I16 d1,#0x4
+    MOV      r7,#0x40000000
+L0x38:
+    LDR      r6,[r4],#8
+    VLD1.8   {d6},[r0],r1
+    VLD1.8   {d5},[r0],r1
+    CMP      r6,#0
+    VLD1.8   {d4},[r0],r1
+    VLD1.8   {d8},[r0],r1
+    VABD.U8  d19,d6,d4
+    VLD1.8   {d9},[r0],r1
+    VABD.U8  d13,d4,d8
+    VLD1.8   {d10},[r0],r1
+    BEQ      L0xe4
+    VABD.U8  d12,d5,d4
+    VABD.U8  d18,d9,d8
+    VCGT.U8  d16,d0,d13
+    VMOV.32  d26[0],r6
+    VMAX.U8  d12,d18,d12
+    VMOVL.U8 q13,d26
+    VABD.U8  d17,d10,d8
+    VCGT.S16 d27,d26,#0
+    VCGT.U8  d12,d2,d12
+    VCGT.U8  d19,d2,d19
+    VAND     d16,d16,d27
+    TST      r6,r9
+    VCGT.U8  d17,d2,d17
+    VAND     d16,d16,d12
+    VAND     d12,d16,d17
+    VAND     d17,d16,d19
+    BLNE     armVCM4P10_DeblockingChromabSLT4_unsafe
+    TST      r6,r8
+    SUB      r0,r0,r1,LSL #2
+    VTST.16  d26,d26,d1
+    BLNE     armVCM4P10_DeblockingChromabSGE4_unsafe
+    VBIT     d29,d13,d26
+    VBIT     d24,d31,d26
+    VBIF     d29,d4,d16
+    VBIF     d24,d8,d16
+    VST1.8   {d29},[r0],r1
+    ADDS     r7,r7,r7
+    VST1.8   {d24},[r0],r1
+    BNE      L0x38
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r10,pc}
+L0xe4:
+    VLD1.8   {d0[]},[r2]
+    SUB      r0,r0,r1,LSL #1
+    ADDS     r7,r7,r7
+    VLD1.8   {d2[]},[r3]
+    ADD      r5,r5,#4
+    BNE      L0x38
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r10,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S
new file mode 100644
index 0000000..dcdddbe
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S
@@ -0,0 +1,123 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_FilterDeblockingChroma_VerEdge_I
+    .func   omxVCM4P10_FilterDeblockingChroma_VerEdge_I
+omxVCM4P10_FilterDeblockingChroma_VerEdge_I:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d15}
+    VLD1.8   {d0[]},[r2]!
+    SUB      r0,r0,#4
+    VLD1.8   {d2[]},[r3]!
+    LDR      r4,[sp,#0x6c]
+    LDR      r5,[sp,#0x68]
+    LDR      r8, =0x4040404
+    LDR      r9, =0x3030303
+    VMOV.I8  d14,#0
+    VMOV.I8  d15,#0x1
+    VMOV.I16 d1,#0x4
+    MOV      r7,#0x40000000
+L0x34:
+    LDR      r6,[r4],#8
+    ADD      r10,r0,r1
+    ADD      lr,r1,r1
+    VLD1.8   {d7},[r0],lr
+    VLD1.8   {d8},[r10],lr
+    VLD1.8   {d5},[r0],lr
+    VLD1.8   {d10},[r10],lr
+    VLD1.8   {d6},[r0],lr
+    VLD1.8   {d9},[r10],lr
+    VLD1.8   {d4},[r0],lr
+    VLD1.8   {d11},[r10],lr
+    VZIP.8   d7,d8
+    VZIP.8   d5,d10
+    VZIP.8   d6,d9
+    VZIP.8   d4,d11
+    VZIP.16  d7,d5
+    VZIP.16  d8,d10
+    VZIP.16  d6,d4
+    VZIP.16  d9,d11
+    VTRN.32  d7,d6
+    VTRN.32  d5,d4
+    VTRN.32  d10,d11
+    VTRN.32  d8,d9
+    CMP      r6,#0
+    VABD.U8  d19,d6,d4
+    VABD.U8  d13,d4,d8
+    BEQ      L0x170
+    VABD.U8  d12,d5,d4
+    VABD.U8  d18,d9,d8
+    VMOV.32  d26[0],r6
+    VCGT.U8  d16,d0,d13
+    VMAX.U8  d12,d18,d12
+    VMOVL.U8 q13,d26
+    VABD.U8  d17,d10,d8
+    VCGT.S16 d27,d26,#0
+    VCGT.U8  d12,d2,d12
+    VCGT.U8  d19,d2,d19
+    VAND     d16,d16,d27
+    TST      r6,r9
+    VCGT.U8  d17,d2,d17
+    VAND     d16,d16,d12
+    VAND     d12,d16,d17
+    VAND     d17,d16,d19
+    BLNE     armVCM4P10_DeblockingChromabSLT4_unsafe
+    TST      r6,r8
+    SUB      r0,r0,r1,LSL #3
+    VTST.16  d26,d26,d1
+    BLNE     armVCM4P10_DeblockingChromabSGE4_unsafe
+    VBIT     d29,d13,d26
+    VBIT     d24,d31,d26
+    ADD      r10,r0,#3
+    VBIF     d29,d4,d16
+    ADD      r12,r10,r1
+    ADD      lr,r1,r1
+    VBIF     d24,d8,d16
+    ADDS     r7,r7,r7
+    VST1.8   {d29[0]},[r10],lr
+    VST1.8   {d29[1]},[r12],lr
+    VST1.8   {d29[2]},[r10],lr
+    VST1.8   {d29[3]},[r12],lr
+    VST1.8   {d29[4]},[r10],lr
+    VST1.8   {d29[5]},[r12],lr
+    VST1.8   {d29[6]},[r10],lr
+    VST1.8   {d29[7]},[r12],lr
+    ADD      r12,r0,#4
+    ADD      r10,r12,r1
+    VST1.8   {d24[0]},[r12],lr
+    VST1.8   {d24[1]},[r10],lr
+    VST1.8   {d24[2]},[r12],lr
+    VST1.8   {d24[3]},[r10],lr
+    VST1.8   {d24[4]},[r12],lr
+    VST1.8   {d24[5]},[r10],lr
+    VST1.8   {d24[6]},[r12],lr
+    VST1.8   {d24[7]},[r10],lr
+    ADD      r0,r0,#4
+    BNE      L0x34
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+L0x170:
+    VLD1.8   {d0[]},[r2]
+    ADD      r0,r0,#4
+    SUB      r0,r0,r1,LSL #3
+    ADDS     r7,r7,r7
+    VLD1.8   {d2[]},[r3]
+    ADD      r5,r5,#4
+    BNE      L0x34
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S
new file mode 100644
index 0000000..9755899
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S
@@ -0,0 +1,107 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_FilterDeblockingLuma_HorEdge_I
+    .func   omxVCM4P10_FilterDeblockingLuma_HorEdge_I
+omxVCM4P10_FilterDeblockingLuma_HorEdge_I:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d15}
+    ADD      r7,r2,#1
+    ADD      r8,r3,#1
+    VLD1.8   {d0[]},[r2]
+    SUB      r0,r0,r1,LSL #2
+    VLD1.8   {d2[]},[r3]
+    LDR      r4,[sp,#0x6c]
+    LDR      r5,[sp,#0x68]
+    MOV      r11,#0
+    VMOV.I8  d14,#0
+    VMOV.I8  d15,#0x1
+    ADD      r10,r1,r1
+    MOV      r9,#0x55000000
+L0x38:
+    LDRH     r12,[r4],#2
+    ADD      r6,r0,r1
+    CMP      r12,#0
+    BEQ      L0xe4
+    VLD1.8   {d7},[r0],r10
+    VLD1.8   {d6},[r6],r10
+    VLD1.8   {d5},[r0],r10
+    VLD1.8   {d4},[r6],r10
+    VLD1.8   {d8},[r0],r10
+    VABD.U8  d12,d4,d5
+    VLD1.8   {d9},[r6]
+    VABD.U8  d13,d8,d4
+    VLD1.8   {d10},[r0],r1
+    VABD.U8  d18,d9,d8
+    VABD.U8  d19,d6,d4
+    VCGT.U8  d16,d0,d13
+    TST      r12,#0xff
+    VMAX.U8  d12,d18,d12
+    VABD.U8  d17,d10,d8
+    VMOVEQ.32 d16[0],r11
+    TST      r12,#0xff00
+    VCGT.U8  d19,d2,d19
+    VCGT.U8  d12,d2,d12
+    VMOVEQ.32 d16[1],r11
+    VCGT.U8  d17,d2,d17
+    VLD1.8   {d11},[r0]
+    VAND     d16,d16,d12
+    TST      r12,#4
+    VAND     d12,d16,d17
+    VAND     d17,d16,d19
+    BNE      L0xf8
+    SUB      r0,r0,r1,LSL #2
+    SUB      r0,r0,r1
+    BL       armVCM4P10_DeblockingLumabSLT4_unsafe
+    VST1.8   {d30},[r0],r1
+    VST1.8   {d29},[r0],r1
+    SUB      r6,r0,r1,LSL #2
+    VST1.8   {d24},[r0],r1
+    ADDS     r9,r9,r9
+    VST1.8   {d25},[r0]
+    ADD      r0,r6,#8
+    BCC      L0x38
+    B        L0x130
+L0xe4:
+    ADD      r0,r0,#8
+    ADDS     r9,r9,r9
+    ADD      r5,r5,#2
+    BCC      L0x38
+    B        L0x130
+L0xf8:
+    SUB      r0,r0,r1,LSL #2
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_DeblockingLumabSGE4_unsafe
+    VST1.8   {d31},[r0],r1
+    VST1.8   {d30},[r0],r1
+    VST1.8   {d29},[r0],r1
+    SUB      r6,r0,r1,LSL #2
+    VST1.8   {d24},[r0],r1
+    ADDS     r9,r9,r9
+    VST1.8   {d25},[r0],r1
+    ADD      r5,r5,#2
+    VST1.8   {d28},[r0]
+    ADD      r0,r6,#8
+    BCC      L0x38
+L0x130:
+    SUB      r0,r0,#0x10
+    VLD1.8   {d0[]},[r7]
+    ADD      r0,r0,r1,LSL #2
+    VLD1.8   {d2[]},[r8]
+    BNE      L0x38
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S
new file mode 100644
index 0000000..66cc32e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S
@@ -0,0 +1,157 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_FilterDeblockingLuma_VerEdge_I
+    .func   omxVCM4P10_FilterDeblockingLuma_VerEdge_I
+omxVCM4P10_FilterDeblockingLuma_VerEdge_I:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d15}
+    ADD      r7,r2,#1
+    ADD      r8,r3,#1
+    VLD1.8   {d0[]},[r2]
+    SUB      r0,r0,#4
+    VLD1.8   {d2[]},[r3]
+    LDR      r4,[sp,#0x6c]
+    LDR      r5,[sp,#0x68]
+    MOV      r6,#0
+    VMOV.I8  d14,#0
+    VMOV.I8  d15,#0x1
+    MOV      r9,#0x11000000
+    ADD      r11,r1,r1
+L0x38:
+    LDRH     r12,[r4],#4
+    CMP      r12,#0
+    BEQ      L0x160
+    ADD      r10,r0,r1
+    VLD1.8   {d7},[r0],r11
+    VLD1.8   {d8},[r10],r11
+    VLD1.8   {d5},[r0],r11
+    VZIP.8   d7,d8
+    VLD1.8   {d10},[r10],r11
+    VLD1.8   {d6},[r0],r11
+    VZIP.8   d5,d10
+    VLD1.8   {d9},[r10],r11
+    VLD1.8   {d4},[r0],r11
+    VLD1.8   {d11},[r10],r11
+    VZIP.8   d6,d9
+    VZIP.16  d8,d10
+    VZIP.8   d4,d11
+    SUB      r0,r0,r1,LSL #3
+    VZIP.16  d7,d5
+    VZIP.16  d9,d11
+    VZIP.16  d6,d4
+    VTRN.32  d8,d9
+    VTRN.32  d5,d4
+    VTRN.32  d10,d11
+    VTRN.32  d7,d6
+    VABD.U8  d13,d4,d8
+    VABD.U8  d12,d5,d4
+    VABD.U8  d18,d9,d8
+    VABD.U8  d19,d6,d4
+    TST      r12,#0xff
+    VCGT.U8  d16,d0,d13
+    VMAX.U8  d12,d18,d12
+    VABD.U8  d17,d10,d8
+    VMOVEQ.32 d16[0],r6
+    TST      r12,#0xff00
+    VCGT.U8  d19,d2,d19
+    VCGT.U8  d12,d2,d12
+    VMOVEQ.32 d16[1],r6
+    VCGT.U8  d17,d2,d17
+    VAND     d16,d16,d12
+    TST      r12,#4
+    VAND     d12,d16,d17
+    VAND     d17,d16,d19
+    BNE      L0x17c
+    BL       armVCM4P10_DeblockingLumabSLT4_unsafe
+    VZIP.8   d7,d6
+    VZIP.8   d30,d29
+    VZIP.8   d24,d25
+    VZIP.8   d10,d11
+    VZIP.16  d7,d30
+    ADD      r10,r0,r1
+    VZIP.16  d24,d10
+    VZIP.16  d25,d11
+    VZIP.16  d6,d29
+    VTRN.32  d7,d24
+    VTRN.32  d30,d10
+    VTRN.32  d6,d25
+    VTRN.32  d29,d11
+    VST1.8   {d7},[r0],r11
+    VST1.8   {d24},[r10],r11
+    VST1.8   {d30},[r0],r11
+    VST1.8   {d10},[r10],r11
+    VST1.8   {d6},[r0],r11
+    VST1.8   {d25},[r10],r11
+    ADDS     r9,r9,r9
+    VST1.8   {d29},[r0],r11
+    ADD      r5,r5,#2
+    VST1.8   {d11},[r10],r1
+    SUB      r0,r0,r1,LSL #3
+    VLD1.8   {d0[]},[r7]
+    ADD      r0,r0,#4
+    VLD1.8   {d2[]},[r8]
+    BCC      L0x38
+    B        L0x1f0
+L0x160:
+    ADD      r0,r0,#4
+    ADDS     r9,r9,r9
+    VLD1.8   {d0[]},[r7]
+    ADD      r5,r5,#4
+    VLD1.8   {d2[]},[r8]
+    BCC      L0x38
+    B        L0x1f0
+L0x17c:
+    BL       armVCM4P10_DeblockingLumabSGE4_unsafe
+    VZIP.8   d7,d31
+    VZIP.8   d30,d29
+    VZIP.8   d24,d25
+    VZIP.8   d28,d11
+    VZIP.16  d7,d30
+    ADD      r10,r0,r1
+    VZIP.16  d24,d28
+    VZIP.16  d25,d11
+    VZIP.16  d31,d29
+    VTRN.32  d7,d24
+    VTRN.32  d30,d28
+    VTRN.32  d31,d25
+    VTRN.32  d29,d11
+    VST1.8   {d7},[r0],r11
+    VST1.8   {d24},[r10],r11
+    VST1.8   {d30},[r0],r11
+    VST1.8   {d28},[r10],r11
+    VST1.8   {d31},[r0],r11
+    VST1.8   {d25},[r10],r11
+    ADDS     r9,r9,r9
+    VST1.8   {d29},[r0],r11
+    ADD      r5,r5,#4
+    VST1.8   {d11},[r10],r11
+    SUB      r0,r0,r1,LSL #3
+    VLD1.8   {d0[]},[r7]
+    ADD      r0,r0,#4
+    VLD1.8   {d2[]},[r8]
+    BCC      L0x38
+L0x1f0:
+    SUB      r4,r4,#0xe
+    SUB      r5,r5,#0xe
+    SUB      r0,r0,#0x10
+    VLD1.8   {d0[]},[r2]
+    ADD      r0,r0,r1,LSL #3
+    VLD1.8   {d2[]},[r3]
+    BNE      L0x38
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S
new file mode 100644
index 0000000..76c3d7d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S
@@ -0,0 +1,323 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_InterpolateLuma
+    .func   omxVCM4P10_InterpolateLuma
+omxVCM4P10_InterpolateLuma:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d15}
+    SUB      sp,sp,#0x10
+    LDR      r6,[sp,#0x78]
+    LDR      r7,[sp,#0x7c]
+    LDR      r5,[sp,#0x80]
+    LDR      r4,[sp,#0x84]
+    ADD      r6,r6,r7,LSL #2
+    ADD      r11,sp,#0
+    VMOV.I16 d31,#0x14
+    VMOV.I16 d30,#0x5
+L0x2c:
+    STM      r11,{r0-r3}
+    ADD      pc,pc,r6,LSL #2
+    B        L0x3f0
+    B        L0x78
+    B        L0xa8
+    B        L0xdc
+    B        L0x100
+    B        L0x134
+    B        L0x168
+    B        L0x1a8
+    B        L0x1f0
+    B        L0x234
+    B        L0x258
+    B        L0x2b0
+    B        L0x2d8
+    B        L0x330
+    B        L0x364
+    B        L0x3a8
+    B        L0x3f0
+L0x78:
+    ADD      r12,r0,r1,LSL #1
+    VLD1.8   {d9},[r0],r1
+    VLD1.8   {d11},[r12],r1
+    VLD1.8   {d10},[r0]
+    VLD1.8   {d12},[r12]
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d9[0]},[r2],r3
+    VST1.32  {d11[0]},[r12],r3
+    VST1.32  {d10[0]},[r2]
+    VST1.32  {d12[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0xa8:
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    VRHADD.U8 d22,d22,d14
+    VRHADD.U8 d26,d26,d18
+    VRHADD.U8 d24,d24,d16
+    VRHADD.U8 d28,d28,d20
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d22[0]},[r2],r3
+    VST1.32  {d26[0]},[r12],r3
+    VST1.32  {d24[0]},[r2]
+    VST1.32  {d28[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0xdc:
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d22[0]},[r2],r3
+    VST1.32  {d26[0]},[r12],r3
+    VST1.32  {d24[0]},[r2]
+    VST1.32  {d28[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x100:
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    VRHADD.U8 d22,d22,d15
+    VRHADD.U8 d26,d26,d19
+    VRHADD.U8 d24,d24,d17
+    VRHADD.U8 d28,d28,d21
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d22[0]},[r2],r3
+    VST1.32  {d26[0]},[r12],r3
+    VST1.32  {d24[0]},[r2]
+    VST1.32  {d28[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x134:
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    VRHADD.U8 d0,d0,d9
+    VRHADD.U8 d4,d4,d11
+    VRHADD.U8 d2,d2,d10
+    VRHADD.U8 d6,d6,d12
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x168:
+    MOV      r8,r0
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    SUB      r0,r8,#2
+    BL       armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    VRHADD.U8 d22,d22,d0
+    VRHADD.U8 d26,d26,d4
+    VRHADD.U8 d24,d24,d2
+    VRHADD.U8 d28,d28,d6
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d22[0]},[r2],r3
+    VST1.32  {d26[0]},[r12],r3
+    VST1.32  {d24[0]},[r2]
+    VST1.32  {d28[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x1a8:
+    SUB      r0,r0,r1,LSL #1
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+    VQRSHRUN.S16 d14,q7,#5
+    VQRSHRUN.S16 d16,q8,#5
+    VQRSHRUN.S16 d18,q9,#5
+    VQRSHRUN.S16 d20,q10,#5
+    VRHADD.U8 d0,d0,d14
+    VRHADD.U8 d4,d4,d18
+    VRHADD.U8 d2,d2,d16
+    VRHADD.U8 d6,d6,d20
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x1f0:
+    MOV      r8,r0
+    ADD      r0,r0,#1
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    SUB      r0,r8,#2
+    BL       armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    VRHADD.U8 d22,d22,d0
+    VRHADD.U8 d26,d26,d4
+    VRHADD.U8 d24,d24,d2
+    VRHADD.U8 d28,d28,d6
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d22[0]},[r2],r3
+    VST1.32  {d26[0]},[r12],r3
+    VST1.32  {d24[0]},[r2]
+    VST1.32  {d28[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x234:
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x258:
+    SUB      r0,r0,r1,LSL #1
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+    VEXT.8   d18,d18,d19,#4
+    VEXT.8   d20,d20,d21,#4
+    VEXT.8   d22,d22,d23,#4
+    VEXT.8   d24,d24,d25,#4
+    VQRSHRUN.S16 d14,q9,#5
+    VQRSHRUN.S16 d16,q10,#5
+    VQRSHRUN.S16 d18,q11,#5
+    VQRSHRUN.S16 d20,q12,#5
+    VRHADD.U8 d0,d0,d14
+    VRHADD.U8 d4,d4,d18
+    VRHADD.U8 d2,d2,d16
+    VRHADD.U8 d6,d6,d20
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x2b0:
+    SUB      r0,r0,r1,LSL #1
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x2d8:
+    SUB      r0,r0,r1,LSL #1
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+    VEXT.8   d18,d18,d19,#6
+    VEXT.8   d20,d20,d21,#6
+    VEXT.8   d22,d22,d23,#6
+    VEXT.8   d24,d24,d25,#6
+    VQRSHRUN.S16 d14,q9,#5
+    VQRSHRUN.S16 d16,q10,#5
+    VQRSHRUN.S16 d18,q11,#5
+    VQRSHRUN.S16 d20,q12,#5
+    VRHADD.U8 d0,d0,d14
+    VRHADD.U8 d4,d4,d18
+    VRHADD.U8 d2,d2,d16
+    VRHADD.U8 d6,d6,d20
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x330:
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    VRHADD.U8 d0,d0,d10
+    VRHADD.U8 d4,d4,d12
+    VRHADD.U8 d2,d2,d11
+    VRHADD.U8 d6,d6,d13
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x364:
+    MOV      r8,r0
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    ADD      r0,r8,r1
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    VRHADD.U8 d22,d22,d0
+    VRHADD.U8 d26,d26,d4
+    VRHADD.U8 d24,d24,d2
+    VRHADD.U8 d28,d28,d6
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d22[0]},[r2],r3
+    VST1.32  {d26[0]},[r12],r3
+    VST1.32  {d24[0]},[r2]
+    VST1.32  {d28[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x3a8:
+    SUB      r0,r0,r1,LSL #1
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+    VQRSHRUN.S16 d14,q8,#5
+    VQRSHRUN.S16 d16,q9,#5
+    VQRSHRUN.S16 d18,q10,#5
+    VQRSHRUN.S16 d20,q11,#5
+    VRHADD.U8 d0,d0,d14
+    VRHADD.U8 d4,d4,d18
+    VRHADD.U8 d2,d2,d16
+    VRHADD.U8 d6,d6,d20
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d0[0]},[r2],r3
+    VST1.32  {d4[0]},[r12],r3
+    VST1.32  {d2[0]},[r2]
+    VST1.32  {d6[0]},[r12]
+    ADD      r11,sp,#0
+    B        L0x434
+L0x3f0:
+    MOV      r8,r0
+    ADD      r0,r0,#1
+    SUB      r0,r0,r1,LSL #1
+    BL       armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+    ADD      r0,r8,r1
+    SUB      r0,r0,#2
+    BL       armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+    VRHADD.U8 d22,d22,d0
+    VRHADD.U8 d26,d26,d4
+    VRHADD.U8 d24,d24,d2
+    VRHADD.U8 d28,d28,d6
+    ADD      r12,r2,r3,LSL #1
+    VST1.32  {d22[0]},[r2],r3
+    VST1.32  {d26[0]},[r12],r3
+    VST1.32  {d24[0]},[r2]
+    VST1.32  {d28[0]},[r12]
+    ADD      r11,sp,#0
+L0x434:
+    LDM      r11,{r0-r3}
+    SUBS     r5,r5,#4
+    ADD      r0,r0,#4
+    ADD      r2,r2,#4
+    BGT      L0x2c
+    SUBS     r4,r4,#4
+    LDR      r5,[sp,#0x80]
+    ADD      r11,sp,#0
+    ADD      r0,r0,r1,LSL #2
+    ADD      r2,r2,r3,LSL #2
+    SUB      r0,r0,r5
+    SUB      r2,r2,r5
+    BGT      L0x2c
+    MOV      r0,#0
+    ADD      sp,sp,#0x10
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S
new file mode 100644
index 0000000..0d49e4b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S
@@ -0,0 +1,217 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .section .rodata
+    .align 4
+
+armVCM4P10_pIndexTable8x8:
+    .word  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR
+    .word  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE
+
+armVCM4P10_MultiplierTableChroma8x8:
+    .hword   3, 2, 1,4
+    .hword  -3,-2,-1,0
+    .hword   1, 2, 3,4
+
+
+    .text
+    .global omxVCM4P10_PredictIntraChroma_8x8
+    .func   omxVCM4P10_PredictIntraChroma_8x8
+omxVCM4P10_PredictIntraChroma_8x8:
+    PUSH     {r4-r10,lr}
+    VPUSH    {d8-d15}
+    LDR      r8, =armVCM4P10_pIndexTable8x8
+    LDR      r6,[sp,#0x68]
+    LDR      r4,[sp,#0x60]
+    LDR      r5,[sp,#0x64]
+    LDR      r7,[sp,#0x6c]
+    LDR      pc,[r8,r6,LSL #2]
+OMX_VC_CHROMA_DC:
+    TST      r7,#2
+    BEQ      L0xe8
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d1[0]},[r0],r10
+    VLD1.8   {d1[1]},[r9],r10
+    VLD1.8   {d1[2]},[r0],r10
+    VLD1.8   {d1[3]},[r9],r10
+    VLD1.8   {d1[4]},[r0],r10
+    VLD1.8   {d1[5]},[r9],r10
+    VLD1.8   {d1[6]},[r0],r10
+    VLD1.8   {d1[7]},[r9]
+    TST      r7,#1
+    BEQ      L0xcc
+    VLD1.8   {d0},[r1]
+    MOV      r0,#0
+    VPADDL.U8 d2,d0
+    VPADDL.U16 d3,d2
+    VPADDL.U8 d2,d1
+    VPADDL.U16 d1,d2
+    VADD.I32 d2,d3,d1
+    VRSHR.U32 d2,d2,#3
+    VRSHR.U32 d3,d3,#2
+    VRSHR.U32 d1,d1,#2
+    VMOV.I8  d5,#0xc
+    VMOV.I8  d6,#0x4
+    VSHL.I64 d5,d5,#32
+    VSHR.U64 d6,d6,#32
+    VADD.I8  d6,d6,d5
+    VTBL.8   d0,{d2-d3},d5
+    VTBL.8   d4,{d1-d2},d6
+L0x9c:
+    ADD      r9,r3,r5
+    ADD      r10,r5,r5
+    VST1.8   {d0},[r3],r10
+    VST1.8   {d0},[r9],r10
+    VST1.8   {d0},[r3],r10
+    VST1.8   {d0},[r9],r10
+    VST1.8   {d4},[r3],r10
+    VST1.8   {d4},[r9],r10
+    VST1.8   {d4},[r3],r10
+    VST1.8   {d4},[r9]
+    VPOP     {d8-d15}
+    POP      {r4-r10,pc}
+L0xcc:
+    MOV      r0,#0
+    VPADDL.U8 d2,d1
+    VPADDL.U16 d1,d2
+    VRSHR.U32 d1,d1,#2
+    VDUP.8   d0,d1[0]
+    VDUP.8   d4,d1[4]
+    B        L0x9c
+L0xe8:
+    TST      r7,#1
+    BEQ      L0x114
+    VLD1.8   {d0},[r1]
+    MOV      r0,#0
+    VPADDL.U8 d2,d0
+    VPADDL.U16 d3,d2
+    VRSHR.U32 d3,d3,#2
+    VMOV.I8  d5,#0x4
+    VSHL.I64 d5,d5,#32
+    VTBL.8   d0,{d3},d5
+    B        L0x11c
+L0x114:
+    VMOV.I8  d0,#0x80
+    MOV      r0,#0
+L0x11c:
+    ADD      r9,r3,r5
+    ADD      r10,r5,r5
+    VST1.8   {d0},[r3],r10
+    VST1.8   {d0},[r9],r10
+    VST1.8   {d0},[r3],r10
+    VST1.8   {d0},[r9],r10
+    VST1.8   {d0},[r3],r10
+    VST1.8   {d0},[r9],r10
+    VST1.8   {d0},[r3],r10
+    VST1.8   {d0},[r9]
+    VPOP     {d8-d15}
+    POP      {r4-r10,pc}
+OMX_VC_CHROMA_VERT:
+    VLD1.8   {d0},[r1]
+    MOV      r0,#0
+    B        L0x11c
+OMX_VC_CHROMA_HOR:
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d0[]},[r0],r10
+    VLD1.8   {d1[]},[r9],r10
+    VLD1.8   {d2[]},[r0],r10
+    VLD1.8   {d3[]},[r9],r10
+    VLD1.8   {d4[]},[r0],r10
+    VLD1.8   {d5[]},[r9],r10
+    VLD1.8   {d6[]},[r0],r10
+    VLD1.8   {d7[]},[r9]
+    B        L0x28c
+OMX_VC_CHROMA_PLANE:
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d0},[r1]
+    VLD1.8   {d2[0]},[r2]
+    VLD1.8   {d1[0]},[r0],r10
+    VLD1.8   {d1[1]},[r9],r10
+    VLD1.8   {d1[2]},[r0],r10
+    VLD1.8   {d1[3]},[r9],r10
+    VLD1.8   {d1[4]},[r0],r10
+    VLD1.8   {d1[5]},[r9],r10
+    VLD1.8   {d1[6]},[r0],r10
+    VLD1.8   {d1[7]},[r9]
+    VREV64.8 d3,d0
+    VSUBL.U8 q3,d3,d2
+    VSHR.U64 d3,d3,#8
+    VSUBL.U8 q2,d3,d0
+    VREV64.8 d3,d1
+    VSUBL.U8 q7,d3,d2
+    VSHR.U64 d3,d3,#8
+    VSUBL.U8 q6,d3,d1
+    LDR      r2, =armVCM4P10_MultiplierTableChroma8x8
+    VSHL.I64 d4,d4,#16
+    VEXT.8   d9,d4,d6,#2
+    VLD1.16  {d10},[r2]!
+    VSHL.I64 d12,d12,#16
+    VEXT.8   d16,d12,d14,#2
+    VMUL.I16 d11,d9,d10
+    VMUL.I16 d3,d16,d10
+    VPADD.I16 d3,d11,d3
+    VPADDL.S16 d3,d3
+    VSHL.I32 d2,d3,#4
+    VADD.I32 d3,d3,d2
+    VLD1.16  {d10,d11},[r2]
+    VRSHR.S32 d3,d3,#5
+    VADDL.U8 q0,d0,d1
+    VDUP.16  q0,d1[3]
+    VSHL.I16 q0,q0,#4
+    VDUP.16  q2,d3[0]
+    VDUP.16  q3,d3[2]
+    VMUL.I16 q2,q2,q5
+    VMUL.I16 q3,q3,q5
+    VADD.I16 q2,q2,q0
+    VDUP.16  q0,d6[0]
+    VDUP.16  q1,d6[1]
+    VDUP.16  q4,d6[2]
+    VDUP.16  q5,d6[3]
+    VDUP.16  q6,d7[0]
+    VDUP.16  q7,d7[1]
+    VDUP.16  q8,d7[2]
+    VDUP.16  q9,d7[3]
+    VADD.I16 q0,q2,q0
+    VADD.I16 q1,q2,q1
+    VADD.I16 q4,q2,q4
+    VADD.I16 q5,q2,q5
+    VADD.I16 q6,q2,q6
+    VADD.I16 q7,q2,q7
+    VADD.I16 q8,q2,q8
+    VADD.I16 q9,q2,q9
+    VQRSHRUN.S16 d0,q0,#5
+    VQRSHRUN.S16 d1,q1,#5
+    VQRSHRUN.S16 d2,q4,#5
+    VQRSHRUN.S16 d3,q5,#5
+    VQRSHRUN.S16 d4,q6,#5
+    VQRSHRUN.S16 d5,q7,#5
+    VQRSHRUN.S16 d6,q8,#5
+    VQRSHRUN.S16 d7,q9,#5
+L0x28c:
+    ADD      r9,r3,r5
+    ADD      r10,r5,r5
+    VST1.8   {d0},[r3],r10
+    VST1.8   {d1},[r9],r10
+    VST1.8   {d2},[r3],r10
+    VST1.8   {d3},[r9],r10
+    VST1.8   {d4},[r3],r10
+    VST1.8   {d5},[r9],r10
+    VST1.8   {d6},[r3],r10
+    VST1.8   {d7},[r9]
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r10,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
new file mode 100644
index 0000000..53268f6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
@@ -0,0 +1,239 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+
+    .section .rodata
+    .align 4
+;//-------------------------------------------------------
+;// This table for implementing switch case of C in asm by
+;// the mehtod of two levels of indexing.
+;//-------------------------------------------------------
+
+armVCM4P10_pIndexTable16x16:
+    .word  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
+    .word  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE
+
+
+
+armVCM4P10_MultiplierTable16x16:
+    .hword   7,  6,  5,  4,  3,  2,  1,  8
+    .hword   0,  1,  2,  3,  4,  5,  6,  7
+    .hword   8,  9, 10, 11, 12, 13, 14, 15
+
+    .text
+
+    .global omxVCM4P10_PredictIntra_16x16
+    .func   omxVCM4P10_PredictIntra_16x16
+omxVCM4P10_PredictIntra_16x16:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d15}
+    LDR      r9, =armVCM4P10_pIndexTable16x16
+    LDR      r6,[sp,#0x70]
+    LDR      r4,[sp,#0x68]
+    LDR      r5,[sp,#0x6c]
+    LDR      r7,[sp,#0x74]
+    MOV      r12,#0x10
+    LDR      pc,[r9,r6,LSL #2]
+OMX_VC_16X16_VERT:
+    VLD1.8   {d0,d1},[r1]
+    ADD      r8,r3,r5
+    ADD      r10,r5,r5
+    VST1.8   {d0,d1},[r3],r10
+    VST1.8   {d0,d1},[r8],r10
+    VST1.8   {d0,d1},[r3],r10
+    VST1.8   {d0,d1},[r8],r10
+    VST1.8   {d0,d1},[r3],r10
+    VST1.8   {d0,d1},[r8],r10
+    VST1.8   {d0,d1},[r3],r10
+    VST1.8   {d0,d1},[r8],r10
+    VST1.8   {d0,d1},[r3],r10
+    VST1.8   {d0,d1},[r8],r10
+    VST1.8   {d0,d1},[r3],r10
+    VST1.8   {d0,d1},[r8],r10
+    VST1.8   {d0,d1},[r3],r10
+    VST1.8   {d0,d1},[r8],r10
+    VST1.8   {d0,d1},[r3]
+    VST1.8   {d0,d1},[r8]
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+OMX_VC_16X16_HOR:
+    ADD      r8,r0,r4
+    ADD      r4,r4,r4
+    ADD      r11,r3,r5
+    ADD      r5,r5,r5
+L0x8c:
+    VLD1.8   {d2[],d3[]},[r0],r4
+    VLD1.8   {d0[],d1[]},[r8],r4
+    SUBS     r12,r12,#8
+    VST1.8   {d2,d3},[r3],r5
+    VST1.8   {d0,d1},[r11],r5
+    VLD1.8   {d2[],d3[]},[r0],r4
+    VLD1.8   {d0[],d1[]},[r8],r4
+    VST1.8   {d2,d3},[r3],r5
+    VST1.8   {d0,d1},[r11],r5
+    VLD1.8   {d2[],d3[]},[r0],r4
+    VLD1.8   {d0[],d1[]},[r8],r4
+    VST1.8   {d2,d3},[r3],r5
+    VST1.8   {d0,d1},[r11],r5
+    VLD1.8   {d2[],d3[]},[r0],r4
+    VLD1.8   {d0[],d1[]},[r8],r4
+    VST1.8   {d2,d3},[r3],r5
+    VST1.8   {d0,d1},[r11],r5
+    BNE      L0x8c
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+OMX_VC_16X16_DC:
+    MOV      r11,#0
+    TST      r7,#2
+    BEQ      L0x14c
+    ADD      r8,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d2[0]},[r0],r10
+    VLD1.8   {d2[1]},[r8],r10
+    VLD1.8   {d2[2]},[r0],r10
+    VLD1.8   {d2[3]},[r8],r10
+    VLD1.8   {d2[4]},[r0],r10
+    VLD1.8   {d2[5]},[r8],r10
+    VLD1.8   {d2[6]},[r0],r10
+    VLD1.8   {d2[7]},[r8],r10
+    VLD1.8   {d3[0]},[r0],r10
+    VLD1.8   {d3[1]},[r8],r10
+    VLD1.8   {d3[2]},[r0],r10
+    VLD1.8   {d3[3]},[r8],r10
+    VLD1.8   {d3[4]},[r0],r10
+    VLD1.8   {d3[5]},[r8],r10
+    VLD1.8   {d3[6]},[r0],r10
+    VLD1.8   {d3[7]},[r8]
+    VPADDL.U8 q0,q1
+    ADD      r11,r11,#1
+    VPADD.I16 d0,d0,d1
+    VPADDL.U16 d0,d0
+    VPADDL.U32 d6,d0
+    VRSHR.U64 d8,d6,#4
+L0x14c:
+    TST      r7,#1
+    BEQ      L0x170
+    VLD1.8   {d0,d1},[r1]
+    ADD      r11,r11,#1
+    VPADDL.U8 q0,q0
+    VPADD.I16 d0,d0,d1
+    VPADDL.U16 d0,d0
+    VPADDL.U32 d7,d0
+    VRSHR.U64 d8,d7,#4
+L0x170:
+    CMP      r11,#2
+    BNE      L0x180
+    VADD.I64 d8,d7,d6
+    VRSHR.U64 d8,d8,#5
+L0x180:
+    VDUP.8   q3,d8[0]
+    CMP      r11,#0
+    ADD      r8,r3,r5
+    ADD      r10,r5,r5
+    BNE      L0x198
+    VMOV.I8  q3,#0x80
+L0x198:
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    VST1.8   {d6,d7},[r3],r10
+    VST1.8   {d6,d7},[r8],r10
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+OMX_VC_16X16_PLANE:
+    LDR      r9, =armVCM4P10_MultiplierTable16x16
+    VLD1.8   {d0,d1},[r1]
+    VLD1.8   {d4[0]},[r2]
+    ADD      r8,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d2[0]},[r0],r10
+    VLD1.8   {d2[1]},[r8],r10
+    VLD1.8   {d2[2]},[r0],r10
+    VLD1.8   {d2[3]},[r8],r10
+    VLD1.8   {d2[4]},[r0],r10
+    VLD1.8   {d2[5]},[r8],r10
+    VLD1.8   {d2[6]},[r0],r10
+    VLD1.8   {d2[7]},[r8],r10
+    VLD1.8   {d3[0]},[r0],r10
+    VLD1.8   {d3[1]},[r8],r10
+    VLD1.8   {d3[2]},[r0],r10
+    VLD1.8   {d3[3]},[r8],r10
+    VLD1.8   {d3[4]},[r0],r10
+    VLD1.8   {d3[5]},[r8],r10
+    VLD1.8   {d3[6]},[r0],r10
+    VLD1.8   {d3[7]},[r8]
+    VREV64.8 d5,d1
+    VSUBL.U8 q3,d5,d4
+    VSHR.U64 d5,d5,#8
+    VSUBL.U8 q4,d5,d0
+    VSHL.I64 d9,d9,#16
+    VEXT.8   d9,d9,d6,#2
+    VREV64.8 d12,d3
+    VSUBL.U8 q7,d12,d4
+    VSHR.U64 d12,d12,#8
+    VSUBL.U8 q8,d12,d2
+    VLD1.16  {d20,d21},[r9]!
+    VSHL.I64 d17,d17,#16
+    VEXT.8   d17,d17,d14,#2
+    VMULL.S16 q11,d8,d20
+    VMULL.S16 q12,d16,d20
+    VMLAL.S16 q11,d9,d21
+    VMLAL.S16 q12,d17,d21
+    VPADD.I32 d22,d23,d22
+    VPADD.I32 d23,d25,d24
+    VPADDL.S32 q11,q11
+    VSHL.I64 q12,q11,#2
+    VADD.I64 q11,q11,q12
+    VRSHR.S64 q11,q11,#6
+    VSHL.I64 q12,q11,#3
+    VSUB.I64 q12,q12,q11
+    VLD1.16  {d20,d21},[r9]!
+    VDUP.16  q6,d22[0]
+    VDUP.16  q7,d23[0]
+    VADDL.U8 q11,d1,d3
+    VSHL.I16 q11,q11,#4
+    VDUP.16  q11,d23[3]
+    VADD.I64 d1,d24,d25
+    VLD1.16  {d24,d25},[r9]
+    VDUP.16  q13,d1[0]
+    VSUB.I16 q13,q11,q13
+    VMUL.I16 q5,q6,q10
+    VMUL.I16 q6,q6,q12
+    VADD.I16 q0,q5,q13
+    VADD.I16 q1,q6,q13
+L0x2d4:
+    VQRSHRUN.S16 d6,q0,#5
+    VQRSHRUN.S16 d7,q1,#5
+    SUBS     r12,r12,#1
+    VST1.8   {d6,d7},[r3],r5
+    VADD.I16 q0,q0,q7
+    VADD.I16 q1,q1,q7
+    BNE      L0x2d4
+    MOV      r0,#0
+    VPOP     {d8-d15}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S
new file mode 100644
index 0000000..aa6d7ef
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S
@@ -0,0 +1,261 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+
+    .section .rodata
+    .align 4
+
+armVCM4P10_pSwitchTable4x4:
+    .word OMX_VC_4x4_VERT,     OMX_VC_4x4_HOR
+    .word OMX_VC_4x4_DC,       OMX_VC_4x4_DIAG_DL
+    .word OMX_VC_4x4_DIAG_DR,  OMX_VC_4x4_VR
+    .word OMX_VC_4x4_HD,       OMX_VC_4x4_VL
+    .word OMX_VC_4x4_HU
+
+    .text
+
+    .global omxVCM4P10_PredictIntra_4x4
+    .func   omxVCM4P10_PredictIntra_4x4
+omxVCM4P10_PredictIntra_4x4:
+    PUSH     {r4-r12,lr}
+    VPUSH    {d8-d12}
+    LDR      r8, =armVCM4P10_pSwitchTable4x4
+    LDRD     r6,r7,[sp,#0x58]
+    LDRD     r4,r5,[sp,#0x50]
+    LDR      pc,[r8,r6,LSL #2]
+OMX_VC_4x4_HOR:
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d0[]},[r0],r10
+    VLD1.8   {d1[]},[r9],r10
+    VLD1.8   {d2[]},[r0]
+    VLD1.8   {d3[]},[r9]
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    VST1.32  {d0[0]},[r3],r12
+    VST1.32  {d1[0]},[r11],r12
+    VST1.32  {d2[0]},[r3]
+    VST1.32  {d3[0]},[r11]
+    B        L0x348
+OMX_VC_4x4_VERT:
+    VLD1.32  {d0[0]},[r1]
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+L0x58:
+    VST1.32  {d0[0]},[r3],r12
+    VST1.32  {d0[0]},[r11],r12
+    VST1.32  {d0[0]},[r3]
+    VST1.32  {d0[0]},[r11]
+    B        L0x348
+OMX_VC_4x4_DC:
+    TST      r7,#2
+    BEQ      L0xdc
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d0[0]},[r0],r10
+    VLD1.8   {d0[1]},[r9],r10
+    VLD1.8   {d0[2]},[r0]
+    VLD1.8   {d0[3]},[r9]
+    TST      r7,#1
+    BEQ      L0xbc
+    VLD1.32  {d0[1]},[r1]
+    MOV      r0,#0
+    VPADDL.U8 d1,d0
+    VPADDL.U16 d1,d1
+    VPADDL.U32 d1,d1
+    VRSHR.U64 d1,d1,#3
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    VDUP.8   d0,d1[0]
+    B        L0x58
+L0xbc:
+    MOV      r0,#0
+    VPADDL.U8 d1,d0
+    VPADDL.U16 d1,d1
+    VRSHR.U32 d1,d1,#2
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    VDUP.8   d0,d1[0]
+    B        L0x58
+L0xdc:
+    TST      r7,#1
+    BEQ      L0x108
+    VLD1.32  {d0[0]},[r1]
+    MOV      r0,#0
+    VPADDL.U8 d1,d0
+    VPADDL.U16 d1,d1
+    VRSHR.U32 d1,d1,#2
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    VDUP.8   d0,d1[0]
+    B        L0x58
+L0x108:
+    VMOV.I8  d0,#0x80
+    MOV      r0,#0
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    B        L0x58
+OMX_VC_4x4_DIAG_DL:
+    TST      r7,#0x40
+    BEQ      L0x138
+    VLD1.8   {d3},[r1]
+    VDUP.8   d2,d3[7]
+    VEXT.8   d4,d3,d2,#1
+    VEXT.8   d5,d3,d2,#2
+    B        L0x14c
+L0x138:
+    VLD1.32  {d0[1]},[r1]
+    VDUP.8   d2,d0[7]
+    VEXT.8   d3,d0,d2,#4
+    VEXT.8   d4,d0,d2,#5
+    VEXT.8   d5,d0,d2,#6
+L0x14c:
+    VHADD.U8 d6,d3,d5
+    VRHADD.U8 d6,d6,d4
+    VST1.32  {d6[0]},[r3],r5
+    VEXT.8   d6,d6,d6,#1
+    VST1.32  {d6[0]},[r3],r5
+    VEXT.8   d6,d6,d6,#1
+    VST1.32  {d6[0]},[r3],r5
+    VEXT.8   d6,d6,d6,#1
+    VST1.32  {d6[0]},[r3]
+    B        L0x348
+OMX_VC_4x4_DIAG_DR:
+    VLD1.32  {d0[0]},[r1]
+    VLD1.8   {d1[7]},[r2]
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    ADD      r1,r3,r5
+    VLD1.8   {d1[6]},[r0],r10
+    VLD1.8   {d1[5]},[r9],r10
+    VLD1.8   {d1[4]},[r0]
+    VLD1.8   {d1[3]},[r9]
+    VEXT.8   d3,d1,d0,#3
+    ADD      r4,r1,r5
+    VEXT.8   d4,d1,d0,#4
+    ADD      r6,r4,r5
+    VEXT.8   d5,d1,d0,#5
+    VHADD.U8 d6,d3,d5
+    VRHADD.U8 d6,d6,d4
+    VST1.32  {d6[0]},[r6]
+    VEXT.8   d6,d6,d6,#1
+    VST1.32  {d6[0]},[r4]
+    VEXT.8   d6,d6,d6,#1
+    VST1.32  {d6[0]},[r1]
+    VEXT.8   d6,d6,d6,#1
+    VST1.32  {d6[0]},[r3]
+    B        L0x348
+OMX_VC_4x4_VR:
+    VLD1.32  {d0[0]},[r1]
+    VLD1.8   {d0[7]},[r2]
+    VLD1.8   {d1[7]},[r0],r4
+    VLD1.8   {d2[7]},[r0],r4
+    VLD1.8   {d1[6]},[r0]
+    VEXT.8   d12,d0,d0,#7
+    VEXT.8   d3,d1,d12,#6
+    VEXT.8   d4,d2,d12,#7
+    VEXT.8   d5,d1,d0,#7
+    VEXT.8   d6,d2,d0,#7
+    VEXT.8   d11,d1,d12,#7
+    VHADD.U8 d8,d6,d12
+    VRHADD.U8 d8,d8,d11
+    VHADD.U8 d7,d3,d5
+    VRHADD.U8 d7,d7,d4
+    VEXT.8   d10,d8,d8,#1
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    VEXT.8   d9,d7,d7,#1
+    VST1.32  {d10[0]},[r3],r12
+    VST1.32  {d9[0]},[r11],r12
+    VST1.32  {d8[0]},[r3],r12
+    VST1.32  {d7[0]},[r11]
+    B        L0x348
+OMX_VC_4x4_HD:
+    VLD1.8   {d0},[r1]
+    VLD1.8   {d1[7]},[r2]
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d1[6]},[r0],r10
+    VLD1.8   {d1[5]},[r9],r10
+    VLD1.8   {d1[4]},[r0]
+    VLD1.8   {d1[3]},[r9]
+    VEXT.8   d3,d1,d0,#3
+    VEXT.8   d4,d1,d0,#2
+    VEXT.8   d5,d1,d0,#1
+    VHADD.U8 d7,d3,d5
+    VRHADD.U8 d7,d7,d4
+    VRHADD.U8 d8,d4,d3
+    VSHL.I64 d8,d8,#24
+    VSHL.I64 d6,d7,#16
+    VZIP.8   d8,d6
+    VEXT.8   d7,d7,d7,#6
+    VEXT.8   d8,d6,d7,#2
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    VST1.32  {d8[1]},[r3],r12
+    VST1.32  {d6[1]},[r11],r12
+    VST1.32  {d8[0]},[r3]
+    VST1.32  {d6[0]},[r11]
+    B        L0x348
+OMX_VC_4x4_VL:
+    TST      r7,#0x40
+    BEQ      L0x2b4
+    VLD1.8   {d3},[r1]
+    VEXT.8   d4,d3,d3,#1
+    VEXT.8   d5,d4,d4,#1
+    B        L0x2c8
+L0x2b4:
+    VLD1.32  {d0[1]},[r1]
+    VDUP.8   d2,d0[7]
+    VEXT.8   d3,d0,d2,#4
+    VEXT.8   d4,d0,d2,#5
+    VEXT.8   d5,d0,d2,#6
+L0x2c8:
+    VRHADD.U8 d7,d4,d3
+    VHADD.U8 d10,d3,d5
+    VRHADD.U8 d10,d10,d4
+    VEXT.8   d8,d7,d7,#1
+    ADD      r11,r3,r5
+    ADD      r12,r5,r5
+    VEXT.8   d9,d10,d8,#1
+    VST1.32  {d7[0]},[r3],r12
+    VST1.32  {d10[0]},[r11],r12
+    VST1.32  {d8[0]},[r3]
+    VST1.32  {d9[0]},[r11]
+    B        L0x348
+OMX_VC_4x4_HU:
+    ADD      r9,r0,r4
+    ADD      r10,r4,r4
+    VLD1.8   {d1[4]},[r0],r10
+    VLD1.8   {d1[5]},[r9],r10
+    VLD1.8   {d1[6]},[r0]
+    VLD1.8   {d1[7]},[r9]
+    VDUP.8   d2,d1[7]
+    VEXT.8   d3,d1,d2,#4
+    VEXT.8   d4,d1,d2,#5
+    VEXT.8   d5,d1,d2,#6
+    VHADD.U8 d7,d3,d5
+    VRHADD.U8 d7,d7,d4
+    VRHADD.U8 d8,d4,d3
+    VZIP.8   d8,d7
+    VST1.32  {d8[0]},[r3],r5
+    VEXT.8   d8,d8,d8,#2
+    VST1.32  {d8[0]},[r3],r5
+    VEXT.8   d8,d8,d8,#2
+    VST1.32  {d8[0]},[r3],r5
+    VST1.32  {d7[0]},[r3]
+L0x348:
+    MOV      r0,#0
+    VPOP     {d8-d12}
+    POP      {r4-r12,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S
new file mode 100644
index 0000000..28a89cb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S
@@ -0,0 +1,54 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global omxVCM4P10_TransformDequantChromaDCFromPair
+    .func   omxVCM4P10_TransformDequantChromaDCFromPair
+omxVCM4P10_TransformDequantChromaDCFromPair:
+    push    {r4-r10, lr}
+    ldr     r9, [r0,#0]
+    vmov.i16    d0, #0
+    mov     r8, #0x1f
+    vst1.16    {d0}, [r1]
+    ldrb    r6, [r9], #1
+unpackLoop:
+    tst     r6, #0x10
+    ldrnesb r5, [r9, #1]
+    ldrneb  r4, [r9], #2
+    and     r7, r8, r6, lsl #1
+    ldreqsb r4, [r9], #1
+    orrne   r4, r4, r5, lsl #8
+    tst     r6, #0x20
+    ldreqb  r6, [r9], #1
+    strh    r4, [r1, r7]
+    beq     unpackLoop
+    ldmia   r1, {r3, r4}
+    str     r9, [r0, #0]
+    ldr     r5, =armVCM4P10_QPDivTable
+    ldr     r6, =armVCM4P10_VMatrixQPModTable
+    saddsubx        r3, r3, r3
+    saddsubx        r4, r4, r4
+    ldrsb   r9, [r5, r2]
+    ldrsb   r2, [r6, r2]
+    sadd16  r5, r3, r4
+    ssub16  r6, r3, r4
+    lsl     r2, r2, r9
+    vmov    d0, r5, r6
+    vrev32.16  d0, d0
+    vdup.16    d1, r2
+    vmull.s16   q1, d0, d1
+    vshrn.i32   d2, q1, #1
+    vst1.16    {d2}, [r1]
+    mov     r0, #0
+    pop     {r4-r10, pc}
+    .endfunc
+
+    .end
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S
new file mode 100644
index 0000000..a3a0715
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S
@@ -0,0 +1,76 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+    .eabi_attribute 24, 1
+    .eabi_attribute 25, 1
+
+    .arm
+    .fpu neon
+    .text
+
+    .global armVCM4P10_InvTransformDequantLumaDC4x4
+    .func   armVCM4P10_InvTransformDequantLumaDC4x4
+armVCM4P10_InvTransformDequantLumaDC4x4:
+    PUSH     {r4-r6,lr}
+    VPUSH    {d8-d13}
+    VLD4.16  {d0,d1,d2,d3},[r0]
+    LDR      r2, =armVCM4P10_QPDivTable
+    LDR      r3, =armVCM4P10_VMatrixQPModTable
+    VADD.I16 d4,d0,d1
+    VADD.I16 d5,d2,d3
+    VSUB.I16 d6,d0,d1
+    LDRSB    r4,[r2,r1]
+    VSUB.I16 d7,d2,d3
+    LDRSB    r5,[r3,r1]
+    VADD.I16 d0,d4,d5
+    VSUB.I16 d1,d4,d5
+    VSUB.I16 d2,d6,d7
+    LSL      r5,r5,r4
+    VADD.I16 d3,d6,d7
+    VTRN.16  d0,d1
+    VTRN.16  d2,d3
+    VTRN.32  q0,q1
+    VADD.I16 d4,d0,d1
+    VADD.I16 d5,d2,d3
+    VSUB.I16 d6,d0,d1
+    VSUB.I16 d7,d2,d3
+    VADD.I16 d0,d4,d5
+    VSUB.I16 d1,d4,d5
+    VSUB.I16 d2,d6,d7
+    VADD.I16 d3,d6,d7
+    VDUP.16  d5,r5
+    VMOV.I32 q3,#0x2
+    VMOV.I32 q4,#0x2
+    VMOV.I32 q5,#0x2
+    VMOV.I32 q6,#0x2
+    VMLAL.S16 q3,d0,d5
+    VMLAL.S16 q4,d1,d5
+    VMLAL.S16 q5,d2,d5
+    VMLAL.S16 q6,d3,d5
+    VSHRN.I32 d0,q3,#2
+    VSHRN.I32 d1,q4,#2
+    VSHRN.I32 d2,q5,#2
+    VSHRN.I32 d3,q6,#2
+    VST1.16  {d0,d1,d2,d3},[r0]
+    VPOP     {d8-d13}
+    POP      {r4-r6,pc}
+    .endfunc
+
+.global omxVCM4P10_TransformDequantLumaDCFromPair
+.func   omxVCM4P10_TransformDequantLumaDCFromPair
+omxVCM4P10_TransformDequantLumaDCFromPair:
+    PUSH     {r4-r6,lr}
+    MOV      r4,r1
+    MOV      r5,r2
+    BL       armVCM4P10_UnpackBlock4x4
+    MOV      r0,r4
+    MOV      r1,r5
+    BL       armVCM4P10_InvTransformDequantLumaDC4x4
+    MOV      r0,#0
+    POP      {r4-r6,pc}
+    .endfunc
+
+    .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
new file mode 100755
index 0000000..74b5505
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
@@ -0,0 +1,37 @@
+/**
+ * 
+ * File Name:  armVCM4P2_Huff_Tables_VLC.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ *
+ * File:        armVCM4P2_Huff_Tables.h
+ * Description: Declares Tables used for Hufffman coding and decoding 
+ *              in MP4P2 codec.
+ *
+ */
+ 
+#ifndef _OMXHUFFTAB_H_
+#define _OMXHUFFTAB_H_
+
+
+extern const OMX_U16 armVCM4P2_IntraVlcL0L1[200];
+
+
+extern const OMX_U16 armVCM4P2_InterVlcL0L1[200];
+
+extern const OMX_U16 armVCM4P2_aIntraDCLumaChromaIndex[64];
+//extern const OMX_U16 armVCM4P2_aIntraDCChromaIndex[32];
+extern const OMX_U16 armVCM4P2_aVlcMVD[124];
+
+extern const OMX_U8 armVCM4P2_InterL0L1LMAX[73];
+extern const OMX_U8 armVCM4P2_InterL0L1RMAX[35];
+extern const OMX_U8 armVCM4P2_IntraL0L1LMAX[53];
+extern const OMX_U8 armVCM4P2_IntraL0L1RMAX[40]
+
+#endif /* _OMXHUFFTAB_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
new file mode 100755
index 0000000..e95203a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
@@ -0,0 +1,25 @@
+/**
+ * 
+ * File Name:  armVCM4P2_ZigZag_Tables.h
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ *
+ * File:        armVCM4P2_Zigzag_Tables.h
+ * Description: Declares Tables used for Zigzag scan in MP4P2 codec.
+ *
+ */
+ 
+#ifndef _OMXZIGZAGTAB_H
+#define _OMXZIGZAGTAB_H
+
+extern const OMX_U8 armVCM4P2_aClassicalZigzagScan [192];
+//extern const OMX_U8 armVCM4P2_aHorizontalZigzagScan [64];
+//extern const OMX_U8 armVCM4P2_aVerticalZigzagScan [64];
+
+#endif /* _OMXZIGZAGTAB_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s
new file mode 100755
index 0000000..95fe6d2
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s
@@ -0,0 +1,82 @@
+; /**
+; * 
+; * File Name:  armVCM4P2_Clip8_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; *
+; * Description: 
+; * Contains module for Clipping 16 bit value to [0,255] Range
+; */ 
+
+      INCLUDE omxtypes_s.h
+      INCLUDE armCOMM_s.h
+      
+
+      M_VARIANTS CortexA8
+
+      IF CortexA8
+;//Input Arguments
+
+pSrc                 RN 0
+pDst                 RN 1
+step                 RN 2
+
+;// Neon Registers
+
+qx0                  QN  Q0.S16                  
+dx00                 DN  D0.S16
+dx01                 DN  D1.S16
+qx1                  QN  Q1.S16
+dx10                 DN  D2.S16
+dx11                 DN  D3.S16
+
+qx2                  QN  Q2.S16                  
+dx20                 DN  D4.S16
+dx21                 DN  D5.S16
+qx3                  QN  Q3.S16
+dx30                 DN  D6.S16
+dx31                 DN  D7.S16
+
+
+dclip0               DN  D0.U8
+dclip1               DN  D2.U8 
+dclip2               DN  D4.U8
+dclip3               DN  D6.U8
+ 
+       M_START armVCM4P2_Clip8
+
+       VLD1          {dx00,dx01,dx10,dx11},[pSrc]!          ;// Load 16 entries from pSrc
+       VLD1          {dx20,dx21,dx30,dx31},[pSrc]!          ;// Load next 16 entries from pSrc  
+       VQSHRUN       dclip0,qx0,#0                          ;// dclip0[i]=clip qx0[i] to [0,255]
+       VQSHRUN       dclip1,qx1,#0                          ;// dclip1[i]=clip qx1[i] to [0,255]
+       VST1          {dclip0},[pDst],step                   ;// store 8 bytes and pDst=pDst+step
+       VST1          {dclip1},[pDst],step                   ;// store 8 bytes and pDst=pDst+step
+       VQSHRUN       dclip2,qx2,#0
+       VQSHRUN       dclip3,qx3,#0
+       VST1          {dclip2},[pDst],step
+       VST1          {dclip3},[pDst],step
+       
+       VLD1          {dx00,dx01,dx10,dx11},[pSrc]!          ;// Load 16 entries from pSrc
+       VLD1          {dx20,dx21,dx30,dx31},[pSrc]!          ;// Load next 16 entries from pSrc  
+       VQSHRUN       dclip0,qx0,#0                          ;// dclip0[i]=clip qx0[i] to [0,255]
+       VQSHRUN       dclip1,qx1,#0                          ;// dclip1[i]=clip qx1[i] to [0,255]
+       VST1          {dclip0},[pDst],step                   ;// store 8 bytes and pDst=pDst+step
+       VST1          {dclip1},[pDst],step                   ;// store 8 bytes and pDst=pDst+step
+       VQSHRUN       dclip2,qx2,#0
+       VQSHRUN       dclip3,qx3,#0
+       VST1          {dclip2},[pDst],step
+       VST1          {dclip3},[pDst],step
+
+
+       
+        M_END
+        ENDIF
+        
+     
+        
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
new file mode 100755
index 0000000..e4a7f33
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
@@ -0,0 +1,398 @@
+;/**
+; * 
+; * File Name:  armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; *
+; * Description: 
+; * Contains modules for zigzag scanning and VLC decoding
+; * for inter, intra block.
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_AC_unsafe
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan 
+; *
+; * 
+; *
+; * 
+; */
+
+
+      INCLUDE omxtypes_s.h
+      INCLUDE armCOMM_s.h
+      INCLUDE armCOMM_BitDec_s.h
+
+
+      M_VARIANTS ARM1136JS
+
+     
+
+
+
+     IF ARM1136JS
+     
+        
+
+
+
+;//Input Arguments
+
+ppBitStream          RN 0
+pBitOffset           RN 1
+pDst                 RN 2
+shortVideoHeader     RN 3
+
+
+;//Local Variables
+
+Return               RN 0
+
+pVlcTableL0L1        RN 4
+pLMAXTableL0L1       RN 4
+pRMAXTableL0L1       RN 4
+pZigzagTable         RN 4
+
+ftype                RN 0
+temp3                RN 4
+temp                 RN 5
+Count                RN 6
+Escape               RN 5
+
+;// armVCM4P2_FillVLDBuffer
+zigzag               RN 0
+storeLevel           RN 1
+temp2                RN 4
+temp1                RN 5
+sign                 RN 5
+Last                 RN 7
+storeRun             RN 14
+
+
+packRetIndex         RN 5
+
+
+markerbit            RN 5
+
+;// Scratch Registers
+
+RBitStream           RN 8
+RBitBuffer           RN 9
+RBitCount            RN 10
+
+T1                   RN 11
+T2                   RN 12
+LR                   RN 14        
+        
+
+
+        M_ALLOC4        pppBitStream,4
+        M_ALLOC4        ppOffset,4
+        M_ALLOC4        pLinkRegister,4       
+        
+        M_START armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+        ;// get the table addresses from stack       
+        M_ARG           ppVlcTableL0L1,4
+        M_ARG           ppLMAXTableL0L1,4
+        M_ARG           ppRMAXTableL0L1,4
+        M_ARG           ppZigzagTable,4
+        
+        ;// Store ALL zeros at pDst
+        
+        MOV             temp1,#0                                        ;// Initialize Count to zero                                
+        MOV             Last,#0
+        M_STR           LR,pLinkRegister                                ;// Store Link Register on Stack
+        MOV             temp2,#0
+        MOV             LR,#0          
+        
+        ;// Initialize the Macro and Store all zeros to pDst 
+  
+        STM             pDst!,{temp2,temp1,Last,LR}                   
+        M_BD_INIT0      ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount  
+        STM             pDst!,{temp2,temp1,Last,LR}
+        M_BD_INIT1      T1, T2, T2
+        STM             pDst!,{temp2,temp1,Last,LR}
+        M_BD_INIT2      T1, T2, T2
+        STM             pDst!,{temp2,temp1,Last,LR}
+        M_STR           ppBitStream,pppBitStream                        ;// Store ppBitstream on stack                         
+        STM             pDst!,{temp2,temp1,Last,LR}
+        M_STR           pBitOffset,ppOffset                             ;// Store pBitOffset on stack
+        STM             pDst!,{temp2,temp1,Last,LR}
+        
+        STM             pDst!,{temp2,temp1,Last,LR}
+        STM             pDst!,{temp2,temp1,Last,LR}
+ 
+        
+        SUB             pDst,pDst,#128                                  ;// Restore pDst
+
+        ;// The armVCM4P2_GetVLCBits begins
+
+getVLCbits
+        
+        M_BD_LOOK8      Escape,7                                        ;// Load Escape Value
+        LSR             Escape,Escape,#25                                                  
+        CMP             Escape,#3                                       ;// check for escape mode
+        MOVNE           ftype,#0
+        BNE             notEscapemode                                   ;// Branch if not in Escape mode 3
+
+        M_BD_VSKIP8     #7,T1
+        CMP             shortVideoHeader,#0                             ;// Check shortVideoHeader flag to know the type of Escape mode
+        BEQ             endFillVLD                                       
+        
+        ;// Escape Mode 4
+
+        M_BD_READ8      Last,1,T1
+        M_BD_READ8      storeRun,6,T1
+        M_BD_READ8      storeLevel,8,T1
+
+           
+        ;// Check whether the Reserved values for Level are used and Exit with an Error Message if it is so
+
+        TEQ             storeLevel,#0
+        TEQNE           storeLevel,#128                    
+        BEQ             ExitError
+
+        ADD             temp2,storeRun,Count
+        CMP             temp2,#64
+        BGE             ExitError                                       ;// error if Count+storeRun >= 64
+        
+        
+        ;// Load address of zigzagTable
+        
+        M_LDR           pZigzagTable,ppZigzagTable                      ;// Loading the Address of Zigzag table
+               
+                
+        ;// armVCM4P2_FillVLDBuffer
+                
+        SXTB            storeLevel,storeLevel                           ;// Sign Extend storeLevel to 32 bits
+                              
+        
+        ;// To Reflect Runlength
+
+        ADD             Count,Count,storeRun
+        LDRB            zigzag,[pZigzagTable,Count]
+        ADD             Count,Count,#1
+        STRH            storeLevel,[pDst,zigzag]                        ;// store Level
+              
+        B               ExitOk
+       
+        
+
+endFillVLD
+        
+               
+        ;// Load Ftype( Escape Mode) value based on the two successive bits in the bitstream
+     
+        M_BD_READ8      temp1,1,T1           
+        CMP             temp1,#0    
+        MOVEQ           ftype,#1
+        BEQ             notEscapemode
+        M_BD_READ8      temp1,1,T1
+        CMP             temp1,#1
+        MOVEQ           ftype,#3
+        MOVNE           ftype,#2
+        
+
+notEscapemode
+
+        ;// Load optimized packed VLC table with last=0 and Last=1
+        
+        M_LDR           pVlcTableL0L1,ppVlcTableL0L1                    ;// Load Combined VLC Table
+                
+       
+        CMP             ftype,#3                                        ;// If ftype >=3 get perform Fixed Length Decoding (Escape Mode 3)
+        BGE             EscapeMode3                                     ;// Else continue normal VLC Decoding
+        
+        ;// Variable lengh decoding, "armUnPackVLC32" 
+        
+        
+        M_BD_VLD        packRetIndex,T1,T2,pVlcTableL0L1,4,2
+        
+        
+        LDR             temp3,=0xFFF
+        
+        CMP             packRetIndex,temp3                              ;// Check for invalid symbol
+        BEQ             ExitError                                       ;// if invalid symbol occurs exit with an error message
+        
+        AND             Last,packRetIndex,#2                            ;// Get Last from packed Index
+              
+         
+        
+
+        LSR             storeRun,packRetIndex,#7                        ;// Get Run Value from Packed index
+        AND             storeLevel,packRetIndex,#0x7c                   ;// storeLevel=packRetIndex[2-6],storeLevel[0-1]=0 
+                                                                        
+     
+        M_LDR           pLMAXTableL0L1,ppLMAXTableL0L1                  ;// Load LMAX table
+              
+       
+        LSR             storeLevel,storeLevel,#2                        ;// Level value
+
+        CMP             ftype,#1                                    
+        BNE             ftype2
+        
+        ;// ftype==1; Escape mode =1
+          
+        
+        ADD            temp1, pLMAXTableL0L1, Last, LSL#4              ;// If the Last=1 add 32 to table address
+        LDRB            temp1,[temp1,storeRun]
+
+       
+        ADD             storeLevel,temp1,storeLevel                     
+
+ftype2
+
+        ;// ftype =2; Escape mode =2
+        
+        M_LDR           pRMAXTableL0L1,ppRMAXTableL0L1                  ;// Load RMAX Table 
+                
+        CMP             ftype,#2
+        BNE             FillVLDL1
+                  
+        ADD            temp1, pRMAXTableL0L1, Last, LSL#4               ;// If Last=1 add 32 to table address
+        SUB             temp2,storeLevel,#1
+        LDRB            temp1,[temp1,temp2]
+
+       
+        ADD             storeRun,storeRun,#1
+        ADD             storeRun,temp1
+        
+FillVLDL1        
+            
+                
+        ;// armVCM4P2_FillVLDBuffer
+
+        M_LDR           pZigzagTable,ppZigzagTable                     ;// Load address of zigzagTable 
+                
+        M_BD_READ8      sign,1,T1
+
+        CMP             sign,#1
+        RSBEQ           storeLevel,storeLevel,#0
+ 
+        ADD             temp1,storeRun,Count                           ;// Exit with an error message if Run + Count exceeds 63
+        CMP             temp1,#64
+        BGE             ExitError
+
+      
+        
+        
+              
+        
+        ;// To Reflect Runlenght
+
+        ADD             Count,Count,storeRun
+ 
+storeLevelL1
+        
+        LDRB            zigzag,[pZigzagTable,Count]
+        CMP             Last,#2                                         ;// Check if the Level val is Last non zero val
+        ADD             Count,Count,#1
+        LSR             Last,Last,#1
+        STRH            storeLevel,[pDst,zigzag]                  
+           
+        BNE             end
+        
+        B               ExitOk
+ 
+
+
+        ;// Fixed Lengh Decoding Escape Mode 3
+
+EscapeMode3
+
+        M_BD_READ8      Last,1,T1
+        M_BD_READ8      storeRun,6,T1
+        
+        ADD             temp2,storeRun,Count                            ;// Exit with an error message if Run + Count exceeds 63
+        CMP             temp2,#64
+        BGE             ExitError
+
+        M_BD_READ8      markerbit,1,T1
+        TEQ             markerbit,#0                                    ;// Exit with an error message if marker bit is zero
+        BEQ             ExitError
+        
+        M_BD_READ16     storeLevel,12,T1
+
+        TST             storeLevel,#0x800                               ;// test if the level is negative
+        SUBNE           storeLevel,storeLevel,#4096
+        CMP             storeLevel,#0
+        CMPNE           storeLevel,#-2048
+        BEQ             ExitError                                       ;// Exit with an error message if Level==0 or  -2048 
+
+        M_LDR           pZigzagTable,ppZigzagTable                      ;// Load address of zigzagTable
+              
+        M_BD_READ8      markerbit,1,T1
+           
+
+        ;// armVCM4P2_FillVLDBuffer ( Sign not used as storeLevel is preprocessed)
+            
+               
+
+        ;// To Reflect Run Length
+
+        ADD             Count,Count,storeRun
+
+
+ 
+storeLevelLast
+        
+        LDRB            zigzag,[pZigzagTable,Count]
+        CMP             Last,#1
+        ADD             Count,Count,#1
+        STRH            storeLevel,[pDst,zigzag]                          
+                
+        BNE             end 
+      
+        B               ExitOk
+        
+end
+
+        CMP             Count,#64                                       ;//Run the Loop untill Count reaches 64
+
+        BLT             getVLCbits
+
+        
+ExitOk
+        ;// Exit When VLC Decoding is done Successfully 
+   
+        ;// Loading ppBitStream and pBitOffset from stack
+        
+        CMP             Last,#1
+        M_LDR           ppBitStream,pppBitStream
+        M_LDR           pBitOffset,ppOffset
+
+        ;//Ending the macro
+
+        M_BD_FINI       ppBitStream,pBitOffset
+             
+        MOVEQ           Return,#OMX_Sts_NoErr
+        MOVNE           Return,#OMX_Sts_Err
+        M_LDR           LR,pLinkRegister                               ;// Load the Link Register Back
+        B               exit2
+
+ExitError
+        ;// Exit When an Error occurs 
+
+        M_LDR           ppBitStream,pppBitStream
+        M_LDR           pBitOffset,ppOffset
+        ;//Ending the macro
+
+        M_BD_FINI       ppBitStream,pBitOffset
+        M_LDR           LR,pLinkRegister
+        MOV             Return,#OMX_Sts_Err
+
+exit2
+       
+
+        M_END
+        ENDIF
+        
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
new file mode 100755
index 0000000..38af975
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
@@ -0,0 +1,211 @@
+ /**
+ * 
+ * File Name:  armVCM4P2_Huff_Tables_VLC.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File:        armVCM4P2_Huff_Tables_VLC.c
+ * Description: Contains all the Huffman tables used in MPEG4 codec
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+
+#include "armCOMM_Bitstream.h"
+
+
+
+
+// Contains optimized and Packed VLC tables with Last=0 and Last=1
+
+//              optimized Packed VLC table Entry Format 
+//              ---------------------------------------
+// 
+//        15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+//       +------------------------------------------------+
+//       |  Len   |       Run       |     Level    |L | 1 |
+//       +------------------------------------------------+
+//       |                Offset                      | 0 |
+//       +------------------------------------------------+
+// If the table entry is a leaf entry then bit 0 set:
+//    Len    = Number of bits overread  (0 to 7)  3 bits
+//    Run    = RunLength of the Symbol  (0 to 63) 6 bits
+//    Level  = Level of the Symbol      (0 to 31) 5 bits
+//    L      = Last Value of the Symbol (0 or 1)  1 bit
+//
+// If the table entry is an internal node then bit 0 is clear:
+//    Offset = Number of (16-bit) half words from the table
+//             start to the next table node
+//
+// The table is accessed by successive lookup up on the
+// next Step bits of the input bitstream until a leaf node
+// is obtained. The Step sizes are supplied to the VLD macro.
+
+// The VLC tables used for Intra and non inta coefficients in non Escape mode
+// contains symbols with both Last=0 and Last=1.
+// If a symbol is not found in the table it will be coded as 0xFFF
+ 
+
+const OMX_U16 armVCM4P2_InterVlcL0L1[200] = {
+    0x0020, 0x0108, 0x0148, 0x0170, 0x0178, 0x0180, 0x0188, 0x1b09,
+    0x4009, 0x4009, 0x4009, 0x4009, 0x2109, 0x2109, 0x0209, 0x0011,
+    0x0028, 0x0060, 0x00b8, 0x00e0, 0x0030, 0x0048, 0x0050, 0x0058,
+    0x3fff, 0x3fff, 0x0038, 0x0040, 0x2115, 0x2115, 0x201d, 0x201d,
+    0x2059, 0x2059, 0x2051, 0x2051, 0x1c0d, 0x1b0d, 0x1a0d, 0x190d,
+    0x0911, 0x0811, 0x0711, 0x0611, 0x0511, 0x0319, 0x0219, 0x0121,
+    0x0068, 0x0090, 0x3fff, 0x3fff, 0x0070, 0x0078, 0x0080, 0x0088,
+    0x2061, 0x2061, 0x2129, 0x2129, 0x3709, 0x3709, 0x3809, 0x3809,
+    0x3d0d, 0x3d0d, 0x3e0d, 0x3e0d, 0x3f0d, 0x3f0d, 0x200d, 0x200d,
+    0x0098, 0x00a0, 0x00a8, 0x00b0, 0x0131, 0x0221, 0x0419, 0x0519,
+    0x0619, 0x0a11, 0x1909, 0x1a09, 0x210d, 0x220d, 0x230d, 0x240d,
+    0x250d, 0x260d, 0x270d, 0x280d, 0x00c0, 0x00c8, 0x00d0, 0x00d8,
+    0x0049, 0x0041, 0x380d, 0x380d, 0x370d, 0x370d, 0x360d, 0x360d,
+    0x350d, 0x350d, 0x340d, 0x340d, 0x330d, 0x330d, 0x320d, 0x320d,
+    0x00e8, 0x00f0, 0x00f8, 0x0100, 0x310d, 0x310d, 0x2015, 0x2015,
+    0x3609, 0x3609, 0x3509, 0x3509, 0x3409, 0x3409, 0x3309, 0x3309,
+    0x3209, 0x3209, 0x3109, 0x3109, 0x0110, 0x0130, 0x0138, 0x0140,
+    0x0118, 0x0120, 0x0128, 0x100d, 0x3009, 0x3009, 0x2f09, 0x2f09,
+    0x2411, 0x2411, 0x2311, 0x2311, 0x2039, 0x2039, 0x2031, 0x2031,
+    0x0f0d, 0x0e0d, 0x0d0d, 0x0c0d, 0x0b0d, 0x0a0d, 0x090d, 0x0e09,
+    0x0d09, 0x0211, 0x0119, 0x0029, 0x0150, 0x0158, 0x0160, 0x0168,
+    0x280d, 0x280d, 0x270d, 0x270d, 0x260d, 0x260d, 0x250d, 0x250d,
+    0x2c09, 0x2c09, 0xb759, 0xb759, 0x2a09, 0x2a09, 0x2021, 0x2021,
+    0x040d, 0x030d, 0x0b35, 0x010d, 0x0909, 0x0809, 0x0709, 0x0609,
+    0x0111, 0x0019, 0x2509, 0x2509, 0x2409, 0x2409, 0x2309, 0x2309
+};
+
+
+const OMX_U16 armVCM4P2_IntraVlcL0L1[200] = {
+    0x0020, 0x0108, 0x0148, 0x0170, 0x0178, 0x0180, 0x0188, 0x0f09,
+    0x4009, 0x4009, 0x4009, 0x4009, 0x2011, 0x2011, 0x0109, 0x0019,
+    0x0028, 0x0060, 0x00b8, 0x00e0, 0x0030, 0x0048, 0x0050, 0x0058,
+    0x3fff, 0x3fff, 0x0038, 0x0040, 0x203d, 0x203d, 0x2035, 0x2035,
+    0x20b1, 0x20b1, 0x20a9, 0x20a9, 0x0215, 0x011d, 0x002d, 0x0d09,
+    0x0519, 0x0811, 0x0419, 0x0321, 0x0221, 0x0139, 0x00a1, 0x0099,
+    0x0068, 0x0090, 0x3fff, 0x3fff, 0x0070, 0x0078, 0x0080, 0x0088,
+    0x20b9, 0x20b9, 0x20c1, 0x20c1, 0x2141, 0x2141, 0x2911, 0x2911,
+    0x2315, 0x2315, 0x2415, 0x2415, 0x2f0d, 0x2f0d, 0x300d, 0x300d,
+    0x0098, 0x00a0, 0x00a8, 0x00b0, 0x00c9, 0x00d1, 0x00d9, 0x0149,
+    0x0619, 0x0151, 0x0229, 0x0719, 0x0e09, 0x0045, 0x0515, 0x0615,
+    0x110d, 0x120d, 0x130d, 0x140d, 0x00c0, 0x00c8, 0x00d0, 0x00d8,
+    0x0091, 0x0089, 0x2e0d, 0x2e0d, 0x2d0d, 0x2d0d, 0x2c0d, 0x2c0d,
+    0x2b0d, 0x2b0d, 0x2a0d, 0x2a0d, 0x2115, 0x2115, 0x2025, 0x2025,
+    0x00e8, 0x00f0, 0x00f8, 0x0100, 0x2c09, 0x2c09, 0x2b09, 0x2b09,
+    0x2711, 0x2711, 0x2611, 0x2611, 0x2511, 0x2511, 0x2319, 0x2319,
+    0x2219, 0x2219, 0x2131, 0x2131, 0x0110, 0x0130, 0x0138, 0x0140,
+    0x0118, 0x0120, 0x0128, 0x080d, 0x2129, 0x2129, 0x2081, 0x2081,
+    0x2411, 0x2411, 0x2079, 0x2079, 0x2071, 0x2071, 0x2069, 0x2069,
+    0x1bb5, 0x060d, 0x001d, 0xd3f9, 0x0909, 0x0809, 0x090d, 0x0311,
+    0x0121, 0x0061, 0x0059, 0x0051, 0x0150, 0x0158, 0x0160, 0x0168,
+    0x240d, 0x240d, 0x230d, 0x230d, 0x2609, 0x2609, 0x250d, 0x250d,
+    0x2709, 0x2709, 0x2211, 0x2211, 0x2119, 0x2119, 0x2049, 0x2049,
+    0x0015, 0x0509, 0x020d, 0x010d, 0x0409, 0x0309, 0x0041, 0x0039,
+    0x0111, 0x0031, 0x2209, 0x2209, 0x2029, 0x2029, 0x2021, 0x2021
+};
+
+const OMX_U16 armVCM4P2_aIntraDCLumaChromaIndex[64] = {
+    0x0020, 0x000b, 0x2009, 0x2009, 0x2007, 0x2007, 0x2001, 0x2001,
+    0x4005, 0x4005, 0x4005, 0x4005, 0x4003, 0x4003, 0x4003, 0x4003,
+    0x0028, 0x000f, 0x200d, 0x200d, 0x0030, 0x0013, 0x2011, 0x2011,
+    0x0038, 0x0017, 0x2015, 0x2015, 0x3fff, 0x3fff, 0x2019, 0x2019,
+
+	0x0020, 0x0009, 0x2007, 0x2007, 0x4005, 0x4005, 0x4005, 0x4005,
+    0x4003, 0x4003, 0x4003, 0x4003, 0x4001, 0x4001, 0x4001, 0x4001,
+    0x0028, 0x000d, 0x200b, 0x200b, 0x0030, 0x0011, 0x200f, 0x200f,
+    0x0038, 0x0015, 0x2013, 0x2013, 0x1fff, 0x0019, 0x2017, 0x2017
+};
+
+
+const OMX_U16 armVCM4P2_aVlcMVD[124] = {
+    0x0010, 0x00f0, 0x0043, 0x003f, 0x4041, 0x4041, 0x4041, 0x4041,
+    0x0018, 0x00d8, 0x0047, 0x003b, 0x0020, 0x0080, 0x00a8, 0x00d0,
+    0x0028, 0x0048, 0x0070, 0x0078, 0x1fff, 0x0030, 0x0038, 0x0040,
+    0x0081, 0x0001, 0x007f, 0x0003, 0x207d, 0x207d, 0x2005, 0x2005,
+    0x207b, 0x207b, 0x2007, 0x2007, 0x0050, 0x0058, 0x0060, 0x0068,
+    0x2079, 0x2079, 0x2009, 0x2009, 0x2077, 0x2077, 0x200b, 0x200b,
+    0x2075, 0x2075, 0x200d, 0x200d, 0x2073, 0x2073, 0x200f, 0x200f,
+    0x0071, 0x0011, 0x006f, 0x0013, 0x006d, 0x0015, 0x006b, 0x0017,
+    0x0088, 0x0090, 0x0098, 0x00a0, 0x0069, 0x0019, 0x0067, 0x001b,
+    0x0065, 0x001d, 0x0063, 0x001f, 0x0061, 0x0021, 0x005f, 0x0023,
+    0x005d, 0x0025, 0x005b, 0x0027, 0x00b0, 0x00b8, 0x00c0, 0x00c8,
+    0x0059, 0x0029, 0x0057, 0x002b, 0x2055, 0x2055, 0x202d, 0x202d,
+    0x2053, 0x2053, 0x202f, 0x202f, 0x2051, 0x2051, 0x2031, 0x2031,
+    0x204f, 0x204f, 0x2033, 0x2033, 0x00e0, 0x00e8, 0x0049, 0x0039,
+    0x204d, 0x204d, 0x2035, 0x2035, 0x204b, 0x204b, 0x2037, 0x2037,
+    0x2045, 0x2045, 0x203d, 0x203d
+};
+
+/* LMAX table for non Inter (Last == 0 and Last=1)
+   Level - 1 Indexed
+   padded armVCM4P2_InterL0L1LMAX[27-31] with zeros to acess entries for Last=1 effectively
+
+*/
+const OMX_U8 armVCM4P2_InterL0L1LMAX[73] = 
+{
+   12,  6,  4,  3,  3,  3,  3,  2, 
+    2,  2,  2,  1,  1,  1,  1,  1,
+    1,  1,  1,  1,  1,  1,  1,  1,
+    1,  1,  1,  0,  0,  0,  0,  0,
+    3,  2,  1,  1,  1,  1,  1,  1, 
+	1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,
+	1
+};
+
+/* RMAX table for non Inter (Last == 0 and Last=1)
+   Level - 1 Indexed 
+ padded armVCM4P2_InterL0L1RMAX[12-31] with zeros to access entries for Last=1 table effectively */
+
+
+const OMX_U8 armVCM4P2_InterL0L1RMAX[35] = 
+{
+   26, 10,  6,  2,  1,  1,   
+    0,  0,  0,  0,  0,  0,
+	0,	0,	0,	0,	0,	0,
+	0,	0,	0,	0,	0,	0,
+	0,	0,	0,	0,
+    0,  0,  0,  0,  40,  1,  0
+};
+
+/* LMAX table for non Intra (Last == 0 and Last=1)
+   Level - 1 Indexed
+   padded armVCM4P2_IntraL0L1LMAX[15-31] with zeros to acess entries for Last=1 effectively
+
+*/
+const OMX_U8 armVCM4P2_IntraL0L1LMAX[53] = 
+{
+   27, 10,  5,  4,  3,  3,  3,  
+    3,  2,  2,  1,  1,  1,  1,  1,	0,
+	0,	0,	0,	0,	0,	0,	0,	0,
+	0,	0,	0,	0,	0,	0,	0,	0,
+
+	8,  3,  2,  2,  2,  2,  2,  1, 
+	1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1
+};
+
+
+/* RMAX table for non Inter (Last == 0 and Last=1)
+   Level - 1 Indexed 
+ padded armVCM4P2_IntraL0L1RMAX[27-31] with zeros to access entries for Last=1 table effectively */
+
+
+const OMX_U8 armVCM4P2_IntraL0L1RMAX[40] =
+{
+   14,  9,  7,  3,  2,  1,	1,  
+    1,  1,  1,  0,  0,  0, 	0,  
+    0,  0,  0,  0,  0,  0,  0,  
+    0,  0,  0,  0,  0,  0,  0,
+	0,	0,	0,	0,
+	
+	20,  6,  1,  0,  0,  0,  0,  0
+
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c
new file mode 100755
index 0000000..6948f80
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c
@@ -0,0 +1,75 @@
+ /**
+ * 
+ * File Name:  armVCM4P2_Lookup_Tables.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File:        armVCM4P2_Lookup_Tables.c
+ * Description: Contains all the Lookup tables used in MPEG4 codec
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+
+    /* * Table Entries contain Dc Scaler values
+       * armVCM4P2_DCScaler[i]= 8           for i=1  to  4 and i=33 to 36
+       *                      = 2*i         for i=5  to  8
+       *                      = i+8         for i=9  to  25
+       *                      = 2*i-16      for i=26 to  31
+       *                      = (i-32+13)/2 for i=37 to  59
+       *                      = i-6-32      for i=60 to  63
+       *                      = 255         for i=0 and i=32
+       */
+       
+const OMX_U8 armVCM4P2_DCScaler[64]={
+	0xff, 0x8,  0x8,  0x8,  0x8,  0xa,  0xc,  0xe,  
+    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 
+    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+    0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e,
+    0xff, 0x8,  0x8,  0x8,  0x8,  0x9,  0x9,  0xa,  
+    0xa,  0xb,  0xb,  0xc,  0xc,  0xd,  0xd,  0xe,  
+    0xe,  0xf,  0xf,  0x10, 0x10, 0x11, 0x11, 0x12, 
+    0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+
+};
+
+              
+     /*  Table Entries Contain reciprocal of 1 to 63
+      *  armVCM4P2_Reciprocal_QP_S16[i]=round(32767/i)
+      *  armVCM4P2_Reciprocal_QP_S16[0]= 0
+      */
+
+const OMX_S16 armVCM4P2_Reciprocal_QP_S16[64]={
+	0x0000,0x7fff,0x4000,0x2aaa,0x2000,0x1999,0x1555,0x1249,
+    0x1000,0x0e39,0x0ccd,0x0ba3,0x0aab,0x09d9,0x0925,0x0888,
+    0x0800,0x0787,0x071c,0x06bd,0x0666,0x0618,0x05d1,0x0591,
+    0x0555,0x051f,0x04ec,0x04be,0x0492,0x046a,0x0444,0x0421,
+    0x0400,0x03e1,0x03c4,0x03a8,0x038e,0x0376,0x035e,0x0348,
+    0x0333,0x031f,0x030c,0x02fa,0x02e9,0x02d8,0x02c8,0x02b9,
+    0x02ab,0x029d,0x028f,0x0282,0x0276,0x026a,0x025f,0x0254,
+    0x0249,0x023f,0x0235,0x022b,0x0222,0x0219,0x0211,0x0208
+	   
+};
+     
+      /* Table Entries Contain reciprocal of 1 to 63
+       * armVCM4P2_Reciprocal_QP_S32[i]=round(131071/i)
+       * armVCM4P2_Reciprocal_QP_S32[0]= 0
+       */
+
+const OMX_S32 armVCM4P2_Reciprocal_QP_S32[64]={
+	0x00000000,0x0001ffff,0x00010000,0x0000aaaa, 0x00008000, 0x00006666, 0x00005555, 0x00004924,
+    0x00004000,0x000038e3,0x00003333,0x00002e8c, 0x00002aab, 0x00002762, 0x00002492, 0x00002222,
+    0x00002000,0x00001e1e,0x00001c72,0x00001af2, 0x0000199a, 0x00001861, 0x00001746, 0x00001643,
+    0x00001555,0x0000147b,0x000013b1,0x000012f6, 0x00001249, 0x000011a8, 0x00001111, 0x00001084,
+    0x00001000,0x00000f84,0x00000f0f,0x00000ea1, 0x00000e39, 0x00000dd6, 0x00000d79, 0x00000d21,
+    0x00000ccd,0x00000c7d,0x00000c31,0x00000be8, 0x00000ba3, 0x00000b61, 0x00000b21, 0x00000ae5,
+    0x00000aab,0x00000a73,0x00000a3d,0x00000a0a, 0x000009d9, 0x000009a9, 0x0000097b, 0x0000094f,
+    0x00000925,0x000008fb,0x000008d4,0x000008ae, 0x00000889, 0x00000865, 0x00000842, 0x00000820
+	
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s
new file mode 100755
index 0000000..44f2460
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s
@@ -0,0 +1,104 @@
+;//
+;// 
+;// File Name:  armVCM4P2_SetPredDir_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+; **
+; * Function: armVCM4P2_SetPredDir
+; *
+; * Description:
+; * Performs detecting the prediction direction
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] blockIndex  block index indicating the component type and
+; *                          position as defined in subclause 6.1.3.8, of ISO/IEC
+; *                          14496-2. Furthermore, indexes 6 to 9 indicate the
+; *                          alpha blocks spatially corresponding to luminance
+; *                          blocks 0 to 3 in the same macroblock.
+; * [in] pCoefBufRow pointer to the coefficient row buffer
+; * [in] pQpBuf      pointer to the quantization parameter buffer
+; * [out]predQP      quantization parameter of the predictor block
+; * [out]predDir     indicates the prediction direction which takes one
+; *                  of the following values:
+; *                  OMX_VC_HORIZONTAL    predict horizontally
+; *                  OMX_VC_VERTICAL      predict vertically
+; *
+; * Return Value:
+; * Standard OMXResult result. See enumeration for possible result codes.
+; *
+; */
+
+       INCLUDE omxtypes_s.h
+       INCLUDE armCOMM_s.h
+       INCLUDE omxVC_s.h
+
+
+       M_VARIANTS ARM1136JS
+
+
+       IF ARM1136JS
+ 
+;// Input Arguments
+BlockIndex         RN 0
+pCoefBufRow        RN 1
+pCoefBufCol        RN 2
+predDir            RN 3
+predQP             RN 4
+pQpBuf             RN 5
+
+;// Local Variables
+
+Return             RN 0
+blockDCLeft        RN 6  
+blockDCTop         RN 7
+blockDCTopLeft     RN 8
+temp1              RN 9
+temp2              RN 14
+
+       M_START    armVCM4P2_SetPredDir,r9
+
+       M_ARG       ppredQP,4
+       M_ARG       ppQpBuf,4
+    
+       LDRH        blockDCTopLeft,[pCoefBufRow,#-16]
+       LDRH        blockDCLeft,[pCoefBufCol]
+       
+       TEQ         BlockIndex,#3
+       LDREQH      blockDCTop,[pCoefBufCol,#-16]
+       LDRNEH      blockDCTop,[pCoefBufRow]
+             
+       SUBS        temp1,blockDCLeft,blockDCTopLeft
+       RSBLT       temp1,temp1,#0
+       SUBS        temp2,blockDCTopLeft,blockDCTop
+       RSBLT       temp2,temp2,#0
+      
+       M_LDR       pQpBuf,ppQpBuf
+       M_LDR       predQP,ppredQP
+       CMP         temp1,temp2
+       MOV         temp2,#OMX_VC_VERTICAL
+       LDRLTB      temp1,[pQpBuf,#1]
+       STRLT       temp2,[predDir]
+       STRLT       temp1,[predQP]
+       MOV         temp2,#OMX_VC_HORIZONTAL           
+       LDRGEB      temp1,[pQpBuf]
+       STRGE       temp2,[predDir]
+       MOV         Return,#OMX_Sts_NoErr
+       STRGE       temp1,[predQP] 
+
+         
+    
+       M_END
+ 
+       ENDIF
+
+       END    
+    
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
new file mode 100755
index 0000000..21fa715
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
@@ -0,0 +1,61 @@
+/**
+ * 
+ * File Name:  armVCM4P2_Zigzag_Tables.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * File:        armVCM4P2_ZigZag_Tables.c
+ * Description: Contains the zigzag tables
+ *
+ */
+
+#include "omxtypes.h"
+
+/* Contains Double the values in the reference Zigzag Table
+ * Contains Classical,Vetical and Horizontal Zigzagscan tables in one array  
+ */
+
+const OMX_U8 armVCM4P2_aClassicalZigzagScan [192] = 
+{
+     0,  2,  16, 32,  18,  4,  6, 20,
+    34, 48, 64, 50, 36, 22,  8,  10,
+    24, 38, 52, 66, 80, 96, 82, 68,
+    54, 40, 26,  12,  14, 28, 42, 56, 
+    70, 84, 98, 112, 114, 100, 86, 72,
+    58, 44, 30, 46, 60, 74, 88, 102,
+    116, 118, 104, 90, 76, 62, 78, 92,
+    106, 120, 122, 104, 94, 110, 124, 126,
+
+	0,  16, 32, 48,  2,  18,  4, 20,
+    34, 50, 64, 80, 96, 112, 114, 98,
+    82, 66, 52, 36,  6, 22,  8, 24,
+    38, 54, 68, 84, 100, 116, 70, 86,
+    102, 118, 40, 56,  10, 26,  12, 28,
+    42, 58, 72, 88, 104, 120, 74, 90, 
+    106, 122, 44, 60,  14, 30, 46, 62,
+    76, 92, 108, 124, 78, 94, 110, 126,
+
+    0,  2,  4,  6,  16,  18, 32, 34,
+    20, 22,  8,  10,  12,  14, 30, 28,
+    26, 24, 38, 36, 48, 50, 64, 66,
+    52, 54, 40, 42, 44, 46, 56, 58,
+    60, 62, 68, 70, 80, 82, 96, 98,
+    84, 86, 72, 74, 76, 78, 88, 90, 
+    92, 94, 100, 102, 112, 114, 116, 118,
+    104, 106, 108, 110, 120, 122, 124, 126
+
+
+};
+
+
+
+
+
+/* End of file */
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
new file mode 100755
index 0000000..796ad6e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
@@ -0,0 +1,102 @@
+/**
+ * 
+ * File Name:  omxVCM4P2_DecodeBlockCoef_Inter.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Description: 
+ * Contains modules for inter reconstruction
+ * 
+ */
+ 
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+
+
+/**
+ * Function: omxVCM4P2_DecodeBlockCoef_Inter
+ *
+ * Description:
+ * Decodes the INTER block coefficients. Inverse quantization, inversely zigzag
+ * positioning and IDCT, with appropriate clipping on each step, are performed
+ * on the coefficients. The results (residuals) are placed in a contiguous array
+ * of 64 elements. For INTER block, the output buffer holds the residuals for
+ * further reconstruction.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte in
+ *								the bit stream buffer. There is no boundary
+ *								check for the bit stream buffer.
+ * [in]	pBitOffset		pointer to the bit position in the byte pointed
+ *								to by *ppBitStream. *pBitOffset is valid within
+ *								[0-7]
+ * [in]	QP				quantization parameter
+ * [in] shortVideoHeader    a flag indicating presence of short_video_header;
+ *                           shortVideoHeader==1 indicates using quantization method defined in short
+ *                           video header mode, and shortVideoHeader==0 indicates normail quantization method.
+ * [out] ppBitStream 	*ppBitStream is updated after the block is decoded, so that it points to the
+ *                      current byte in the bit stream buffer.
+ * [out] pBitOffset		*pBitOffset is updated so that it points to the current bit position in the
+ *                      byte pointed by *ppBitStream
+ * [out] pDst			pointer to the decoded residual buffer (a contiguous array of 64 elements of
+ *                      OMX_S16 data type). Must be 16-byte aligned.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ *   - At least one of the following pointers is Null: ppBitStream, *ppBitStream, pBitOffset , pDst
+ *   - At least one of the below case:
+ *   - *pBitOffset exceeds [0,7], QP <= 0;
+ *	 - pDst not 16-byte aligned
+ * OMX_Sts_Err - status error
+ *
+ */
+OMXResult omxVCM4P2_DecodeBlockCoef_Inter(
+     const OMX_U8 ** ppBitStream,
+     OMX_INT * pBitOffset,
+     OMX_S16 * pDst,
+     OMX_INT QP,
+     OMX_INT shortVideoHeader
+)
+{
+    /* 64 elements are needed but to align it to 16 bytes need
+    15 more elements of padding */
+    OMX_S16 tempBuf[79];
+    OMX_S16 *pTempBuf1;
+    OMXResult errorCode;
+    /* Aligning the local buffers */
+    pTempBuf1 = armAlignTo16Bytes(tempBuf);
+    
+    
+    /* VLD and zigzag */
+    errorCode = omxVCM4P2_DecodeVLCZigzag_Inter(ppBitStream, pBitOffset, 
+                                        pTempBuf1,shortVideoHeader);
+    armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+    
+    /* Dequantization */
+    errorCode = omxVCM4P2_QuantInvInter_I(
+     pTempBuf1,
+     QP);
+    armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+    
+    /* Inverse transform */
+    errorCode = omxVCM4P2_IDCT8x8blk(pTempBuf1, pDst);
+    armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+	    
+    return OMX_Sts_NoErr;
+}
+
+/* End of file */
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
new file mode 100755
index 0000000..b28657c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
@@ -0,0 +1,214 @@
+/**
+ * 
+ * File Name:  omxVCM4P2_DecodeBlockCoef_Intra.c
+ * OpenMAX DL: v1.0.2
+ * Revision:   12290
+ * Date:       Wednesday, April 9, 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Description: 
+ * Contains modules for intra reconstruction
+ * 
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/* Function for saturating 16 bit values to the [0,255] range and  */
+/* writing out as 8 bit values.  Does 64 entries                   */
+void armVCM4P2_Clip8(OMX_S16 *pSrc, OMX_U8 *pDst, OMX_INT dstStep );
+
+
+
+/**
+ * Function: omxVCM4P2_DecodeBlockCoef_Intra
+ *
+ * Description:
+ * Decodes the INTRA block coefficients. Inverse quantization, inversely zigzag
+ * positioning, and IDCT, with appropriate clipping on each step, are performed
+ * on the coefficients. The results are then placed in the output frame/plane on
+ * a pixel basis. For INTRA block, the output values are clipped to [0, 255] and
+ * written to corresponding block buffer within the destination plane.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in]	ppBitStream		pointer to the pointer to the current byte in
+ *								the bit stream buffer. There is no boundary
+ *								check for the bit stream buffer.
+ * [in]	pBitOffset		pointer to the bit position in the byte pointed
+ *								to by *ppBitStream. *pBitOffset is valid within
+ *								[0-7].
+ * [in]	step			width of the destination plane
+ * [in/out]	pCoefBufRow		[in]  pointer to the coefficient row buffer
+ *                        [out] updated coefficient rwo buffer
+ * [in/out]	pCoefBufCol		[in]  pointer to the coefficient column buffer
+ *                        [out] updated coefficient column buffer
+ * [in]	curQP			quantization parameter of the macroblock which
+ *								the current block belongs to
+ * [in]	pQpBuf		 Pointer to a 2-element QP array. pQpBuf[0] holds the QP of the 8x8 block left to
+ *                   the current block(QPa). pQpBuf[1] holds the QP of the 8x8 block just above the
+ *                   current block(QPc).
+ *                   Note, in case the corresponding block is out of VOP bound, the QP value will have
+ *                   no effect to the intra-prediction process. Refer to subclause  "7.4.3.3 Adaptive
+ *                   ac coefficient prediction" of ISO/IEC 14496-2(MPEG4 Part2) for accurate description.
+ * [in]	blockIndex		block index indicating the component type and
+ *								position as defined in subclause 6.1.3.8,
+ *								Figure 6-5 of ISO/IEC 14496-2. 
+ * [in]	intraDCVLC		a code determined by intra_dc_vlc_thr and QP.
+ *								This allows a mechanism to switch between two VLC
+ *								for coding of Intra DC coefficients as per Table
+ *								6-21 of ISO/IEC 14496-2. 
+ * [in]	ACPredFlag		a flag equal to ac_pred_flag (of luminance) indicating
+ *								if the ac coefficients of the first row or first
+ *								column are differentially coded for intra coded
+ *								macroblock.
+ * [in] shortVideoHeader    a flag indicating presence of short_video_header;
+ *                           shortVideoHeader==1 selects linear intra DC mode,
+ *							and shortVideoHeader==0 selects nonlinear intra DC mode.
+ * [out]	ppBitStream		*ppBitStream is updated after the block is
+ *								decoded, so that it points to the current byte
+ *								in the bit stream buffer
+ * [out]	pBitOffset		*pBitOffset is updated so that it points to the
+ *								current bit position in the byte pointed by
+ *								*ppBitStream
+ * [out]	pDst			pointer to the block in the destination plane.
+ *								pDst should be 16-byte aligned.
+ * [out]	pCoefBufRow		pointer to the updated coefficient row buffer.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ *   -	At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset,
+ *                                                      pCoefBufRow, pCoefBufCol, pQPBuf, pDst.
+ *      or
+ *   -  At least one of the below case: *pBitOffset exceeds [0,7], curQP exceeds (1, 31),
+ *      blockIndex exceeds [0,9], step is not the multiple of 8, intraDCVLC is zero while
+ *      blockIndex greater than 5.
+ *      or
+ *   -	pDst is not 16-byte aligned
+ * OMX_Sts_Err - status error
+ *
+ */
+
+OMXResult omxVCM4P2_DecodeBlockCoef_Intra(
+     const OMX_U8 ** ppBitStream,
+     OMX_INT *pBitOffset,
+     OMX_U8 *pDst,
+     OMX_INT step,
+     OMX_S16 *pCoefBufRow,
+     OMX_S16 *pCoefBufCol,
+     OMX_U8 curQP,
+     const OMX_U8 *pQPBuf,
+     OMX_INT blockIndex,
+     OMX_INT intraDCVLC,
+     OMX_INT ACPredFlag,
+	 OMX_INT shortVideoHeader
+ )
+{
+    OMX_S16 tempBuf1[79], tempBuf2[79];
+    OMX_S16 *pTempBuf1, *pTempBuf2;
+    OMX_INT predDir, predACDir;
+    OMX_INT  predQP;
+    OMXVCM4P2VideoComponent videoComp;
+    OMXResult errorCode;
+    
+    
+    /* Aligning the local buffers */
+    pTempBuf1 = armAlignTo16Bytes(tempBuf1);
+    pTempBuf2 = armAlignTo16Bytes(tempBuf2);
+    
+    /* Setting the AC prediction direction and prediction direction */
+    armVCM4P2_SetPredDir(
+        blockIndex,
+        pCoefBufRow,
+        pCoefBufCol,
+        &predDir,
+        &predQP,
+        pQPBuf);
+
+    predACDir = predDir;
+
+    
+    if (ACPredFlag == 0)
+    {
+        predACDir = OMX_VC_NONE;
+    }
+
+    /* Setting the videoComp */
+    if (blockIndex <= 3)
+    {
+        videoComp = OMX_VC_LUMINANCE;
+    }
+    else
+    {
+        videoComp = OMX_VC_CHROMINANCE;
+    }
+    
+
+    /* VLD and zigzag */
+    if (intraDCVLC == 1)
+    {
+        errorCode = omxVCM4P2_DecodeVLCZigzag_IntraDCVLC(
+            ppBitStream,
+            pBitOffset,
+            pTempBuf1,
+            predACDir,
+            shortVideoHeader,
+            videoComp);
+        armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+    }
+    else
+    {
+        errorCode = omxVCM4P2_DecodeVLCZigzag_IntraACVLC(
+            ppBitStream,
+            pBitOffset,
+            pTempBuf1,
+            predACDir,
+            shortVideoHeader);
+        armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+    }
+
+    /* AC DC prediction */
+    errorCode = omxVCM4P2_PredictReconCoefIntra(
+        pTempBuf1,
+        pCoefBufRow,
+        pCoefBufCol,
+        curQP,
+        predQP,
+        predDir,
+        ACPredFlag,
+        videoComp);
+    armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+    
+    /* Dequantization */
+    errorCode = omxVCM4P2_QuantInvIntra_I(
+     pTempBuf1,
+     curQP,
+     videoComp,
+     shortVideoHeader);
+    armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+    
+    /* Inverse transform */
+    errorCode = omxVCM4P2_IDCT8x8blk (pTempBuf1, pTempBuf2);
+    armRetDataErrIf((errorCode != OMX_Sts_NoErr), errorCode);
+    
+    /* Placing the linear array into the destination plane and clipping
+       it to 0 to 255 */
+    
+	armVCM4P2_Clip8(pTempBuf2,pDst,step);
+	
+	
+    return OMX_Sts_NoErr;
+}
+
+/* End of file */
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
new file mode 100755
index 0000000..cc16f5a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
@@ -0,0 +1,364 @@
+; **********
+; * 
+; * File Name:  omxVCM4P2_DecodePadMV_PVOP_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; * 
+; **
+; * Function: omxVCM4P2_DecodePadMV_PVOP
+; *
+; * Description:
+; * Decodes and pads four motion vectors of the non-intra macroblock in P-VOP.
+; * The motion vector padding process is specified in subclause 7.6.1.6 of
+; * ISO/IEC 14496-2.
+; *
+; * Remarks:
+; *
+; *
+; * Parameters:
+; * [in]    ppBitStream        pointer to the pointer to the current byte in
+; *                            the bit stream buffer
+; * [in]    pBitOffset         pointer to the bit position in the byte pointed
+; *                            to by *ppBitStream. *pBitOffset is valid within
+; *                            [0-7].
+; * [in]    pSrcMVLeftMB       pointers to the motion vector buffers of the
+; *                           macroblocks specially at the left side of the current macroblock
+; *                     respectively.
+; * [in]    pSrcMVUpperMB      pointers to the motion vector buffers of the
+; *                     macroblocks specially at the upper side of the current macroblock
+; *                     respectively.
+; * [in]    pSrcMVUpperRightMB pointers to the motion vector buffers of the
+; *                     macroblocks specially at the upper-right side of the current macroblock
+; *                     respectively.
+; * [in]    fcodeForward       a code equal to vop_fcode_forward in MPEG-4
+; *                     bit stream syntax
+; * [in]    MBType         the type of the current macroblock. If MBType
+; *                     is not equal to OMX_VC_INTER4V, the destination
+; *                     motion vector buffer is still filled with the
+; *                     same decoded vector.
+; * [out]   ppBitStream         *ppBitStream is updated after the block is decoded,
+; *                     so that it points to the current byte in the bit
+; *                     stream buffer
+; * [out]   pBitOffset         *pBitOffset is updated so that it points to the
+; *                     current bit position in the byte pointed by
+; *                     *ppBitStream
+; * [out]   pDstMVCurMB         pointer to the motion vector buffer of the current
+; *                     macroblock which contains four decoded motion vectors
+; *
+; * Return Value:
+; * OMX_Sts_NoErr -no error
+; * 
+; *                     
+; * OMX_Sts_Err - status error
+; *
+; *
+     
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        INCLUDE armCOMM_BitDec_s.h
+        INCLUDE omxVC_s.h
+        
+       M_VARIANTS ARM1136JS
+       
+                
+
+
+        IF ARM1136JS
+
+;//Input Arguments
+
+ppBitStream           RN 0
+pBitOffset            RN 1
+pSrcMVLeftMB          RN 2
+pSrcMVUpperMB         RN 3
+pSrcMVUpperRightMB    RN 4
+pDstMVCurMB           RN 5
+fcodeForward          RN 6
+MBType                RN 7
+
+;//Local Variables
+
+zero                  RN 4
+one                   RN 4
+scaleFactor           RN 1
+
+
+Return                RN 0
+
+VlcMVD                RN 0
+index                 RN 4
+Count                 RN 7
+
+mvHorData             RN 4
+mvHorResidual         RN 0
+
+mvVerData             RN 4             
+mvVerResidual         RN 0
+
+temp                  RN 1
+
+temp1                 RN 3
+High                  RN 4
+Low                   RN 2
+Range                 RN 1
+
+BlkCount              RN 14
+
+diffMVdx              RN 0
+diffMVdy              RN 1
+
+;// Scratch Registers
+
+RBitStream            RN 8
+RBitCount             RN 9
+RBitBuffer            RN 10
+
+T1                    RN 11
+T2                    RN 12
+LR                    RN 14
+
+       IMPORT          armVCM4P2_aVlcMVD
+       IMPORT          omxVCM4P2_FindMVpred
+
+       ;// Allocate stack memory        
+       
+       M_ALLOC4        ppDstMVCurMB,4
+       M_ALLOC4        pDstMVPredME,4
+       M_ALLOC4        pBlkCount,4
+       
+       M_ALLOC4        pppBitStream,4
+       M_ALLOC4        ppBitOffset,4
+       M_ALLOC4        ppSrcMVLeftMB,4
+       M_ALLOC4        ppSrcMVUpperMB,4
+       
+       M_ALLOC4        pdiffMVdx,4
+       M_ALLOC4        pdiffMVdy,4
+       M_ALLOC4        pHigh,4
+       
+              
+
+
+       M_START   omxVCM4P2_DecodePadMV_PVOP,r11
+       
+       M_ARG           pSrcMVUpperRightMBonStack,4           ;// pointer to  pSrcMVUpperRightMB on stack
+       M_ARG           pDstMVCurMBonStack,4                  ;// pointer to pDstMVCurMB on stack
+       M_ARG           fcodeForwardonStack,4                 ;// pointer to fcodeForward on stack 
+       M_ARG           MBTypeonStack,4                       ;// pointer to MBType on stack
+
+      
+       
+       
+       
+       ;// Initializing the BitStream Macro
+
+       M_BD_INIT0      ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount
+       M_LDR           MBType,MBTypeonStack                  ;// Load MBType from stack
+       M_LDR           pDstMVCurMB,pDstMVCurMBonStack        ;// Load pDstMVCurMB from stack
+       MOV             zero,#0
+
+       TEQ             MBType,#OMX_VC_INTRA                  ;// Check if MBType=OMX_VC_INTRA
+       TEQNE           MBType,#OMX_VC_INTRA_Q                ;// check if MBType=OMX_VC_INTRA_Q
+       STREQ           zero,[pDstMVCurMB]
+       M_BD_INIT1      T1, T2, T2
+       STREQ           zero,[pDstMVCurMB,#4]
+       M_BD_INIT2      T1, T2, T2
+       STREQ           zero,[pDstMVCurMB,#4]
+       MOVEQ           Return,#OMX_Sts_NoErr
+       MOV             BlkCount,#0
+       STREQ           zero,[pDstMVCurMB,#4]
+       
+       BEQ             ExitOK
+
+       TEQ             MBType,#OMX_VC_INTER4V                ;// Check if MBType=OMX_VC_INTER4V
+       TEQNE           MBType,#OMX_VC_INTER4V_Q              ;// Check if MBType=OMX_VC_INTER4V_Q
+       MOVEQ           Count,#4
+
+       TEQ             MBType,#OMX_VC_INTER                  ;// Check if MBType=OMX_VC_INTER
+       TEQNE           MBType,#OMX_VC_INTER_Q                ;// Check if MBType=OMX_VC_INTER_Q
+       MOVEQ           Count,#1
+       
+       M_LDR           fcodeForward,fcodeForwardonStack      ;// Load fcodeForward  from stack
+
+       ;// Storing the values temporarily on stack
+
+       M_STR           ppBitStream,pppBitStream              
+       M_STR           pBitOffset,ppBitOffset
+            
+
+       SUB             temp,fcodeForward,#1                  ;// temp=fcodeForward-1
+       MOV             one,#1
+       M_STR           pSrcMVLeftMB,ppSrcMVLeftMB
+       LSL             scaleFactor,one,temp                  ;// scaleFactor=1<<(fcodeForward-1)
+       M_STR           pSrcMVUpperMB,ppSrcMVUpperMB
+       LSL             scaleFactor,scaleFactor,#5            
+       M_STR           scaleFactor,pHigh                     ;// [pHigh]=32*scaleFactor
+              
+       ;// VLD Decoding
+
+
+Loop
+
+       LDR             VlcMVD, =armVCM4P2_aVlcMVD        ;// Load the optimized MVD VLC table
+
+       ;// Horizontal Data and Residual calculation
+
+       LDR             temp,=0xFFF                           
+       M_BD_VLD        index,T1,T2,VlcMVD,3,2                ;// variable lenght decoding using the macro
+      
+       TEQ             index,temp
+       BEQ             ExitError                             ;// Exit with an Error Message if the decoded symbol is an invalied symbol 
+       
+       SUB             mvHorData,index,#32                   ;// mvHorData=index-32             
+       MOV             mvHorResidual,#1                      ;// mvHorResidual=1
+       CMP             fcodeForward,#1
+       TEQNE           mvHorData,#0
+       MOVEQ           diffMVdx,mvHorData                    ;// if scaleFactor=1(fcodeForward=1) or mvHorData=0 diffMVdx=mvHorData         
+       BEQ             VerticalData
+       
+       SUB             temp,fcodeForward,#1
+       M_BD_VREAD8     mvHorResidual,temp,T1,T2              ;// get mvHorResidual from bitstream if fcodeForward>1 and mvHorData!=0              
+       
+       CMP             mvHorData,#0
+       RSBLT           mvHorData,mvHorData,#0                ;// mvHorData=abs(mvHorData)
+       SUB             mvHorResidual,mvHorResidual,fcodeForward
+       SMLABB          diffMVdx,mvHorData,fcodeForward,mvHorResidual ;// diffMVdx=abs(mvHorData)*fcodeForward+mvHorResidual-fcodeForward
+       ADD             diffMVdx,diffMVdx,#1
+       RSBLT           diffMVdx,diffMVdx,#0
+       
+       ;// Vertical Data and Residual calculation
+
+VerticalData
+
+       M_STR           diffMVdx,pdiffMVdx                    ;// Store the diffMVdx on stack
+       LDR             VlcMVD, =armVCM4P2_aVlcMVD        ;// Loading the address of optimized VLC tables
+
+       LDR             temp,=0xFFF
+       M_BD_VLD        index,T1,T2,VlcMVD,3,2                ;// VLC decoding using the macro
+       
+       TEQ             index,temp
+       BEQ             ExitError                             ;// Exit with an Error Message if an Invalied Symbol occurs
+       
+       SUB             mvVerData,index,#32                   ;// mvVerData=index-32             
+       MOV             mvVerResidual,#1     
+       CMP             fcodeForward,#1
+       TEQNE           mvVerData,#0
+       MOVEQ           diffMVdy,mvVerData                    ;// diffMVdy = mvVerData if scaleFactor=1(fcodeForward=1) or mvVerData=0
+       BEQ             FindMVPred
+
+       SUB             temp,fcodeForward,#1
+       M_BD_VREAD8     mvVerResidual,temp,T1,T2              ;// Get mvVerResidual from bit stream if fcodeForward>1 and mnVerData!=0
+             
+
+       CMP             mvVerData,#0
+       RSBLT           mvVerData,mvVerData,#0
+       SUB             mvVerResidual,mvVerResidual,fcodeForward
+       SMLABB          diffMVdy,mvVerData,fcodeForward,mvVerResidual ;// diffMVdy=abs(mvVerData)*fcodeForward+mvVerResidual-fcodeForward
+       ADD             diffMVdy,diffMVdy,#1
+       RSBLT           diffMVdy,diffMVdy,#0
+
+       ;//Calling the Function omxVCM4P2_FindMVpred
+        
+FindMVPred
+
+       M_STR           diffMVdy,pdiffMVdy
+       ADD             temp,pDstMVCurMB,BlkCount,LSL #2      ;// temp=pDstMVCurMB[BlkCount]
+       M_STR           temp,ppDstMVCurMB                     ;// store temp on stack for passing as an argument to FindMVPred
+       
+       MOV             temp,#0
+       M_STR           temp,pDstMVPredME                     ;// Pass pDstMVPredME=NULL as an argument         
+       M_STR           BlkCount,pBlkCount                    ;// Passs BlkCount as Argument through stack
+
+       MOV             temp,pSrcMVLeftMB                     ;// temp (RN 1)=pSrcMVLeftMB
+       M_LDR           pSrcMVUpperRightMB,pSrcMVUpperRightMBonStack
+       MOV             pSrcMVLeftMB,pSrcMVUpperMB            ;// pSrcMVLeftMB ( RN 2) = pSrcMVUpperMB
+       MOV             ppBitStream,pDstMVCurMB               ;// ppBitStream  ( RN 0) = pDstMVCurMB
+       MOV             pSrcMVUpperMB,pSrcMVUpperRightMB      ;// pSrcMVUpperMB( RN 3) = pSrcMVUpperRightMB      
+       BL              omxVCM4P2_FindMVpred              ;// Branch to subroutine omxVCM4P2_FindMVpred
+
+       ;// Store Horizontal Motion Vector
+     
+       M_LDR           BlkCount,pBlkCount                    ;// Load BlkCount from stack
+       M_LDR           High,pHigh                            ;// High=32*scaleFactor
+       LSL             temp1,BlkCount,#2                     ;// temp=BlkCount*4
+       M_LDR           diffMVdx,pdiffMVdx                    ;// Laad diffMVdx
+       
+       LDRSH           temp,[pDstMVCurMB,temp1]              ;// temp=pDstMVCurMB[BlkCount]
+       
+       
+       RSB             Low,High,#0                           ;// Low = -32*scaleFactor
+       ADD             diffMVdx,temp,diffMVdx                ;// diffMVdx=pDstMVCurMB[BlkCount]+diffMVdx
+       ADD             Range,High,High                       ;// Range=64*ScaleFactor
+       SUB             High,High,#1                          ;// High= 32*scaleFactor-1
+
+       CMP             diffMVdx,Low                          ;// If diffMVdx<Low          
+       ADDLT           diffMVdx,diffMVdx,Range               ;// diffMVdx+=Range
+        
+       CMP             diffMVdx,High                         
+       SUBGT           diffMVdx,diffMVdx,Range               ;// If diffMVdx > High diffMVdx-=Range
+       STRH            diffMVdx,[pDstMVCurMB,temp1]
+
+       ;// Store Vertical
+
+       ADD             temp1,temp1,#2                        ;// temp1=4*BlkCount+2
+       M_LDR           diffMVdx,pdiffMVdy                    ;// Laad diffMVdy
+       LDRSH           temp,[pDstMVCurMB,temp1]              ;// temp=pDstMVCurMB[BlkCount].diffMVdy
+       ADD             BlkCount,BlkCount,#1                  ;// BlkCount=BlkCount+1
+       ADD             diffMVdx,temp,diffMVdx                
+       CMP             diffMVdx,Low
+       ADDLT           diffMVdx,diffMVdx,Range               ;// If diffMVdy<Low  diffMVdy+=Range                
+       CMP             diffMVdx,High
+       SUBGT           diffMVdx,diffMVdx,Range               ;// If diffMVdy > High diffMVdy-=Range
+       STRH            diffMVdx,[pDstMVCurMB,temp1]    
+       
+       CMP             BlkCount,Count
+       M_LDR           pSrcMVLeftMB,ppSrcMVLeftMB
+       M_LDR           pSrcMVUpperMB,ppSrcMVUpperMB
+
+       BLT             Loop                                  ;// If BlkCount<Count Continue the Loop
+
+
+       ;// If MBType=OMX_VC_INTER or MBtype=OMX_VC_INTER_Q copy pDstMVCurMB[0] to
+       ;// pDstMVCurMB[1], pDstMVCurMB[2], pDstMVCurMB[3] 
+
+       M_LDR           MBType,MBTypeonStack
+
+       TEQ             MBType,#OMX_VC_INTER                                       
+       TEQNE           MBType,#OMX_VC_INTER_Q                            
+       LDREQ           temp,[pDstMVCurMB]
+       M_LDR           ppBitStream,pppBitStream
+       STREQ           temp,[pDstMVCurMB,#4]
+       
+       STREQ           temp,[pDstMVCurMB,#8]
+       STREQ           temp,[pDstMVCurMB,#12]
+       
+       
+       M_LDR           pBitOffset,ppBitOffset
+       ;//Ending the macro
+       M_BD_FINI       ppBitStream,pBitOffset                 ;// Finishing the Macro       
+
+       
+       MOV             Return,#OMX_Sts_NoErr
+       B               ExitOK
+ 
+ExitError
+
+       M_LDR           ppBitStream,pppBitStream
+       M_LDR           pBitOffset,ppBitOffset
+       ;//Ending the macro
+       M_BD_FINI       ppBitStream,pBitOffset
+       
+       MOV             Return,#OMX_Sts_Err
+
+ExitOK             
+
+       M_END
+       ENDIF
+       END
+
+
+   
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
new file mode 100755
index 0000000..7208c21
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
@@ -0,0 +1,132 @@
+;/**
+; * 
+; * File Name:  omxVCM4P2_DecodeVLCZigzag_Inter_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; *
+; * Description: 
+; * Contains modules for zigzag scanning and VLC decoding
+; * for inter block.
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_Inter
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan for one inter coded block.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in]    ppBitStream        pointer to the pointer to the current byte in
+; *                    the bitstream buffer
+; * [in]    pBitOffset        pointer to the bit position in the byte pointed
+; *                    to by *ppBitStream. *pBitOffset is valid within    [0-7].
+; * [in] shortVideoHeader     binary flag indicating presence of short_video_header;
+; *                           escape modes 0-3 are used if shortVideoHeader==0,
+; *                           and escape mode 4 is used when shortVideoHeader==1.
+; * [out]    ppBitStream        *ppBitStream is updated after the block is
+; *                    decoded, so that it points to the current byte
+; *                    in the bit stream buffer
+; * [out]    pBitOffset        *pBitOffset is updated so that it points to the
+; *                    current bit position in the byte pointed by
+; *                    *ppBitStream
+; * [out]    pDst            pointer to the coefficient buffer of current
+; *                    block. Must be 16-byte aligned
+; *
+; * Return Value:
+; * OMX_Sts_BadArgErr - bad arguments
+; *   -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or
+; *   -pDst is not 16-byte aligned, or
+; *   -*pBitOffset exceeds [0,7].
+; * OMX_Sts_Err - status error
+; *   -At least one mark bit is equal to zero
+; *   -Encountered an illegal stream code that cannot be found in the VLC table
+; *   -Encountered and illegal code in the VLC FLC table
+; *   -The number of coefficients is greater than 64
+; *
+; */
+
+
+      INCLUDE omxtypes_s.h
+      INCLUDE armCOMM_s.h
+      INCLUDE armCOMM_BitDec_s.h
+
+
+      M_VARIANTS ARM1136JS
+
+     
+
+
+
+     IF ARM1136JS
+     
+        ;// Import various tables needed for the function
+
+        
+        IMPORT          armVCM4P2_InterVlcL0L1             ;// Contains optimized and packed VLC Tables for both Last =1 and last=0
+                                                               ;// Packed in Run:Level:Last format
+        IMPORT          armVCM4P2_InterL0L1LMAX            ;// Contains LMAX table entries with both Last=0 and Last=1
+        IMPORT          armVCM4P2_InterL0L1RMAX            ;// Contains RMAX table entries with both Last=0 and Last=1
+        IMPORT          armVCM4P2_aClassicalZigzagScan     ;// contains classical Zigzag table entries with double the original values
+        IMPORT          armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+
+
+;//Input Arguments
+
+ppBitStream          RN 0
+pBitOffset           RN 1
+pDst                 RN 2
+shortVideoHeader     RN 3
+
+;//Local Variables
+
+Return               RN 0
+
+pVlcTableL0L1        RN 4
+pLMAXTableL0L1       RN 4
+pRMAXTableL0L1       RN 4
+pZigzagTable         RN 4
+Count                RN 6
+
+
+        
+        ;// Allocate stack memory to store the VLC,Zigzag,LMAX and RMAX tables
+     
+        
+        M_ALLOC4        ppVlcTableL0L1,4
+        M_ALLOC4        ppLMAXTableL0L1,4
+        M_ALLOC4        ppRMAXTableL0L1,4
+        M_ALLOC4        ppZigzagTable,4
+        
+        
+        M_START omxVCM4P2_DecodeVLCZigzag_Inter,r12
+
+        
+
+        
+        LDR             pZigzagTable, =armVCM4P2_aClassicalZigzagScan       ;// Load zigzag table
+        M_STR           pZigzagTable,ppZigzagTable                              ;// Store zigzag table on stack to pass as argument to unsafe function
+        LDR             pVlcTableL0L1, =armVCM4P2_InterVlcL0L1              ;// Load optimized VLC table with both L=0 and L=1 entries
+        M_STR           pVlcTableL0L1,ppVlcTableL0L1                            ;// Store optimized VLC table address on stack
+        LDR             pLMAXTableL0L1, =armVCM4P2_InterL0L1LMAX            ;// Load Interleaved L=0 and L=1 LMAX Tables
+        M_STR           pLMAXTableL0L1,ppLMAXTableL0L1                          ;// Store LMAX table address on stack
+        LDR             pRMAXTableL0L1, =armVCM4P2_InterL0L1RMAX            ;// Load Interleaved L=0 and L=1 RMAX Tables
+        MOV             Count,#0                                                ;// set start=0
+        M_STR           pRMAXTableL0L1,ppRMAXTableL0L1                          ;// store RMAX table address on stack
+                
+
+        BL              armVCM4P2_DecodeVLCZigzag_AC_unsafe                 ;// call Unsafe Function for VLC Zigzag Decoding
+         
+       
+
+        M_END
+        ENDIF
+        
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
new file mode 100755
index 0000000..9a37ec9
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
@@ -0,0 +1,136 @@
+;/**
+; * 
+; * File Name:  omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; *
+; * Description: 
+; * Contains modules for zigzag scanning and VLC decoding
+; * for inter block.
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_Inter
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan for one intra coded block.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in]    ppBitStream        pointer to the pointer to the current byte in
+; *                    the bitstream buffer
+; * [in]    pBitOffset        pointer to the bit position in the byte pointed
+; *                    to by *ppBitStream. *pBitOffset is valid within    [0-7].
+; * [in] shortVideoHeader     binary flag indicating presence of short_video_header;
+; *                           escape modes 0-3 are used if shortVideoHeader==0,
+; *                           and escape mode 4 is used when shortVideoHeader==1.
+; * [out]    ppBitStream        *ppBitStream is updated after the block is
+; *                    decoded, so that it points to the current byte
+; *                    in the bit stream buffer
+; * [out]    pBitOffset        *pBitOffset is updated so that it points to the
+; *                    current bit position in the byte pointed by
+; *                    *ppBitStream
+; * [out]    pDst            pointer to the coefficient buffer of current
+; *                    block. Must be 16-byte aligned
+; *
+; * Return Value:
+; * OMX_Sts_BadArgErr - bad arguments
+; *   -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or
+; *   -pDst is not 16-byte aligned, or
+; *   -*pBitOffset exceeds [0,7].
+; * OMX_Sts_Err - status error
+; *   -At least one mark bit is equal to zero
+; *   -Encountered an illegal stream code that cannot be found in the VLC table
+; *   -Encountered and illegal code in the VLC FLC table
+; *   -The number of coefficients is greater than 64
+; *
+; */
+
+
+      INCLUDE omxtypes_s.h
+      INCLUDE armCOMM_s.h
+      INCLUDE armCOMM_BitDec_s.h
+
+
+      M_VARIANTS ARM1136JS
+
+     
+
+
+
+     IF ARM1136JS
+     
+        ;// Import various tables needed for the function
+
+        
+        IMPORT          armVCM4P2_IntraVlcL0L1             ;// Contains optimized and packed VLC Tables for both Last =1 and last=0
+                                                               ;// Packed in Run:Level:Last format
+        IMPORT          armVCM4P2_IntraL0L1LMAX            ;// Contains LMAX table entries with both Last=0 and Last=1
+        IMPORT          armVCM4P2_IntraL0L1RMAX            ;// Contains RMAX table entries with both Last=0 and Last=1
+        IMPORT          armVCM4P2_aClassicalZigzagScan     ;// contains classical Zigzag table entries with double the original values
+        IMPORT          armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+;//Input Arguments
+
+ppBitStream          RN 0
+pBitOffset           RN 1
+pDst                 RN 2
+PredDir              RN 3
+shortVideoHeader     RN 3
+
+;//Local Variables
+
+Return               RN 0
+
+pVlcTableL0L1        RN 4
+pLMAXTableL0L1       RN 4
+pRMAXTableL0L1       RN 4
+pZigzagTable         RN 4
+Count                RN 6
+
+
+        
+        ;// Allocate stack memory to store optimized VLC,Zigzag, RMAX, LMAX Table Addresses 
+     
+        M_ALLOC4        ppVlcTableL0L1,4
+        M_ALLOC4        ppLMAXTableL0L1,4
+        M_ALLOC4        ppRMAXTableL0L1,4
+        M_ALLOC4        ppZigzagTable,4
+
+        
+        M_START omxVCM4P2_DecodeVLCZigzag_IntraACVLC,r12
+
+        M_ARG           shortVideoHeaderonStack,4                             ;// pointer to Input Argument on stack           
+
+        LDR             pZigzagTable, =armVCM4P2_aClassicalZigzagScan     ;// Load Address of the Zigzag table    
+        ADD             pZigzagTable, pZigzagTable, PredDir, LSL #6           ;// Loading Different type of zigzag tables based on PredDir
+       
+        M_STR           pZigzagTable,ppZigzagTable                            ;// Store Zigzag table address on stack
+        LDR             pVlcTableL0L1, =armVCM4P2_IntraVlcL0L1            ;// Load optimized packed VLC Table with both L=0 and L=1 entries
+        M_STR           pVlcTableL0L1,ppVlcTableL0L1                          ;// Store VLC Table address on stack
+        LDR             pLMAXTableL0L1, =armVCM4P2_IntraL0L1LMAX          ;// Load LMAX Table
+        M_STR           pLMAXTableL0L1,ppLMAXTableL0L1                        ;// Store LMAX Table address on Stack
+        LDR             pRMAXTableL0L1, =armVCM4P2_IntraL0L1RMAX          ;// Load RMAX Table
+        MOV             Count,#0                                              ;// Set Start=0        
+        
+        M_STR           pRMAXTableL0L1,ppRMAXTableL0L1                        ;// Store RMAX Table address on stack
+              
+
+       
+        M_LDR           shortVideoHeader,shortVideoHeaderonStack              ;// get the Input Argument from stack
+
+        BL              armVCM4P2_DecodeVLCZigzag_AC_unsafe               ;// Call Unsafe Function
+
+
+
+        
+        M_END
+        ENDIF
+        
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
new file mode 100755
index 0000000..778aaf2
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
@@ -0,0 +1,224 @@
+;/**
+; * 
+; * File Name:  omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; *
+; * Description: 
+; * Contains modules for zigzag scanning and VLC decoding
+; * for inter block.
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_Inter
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan for one intra coded block.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in]    ppBitStream        pointer to the pointer to the current byte in
+; *                    the bitstream buffer
+; * [in]    pBitOffset        pointer to the bit position in the byte pointed
+; *                    to by *ppBitStream. *pBitOffset is valid within    [0-7].
+; * [in] shortVideoHeader     binary flag indicating presence of short_video_header;
+; *                           escape modes 0-3 are used if shortVideoHeader==0,
+; *                           and escape mode 4 is used when shortVideoHeader==1.
+; * [out]    ppBitStream        *ppBitStream is updated after the block is
+; *                    decoded, so that it points to the current byte
+; *                    in the bit stream buffer
+; * [out]    pBitOffset        *pBitOffset is updated so that it points to the
+; *                    current bit position in the byte pointed by
+; *                    *ppBitStream
+; * [out]    pDst            pointer to the coefficient buffer of current
+; *                    block. Must be 16-byte aligned
+; *
+; * Return Value:
+; * OMX_Sts_BadArgErr - bad arguments
+; *   -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or
+; *   -pDst is not 16-byte aligned, or
+; *   -*pBitOffset exceeds [0,7].
+; * OMX_Sts_Err - status error
+; *   -At least one mark bit is equal to zero
+; *   -Encountered an illegal stream code that cannot be found in the VLC table
+; *   -Encountered and illegal code in the VLC FLC table
+; *   -The number of coefficients is greater than 64
+; *
+; */
+
+
+      INCLUDE omxtypes_s.h
+      INCLUDE armCOMM_s.h
+      INCLUDE armCOMM_BitDec_s.h
+
+
+      M_VARIANTS CortexA8
+
+     
+      
+
+
+      IF CortexA8
+
+     
+        ;// Import various tables needed for the function
+
+        
+        IMPORT          armVCM4P2_IntraVlcL0L1             ;// Contains optimized and packed VLC Tables for both Last =1 and last=0
+                                                               ;// Packed in Run:Level:Last format
+        IMPORT          armVCM4P2_IntraL0L1LMAX            ;// Contains LMAX table entries with both Last=0 and Last=1
+        IMPORT          armVCM4P2_IntraL0L1RMAX            ;// Contains RMAX table entries with both Last=0 and Last=1
+        IMPORT          armVCM4P2_aClassicalZigzagScan     ;// contains CLassical, Horizontal, Vertical Zigzag table entries with double the original values
+        IMPORT          armVCM4P2_aIntraDCLumaChromaIndex  ;// Contains Optimized DCLuma and DCChroma Index table Entries
+        
+
+        IMPORT          armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+;//Input Arguments
+
+ppBitStream          RN 0
+pBitOffset           RN 1
+pDst                 RN 2
+PredDir              RN 3
+shortVideoHeader     RN 3
+videoComp            RN 5
+;//Local Variables
+
+Return               RN 0
+
+pDCLumaChromaIndex   RN 4
+pDCChromaIndex       RN 7
+pVlcTableL0L1        RN 4
+pLMAXTableL0L1       RN 4
+pRMAXTableL0L1       RN 4
+pZigzagTable         RN 4
+Count                RN 6
+DCValueSize          RN 6
+powOfSize            RN 7
+temp1                RN 5
+
+
+;// Scratch Registers
+
+RBitStream           RN 8
+RBitBuffer           RN 9
+RBitCount            RN 10
+
+T1                   RN 11
+T2                   RN 12
+DCVal                RN 14
+
+        
+        ;// Allocate stack memory to store optimized VLC,Zigzag, RMAX, LMAX Table Addresses 
+     
+        M_ALLOC4        ppVlcTableL0L1,4
+        M_ALLOC4        ppLMAXTableL0L1,4
+        M_ALLOC4        ppRMAXTableL0L1,4
+        M_ALLOC4        ppZigzagTable,4
+        M_ALLOC4        pDCCoeff,4
+        
+
+        
+        M_START omxVCM4P2_DecodeVLCZigzag_IntraDCVLC,r12
+
+        M_ARG           shortVideoHeaderonStack,4                                  ;// Pointer to argument on stack  
+        M_ARG           videoComponstack,4                                         ;// Pointer to argument on stack
+
+        
+        ;// Decode DC Coefficient
+
+        
+        LDR             pDCLumaChromaIndex, =armVCM4P2_aIntraDCLumaChromaIndex ;// Load Optimized VLC Table for Luminance and Chrominance
+
+        ;// Initializing the Bitstream Macro
+
+        M_BD_INIT0      ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount
+        M_LDR           videoComp,videoComponstack                                 
+        M_BD_INIT1      T1, T2, T2
+        ADD             pDCLumaChromaIndex,pDCLumaChromaIndex,videoComp, LSL #6             
+        M_BD_INIT2      T1, T2, T2
+    
+        
+        M_BD_VLD        DCValueSize,T1,T2,pDCLumaChromaIndex,4,2                    ;// VLC Decode using optimized Luminance and Chrominance VLC Table
+
+    
+       
+
+DecodeDC
+                         
+        CMP             DCValueSize,#12     
+        BGT             ExitError
+        
+        CMP             DCValueSize,#0
+        MOVEQ           DCVal,#0                                                    ;// If DCValueSize is zero then DC coeff =0
+        BEQ             ACDecode                                                    ;// Branch to perform AC Coeff Decoding
+        
+        M_BD_VREAD16    DCVal,DCValueSize,T1,T2                                     ;// Get DC Value From Bit stream
+         
+
+        MOV             powOfSize,#1                                                
+        LSL             powOfSize,DCValueSize                                       ;// powOfSize=pow(2,DCValueSize)
+        CMP             DCVal,powOfSize,LSR #1                                      ;// Compare DCVal with powOfSize/2 
+        ADDLT           DCVal,DCVal,#1
+        SUBLT           DCVal,DCVal,powOfSize                                       ;// If Lessthan powOfSize/2 DCVal=DCVal-powOfSize+1
+                                                                                    ;// Else DCVal= fetchbits from bit stream
+
+CheckDCValueSize
+        
+        CMP             DCValueSize,#8                                              ;// If DCValueSize greater than 8 check marker bit
+
+        BLE             ACDecode
+
+        M_BD_READ8      temp1,1,T1
+        TEQ             temp1,#0                                                    ;// If Marker bit is zero Exit with an Error Message
+        BEQ             ExitError
+
+        
+
+        ;// Decode AC Coefficient
+
+ACDecode
+
+        M_STR           DCVal,pDCCoeff                                             ;// Store Decoded DC Coeff on Stack
+        M_BD_FINI       ppBitStream,pBitOffset                                     ;// Terminating the Bit stream Macro
+         
+        LDR             pZigzagTable, =armVCM4P2_aClassicalZigzagScan          ;// Load Zigzag talbe address   
+        ADD             pZigzagTable, pZigzagTable, PredDir, LSL #6                ;// Modify the Zigzag table adress based on PredDir                
+       
+        M_STR           pZigzagTable,ppZigzagTable                                 ;// Store zigzag table on stack
+        LDR             pVlcTableL0L1, =armVCM4P2_IntraVlcL0L1                 ;// Load Optimized VLC Table With both Last=0 and Last=1 Entries
+        M_STR           pVlcTableL0L1,ppVlcTableL0L1                               ;// Store Optimized VLC Table on stack
+        LDR             pLMAXTableL0L1, =armVCM4P2_IntraL0L1LMAX               ;// Load LMAX Table
+        M_STR           pLMAXTableL0L1,ppLMAXTableL0L1                             ;// Store LMAX table on stack
+        LDR             pRMAXTableL0L1, =armVCM4P2_IntraL0L1RMAX               ;// Load RMAX Table
+        MOV             Count,#1                                                   ;// Set Start =1        
+        
+        M_STR           pRMAXTableL0L1,ppRMAXTableL0L1                             ;// Store RMAX Table on Stack
+        
+       
+        M_LDR           shortVideoHeader,shortVideoHeaderonStack                   ;// Load the Input Argument From Stack
+        
+        BL              armVCM4P2_DecodeVLCZigzag_AC_unsafe                    ;// Call the Unsafe Function
+
+        M_LDR           DCVal,pDCCoeff                                             ;// Get the Decoded DC Value From Stack
+        STRH            DCVal,[pDst]                                               ;// Store the DC Value 
+        B               ExitOK
+        
+              
+
+ExitError
+ 
+        M_BD_FINI       ppBitStream,pBitOffset                                     ;// Terminating the Bit Stream Macro in case of an Error
+        MOV             Return,#OMX_Sts_Err                                        ;// Exit with an Error Message 
+ExitOK
+      
+        M_END
+        ENDIF
+        
+        END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
new file mode 100755
index 0000000..caf7121
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
@@ -0,0 +1,194 @@
+;//
+;// 
+;// File Name:  omxVCM4P2_FindMVpred_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+;// Function:
+;//     omxVCM4P2_FindMVpred
+;//
+        ;// Include headers
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        INCLUDE armVCCOMM_s.h
+
+        ;// Define cpu variants
+        M_VARIANTS CortexA8
+        
+        
+        IF CortexA8
+        
+        M_TABLE armVCM4P2_pBlkIndexTable
+        DCD  OMXVCBlk0, OMXVCBlk1
+        DCD  OMXVCBlk2, OMXVCBlk3
+
+;//--------------------------------------------
+;// Declare input registers
+;//--------------------------------------------
+        
+pSrcMVCurMB            RN 0
+pSrcCandMV1            RN 1
+pSrcCandMV2            RN 2
+pSrcCandMV3            RN 3
+pDstMVPred             RN 4
+pDstMVPredME           RN 5
+iBlk                   RN 6
+
+pTable                 RN 4
+CandMV                 RN 12
+
+pCandMV1               RN 7
+pCandMV2               RN 8
+pCandMV3               RN 9
+
+CandMV1dx              RN 0 
+CandMV1dy              RN 1 
+CandMV2dx              RN 2
+CandMV2dy              RN 3
+CandMV3dx              RN 10
+CandMV3dy              RN 11
+
+temp                   RN 14
+
+zero                   RN 14
+return                 RN 0
+        
+; ----------------------------------------------
+; Main routine
+; ----------------------------------------------        
+
+        M_ALLOC4 MV, 4
+        
+        ;// Function header 
+        M_START omxVCM4P2_FindMVpred, r11
+        
+        ;// Define stack arguments
+        M_ARG   ppDstMVPred,  4
+        M_ARG   ppDstMVPredME, 4
+        M_ARG   Blk, 4
+        
+        M_ADR CandMV, MV
+        MOV   zero, #0
+        M_LDR iBlk, Blk
+        
+        ;// Set the default value for these
+        ;// to be used if pSrcCandMV[1|2|3] == NULL
+        MOV   pCandMV1, CandMV
+        MOV   pCandMV2, CandMV
+        MOV   pCandMV3, CandMV
+    
+        STR   zero, [CandMV]
+
+        ;// Branch to the case based on blk number
+        M_SWITCH iBlk
+        M_CASE   OMXVCBlk0      ;// iBlk=0
+        M_CASE   OMXVCBlk1      ;// iBlk=0
+        M_CASE   OMXVCBlk2      ;// iBlk=0
+        M_CASE   OMXVCBlk3      ;// iBlk=0
+        M_ENDSWITCH
+        
+OMXVCBlk0
+        CMP   pSrcCandMV1, #0
+        ADDNE pCandMV1, pSrcCandMV1, #4
+        
+        CMP   pSrcCandMV2, #0
+        ADDNE pCandMV2, pSrcCandMV2, #8
+
+        CMP   pSrcCandMV3, #0
+        ADDNE pCandMV3, pSrcCandMV3, #8
+        CMPEQ pSrcCandMV1, #0
+    
+        MOVEQ pCandMV3, pCandMV2
+        MOVEQ pCandMV1, pCandMV2
+                
+        CMP   pSrcCandMV1, #0
+        CMPEQ pSrcCandMV2, #0
+    
+        MOVEQ pCandMV1, pCandMV3
+        MOVEQ pCandMV2, pCandMV3
+        
+        CMP   pSrcCandMV2, #0
+        CMPEQ pSrcCandMV3, #0
+    
+        MOVEQ pCandMV2, pCandMV1
+        MOVEQ pCandMV3, pCandMV1
+        
+        B     BlkEnd
+    
+OMXVCBlk1
+        MOV   pCandMV1, pSrcMVCurMB
+        CMP   pSrcCandMV3, #0
+        ADDNE pCandMV3, pSrcCandMV3, #8
+        
+        CMP   pSrcCandMV2, #0
+        ADDNE pCandMV2, pSrcCandMV2, #12
+    
+        CMPEQ pSrcCandMV3, #0
+    
+        MOVEQ pCandMV2, pCandMV1
+        MOVEQ pCandMV3, pCandMV1
+            
+        B     BlkEnd
+
+OMXVCBlk2
+        CMP   pSrcCandMV1, #0
+        MOV   pCandMV2, pSrcMVCurMB
+        ADD   pCandMV3, pSrcMVCurMB, #4
+        ADDNE pCandMV1, pSrcCandMV1, #12
+        B     BlkEnd
+
+OMXVCBlk3
+        ADD   pCandMV1, pSrcMVCurMB, #8
+        MOV   pCandMV2, pSrcMVCurMB
+        ADD   pCandMV3, pSrcMVCurMB, #4
+    
+BlkEnd
+
+        ;// Using the transperancy info, zero
+        ;// out the candidate MV if neccesary
+        LDRSH CandMV1dx, [pCandMV1], #2
+        LDRSH CandMV2dx, [pCandMV2], #2
+        LDRSH CandMV3dx, [pCandMV3], #2
+    
+        ;// Load argument from the stack
+        M_LDR pDstMVPredME, ppDstMVPredME
+
+        LDRSH CandMV1dy, [pCandMV1]
+        LDRSH CandMV2dy, [pCandMV2]
+        LDRSH CandMV3dy, [pCandMV3]
+
+        CMP pDstMVPredME, #0        
+
+        ;// Store the candidate MV's into the pDstMVPredME, 
+        ;// these can be used in the fast algorithm if implemented 
+
+        STRHNE CandMV1dx, [pDstMVPredME], #2
+        STRHNE CandMV1dy, [pDstMVPredME], #2        
+        STRHNE CandMV2dx, [pDstMVPredME], #2
+        STRHNE CandMV2dy, [pDstMVPredME], #2
+        STRHNE CandMV3dx, [pDstMVPredME], #2
+        STRHNE CandMV3dy, [pDstMVPredME]
+           
+        ; Find the median of the 3 candidate MV's
+        M_MEDIAN3 CandMV1dx, CandMV2dx, CandMV3dx, temp
+
+        ;// Load argument from the stack
+        M_LDR pDstMVPred, ppDstMVPred
+
+        M_MEDIAN3 CandMV1dy, CandMV2dy, CandMV3dy, temp
+    
+        STRH CandMV3dx, [pDstMVPred], #2
+        STRH CandMV3dy, [pDstMVPred]
+
+        MOV return, #OMX_Sts_NoErr
+    
+        M_END
+    ENDIF ;// ARM1136JS :LOR: CortexA8
+    
+    END
+\ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
new file mode 100755
index 0000000..b5e3d0d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
@@ -0,0 +1,73 @@
+;//
+;// 
+;// File Name:  omxVCM4P2_IDCT8x8blk_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+
+;// Function:
+;//     omxVCM4P2_IDCT8x8blk
+;//
+        ;// Include headers
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        ;// Define cpu variants
+        M_VARIANTS CortexA8
+
+        INCLUDE armCOMM_IDCT_s.h        
+        
+        IMPORT armCOMM_IDCTPreScale
+        ;//
+        ;// Function prototype
+        ;//
+        ;//     OMXResult
+        ;//     omxVCM4P2_IDCT8x8blk(const OMX_S16* pSrc,
+        ;//                                       OMX_S16* pDst)
+        ;//    
+        
+    IF CortexA8
+        M_ALLOC4  ppDest, 4
+        M_ALLOC4  pStride, 4
+        M_ALLOC8  pBlk, 2*8*8
+    ENDIF
+    
+    
+    IF CortexA8
+        M_START omxVCM4P2_IDCT8x8blk, r11, d15
+    ENDIF
+        
+    IF CortexA8
+        
+;// Declare input registers
+pSrc            RN 0
+pDst            RN 1
+
+;// Declare other intermediate registers
+Result          RN 0
+
+;// Prototype for macro M_IDCT
+;// pSrc            RN 0  ;// source data buffer
+;// Stride          RN 1  ;// destination stride in bytes
+;// pDest           RN 2  ;// destination data buffer
+;// pScale          RN 3  ;// pointer to scaling table
+
+pSrc    RN 0    
+Stride  RN 1    
+pDest   RN 2    
+pScale  RN 3    
+                
+        MOV         pDest, pDst
+        LDR         pScale, =armCOMM_IDCTPreScale        
+        M_IDCT      s9, s16, 16      
+        MOV         Result, #OMX_Sts_NoErr
+        M_END       
+    ENDIF  
+        ;// ARM1136JS :LOR: CortexA8
+
+    END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
new file mode 100755
index 0000000..dd00df5
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
@@ -0,0 +1,444 @@
+;//
+;// 
+;// File Name:  omxVCM4P2_MCReconBlock_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;//
+;//
+
+;// Include standard headers
+    INCLUDE omxtypes_s.h
+    INCLUDE armCOMM_s.h
+
+;// Import symbols required from other files
+
+    M_VARIANTS CortexA8
+
+;// ***************************************************************************
+;// ARM1136JS implementation
+;// ***************************************************************************
+
+;// ***************************************************************************
+;// CortexA8 implementation
+;// ***************************************************************************
+    IF  CortexA8
+;// ***************************************************************************
+;// MACRO DEFINITIONS
+;// ***************************************************************************
+    ;// Description:
+    ;// Does interpolation for the case of "IntegerPixel" predictType. Both 
+    ;// rounding cases are handled. Just copies a block from pSrc to pDst
+    ;//
+    ;// Syntax:
+    ;// M_MCRECONBLOCK_IntegerPixel
+    ;// 
+    ;// Inputs: None
+    ;// Outputs: None
+
+    MACRO 
+    M_MCRECONBLOCK_IntegerPixel
+CaseIntegerPixel_Rnd0
+CaseIntegerPixel_Rnd1
+
+    VLD1        dRow0, [pSrc], srcStep
+    VLD1        dRow1, [pSrc], srcStep
+    VLD1        dRow2, [pSrc], srcStep
+    VLD1        dRow3, [pSrc], srcStep
+    VLD1        dRow4, [pSrc], srcStep
+    VLD1        dRow5, [pSrc], srcStep
+    VLD1        dRow6, [pSrc], srcStep
+    VLD1        dRow7, [pSrc], srcStep
+
+    VST1        dRow0, [pDst@64], dstStep
+    VST1        dRow1, [pDst@64], dstStep
+    VST1        dRow2, [pDst@64], dstStep
+    VST1        dRow3, [pDst@64], dstStep
+    VST1        dRow4, [pDst@64], dstStep
+    VST1        dRow5, [pDst@64], dstStep
+    VST1        dRow6, [pDst@64], dstStep
+    VST1        dRow7, [pDst@64], dstStep
+
+    B           SwitchPredictTypeEnd
+    MEND
+;// ***************************************************************************
+    ;// Description:
+    ;// Does interpolation for the case of "HalfPixelX" predictType. The two 
+    ;// rounding cases are handled by the parameter "$rndVal". Averages between
+    ;// a pixel and pixel right to it, rounding it based on $rndVal. The 
+    ;// rounding is implemented by using opCode switching between "VRHADD" and 
+    ;// "VHADD" instructions.
+    ;//
+    ;// Syntax:
+    ;// M_MCRECONBLOCK_HalfPixelX $rndVal
+    ;// 
+    ;// Inputs: 
+    ;//     $rndVal: 0 for rounding and 1 for no rounding
+    ;// Outputs: None
+
+    MACRO 
+    M_MCRECONBLOCK_HalfPixelX $rndVal
+
+    LCLS M_VHADDR
+    IF $rndVal = 0
+M_VHADDR SETS "VRHADD"
+    ELSE
+M_VHADDR SETS "VHADD"
+    ENDIF
+
+CaseHalfPixelX_Rnd$rndVal
+
+    VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
+    VEXT        dRow0Shft, dRow0, dRow0Shft, #1
+    VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
+    VEXT        dRow1Shft, dRow1, dRow1Shft, #1
+    VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
+    VEXT        dRow2Shft, dRow2, dRow2Shft, #1
+    VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
+    VEXT        dRow3Shft, dRow3, dRow3Shft, #1
+    VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
+    VEXT        dRow4Shft, dRow4, dRow4Shft, #1
+    VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
+    VEXT        dRow5Shft, dRow5, dRow5Shft, #1
+    VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
+    VEXT        dRow6Shft, dRow6, dRow6Shft, #1
+    VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
+    VEXT        dRow7Shft, dRow7, dRow7Shft, #1
+    $M_VHADDR   dRow0, dRow0, dRow0Shft
+    $M_VHADDR   dRow1, dRow1, dRow1Shft
+    VST1        dRow0, [pDst@64], dstStep
+    $M_VHADDR   dRow2, dRow2, dRow2Shft
+    VST1        dRow1, [pDst@64], dstStep
+    $M_VHADDR   dRow3, dRow3, dRow3Shft
+    VST1        dRow2, [pDst@64], dstStep
+    $M_VHADDR   dRow4, dRow4, dRow4Shft
+    VST1        dRow3, [pDst@64], dstStep
+    $M_VHADDR   dRow5, dRow5, dRow5Shft
+    VST1        dRow4, [pDst@64], dstStep
+    $M_VHADDR   dRow6, dRow6, dRow6Shft
+    VST1        dRow5, [pDst@64], dstStep
+    $M_VHADDR   dRow7, dRow7, dRow7Shft
+    VST1        dRow6, [pDst@64], dstStep
+    VST1        dRow7, [pDst@64], dstStep
+    
+    B           SwitchPredictTypeEnd
+    MEND
+;// ***************************************************************************
+    ;// Description:
+    ;// Does interpolation for the case of "HalfPixelY" predictType. The two 
+    ;// rounding cases are handled by the parameter "$rndVal". Averages between
+    ;// a pixel and pixel below it, rounding it based on $rndVal. The 
+    ;// rounding is implemented by using opCode switching between "VRHADD" and 
+    ;// "VHADD" instructions.
+    ;//
+    ;// Syntax:
+    ;// M_MCRECONBLOCK_HalfPixelY $rndVal
+    ;// 
+    ;// Inputs: 
+    ;//     $rndVal: 0 for rounding and 1 for no rounding
+    ;// Outputs: None
+
+    MACRO 
+    M_MCRECONBLOCK_HalfPixelY $rndVal
+
+    LCLS M_VHADDR
+    IF $rndVal = 0
+M_VHADDR SETS "VRHADD"
+    ELSE
+M_VHADDR SETS "VHADD"
+    ENDIF
+
+CaseHalfPixelY_Rnd$rndVal
+    VLD1        dRow0, [pSrc], srcStep
+    VLD1        dRow1, [pSrc], srcStep
+    VLD1        dRow2, [pSrc], srcStep
+    VLD1        dRow3, [pSrc], srcStep
+    VLD1        dRow4, [pSrc], srcStep
+    VLD1        dRow5, [pSrc], srcStep
+    VLD1        dRow6, [pSrc], srcStep
+    VLD1        dRow7, [pSrc], srcStep
+    $M_VHADDR   dRow0, dRow0, dRow1
+    VLD1        dRow8, [pSrc], srcStep
+    $M_VHADDR   dRow1, dRow1, dRow2
+    VST1        dRow0, [pDst@64], dstStep
+    $M_VHADDR   dRow2, dRow2, dRow3
+    VST1        dRow1, [pDst@64], dstStep
+    $M_VHADDR   dRow3, dRow3, dRow4
+    VST1        dRow2, [pDst@64], dstStep
+    $M_VHADDR   dRow4, dRow4, dRow5
+    VST1        dRow3, [pDst@64], dstStep
+    $M_VHADDR   dRow5, dRow5, dRow6
+    VST1        dRow4, [pDst@64], dstStep
+    $M_VHADDR   dRow6, dRow6, dRow7
+    VST1        dRow5, [pDst@64], dstStep
+    $M_VHADDR   dRow7, dRow7, dRow8
+    VST1        dRow6, [pDst@64], dstStep
+    VST1        dRow7, [pDst@64], dstStep
+
+    B           SwitchPredictTypeEnd
+    MEND
+;// ***************************************************************************
+    ;// Description:
+    ;// Does interpolation for the case of "IntegerPixel" predictType. Both 
+    ;// rounding cases are handled. 
+    ;// Typical computation for a row goes like this
+    ;//     1. VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
+    ;//     2. VEXT        dRow0Shft, dRow0, dRow0Shft, #1     ;// Generate the shifted row
+    ;//     3. VADDL       qSum0, dRow0, dRow0Shft             ;// Generate the sum of row and shifted row
+    ;//     5. VADD        qSum0, qSum0, qSum1                 ;// Add to the sum of next row (odd row sum has rounding value added to it)
+    ;//     6. VSHRN       dRow0, qSum0, #2                    ;// Divide by 4
+    ;//     7. VST1        dRow0, [pDst@64], dstStep           ;// Store
+    ;// Odd rows undergo following computation after step 3
+    ;//     4. VADD        qSum1, qSum1, qRound
+    ;// This saves for adding rounding value to each final sum (overall saves 4 
+    ;// instructions).
+    ;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes 
+    ;// care of this and also minimizes stalls. Rounding value was modified in 
+    ;// ARM register rndVal (originally used for rounding flag) before the switch.
+    ;// It is then populated into all lanes in this macro. No branching out to 
+    ;// label "SwitchPredictTypeEnd" is required in the end of the macro as these 
+    ;// are the last of switch cases.
+    ;// 
+    ;// Syntax:
+    ;// M_MCRECONBLOCK_HalfPixelXY
+    ;// 
+    ;// Inputs: None
+    ;// Outputs: None
+
+    MACRO 
+    M_MCRECONBLOCK_HalfPixelXY
+
+CaseHalfPixelXY_Rnd0
+CaseHalfPixelXY_Rnd1
+    VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
+    VDUP        qRound, rndVal
+    VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
+    VEXT        dRow0Shft, dRow0, dRow0Shft, #1
+    VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
+    VEXT        dRow1Shft, dRow1, dRow1Shft, #1
+    VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
+    VEXT        dRow2Shft, dRow2, dRow2Shft, #1
+    VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
+    VADDL       qSum0, dRow0, dRow0Shft
+    VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
+    VADDL       qSum1, dRow1, dRow1Shft
+    VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
+    VEXT        dRow3Shft, dRow3, dRow3Shft, #1
+    VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
+    VEXT        dRow4Shft, dRow4, dRow4Shft, #1
+    VLD1        {dRow8, dRow8Shft}, [pSrc], srcStep
+    VADD        qSum1, qSum1, qRound
+    VADDL       qSum2, dRow2, dRow2Shft
+    VEXT        dRow5Shft, dRow5, dRow5Shft, #1
+    VADD        qSum0, qSum0, qSum1
+    VADDL       qSum3, dRow3, dRow3Shft
+    VEXT        dRow6Shft, dRow6, dRow6Shft, #1
+    VADD        qSum1, qSum1, qSum2
+    VSHRN       dRow0, qSum0, #2
+    VADDL       qSum4, dRow4, dRow4Shft
+    VSHRN       dRow1, qSum1, #2
+    VADD        qSum3, qSum3, qRound
+    VADDL       qSum5, dRow5, dRow5Shft
+    VST1        dRow0, [pDst@64], dstStep
+    VEXT        dRow7Shft, dRow7, dRow7Shft, #1
+    VST1        dRow1, [pDst@64], dstStep
+    VEXT        dRow8Shft, dRow8, dRow8Shft, #1
+    VADD        qSum5, qSum5, qRound
+    VADD        qSum2, qSum2, qSum3
+    VADD        qSum3, qSum3, qSum4
+    VADD        qSum4, qSum4, qSum5
+    VSHRN       dRow2, qSum2, #2
+    VSHRN       dRow3, qSum3, #2
+    VSHRN       dRow4, qSum4, #2
+    VADDL       qSum6, dRow6, dRow6Shft
+    VADDL       qSum7, dRow7, dRow7Shft
+    VST1        dRow2, [pDst@64], dstStep
+    VADDL       qSum8, dRow8, dRow8Shft
+    VADD        qSum7, qSum7, qRound
+    VST1        dRow3, [pDst@64], dstStep
+    VST1        dRow4, [pDst@64], dstStep
+    VADD        qSum5, qSum5, qSum6
+    VADD        qSum6, qSum6, qSum7
+    VADD        qSum7, qSum7, qSum8
+    VSHRN       dRow5, qSum5, #2
+    VSHRN       dRow6, qSum6, #2
+    VSHRN       dRow7, qSum7, #2
+    VST1        dRow5, [pDst@64], dstStep
+    VST1        dRow6, [pDst@64], dstStep
+    VST1        dRow7, [pDst@64], dstStep
+
+    MEND
+;// ***************************************************************************
+
+;// Input/Output Registers
+pSrc                  RN 0
+srcStep               RN 1
+pSrcResidue           RN 2
+pDst                  RN 3
+dstStep               RN 4
+predictType           RN 5
+rndVal                RN 6
+
+;// Local Scratch Registers
+pDstCopy              RN 0
+return                RN 0
+
+;// Neon Registers
+dRow0                 DN D0.U8
+dRow0Shft             DN D1.U8
+dRow1                 DN D2.U8
+dRow1Shft             DN D3.U8
+dRow2                 DN D4.U8
+dRow2Shft             DN D5.U8
+dRow3                 DN D6.U8
+dRow3Shft             DN D7.U8
+dRow4                 DN D8.U8
+dRow4Shft             DN D9.U8
+dRow5                 DN D10.U8
+dRow5Shft             DN D11.U8
+dRow6                 DN D12.U8
+dRow6Shft             DN D13.U8
+dRow7                 DN D14.U8
+dRow7Shft             DN D15.U8
+dRow8                 DN D16.U8
+dRow8Shft             DN D17.U8
+
+
+qSum0                 QN Q9.U16
+qSum1                 QN Q10.U16
+qSum2                 QN Q11.U16
+qSum3                 QN Q12.U16
+qSum4                 QN Q13.U16
+qSum5                 QN Q14.U16
+qSum6                 QN Q0.U16
+qSum7                 QN Q1.U16
+qSum8                 QN Q2.U16
+
+qRound                QN Q15.U16
+
+dDst0                 DN D0.U8
+dDst1                 DN D1.U8
+dDst2                 DN D2.U8
+dDst3                 DN D3.U8
+dDst4                 DN D4.U8
+dDst5                 DN D5.U8
+dDst6                 DN D6.U8
+dDst7                 DN D7.U8
+
+qRes0                 QN Q4.S16
+qRes1                 QN Q5.S16
+qRes2                 QN Q6.S16
+qRes3                 QN Q7.S16
+qRes4                 QN Q8.S16
+qRes5                 QN Q9.S16
+qRes6                 QN Q10.S16
+qRes7                 QN Q11.S16
+
+    ;// Function header
+    M_START     omxVCM4P2_MCReconBlock, r6, d15
+    ;// Define stack arguments
+    M_ARG       Arg_dstStep,        4
+    M_ARG       Arg_predictType,    4
+    M_ARG       Arg_rndVal,         4
+    ;// Load argument from the stack
+    M_LDR       dstStep, Arg_dstStep
+    M_LDR       predictType, Arg_predictType
+    M_LDR       rndVal, Arg_rndVal
+    ADD         predictType, rndVal, predictType, LSL #1
+    RSB         rndVal, rndVal, #2              ;// preparing rndVal for HalfPixelXY
+    
+    ;// The following is implementation of switching to different code segments
+    ;// based on different predictType and rndVal flags. The corresponding 
+    ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following
+    ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). While "M_MCRECONBLOCK_IntegerPixel" 
+    ;// and "M_MCRECONBLOCK_HalfPixelXY" handle for both rounding cases; 
+    ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" macros handle 
+    ;// the two rounding cases in separate code bases.
+    ;// All these together implement the interpolation functionality
+    
+    M_SWITCH    predictType
+        M_CASE      CaseIntegerPixel_Rnd0
+        M_CASE      CaseIntegerPixel_Rnd1
+        M_CASE      CaseHalfPixelX_Rnd0
+        M_CASE      CaseHalfPixelX_Rnd1
+        M_CASE      CaseHalfPixelY_Rnd0
+        M_CASE      CaseHalfPixelY_Rnd1
+        M_CASE      CaseHalfPixelXY_Rnd0
+        M_CASE      CaseHalfPixelXY_Rnd1
+    M_ENDSWITCH
+
+    M_MCRECONBLOCK_IntegerPixel
+    M_MCRECONBLOCK_HalfPixelX 0
+    M_MCRECONBLOCK_HalfPixelX 1
+    M_MCRECONBLOCK_HalfPixelY 0
+    M_MCRECONBLOCK_HalfPixelY 1
+    M_MCRECONBLOCK_HalfPixelXY
+SwitchPredictTypeEnd
+
+    ;// After interpolation is done, residue needs to be added. This is done 
+    ;// only in case "pSrcResidue" parameter to the function is not NULL.
+    ;// Following is a completely unrolled code to do so. Each row and 
+    ;// corresponding residue is loaded and residue is added and value 
+    ;// stored
+    
+    CMP         pSrcResidue, #0
+    SUBNE       pDst, pDst, dstStep, LSL #3     ;// Restoring pDst
+    MOVNE       pDstCopy, pDst
+    BEQ         pSrcResidueConditionEnd
+pSrcResidueNotNull    
+    VLD1        dDst0, [pDst@64], dstStep
+    VLD1        qRes0, [pSrcResidue@128]!
+    VLD1        dDst1, [pDst@64], dstStep
+    VLD1        qRes1, [pSrcResidue@128]!
+    VLD1        dDst2, [pDst@64], dstStep
+    VLD1        qRes2, [pSrcResidue@128]!
+    VADDW       qRes0, qRes0, dDst0
+    VLD1        dDst3, [pDst@64], dstStep
+    VADDW       qRes1, qRes1, dDst1
+    VLD1        qRes3, [pSrcResidue@128]!
+    VADDW       qRes2, qRes2, dDst2
+    VLD1        dDst4, [pDst@64], dstStep
+    VQMOVUN     dDst0, qRes0
+    VLD1        qRes4, [pSrcResidue@128]!
+    VADDW       qRes3, qRes3, dDst3
+    VLD1        dDst5, [pDst@64], dstStep
+    VQMOVUN     dDst1, qRes1
+    VLD1        qRes5, [pSrcResidue@128]!
+    VADDW       qRes4, qRes4, dDst4
+    VLD1        dDst6, [pDst@64], dstStep
+    VQMOVUN     dDst2, qRes2
+    VLD1        qRes6, [pSrcResidue@128]!
+    VADDW       qRes5, qRes5, dDst5
+    VLD1        dDst7, [pDst@64], dstStep
+    VQMOVUN     dDst3, qRes3
+    VLD1        qRes7, [pSrcResidue@128]!
+    VADDW       qRes6, qRes6, dDst6
+    VST1        dDst0, [pDstCopy@64], dstStep
+    VQMOVUN     dDst4, qRes4
+    VST1        dDst1, [pDstCopy@64], dstStep
+    VADDW       qRes7, qRes7, dDst7
+    VST1        dDst2, [pDstCopy@64], dstStep
+    VQMOVUN     dDst5, qRes5
+    VST1        dDst3, [pDstCopy@64], dstStep
+    VQMOVUN     dDst6, qRes6
+    VST1        dDst4, [pDstCopy@64], dstStep
+    VQMOVUN     dDst7, qRes7
+    VST1        dDst5, [pDstCopy@64], dstStep
+    VST1        dDst6, [pDstCopy@64], dstStep
+    VST1        dDst7, [pDstCopy@64], dstStep
+    
+pSrcResidueConditionEnd
+    MOV         return, #OMX_Sts_NoErr
+
+    M_END
+    ENDIF ;// CortexA8
+    END
+;// ***************************************************************************
+;// omxVCM4P2_MCReconBlock ends
+;// ***************************************************************************
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
new file mode 100755
index 0000000..a73f64a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
@@ -0,0 +1,320 @@
+; **********
+; * 
+; * File Name:  omxVCM4P2_PredictReconCoefIntra_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; * 
+; * Description:
+; * Contains module for DC/AC coefficient prediction
+; *
+; * 
+; * Function: omxVCM4P2_PredictReconCoefIntra
+; *
+; * Description:
+; * Performs adaptive DC/AC coefficient prediction for an intra block. Prior
+; * to the function call, prediction direction (predDir) should be selected
+; * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in]  pSrcDst      pointer to the coefficient buffer which contains the 
+; *                    quantized coefficient residuals (PQF) of the current 
+; *                    block; must be aligned on a 4-byte boundary. The 
+; *                    output coefficients are saturated to the range 
+; *                    [-2048, 2047].
+; * [in]  pPredBufRow  pointer to the coefficient row buffer; must be aligned
+; *                    on a 4-byte boundary.
+; * [in]  pPredBufCol  pointer to the coefficient column buffer; must be 
+; *                    aligned on a 4-byte boundary.
+; * [in]  curQP        quantization parameter of the current block. curQP may 
+; *                    equal to predQP especially when the current block and 
+; *                    the predictor block are in the same macroblock.
+; * [in]  predQP       quantization parameter of the predictor block
+; * [in]  predDir      indicates the prediction direction which takes one
+; *                    of the following values:
+; *                    OMX_VIDEO_HORIZONTAL    predict horizontally
+; *                    OMX_VIDEO_VERTICAL        predict vertically
+; * [in]  ACPredFlag   a flag indicating if AC prediction should be
+; *                    performed. It is equal to ac_pred_flag in the bit
+; *                    stream syntax of MPEG-4
+; * [in]  videoComp    video component type (luminance, chrominance or
+; *                    alpha) of the current block
+; * [out] pSrcDst      pointer to the coefficient buffer which contains
+; *                    the quantized coefficients (QF) of the current
+; *                    block
+; * [out] pPredBufRow  pointer to the updated coefficient row buffer
+; * [out] pPredBufCol  pointer to the updated coefficient column buffer
+; * Return Value:
+; * OMX_Sts_NoErr - no error
+; * OMX_Sts_BadArgErr - Bad arguments 
+; * - At least one of the pointers is NULL: pSrcDst, pPredBufRow, or pPredBufCol.
+; * - At least one the following cases: curQP <= 0, predQP <= 0, curQP >31, 
+; *   predQP > 31, preDir exceeds [1,2].
+; * - At least one of the pointers pSrcDst, pPredBufRow, or pPredBufCol is not 
+; *   4-byte aligned.
+; *
+; *********
+     
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+       M_VARIANTS CortexA8
+       
+             
+
+       IMPORT        armVCM4P2_Reciprocal_QP_S32
+       IMPORT        armVCM4P2_Reciprocal_QP_S16
+       IMPORT        armVCM4P2_DCScaler
+       
+        IF CortexA8
+;// Input Arguments
+
+pSrcDst          RN 0
+pPredBufRow      RN 1
+pPredBufCol      RN 2
+curQP            RN 3
+QP               RN 3
+predQP           RN 4
+predDir          RN 5
+ACPredFlag       RN 6
+videoComp        RN 7
+
+;// Local Variables
+
+shortVideoHeader RN 4
+dcScaler         RN 4
+index            RN 6
+predCoeffTable   RN 7
+temp1            RN 6
+temp2            RN 9
+temp             RN 14
+Const            RN 8
+temppPredColBuf  RN 8
+tempPred         RN 9
+
+absCoeffDC       RN 8
+negdcScaler      RN 10
+Rem              RN 11
+temp3            RN 12
+
+dcRowbufCoeff    RN 10
+dcColBuffCoeff   RN 11
+Return           RN 0
+
+;//NEON Registers
+
+qPredRowBuf       QN Q0.S16
+dPredRowBuf0      DN D0.S16
+dPredRowBuf1      DN D1.S16
+
+
+
+
+qCoeffTab         QN Q1.S32
+
+qPredQP           QN Q2.S16
+dPredQP0          DN D4.S16
+dPredQP1          DN D5.S16
+
+
+qtemp1            QN Q3.S32
+qtemp             QN Q3.S16
+
+dtemp0            DN D6.S16
+dtemp1            DN D7.S16
+
+dtemp2            DN D8.S16
+dtemp3            DN D9.S16
+
+dtemp4            DN D2.S16
+dtemp5            DN D3.S16
+dtemp6            DN D4.S16
+dtemp7            DN D5.S16
+ 
+qtempPred1        QN Q5.S32
+qtempPred         QN Q5.S16
+
+dtempPred0        DN D10.S16
+dtempPred1        DN D11.S16
+ 
+  
+
+      M_START   omxVCM4P2_PredictReconCoefIntra,r11,d11
+
+      ;// Assigning pointers to Input arguments on Stack
+    
+      M_ARG           predQPonStack,4  
+      M_ARG           predDironStack,4
+      M_ARG           ACPredFlagonStack,4
+      M_ARG           videoComponStack,4
+      
+      ;// DC Prediction
+
+      M_LDR           videoComp,videoComponStack                     ;// Load videoComp From Stack               
+            
+      M_LDR           predDir,predDironStack                         ;// Load Prediction direction
+      ;// DC Scaler calculation   
+      LDR             index, =armVCM4P2_DCScaler
+      ADD             index,index,videoComp,LSL #5
+      LDRB            dcScaler,[index,QP]
+
+       
+      LDR             predCoeffTable, =armVCM4P2_Reciprocal_QP_S16   ;// Loading the table with entries 32767/(1 to 63) 
+      CMP             predDir,#2                                     ;// Check if the Prediction direction is vertical
+
+      ;// Caulucate tempPred
+            
+      LDREQSH         absCoeffDC,[pPredBufRow]                       ;// If vetical load the coeff from Row Prediction Buffer
+      LDRNESH         absCoeffDC,[pPredBufCol]                       ;// If horizontal load the coeff from column Prediction Buffer
+      
+      RSB             negdcScaler,dcScaler,#0                        ;// negdcScaler=-dcScaler   
+      MOV             temp1,absCoeffDC                               ;// Load the Prediction coeff to temp for comparision                               
+      CMP             temp1,#0                                       
+      RSBLT           absCoeffDC,temp1,#0                            ;// calculate absolute val of prediction coeff
+      
+      ADD             temp,dcScaler,dcScaler
+      LDRH            temp,[predCoeffTable,temp]                     ;// Load value from coeff table for performing division using multiplication
+      SMULBB          tempPred,temp,absCoeffDC                       ;// tempped=pPredBufRow(Col)[0]*32767/dcScaler
+      ADD             temp3,dcScaler,#1
+      LSR             tempPred,tempPred,#15                          ;// tempped=pPredBufRow(Col)[0]/dcScaler                  
+      LSR             temp3,temp3,#1                                 ;// temp3=round(dcScaler/2)           
+      MLA             Rem,negdcScaler,tempPred,absCoeffDC            ;// Remainder Rem=abs(pPredBufRow(Col)[0])-tempPred*dcScaler
+      
+      LDRH            dcRowbufCoeff,[pPredBufCol]            
+      
+      CMP             Rem,temp3                                      ;// compare Rem with (dcScaler/2)
+      ADDGE           tempPred,#1                                    ;// tempPred=tempPred+1 if Rem>=(dcScaler/2)
+      CMP             temp1,#0
+      RSBLT           tempPred,tempPred,#0                           ;// tempPred=-tempPred if 
+       
+      STRH            dcRowbufCoeff,[pPredBufRow,#-16]      
+       
+
+      LDRH            temp,[pSrcDst]                                 ;// temp=pSrcDst[0]
+      ADD             temp,temp,tempPred                             ;// temp=pSrcDst[0]+tempPred
+      SSAT16          temp,#12,temp                                  ;// clip temp to [-2048,2047]
+      SMULBB          dcColBuffCoeff,temp,dcScaler                   ;// temp1=clipped(pSrcDst[0])*dcScaler           
+      M_LDR           ACPredFlag,ACPredFlagonStack
+      STRH            dcColBuffCoeff,[pPredBufCol]      
+      
+
+       ;// AC Prediction
+      
+      M_LDR           predQP,predQPonStack
+      
+      CMP             ACPredFlag,#1                                  ;// Check if the AC prediction flag is set or not
+      BNE             Exit                                           ;// If not set Exit
+      CMP             predDir,#2                                     ;// Check the Prediction direction                       
+      LDR             predCoeffTable, =armVCM4P2_Reciprocal_QP_S32   ;// Loading the table with entries 0x1ffff/(1 to 63) 
+      MOV             Const,#4
+      MUL             curQP,curQP,Const                              ;// curQP=4*curQP
+      VDUP            dPredQP0,predQP
+      LDR             temp2,[predCoeffTable,curQP]                   ;// temp=0x1ffff/curQP
+      VDUP            qCoeffTab,temp2
+      BNE             Horizontal                                     ;// If the Prediction direction is horizontal branch to Horizontal
+      
+     
+      
+      ;// Vertical
+      ;//Calculating tempPred
+
+      VLD1            {dPredRowBuf0,dPredRowBuf1},[pPredBufRow]      ;// Loading pPredBufRow[i]:i=0 t0 7
+      
+      VMULL           qtemp1,dPredRowBuf0,dPredQP0                   ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i]: i=0 t0 3
+      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=0 t0 3
+      
+      VMULL           qtemp1,dPredRowBuf1,dPredQP0                   ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i] : i=4 t0 7      
+
+      VRSHR           qtempPred1,qtempPred1,#17                      ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP) : i=0 t0 3
+      VSHRN           dPredQP1,qtempPred1,#0                         ;// narrow qtempPred1[i] to 16 bits
+      
+      
+      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=4 t0 7
+      VRSHR           qtempPred1,qtempPred1,#17                      ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP)  : i=4 t0 7
+      VLD1            {dtemp0,dtemp1},[pSrcDst]                      ;//Loading pSrcDst[i] : i=0 to 7
+      VSHRN           dtempPred1,qtempPred1,#0                       ;// narrow qtempPred1[i] to 16 bits
+      VMOV            dtempPred0,dPredQP1
+      
+      ;//updating source and row prediction buffer contents      
+      VADD            qtemp,qtemp,qtempPred                          ;//pSrcDst[i]=pSrcDst[i]+qtempPred[i]: i=0 to 7 
+      VQSHL           qtemp,qtemp,#4                                 ;//Clip to [-2048,2047]
+      LDRH            dcRowbufCoeff,[pPredBufRow]                    ;//Loading Dc Value of Row Prediction buffer
+      VSHR            qtemp,qtemp,#4
+      
+      VST1            {dtemp0,dtemp1},[pSrcDst]                      ;//storing back the updated values 
+      VST1            {dtemp0,dtemp1},[pPredBufRow]                  ;//storing back the updated row prediction values                      
+      STRH            dcRowbufCoeff,[pPredBufRow]                    ;// storing the updated DC Row Prediction coeff
+      
+      B               Exit
+
+Horizontal
+
+      ;// Calculating Temppred
+
+            
+
+      VLD1            {dPredRowBuf0,dPredRowBuf1},[pPredBufCol]      ;// Loading pPredBufCol[i]:i=0 t0 7
+      VMULL           qtemp1,dPredRowBuf0,dPredQP0                   ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i]: i=0 t0 3
+      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=0 t0 3
+      
+      VMULL           qtemp1,dPredRowBuf1,dPredQP0                   ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i] : i=4 t0 7      
+
+      VRSHR           qtempPred1,qtempPred1,#17                      ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP) : i=0 t0 3
+      VSHRN           dPredQP1,qtempPred1,#0                         ;// narrow qtempPred1[i] to 16 bits
+      
+      
+      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=4 t0 7
+      
+      MOV             temppPredColBuf,pPredBufCol
+      VRSHR           qtempPred1,qtempPred1,#17                      ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP)  : i=4 t0 7
+      VLD4            {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst]        ;// Loading coefficients Interleaving by 4
+      VSHRN           dtempPred1,qtempPred1,#0                       ;// narrow qtempPred1[i] to 16 bits
+      VMOV            dtempPred0,dPredQP1
+      
+      ;// Updating source and column prediction buffer contents     
+      ADD             temp2,pSrcDst,#32                                  
+      VLD4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]          ;// Loading next 16 coefficients Interleaving by 4
+      VUZP            dtemp0,dtemp4                                  ;// Interleaving by 8
+      VADD            dtemp0,dtemp0,dtempPred0                       ;// Adding tempPred to coeffs
+      VQSHL           dtemp0,dtemp0,#4                               ;// Clip to [-2048,2047]
+      VSHR            dtemp0,dtemp0,#4
+      VST1            {dtemp0},[pPredBufCol]!                        ;// Updating Pridiction column buffer
+      VZIP            dtemp0,dtemp4                                  ;// deinterleaving
+      VST4            {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst]        ;// Updating source coeffs         
+      VST4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]!
+      
+      MOV             temp1,temp2                                     
+      VLD4            {dtemp0,dtemp1,dtemp2,dtemp3},[temp2]!         ;// Loading  coefficients Interleaving by 4
+      
+      VLD4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]
+      VUZP            dtemp0,dtemp4                                  ;// Interleaving by 8
+      VADD            dtemp0,dtemp0,dtempPred1
+      VQSHL           dtemp0,dtemp0,#4                               ;// Clip to [-2048,2047]
+      VSHR            dtemp0,dtemp0,#4
+      VST1            {dtemp0},[pPredBufCol]!
+      VZIP            dtemp0,dtemp4
+      VST4            {dtemp0,dtemp1,dtemp2,dtemp3},[temp1]
+      STRH            dcColBuffCoeff,[temppPredColBuf] 
+      VST4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]
+      
+Exit
+
+      STRH            temp,[pSrcDst]
+          
+ 
+      MOV             Return,#OMX_Sts_NoErr 
+ 
+      M_END
+      ENDIF
+
+
+       END
+
+
+   
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
new file mode 100755
index 0000000..bd0ad1f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
@@ -0,0 +1,162 @@
+;/**
+; * 
+; * File Name:  omxVCM4P2_QuantInvInter_I_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; *
+; * Description: 
+; * Contains modules for inter reconstruction
+; * 
+; *
+; *
+; *
+; *
+; * Function: omxVCM4P2_QuantInvInter_I
+; *
+; * Description:
+; * Performs inverse quantization on intra/inter coded block.
+; * This function supports bits_per_pixel = 8. Mismatch control
+; * is performed for the first MPEG-4 mode inverse quantization method.
+; * The output coefficients are clipped to the range: [-2048, 2047].
+; * Mismatch control is performed for the first inverse quantization method.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] pSrcDst          pointer to the input (quantized) intra/inter block. Must be 16-byte aligned.
+; * [in] QP              quantization parameter (quantiser_scale)
+; * [in] videoComp      (Intra version only.) Video component type of the
+; *                  current block. Takes one of the following flags:
+; *                  OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE,
+; *                  OMX_VC_ALPHA.
+; * [in] shortVideoHeader a flag indicating presence of short_video_header;
+; *                       shortVideoHeader==1 selects linear intra DC mode,
+; *                  and shortVideoHeader==0 selects nonlinear intra DC mode.
+; * [out]    pSrcDst      pointer to the output (dequantized) intra/inter block.  Must be 16-byte aligned.
+; *
+; * Return Value:
+; * OMX_Sts_NoErr - no error
+; * OMX_Sts_BadArgErr - bad arguments
+; *    - If pSrcDst is NULL or is not 16-byte aligned.
+; *      or
+; *    - If QP <= 0.
+; *      or
+; *    - videoComp is none of OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE and OMX_VC_ALPHA.
+; *
+; */
+
+   INCLUDE omxtypes_s.h
+   INCLUDE armCOMM_s.h
+
+   M_VARIANTS CortexA8
+
+     IF CortexA8
+     
+     
+;//Input Arguments
+pSrcDst            RN 0
+QP                 RN 1
+     
+
+;//Local Variables
+Count              RN 3
+doubleQP           RN 4
+Return             RN 0
+;// Neon registers
+
+
+dQP10              DN D0.S32[0]
+qQP1               QN Q0.S32
+
+dQP1               DN D0.S16
+dMinusQP1          DN D1.S16
+
+dCoeff0            DN D2.S16
+dCoeff1            DN D3.S16   
+
+qResult0           QN Q3.S32
+dResult0           DN D7.S16
+qSign0             QN Q3.S32
+dSign0             DN D6.S16
+
+qResult1           QN Q4.S32
+dResult1           DN D8.S16
+qSign1             QN Q4.S32
+dSign1             DN D8.S16
+
+d2QP0              DN D10.S32[0]
+q2QP0              QN Q5.S32
+d2QP               DN D10.S16
+
+dZero0             DN D11.S16
+dZero1             DN D12.S16
+dConst0            DN D13.S16
+
+    
+     M_START omxVCM4P2_QuantInvInter_I,r4,d13
+     
+         
+         
+         ADD      doubleQP,QP,QP                   ;// doubleQP= 2*QP
+         VMOV     d2QP0,doubleQP
+         VDUP     q2QP0,d2QP0                      ;// Move doubleQP in to a scalar
+         TST      QP,#1                   
+         VLD1     {dCoeff0,dCoeff1},[pSrcDst]      ;// Load first 8 values to Coeff0,Coeff1
+         SUBEQ    QP,QP,#1                            
+         VMOV     dQP10,QP                         ;// If QP is even then QP1=QP-1 else QP1=QP     
+         MOV      Count,#64
+         VDUP     qQP1,dQP10                       ;// Duplicate tempResult with QP1
+         VSHRN    d2QP,q2QP0,#0
+         VEOR     dConst0,dConst0,dConst0
+         VSHRN    dQP1,qQP1,#0                     ;// QP1 truncated to 16 bits
+         VSUB     dMinusQP1,dConst0,dQP1           ;// dMinusQP1=-QP1
+
+Loop                       
+         
+        ;//Performing Inverse Quantization
+         
+         VCLT     dSign0,dCoeff0, #0               ;// Compare Coefficient 0 against 0
+         VCLT     dSign1,dCoeff1, #0               ;// Compare Coefficient 1 against 0
+         VCEQ     dZero0,dCoeff0,#0                ;// Compare Coefficient 0 against zero
+         VBSL     dSign0,dMinusQP1,dQP1            ;// dSign0 = -QP1 if Coeff0< 0 else QP1
+         VCEQ     dZero1,dCoeff1,#0                ;// Compare Coefficient 1 against zero
+         VBSL     dSign1,dMinusQP1,dQP1            ;// dSign1 = -QP1 if Coeff1< 0 else QP1
+         VMOVL    qSign0,dSign0                    ;// Sign extend qSign0 to 32 bits
+         VMOVL    qSign1,dSign1
+         VMLAL    qResult0,dCoeff0,d2QP            ;// qResult0[i]= qCoeff0[i]+qCoeff0[i]*(-2) if Coeff <0 
+                                                   ;// qResult0[i]= qCoeff0[i]                 if Coeff >=0 
+         VMLAL    qResult1,dCoeff1,d2QP            ;// qResult1[i]= qCoeff1[i]+qCoeff1[i]*(-2) if Coeff <0 
+                                                   ;// qResult1[i]= qCoeff1[i]                 if Coeff >=0 
+         ;// Clip Result to [-2048,2047]                     
+         
+         VQSHL    qResult0,qResult0,#20            ;// clip to [-2048,2047]
+         VQSHL    qResult1,qResult1,#20
+                 
+         VSHR     qResult0,qResult0,#4  
+         VSHR     qResult1,qResult1,#4
+         VSHRN    dResult0,qResult0,#16            ;// Narrow the clipped Value to Halfword
+         VSHRN    dResult1,qResult1,#16 
+         VBIT     dResult0,dConst0,dZero0  
+         VBIT     dResult1,dConst0,dZero1     
+         
+         VST1     {dResult0,dResult1},[pSrcDst]!   ;// Store the result
+         SUBS     Count,Count,#8
+         VLD1     {dCoeff0,dCoeff1},[pSrcDst]
+         
+         
+         BGT      Loop
+
+         MOV      Return,#OMX_Sts_NoErr
+
+
+         M_END
+         ENDIF
+         
+
+        END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
new file mode 100755
index 0000000..e00591f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
@@ -0,0 +1,210 @@
+;/**
+; * 
+; * File Name:  omxVCM4P2_QuantInvIntra_I_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision:   12290
+; * Date:       Wednesday, April 9, 2008
+; * 
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; * 
+; * 
+; *
+; * Description: 
+; * Contains modules for inter reconstruction
+; * 
+; *
+; *
+; *
+; *
+; * 
+; * Function: omxVCM4P2_QuantInvIntra_I
+; *
+; * Description:
+; * Performs inverse quantization on intra/inter coded block.
+; * This function supports bits_per_pixel = 8. Mismatch control
+; * is performed for the first MPEG-4 mode inverse quantization method.
+; * The output coefficients are clipped to the range: [-2048, 2047].
+; * Mismatch control is performed for the first inverse quantization method.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in]    pSrcDst        pointer to the input (quantized) intra/inter block. Must be 16-byte aligned.
+; * [in]    QP            quantization parameter (quantiser_scale)
+; * [in]    videoComp          (Intra version only.) Video component type of the
+; *                    current block. Takes one of the following flags:
+; *                    OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE,
+; *                    OMX_VC_ALPHA.
+; * [in]    shortVideoHeader  a flag indicating presence of short_video_header;
+; *                           shortVideoHeader==1 selects linear intra DC mode,
+; *                    and shortVideoHeader==0 selects nonlinear intra DC mode.
+; * [out]    pSrcDst        pointer to the output (dequantized) intra/inter block.  Must be 16-byte aligned.
+; *
+; * Return Value:
+; * OMX_Sts_NoErr - no error
+; * OMX_Sts_BadArgErr - bad arguments
+; *    -    If pSrcDst is NULL or is not 16-byte aligned.
+; *      or
+; *    - If QP <= 0.
+; *      or
+; *    - videoComp is none of OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE and OMX_VC_ALPHA.
+; *
+ 
+
+   INCLUDE omxtypes_s.h
+   INCLUDE armCOMM_s.h
+   
+   M_VARIANTS CortexA8
+   
+   
+   IMPORT        armVCM4P2_DCScaler
+ 
+     IF CortexA8
+     
+     
+;//Input Arguments
+pSrcDst            RN 0
+QP                 RN 1
+videoComp          RN 2
+shortVideoHeader   RN 3
+     
+
+;//Local Variables
+
+dcScaler           RN 4
+temp               RN 14
+index              RN 5
+
+
+Count              RN 5
+doubleQP           RN 4
+Return             RN 0
+
+
+;// Neon registers
+
+
+dQP10              DN D0.S32[0]
+qQP1               QN Q0.S32
+
+dQP1               DN D0.S16
+dMinusQP1          DN D1.S16
+
+dCoeff0            DN D2.S16
+dCoeff1            DN D3.S16   
+
+qResult0           QN Q3.S32
+dResult0           DN D7.S16
+qSign0             QN Q3.S32
+dSign0             DN D6.S16
+
+qResult1           QN Q4.S32
+dResult1           DN D8.S16
+qSign1             QN Q4.S32
+dSign1             DN D8.S16
+
+d2QP0              DN D10.S32[0]
+q2QP0              QN Q5.S32
+d2QP               DN D10.S16
+
+dZero0             DN D11.S16
+dZero1             DN D4.S16
+dConst0            DN D5.S16
+
+
+
+
+     
+     
+     M_START omxVCM4P2_QuantInvIntra_I,r5,d11
+
+
+        ;// Perform Inverse Quantization for DC coefficient
+
+        TEQ       shortVideoHeader,#0      ;// Test if short Video Header flag =0             
+        MOVNE     dcScaler,#8              ;// if shortVideoHeader is non zero dcScaler=8
+        BNE       calDCVal
+        
+        LDR       index, =armVCM4P2_DCScaler
+      ADD       index,index,videoComp,LSL #5
+      LDRB      dcScaler,[index,QP]
+
+        ;//M_CalDCScalar  shortVideoHeader,videoComp, QP
+
+calDCVal
+
+        LDRH     temp,[pSrcDst]
+        SMULBB   temp,temp,dcScaler       ;// dcCoeff = dcScaler * Quantized DC coefficient(from memory)
+        SSAT     temp,#12,temp            ;// Saturating to 12 bits
+      
+
+
+        ;// Perform Inverse Quantization for Ac Coefficients
+     
+         
+         
+         ADD      doubleQP,QP,QP                   ;// doubleQP= 2*QP
+         VMOV     d2QP0,doubleQP
+         VDUP     q2QP0,d2QP0                      ;// Move doubleQP in to a scalar
+         TST      QP,#1                   
+         VLD1     {dCoeff0,dCoeff1},[pSrcDst]      ;// Load first 8 values to Coeff0,Coeff1
+         SUBEQ    QP,QP,#1                            
+         VMOV     dQP10,QP                         ;// If QP is even then QP1=QP-1 else QP1=QP     
+         MOV      Count,#64
+         VDUP     qQP1,dQP10                       ;// Duplicate tempResult with QP1
+         VSHRN    d2QP,q2QP0,#0
+         VEOR     dConst0,dConst0,dConst0
+         VSHRN    dQP1,qQP1,#0                     ;// QP1 truncated to 16 bits
+         VSUB     dMinusQP1,dConst0,dQP1           ;// dMinusQP1=-QP1
+
+Loop                       
+         
+        ;//Performing Inverse Quantization
+         
+         VCLT     dSign0,dCoeff0, #0               ;// Compare Coefficient 0 against 0
+         VCLT     dSign1,dCoeff1, #0               ;// Compare Coefficient 1 against 0
+         VCEQ     dZero0,dCoeff0,#0                ;// Compare Coefficient 0 against zero
+         VBSL     dSign0,dMinusQP1,dQP1            ;// dSign0 = -QP1 if Coeff0< 0 else QP1
+         VCEQ     dZero1,dCoeff1,#0                ;// Compare Coefficient 1 against zero
+         VBSL     dSign1,dMinusQP1,dQP1            ;// dSign1 = -QP1 if Coeff1< 0 else QP1
+         VMOVL    qSign0,dSign0                    ;// Sign extend qSign0 to 32 bits
+         VMOVL    qSign1,dSign1
+         VMLAL    qResult0,dCoeff0,d2QP            ;// qResult0[i]= qCoeff0[i]+qCoeff0[i]*(-2) if Coeff <0 
+                                                   ;// qResult0[i]= qCoeff0[i]                 if Coeff >=0 
+         VMLAL    qResult1,dCoeff1,d2QP            ;// qResult1[i]= qCoeff1[i]+qCoeff1[i]*(-2) if Coeff <0 
+                                                   ;// qResult1[i]= qCoeff1[i]                 if Coeff >=0 
+         ;// Clip Result to [-2048,2047]                     
+         
+         VQSHL    qResult0,qResult0,#20            ;// clip to [-2048,2047]
+         VQSHL    qResult1,qResult1,#20
+                 
+         VSHR     qResult0,qResult0,#4  
+         VSHR     qResult1,qResult1,#4
+         VSHRN    dResult0,qResult0,#16            ;// Narrow the clipped Value to Halfword
+         VSHRN    dResult1,qResult1,#16 
+         VBIT     dResult0,dConst0,dZero0  
+         VBIT     dResult1,dConst0,dZero1     
+         
+         VST1     {dResult0,dResult1},[pSrcDst]!   ;// Store the result
+         SUBS     Count,Count,#8
+         VLD1     {dCoeff0,dCoeff1},[pSrcDst]
+         
+         
+         BGT      Loop
+         
+         SUB      pSrcDst,pSrcDst,#128
+         
+         ;// Store the Inverse quantized Dc coefficient
+         
+         STRH     temp,[pSrcDst],#2
+        
+         MOV      Return,#OMX_Sts_NoErr
+         
+
+
+         M_END
+         ENDIF
+         
+
+        END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c
new file mode 100755
index 0000000..5d93681
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c
@@ -0,0 +1,6 @@
+#include "omxtypes.h"
+#include "armCOMM_Version.h"
+
+#ifdef ARM_INCLUDE_VERSION_DESCRIPTIONS
+const char * const omxVC_VersionDescription = "ARM OpenMAX DL v" ARM_VERSION_STRING "   Rel=" OMX_ARM_RELEASE_TAG "   Arch=" OMX_ARM_BUILD_ARCHITECTURE "   Tools="  OMX_ARM_BUILD_TOOLCHAIN ;
+#endif /* ARM_INCLUDE_VERSION_DESCRIPTIONS */