summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon
diff options
context:
space:
mode:
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon')
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_DELIVERY.TXT63
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_MANIFEST.TXT91
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h785
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h670
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h212
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h40
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h1451
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h27
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h43
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h1157
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h274
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h252
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h77
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/build_vc.pl113
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/filelist_vc.txt75
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM.c936
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_Bitstream.c329
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_IDCTTable.c60
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_MaskTable.c45
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h1153
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h72
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h4381
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h129
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s95
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s70
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s236
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h30
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s222
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c327
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s198
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s396
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s325
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s123
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s236
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s149
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s178
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s313
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s266
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s228
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s134
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s318
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s74
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s186
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s92
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c88
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c91
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c62
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c68
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s396
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s202
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s282
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s288
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s436
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c79
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s553
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s436
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s424
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s531
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s140
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s264
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S134
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S54
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S102
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S272
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S103
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S123
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S105
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S107
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S164
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S119
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S72
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S58
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S175
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S68
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S52
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S40
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S67
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S119
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S87
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S123
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S107
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S157
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S323
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S217
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S239
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S261
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S54
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S76
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h37
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h25
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s82
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s398
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c211
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c75
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s104
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c61
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c102
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c214
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s364
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s132
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s136
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s224
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s194
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s73
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s444
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s320
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s162
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s210
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c6
109 files changed, 28123 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_DELIVERY.TXT b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_DELIVERY.TXT
new file mode 100755
index 0000000..cc2d70a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_DELIVERY.TXT
@@ -0,0 +1,63 @@
+The contents of this transaction was created by Hedley Francis
+of ARM on 19-Feb-2008.
+
+It contains the ARM data versions listed below.
+
+This data, unless otherwise stated, is ARM Proprietary and access to it
+is subject to the agreements indicated below.
+
+If you experience problems with this data, please contact ARM support
+quoting transaction reference <97414>.
+
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+- OX002-SW-98010-r0p0-00bet1
+ Video codecs - optimised code
+ V7 code release for Hantro (Ver 1.0.2)
+ internal access
+
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+This transaction contains deliverables which are designated as being of
+beta release status (BET).
+
+Beta release status has a particular meaning to ARM of which the recipient
+must be aware. Beta is a pre-release status indicating that the deliverable
+so described is believed to robustly demonstrate specified behaviour, to be
+consistent across its included aspects and be ready for general deployment.
+But Beta also indicates that pre-release reliability trials are ongoing and
+that it is possible residual defects or errors in operation, consistency
+and documentation may still be encountered. The recipient should consider
+this position when using this Beta material supplied. ARM will normally
+attempt to provide fixes or a work-around for defects identified by the
+recipient, but the provision or timeliness of this support cannot be
+guaranteed. ARM shall not be responsible for direct or consequential
+damages as a result of encountering one or more of these residual defects.
+By accepting a Beta release, the recipient agrees to these constraints and
+to providing reasonable information to ARM to enable the replication of the
+defects identified by the recipient. The specific Beta version supplied
+will not be supported after release of a later or higher status version.
+It should be noted that Support for the Beta release of the deliverable
+will only be provided by ARM to a recipient who has a current support and
+maintenance contract for the deliverable.
+
+
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+In addition to the data versions listed above, this transaction contains
+two additional files at the top level.
+
+The first is this file, ARM_DELIVERY_97414.TXT, which is the delivery
+note.
+
+The second is ARM_MANIFEST_97414.TXT which contains a manifest of all the
+files included in this transaction, together with their checksums.
+
+The checksums provided are calculated using the RSA Data Security, Inc.
+MD5 Message-Digest Algorithm.
+
+The checksums can be used to verify the integrity of this data using the
+"md5sum" tool (which is part of the GNU "textutils" package) by running:
+
+ % md5sum --check ARM_MANIFEST_97414.TXT
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_MANIFEST.TXT b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_MANIFEST.TXT
new file mode 100755
index 0000000..8310f67
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/ARM_MANIFEST.TXT
@@ -0,0 +1,91 @@
+ OX002-SW-98010-r0p0-00bet1/
+ OX002-SW-98010-r0p0-00bet1/api/
+e049791cfab6060a08cbac7b3ad767d6 OX002-SW-98010-r0p0-00bet1/api/armCOMM_s.h
+ed798face25497b2703ede736d6d52b6 OX002-SW-98010-r0p0-00bet1/api/omxtypes_s.h
+4eebd63af087376811d6749f0646b864 OX002-SW-98010-r0p0-00bet1/api/armCOMM_BitDec_s.h
+43cf46c2cf2fe1f93c615b57bcbe4809 OX002-SW-98010-r0p0-00bet1/api/armCOMM.h
+8f248ceaac8f602e277a521b679dcbbe OX002-SW-98010-r0p0-00bet1/api/armCOMM_IDCTTable.h
+8ac5fa80ea98e391f5730a375280b5bd OX002-SW-98010-r0p0-00bet1/api/armCOMM_Version.h
+3a2f420ddf6a1b950470bd0f5ebd5c62 OX002-SW-98010-r0p0-00bet1/api/armCOMM_IDCT_s.h
+511c0bb534fe223599e2c84eff24c9ed OX002-SW-98010-r0p0-00bet1/api/armCOMM_MaskTable.h
+8971932d56eed6b1ad1ba507f0bff5f0 OX002-SW-98010-r0p0-00bet1/api/armCOMM_Bitstream.h
+f87fedd9ca432fefa757008176864ef8 OX002-SW-98010-r0p0-00bet1/api/armOMX.h
+8e49899a428822c36ef9dd94e0e05f18 OX002-SW-98010-r0p0-00bet1/api/omxtypes.h
+323008b72e9f04099a8cb42e99a1face OX002-SW-98010-r0p0-00bet1/build_vc.pl
+e72d96c0a415459748df9807f3dae72f OX002-SW-98010-r0p0-00bet1/filelist_vc.txt
+ OX002-SW-98010-r0p0-00bet1/src/
+5eeae659a29477f5c52296d24afffd3c OX002-SW-98010-r0p0-00bet1/src/armCOMM_IDCTTable.c
+d64cdcf38f7749dc7f77465e5b7d356d OX002-SW-98010-r0p0-00bet1/src/armCOMM_MaskTable.c
+ OX002-SW-98010-r0p0-00bet1/vc/
+ OX002-SW-98010-r0p0-00bet1/vc/m4p10/
+ OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/
+e7e0c320978564a7c9b2c723749a98d6 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_CAVLCTables.c
+4adcd0df081990bdfc4729041a2a9152 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
+852e0404142965dc1f3aa7f00ee5127b OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
+7054151c5bfea6b5e74feee86b2d7b01 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
+5f7213a4f37627b3c58f6294ba477e30 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DequantTables_s.s
+32ff4b8be62e2f0f3e764b83c1e5e2fd OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
+d066e3c81d82616f37ec1810ea49e7b7 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
+fe629a3e9d55395a6098bdf2431b5f02 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
+5b13fb954b7679de20076bb6a7f4ee1d OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
+01ba60eff66ea49a4f833ce6279f8e2f OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
+fa1072cf1d17e9666c9f1e215fa302b1 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
+db387b9e66d32787f47ef9cf0347da2a OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
+ea537e4e2ad03a1940981055fa3ace01 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
+29a4283885b9473a3550a81eff2559d2 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
+2ddcaf60a8ea1e6e6b77737f768bfb9d OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_QuantTables_s.s
+c3002aad5600f872b70a5d7fe3915846 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
+a2900f2c47f1c61d20bd6c1eda33d6d4 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
+c921df73397a32c947dc996ba6858553 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
+3769e14f2fc3f514d025fe6ab73ff67a OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+c029d1cebea0a09e1d235a37e2155002 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
+076a033f8161750a685756f9f51f04c9 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+c5b5d22842822e6e5e31094882cbeb46 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
+f6bdf6d914a4a1479f524951a3409846 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
+ebeb0713a9b2ea25986360ef262138c4 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
+78ed9ea200faa7be665445a713859af1 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
+c2d995f787b6f44ef10c751c12d1935f OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
+40bed679a9f6e0d3efe216b7d4a9cf45 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
+4a52b3e9e268b8a8f07829bf500d03af OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
+11249f8a98c5d4b84cb5575b0e37ca9c OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
+2513b60559ba71ae495c6053fb779fa9 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
+2fb1ee17c36e3c1469c170f6dac11bf1 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
+cc4a6f32db0b72a91d3f278f6855df69 OX002-SW-98010-r0p0-00bet1/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
+ OX002-SW-98010-r0p0-00bet1/vc/m4p10/api/
+6e530ddaa7c2b57ffe88162c020cb662 OX002-SW-98010-r0p0-00bet1/vc/m4p10/api/armVCM4P10_CAVLCTables.h
+ OX002-SW-98010-r0p0-00bet1/vc/m4p2/
+ OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/
+bec6de348b113438498867b869001622 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Clip8_s.s
+dba9824e959b21d401cac925e68a11a6 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
+dfa7e5b58027be3542dda0593b77b2d3 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
+4fba4c431a783a78a2eb6497a94ac967 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
+39991961179ca03b6381b6e653b1f14b OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
+1b0b2990c2669dfb87cf6b810611c01b OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
+1c9b87abf3283e957816b3937c680701 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
+4fe1afca659a9055fc1172e58f78a506 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
+2ea067f0436f91ba1351edaf411cb4ea OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_Lookup_Tables.c
+6ce363aadc9d65c308b40cca8902e4f6 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
+bf212f786772aed2bc705d22ff4e74f5 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
+293a48a648a3085456e6665bb7366fad OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_SetPredDir_s.s
+2bb47ed9c9e25c5709c6d9b4ad39a38a OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
+437dfa204508850d61d4b87091446e9f OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
+bc9778898dd41101dc0fb0139eaf83cc OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
+fc191eeae43f8ce735dbd311cc7bcb8d OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
+a0d85f4f517c945a4c9317ac021f2d08 OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
+386020dee8b725c7fe2526f1fc211d7d OX002-SW-98010-r0p0-00bet1/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
+ OX002-SW-98010-r0p0-00bet1/vc/m4p2/api/
+4624e7c838e10a249abcc3d3f4f40748 OX002-SW-98010-r0p0-00bet1/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
+65e1057d04e2cb844559dc9f6e09795a OX002-SW-98010-r0p0-00bet1/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
+ OX002-SW-98010-r0p0-00bet1/vc/src/
+e627b3346b0dc9aff14446005ce0fa43 OX002-SW-98010-r0p0-00bet1/vc/src/armVC_Version.c
+ OX002-SW-98010-r0p0-00bet1/vc/api/
+7ca94b1c33ac0211e17d38baadd7d1dd OX002-SW-98010-r0p0-00bet1/vc/api/armVC.h
+12cf7596edbbf6048b626d15e8d0ed48 OX002-SW-98010-r0p0-00bet1/vc/api/omxVC.h
+11726e286a81257cb45f5547fb4d374c OX002-SW-98010-r0p0-00bet1/vc/api/omxVC_s.h
+a5b2af605c319cd2491319e430741377 OX002-SW-98010-r0p0-00bet1/vc/api/armVCCOMM_s.h
+ OX002-SW-98010-r0p0-00bet1/vc/comm/
+ OX002-SW-98010-r0p0-00bet1/vc/comm/src/
+1f81187b48487a8ea6dbc327648e3e4f OX002-SW-98010-r0p0-00bet1/vc/comm/src/omxVCCOMM_Copy16x16_s.s
+936d3f2038a6f8613ec25e50cc601fe8 OX002-SW-98010-r0p0-00bet1/vc/comm/src/omxVCCOMM_Copy8x8_s.s
+8f6708a249130962e0bc5c044ac6dd93 OX002-SW-98010-r0p0-00bet1/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
+aab7713414428e95de0ba799a2679b36 ARM_DELIVERY_97414.TXT
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
new file mode 100755
index 0000000..64c1958
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM.h
@@ -0,0 +1,785 @@
+/**
+ *
+ * File Name: armCOMM.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armCOMM.h
+ * Brief: Declares Common APIs/Data Types used across OpenMAX API's
+ *
+ */
+
+
+#ifndef _armCommon_H_
+#define _armCommon_H_
+
+#include <stdint.h>
+#include "omxtypes.h"
+
+/** Complex number whose real and imaginary parts are both OMX_F32. */
+typedef struct
+{
+ OMX_F32 Re; /** Real part */
+ OMX_F32 Im; /** Imaginary part */
+
+} OMX_FC32; /** single precision floating point complex number */
+
+/** Complex number whose real and imaginary parts are both OMX_F64. */
+typedef struct
+{
+ OMX_F64 Re; /** Real part */
+ OMX_F64 Im; /** Imaginary part */
+
+} OMX_FC64; /** double precision floating point complex number */
+
+
+/* Used by both IP and IC domains for 8x8 JPEG blocks.
+ * The array holds 64 OMX_S16 elements, i.e. one 8 x 8 block. */
+typedef OMX_S16 ARM_BLOCK8x8[64];
+
+
+#include "armOMX.h"
+
+/* Pi as an OMX_F64. The whole expansion is parenthesized so that it stays a
+ * single operand wherever the macro is used (e.g. after sizeof). */
+#define armPI ((OMX_F64)(3.1415926535897932384626433832795))
+
+/***********************************************************************/
+
+/* Compiler extensions */
+#ifdef ARM_DEBUG
+/* debug version */
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#define armError(str) {printf((str)); printf("\n"); exit(-1);}
+#define armWarn(str) {printf((str)); printf("\n");}
+#define armIgnore(a) ((void)a)
+#define armAssert(a) assert(a)
+#else
+/* release version */
+#define armError(str) ((void) (str))
+#define armWarn(str) ((void) (str))
+#define armIgnore(a) ((void) (a))
+#define armAssert(a) ((void) (a))
+#endif /* ARM_DEBUG */
+
+/* Arithmetic operations */
+
+/* NOTE: these are function-like macros and each argument appears more than
+ * once in the expansion, so an argument with side effects (e.g. armMin(i++, j))
+ * may be evaluated twice. Pass side-effect-free expressions only. */
+#define armMin(a,b) ( (a) > (b) ? (b):(a) )
+#define armMax(a,b) ( (a) > (b) ? (a):(b) )
+#define armAbs(a) ( (a) < 0 ? -(a):(a) )
+
+/* Alignment operation */
+
+/* armAlignToBytes(Ptr, N): advance Ptr to the next N-byte boundary.
+ * N must be a power of two, because the distance is computed with the mask
+ * (N - 1). The byte distance is divided by sizeof(*Ptr) since the result is
+ * formed by pointer arithmetic on Ptr's own element type.
+ * The address is converted through uintptr_t so that no pointer bits are lost
+ * on targets where pointers are wider than int, and both arguments are
+ * parenthesized so that expression arguments expand correctly. */
+#define armAlignToBytes(Ptr,N) ((Ptr) + ((((uintptr_t)(N) - (uintptr_t)(Ptr)) & ((uintptr_t)(N) - 1)) / sizeof(*(Ptr))))
+#define armAlignTo2Bytes(Ptr) armAlignToBytes(Ptr,2)
+#define armAlignTo4Bytes(Ptr) armAlignToBytes(Ptr,4)
+#define armAlignTo8Bytes(Ptr) armAlignToBytes(Ptr,8)
+#define armAlignTo16Bytes(Ptr) armAlignToBytes(Ptr,16)
+
+/* Error and Alignment check */
+
+/* Return `code` from the enclosing function when `condition` is true.
+ * Both macros have the same expansion.
+ * NOTE: the expansion is a bare if-statement, so an `else` written directly
+ * after an invocation would attach to this hidden `if`. */
+#define armRetArgErrIf(condition, code) if(condition) { return (code); }
+#define armRetDataErrIf(condition, code) if(condition) { return (code); }
+
+/* armIsByteAligned / armNotByteAligned: test whether the address held in Ptr
+ * is a multiple of N. The address is converted through uintptr_t so that no
+ * pointer bits are lost where pointers are wider than int, and N is
+ * parenthesized so that an expression argument is applied to % as a whole.
+ * When ALIGNMENT_DOESNT_MATTER is defined the checks are compiled out and
+ * every pointer is reported as aligned. */
+#ifndef ALIGNMENT_DOESNT_MATTER
+#define armIsByteAligned(Ptr,N) ((((uintptr_t)(Ptr)) % (N))==0)
+#define armNotByteAligned(Ptr,N) ((((uintptr_t)(Ptr)) % (N))!=0)
+#else
+#define armIsByteAligned(Ptr,N) (1)
+#define armNotByteAligned(Ptr,N) (0)
+#endif
+
+/* Fixed-width shorthands for armIsByteAligned / armNotByteAligned.
+ * Note that a 32-byte variant exists only for the negative test. */
+#define armIs2ByteAligned(Ptr) armIsByteAligned(Ptr,2)
+#define armIs4ByteAligned(Ptr) armIsByteAligned(Ptr,4)
+#define armIs8ByteAligned(Ptr) armIsByteAligned(Ptr,8)
+#define armIs16ByteAligned(Ptr) armIsByteAligned(Ptr,16)
+
+#define armNot2ByteAligned(Ptr) armNotByteAligned(Ptr,2)
+#define armNot4ByteAligned(Ptr) armNotByteAligned(Ptr,4)
+#define armNot8ByteAligned(Ptr) armNotByteAligned(Ptr,8)
+#define armNot16ByteAligned(Ptr) armNotByteAligned(Ptr,16)
+#define armNot32ByteAligned(Ptr) armNotByteAligned(Ptr,32)
+
+/**
+ * Function: armRoundFloatToS16/armRoundFloatToS32/armRoundFloatToS64
+ *
+ * Description:
+ * Converts a double precision value into an OMX_S16/OMX_S32/OMX_S64 after rounding
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32/OMX_S64 format
+ *
+ */
+
+OMX_S16 armRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armRoundFloatToS32 (OMX_F64 Value);
+OMX_S64 armRoundFloatToS64 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToS16/armSatRoundFloatToS32
+ *
+ * Description:
+ * Converts a double precision value into an OMX_S16/OMX_S32 after rounding and saturation
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_S16/OMX_S32 format
+ *
+ */
+
+OMX_S16 armSatRoundFloatToS16 (OMX_F64 Value);
+OMX_S32 armSatRoundFloatToS32 (OMX_F64 Value);
+
+/**
+ * Function: armSatRoundFloatToU16/armSatRoundFloatToU32
+ *
+ * Description:
+ * Converts a double precision value into an OMX_U16/OMX_U32 after rounding and saturation
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U16/OMX_U32 format
+ *
+ */
+
+OMX_U16 armSatRoundFloatToU16 (OMX_F64 Value);
+OMX_U32 armSatRoundFloatToU32 (OMX_F64 Value);
+
+/**
+ * Function: armSignCheck
+ *
+ * Description:
+ * Checks the sign of a variable and reports it as 1, 0 or -1
+ * (see Return Value).
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] var Variable to be checked
+ *
+ * Return Value:
+ * OMX_INT -- returns 1 if it is Positive
+ * returns 0 if it is 0
+ * returns -1 if it is Negative
+ */
+
+OMX_INT armSignCheck (OMX_S16 var);
+
+/**
+ * Function: armClip
+ *
+ * Description: Clips the input between the min and max values
+ *
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] min lower bound
+ * [in] max upper bound
+ * [in] src variable to be clipped
+ *
+ * Return Value:
+ * OMX_S32 -- returns clipped value
+ */
+
+OMX_S32 armClip (
+ OMX_INT min,
+ OMX_INT max,
+ OMX_S32 src
+ );
+
+/**
+ * Function: armClip_F32
+ *
+ * Description: Clips the input between the min and max values
+ *
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] min lower bound
+ * [in] max upper bound
+ * [in] src variable to be clipped
+ *
+ * Return Value:
+ * OMX_F32 -- returns clipped value
+ */
+
+OMX_F32 armClip_F32 (
+ OMX_F32 min,
+ OMX_F32 max,
+ OMX_F32 src
+ );
+
+/**
+ * Function: armShiftSat_F32
+ *
+ * Description: Divides a float value by 2^shift and
+ * saturates it to the unsigned value range given by satBits.
+ * The second parameter acts like "shifting" the corresponding
+ * integer value. Takes care of rounding while clipping the final
+ * value.
+ *
+ * Parameters:
+ * [in] v Number to be operated upon
+ * [in] shift Divides the input "v" by "2^shift"
+ * [in] satBits Final range is [0, 2^satBits)
+ *
+ * Return Value:
+ * OMX_U32 -- returns "shifted" saturated value
+ */
+
+OMX_U32 armShiftSat_F32(
+ OMX_F32 v,
+ OMX_INT shift,
+ OMX_INT satBits
+ );
+
+/**
+ * Functions: armSwapElem
+ *
+ * Description:
+ * This function swaps two elements at the specified pointer locations.
+ * The size of each element could be anything as specified by <elemSize>
+ *
+ * Parameters:
+ * [in/out] pBuf1 Pointer to the first element
+ * [in/out] pBuf2 Pointer to the second element
+ * [in] elemSize Size of each element (presumably in bytes, since the
+ * buffers are OMX_U8 pointers -- TODO confirm)
+ *
+ * Return Value:
+ * OMXResult -- Error status from the function
+ */
+OMXResult armSwapElem(OMX_U8 *pBuf1, OMX_U8 *pBuf2, OMX_INT elemSize);
+
+
+/**
+ * Function: armMedianOf3
+ *
+ * Description: Finds the median of three numbers
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] fEntry First entry
+ * [in] sEntry Second entry
+ * [in] tEntry Third entry
+ *
+ * Return Value:
+ * OMX_S32 -- returns the median value
+ */
+
+OMX_S32 armMedianOf3 (
+ OMX_S32 fEntry,
+ OMX_S32 sEntry,
+ OMX_S32 tEntry
+ );
+
+/**
+ * Function: armLogSize
+ *
+ * Description: Finds the size of a positive value and returns the same
+ *
+ * Remarks:
+ * NOTE(review): "size" is not defined in this header -- presumably the
+ * number of significant bits of the value; confirm against the
+ * implementation.
+ *
+ * Parameters:
+ * [in] value Positive value
+ *
+ * Return Value:
+ * OMX_U8 -- returns the size of the positive value
+ */
+
+OMX_U8 armLogSize (
+ OMX_U16 value
+ );
+
+/***********************************************************************/
+ /* Saturating Arithmetic operations */
+
+/**
+ * Function :armSatAdd_S32()
+ *
+ * Description :
+ * Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ *
+ **/
+
+OMX_S32 armSatAdd_S32(
+ OMX_S32 Value1,
+ OMX_S32 Value2
+ );
+
+/**
+ * Function :armSatAdd_S64()
+ *
+ * Description :
+ * Returns the result of saturated addition of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ *
+ **/
+
+OMX_S64 armSatAdd_S64(
+ OMX_S64 Value1,
+ OMX_S64 Value2
+ );
+
+/** Function :armSatSub_S32()
+ *
+ * Description :
+ * Returns the result of saturated subtraction of the two inputs Value1, Value2
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatSub_S32(
+ OMX_S32 Value1,
+ OMX_S32 Value2
+ );
+
+/**
+ * Function :armSatMac_S32()
+ *
+ * Description :
+ * Returns the result of Multiplication of Value1 and Value2 and subsequent saturated
+ * accumulation with Mac
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ * [in] Mac Accumulator
+ *
+ * Return:
+ * [out] Result of operation
+ **/
+
+OMX_S32 armSatMac_S32(
+ OMX_S32 Mac,
+ OMX_S16 Value1,
+ OMX_S16 Value2
+ );
+
+/**
+ * Function :armSatMac_S16S32_S32
+ *
+ * Description :
+ * Returns the result of saturated MAC operation of the three inputs delayElem, filTap , mac
+ *
+ * mac = mac + Saturate_in_32Bits(delayElem * filTap)
+ *
+ * Parameters:
+ * [in] delayElem First 32 bit Operand
+ * [in] filTap Second 16 bit Operand
+ * [in] mac Result of MAC operation
+ *
+ * Return:
+ * [out] mac Result of operation
+ *
+ **/
+
+OMX_S32 armSatMac_S16S32_S32(
+ OMX_S32 mac,
+ OMX_S32 delayElem,
+ OMX_S16 filTap );
+
+/**
+ * Function :armSatRoundRightShift_S32_S16
+ *
+ * Description :
+ * Returns the result of rounded right shift operation of input by the scalefactor
+ *
+ * output = Saturate_in_16Bits( RightShift( Round(input), scaleFactor ) )
+ *
+ * Parameters:
+ * [in] input The input to be operated on
+ * [in] scaleFactor The shift number
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+
+OMX_S16 armSatRoundRightShift_S32_S16(
+ OMX_S32 input,
+ OMX_INT scaleFactor);
+
+/**
+ * Function :armSatRoundLeftShift_S32()
+ *
+ * Description :
+ * Returns the result of saturating left-shift operation on input
+ * Or rounded Right shift if the input Shift is negative.
+ *
+ * Parameters:
+ * [in] Value Operand
+ * [in] shift Operand for shift operation
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatRoundLeftShift_S32(
+ OMX_S32 Value,
+ OMX_INT shift
+ );
+
+/**
+ * Function :armSatRoundLeftShift_S64()
+ *
+ * Description :
+ * Returns the result of saturating left-shift operation on input
+ * Or rounded Right shift if the input Shift is negative.
+ *
+ * Parameters:
+ * [in] Value Operand
+ * [in] shift Operand for shift operation
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S64 armSatRoundLeftShift_S64(
+ OMX_S64 Value,
+ OMX_INT shift
+ );
+
+/**
+ * Function :armSatMulS16S32_S32()
+ *
+ * Description :
+ * Returns the result of a S16 data type multiplied with an S32 data type
+ * in a S32 container
+ *
+ * Parameters:
+ * [in] input1 Operand 1
+ * [in] input2 Operand 2
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+
+OMX_S32 armSatMulS16S32_S32(
+ OMX_S16 input1,
+ OMX_S32 input2);
+
+/**
+ * Function :armSatMulS32S32_S32()
+ *
+ * Description :
+ * Returns the result of a S32 data type multiplied with an S32 data type
+ * in a S32 container
+ *
+ * Parameters:
+ * [in] input1 Operand 1
+ * [in] input2 Operand 2
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatMulS32S32_S32(
+ OMX_S32 input1,
+ OMX_S32 input2);
+
+
+/**
+ * Function :armIntDivAwayFromZero()
+ *
+ * Description : Integer division with rounding to the nearest integer.
+ * Half-integer values are rounded away from zero
+ * unless otherwise specified. For example 3//2 is rounded
+ * to 2, and -3//2 is rounded to -2.
+ *
+ * Parameters:
+ * [in] Num Operand 1
+ * [in] Deno Operand 2
+ *
+ * Return:
+ * [out] Result of operation Num//Deno
+ *
+ **/
+
+OMX_S32 armIntDivAwayFromZero (OMX_S32 Num, OMX_S32 Deno);
+
+
+/***********************************************************************/
+/*
+ * Debugging macros
+ *
+ */
+
+
+/*
+ * Definition of output stream - change to stderr if necessary
+ */
+#define DEBUG_STREAM stdout
+
+/*
+ * Debug printf macros, one for each argument count.
+ * Add more if needed.
+ */
+#ifdef DEBUG_ON
+#include <stdio.h>
+
+#define DEBUG_PRINTF_0(a) fprintf(DEBUG_STREAM, a)
+#define DEBUG_PRINTF_1(a, b) fprintf(DEBUG_STREAM, a, b)
+#define DEBUG_PRINTF_2(a, b, c) fprintf(DEBUG_STREAM, a, b, c)
+#define DEBUG_PRINTF_3(a, b, c, d) fprintf(DEBUG_STREAM, a, b, c, d)
+#define DEBUG_PRINTF_4(a, b, c, d, e) fprintf(DEBUG_STREAM, a, b, c, d, e)
+#define DEBUG_PRINTF_5(a, b, c, d, e, f) fprintf(DEBUG_STREAM, a, b, c, d, e, f)
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g)
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h)
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i)
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j)
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k)
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l)
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m)
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n)
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) fprintf(DEBUG_STREAM, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)
+#else /* DEBUG_ON */
+#define DEBUG_PRINTF_0(a)
+#define DEBUG_PRINTF_1(a, b)
+#define DEBUG_PRINTF_2(a, b, c)
+#define DEBUG_PRINTF_3(a, b, c, d)
+#define DEBUG_PRINTF_4(a, b, c, d, e)
+#define DEBUG_PRINTF_5(a, b, c, d, e, f)
+#define DEBUG_PRINTF_6(a, b, c, d, e, f, g)
+#define DEBUG_PRINTF_7(a, b, c, d, e, f, g, h)
+#define DEBUG_PRINTF_8(a, b, c, d, e, f, g, h, i)
+#define DEBUG_PRINTF_9(a, b, c, d, e, f, g, h, i, j)
+#define DEBUG_PRINTF_10(a, b, c, d, e, f, g, h, i, j, k)
+#define DEBUG_PRINTF_11(a, b, c, d, e, f, g, h, i, j, k, l)
+#define DEBUG_PRINTF_12(a, b, c, d, e, f, g, h, i, j, k, l, m)
+#define DEBUG_PRINTF_13(a, b, c, d, e, f, g, h, i, j, k, l, m, n)
+#define DEBUG_PRINTF_14(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o)
+#endif /* DEBUG_ON */
+
+
+/*
+ * Domain and sub domain definitions
+ *
+ * In order to turn on debug for an entire domain or sub-domain
+ * at compile time, one of the DEBUG_DOMAIN_* below may be defined,
+ * which will activate debug in all of the defines it contains.
+ */
+
+#ifdef DEBUG_DOMAIN_AC
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT_MPEG4
+#define DEBUG_OMXACAAC_DECODECHANPAIRELT
+#define DEBUG_OMXACAAC_DECODEDATSTRELT
+#define DEBUG_OMXACAAC_DECODEFILLELT
+#define DEBUG_OMXACAAC_DECODEISSTEREO_S32
+#define DEBUG_OMXACAAC_DECODEMSPNS_S32
+#define DEBUG_OMXACAAC_DECODEMSSTEREO_S32_I
+#define DEBUG_OMXACAAC_DECODEPRGCFGELT
+#define DEBUG_OMXACAAC_DECODETNS_S32_I
+#define DEBUG_OMXACAAC_DEINTERLEAVESPECTRUM_S32
+#define DEBUG_OMXACAAC_ENCODETNS_S32_I
+#define DEBUG_OMXACAAC_LONGTERMPREDICT_S32
+#define DEBUG_OMXACAAC_LONGTERMRECONSTRUCT_S32
+#define DEBUG_OMXACAAC_MDCTFWD_S32
+#define DEBUG_OMXACAAC_MDCTINV_S32_S16
+#define DEBUG_OMXACAAC_NOISELESSDECODE
+#define DEBUG_OMXACAAC_QUANTINV_S32_I
+#define DEBUG_OMXACAAC_UNPACKADIFHEADER
+#define DEBUG_OMXACAAC_UNPACKADTSFRAMEHEADER
+#define DEBUG_OMXACMP3_HUFFMANDECODESFBMBP_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODESFB_S32
+#define DEBUG_OMXACMP3_HUFFMANDECODE_S32
+#define DEBUG_OMXACMP3_MDCTINV_S32
+#define DEBUG_OMXACMP3_REQUANTIZESFB_S32_I
+#define DEBUG_OMXACMP3_REQUANTIZE_S32_I
+#define DEBUG_OMXACMP3_SYNTHPQMF_S32_S16
+#define DEBUG_OMXACMP3_UNPACKFRAMEHEADER
+#define DEBUG_OMXACMP3_UNPACKSCALEFACTORS_S8
+#define DEBUG_OMXACMP3_UNPACKSIDEINFO
+#endif /* DEBUG_DOMAIN_AC */
+
+
+#ifdef DEBUG_DOMAIN_VC
+#define DEBUG_OMXVCM4P10_AVERAGE_16X
+#define DEBUG_OMXVCM4P10_AVERAGE_4X
+#define DEBUG_OMXVCM4P10_AVERAGE_8X
+#define DEBUG_OMXVCM4P10_DEBLOCKCHROMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DEBLOCKLUMA_U8_C1IR
+#define DEBUG_OMXVCM4P10_DECODECHROMADCCOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DECODECOEFFSTOPAIRCAVLC_U8
+#define DEBUG_OMXVCM4P10_DEQUANTTRANSFORMACFROMPAIR_U8_S16_C1_DLX
+#define DEBUG_OMXVCM4P10_EXPANDFRAME
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGCHROMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_HOREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_FILTERDEBLOCKINGLUMA_VEREDGE_U8_C1IR
+#define DEBUG_OMXVCM4P10_PREDICTINTRACHROMA8X8_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_16X16_U8_C1R
+#define DEBUG_OMXVCM4P10_PREDICTINTRA_4X4_U8_C1R
+#define DEBUG_OMXVCM4P10_SADQUAR_16X
+#define DEBUG_OMXVCM4P10_SADQUAR_4X
+#define DEBUG_OMXVCM4P10_SADQUAR_8X
+#define DEBUG_OMXVCM4P10_SAD_16X
+#define DEBUG_OMXVCM4P10_SAD_4X
+#define DEBUG_OMXVCM4P10_SAD_8X
+#define DEBUG_OMXVCM4P10_SATD_4X4
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTCHROMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMDEQUANTLUMADCFROMPAIR_U8_S16_C1
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_CHROMADC
+#define DEBUG_OMXVCM4P10_TRANSFORMQUANT_LUMADC
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_HALF_8X8
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_16X16
+#define DEBUG_OMXVCM4P2_BLOCKMATCH_INTEGER_8X8
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_SAD_U8_S16
+#define DEBUG_OMXVCM4P2_COMPUTETEXTUREERRORBLOCK_U8_S16
+#define DEBUG_OMXVCM4P2_DCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_DECODEBLOCKCOEF_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEPADMV_PVOP
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_DECODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEMV_U8_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTER_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRAACVLC_S16
+#define DEBUG_OMXVCM4P2_ENCODEVLCZIGZAG_INTRADCVLC_S16
+#define DEBUG_OMXVCM4P2_FINDMVPRED
+#define DEBUG_OMXVCM4P2_IDCT8X8BLKDLX
+#define DEBUG_OMXVCM4P2_LIMITMVTORECT
+#define DEBUG_OMXVCM4P2_MOTIONESTIMATIONMB
+#define DEBUG_OMXVCM4P2_PADMBGRAY_U8
+#define DEBUG_OMXVCM4P2_PADMBHORIZONTAL_U8
+#define DEBUG_OMXVCM4P2_PADMBVERTICAL_U8
+#define DEBUG_OMXVCM4P2_PADMV
+#define DEBUG_OMXVCM4P2_QUANTINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINTRA_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTER_S16_I
+#define DEBUG_OMXVCM4P2_QUANTINVINTRA_S16_I
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTER
+#define DEBUG_OMXVCM4P2_TRANSRECBLOCKCEOF_INTRA
+#endif /* DEBUG_DOMAIN_VC */
+
+
+#ifdef DEBUG_DOMAIN_IC
+/* To be filled in */
+#endif /* DEBUG_DOMAIN_IC */
+
+
+#ifdef DEBUG_DOMAIN_SP
+#define DEBUG_OMXACSP_DOTPROD_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S16
+#define DEBUG_OMXACSP_BLOCKEXP_S32
+#define DEBUG_OMXACSP_COPY_S16
+#define DEBUG_OMXACSP_DOTPROD_S16
+#define DEBUG_OMXACSP_DOTPROD_S16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTFWD_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S16S32_SFS
+#define DEBUG_OMXACSP_FFTFWD_RTOCCS_S32_SFS
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC16
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_C_SC32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S16_S32
+#define DEBUG_OMXACSP_FFTGETBUFSIZE_R_S32
+#define DEBUG_OMXACSP_FFTINIT_C_SC16
+#define DEBUG_OMXACSP_FFTINIT_C_SC32
+#define DEBUG_OMXACSP_FFTINIT_R_S16_S32
+#define DEBUG_OMXACSP_FFTINIT_R_S32
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32S16_SFS
+#define DEBUG_OMXACSP_FFTINV_CCSTOR_S32_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC16_SFS
+#define DEBUG_OMXACSP_FFTINV_CTOC_SC32_SFS
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32_I
+#define DEBUG_OMXACSP_FILTERMEDIAN_S32
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_FIRONE_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_ISFS
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_FIR_DIRECT_S16
+#define DEBUG_OMXACSP_FIR_DIRECT_S16_SFS
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIRONE_DIRECT_S16
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_BIQUADDIRECT_S16
+#define DEBUG_OMXACSP_IIR_DIRECT_S16_I
+#define DEBUG_OMXACSP_IIR_DIRECT_S16
+#endif /* DEBUG_DOMAIN_SP */
+
+
+#ifdef DEBUG_DOMAIN_IP
+#define DEBUG_OMXIPBM_ADDC_U8_C1R_SFS
+#define DEBUG_OMXIPBM_COPY_U8_C1R
+#define DEBUG_OMXIPBM_COPY_U8_C3R
+#define DEBUG_OMXIPBM_MIRROR_U8_C1R
+#define DEBUG_OMXIPBM_MULC_U8_C1R_SFS
+#define DEBUG_OMXIPCS_COLORTWISTQ14_U8_C3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR420LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR422LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGB565TOYCBCR444LS_MCU_U16_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR420LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR422LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_RGBTOYCBCR444LS_MCU_U8_S16_C3P3R
+#define DEBUG_OMXIPCS_YCBCR420RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR420TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422RSZCSCROTRGB_U8_C2R
+#define DEBUG_OMXIPCS_YCBCR422RSZROT_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB565_U8_U16_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCR422TORGB_U8_C2C3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_C2P3R
+#define DEBUG_OMXIPCS_YCBCR422TOYCBCR420ROTATE_U8_P3R
+#define DEBUG_OMXIPCS_YCBCR444TORGB565LS_MCU_S16_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCR444TORGBLS_MCU_S16_U8_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB565_U8_U16_P3C3R
+#define DEBUG_OMXIPCS_YCBCRTORGB_U8_C3R
+#define DEBUG_OMXIPPP_GETCENTRALMOMENT_S64
+#define DEBUG_OMXIPPP_GETSPATIALMOMENT_S64
+#define DEBUG_OMXIPPP_MOMENTGETSTATESIZE_S64
+#define DEBUG_OMXIPPP_MOMENTINIT_S64
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C1R
+#define DEBUG_OMXIPPP_MOMENTS64S_U8_C3R
+#endif /* DEBUG_DOMAIN_IP */
+
+
+#endif /* _armCommon_H_ */
+
+/*End of File*/
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
new file mode 100755
index 0000000..c738f72
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_BitDec_s.h
@@ -0,0 +1,670 @@
+;//
+;//
+;// File Name: armCOMM_BitDec_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// OpenMAX optimized bitstream decode module
+;//
+;// You must include armCOMM_s.h before including this file
+;//
+;// This module provides macros to perform assembly optimized fixed and
+;// variable length decoding from a read-only bitstream. The variable
+;// length decode modules take as input a pointer to a table of 16-bit
+;// entries of the following format.
+;//
+;// VLD Table Entry format
+;//
+;// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+;// +------------------------------------------------+
+;// | Len | Symbol | 1 |
+;// +------------------------------------------------+
+;// | Offset | 0 |
+;// +------------------------------------------------+
+;//
+;// If the table entry is a leaf entry then bit 0 set:
+;// Len = Number of bits overread (0 to 7)
+;// Symbol = Symbol payload (unsigned 12 bits)
+;//
+;// If the table entry is an internal node then bit 0 is clear:
+;// Offset = Number of (16-bit) half words from the table
+;// start to the next table node
+;//
+;// The table is accessed by successive lookups on the
+;// next Step bits of the input bitstream until a leaf node
+;// is obtained. The Step sizes are supplied to the VLD macro.
+;//
+;// USAGE:
+;//
+;// To use any of the macros in this package, first call, in this order:
+;// M_BD_INIT0 ppBitStream, pBitOffset, pBitStream, RBitBuffer, RBitCount
+;// M_BD_INIT1 T1, T2, T3
+;// M_BD_INIT2 T1, T2, T3
+;// This caches the current bitstream position and next available
+;// bits in registers pBitStream, RBitBuffer, RBitCount. These registers
+;// are reserved for use by the bitstream decode package until you
+;// call M_BD_FINI.
+;//
+;// Next call the following macro(s) as many times as you need:
+;//
+;// M_BD_LOOK8 - Look ahead constant 1<=N<=8 bits into the bitstream
+;// M_BD_LOOK16 - Look ahead constant 1<=N<=16 bits into the bitstream
+;// M_BD_READ8 - Read constant 1<=N<=8 bits from the bitstream
+;// M_BD_READ16 - Read constant 1<=N<=16 bits from the bitstream
+;// M_BD_VREAD8 - Read variable 1<=N<=8 bits from the bitstream
+;// M_BD_VREAD16 - Read variable 1<=N<=16 bits from the bitstream
+;// M_BD_VLD - Perform variable length decode using lookup table
+;//
+;// Finally call the macro:
+;//
+;// M_BD_FINI ppBitStream, pBitOffset
+;//
+;// This writes the bitstream state back to memory.
+;//
+;// The three bitstream cache register names are assigned to the following global
+;// variables:
+;//
+
+ GBLS pBitStream ;// Register name for pBitStream
+ GBLS BitBuffer ;// Register name for BitBuffer
+ GBLS BitCount ;// Register name for BitCount
+
+;//
+;// These register variables must have a certain defined state on entry to every bitstream
+;// macro (except M_BD_INIT) and on exit from every bitstream macro (except M_BD_FINI).
+;// The state may depend on implementation.
+;//
+;// For the default (ARM11) implementation the following hold:
+;// pBitStream - points to the first byte not held in the BitBuffer
+;// BitBuffer - is a cache of (4 bytes) 32 bits, bit 31 the first bit
+;// BitCount - is offset (from the top bit) to the next unused bitstream bit
+;// 0<=BitCount<=15 (so BitBuffer holds at least 17 unused bits)
+;//
+;//
+
+ ;// Bitstream Decode initialise
+ ;//
+ ;// Initialises the bitstream decode global registers from
+ ;// bitstream pointers. This macro is split into 3 parts to enable
+ ;// scheduling.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $ppBitStream - pointer to pointer to the next bitstream byte
+ ;// $pBitOffset - pointer to the number of bits used in the current byte (0..7)
+ ;// $RBitStream - register to use for pBitStream (can be $ppBitStream)
+ ;// $RBitBuffer - register to use for BitBuffer
+ ;// $RBitCount - register to use for BitCount (can be $pBitOffset)
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1,$T2,$T3 - registers that must be preserved between calls to
+ ;// M_BD_INIT1 and M_BD_INIT2
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_INIT0 $ppBitStream, $pBitOffset, $RBitStream, $RBitBuffer, $RBitCount
+
+pBitStream SETS "$RBitStream"
+BitBuffer SETS "$RBitBuffer"
+BitCount SETS "$RBitCount"
+
+ ;// load inputs: the byte pointer from *ppBitStream and the bit offset from *pBitOffset
+ LDR $pBitStream, [$ppBitStream] ;// pBitStream register = current bitstream byte pointer
+ LDR $BitCount, [$pBitOffset] ;// BitCount register = bits already used in that byte
+ MEND
+
+ MACRO
+ M_BD_INIT1 $T1, $T2, $T3
+ LDRB $T2, [$pBitStream, #2] ;// T2 = third byte of the stream
+ LDRB $T1, [$pBitStream, #1] ;// T1 = second byte of the stream
+ LDRB $BitBuffer, [$pBitStream], #3 ;// BitBuffer = first byte; post-increment pointer past the 3 bytes read
+ ADD $BitCount, $BitCount, #8 ;// bias by 8: M_BD_INIT2 packs only 3 bytes into bits 23..0 (T3 is not used here)
+ MEND
+
+ MACRO
+ M_BD_INIT2 $T1, $T2, $T3
+ ORR $T2, $T2, $T1, LSL #8 ;// T2 = (T1 << 8) | T2, i.e. second and third bytes as left by M_BD_INIT1
+ ORR $BitBuffer, $T2, $BitBuffer, LSL #16 ;// BitBuffer = (first byte << 16) | T2 (T3 is not used here)
+ MEND
+
+ ;//
+ ;// Look ahead fixed 1<=N<=8 bits without consuming any bits
+ ;// The next bits will be placed at bit 31..24 of destination register
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to look
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_LOOK8 $Symbol, $N
+ ASSERT ($N>=1):LAND:($N<=8)
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits; N is only range-checked, the result is not masked to N bits
+ MEND
+
+ ;//
+ ;// Look ahead fixed 1<=N<=16 bits without consuming any bits
+ ;// The next bits will be placed at bit 31..16 of destination register
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to look
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_LOOK16 $Symbol, $N, $T1
+ ASSERT ($N >= 1):LAND:($N <= 16)
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits; N is only range-checked and T1 is not touched
+ MEND
+
+ ;//
+ ;// Skips fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_SKIP8 $N, $T1
+ ASSERT ($N>=1):LAND:($N<=8)
+ SUBS $BitCount, $BitCount, #(8-$N) ;// BitCount += N-8; CS (no borrow) when BitCount >= 8-N
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: fetch one refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: no refill, add the 8 back so BitCount = old BitCount + N
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: shift the buffer up one byte and merge the refill byte
+ MEND
+
+
+ ;//
+ ;// Read fixed 1<=N<=8 bits from the bitstream, advancing the bitstream pointer
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_READ8 $Symbol, $N, $T1
+ ASSERT ($N>=1):LAND:($N<=8)
+ MOVS $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits
+ SUBS $BitCount, $BitCount, #(8-$N) ;// BitCount += N-8; CS (no borrow) when BitCount >= 8-N
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: fetch one refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: no refill, add the 8 back so BitCount = old BitCount + N
+ MOV $Symbol, $Symbol, LSR #(32-$N) ;// right-align the top N bits; MOV without S keeps the SUBS flags for ORRCS
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: shift the buffer up one byte and merge the refill byte
+ MEND
+
+ ;//
+ ;// Read fixed 1<=N<=16 bits from the bitstream, advancing the bitstream pointer
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_READ16 $Symbol, $N, $T1, $T2
+ ASSERT ($N>=1):LAND:($N<=16)
+ ASSERT $Symbol<>$T1
+ IF ($N<=8)
+ M_BD_READ8 $Symbol, $N, $T1
+ ELSE
+ ;// N>8 so we will be able to refill at least one byte (T2 is not used on either path)
+ LDRB $T1, [$pBitStream], #1 ;// first refill byte, loaded unconditionally
+ MOVS $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits (taken before the buffer is shifted)
+ ORR $BitBuffer, $T1, $BitBuffer, LSL #8 ;// merge the first refill byte
+ SUBS $BitCount, $BitCount, #(16-$N) ;// BitCount += N-16; CS (no borrow) allows a second refill
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: second refill byte
+ MOV $Symbol, $Symbol, LSR #(32-$N) ;// right-align the top N bits; flags unchanged
+ ADDCC $BitCount, $BitCount, #8 ;// CC: only one byte was refilled, so BitCount = old BitCount + N - 8
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the second refill byte
+ ENDIF
+ MEND
+
+ ;//
+ ;// Skip variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits. 1<=N<=8
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VSKIP8 $N, $T1
+ ADD $BitCount, $BitCount, $N ;// account for the N skipped bits
+ SUBS $BitCount, $BitCount, #8 ;// CS (no borrow) when BitCount >= 8, i.e. a refill byte fits
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: fetch one refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: no refill, undo the subtraction
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: shift the buffer up one byte and merge the refill byte
+ MEND
+
+ ;//
+ ;// Skip variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits. 1<=N<=16
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VSKIP16 $N, $T1, $T2
+ ADD $BitCount, $BitCount, $N ;// account for the N skipped bits (T2 is not used by this macro)
+ SUBS $BitCount, $BitCount, #8 ;// CS (no borrow): a first refill byte fits
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: first refill byte
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: test for a second one, updating the flags
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo whichever subtraction borrowed last
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the second refill byte
+ MEND
+
+ ;//
+ ;// Read variable 1<=N<=8 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read. 1<=N<=8
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VREAD8 $Symbol, $N, $T1, $T2
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits
+ ADD $BitCount, $BitCount, $N ;// account for the N bits read
+ SUBS $BitCount, $BitCount, #8 ;// CS (no borrow) when BitCount >= 8, i.e. a refill byte fits
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: fetch one refill byte
+ RSB $T2, $N, #32 ;// T2 = 32-N, the shift that right-aligns N bits
+ ADDCC $BitCount, $BitCount, #8 ;// CC: no refill, undo the subtraction
+ MOV $Symbol, $Symbol, LSR $T2 ;// right-align the top N bits; flags unchanged
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: shift the buffer up one byte and merge the refill byte
+ MEND
+
+
+ ;//
+ ;// Read variable 1<=N<=16 bits from the bitstream, advancing the bitstream pointer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $N - number of bits to read. 1<=N<=16
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the next N bits of the bitstream
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VREAD16 $Symbol, $N, $T1, $T2
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits
+ ADD $BitCount, $BitCount, $N ;// account for the N bits read
+ SUBS $BitCount, $BitCount, #8 ;// CS (no borrow): a first refill byte fits
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: first refill byte
+ RSB $T2, $N, #32 ;// T2 = 32-N, the shift that right-aligns N bits
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: test for a second one, updating the flags
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo whichever subtraction borrowed last
+ MOV $Symbol, $Symbol, LSR $T2 ;// right-align the top N bits; flags unchanged
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the second refill byte
+ MEND
+
+
+ ;//
+ ;// Decode a code of the form 0000...001 where there
+ ;// are N zeros before the 1 and N<=15 (code length<=16)
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the number of zeros before the next 1
+ ;// >=16 is an illegal code
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_CLZ16 $Symbol, $T1, $T2
+ MOVS $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits
+ CLZ $Symbol, $Symbol ;// Symbol = count of leading zero bits (T2 is not used by this macro)
+ ADD $BitCount, $BitCount, $Symbol ;// account for the zeros
+ SUBS $BitCount, $BitCount, #7 ;// length is Symbol+1, so +1-8 = -7; CS (no borrow): a first refill byte fits
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: first refill byte
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: test for a second one, updating the flags
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo whichever subtraction borrowed last
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the second refill byte
+ MEND
+
+ ;//
+ ;// Decode a code of the form 1111...110 where there
+ ;// are N ones before the 0 and N<=15 (code length<=16)
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - the number of ones before the next 0
+ ;// >=16 is an illegal code
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_CLO16 $Symbol, $T1, $T2
+ MOV $Symbol, $BitBuffer, LSL $BitCount ;// left-align the unread bits
+ MVN $Symbol, $Symbol ;// invert so that leading ones become leading zeros
+ CLZ $Symbol, $Symbol ;// Symbol = count of leading one bits of the stream (T2 is not used by this macro)
+ ADD $BitCount, $BitCount, $Symbol ;// account for the ones
+ SUBS $BitCount, $BitCount, #7 ;// length is Symbol+1, so +1-8 = -7; CS (no borrow): a first refill byte fits
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: first refill byte
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the first refill byte
+ SUBCSS $BitCount, $BitCount, #8 ;// only after a first refill: test for a second one, updating the flags
+ LDRCSB $T1, [$pBitStream], #1 ;// CS: second refill byte
+ ADDCC $BitCount, $BitCount, #8 ;// CC: undo whichever subtraction borrowed last
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// CS: merge the second refill byte
+ MEND
+
+
+ ;//
+ ;// Variable Length Decode module
+ ;//
+ ;// Decodes one VLD Symbol from a bitstream and refill the bitstream
+ ;// buffer.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pVLDTable - pointer to VLD decode table of 16-bit entries.
+ ;// The format is described above at the start of
+ ;// this file.
+ ;// $S0 - The number of bits to look up for the first step
+ ;// 1<=$S0<=8
+ ;// $S1 - The number of bits to look up for each subsequent
+ ;// step 1<=$S1<=$S0.
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $Symbol - decoded VLD symbol value
+ ;// $T1 - corrupted temp/scratch register
+ ;// $T2 - corrupted temp/scratch register
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_VLD $Symbol, $T1, $T2, $pVLDTable, $S0, $S1
+ ASSERT (1<=$S0):LAND:($S0<=8)
+ ASSERT (1<=$S1):LAND:($S1<=$S0)
+
+ ;// Note 0<=BitCount<=15 on entry and exit
+
+ MOVS $T1, $BitBuffer, LSL $BitCount ;// left align next bits
+ MOVS $Symbol, #(2<<$S0)-2 ;// create mask: bits S0..1 set, bit 0 clear
+ AND $Symbol, $Symbol, $T1, LSR #(31-$S0) ;// 2*(next $S0 bits)
+ SUBS $BitCount, $BitCount, #8 ;// CS if buffer can be filled
+01
+ LDRCSB $T1, [$pBitStream], #1 ;// load refill byte
+ LDRH $Symbol, [$pVLDTable, $Symbol] ;// load table entry
+ ADDCC $BitCount, $BitCount, #8 ;// refill not possible
+ ADD $BitCount, $BitCount, #$S0 ;// assume $S0 bits used
+ ORRCS $BitBuffer, $T1, $BitBuffer, LSL #8 ;// merge in refill byte
+ MOVS $T1, $Symbol, LSR #1 ;// CS=leaf entry
+ BCS %FT02 ;// leaf found: finish at 02 with T1 = entry>>1
+
+ MOVS $T1, $BitBuffer, LSL $BitCount ;// left align next bit
+ IF (2*$S0-$S1<=8)
+ ;// Can combine refill check and -S0+S1 and keep $BitCount<=15
+ SUBS $BitCount, $BitCount, #8+($S0-$S1)
+ ELSE
+ ;// Separate refill check and -S0+S1 offset
+ SUBS $BitCount, $BitCount, #8
+ SUB $BitCount, $BitCount, #($S0-$S1)
+ ENDIF
+ ADD $Symbol, $Symbol, $T1, LSR #(31-$S1) ;// add 2*(next $S1 bits) to
+ BIC $Symbol, $Symbol, #1 ;// table offset
+ B %BT01 ;// load next table entry
+02
+ ;// BitCount range now depends on the route here
+ ;// if (first step) S0 <= BitCount <= 7+S0 <=15
+ ;// else if (2*S0-S1<=8) S0 <= BitCount <= 7+(2*S0-S1) <=15
+ ;// else S1 <= BitCount <= 7+S1 <=15
+
+ SUB $BitCount, $BitCount, $Symbol, LSR#13 ;// give back the Len overread bits (entry bits 15..13)
+ BIC $Symbol, $T1, #0xF000 ;// T1 = entry>>1: clear bits 15..12 (Len) to leave the 12-bit symbol
+ MEND
+
+
+ ;// Add an offset number of bits
+ ;//
+ ;// Outputs destination byte and bit index values which corresponds to an offset number of bits
+ ;// from the current location. This is used to compare bitstream positions using M_BD_CMP.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $Offset - Offset to be added in bits.
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $ByteIndex - Destination pBitStream pointer after adding the Offset.
+ ;// This value will be 4 byte ahead and needs to subtract by 4 to get exact
+ ;// pointer (as in M_BD_FINI). But for using with M_BD_CMP subtract is not needed.
+ ;// $BitIndex - Destination BitCount after the addition of Offset number of bits
+ ;//
+ MACRO
+ M_BD_ADD $ByteIndex, $BitIndex, $Offset
+
+ ;// ($ByteIndex,$BitIndex) = Current position + $Offset bits
+ ADD $Offset, $Offset, $BitCount ;// total bit offset; note this overwrites the caller's Offset register
+ AND $BitIndex, $Offset, #7 ;// bit position within the destination byte
+ ADD $ByteIndex, $pBitStream, $Offset, ASR #3 ;// add whole bytes; ASR keeps the sign of a negative total
+ MEND
+
+ ;// Move bitstream pointers to the location given
+ ;//
+ ;// Outputs destination byte and bit index values which corresponds to
+ ;// the current location given (calculated using M_BD_ADD).
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;// $ByteIndex - Destination pBitStream pointer after move.
+ ;// This value will be 4 byte ahead and needs to subtract by 4 to get exact
+ ;// pointer (as in M_BD_FINI).
+ ;// $BitIndex - Destination BitCount after the move
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $pBitStream \
+ ;// } See description above.
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_MOV $ByteIndex, $BitIndex
+
+ ;// ($pBitStream, $BitCount) = ($ByteIndex,$BitIndex)
+ MOV $BitCount, $BitIndex ;// NOTE(review): BitBuffer is not reloaded by this macro
+ MOV $pBitStream, $ByteIndex
+ MEND
+
+ ;// Bitstream Compare
+ ;//
+ ;// Compares bitstream position with that of a destination position. Destination position
+ ;// is held in two input registers which are calculated using M_BD_ADD macro
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $ByteIndex - Destination pBitStream pointer, (4 byte ahead as described in M_BD_ADD)
+ ;// $BitIndex - Destination BitCount
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// FLAGS - GE if destination is reached, LT if destination is ahead
+ ;// $T1 - corrupted temp/scratch register
+ ;//
+ MACRO
+ M_BD_CMP $ByteIndex, $BitIndex, $T1
+
+ ;// Return flags set by (current position)-($ByteIndex,$BitIndex)
+ ;// so GE means that we have reached the indicated position
+
+ ADD $T1, $pBitStream, $BitCount, LSR #3 ;// T1 = current byte index (pointer + whole bytes in BitCount)
+ CMP $T1, $ByteIndex ;// compare byte indices first
+ AND $T1, $BitCount, #7 ;// T1 = current bit index within the byte; flags unchanged
+ CMPEQ $T1, $BitIndex ;// only when the byte indices are equal: compare bit indices
+ MEND
+
+
+ ;// Bitstream Decode finalise
+ ;//
+ ;// Writes back the bitstream state to the bitstream pointers
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $pBitStream \
+ ;// $BitBuffer } See description above.
+ ;// $BitCount /
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $ppBitStream - pointer to pointer to the next bitstream byte
+ ;// $pBitOffset - pointer to the number of bits used in the current byte (0..7)
+ ;// $pBitStream \
+ ;// $BitBuffer } these register are corrupted
+ ;// $BitCount /
+ ;//
+ MACRO
+ M_BD_FINI $ppBitStream, $pBitOffset
+
+ ;// Advance pointer by the whole bytes counted in BitCount (BitCount/8)
+ ADD $pBitStream, $pBitStream, $BitCount, LSR#3
+ AND $BitCount, $BitCount, #7 ;// keep only the bit offset within the byte (0..7)
+
+ ;// Now move back 32 bits to reach the first unused bit
+ SUB $pBitStream, $pBitStream, #4
+
+ ;// Store out bitstream state
+ STR $BitCount, [$pBitOffset] ;// *pBitOffset = bit offset
+ STR $pBitStream, [$ppBitStream] ;// *ppBitStream = byte pointer
+ MEND
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
new file mode 100755
index 0000000..b699034
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Bitstream.h
@@ -0,0 +1,212 @@
+/**
+ *
+ * File Name: armCOMM_Bitstream.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armCOMM_Bitstream.h
+ * Brief: Declares common API's/Data types used across the OpenMax Encoders/Decoders.
+ *
+ */
+
+#ifndef _armCodec_H_
+#define _armCodec_H_
+
+#include "omxtypes.h"
+
+typedef struct {
+ OMX_U8 codeLen; /* presumably the length of codeWord in bits -- confirm against armCOMM_Bitstream.c */
+ OMX_U32 codeWord; /* presumably the code word bits (at most 32, given OMX_U32) -- confirm */
+} ARM_VLC32;
+
+/* The above should be renamed as "ARM_VLC32" */
+
+/**
+ * Function: armLookAheadBits()
+ *
+ * Description:
+ * Get the next N bits from the bitstream without advancing the bitstream pointer
+ *
+ * Parameters:
+ * [in] **ppBitStream pointer to the pointer to the current bitstream byte (not advanced)
+ * [in] *pOffset bit position within the byte at *ppBitStream (presumably 0..7, as for armPackBits)
+ * [in] N=1...32 number of bits to look at
+ *
+ * Return Value: the next N bits of the bitstream, per the description above
+ */
+
+OMX_U32 armLookAheadBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armGetBits()
+ *
+ * Description:
+ * Read N bits from the bitstream
+ *
+ * Parameters:
+ * [in] **ppBitStream pointer to the pointer to the current bitstream byte
+ * [in] *pOffset bit position within the byte at *ppBitStream (presumably 0..7, as for armPackBits)
+ * [in] N=1..32 number of bits to read
+ *
+ * [out] *ppBitStream updated bitstream byte pointer
+ * [out] *pOffset updated bit position
+ * Return Value: the N bits read from the bitstream
+ */
+
+OMX_U32 armGetBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N);
+
+/**
+ * Function: armByteAlign()
+ *
+ * Description:
+ * Align the pointer *ppBitStream to the next byte boundary
+ *
+ * Parameters:
+ * [in] *ppBitStream current bitstream byte pointer
+ * [in] *pOffset current bit position within that byte
+ *
+ * [out] *ppBitStream updated bitstream byte pointer
+ * [out] *pOffset updated bit position
+ * Return Value: none (OMXVoid)
+ **/
+
+OMXVoid armByteAlign(const OMX_U8 **ppBitStream,OMX_INT *pOffset);
+
+/**
+ * Function: armSkipBits()
+ *
+ * Description:
+ * Skip N bits from the value at *ppBitStream
+ *
+ * Parameters:
+ * [in] *ppBitStream current bitstream byte pointer
+ * [in] *pOffset current bit position within that byte
+ * [in] N number of bits to skip
+ *
+ * [out] *ppBitStream updated bitstream byte pointer
+ * [out] *pOffset updated bit position
+ * Return Value: none (OMXVoid)
+ **/
+
+OMXVoid armSkipBits(const OMX_U8 **ppBitStream,OMX_INT *pOffset,OMX_INT N);
+
+/***************************************
+ * Variable bit length Decode
+ ***************************************/
+
+/**
+ * Function: armUnPackVLC32()
+ *
+ * Description:
+ * Variable length decode of variable length symbol (max size 32 bits) read from
+ * the bit stream pointed by *ppBitStream at *pOffset by using the table
+ * pointed by pCodeBook
+ *
+ * Parameters:
+ * [in] **ppBitStream pointer to the pointer to the current bitstream byte
+ * [in] *pOffset bit position within the byte at *ppBitStream
+ * [in] pCodeBook table of ARM_VLC32 entries to search
+ *
+ * [out] **ppBitStream updated bitstream byte pointer
+ * [out] *pOffset updated bit position
+ *
+ * Returns : Code Book Index if successful.
+ * : "ARM_NO_CODEBOOK_INDEX = 0xFFFF" if search fails.
+ **/
+
+#define ARM_NO_CODEBOOK_INDEX ((OMX_U16)0xFFFF) /* fully parenthesized so it expands safely in any expression */
+
+OMX_U16 armUnPackVLC32(
+ const OMX_U8 **ppBitStream,
+ OMX_INT *pOffset,
+ const ARM_VLC32 *pCodeBook
+);
+
+/***************************************
+ * Fixed bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackBits
+ *
+ * Description:
+ * Pack a code word of codeLength bits into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte
+ * in the bit stream.
+ * [in] pOffset pointer to the bit position in the byte
+ * pointed by *ppBitStream. Valid within 0
+ * to 7.
+ * [in] codeWord Code word that needs to be inserted into the
+ * bitstream
+ * [in] codeLength Length of the code word, valid range 1...32
+ *
+ * [out] ppBitStream *ppBitStream is updated after the block is encoded,
+ * so that it points to the current byte in the bit
+ * stream buffer.
+ * [out] pOffset *pOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armPackBits (
+ OMX_U8 **ppBitStream,
+ OMX_INT *pOffset,
+ OMX_U32 codeWord,
+ OMX_INT codeLength
+);
+
+/***************************************
+ * Variable bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackVLC32
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte
+ * in the bit stream.
+ * [in] pBitOffset pointer to the bit position in the byte
+ * pointed by *ppBitStream. Valid within 0
+ * to 7.
+ * [in] code VLC code word (ARM_VLC32, passed by value) that needs
+ * to be inserted into the bitstream
+ *
+ * [out] ppBitStream *ppBitStream is updated after the block is encoded,
+ * so that it points to the current byte in the bit
+ * stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMX_RESULT result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armPackVLC32 (
+ OMX_U8 **ppBitStream,
+ OMX_INT *pBitOffset,
+ ARM_VLC32 code
+);
+
+#endif /*_armCodec_H_*/
+
+/*End of File*/
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
new file mode 100755
index 0000000..e0cfdaa
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCTTable.h
@@ -0,0 +1,40 @@
+/**
+ *
+ *
+ * File Name: armCOMM_IDCTTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File : armCOMM_IDCTTable.h
+ * Description : Contains declarations of tables for IDCT calculation.
+ *
+ */
+
+#ifndef _armCOMM_IDCTTable_H_
+#define _armCOMM_IDCTTable_H_
+
+#include "omxtypes.h"
+
+ /* Table of s(u)*A(u)*A(v)/16 at Q15
+ * s(u)=1.0 0 <= u <= 5
+ * s(6)=2.0
+ * s(7)=4.0
+ * A(0) = 2*sqrt(2)
+ * A(u) = 4*cos(u*pi/16) for (u!=0)
+ */
+extern const OMX_U16 armCOMM_IDCTPreScale [64]; /* the table described above; 64 entries, presumably one per coefficient of an 8x8 block -- confirm */
+extern const OMX_U16 armCOMM_IDCTCoef [4]; /* imported by M_IDCT in armCOMM_IDCT_s.h; presumably the Q15 rotation/scale constants it names dCoefs -- confirm */
+
+#endif /* _armCOMM_IDCTTable_H_ */
+
+
+/* End of File */
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
new file mode 100755
index 0000000..0baa087
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_IDCT_s.h
@@ -0,0 +1,1451 @@
+;//
+;// This confidential and proprietary software may be used only as
+;// authorised by a licensing agreement from ARM Limited
+;// (C) COPYRIGHT 2004 ARM Limited
+;// ALL RIGHTS RESERVED
+;// The entire notice above must be reproduced on all authorised
+;// copies and copies may only be made to the extent permitted
+;// by a licensing agreement from ARM Limited.
+;//
+;// IDCT_s.s
+;//
+;// Inverse DCT module
+;//
+;//
+;// ALGORITHM DESCRIPTION
+;//
+;// The 8x8 2D IDCT is performed by calculating a 1D IDCT for each
+;// column and then a 1D IDCT for each row.
+;//
+;// The 8-point 1D IDCT is defined by
+;// f(x) = (C(0)*T(0)*c(0,x) + ... + C(7)*T(7)*c(7,x))/2
+;//
+;// C(u) = 1/sqrt(2) if u=0 or 1 if u!=0
+;// c(u,x) = cos( (2x+1)*u*pi/16 )
+;//
+;// We compute the 8-point 1D IDCT using the reverse of
+;// the Arai-Agui-Nakajima flow graph which we split into
+;// 5 stages named in reverse order to identify with the
+;// forward DCT. Direct inversion of the forward formulae
+;// in file FDCT_s.s gives:
+;//
+;// IStage 5: j(u) = T(u)*A(u) [ A(u)=4*C(u)*c(u,0) ]
+;// [ A(0) = 2*sqrt(2)
+;// A(u) = 4*cos(u*pi/16) for (u!=0) ]
+;//
+;// IStage 4: i0 = j0 i1 = j4
+;// i3 = (j2+j6)/2 i2 = (j2-j6)/2
+;// i7 = (j5+j3)/2 i4 = (j5-j3)/2
+;// i5 = (j1+j7)/2 i6 = (j1-j7)/2
+;//
+;// IStage 3: h0 = (i0+i1)/2 h1 = (i0-i1)/2
+;// h2 = (i2*sqrt2)-i3 h3 = i3
+;// h4 = cos(pi/8)*i4 + sin(pi/8)*i6
+;// h6 = -sin(pi/8)*i4 + cos(pi/8)*i6
+;// [ The above two lines rotate by -(pi/8) ]
+;// h5 = (i5-i7)/sqrt2 h7 = (i5+i7)/2
+;//
+;// IStage 2: g0 = (h0+h3)/2 g3 = (h0-h3)/2
+;// g1 = (h1+h2)/2 g2 = (h1-h2)/2
+;// g7 = h7 g6 = h6 - h7
+;// g5 = h5 - g6 g4 = h4 - g5
+;//
+;// IStage 1: f0 = (g0+g7)/2 f7 = (g0-g7)/2
+;// f1 = (g1+g6)/2 f6 = (g1-g6)/2
+;// f2 = (g2+g5)/2 f5 = (g2-g5)/2
+;// f3 = (g3+g4)/2 f4 = (g3-g4)/2
+;//
+;// Note that most coefficients are halved 3 times during the
+;// above calculation. We can rescale the algorithm dividing
+;// the input by 8 to remove the halvings.
+;//
+;// IStage 5: j(u) = T(u)*A(u)/8
+;//
+;// IStage 4: i0 = j0 i1 = j4
+;// i3 = j2 + j6 i2 = j2 - j6
+;// i7 = j5 + j3 i4 = j5 - j3
+;// i5 = j1 + j7 i6 = j1 - j7
+;//
+;// IStage 3: h0 = i0 + i1 h1 = i0 - i1
+;// h2 = (i2*sqrt2)-i3 h3 = i3
+;// h4 = 2*( cos(pi/8)*i4 + sin(pi/8)*i6)
+;// h6 = 2*(-sin(pi/8)*i4 + cos(pi/8)*i6)
+;// h5 = (i5-i7)*sqrt2 h7 = i5 + i7
+;//
+;// IStage 2: g0 = h0 + h3 g3 = h0 - h3
+;// g1 = h1 + h2 g2 = h1 - h2
+;// g7 = h7 g6 = h6 - h7
+;// g5 = h5 - g6 g4 = h4 - g5
+;//
+;// IStage 1: f0 = g0 + g7 f7 = g0 - g7
+;// f1 = g1 + g6 f6 = g1 - g6
+;// f2 = g2 + g5 f5 = g2 - g5
+;// f3 = g3 + g4 f4 = g3 - g4
+;//
+;// Note:
+;// 1. The scaling by A(u)/8 can often be combined with inverse
+;// quantization. The column and row scalings can be combined.
+;// 2. The flowgraph in the AAN paper has h4,g6 negated compared
+;// to the above code but is otherwise identical.
+;// 3. The rotation by -pi/8 can be performed using three multiplies
+;// Eg c*i4+s*i6 = (i6-i4)*s + (c+s)*i4
+;// -s*i4+c*i6 = (i6-i4)*s + (c-s)*i6
+;// 4. If |T(u)|<=1 then from the IDCT definition,
+;// |f(x)| <= ((1/sqrt2) + |c(1,x)| + .. + |c(7,x)|)/2
+;// = ((1/sqrt2) + cos(pi/16) + ... + cos(7*pi/16))/2
+;// = ((1/sqrt2) + (cot(pi/32)-1)/2)/2
+;// = (1 + cos(pi/16) + cos(2pi/16) + cos(3pi/16))/sqrt(2)
+;// = (approx)2.64
+;// So the max gain of the 2D IDCT is ~x7.0 = 3 bits.
+;// The table below shows input patterns generating the maximum
+;// value of |f(x)| for input in the range |T(u)|<=1. M=-1, P=+1
+;// InputPattern Max |f(x)|
+;// PPPPPPPP |f0| = 2.64
+;// PPPMMMMM |f1| = 2.64
+;// PPMMMPPP |f2| = 2.64
+;// PPMMPPMM |f3| = 2.64
+;// PMMPPMMP |f4| = 2.64
+;// PMMPMMPM |f5| = 2.64
+;// PMPPMPMP |f6| = 2.64
+;// PMPMPMPM |f7| = 2.64
+;// Note that this input pattern is the transpose of the
+;// corresponding max input pattern for the FDCT.
+
+;// Arguments
+
+pSrc RN 0 ;// source data buffer
+Stride RN 1 ;// destination stride in bytes (used to step pDest between output rows)
+pDest RN 2 ;// destination data buffer
+pScale RN 3 ;// pointer to aan-scale table (format selected by $inscale)
+
+
+ ;// DCT Inverse Macro
+ ;// The DCT code should be parametrized according
+ ;// to the following inputs:
+ ;// $outsize = "u8" : 8-bit unsigned data saturated (0 to +255)
+ ;// "s9" : 16-bit signed data saturated to 9-bit (-256 to +255)
+ ;// "s16" : 16-bit signed data not saturated (max size ~+/-14273)
+ ;// $inscale = "s16" : signed 16-bit aan-scale table, Q15 format, with 4 byte alignment
+ ;// "s32" : signed 32-bit aan-scale table, Q23 format, with 4 byte alignment
+ ;// $stride = "s" : output row stride is taken from the Stride register; any other value is used as an immediate byte stride by the ARM1136JS path, while the CortexA8 path then stores rows contiguously
+ ;// Inputs:
+ ;// pSrc = r0 = Pointer to input data
+ ;// Range is -256 to +255 (9-bit)
+ ;// Stride = r1 = Stride between output lines in bytes (applied to pDest)
+ ;// pDest = r2 = Pointer to output data
+ ;// pScale = r3 = Pointer to aan-scale table in the format defined by $inscale
+
+
+
+ MACRO
+ M_IDCT $outsize, $inscale, $stride
+ LCLA SHIFT ;// local arithmetic variable: descale shift set (SETA) by the ARM1136JS pre-scale code
+
+
+ IF ARM1136JS
+
+;// REGISTER ALLOCATION
+;// This is hard since we have 8 values, 9 free registers and each
+;// butterfly requires a temporary register. We also want to
+;// maintain register order so we can use LDM/STM. The table below
+;// summarises the register allocation that meets all these criteria.
+;// a=1stcol, b=2ndcol, f,g,h,i are dataflow points described above.
+;//
+;// r1 a01 g0 h0
+;// r4 b01 f0 g1 h1 i0
+;// r5 a23 f1 g2 i1
+;// r6 b23 f2 g3 h2 i2
+;// r7 a45 f3 h3 i3
+;// r8 b45 f4 g4 h4 i4
+;// r9 a67 f5 g5 h5 i5
+;// r10 b67 f6 g6 h6 i6
+;// r11 f7 g7 h7 i7
+;//
+ra01 RN 1
+rb01 RN 4
+ra23 RN 5
+rb23 RN 6
+ra45 RN 7
+rb45 RN 8
+ra67 RN 9
+rb67 RN 10
+rtmp RN 11
+csPiBy8 RN 12 ;// [ (Sin(pi/8)@Q15), (Cos(pi/8)@Q15) ]
+LoopRR2 RN 14 ;// [ LoopNumber<<13 , (1/Sqrt(2))@Q15 ]
+;// Transpose allocation
+xft RN ra01
+xf0 RN rb01
+xf1 RN ra23
+xf2 RN rb23
+xf3 RN ra45
+xf4 RN rb45
+xf5 RN ra67
+xf6 RN rb67
+xf7 RN rtmp
+;// IStage 1 allocation
+xg0 RN xft
+xg1 RN xf0
+xg2 RN xf1
+xg3 RN xf2
+xgt RN xf3
+xg4 RN xf4
+xg5 RN xf5
+xg6 RN xf6
+xg7 RN xf7
+;// IStage 2 allocation
+xh0 RN xg0
+xh1 RN xg1
+xht RN xg2
+xh2 RN xg3
+xh3 RN xgt
+xh4 RN xg4
+xh5 RN xg5
+xh6 RN xg6
+xh7 RN xg7
+;// IStage 3,4 allocation
+xit RN xh0
+xi0 RN xh1
+xi1 RN xht
+xi2 RN xh2
+xi3 RN xh3
+xi4 RN xh4
+xi5 RN xh5
+xi6 RN xh6
+xi7 RN xh7
+
+ M_STR pDest, ppDest
+ IF "$stride"="s"
+ M_STR Stride, pStride
+ ENDIF
+ M_ADR pDest, pBlk
+ LDR csPiBy8, =0x30fc7642 ;// top half 0x30fc = sin(pi/8), bottom half 0x7642 = cos(pi/8), both Q15
+ LDR LoopRR2, =0x00005a82 ;// top half = loop counter (starts at 0), bottom half 0x5a82 = 1/sqrt(2) in Q15
+
+v6_idct_col$_F
+ ;// Load even values
+ LDR xi4, [pSrc], #4 ;// j0
+ LDR xi5, [pSrc, #4*16-4] ;// j4
+ LDR xi6, [pSrc, #2*16-4] ;// j2
+ LDR xi7, [pSrc, #6*16-4] ;// j6
+
+ ;// Scale Even Values
+ IF "$inscale"="s16" ;// 16x16 mul
+SHIFT SETA 12
+ LDR xi0, [pScale], #4
+ LDR xi1, [pScale, #4*16-4]
+ LDR xi2, [pScale, #2*16-4]
+ MOV xit, #1<<(SHIFT-1)
+ SMLABB xi3, xi0, xi4, xit
+ SMLATT xi4, xi0, xi4, xit
+ SMLABB xi0, xi1, xi5, xit
+ SMLATT xi5, xi1, xi5, xit
+ MOV xi3, xi3, ASR #SHIFT
+ PKHBT xi4, xi3, xi4, LSL #(16-SHIFT)
+ LDR xi3, [pScale, #6*16-4]
+ SMLABB xi1, xi2, xi6, xit
+ SMLATT xi6, xi2, xi6, xit
+ MOV xi0, xi0, ASR #SHIFT
+ PKHBT xi5, xi0, xi5, LSL #(16-SHIFT)
+ SMLABB xi2, xi3, xi7, xit
+ SMLATT xi7, xi3, xi7, xit
+ MOV xi1, xi1, ASR #SHIFT
+ PKHBT xi6, xi1, xi6, LSL #(16-SHIFT)
+ MOV xi2, xi2, ASR #SHIFT
+ PKHBT xi7, xi2, xi7, LSL #(16-SHIFT)
+ ENDIF
+ IF "$inscale"="s32" ;// 32x16 mul
+SHIFT SETA (12+8-16)
+ MOV xit, #1<<(SHIFT-1)
+ LDR xi0, [pScale], #8
+ LDR xi1, [pScale, #0*32+4-8]
+ LDR xi2, [pScale, #4*32-8]
+ LDR xi3, [pScale, #4*32+4-8]
+ SMLAWB xi0, xi0, xi4, xit
+ SMLAWT xi1, xi1, xi4, xit
+ SMLAWB xi2, xi2, xi5, xit
+ SMLAWT xi3, xi3, xi5, xit
+ MOV xi0, xi0, ASR #SHIFT
+ PKHBT xi4, xi0, xi1, LSL #(16-SHIFT)
+ MOV xi2, xi2, ASR #SHIFT
+ PKHBT xi5, xi2, xi3, LSL #(16-SHIFT)
+ LDR xi0, [pScale, #2*32-8]
+ LDR xi1, [pScale, #2*32+4-8]
+ LDR xi2, [pScale, #6*32-8]
+ LDR xi3, [pScale, #6*32+4-8]
+ SMLAWB xi0, xi0, xi6, xit
+ SMLAWT xi1, xi1, xi6, xit
+ SMLAWB xi2, xi2, xi7, xit
+ SMLAWT xi3, xi3, xi7, xit
+ MOV xi0, xi0, ASR #SHIFT
+ PKHBT xi6, xi0, xi1, LSL #(16-SHIFT)
+ MOV xi2, xi2, ASR #SHIFT
+ PKHBT xi7, xi2, xi3, LSL #(16-SHIFT)
+ ENDIF
+
+ ;// Load odd values
+ LDR xi0, [pSrc, #1*16-4] ;// j1
+ LDR xi1, [pSrc, #7*16-4] ;// j7
+ LDR xi2, [pSrc, #5*16-4] ;// j5
+ LDR xi3, [pSrc, #3*16-4] ;// j3
+
+ IF {TRUE}
+ ;// shortcut if odd values 0
+ TEQ xi0, #0
+ TEQEQ xi1, #0
+ TEQEQ xi2, #0
+ TEQEQ xi3, #0
+ BEQ v6OddZero$_F
+ ENDIF
+
+ ;// Store scaled even values
+ STMIA pDest, {xi4, xi5, xi6, xi7}
+
+ ;// Scale odd values
+ IF "$inscale"="s16"
+ ;// Perform AAN Scale
+ LDR xi4, [pScale, #1*16-4]
+ LDR xi5, [pScale, #7*16-4]
+ LDR xi6, [pScale, #5*16-4]
+ SMLABB xi7, xi0, xi4, xit
+ SMLATT xi0, xi0, xi4, xit
+ SMLABB xi4, xi1, xi5, xit
+ SMLATT xi1, xi1, xi5, xit
+ MOV xi7, xi7, ASR #SHIFT
+ PKHBT xi0, xi7, xi0, LSL #(16-SHIFT)
+ LDR xi7, [pScale, #3*16-4]
+ SMLABB xi5, xi2, xi6, xit
+ SMLATT xi2, xi2, xi6, xit
+ MOV xi4, xi4, ASR #SHIFT
+ PKHBT xi1, xi4, xi1, LSL #(16-SHIFT)
+ SMLABB xi6, xi3, xi7, xit
+ SMLATT xi3, xi3, xi7, xit
+ MOV xi5, xi5, ASR #SHIFT
+ PKHBT xi2, xi5, xi2, LSL #(16-SHIFT)
+ MOV xi6, xi6, ASR #SHIFT
+ PKHBT xi3, xi6, xi3, LSL #(16-SHIFT)
+ ENDIF
+ IF "$inscale"="s32" ;// 32x16 mul
+ LDR xi4, [pScale, #1*32-8]
+ LDR xi5, [pScale, #1*32+4-8]
+ LDR xi6, [pScale, #7*32-8]
+ LDR xi7, [pScale, #7*32+4-8]
+ SMLAWB xi4, xi4, xi0, xit
+ SMLAWT xi5, xi5, xi0, xit
+ SMLAWB xi6, xi6, xi1, xit
+ SMLAWT xi7, xi7, xi1, xit
+ MOV xi4, xi4, ASR #SHIFT
+ PKHBT xi0, xi4, xi5, LSL #(16-SHIFT)
+ MOV xi6, xi6, ASR #SHIFT
+ PKHBT xi1, xi6, xi7, LSL #(16-SHIFT)
+ LDR xi4, [pScale, #5*32-8]
+ LDR xi5, [pScale, #5*32+4-8]
+ LDR xi6, [pScale, #3*32-8]
+ LDR xi7, [pScale, #3*32+4-8]
+ SMLAWB xi4, xi4, xi2, xit
+ SMLAWT xi5, xi5, xi2, xit
+ SMLAWB xi6, xi6, xi3, xit
+ SMLAWT xi7, xi7, xi3, xit
+ MOV xi4, xi4, ASR #SHIFT
+ PKHBT xi2, xi4, xi5, LSL #(16-SHIFT)
+ MOV xi6, xi6, ASR #SHIFT
+ PKHBT xi3, xi6, xi7, LSL #(16-SHIFT)
+ ENDIF
+
+ LDR xit, =0x00010001 ;// rounding constant
+ SADD16 xi5, xi0, xi1 ;// (j1+j7)/2
+ SHADD16 xi5, xi5, xit
+
+ SSUB16 xi6, xi0, xi1 ;// j1-j7
+ SADD16 xi7, xi2, xi3 ;// (j5+j3)/2
+ SHADD16 xi7, xi7, xit
+
+ SSUB16 xi4, xi2, xi3 ;// j5-j3
+
+ SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
+
+ PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
+ PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
+
+ SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
+ SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
+ SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
+ SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
+
+ SMULBB xi1, xi3, LoopRR2
+ SMULTB xi3, xi3, LoopRR2
+
+ PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
+ PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
+ SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
+
+ ;// xi0,xi1,xi2,xi3 now free
+ ;// IStage 4,3, rows 2to3 x1/2
+
+ MOV xi3, xi3, LSL #1
+ PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
+ LDRD xi0, [pDest, #8] ;// j2,j6 scaled
+
+ ;// IStage 2, rows4to7
+ SSUB16 xg6, xh6, xh7
+ SSUB16 xg5, xh5, xg6
+ SSUB16 xg4, xh4, xg5
+
+ SSUB16 xi2, xi0, xi1 ;// (j2-j6)
+
+ SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
+
+ SMULBB xi0, xi2, LoopRR2
+ SMULTB xi2, xi2, LoopRR2
+
+ MOV xi2, xi2, LSL #1
+ PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
+
+ ;// xi0, xi1 now free
+ ;// IStage 4,3 rows 0to1 x 1/2
+ LDRD xi0, [pDest] ;// j0, j4 scaled
+ SSUB16 xh2, xh2, xi3
+ ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
+
+ SHADD16 xh0, xi0, xi1
+ SHSUB16 xh1, xi0, xi1
+
+ ;// IStage 2 rows 0to3 x 1/2
+ SHSUB16 xg2, xh1, xh2
+ SHADD16 xg1, xh1, xh2
+ SHSUB16 xg3, xh0, xh3
+ SHADD16 xg0, xh0, xh3
+
+ ;// IStage 1 all rows
+ SADD16 xf3, xg3, xg4
+ SSUB16 xf4, xg3, xg4
+ SADD16 xf2, xg2, xg5
+ SSUB16 xf5, xg2, xg5
+ SADD16 xf1, xg1, xg6
+ SSUB16 xf6, xg1, xg6
+ SADD16 xf0, xg0, xg7
+ SSUB16 xf7, xg0, xg7
+
+ ;// Transpose, store and loop
+ PKHBT ra01, xf0, xf1, LSL #16
+ PKHTB rb01, xf1, xf0, ASR #16
+
+ PKHBT ra23, xf2, xf3, LSL #16
+ PKHTB rb23, xf3, xf2, ASR #16
+
+ PKHBT ra45, xf4, xf5, LSL #16
+ PKHTB rb45, xf5, xf4, ASR #16
+
+ PKHBT ra67, xf6, xf7, LSL #16
+ STMIA pDest!, {ra01, ra23, ra45, ra67}
+ PKHTB rb67, xf7, xf6, ASR #16
+ STMIA pDest!, {rb01, rb23, rb45, rb67}
+ BCC v6_idct_col$_F ;// repeat until the ADDS on LoopRR2 above carries out (4th pass)
+
+ SUB pSrc, pDest, #(64*2)
+ M_LDR pDest, ppDest
+ IF "$stride"="s"
+ M_LDR pScale, pStride
+ ENDIF
+ B v6_idct_row$_F
+
+v6OddZero$_F
+ SSUB16 xi2, xi6, xi7 ;// (j2-j6)
+ SHADD16 xi3, xi6, xi7 ;// (j2+j6)/2
+
+ SMULBB xi0, xi2, LoopRR2
+ SMULTB xi2, xi2, LoopRR2
+
+ MOV xi2, xi2, LSL #1
+ PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
+ SSUB16 xh2, xh2, xi3
+
+ ;// xi0, xi1 now free
+ ;// IStage 4,3 rows 0to1 x 1/2
+
+ SHADD16 xh0, xi4, xi5
+ SHSUB16 xh1, xi4, xi5
+
+ ;// IStage 2 rows 0to3 x 1/2
+ SHSUB16 xg2, xh1, xh2
+ SHADD16 xg1, xh1, xh2
+ SHSUB16 xg3, xh0, xh3
+ SHADD16 xg0, xh0, xh3
+
+ ;// IStage 1 all rows
+ MOV xf3, xg3
+ MOV xf4, xg3
+ MOV xf2, xg2
+ MOV xf5, xg2
+ MOV xf1, xg1
+ MOV xf6, xg1
+ MOV xf0, xg0
+ MOV xf7, xg0
+
+ ;// Transpose
+ PKHBT ra01, xf0, xf1, LSL #16
+ PKHTB rb01, xf1, xf0, ASR #16
+
+ PKHBT ra23, xf2, xf3, LSL #16
+ PKHTB rb23, xf3, xf2, ASR #16
+
+ PKHBT ra45, xf4, xf5, LSL #16
+ PKHTB rb45, xf5, xf4, ASR #16
+
+ PKHBT ra67, xf6, xf7, LSL #16
+ PKHTB rb67, xf7, xf6, ASR #16
+
+ STMIA pDest!, {ra01, ra23, ra45, ra67}
+ ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
+ STMIA pDest!, {rb01, rb23, rb45, rb67}
+
+ BCC v6_idct_col$_F ;// repeat until the ADDS on LoopRR2 above carries out
+ SUB pSrc, pDest, #(64*2)
+ M_LDR pDest, ppDest
+ IF "$stride"="s"
+ M_LDR pScale, pStride
+ ENDIF
+
+
+v6_idct_row$_F
+ ;// IStage 4,3, rows4to7 x1/4
+ LDR xit, =0x00010001 ;// rounding constant
+ LDR xi0, [pSrc, #1*16] ;// j1
+ LDR xi1, [pSrc, #7*16] ;// 4*j7
+ LDR xi2, [pSrc, #5*16] ;// j5
+ LDR xi3, [pSrc, #3*16] ;// j3
+
+ SHADD16 xi1, xi1, xit ;// 2*j7
+ SHADD16 xi1, xi1, xit ;// j7
+
+ SHADD16 xi5, xi0, xi1 ;// (j1+j7)/2
+ SSUB16 xi6, xi0, xi1 ;// j1-j7
+ SHADD16 xi7, xi2, xi3 ;// (j5+j3)/2
+ SSUB16 xi4, xi2, xi3 ;// j5-j3
+
+ SSUB16 xi3, xi5, xi7 ;// (i5-i7)/2
+
+ PKHBT xi0, xi6, xi4, LSL#16 ;// [i4,i6] row a
+ PKHTB xi1, xi4, xi6, ASR#16 ;// [i4,i6] row b
+
+ SMUADX xi2, xi0, csPiBy8 ;// rowa by [c,s]
+ SMUADX xi4, xi1, csPiBy8 ;// rowb by [c,s]
+ SMUSD xi0, xi0, csPiBy8 ;// rowa by [-s,c]
+ SMUSD xi6, xi1, csPiBy8 ;// rowb by [-s,c]
+
+ SMULBB xi1, xi3, LoopRR2
+ SMULTB xi3, xi3, LoopRR2
+
+ PKHTB xh4, xi4, xi2, ASR#16 ;// h4/4
+ PKHTB xh6, xi6, xi0, ASR#16 ;// h6/4
+ SHADD16 xh7, xi5, xi7 ;// (i5+i7)/4
+
+ MOV xi3, xi3, LSL #1
+ PKHTB xh5, xi3, xi1, ASR#15 ;// h5/4
+
+ ;// xi0,xi1,xi2,xi3 now free
+ ;// IStage 4,3, rows 2to3 x1/2
+
+ LDR xi0, [pSrc, #2*16] ;// j2
+ LDR xi1, [pSrc, #6*16] ;// 2*j6
+
+ ;// IStage 2, rows4to7
+ SSUB16 xg6, xh6, xh7
+ SSUB16 xg5, xh5, xg6
+ SSUB16 xg4, xh4, xg5
+
+ SHADD16 xi1, xi1, xit ;// j6
+ SSUB16 xi2, xi0, xi1 ;// (j2-j6)
+ SHADD16 xi3, xi0, xi1 ;// (j2+j6)/2
+
+ SMULBB xi0, xi2, LoopRR2
+ SMULTB xi2, xi2, LoopRR2
+
+ MOV xi2, xi2, LSL #1
+
+ PKHTB xh2, xi2, xi0, ASR#15 ;// i2*sqrt(2)/4
+
+ ;// xi0, xi1 now free
+ ;// IStage 4,3 rows 0to1 x 1/2
+ LDR xi1, [pSrc, #4*16] ;// j4
+ LDR xi0, [pSrc], #4 ;// j0
+
+ SSUB16 xh2, xh2, xi3
+ ADDS LoopRR2, LoopRR2, #2<<29 ;// done two rows
+
+ ADD xi0, xi0, xit, LSL #2 ;// ensure correct round
+ SHADD16 xh0, xi0, xi1 ;// of DC result
+ SHSUB16 xh1, xi0, xi1
+
+ ;// IStage 2 rows 0to3 x 1/2
+ SHSUB16 xg2, xh1, xh2
+ SHADD16 xg1, xh1, xh2
+ SHSUB16 xg3, xh0, xh3
+ SHADD16 xg0, xh0, xh3
+
+ ;// IStage 1 all rows
+ SHADD16 xf3, xg3, xg4
+ SHSUB16 xf4, xg3, xg4
+ SHADD16 xf2, xg2, xg5
+ SHSUB16 xf5, xg2, xg5
+ SHADD16 xf1, xg1, xg6
+ SHSUB16 xf6, xg1, xg6
+ SHADD16 xf0, xg0, xg7
+ SHSUB16 xf7, xg0, xg7
+
+ ;// Saturate
+ IF ("$outsize"="u8")
+ USAT16 xf0, #8, xf0
+ USAT16 xf1, #8, xf1
+ USAT16 xf2, #8, xf2
+ USAT16 xf3, #8, xf3
+ USAT16 xf4, #8, xf4
+ USAT16 xf5, #8, xf5
+ USAT16 xf6, #8, xf6
+ USAT16 xf7, #8, xf7
+ ENDIF
+ IF ("$outsize"="s9")
+ SSAT16 xf0, #9, xf0
+ SSAT16 xf1, #9, xf1
+ SSAT16 xf2, #9, xf2
+ SSAT16 xf3, #9, xf3
+ SSAT16 xf4, #9, xf4
+ SSAT16 xf5, #9, xf5
+ SSAT16 xf6, #9, xf6
+ SSAT16 xf7, #9, xf7
+ ENDIF
+
+ ;// Transpose to Row, Pack and store
+ IF ("$outsize"="u8")
+ ORR xf0, xf0, xf1, LSL #8 ;// [ b1 b0 a1 a0 ]
+ ORR xf2, xf2, xf3, LSL #8 ;// [ b3 b2 a3 a2 ]
+ ORR xf4, xf4, xf5, LSL #8 ;// [ b5 b4 a5 a4 ]
+ ORR xf6, xf6, xf7, LSL #8 ;// [ b7 b6 a7 a6 ]
+ PKHBT ra01, xf0, xf2, LSL #16
+ PKHTB rb01, xf2, xf0, ASR #16
+ PKHBT ra23, xf4, xf6, LSL #16
+ PKHTB rb23, xf6, xf4, ASR #16
+ STMIA pDest, {ra01, ra23}
+ IF "$stride"="s"
+ ADD pDest, pDest, pScale
+ STMIA pDest, {rb01, rb23}
+ ADD pDest, pDest, pScale
+ ELSE
+ ADD pDest, pDest, #($stride)
+ STMIA pDest, {rb01, rb23}
+ ADD pDest, pDest, #($stride)
+ ENDIF
+ ENDIF
+ IF ("$outsize"="s9"):LOR:("$outsize"="s16")
+ PKHBT ra01, xf0, xf1, LSL #16
+ PKHTB rb01, xf1, xf0, ASR #16
+
+ PKHBT ra23, xf2, xf3, LSL #16
+ PKHTB rb23, xf3, xf2, ASR #16
+
+ PKHBT ra45, xf4, xf5, LSL #16
+ PKHTB rb45, xf5, xf4, ASR #16
+
+ PKHBT ra67, xf6, xf7, LSL #16
+ PKHTB rb67, xf7, xf6, ASR #16
+
+ STMIA pDest, {ra01, ra23, ra45, ra67}
+ IF "$stride"="s"
+ ADD pDest, pDest, pScale
+ STMIA pDest, {rb01, rb23, rb45, rb67}
+ ADD pDest, pDest, pScale
+ ELSE
+ ADD pDest, pDest, #($stride)
+ STMIA pDest, {rb01, rb23, rb45, rb67}
+ ADD pDest, pDest, #($stride)
+ ENDIF
+ ENDIF
+
+ BCC v6_idct_row$_F ;// repeat until the ADDS on LoopRR2 above carries out; pScale holds the stride here when $stride="s"
+ ENDIF ;// ARM1136JS
+
+
+ IF CortexA8
+
+Src0 EQU 7
+Src1 EQU 8
+Src2 EQU 9
+Src3 EQU 10
+Src4 EQU 11
+Src5 EQU 12
+Src6 EQU 13
+Src7 EQU 14
+Tmp EQU 15
+
+qXj0 QN Src0.S16
+qXj1 QN Src1.S16
+qXj2 QN Src2.S16
+qXj3 QN Src3.S16
+qXj4 QN Src4.S16
+qXj5 QN Src5.S16
+qXj6 QN Src6.S16
+qXj7 QN Src7.S16
+qXjt QN Tmp.S16
+
+dXj0lo DN (Src0*2).S16
+dXj0hi DN (Src0*2+1).S16
+dXj1lo DN (Src1*2).S16
+dXj1hi DN (Src1*2+1).S16
+dXj2lo DN (Src2*2).S16
+dXj2hi DN (Src2*2+1).S16
+dXj3lo DN (Src3*2).S16
+dXj3hi DN (Src3*2+1).S16
+dXj4lo DN (Src4*2).S16
+dXj4hi DN (Src4*2+1).S16
+dXj5lo DN (Src5*2).S16
+dXj5hi DN (Src5*2+1).S16
+dXj6lo DN (Src6*2).S16
+dXj6hi DN (Src6*2+1).S16
+dXj7lo DN (Src7*2).S16
+dXj7hi DN (Src7*2+1).S16
+dXjtlo DN (Tmp*2).S16
+dXjthi DN (Tmp*2+1).S16
+
+qXi0 QN qXj0
+qXi1 QN qXj4
+qXi2 QN qXj2
+qXi3 QN qXj7
+qXi4 QN qXj5
+qXi5 QN qXjt
+qXi6 QN qXj1
+qXi7 QN qXj6
+qXit QN qXj3
+
+dXi0lo DN dXj0lo
+dXi0hi DN dXj0hi
+dXi1lo DN dXj4lo
+dXi1hi DN dXj4hi
+dXi2lo DN dXj2lo
+dXi2hi DN dXj2hi
+dXi3lo DN dXj7lo
+dXi3hi DN dXj7hi
+dXi4lo DN dXj5lo
+dXi4hi DN dXj5hi
+dXi5lo DN dXjtlo
+dXi5hi DN dXjthi
+dXi6lo DN dXj1lo
+dXi6hi DN dXj1hi
+dXi7lo DN dXj6lo
+dXi7hi DN dXj6hi
+dXitlo DN dXj3lo
+dXithi DN dXj3hi
+
+qXh0 QN qXit
+qXh1 QN qXi0
+qXh2 QN qXi2
+qXh3 QN qXi3
+qXh4 QN qXi7
+qXh5 QN qXi5
+qXh6 QN qXi4
+qXh7 QN qXi1
+qXht QN qXi6
+
+dXh0lo DN dXitlo
+dXh0hi DN dXithi
+dXh1lo DN dXi0lo
+dXh1hi DN dXi0hi
+dXh2lo DN dXi2lo
+dXh2hi DN dXi2hi
+dXh3lo DN dXi3lo
+dXh3hi DN dXi3hi
+dXh4lo DN dXi7lo
+dXh4hi DN dXi7hi
+dXh5lo DN dXi5lo
+dXh5hi DN dXi5hi
+dXh6lo DN dXi4lo
+dXh6hi DN dXi4hi
+dXh7lo DN dXi1lo
+dXh7hi DN dXi1hi
+dXhtlo DN dXi6lo
+dXhthi DN dXi6hi
+
+qXg0 QN qXh2
+qXg1 QN qXht
+qXg2 QN qXh1
+qXg3 QN qXh0
+qXg4 QN qXh4
+qXg5 QN qXh5
+qXg6 QN qXh6
+qXg7 QN qXh7
+qXgt QN qXh3
+
+qXf0 QN qXg6
+qXf1 QN qXg5
+qXf2 QN qXg4
+qXf3 QN qXgt
+qXf4 QN qXg3
+qXf5 QN qXg2
+qXf6 QN qXg1
+qXf7 QN qXg0
+qXft QN qXg7
+
+
+qXt0 QN 1.S32
+qXt1 QN 2.S32
+qT0lo QN 1.S32
+qT0hi QN 2.S32
+qT1lo QN 3.S32
+qT1hi QN 4.S32
+qScalelo QN 5.S32 ;// used to read post scale values
+qScalehi QN 6.S32
+qTemp0 QN 5.S32
+qTemp1 QN 6.S32
+
+
+Scale1 EQU 6
+Scale2 EQU 15
+qScale1 QN Scale1.S16
+qScale2 QN Scale2.S16
+dScale1lo DN (Scale1*2).S16
+dScale1hi DN (Scale1*2+1).S16
+dScale2lo DN (Scale2*2).S16
+dScale2hi DN (Scale2*2+1).S16
+
+dCoefs DN 0.S16 ;// Scale coefficients in format {[0] [C] [S] [InvSqrt2]}
+InvSqrt2 DN dCoefs[0] ;// 1/sqrt(2) in Q15
+S DN dCoefs[1] ;// Sin(PI/8) in Q15
+C DN dCoefs[2] ;// Cos(PI/8) in Q15
+
+pTemp RN 12
+
+
+ IMPORT armCOMM_IDCTCoef
+
+ VLD1 {qXj0,qXj1}, [pSrc @64]!
+ VLD1 {qXj2,qXj3}, [pSrc @64]!
+ VLD1 {qXj4,qXj5}, [pSrc @64]!
+ VLD1 {qXj6,qXj7}, [pSrc @64]!
+
+ ;// Load PreScale and multiply with Src
+ ;// IStage 4
+
+ IF "$inscale"="s16" ;// 16X16 Mul
+ M_IDCT_PRESCALE16
+ ENDIF
+
+ IF "$inscale"="s32" ;// 32X32 Mul
+ M_IDCT_PRESCALE32
+ ENDIF
+
+ ;// IStage 3
+ VQDMULH qXi2, qXi2, InvSqrt2 ;// i2/sqrt(2)
+ VHADD qXh0, qXi0, qXi1 ;// (i0+i1)/2
+ VHSUB qXh1, qXi0, qXi1 ;// (i0-i1)/2
+ VHADD qXh7, qXi5, qXi7 ;// (i5+i7)/4
+ VSUB qXh5, qXi5, qXi7 ;// (i5-i7)/2
+ VQDMULH qXh5, qXh5, InvSqrt2 ;// h5/sqrt(2)
+ VSUB qXh2, qXi2, qXi3 ;// h2, h3
+
+ VMULL qXt0, dXi4lo, C ;// c*i4
+ VMLAL qXt0, dXi6lo, S ;// c*i4+s*i6
+ VMULL qXt1, dXi4hi, C
+ VMLAL qXt1, dXi6hi, S
+ VSHRN dXh4lo, qXt0, #16 ;// h4
+ VSHRN dXh4hi, qXt1, #16
+
+ VMULL qXt0, dXi6lo, C ;// c*i6
+ VMLSL qXt0, dXi4lo, S ;// -s*i4 + c*i6
+ VMULL qXt1, dXi6hi, C
+ VMLSL qXt1, dXi4hi, S
+ VSHRN dXh6lo, qXt0, #16 ;// h6
+ VSHRN dXh6hi, qXt1, #16
+
+ ;// IStage 2
+ VSUB qXg6, qXh6, qXh7
+ VSUB qXg5, qXh5, qXg6
+ VSUB qXg4, qXh4, qXg5
+ VHADD qXg1, qXh1, qXh2 ;// (h1+h2)/2
+ VHSUB qXg2, qXh1, qXh2 ;// (h1-h2)/2
+ VHADD qXg0, qXh0, qXh3 ;// (h0+h3)/2
+ VHSUB qXg3, qXh0, qXh3 ;// (h0-h3)/2
+
+ ;// IStage 1 all rows
+ VADD qXf3, qXg3, qXg4
+ VSUB qXf4, qXg3, qXg4
+ VADD qXf2, qXg2, qXg5
+ VSUB qXf5, qXg2, qXg5
+ VADD qXf1, qXg1, qXg6
+ VSUB qXf6, qXg1, qXg6
+ VADD qXf0, qXg0, qXg7
+ VSUB qXf7, qXg0, qXg7
+
+ ;// Transpose, store and loop
+XTR0 EQU Src5
+XTR1 EQU Tmp
+XTR2 EQU Src6
+XTR3 EQU Src7
+XTR4 EQU Src3
+XTR5 EQU Src0
+XTR6 EQU Src1
+XTR7 EQU Src2
+XTRt EQU Src4
+
+qA0 QN XTR0.S32 ;// for XTRpose
+qA1 QN XTR1.S32
+qA2 QN XTR2.S32
+qA3 QN XTR3.S32
+qA4 QN XTR4.S32
+qA5 QN XTR5.S32
+qA6 QN XTR6.S32
+qA7 QN XTR7.S32
+
+dB0 DN XTR0*2+1 ;// for using VSWP
+dB1 DN XTR1*2+1
+dB2 DN XTR2*2+1
+dB3 DN XTR3*2+1
+dB4 DN XTR4*2
+dB5 DN XTR5*2
+dB6 DN XTR6*2
+dB7 DN XTR7*2
+
+
+ VTRN qXf0, qXf1
+ VTRN qXf2, qXf3
+ VTRN qXf4, qXf5
+ VTRN qXf6, qXf7
+ VTRN qA0, qA2
+ VTRN qA1, qA3
+ VTRN qA4, qA6
+ VTRN qA5, qA7
+ VSWP dB0, dB4
+ VSWP dB1, dB5
+ VSWP dB2, dB6
+ VSWP dB3, dB7
+
+
+qYj0 QN qXf0
+qYj1 QN qXf1
+qYj2 QN qXf2
+qYj3 QN qXf3
+qYj4 QN qXf4
+qYj5 QN qXf5
+qYj6 QN qXf6
+qYj7 QN qXf7
+qYjt QN qXft
+
+dYj0lo DN (XTR0*2).S16
+dYj0hi DN (XTR0*2+1).S16
+dYj1lo DN (XTR1*2).S16
+dYj1hi DN (XTR1*2+1).S16
+dYj2lo DN (XTR2*2).S16
+dYj2hi DN (XTR2*2+1).S16
+dYj3lo DN (XTR3*2).S16
+dYj3hi DN (XTR3*2+1).S16
+dYj4lo DN (XTR4*2).S16
+dYj4hi DN (XTR4*2+1).S16
+dYj5lo DN (XTR5*2).S16
+dYj5hi DN (XTR5*2+1).S16
+dYj6lo DN (XTR6*2).S16
+dYj6hi DN (XTR6*2+1).S16
+dYj7lo DN (XTR7*2).S16
+dYj7hi DN (XTR7*2+1).S16
+dYjtlo DN (XTRt*2).S16
+dYjthi DN (XTRt*2+1).S16
+
+qYi0 QN qYj0
+qYi1 QN qYj4
+qYi2 QN qYj2
+qYi3 QN qYj7
+qYi4 QN qYj5
+qYi5 QN qYjt
+qYi6 QN qYj1
+qYi7 QN qYj6
+qYit QN qYj3
+
+dYi0lo DN dYj0lo
+dYi0hi DN dYj0hi
+dYi1lo DN dYj4lo
+dYi1hi DN dYj4hi
+dYi2lo DN dYj2lo
+dYi2hi DN dYj2hi
+dYi3lo DN dYj7lo
+dYi3hi DN dYj7hi
+dYi4lo DN dYj5lo
+dYi4hi DN dYj5hi
+dYi5lo DN dYjtlo
+dYi5hi DN dYjthi
+dYi6lo DN dYj1lo
+dYi6hi DN dYj1hi
+dYi7lo DN dYj6lo
+dYi7hi DN dYj6hi
+dYitlo DN dYj3lo
+dYithi DN dYj3hi
+
+qYh0 QN qYit
+qYh1 QN qYi0
+qYh2 QN qYi2
+qYh3 QN qYi3
+qYh4 QN qYi7
+qYh5 QN qYi5
+qYh6 QN qYi4
+qYh7 QN qYi1
+qYht QN qYi6
+
+dYh0lo DN dYitlo
+dYh0hi DN dYithi
+dYh1lo DN dYi0lo
+dYh1hi DN dYi0hi
+dYh2lo DN dYi2lo
+dYh2hi DN dYi2hi
+dYh3lo DN dYi3lo
+dYh3hi DN dYi3hi
+dYh4lo DN dYi7lo
+dYh4hi DN dYi7hi
+dYh5lo DN dYi5lo
+dYh5hi DN dYi5hi
+dYh6lo DN dYi4lo
+dYh6hi DN dYi4hi
+dYh7lo DN dYi1lo
+dYh7hi DN dYi1hi
+dYhtlo DN dYi6lo
+dYhthi DN dYi6hi
+
+qYg0 QN qYh2
+qYg1 QN qYht
+qYg2 QN qYh1
+qYg3 QN qYh0
+qYg4 QN qYh4
+qYg5 QN qYh5
+qYg6 QN qYh6
+qYg7 QN qYh7
+qYgt QN qYh3
+
+qYf0 QN qYg6
+qYf1 QN qYg5
+qYf2 QN qYg4
+qYf3 QN qYgt
+qYf4 QN qYg3
+qYf5 QN qYg2
+qYf6 QN qYg1
+qYf7 QN qYg0
+qYft QN qYg7
+
+ VRSHR qYj7, qYj7, #2 ;// rounding shift right by 2 (the ARM1136JS row pass labels this input 4*j7)
+ VRSHR qYj6, qYj6, #1 ;// rounding shift right by 1 (the ARM1136JS row pass labels this input 2*j6)
+
+ VHADD qYi5, qYj1, qYj7 ;// i5 = (j1+j7)/2
+ VSUB qYi6, qYj1, qYj7 ;// i6 = j1-j7
+ VHADD qYi3, qYj2, qYj6 ;// i3 = (j2+j6)/2
+ VSUB qYi2, qYj2, qYj6 ;// i2 = j2-j6
+ VHADD qYi7, qYj5, qYj3 ;// i7 = (j5+j3)/2
+ VSUB qYi4, qYj5, qYj3 ;// i4 = j5-j3
+
+ VQDMULH qYi2, qYi2, InvSqrt2 ;// i2/sqrt(2)
+ ;// IStage 4,3 rows 0to1 x 1/2
+
+ MOV pTemp, #0x4 ;// ensure correct round
+ VDUP qScale1, pTemp ;// of DC result
+ VADD qYi0, qYi0, qScale1
+
+ VHADD qYh0, qYi0, qYi1 ;// (i0+i1)/2
+ VHSUB qYh1, qYi0, qYi1 ;// (i0-i1)/2
+
+ VHADD qYh7, qYi5, qYi7 ;// (i5+i7)/4
+ VSUB qYh5, qYi5, qYi7 ;// (i5-i7)/2
+ VSUB qYh2, qYi2, qYi3 ;// h2, h3
+ VQDMULH qYh5, qYh5, InvSqrt2 ;// h5/sqrt(2)
+
+ VMULL qXt0, dYi4lo, C ;// c*i4
+ VMLAL qXt0, dYi6lo, S ;// c*i4+s*i6
+ VMULL qXt1, dYi4hi, C
+ VMLAL qXt1, dYi6hi, S
+ VSHRN dYh4lo, qXt0, #16 ;// h4
+ VSHRN dYh4hi, qXt1, #16
+
+ VMULL qXt0, dYi6lo, C ;// c*i6
+ VMLSL qXt0, dYi4lo, S ;// -s*i4 + c*i6
+ VMULL qXt1, dYi6hi, C
+ VMLSL qXt1, dYi4hi, S
+ VSHRN dYh6lo, qXt0, #16 ;// h6
+ VSHRN dYh6hi, qXt1, #16
+
+ VSUB qYg6, qYh6, qYh7
+ VSUB qYg5, qYh5, qYg6
+ VSUB qYg4, qYh4, qYg5
+
+ ;// IStage 2 rows 0to3 x 1/2
+ VHADD qYg1, qYh1, qYh2 ;// (h1+h2)/2
+ VHSUB qYg2, qYh1, qYh2 ;// (h1-h2)/2
+ VHADD qYg0, qYh0, qYh3 ;// (h0+h3)/2
+ VHSUB qYg3, qYh0, qYh3 ;// (h0-h3)/2
+
+
+ ;// IStage 1 all rows
+ VHADD qYf3, qYg3, qYg4
+ VHSUB qYf4, qYg3, qYg4
+ VHADD qYf2, qYg2, qYg5
+ VHSUB qYf5, qYg2, qYg5
+ VHADD qYf1, qYg1, qYg6
+ VHSUB qYf6, qYg1, qYg6
+ VHADD qYf0, qYg0, qYg7
+ VHSUB qYf7, qYg0, qYg7
+
+YTR0 EQU Src0
+YTR1 EQU Src4
+YTR2 EQU Src1
+YTR3 EQU Src2
+YTR4 EQU Src7
+YTR5 EQU Src5
+YTR6 EQU Tmp
+YTR7 EQU Src6
+YTRt EQU Src3
+
+qC0 QN YTR0.S32 ;// for YTRpose
+qC1 QN YTR1.S32
+qC2 QN YTR2.S32
+qC3 QN YTR3.S32
+qC4 QN YTR4.S32
+qC5 QN YTR5.S32
+qC6 QN YTR6.S32
+qC7 QN YTR7.S32
+
+dD0 DN YTR0*2+1 ;// for using VSWP
+dD1 DN YTR1*2+1
+dD2 DN YTR2*2+1
+dD3 DN YTR3*2+1
+dD4 DN YTR4*2
+dD5 DN YTR5*2
+dD6 DN YTR6*2
+dD7 DN YTR7*2
+
+ VTRN qYf0, qYf1
+ VTRN qYf2, qYf3
+ VTRN qYf4, qYf5
+ VTRN qYf6, qYf7
+ VTRN qC0, qC2
+ VTRN qC1, qC3
+ VTRN qC4, qC6
+ VTRN qC5, qC7
+ VSWP dD0, dD4
+ VSWP dD1, dD5
+ VSWP dD2, dD6
+ VSWP dD3, dD7
+
+
+dYf0U8 DN YTR0*2.U8
+dYf1U8 DN YTR1*2.U8
+dYf2U8 DN YTR2*2.U8
+dYf3U8 DN YTR3*2.U8
+dYf4U8 DN YTR4*2.U8
+dYf5U8 DN YTR5*2.U8
+dYf6U8 DN YTR6*2.U8
+dYf7U8 DN YTR7*2.U8
+
+ ;//
+ ;// Do saturation if outsize is other than S16
+ ;//
+
+ IF ("$outsize"="u8")
+ ;// Output range [0-255]
+ VQMOVN dYf0U8, qYf0
+ VQMOVN dYf1U8, qYf1
+ VQMOVN dYf2U8, qYf2
+ VQMOVN dYf3U8, qYf3
+ VQMOVN dYf4U8, qYf4
+ VQMOVN dYf5U8, qYf5
+ VQMOVN dYf6U8, qYf6
+ VQMOVN dYf7U8, qYf7
+ ENDIF
+
+ IF ("$outsize"="s9")
+ ;// Output range [-256 to +255]
+ VQSHL qYf0, qYf0, #16-9
+ VQSHL qYf1, qYf1, #16-9
+ VQSHL qYf2, qYf2, #16-9
+ VQSHL qYf3, qYf3, #16-9
+ VQSHL qYf4, qYf4, #16-9
+ VQSHL qYf5, qYf5, #16-9
+ VQSHL qYf6, qYf6, #16-9
+ VQSHL qYf7, qYf7, #16-9
+
+ VSHR qYf0, qYf0, #16-9
+ VSHR qYf1, qYf1, #16-9
+ VSHR qYf2, qYf2, #16-9
+ VSHR qYf3, qYf3, #16-9
+ VSHR qYf4, qYf4, #16-9
+ VSHR qYf5, qYf5, #16-9
+ VSHR qYf6, qYf6, #16-9
+ VSHR qYf7, qYf7, #16-9
+ ENDIF
+
+ ;// Store output depending on the Stride size
+ IF "$stride"="s"
+ VST1 qYf0, [pDest @64], Stride
+ VST1 qYf1, [pDest @64], Stride
+ VST1 qYf2, [pDest @64], Stride
+ VST1 qYf3, [pDest @64], Stride
+ VST1 qYf4, [pDest @64], Stride
+ VST1 qYf5, [pDest @64], Stride
+ VST1 qYf6, [pDest @64], Stride
+ VST1 qYf7, [pDest @64]
+ ELSE
+ IF ("$outsize"="u8")
+ VST1 dYf0U8, [pDest @64], #8
+ VST1 dYf1U8, [pDest @64], #8
+ VST1 dYf2U8, [pDest @64], #8
+ VST1 dYf3U8, [pDest @64], #8
+ VST1 dYf4U8, [pDest @64], #8
+ VST1 dYf5U8, [pDest @64], #8
+ VST1 dYf6U8, [pDest @64], #8
+ VST1 dYf7U8, [pDest @64]
+ ELSE
+ ;// ("$outsize"="s9") or ("$outsize"="s16")
+ VST1 qYf0, [pDest @64], #16
+ VST1 qYf1, [pDest @64], #16
+ VST1 qYf2, [pDest @64], #16
+ VST1 qYf3, [pDest @64], #16
+ VST1 qYf4, [pDest @64], #16
+ VST1 qYf5, [pDest @64], #16
+ VST1 qYf6, [pDest @64], #16
+ VST1 qYf7, [pDest @64]
+ ENDIF
+
+ ENDIF
+
+
+
+ ENDIF ;// CortexA8
+
+
+
+ MEND
+
+ ;// Scale TWO input rows with TWO rows of 16 bit scale values
+ ;//
+ ;// This macro is used by M_IDCT_PRESCALE16 to pre-scale two rows of
+ ;// input (eight values per row) with two rows of scale values. It also
+ ;// loads the next two rows of scale values from pScale when $LastRow is 0.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// $dAlo - Input D register with first four S16 values of row n
+ ;// $dAhi - Input D register with next four S16 values of row n
+ ;// $dBlo - Input D register with first four S16 values of row n+1
+ ;// $dBhi - Input D register with next four S16 values of row n+1
+ ;// pScale - Pointer to next row of scale values
+ ;// qT0lo - Temporary scratch register
+ ;// qT0hi - Temporary scratch register
+ ;// qT1lo - Temporary scratch register
+ ;// qT1hi - Temporary scratch register
+ ;// dScale1lo - Scale value of row n
+ ;// dScale1hi - Scale value of row n
+ ;// dScale2lo - Scale value of row n+1
+ ;// dScale2hi - Scale value of row n+1
+ ;//
+ ;// Input Flag
+ ;//
+ ;// $LastRow - Flag to indicate whether current row is last row
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// $dAlo - Scaled output values (first four S16 of row n)
+ ;// $dAhi - Scaled output values (next four S16 of row n)
+ ;// $dBlo - Scaled output values (first four S16 of row n+1)
+ ;// $dBhi - Scaled output values (next four S16 of row n+1)
+ ;// qScale1 - Scale values for next row
+ ;// qScale2 - Scale values for next row+1
+ ;// pScale - Pointer to next row of scale values
+ ;//
+ MACRO
+ M_IDCT_SCALE16 $dAlo, $dAhi, $dBlo, $dBhi, $LastRow
+ ;// Long (widening) multiply of each input half-row by its scale half-row
+ VMULL qT0lo, $dAlo, dScale1lo
+ VMULL qT0hi, $dAhi, dScale1hi
+ VMULL qT1lo, $dBlo, dScale2lo
+ VMULL qT1hi, $dBhi, dScale2hi
+ ;// The current scale values have been consumed by the multiplies above,
+ ;// so the scale registers can be refilled for the following row pair.
+ ;// NOTE(review): dScale1lo/hi and dScale2lo/hi are presumably the halves
+ ;// of qScale1/qScale2; their definitions are outside this macro.
+ IF "$LastRow"="0"
+ VLD1 qScale1, [pScale], #16 ;// Load scale for row n+2
+ VLD1 qScale2, [pScale], #16 ;// Load scale for row n+3
+ ENDIF
+ ;// Saturating, rounding shift right by 12, narrowing the products back
+ ;// into the input registers
+ VQRSHRN $dAlo, qT0lo, #12
+ VQRSHRN $dAhi, qT0hi, #12
+ VQRSHRN $dBlo, qT1lo, #12
+ VQRSHRN $dBhi, qT1hi, #12
+ MEND
+
+ ;// Scale 8x8 block input values with 16 bit scale values
+ ;//
+ ;// This macro is used to pre-scale block of 8x8 input.
+ ;// This also do the Ist stage transformations of IDCT.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// dXjnlo - n th input D register with first four S16 values
+ ;// dXjnhi - n th input D register with next four S16 values
+ ;// qXjn - n th input Q register with eight S16 values
+ ;// pScale - Pointer to scale values
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// qXin - n th output Q register with eight S16 output values of 1st stage
+ ;//
+ MACRO
+ M_IDCT_PRESCALE16
+ VLD1 qScale1, [pScale], #16 ;// Load Pre scale for row 0
+ VLD1 qScale2, [pScale], #16 ;// Load Pre scale for row 1
+ M_IDCT_SCALE16 dXj0lo, dXj0hi, dXj1lo, dXj1hi, 0 ;// Pre scale row 0 & 1
+ M_IDCT_SCALE16 dXj2lo, dXj2hi, dXj3lo, dXj3hi, 0 ;// Pre scale row 2 & 3
+ M_IDCT_SCALE16 dXj4lo, dXj4hi, dXj5lo, dXj5hi, 0 ;// Pre scale row 4 & 5
+ M_IDCT_SCALE16 dXj6lo, dXj6hi, dXj7lo, dXj7hi, 1 ;// Pre scale row 6 & 7, last pair: no further scale load
+ ;// First IDCT stage on rows 1/7, 2/6 and 5/3, interleaved with
+ ;// fetching the constant table
+ VHADD qXi5, qXj1, qXj7 ;// (j1+j7)/2
+ VSUB qXi6, qXj1, qXj7 ;// j1-j7
+ LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants (overwrites pSrc)
+ VHADD qXi3, qXj2, qXj6 ;// (j2+j6)/2
+ VSUB qXi2, qXj2, qXj6 ;// j2-j6
+ VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
+ VHADD qXi7, qXj5, qXj3 ;// (j5+j3)/2
+ VSUB qXi4, qXj5, qXj3 ;// j5-j3
+ MEND
+
+
+ ;// Scale 8x8 block input values with 32 bit scale values
+ ;//
+ ;// This macro is used to pre-scale block of 8x8 input.
+ ;// This also do the Ist stage transformations of IDCT.
+ ;//
+ ;// Input Registers:
+ ;//
+ ;// dXjnlo - n th input D register with first four S16 values
+ ;// dXjnhi - n th input D register with next four S16 values
+ ;// qXjn - n th input Q register with eight S16 values
+ ;// pScale - Pointer to 32bit scale values in Q23 format
+ ;//
+ ;// Output Registers:
+ ;//
+ ;// dXinlo - n th output D register with first four S16 output values of 1st stage
+ ;// dXinhi - n th output D register with next four S16 output values of 1st stage
+ ;//
+ MACRO
+ M_IDCT_PRESCALE32
+ ;// Register aliases. Only two Q-register pairs hold scale values and
+ ;// two pairs hold widened source rows; every row gets its own alias
+ ;// name for readability. Because the registers are shared, the
+ ;// instruction order below matters: a register is reused only after
+ ;// the previous result held in it has been narrowed out.
+qScale0lo QN 0.S32
+qScale0hi QN 1.S32
+qScale1lo QN 2.S32
+qScale1hi QN 3.S32
+qScale2lo QN qScale1lo
+qScale2hi QN qScale1hi
+qScale3lo QN qScale1lo
+qScale3hi QN qScale1hi
+qScale4lo QN qScale1lo
+qScale4hi QN qScale1hi
+qScale5lo QN qScale0lo
+qScale5hi QN qScale0hi
+qScale6lo QN qScale0lo
+qScale6hi QN qScale0hi
+qScale7lo QN qScale0lo
+qScale7hi QN qScale0hi
+
+ ;// NOTE(review): qSrc1hi reuses register number Src4, which presumably
+ ;// holds input row 4 (qXj4); that would explain why qXj4 is reloaded
+ ;// from memory before row 4 is scaled. Confirm against the register
+ ;// definitions earlier in the file.
+qSrc0lo QN 4.S32
+qSrc0hi QN 5.S32
+qSrc1lo QN 6.S32
+qSrc1hi QN Src4.S32
+qSrc2lo QN qSrc0lo
+qSrc2hi QN qSrc0hi
+qSrc3lo QN qSrc0lo
+qSrc3hi QN qSrc0hi
+qSrc4lo QN qSrc0lo
+qSrc4hi QN qSrc0hi
+qSrc5lo QN qSrc1lo
+qSrc5hi QN qSrc1hi
+qSrc6lo QN qSrc1lo
+qSrc6hi QN qSrc1hi
+qSrc7lo QN qSrc0lo
+qSrc7hi QN qSrc0hi
+
+ ;// The sum/difference results reuse the first scale register pair
+qRes17lo QN qScale0lo
+qRes17hi QN qScale0hi
+qRes26lo QN qScale0lo
+qRes26hi QN qScale0hi
+qRes53lo QN qScale0lo
+qRes53hi QN qScale0hi
+
+ ;// pScale walks forward through scale rows 0..4, pTemp walks backward
+ ;// through scale rows 7..5. One scale row is 8 x 4 bytes.
+ ADD pTemp, pScale, #4*8*7 ;// Address of pScale[7]
+
+ ;// Row 0
+ ;// Each input row is widened with a left shift of (12-1) and then
+ ;// multiplied by its scale row with VQRDMULH.
+ VLD1 {qScale0lo, qScale0hi}, [pScale]!
+ VSHLL qSrc0lo, dXj0lo, #(12-1)
+ VSHLL qSrc0hi, dXj0hi, #(12-1)
+ VLD1 {qScale1lo, qScale1hi}, [pScale]!
+ VQRDMULH qSrc0lo, qScale0lo, qSrc0lo
+ VQRDMULH qSrc0hi, qScale0hi, qSrc0hi
+ VLD1 {qScale7lo, qScale7hi}, [pTemp]!
+ VSHLL qSrc1lo, dXj1lo, #(12-1)
+ VSHLL qSrc1hi, dXj1hi, #(12-1)
+ VMOVN dXi0lo, qSrc0lo ;// Output i0
+ VMOVN dXi0hi, qSrc0hi
+ VSHLL qSrc7lo, dXj7lo, #(12-1)
+ VSHLL qSrc7hi, dXj7hi, #(12-1)
+ ;// Undo the 32-byte write-back of the load and step one row back:
+ ;// pTemp now addresses scale row 6
+ SUB pTemp, pTemp, #((16*2)+(4*8*1))
+ VQRDMULH qSrc1lo, qScale1lo, qSrc1lo
+ VQRDMULH qSrc1hi, qScale1hi, qSrc1hi
+ VQRDMULH qSrc7lo, qScale7lo, qSrc7lo
+ VQRDMULH qSrc7hi, qScale7hi, qSrc7hi
+ VLD1 {qScale2lo, qScale2hi}, [pScale]!
+
+ ;// Row 1 & 7
+ VHADD qRes17lo, qSrc1lo, qSrc7lo ;// (j1+j7)/2
+ VHADD qRes17hi, qSrc1hi, qSrc7hi ;// (j1+j7)/2
+ VMOVN dXi5lo, qRes17lo ;// Output i5
+ VMOVN dXi5hi, qRes17hi
+ VSUB qRes17lo, qSrc1lo, qSrc7lo ;// j1-j7
+ VSUB qRes17hi, qSrc1hi, qSrc7hi ;// j1-j7
+ VMOVN dXi6lo, qRes17lo ;// Output i6
+ VMOVN dXi6hi, qRes17hi
+ VSHLL qSrc2lo, dXj2lo, #(12-1)
+ VSHLL qSrc2hi, dXj2hi, #(12-1)
+ VLD1 {qScale6lo, qScale6hi}, [pTemp]!
+ VSHLL qSrc6lo, dXj6lo, #(12-1)
+ VSHLL qSrc6hi, dXj6hi, #(12-1)
+ ;// As above: pTemp now addresses scale row 5
+ SUB pTemp, pTemp, #((16*2)+(4*8*1))
+ VQRDMULH qSrc2lo, qScale2lo, qSrc2lo
+ VQRDMULH qSrc2hi, qScale2hi, qSrc2hi
+ VQRDMULH qSrc6lo, qScale6lo, qSrc6lo
+ VQRDMULH qSrc6hi, qScale6hi, qSrc6hi
+ VLD1 {qScale3lo, qScale3hi}, [pScale]!
+
+ ;// Row 2 & 6
+ VHADD qRes26lo, qSrc2lo, qSrc6lo ;// (j2+j6)/2
+ VHADD qRes26hi, qSrc2hi, qSrc6hi ;// (j2+j6)/2
+ VMOVN dXi3lo, qRes26lo ;// Output i3
+ VMOVN dXi3hi, qRes26hi
+ VSUB qRes26lo, qSrc2lo, qSrc6lo ;// j2-j6
+ VSUB qRes26hi, qSrc2hi, qSrc6hi ;// j2-j6
+ VMOVN dXi2lo, qRes26lo ;// Output i2
+ VMOVN dXi2hi, qRes26hi
+ VSHLL qSrc3lo, dXj3lo, #(12-1)
+ VSHLL qSrc3hi, dXj3hi, #(12-1)
+ VLD1 {qScale5lo, qScale5hi}, [pTemp]!
+ VSHLL qSrc5lo, dXj5lo, #(12-1)
+ VSHLL qSrc5hi, dXj5hi, #(12-1)
+ VQRDMULH qSrc3lo, qScale3lo, qSrc3lo
+ VQRDMULH qSrc3hi, qScale3hi, qSrc3hi
+ VQRDMULH qSrc5lo, qScale5lo, qSrc5lo
+ VQRDMULH qSrc5hi, qScale5hi, qSrc5hi
+
+ ;// Row 3 & 5
+ VHADD qRes53lo, qSrc5lo, qSrc3lo ;// (j5+j3)/2
+ VHADD qRes53hi, qSrc5hi, qSrc3hi ;// (j5+j3)/2
+ ;// Step pSrc back by 64 bytes (four rows of eight S16 values).
+ ;// NOTE(review): presumably pSrc points just past the 8x8 input block
+ ;// here, so this addresses input row 4 - confirm against the caller.
+ SUB pSrc, pSrc, #16*2*2
+ VMOVN dXi7lo, qRes53lo ;// Output i7
+ VMOVN dXi7hi, qRes53hi
+ VSUB qRes53lo, qSrc5lo, qSrc3lo ;// j5-j3
+ VSUB qRes53hi, qSrc5hi, qSrc3hi ;// j5-j3
+ VLD1 qXj4, [pSrc @64] ;// Reload input row 4 from memory
+ VMOVN dXi4lo, qRes53lo ;// Output i4
+ VMOVN dXi4hi, qRes53hi
+ VSHLL qSrc4lo, dXj4lo, #(12-1)
+ VSHLL qSrc4hi, dXj4hi, #(12-1)
+ VLD1 {qScale4lo, qScale4hi}, [pScale] ;// Scale row 4, no write-back
+ LDR pSrc, =armCOMM_IDCTCoef ;// Address of DCT inverse AAN constants
+ VQRDMULH qSrc4lo, qScale4lo, qSrc4lo
+ VQRDMULH qSrc4hi, qScale4hi, qSrc4hi
+ VLDR dCoefs, [pSrc] ;// Load DCT inverse AAN constants
+ ;// Row 4
+ VMOVN dXi1lo, qSrc4lo ;// Output i1
+ VMOVN dXi1hi, qSrc4hi
+
+ MEND
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
new file mode 100755
index 0000000..51118fd
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_MaskTable.h
@@ -0,0 +1,27 @@
+/**
+ *
+ * File Name: armCOMM_MaskTable.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Mask Table to mask the end of array
+ */
+
+
+
+#ifndef _ARMCOMM_MASKTABLE_H_
+#define _ARMCOMM_MASKTABLE_H_
+
+/* Number of entries in each of the mask tables declared below. */
+#define MaskTableSize 72
+
+/*
+ * Mask tables: read-only arrays of MaskTableSize entries, one with 16-bit
+ * and one with 8-bit elements. They are only declared here; the definitions
+ * live in a separate source file.
+ *
+ * NOTE(review): this header does not include the header that declares
+ * OMX_U16 / OMX_U8 (presumably omxtypes.h), so that header must be included
+ * before this one.
+ */
+
+extern const OMX_U16 armCOMM_qMaskTable16[MaskTableSize];
+extern const OMX_U8 armCOMM_qMaskTable8[MaskTableSize];
+
+#endif
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
new file mode 100755
index 0000000..41b3e1e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_Version.h
@@ -0,0 +1,43 @@
+/* Guard the header against multiple inclusion. */
+#ifndef __ARM_COMM_VERSION_H__
+#define __ARM_COMM_VERSION_H__
+
+
+/* The following line should be in omxtypes.h but hasn't been approved by OpenMAX yet */
+#define OMX_VERSION 102
+
+/*
+ * Two-level stringification helpers, needed to convert a #define number into
+ * a #define string. ARM_QUOTE(a) turns its argument into a string literal
+ * without expanding it; ARM_INDIRECT(A) expands its argument first and then
+ * passes the result to ARM_QUOTE, so a macro name is replaced by its value
+ * before being quoted.
+ */
+#define ARM_QUOTE(a) #a
+#define ARM_INDIRECT(A) ARM_QUOTE(A)
+
+/*
+ * Convert the OMX_VERSION number into a string that can be used, for example,
+ * to print it out. With OMX_VERSION defined as 102 this expands to "102".
+ */
+#define ARM_VERSION_STRING ARM_INDIRECT(OMX_VERSION)
+
+
+/* Define this in order to turn on ARM version/release/build strings in each domain */
+#define ARM_INCLUDE_VERSION_DESCRIPTIONS
+
+/* One version description string per domain; declared here, defined elsewhere. */
+#ifdef ARM_INCLUDE_VERSION_DESCRIPTIONS
+ extern const char * const omxAC_VersionDescription;
+ extern const char * const omxIC_VersionDescription;
+ extern const char * const omxIP_VersionDescription;
+ extern const char * const omxSP_VersionDescription;
+ extern const char * const omxVC_VersionDescription;
+#endif /* ARM_INCLUDE_VERSION_DESCRIPTIONS */
+
+
+/* The following entries should be automatically updated by the release script */
+/* They are used in the ARM version strings defined for each domain. */
+
+/* The release tag associated with this release of the library. - used for source and object releases */
+#define OMX_ARM_RELEASE_TAG "r1p0-00bet0"
+
+/* The ARM architecture used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_ARCHITECTURE "ARM Architecture V7 with NEON"
+
+/* The ARM Toolchain used to build any objects or executables in this release. */
+#define OMX_ARM_BUILD_TOOLCHAIN "ARM RVCT 3.1"
+
+
+#endif /* __ARM_COMM_VERSION_H__ */
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
new file mode 100755
index 0000000..0956bd1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armCOMM_s.h
@@ -0,0 +1,1157 @@
+;//
+;//
+;// File Name: armCOMM_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// ARM optimized OpenMAX common header file
+;//
+
+;// Protect against multiple inclusion
+ IF :LNOT::DEF:ARMCOMM_S_H
+ GBLL ARMCOMM_S_H
+
+ REQUIRE8 ;// Requires 8-byte stack alignment
+ PRESERVE8 ;// Preserves 8-byte stack alignment
+
+ GBLL ARM_ERRORCHECK
+ARM_ERRORCHECK SETL {FALSE}
+
+;// Globals
+;// Assembly-time variables shared by the macros in this file
+
+ GBLS _RRegList ;// R saved register list
+ GBLS _DRegList ;// D saved register list
+ GBLS _Variant ;// Selected processor variant
+ GBLS _CPU ;// CPU name
+ GBLS _Struct ;// Structure name
+
+ GBLL _InFunc ;// Inside function assembly flag
+ GBLL _SwLong ;// Long switch flag
+
+ GBLA _RBytes ;// Number of register bytes on stack
+ GBLA _SBytes ;// Number of scratch bytes on stack
+ GBLA _ABytes ;// Stack offset of next argument
+ GBLA _Workspace ;// Stack offset of scratch workspace
+ GBLA _F ;// Function number
+ GBLA _StOff ;// Struct offset
+ GBLA _SwNum ;// Switch number
+ GBLS _32 ;// Suffix for 32 byte alignment
+ GBLS _16 ;// Suffix for 16 byte alignment
+
+;// Initial values
+_InFunc SETL {FALSE}
+_SBytes SETA 0
+_F SETA 0
+_SwNum SETA 0
+_32 SETS "ALIGN32"
+_16 SETS "ALIGN16"
+
+;/////////////////////////////////////////////////////////
+;// Override the tools settings of the CPU if the symbol
+;// OVERRIDECPU is defined, otherwise use the CPU defined by
+;// the assembler settings ({CPU}).
+;/////////////////////////////////////////////////////////
+
+ IF :DEF: OVERRIDECPU
+_CPU SETS OVERRIDECPU
+ ELSE
+_CPU SETS {CPU}
+ ENDIF
+
+
+
+;/////////////////////////////////////////////////////////
+;// Work out which code to build
+;/////////////////////////////////////////////////////////
+
+ ;// Report a diagnostic if one of these variant symbols is already
+ ;// defined at this point: variants are meant to be selected with
+ ;// the M_VARIANTS macro instead.
+ IF :DEF:ARM1136JS:LOR::DEF:CortexA8:LOR::DEF:ARM_GENERIC
+ INFO 1,"Please switch to using M_VARIANTS"
+ ENDIF
+
+ ;// Define and reset all officially recognised variants
+ ;// (invokes _M_DEF_VARIANT once per variant name)
+ MACRO
+ _M_DEF_VARIANTS
+ _M_DEF_VARIANT ARM926EJS
+ _M_DEF_VARIANT ARM1136JS
+ _M_DEF_VARIANT ARM1136JS_U
+ _M_DEF_VARIANT CortexA8
+ _M_DEF_VARIANT ARM7TDMI
+ MEND
+
+ ;// Declare one variant
+ ;// $var = variant name; declared as a global logical and set to FALSE
+ ;// _ok$var is declared as well so that _M_VARIANT can tell a
+ ;// recognised variant name from an unknown one
+ MACRO
+ _M_DEF_VARIANT $var
+ GBLL $var
+ GBLL _ok$var
+$var SETL {FALSE}
+ MEND
+
+
+ ;// Variant declaration
+ ;//
+ ;// Define a list of code variants supported by this
+ ;// source file. This macro then chooses the most
+ ;// appropriate variant to build for the currently configured
+ ;// core.
+ ;//
+ ;// $v0..$v7 = names of the variants implemented by the source file
+ ;// (unused positions may be left empty)
+ ;//
+ ;// On exit exactly one variant symbol is TRUE (the selected one, also
+ ;// named in _Variant) and all others are FALSE. If nothing matches the
+ ;// configured CPU, a diagnostic is raised and all variants stay FALSE.
+ MACRO
+ M_VARIANTS $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7
+ ;// Set to TRUE variants that are supported
+ _M_DEF_VARIANTS
+ _M_VARIANT $v0
+ _M_VARIANT $v1
+ _M_VARIANT $v2
+ _M_VARIANT $v3
+ _M_VARIANT $v4
+ _M_VARIANT $v5
+ _M_VARIANT $v6
+ _M_VARIANT $v7
+
+ ;// Look for first available variant to match a CPU
+ ;// _M_TRY cpu, variant fall back list
+ ;// NOTE(review): the ARM7TDMI entry has no variant list, so _M_TRY
+ ;// cannot select any variant for that CPU even though an ARM7TDMI
+ ;// variant is defined by _M_DEF_VARIANTS - confirm this is intended.
+_Variant SETS ""
+ _M_TRY ARM926EJ-S, ARM926EJS
+ _M_TRY ARM1176JZ-S, ARM1136JS
+ _M_TRY ARM1176JZF-S, ARM1136JS
+ _M_TRY ARM1156T2-S, ARM1136JS
+ _M_TRY ARM1156T2F-S, ARM1136JS
+ _M_TRY ARM1136J-S, ARM1136JS
+ _M_TRY ARM1136JF-S, ARM1136JS
+ _M_TRY MPCore, ARM1136JS
+ _M_TRY falcon-vfp, ARM1136JS
+ _M_TRY falcon-full-neon, CortexA8
+ _M_TRY Cortex-A8NoNeon, ARM1136JS
+ _M_TRY Cortex-A8, CortexA8, ARM1136JS
+ _M_TRY Cortex-R4, ARM1136JS
+ _M_TRY ARM7TDMI
+
+ ;// Select the correct variant
+ ;// (all variants are reset to FALSE first, then only the chosen
+ ;// one is set back to TRUE)
+ _M_DEF_VARIANTS
+ IF _Variant=""
+ INFO 1, "No match found for CPU '$_CPU'"
+ ELSE
+$_Variant SETL {TRUE}
+ ENDIF
+ MEND
+
+ ;// Register a variant as available
+ ;// $var = variant name; an empty name is ignored
+ ;// A name that was not declared by _M_DEF_VARIANT (no _ok$var symbol)
+ ;// raises a diagnostic.
+ MACRO
+ _M_VARIANT $var
+ IF "$var"=""
+ MEXIT
+ ENDIF
+ IF :LNOT::DEF:_ok$var
+ INFO 1, "Unrecognized variant '$var'"
+ ENDIF
+$var SETL {TRUE}
+ MEND
+
+ ;// For a given CPU, see if any of the variants supporting
+ ;// this CPU are available. The first available variant is
+ ;// chosen
+ ;// $cpu = CPU name to compare against _CPU; the macro does
+ ;// nothing unless they are equal
+ ;// $v0..$v7 = variants usable on this CPU, in order of preference
+ MACRO
+ _M_TRY $cpu, $v0,$v1,$v2,$v3,$v4,$v5,$v6,$v7
+ IF "$cpu"<>_CPU
+ MEXIT
+ ENDIF
+ _M_TRY1 $v0
+ _M_TRY1 $v1
+ _M_TRY1 $v2
+ _M_TRY1 $v3
+ _M_TRY1 $v4
+ _M_TRY1 $v5
+ _M_TRY1 $v6
+ _M_TRY1 $v7
+ ;// Check a match was found
+ IF _Variant=""
+ INFO 1, "No variant match found for CPU '$_CPU'"
+ ENDIF
+ MEND
+
+ ;// Try a single variant
+ ;// $var = variant name; an empty name is ignored
+ ;// If no variant has been selected yet and $var is TRUE (registered as
+ ;// available), record it in _Variant.
+ MACRO
+ _M_TRY1 $var
+ IF "$var"=""
+ MEXIT
+ ENDIF
+ IF (_Variant=""):LAND:$var
+_Variant SETS "$var"
+ ENDIF
+ MEND
+
+;////////////////////////////////////////////////////////
+;// Structure definition
+;////////////////////////////////////////////////////////
+
+ ;// Declare a structure of given name
+ ;// Records the name in _Struct and restarts the running field
+ ;// offset _StOff at zero.
+ MACRO
+ M_STRUCT $sname
+_Struct SETS "$sname"
+_StOff SETA 0
+ MEND
+
+ ;// Declare a structure field
+ ;// The field is called $sname_$fname
+ ;// $size = the size of each entry, must be power of 2
+ ;// $number = (if provided) the number of entries for an array
+ ;// The symbol is defined as the byte offset of the field within the
+ ;// structure currently being declared.
+ MACRO
+ M_FIELD $fname, $size, $number
+ ;// Round the running offset up to a multiple of $size
+ IF (_StOff:AND:($size-1))!=0
+_StOff SETA _StOff + ($size - (_StOff:AND:($size-1)))
+ ENDIF
+$_Struct._$fname EQU _StOff
+ ;// Advance the running offset past the field (or the whole array)
+ IF "$number"<>""
+_StOff SETA _StOff + $size*$number
+ ELSE
+_StOff SETA _StOff + $size
+ ENDIF
+ MEND
+
+
+ ;// End the current structure declaration
+ ;// Defines sizeof_<structure name> as the running offset reached so far
+ ;// and clears the current structure name.
+ MACRO
+ M_ENDSTRUCT
+sizeof_$_Struct EQU _StOff
+_Struct SETS ""
+ MEND
+
+;//////////////////////////////////////////////////////////
+;// Switch and table macros
+;//////////////////////////////////////////////////////////
+
+ ;// Start a relative switch table with register to switch on
+ ;//
+ ;// $v = the register to switch on
+ ;// $s = if specified must be "L" to indicate long
+ ;// this allows a greater range to the case code
+ ;//
+ ;// Must be followed directly by one M_CASE per case value and then
+ ;// by M_ENDSWITCH.
+ MACRO
+ M_SWITCH $v, $s
+ ASSERT "$s"="":LOR:"$s"="L"
+_SwLong SETL {FALSE}
+ IF "$s"="L"
+_SwLong SETL {TRUE}
+ ENDIF
+ ;// Each switch gets its own number so that its table label is unique
+_SwNum SETA _SwNum+1
+ IF {CONFIG}=16
+ ;// Thumb
+ ;// Table branch through a table of halfword (long) or byte offsets;
+ ;// the table entries emitted by M_CASE start at the label below
+ IF _SwLong
+ TBH [pc, $v, LSL#1]
+ ELSE
+ TBB [pc, $v]
+ ENDIF
+_Switch$_SwNum
+ ELSE
+ ;// ARM
+ ;// pc reads two instructions ahead of the ADD, so case 0 lands on
+ ;// the first branch emitted by M_CASE, just after the NOP
+ ADD pc, pc, $v, LSL #2
+ NOP
+ ENDIF
+ MEND
+
+ ;// Add a case to the switch statement
+ ;// $label = code label to jump to for this case value
+ MACRO
+ M_CASE $label
+ IF {CONFIG}=16
+ ;// Thumb
+ ;// Table entry: halfword count from the table start to the label,
+ ;// stored as 16 bits for a long switch and 8 bits otherwise
+ IF _SwLong
+ DCW ($label - _Switch$_SwNum)/2
+ ELSE
+ DCB ($label - _Switch$_SwNum)/2
+ ENDIF
+ ELSE
+ ;// ARM
+ ;// Table entry is a branch instruction
+ B $label
+ ENDIF
+ MEND
+
+ ;// End of switch statement
+ ;// Re-aligns to a 2-byte boundary after the table (a table of byte
+ ;// entries may have an odd length)
+ MACRO
+ M_ENDSWITCH
+ ALIGN 2
+ MEND
+
+
+;////////////////////////////////////////////////////////
+;// Data area allocation
+;////////////////////////////////////////////////////////
+
+ ;// Constant table allocator macro
+ ;//
+ ;// Creates a new section for each constant table
+ ;// $name is symbol through which the table can be accessed.
+ ;// $align is the optional alignment of the table, log2 of
+ ;// the byte alignment - $align=4 is 16 byte aligned
+ ;//
+ ;// Must be used outside a function (between M_END and M_START). The
+ ;// table data itself follows the macro invocation.
+ MACRO
+ M_TABLE $name, $align
+ ASSERT :LNOT:_InFunc
+ IF "$align"=""
+ AREA |.constdata|, READONLY, DATA
+ ELSE
+ ;// AREAs inherit the alignment of the first declaration.
+ ;// Therefore for each alignment size we must have an area
+ ;// of a different name.
+ AREA constdata_a$align, READONLY, DATA, ALIGN=$align
+
+ ;// We also force alignment incase we are tagging onto
+ ;// an already started area.
+ ALIGN (1<<$align)
+ ENDIF
+$name
+ MEND
+
+;/////////////////////////////////////////////////////
+;// Macros to allocate space on the stack
+;//
+;// These all assume that the stack is 8-byte aligned
+;// at entry to the function, which means that the
+;// 32-byte alignment macro needs to work in a
+;// bit more of a special way...
+;/////////////////////////////////////////////////////
+
+
+
+
+ ;// Allocate 1-byte aligned area of name
+ ;// $name size $size bytes.
+ ;// Must be used before M_START. The symbol <name><function number> is
+ ;// defined as the offset of the area within the scratch space.
+ MACRO
+ M_ALLOC1 $name, $size
+ ASSERT :LNOT:_InFunc
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+ ;// Allocate 2-byte aligned area of name
+ ;// $name size $size bytes.
+ ;// Must be used before M_START. The symbol <name><function number> is
+ ;// defined as the offset of the area within the scratch space.
+ MACRO
+ M_ALLOC2 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// Round the scratch size up to a multiple of 2 first
+ IF (_SBytes:AND:1)!=0
+_SBytes SETA _SBytes + (2 - (_SBytes:AND:1))
+ ENDIF
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+ ;// Allocate 4-byte aligned area of name
+ ;// $name size $size bytes.
+ ;// Must be used before M_START. The symbol <name><function number> is
+ ;// defined as the offset of the area within the scratch space.
+ MACRO
+ M_ALLOC4 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// Round the scratch size up to a multiple of 4 first
+ IF (_SBytes:AND:3)!=0
+_SBytes SETA _SBytes + (4 - (_SBytes:AND:3))
+ ENDIF
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+ ;// Allocate 8-byte aligned area of name
+ ;// $name size $size bytes.
+ ;// Must be used before M_START. The symbol <name><function number> is
+ ;// defined as the offset of the area within the scratch space.
+ MACRO
+ M_ALLOC8 $name, $size
+ ASSERT :LNOT:_InFunc
+ ;// Round the scratch size up to a multiple of 8 first
+ IF (_SBytes:AND:7)!=0
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+$name$_F EQU _SBytes
+_SBytes SETA _SBytes + ($size)
+ MEND
+
+
+ ;// Allocate 8-byte aligned area of name
+ ;// $name size ($size+8) bytes.
+ ;// The extra 8 bytes are later used to align the pointer to 16 bytes:
+ ;// the symbol points 8 bytes into the area and M_ADR16 rounds the
+ ;// resulting address down to a 16-byte boundary. With an 8-byte
+ ;// aligned stack that moves the address down by 0 or 8 bytes, so the
+ ;// $size bytes that follow stay inside the area.
+
+ MACRO
+ M_ALLOC16 $name, $size
+ ASSERT :LNOT:_InFunc
+ IF (_SBytes:AND:7)!=0
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+$name$_F$_16 EQU (_SBytes + 8)
+_SBytes SETA _SBytes + ($size) + 8
+ MEND
+
+ ;// Allocate 8-byte aligned area of name
+ ;// $name size ($size+24) bytes.
+ ;// The extra 24 bytes are later used to align the pointer to 32 bytes:
+ ;// the symbol points 24 bytes into the area and M_ADR32 rounds the
+ ;// resulting address down to a 32-byte boundary. With an 8-byte
+ ;// aligned stack that moves the address down by at most 24 bytes, so
+ ;// the $size bytes that follow stay inside the area.
+
+ MACRO
+ M_ALLOC32 $name, $size
+ ASSERT :LNOT:_InFunc
+ IF (_SBytes:AND:7)!=0
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+$name$_F$_32 EQU (_SBytes + 24)
+_SBytes SETA _SBytes + ($size) + 24
+ MEND
+
+
+
+
+ ;// Argument Declaration Macro
+ ;//
+ ;// Allocate an argument name $name
+ ;// size $size bytes
+ ;//
+ ;// Must be used after M_START. The symbol <name><function number> is
+ ;// defined as the current argument offset _ABytes, which M_START
+ ;// initialises to the size of the scratch area plus the saved
+ ;// registers; each M_ARG then advances it by $size.
+ MACRO
+ M_ARG $name, $size
+ ASSERT _InFunc
+$name$_F EQU _ABytes
+_ABytes SETA _ABytes + ($size)
+ MEND
+
+;///////////////////////////////////////////////
+;// Macros to access stacked variables
+;///////////////////////////////////////////////
+
+ ;// Macro to perform a data processing operation
+ ;// with a constant second operand
+ ;//
+ ;// $op = instruction (optionally with condition), eg ADD
+ ;// $rd = destination register
+ ;// $rn = first operand register
+ ;// $const = assembly-time constant
+ ;//
+ ;// The constant is split into at most two immediates: an 8-bit chunk
+ ;// taken above the trailing pairs of zero bits, and the remainder.
+ ;// NOTE(review): the remainder is emitted as a single immediate, so it
+ ;// must itself be encodable for the assembler to accept it.
+ MACRO
+ _M_OPC $op,$rd,$rn,$const
+ LCLA _sh
+ LCLA _cst
+_sh SETA 0
+_cst SETA $const
+ IF _cst=0
+ $op $rd, $rn, #_cst
+ MEXIT
+ ENDIF
+ ;// Strip trailing zero bits two at a time, remembering the shift
+ WHILE (_cst:AND:3)=0
+_cst SETA _cst>>2
+_sh SETA _sh+2
+ WEND
+ ;// Low 8 bits first, from $rn into $rd
+ $op $rd, $rn, #(_cst:AND:0x000000FF)<<_sh
+ ;// Then the remaining bits, if any, applied on top of $rd
+ IF _cst>=256
+ $op $rd, $rd, #(_cst:AND:0xFFFFFF00)<<_sh
+ ENDIF
+ MEND
+
+ ;// Macro to perform a data access operation
+ ;// Such as LDR or STR
+ ;// The addressing mode is modified such that
+ ;// 1. If no address is given then the name is taken
+ ;// as a stack offset
+ ;// 2. If the addressing mode is not available for the
+ ;// state being assembled for (eg Thumb) then a suitable
+ ;// addressing mode is substituted.
+ ;//
+ ;// On Entry:
+ ;// $i = Instruction to perform (eg "LDRB")
+ ;// $a = Required byte alignment
+ ;// $r = Register(s) to transfer (eg "r1")
+ ;// $a0,$a1,$a2. Addressing mode and condition. One of:
+ ;// label {,cc}
+ ;// [base] {,,,cc}
+ ;// [base, offset]{!} {,,cc}
+ ;// [base, offset, shift]{!} {,cc}
+ ;// [base], offset {,,cc}
+ ;// [base], offset, shift {,cc}
+ MACRO
+ _M_DATA $i,$a,$r,$a0,$a1,$a2,$a3
+ IF "$a0":LEFT:1="["
+ ;// Explicit address; the condition, if any, is in $a3
+ IF "$a1"=""
+ ;// [base]
+ $i$a3 $r, $a0
+ ELSE
+ IF "$a0":RIGHT:1="]"
+ ;// [base], offset {, shift} - post indexed
+ IF "$a2"=""
+ _M_POSTIND $i$a3, "$r", $a0, $a1
+ ELSE
+ _M_POSTIND $i$a3, "$r", $a0, "$a1,$a2"
+ ENDIF
+ ELSE
+ ;// [base, offset {, shift}]{!} - offset or pre indexed
+ IF "$a2"=""
+ _M_PREIND $i$a3, "$r", $a0, $a1
+ ELSE
+ _M_PREIND $i$a3, "$r", $a0, "$a1,$a2"
+ ENDIF
+ ENDIF
+ ENDIF
+ ELSE
+ ;// Stack variable name; the condition, if any, is in $a1.
+ ;// The sp-relative offset must satisfy the required alignment $a.
+ LCLA _Offset
+_Offset SETA _Workspace + $a0$_F
+ ASSERT (_Offset:AND:($a-1))=0
+ $i$a1 $r, [sp, #_Offset]
+ ENDIF
+ MEND
+
+ ;// Handle post indexed load/stores
+ ;// op reg, [base], offset
+ ;//
+ ;// $i = instruction, $r = register(s), $a0 = "[base]", $a1 = offset
+ ;// For ARM the instruction is emitted unchanged. For Thumb it is
+ ;// replaced by an access at [base] followed by a separate ADD or SUB
+ ;// of the offset to the base register.
+ ;// NOTE(review): the separate ADD/SUB carries no condition code even
+ ;// when $i does - confirm that conditional use is not expected in Thumb.
+ MACRO
+ _M_POSTIND $i,$r,$a0,$a1
+ LCLS _base
+ LCLS _offset
+ IF {CONFIG}=16 ;// Thumb
+_base SETS ("$a0":LEFT:(:LEN:"$a0"-1)):RIGHT:(:LEN:"$a0"-2) ;// remove []
+_offset SETS "$a1"
+ ;// Drop a leading "+" from the offset
+ IF _offset:LEFT:1="+"
+_offset SETS _offset:RIGHT:(:LEN:_offset-1)
+ ENDIF
+ $i $r, $a0
+ ;// A leading "-" turns the base update into a SUB
+ IF _offset:LEFT:1="-"
+_offset SETS _offset:RIGHT:(:LEN:_offset-1)
+ SUB $_base, $_base, $_offset
+ ELSE
+ ADD $_base, $_base, $_offset
+ ENDIF
+ ELSE ;// ARM
+ $i $r, $a0, $a1
+ ENDIF
+ MEND
+
+ ;// Handle pre indexed load/store
+ ;// op reg, [base, offset]{!}
+ ;//
+ ;// $i = instruction, $r = register(s), $a0 = "[base", $a1 = "offset]"
+ ;// or "offset]!". Only the Thumb write-back form is rewritten: it
+ ;// becomes an access at [base, offset] followed by a separate ADD of
+ ;// the offset to the base register. Everything else is emitted
+ ;// unchanged.
+ MACRO
+ _M_PREIND $i,$r,$a0,$a1
+ LCLS _base
+ LCLS _offset
+ IF ({CONFIG}=16):LAND:(("$a1":RIGHT:2)="]!")
+_base SETS "$a0":RIGHT:(:LEN:("$a0")-1)
+_offset SETS "$a1":LEFT:(:LEN:("$a1")-2)
+ $i $r, [$_base, $_offset]
+ ADD $_base, $_base, $_offset
+ ELSE
+ $i $r, $a0, $a1
+ ENDIF
+ MEND
+
+ ;// The load/store macros below all forward to _M_DATA with the
+ ;// instruction name and the byte alignment required of a stack offset.
+ ;// $a0..$a3 are either a stack variable name (plus optional condition)
+ ;// or an explicit addressing mode, as described for _M_DATA.
+
+ ;// Load unsigned byte from stack variable or explicit address
+ MACRO
+ M_LDRB $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRB",1,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load signed byte from stack variable or explicit address
+ MACRO
+ M_LDRSB $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRSB",1,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store byte to stack variable or explicit address
+ MACRO
+ M_STRB $r,$a0,$a1,$a2,$a3
+ _M_DATA "STRB",1,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load unsigned half word from stack variable or explicit address
+ MACRO
+ M_LDRH $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRH",2,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load signed half word from stack variable or explicit address
+ MACRO
+ M_LDRSH $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDRSH",2,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store half word to stack variable or explicit address
+ MACRO
+ M_STRH $r,$a0,$a1,$a2,$a3
+ _M_DATA "STRH",2,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load word from stack variable or explicit address
+ MACRO
+ M_LDR $r,$a0,$a1,$a2,$a3
+ _M_DATA "LDR",4,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store word to stack variable or explicit address
+ MACRO
+ M_STR $r,$a0,$a1,$a2,$a3
+ _M_DATA "STR",4,$r,$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Load double word (register pair $r0,$r1) from stack variable or
+ ;// explicit address
+ MACRO
+ M_LDRD $r0,$r1,$a0,$a1,$a2,$a3
+ _M_DATA "LDRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Store double word (register pair $r0,$r1) to stack variable or
+ ;// explicit address
+ MACRO
+ M_STRD $r0,$r1,$a0,$a1,$a2,$a3
+ _M_DATA "STRD",8,"$r0,$r1",$a0,$a1,$a2,$a3
+ MEND
+
+ ;// Get absolute address of stack allocated location
+ ;// $a = destination register
+ ;// $b = name of the stack variable
+ ;// $cc = optional condition code
+ MACRO
+ M_ADR $a, $b, $cc
+ _M_OPC ADD$cc, $a, sp, (_Workspace + $b$_F)
+ MEND
+
+ ;// Get absolute address of stack allocated location and align the address to 16 bytes
+ ;// $a = destination register
+ ;// $b = name of a stack variable declared with M_ALLOC16
+ ;// $cc = optional condition code
+ MACRO
+ M_ADR16 $a, $b, $cc
+ _M_OPC ADD$cc, $a, sp, (_Workspace + $b$_F$_16)
+
+ ;// Now align $a to 16 bytes
+ ;// (rounds down; M_ALLOC16 reserved the slack for this)
+ BIC$cc $a,$a,#0x0F
+ MEND
+
+ ;// Get absolute address of stack allocated location and align the address to 32 bytes
+ ;// $a = destination register
+ ;// $b = name of a stack variable declared with M_ALLOC32
+ ;// $cc = optional condition code
+ MACRO
+ M_ADR32 $a, $b, $cc
+ _M_OPC ADD$cc, $a, sp, (_Workspace + $b$_F$_32)
+
+ ;// Now align $a to 32 bytes
+ ;// (rounds down; M_ALLOC32 reserved the slack for this)
+ BIC$cc $a,$a,#0x1F
+ MEND
+
+;//////////////////////////////////////////////////////////
+;// Function header and footer macros
+;//////////////////////////////////////////////////////////
+
+ ;// Function Header Macro
+ ;// Generates the function prologue
+ ;// Note that functions should all be "stack-moves-once"
+ ;// The FNSTART and FNEND macros should be the only places
+ ;// where the stack moves.
+ ;//
+ ;// $name = function name
+ ;// $rreg = "" don't stack any registers
+ ;// "lr" stack registers "r4,lr"
+ ;// "rN" stack registers "r4-rM,lr", where rM is rN rounded
+ ;// up so that the saved block is a multiple of 8 bytes
+ ;// (see _M_GETRREGLIST)
+ ;// $dreg = "" don't stack any D registers
+ ;// "dN" stack registers "d8-dN"
+ ;//
+ ;// Note: ARM Archicture procedure call standard AAPCS
+ ;// states that r4-r11, sp, d8-d15 must be preserved by
+ ;// a compliant function.
+ MACRO
+ M_START $name, $rreg, $dreg
+ ASSERT :LNOT:_InFunc
+ ASSERT "$name"!=""
+_InFunc SETL {TRUE}
+_RBytes SETA 0
+_Workspace SETA 0
+
+ ;// Create an area for the function
+ AREA |.text|, CODE
+ EXPORT $name
+$name FUNCTION
+
+ ;// Save R registers
+ _M_GETRREGLIST $rreg
+ IF _RRegList<>""
+ STMFD sp!, {$_RRegList, lr}
+ ENDIF
+
+ ;// Save D registers
+ _M_GETDREGLIST $dreg
+ IF _DRegList<>""
+ VSTMFD sp!, {$_DRegList}
+ ENDIF
+
+
+ ;// Ensure size claimed on stack is 8-byte aligned
+ IF ((_SBytes:AND:7)!=0)
+_SBytes SETA _SBytes + (8 - (_SBytes:AND:7))
+ ENDIF
+
+ ;// Reserve the scratch area declared with the M_ALLOC macros
+ IF (_SBytes!=0)
+ _M_OPC SUB, sp, sp, _SBytes
+ ENDIF
+
+
+ ;// Offset from the new sp to the first stacked argument: the scratch
+ ;// area plus the saved registers. M_ARG hands out offsets from here.
+_ABytes SETA _SBytes + _RBytes - _Workspace
+
+
+ ;// Print function name if debug enabled
+ M_PRINTF "$name\n",
+ MEND
+
+ ;// Work out a list of R saved registers
+ ;//
+ ;// $rreg = highest register the function needs saved, "lr", or ""
+ ;// Sets _RRegList to the register range to push (lr is added by the
+ ;// caller of this macro) and adds the pushed size, including lr, to
+ ;// _RBytes. The range is always chosen so that, together with lr, an
+ ;// even number of registers is saved.
+ ;// An unrecognised value raises a diagnostic and leaves _RRegList and
+ ;// _RBytes unchanged.
+ MACRO
+ _M_GETRREGLIST $rreg
+ IF "$rreg"=""
+_RRegList SETS ""
+ MEXIT
+ ENDIF
+ IF "$rreg"="lr":LOR:"$rreg"="r4"
+_RRegList SETS "r4"
+_RBytes SETA _RBytes+8
+ MEXIT
+ ENDIF
+ IF "$rreg"="r5":LOR:"$rreg"="r6"
+_RRegList SETS "r4-r6"
+_RBytes SETA _RBytes+16
+ MEXIT
+ ENDIF
+ IF "$rreg"="r7":LOR:"$rreg"="r8"
+_RRegList SETS "r4-r8"
+_RBytes SETA _RBytes+24
+ MEXIT
+ ENDIF
+ IF "$rreg"="r9":LOR:"$rreg"="r10"
+_RRegList SETS "r4-r10"
+_RBytes SETA _RBytes+32
+ MEXIT
+ ENDIF
+ IF "$rreg"="r11":LOR:"$rreg"="r12"
+_RRegList SETS "r4-r12"
+_RBytes SETA _RBytes+40
+ MEXIT
+ ENDIF
+ INFO 1, "Unrecognized saved r register limit '$rreg'"
+ MEND
+
+ ;// Work out a list of D saved registers
+ ;//
+ ;// $dreg = highest D register the function needs saved ("d8".."d15"),
+ ;// or "" for none
+ ;// Sets _DRegList to the range d8-$dreg and adds 8 bytes per saved
+ ;// register to _RBytes.
+ ;// An unrecognised value raises a diagnostic and leaves _DRegList and
+ ;// _RBytes unchanged.
+ MACRO
+ _M_GETDREGLIST $dreg
+ IF "$dreg"=""
+_DRegList SETS ""
+ MEXIT
+ ENDIF
+ IF "$dreg"="d8"
+_DRegList SETS "d8"
+_RBytes SETA _RBytes+8
+ MEXIT
+ ENDIF
+ IF "$dreg"="d9"
+_DRegList SETS "d8-d9"
+_RBytes SETA _RBytes+16
+ MEXIT
+ ENDIF
+ IF "$dreg"="d10"
+_DRegList SETS "d8-d10"
+_RBytes SETA _RBytes+24
+ MEXIT
+ ENDIF
+ IF "$dreg"="d11"
+_DRegList SETS "d8-d11"
+_RBytes SETA _RBytes+32
+ MEXIT
+ ENDIF
+ IF "$dreg"="d12"
+_DRegList SETS "d8-d12"
+_RBytes SETA _RBytes+40
+ MEXIT
+ ENDIF
+ IF "$dreg"="d13"
+_DRegList SETS "d8-d13"
+_RBytes SETA _RBytes+48
+ MEXIT
+ ENDIF
+ IF "$dreg"="d14"
+_DRegList SETS "d8-d14"
+_RBytes SETA _RBytes+56
+ MEXIT
+ ENDIF
+ IF "$dreg"="d15"
+_DRegList SETS "d8-d15"
+_RBytes SETA _RBytes+64
+ MEXIT
+ ENDIF
+ INFO 1, "Unrecognized saved d register limit '$dreg'"
+ MEND
+
+ ;// Produce function return instructions
+ ;// $cc = optional condition code
+ ;// Restores the D registers, then the R registers, that M_START saved
+ ;// (in reverse order of saving). Assumes sp already points at the
+ ;// saved registers, ie the scratch area has been released.
+ MACRO
+ _M_RET $cc
+ IF _DRegList<>""
+ VPOP$cc {$_DRegList}
+ ENDIF
+ IF _RRegList=""
+ ;// Nothing was pushed: plain return
+ BX$cc lr
+ ELSE
+ ;// Return by loading the saved lr straight into pc
+ LDM$cc.FD sp!, {$_RRegList, pc}
+ ENDIF
+ MEND
+
+ ;// Early Function Exit Macro
+ ;// $cc = condition to exit with
+ ;// (Example: M_EXIT EQ)
+ ;// When the function has a scratch area the exit goes through the
+ ;// common epilogue generated by M_END, which releases it first.
+ MACRO
+ M_EXIT $cc
+ ASSERT _InFunc
+ IF _SBytes!=0
+ ;// Restore stack frame and exit
+ B$cc _End$_F
+ ELSE
+ ;// Can return directly
+ _M_RET $cc
+ ENDIF
+ MEND
+
+ ;// Function Footer Macro
+ ;// Generates the function epilogue
+ ;// Also defines the label _End<function number>, the target used by
+ ;// M_EXIT for early exits.
+ MACRO
+ M_END
+ ASSERT _InFunc
+_InFunc SETL {FALSE}
+_End$_F
+
+ ;// Restore the stack pointer to its original value on function entry
+ IF _SBytes!=0
+ _M_OPC ADD, sp, sp, _SBytes
+ ENDIF
+ _M_RET
+ ENDFUNC
+
+ ;// Reset the global stack tracking variables back to their
+ ;// initial values, and increment the function count
+_SBytes SETA 0
+_F SETA _F+1
+ MEND
+
+
+;//==========================================================================
+;// Debug Macros
+;//==========================================================================
+
+ ;// Assembly-time switches for the debug macros, both off by default:
+ ;// DEBUG_ON enables the code generated by M_PRINTF,
+ ;// DEBUG_STALLS_ON enables the code generated by M_STALL.
+ GBLL DEBUG_ON
+DEBUG_ON SETL {FALSE}
+ GBLL DEBUG_STALLS_ON
+DEBUG_STALLS_ON SETL {FALSE}
+
+ ;//==========================================================================
+ ;// Debug call to printf
+ ;// M_PRINTF $format, $val0, $val1, $val2
+ ;//
+ ;// Examples:
+ ;// M_PRINTF "x=%08x\n", r0
+ ;//
+ ;// This macro preserves the value of all registers including the
+ ;// flags.
+ ;//
+ ;// $format = printf format string
+ ;// $val0..2 = up to three registers to print; must be given in order
+ ;// without gaps
+ ;// Generates no code unless DEBUG_ON is TRUE.
+ ;//==========================================================================
+
+ MACRO
+ M_PRINTF $format, $val0, $val1, $val2
+ IF DEBUG_ON
+
+ IMPORT printf
+ LCLA nArgs
+nArgs SETA 0
+
+ ;// save registers so we don't corrupt them
+ STMFD sp!, {r0-r12, lr}
+
+ ;// Drop stack to give us some workspace
+ SUB sp, sp, #16
+
+ ;// Save registers we need to print to the stack
+ ;// (done before any register is overwritten, so r0-r3 may be printed)
+ IF "$val2" <> ""
+ ASSERT "$val1" <> ""
+ STR $val2, [sp, #8]
+nArgs SETA nArgs+1
+ ENDIF
+ IF "$val1" <> ""
+ ASSERT "$val0" <> ""
+ STR $val1, [sp, #4]
+nArgs SETA nArgs+1
+ ENDIF
+ IF "$val0"<>""
+ STR $val0, [sp]
+nArgs SETA nArgs+1
+ ENDIF
+
+ ;// Now we are safe to corrupt registers
+ ;// r0 = format string, r1-r3 = values to print
+ ADR r0, %FT00
+ IF nArgs=1
+ LDR r1, [sp]
+ ENDIF
+ IF nArgs=2
+ LDMIA sp, {r1,r2}
+ ENDIF
+ IF nArgs=3
+ LDMIA sp, {r1,r2,r3}
+ ENDIF
+
+ ;// print the values
+ ;// (the flags are held in r4 across the call)
+ MRS r4, cpsr ;// preserve flags
+ BL printf
+ MSR cpsr_f, r4 ;// restore flags
+ B %FT01
+00 ;// string to print
+ DCB "$format", 0
+ ALIGN
+01 ;// Finished
+ ADD sp, sp, #16
+ ;// Restore registers
+ LDMFD sp!, {r0-r12,lr}
+
+ ENDIF ;// DEBUG_ON
+ MEND
+
+
+ ;// Stall Simulation Macro
+ ;// Inserts a given number of NOPs for the currently
+ ;// defined platform
+ ;// $plat1stall..$plat6stall = up to six platform/stall-count
+ ;// specifications, each handled by _M_STALL_SUB
+ ;// Generates no code unless DEBUG_STALLS_ON is TRUE.
+ MACRO
+ M_STALL $plat1stall, $plat2stall, $plat3stall, $plat4stall, $plat5stall, $plat6stall
+ IF DEBUG_STALLS_ON
+ _M_STALL_SUB $plat1stall
+ _M_STALL_SUB $plat2stall
+ _M_STALL_SUB $plat3stall
+ _M_STALL_SUB $plat4stall
+ _M_STALL_SUB $plat5stall
+ _M_STALL_SUB $plat6stall
+ ENDIF
+ MEND
+
+ ;// Helper for M_STALL: handles a single descriptor.
+ ;// The descriptor text is split into a platform symbol name (everything
+ ;// except the last two characters) and a stall count (the last character),
+ ;// so only a single-digit count can be expressed.
+ MACRO
+ _M_STALL_SUB $platstall
+ ;// An omitted descriptor produces no output
+ IF "$platstall"!=""
+ LCLA _pllen
+ LCLS _pl
+ ;// NOTE(review): _pllog is declared but never referenced in this macro
+ LCLL _pllog
+_pllen SETA :LEN:"$platstall"
+_pl SETS "$platstall":LEFT:(_pllen - 2)
+ ;// Only act if the platform symbol exists and evaluates true
+ IF :DEF:$_pl
+ IF $_pl
+ LCLS _st
+ LCLA _stnum
+_st SETS "$platstall":RIGHT:1
+_stnum SETA $_st
+ ;// Emit _stnum copies of "MOV sp, sp", used here as the NOP
+ WHILE _stnum>0
+ MOV sp, sp
+_stnum SETA _stnum - 1
+ WEND
+ ENDIF
+ ENDIF
+ ENDIF
+ MEND
+
+
+
+;//==========================================================================
+;// Endian Invarience Macros
+;//
+;// The idea behind these macros is that if an array is
+;// loaded as words then the SMUL00 macro will multiply
+;// array elements 0 regardless of the endianess of the
+;// system. For little endian SMUL00=SMULBB, for big
+;// endian SMUL00=SMULTT and similarly for other packed operations.
+;//
+;//==========================================================================
+
+ ;// LIBI4: emit one four-operand instruction chosen by build endianness.
+ ;// $comli is the mnemonic used for little-endian builds and $combi the
+ ;// one used when {ENDIAN} is "big". $cc is appended directly to the
+ ;// chosen mnemonic (presumably an optional condition code).
+ MACRO
+ LIBI4 $comli, $combi, $a, $b, $c, $d, $cc
+ IF {ENDIAN}="big"
+ $combi.$cc $a, $b, $c, $d
+ ELSE
+ $comli.$cc $a, $b, $c, $d
+ ENDIF
+ MEND
+
+ ;// LIBI3: emit one three-operand instruction chosen by build endianness.
+ ;// $comli is the mnemonic used for little-endian builds and $combi the
+ ;// one used when {ENDIAN} is "big". $cc is appended directly to the
+ ;// chosen mnemonic (presumably an optional condition code).
+ MACRO
+ LIBI3 $comli, $combi, $a, $b, $c, $cc
+ IF {ENDIAN}="big"
+ $combi.$cc $a, $b, $c
+ ELSE
+ $comli.$cc $a, $b, $c
+ ENDIF
+ MEND
+
+ ;// SMLAxy macros
+ ;//
+ ;// Each of x and y is one of 0, 1, B or T. B and T are passed through to
+ ;// the SMLA instruction unchanged; 0 becomes B on little-endian builds
+ ;// and T on big-endian builds, and 1 becomes T on little-endian builds
+ ;// and B on big-endian builds. The selection is done by LIBI4.
+
+ MACRO
+ SMLA00 $a, $b, $c, $d, $cc
+ LIBI4 SMLABB, SMLATT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA01 $a, $b, $c, $d, $cc
+ LIBI4 SMLABT, SMLATB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA0B $a, $b, $c, $d, $cc
+ LIBI4 SMLABB, SMLATB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA0T $a, $b, $c, $d, $cc
+ LIBI4 SMLABT, SMLATT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA10 $a, $b, $c, $d, $cc
+ LIBI4 SMLATB, SMLABT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA11 $a, $b, $c, $d, $cc
+ LIBI4 SMLATT, SMLABB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA1B $a, $b, $c, $d, $cc
+ LIBI4 SMLATB, SMLABB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLA1T $a, $b, $c, $d, $cc
+ LIBI4 SMLATT, SMLABT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAB0 $a, $b, $c, $d, $cc
+ LIBI4 SMLABB, SMLABT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAB1 $a, $b, $c, $d, $cc
+ LIBI4 SMLABT, SMLABB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAT0 $a, $b, $c, $d, $cc
+ LIBI4 SMLATB, SMLATT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAT1 $a, $b, $c, $d, $cc
+ LIBI4 SMLATT, SMLATB, $a, $b, $c, $d, $cc
+ MEND
+
+ ;// SMULxy macros
+ ;//
+ ;// Each of x and y is one of 0, 1, B or T. B and T are passed through to
+ ;// the SMUL instruction unchanged; 0 becomes B on little-endian builds
+ ;// and T on big-endian builds, and 1 becomes T on little-endian builds
+ ;// and B on big-endian builds. The selection is done by LIBI3.
+
+ MACRO
+ SMUL00 $a, $b, $c, $cc
+ LIBI3 SMULBB, SMULTT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL01 $a, $b, $c, $cc
+ LIBI3 SMULBT, SMULTB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL0B $a, $b, $c, $cc
+ LIBI3 SMULBB, SMULTB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL0T $a, $b, $c, $cc
+ LIBI3 SMULBT, SMULTT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL10 $a, $b, $c, $cc
+ LIBI3 SMULTB, SMULBT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL11 $a, $b, $c, $cc
+ LIBI3 SMULTT, SMULBB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL1B $a, $b, $c, $cc
+ LIBI3 SMULTB, SMULBB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMUL1T $a, $b, $c, $cc
+ LIBI3 SMULTT, SMULBT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULB0 $a, $b, $c, $cc
+ LIBI3 SMULBB, SMULBT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULB1 $a, $b, $c, $cc
+ LIBI3 SMULBT, SMULBB, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULT0 $a, $b, $c, $cc
+ LIBI3 SMULTB, SMULTT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULT1 $a, $b, $c, $cc
+ LIBI3 SMULTT, SMULTB, $a, $b, $c, $cc
+ MEND
+
+ ;// SMLAWx, SMULWx macros
+ ;//
+ ;// The trailing 0 or 1 selects the B or T form of the instruction:
+ ;// 0 becomes B on little-endian builds and T on big-endian builds,
+ ;// 1 becomes T on little-endian builds and B on big-endian builds.
+
+ MACRO
+ SMLAW0 $a, $b, $c, $d, $cc
+ LIBI4 SMLAWB, SMLAWT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAW1 $a, $b, $c, $d, $cc
+ LIBI4 SMLAWT, SMLAWB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMULW0 $a, $b, $c, $cc
+ LIBI3 SMULWB, SMULWT, $a, $b, $c, $cc
+ MEND
+
+ MACRO
+ SMULW1 $a, $b, $c, $cc
+ LIBI3 SMULWT, SMULWB, $a, $b, $c, $cc
+ MEND
+
+ ;// SMLALxy macros
+ ;//
+ ;// Each of x and y is one of 0, 1, B or T. B and T are passed through to
+ ;// the SMLAL instruction unchanged; 0 becomes B on little-endian builds
+ ;// and T on big-endian builds, and 1 becomes T on little-endian builds
+ ;// and B on big-endian builds. The selection is done by LIBI4.
+
+
+ MACRO
+ SMLAL00 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBB, SMLALTT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL01 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBT, SMLALTB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL0B $a, $b, $c, $d, $cc
+ LIBI4 SMLALBB, SMLALTB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL0T $a, $b, $c, $d, $cc
+ LIBI4 SMLALBT, SMLALTT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL10 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTB, SMLALBT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL11 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTT, SMLALBB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL1B $a, $b, $c, $d, $cc
+ LIBI4 SMLALTB, SMLALBB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLAL1T $a, $b, $c, $d, $cc
+ LIBI4 SMLALTT, SMLALBT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALB0 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBB, SMLALBT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALB1 $a, $b, $c, $d, $cc
+ LIBI4 SMLALBT, SMLALBB, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALT0 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTB, SMLALTT, $a, $b, $c, $d, $cc
+ MEND
+
+ MACRO
+ SMLALT1 $a, $b, $c, $d, $cc
+ LIBI4 SMLALTT, SMLALTB, $a, $b, $c, $d, $cc
+ MEND
+
+ ENDIF ;// ARMCOMM_S_H
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
new file mode 100755
index 0000000..7a68d14
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/armOMX.h
@@ -0,0 +1,274 @@
+/*
+ *
+ * File Name: armOMX_ReleaseVersion.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * This file allows a version of the OMX DL libraries to be built where some or
+ * all of the function names can be given a user specified suffix.
+ *
+ * You might want to use it where:
+ *
+ * - you want to rename a function "out of the way" so that you could replace
+ * a function with a different version (the original version would still be
+ * in the library just with a different name - so you could debug the new
+ * version by comparing it to the output of the old)
+ *
+ * - you want to rename all the functions to versions with a suffix so that
+ * you can include two versions of the library and choose between functions
+ * at runtime.
+ *
+ * e.g. omxIPBM_Copy_U8_C1R could be renamed omxIPBM_Copy_U8_C1R_CortexA8
+ *
+ */
+
+
+#ifndef _armOMX_H_
+#define _armOMX_H_
+
+
+/* We need to define these two macros in order to expand and concatenate the names.
+ * OMXCATBAR forwards its arguments to OMXCAT2BAR so that a suffix macro passed as B
+ * is macro-expanded before OMXCAT2BAR pastes "omx", A and B into one identifier;
+ * operands of ## are not expanded when ## is applied to them directly. */
+#define OMXCAT2BAR(A, B) omx ## A ## B
+#define OMXCATBAR(A, B) OMXCAT2BAR(A, B)
+
+/* Define the suffix to add to all functions - the default is no suffix */
+#define BARE_SUFFIX
+
+
+
+/* Define what happens to the bare suffix-less functions, down to the sub-domain accuracy */
+#define OMXACAAC_SUFFIX BARE_SUFFIX
+#define OMXACMP3_SUFFIX BARE_SUFFIX
+#define OMXICJP_SUFFIX BARE_SUFFIX
+#define OMXIPBM_SUFFIX BARE_SUFFIX
+#define OMXIPCS_SUFFIX BARE_SUFFIX
+#define OMXIPPP_SUFFIX BARE_SUFFIX
+#define OMXSP_SUFFIX BARE_SUFFIX
+#define OMXVCCOMM_SUFFIX BARE_SUFFIX
+#define OMXVCM4P10_SUFFIX BARE_SUFFIX
+#define OMXVCM4P2_SUFFIX BARE_SUFFIX
+
+
+
+
+/* Define what the each bare, un-suffixed OpenMAX API function names is to be renamed */
+#define omxACAAC_DecodeChanPairElt OMXCATBAR(ACAAC_DecodeChanPairElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeDatStrElt OMXCATBAR(ACAAC_DecodeDatStrElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeFillElt OMXCATBAR(ACAAC_DecodeFillElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeIsStereo_S32 OMXCATBAR(ACAAC_DecodeIsStereo_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsPNS_S32_I OMXCATBAR(ACAAC_DecodeMsPNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsStereo_S32_I OMXCATBAR(ACAAC_DecodeMsStereo_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodePrgCfgElt OMXCATBAR(ACAAC_DecodePrgCfgElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeTNS_S32_I OMXCATBAR(ACAAC_DecodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DeinterleaveSpectrum_S32 OMXCATBAR(ACAAC_DeinterleaveSpectrum_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_EncodeTNS_S32_I OMXCATBAR(ACAAC_EncodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermPredict_S32 OMXCATBAR(ACAAC_LongTermPredict_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermReconstruct_S32_I OMXCATBAR(ACAAC_LongTermReconstruct_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTFwd_S32 OMXCATBAR(ACAAC_MDCTFwd_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTInv_S32_S16 OMXCATBAR(ACAAC_MDCTInv_S32_S16, OMXACAAC_SUFFIX)
+#define omxACAAC_NoiselessDecode OMXCATBAR(ACAAC_NoiselessDecode, OMXACAAC_SUFFIX)
+#define omxACAAC_QuantInv_S32_I OMXCATBAR(ACAAC_QuantInv_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADIFHeader OMXCATBAR(ACAAC_UnpackADIFHeader, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADTSFrameHeader OMXCATBAR(ACAAC_UnpackADTSFrameHeader, OMXACAAC_SUFFIX)
+
+
+#define omxACMP3_HuffmanDecode_S32 OMXCATBAR(ACMP3_HuffmanDecode_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfb_S32 OMXCATBAR(ACMP3_HuffmanDecodeSfb_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfbMbp_S32 OMXCATBAR(ACMP3_HuffmanDecodeSfbMbp_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_MDCTInv_S32 OMXCATBAR(ACMP3_MDCTInv_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantize_S32_I OMXCATBAR(ACMP3_ReQuantize_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantizeSfb_S32_I OMXCATBAR(ACMP3_ReQuantizeSfb_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_SynthPQMF_S32_S16 OMXCATBAR(ACMP3_SynthPQMF_S32_S16, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackFrameHeader OMXCATBAR(ACMP3_UnpackFrameHeader, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackScaleFactors_S8 OMXCATBAR(ACMP3_UnpackScaleFactors_S8, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackSideInfo OMXCATBAR(ACMP3_UnpackSideInfo, OMXACMP3_SUFFIX)
+
+#define omxICJP_CopyExpand_U8_C3 OMXCATBAR(ICJP_CopyExpand_U8_C3, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16 OMXCATBAR(ICJP_DCTFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16_I OMXCATBAR(ICJP_DCTFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16 OMXCATBAR(ICJP_DCTInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16_I OMXCATBAR(ICJP_DCTInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_Multiple_S16 OMXCATBAR(ICJP_DCTQuantFwd_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16 OMXCATBAR(ICJP_DCTQuantFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16_I OMXCATBAR(ICJP_DCTQuantFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwdTableInit OMXCATBAR(ICJP_DCTQuantFwdTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_Multiple_S16 OMXCATBAR(ICJP_DCTQuantInv_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16 OMXCATBAR(ICJP_DCTQuantInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16_I OMXCATBAR(ICJP_DCTQuantInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInvTableInit OMXCATBAR(ICJP_DCTQuantInvTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffman8x8_Direct_S16_C1 OMXCATBAR(ICJP_DecodeHuffman8x8_Direct_S16_C1, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecGetBufSize_U8 OMXCATBAR(ICJP_DecodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecInit_U8 OMXCATBAR(ICJP_DecodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffman8x8_Direct_S16_U1_C1 OMXCATBAR(ICJP_EncodeHuffman8x8_Direct_S16_U1_C1, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecGetBufSize_U8 OMXCATBAR(ICJP_EncodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecInit_U8 OMXCATBAR(ICJP_EncodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+
+#define omxIPBM_AddC_U8_C1R_Sfs OMXCATBAR(IPBM_AddC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C1R OMXCATBAR(IPBM_Copy_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C3R OMXCATBAR(IPBM_Copy_U8_C3R, OMXIPBM_SUFFIX)
+#define omxIPBM_Mirror_U8_C1R OMXCATBAR(IPBM_Mirror_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_MulC_U8_C1R_Sfs OMXCATBAR(IPBM_MulC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+
+#define omxIPCS_ColorTwistQ14_U8_C3R OMXCATBAR(IPCS_ColorTwistQ14_U8_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R OMXCATBAR(IPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R OMXCATBAR(IPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R OMXCATBAR(IPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R OMXCATBAR(IPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R OMXCATBAR(IPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R OMXCATBAR(IPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszCscRotBGR_U8_P3C3R OMXCATBAR(IPCS_YCbCr420RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszRot_U8_P3R OMXCATBAR(IPCS_YCbCr420RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565_U8_U16_P3C3R OMXCATBAR(IPCS_YCbCr420ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R OMXCATBAR(IPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R OMXCATBAR(IPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszCscRotBGR_U8_P3C3R OMXCATBAR(IPCS_YCbCr422RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R OMXCATBAR(IPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszRot_U8_P3R OMXCATBAR(IPCS_YCbCr422RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR565_U8_U16_C2C3R OMXCATBAR(IPCS_YCbYCr422ToBGR565_U8_U16_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R OMXCATBAR(IPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR888_U8_C2C3R OMXCATBAR(IPCS_YCbYCr422ToBGR888_U8_C2C3R, OMXIPCS_SUFFIX)
+/* BGR888LS_MCU renames for the YCbCr422 and YCbCr444 inputs (one entry each) */
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR888LS_MCU_S16_U8_P3C3R OMXCATBAR(IPCS_YCbCr444ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R OMXCATBAR(IPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToYCbCr420Rotate_U8_P3R OMXCATBAR(IPCS_YCbCr422ToYCbCr420Rotate_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_C3R OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_P3C3R OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R OMXCATBAR(IPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR888_U8_C3R OMXCATBAR(IPCS_YCbCr444ToBGR888_U8_C3R, OMXIPCS_SUFFIX)
+
+#define omxIPPP_Deblock_HorEdge_U8_I OMXCATBAR(IPPP_Deblock_HorEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_Deblock_VerEdge_U8_I OMXCATBAR(IPPP_Deblock_VerEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterFIR_U8_C1R OMXCATBAR(IPPP_FilterFIR_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterMedian_U8_C1R OMXCATBAR(IPPP_FilterMedian_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_GetCentralMoment_S64 OMXCATBAR(IPPP_GetCentralMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_GetSpatialMoment_S64 OMXCATBAR(IPPP_GetSpatialMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentGetStateSize OMXCATBAR(IPPP_MomentGetStateSize, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentInit OMXCATBAR(IPPP_MomentInit, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C1R OMXCATBAR(IPPP_Moments_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C3R OMXCATBAR(IPPP_Moments_U8_C3R, OMXIPPP_SUFFIX)
+
+#define omxSP_BlockExp_S16 OMXCATBAR(SP_BlockExp_S16, OMXSP_SUFFIX)
+#define omxSP_BlockExp_S32 OMXCATBAR(SP_BlockExp_S32, OMXSP_SUFFIX)
+#define omxSP_Copy_S16 OMXCATBAR(SP_Copy_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16 OMXCATBAR(SP_DotProd_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16_Sfs OMXCATBAR(SP_DotProd_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC16_Sfs OMXCATBAR(SP_FFTFwd_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC32_Sfs OMXCATBAR(SP_FFTFwd_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S16S32_Sfs OMXCATBAR(SP_FFTFwd_RToCCS_S16S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S32_Sfs OMXCATBAR(SP_FFTFwd_RToCCS_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC16 OMXCATBAR(SP_FFTGetBufSize_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC32 OMXCATBAR(SP_FFTGetBufSize_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S16S32 OMXCATBAR(SP_FFTGetBufSize_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S32 OMXCATBAR(SP_FFTGetBufSize_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC16 OMXCATBAR(SP_FFTInit_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC32 OMXCATBAR(SP_FFTInit_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S16S32 OMXCATBAR(SP_FFTInit_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S32 OMXCATBAR(SP_FFTInit_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32_Sfs OMXCATBAR(SP_FFTInv_CCSToR_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32S16_Sfs OMXCATBAR(SP_FFTInv_CCSToR_S32S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC16_Sfs OMXCATBAR(SP_FFTInv_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC32_Sfs OMXCATBAR(SP_FFTInv_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32 OMXCATBAR(SP_FilterMedian_S32, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32_I OMXCATBAR(SP_FilterMedian_S32_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16 OMXCATBAR(SP_FIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_I OMXCATBAR(SP_FIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_ISfs OMXCATBAR(SP_FIR_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_Sfs OMXCATBAR(SP_FIR_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16 OMXCATBAR(SP_FIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_I OMXCATBAR(SP_FIROne_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_ISfs OMXCATBAR(SP_FIROne_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_Sfs OMXCATBAR(SP_FIROne_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16 OMXCATBAR(SP_IIR_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16_I OMXCATBAR(SP_IIR_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16 OMXCATBAR(SP_IIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16_I OMXCATBAR(SP_IIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16 OMXCATBAR(SP_IIROne_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16_I OMXCATBAR(SP_IIROne_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16 OMXCATBAR(SP_IIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16_I OMXCATBAR(SP_IIROne_Direct_S16_I, OMXSP_SUFFIX)
+
+#define omxVCCOMM_Average_16x OMXCATBAR(VCCOMM_Average_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Average_8x OMXCATBAR(VCCOMM_Average_8x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock OMXCATBAR(VCCOMM_ComputeTextureErrorBlock, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock_SAD OMXCATBAR(VCCOMM_ComputeTextureErrorBlock_SAD, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy16x16 OMXCATBAR(VCCOMM_Copy16x16, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy8x8 OMXCATBAR(VCCOMM_Copy8x8, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ExpandFrame_I OMXCATBAR(VCCOMM_ExpandFrame_I, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_LimitMVToRect OMXCATBAR(VCCOMM_LimitMVToRect, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_16x OMXCATBAR(VCCOMM_SAD_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_8x OMXCATBAR(VCCOMM_SAD_8x, OMXVCCOMM_SUFFIX)
+
+#define omxVCM4P10_Average_4x OMXCATBAR(VCM4P10_Average_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Half OMXCATBAR(VCM4P10_BlockMatch_Half, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Integer OMXCATBAR(VCM4P10_BlockMatch_Integer, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Quarter OMXCATBAR(VCM4P10_BlockMatch_Quarter, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockChroma_I OMXCATBAR(VCM4P10_DeblockChroma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockLuma_I OMXCATBAR(VCM4P10_DeblockLuma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC OMXCATBAR(VCM4P10_DecodeChromaDcCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeCoeffsToPairCAVLC OMXCATBAR(VCM4P10_DecodeCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DequantTransformResidualFromPairAndAdd OMXCATBAR(VCM4P10_DequantTransformResidualFromPairAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_HorEdge_I OMXCATBAR(VCM4P10_FilterDeblockingChroma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_VerEdge_I OMXCATBAR(VCM4P10_FilterDeblockingChroma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_HorEdge_I OMXCATBAR(VCM4P10_FilterDeblockingLuma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_VerEdge_I OMXCATBAR(VCM4P10_FilterDeblockingLuma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_GetVLCInfo OMXCATBAR(VCM4P10_GetVLCInfo, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateChroma OMXCATBAR(VCM4P10_InterpolateChroma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfHor_Luma OMXCATBAR(VCM4P10_InterpolateHalfHor_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfVer_Luma OMXCATBAR(VCM4P10_InterpolateHalfVer_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateLuma OMXCATBAR(VCM4P10_InterpolateLuma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_ChromaDC OMXCATBAR(VCM4P10_InvTransformDequant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_LumaDC OMXCATBAR(VCM4P10_InvTransformDequant_LumaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformResidualAndAdd OMXCATBAR(VCM4P10_InvTransformResidualAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEGetBufSize OMXCATBAR(VCM4P10_MEGetBufSize, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEInit OMXCATBAR(VCM4P10_MEInit, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MotionEstimationMB OMXCATBAR(VCM4P10_MotionEstimationMB, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_16x16 OMXCATBAR(VCM4P10_PredictIntra_16x16, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_4x4 OMXCATBAR(VCM4P10_PredictIntra_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntraChroma_8x8 OMXCATBAR(VCM4P10_PredictIntraChroma_8x8, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SAD_4x OMXCATBAR(VCM4P10_SAD_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_16x OMXCATBAR(VCM4P10_SADQuar_16x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_4x OMXCATBAR(VCM4P10_SADQuar_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_8x OMXCATBAR(VCM4P10_SADQuar_8x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SATD_4x4 OMXCATBAR(VCM4P10_SATD_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SubAndTransformQDQResidual OMXCATBAR(VCM4P10_SubAndTransformQDQResidual, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantChromaDCFromPair OMXCATBAR(VCM4P10_TransformDequantChromaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantLumaDCFromPair OMXCATBAR(VCM4P10_TransformDequantLumaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_ChromaDC OMXCATBAR(VCM4P10_TransformQuant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_LumaDC OMXCATBAR(VCM4P10_TransformQuant_LumaDC, OMXVCM4P10_SUFFIX)
+
+#define omxVCM4P2_BlockMatch_Half_16x16 OMXCATBAR(VCM4P2_BlockMatch_Half_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Half_8x8 OMXCATBAR(VCM4P2_BlockMatch_Half_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_16x16 OMXCATBAR(VCM4P2_BlockMatch_Integer_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_8x8 OMXCATBAR(VCM4P2_BlockMatch_Integer_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DCT8x8blk OMXCATBAR(VCM4P2_DCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Inter OMXCATBAR(VCM4P2_DecodeBlockCoef_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Intra OMXCATBAR(VCM4P2_DecodeBlockCoef_Intra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodePadMV_PVOP OMXCATBAR(VCM4P2_DecodePadMV_PVOP, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_Inter OMXCATBAR(VCM4P2_DecodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraACVLC OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraDCVLC OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeMV OMXCATBAR(VCM4P2_EncodeMV, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_Inter OMXCATBAR(VCM4P2_EncodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraACVLC OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraDCVLC OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_FindMVpred OMXCATBAR(VCM4P2_FindMVpred, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_IDCT8x8blk OMXCATBAR(VCM4P2_IDCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MCReconBlock OMXCATBAR(VCM4P2_MCReconBlock, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEGetBufSize OMXCATBAR(VCM4P2_MEGetBufSize, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEInit OMXCATBAR(VCM4P2_MEInit, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MotionEstimationMB OMXCATBAR(VCM4P2_MotionEstimationMB, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_PredictReconCoefIntra OMXCATBAR(VCM4P2_PredictReconCoefIntra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInter_I OMXCATBAR(VCM4P2_QuantInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantIntra_I OMXCATBAR(VCM4P2_QuantIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvInter_I OMXCATBAR(VCM4P2_QuantInvInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvIntra_I OMXCATBAR(VCM4P2_QuantInvIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_inter OMXCATBAR(VCM4P2_TransRecBlockCoef_inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_intra OMXCATBAR(VCM4P2_TransRecBlockCoef_intra, OMXVCM4P2_SUFFIX)
+
+
+#endif /* _armOMX_h_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
new file mode 100755
index 0000000..8b295a6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes.h
@@ -0,0 +1,252 @@
+/**
+ * File: omxtypes.h
+ * Brief: Defines basic Data types used in OpenMAX v1.0.2 header files.
+ *
+ * Copyright © 2005-2008 The Khronos Group Inc. All Rights Reserved.
+ *
+ * These materials are protected by copyright laws and contain material
+ * proprietary to the Khronos Group, Inc. You may use these materials
+ * for implementing Khronos specifications, without altering or removing
+ * any trademark, copyright or other notice from the specification.
+ *
+ * Khronos Group makes no, and expressly disclaims any, representations
+ * or warranties, express or implied, regarding these materials, including,
+ * without limitation, any implied warranties of merchantability or fitness
+ * for a particular purpose or non-infringement of any intellectual property.
+ * Khronos Group makes no, and expressly disclaims any, warranties, express
+ * or implied, regarding the correctness, accuracy, completeness, timeliness,
+ * and reliability of these materials.
+ *
+ * Under no circumstances will the Khronos Group, or any of its Promoters,
+ * Contributors or Members or their respective partners, officers, directors,
+ * employees, agents or representatives be liable for any damages, whether
+ * direct, indirect, special or consequential damages for lost revenues,
+ * lost profits, or otherwise, arising from or in connection with these
+ * materials.
+ *
+ * Khronos and OpenMAX are trademarks of the Khronos Group Inc.
+ *
+ */
+
+#ifndef _OMXTYPES_H_
+#define _OMXTYPES_H_
+
+#include <limits.h>
+
+#define OMX_IN
+#define OMX_OUT
+#define OMX_INOUT
+
+
+typedef enum {
+
+ /* Mandatory return codes - use cases are explicitly described for each function */
+ OMX_Sts_NoErr = 0, /* No error, the function completed successfully */
+ OMX_Sts_Err = -2, /* Unknown/unspecified error */
+ OMX_Sts_InvalidBitstreamValErr = -182, /* Invalid value detected during bitstream processing */
+ OMX_Sts_MemAllocErr = -9, /* Not enough memory allocated for the operation */
+ OMX_StsACAAC_GainCtrErr = -159, /* AAC: Unsupported gain control data detected */
+ OMX_StsACAAC_PrgNumErr = -167, /* AAC: Invalid number of elements for one program */
+ OMX_StsACAAC_CoefValErr = -163, /* AAC: Invalid quantized coefficient value */
+ OMX_StsACAAC_MaxSfbErr = -162, /* AAC: Invalid maxSfb value in relation to numSwb */
+ OMX_StsACAAC_PlsDataErr = -160, /* AAC: pulse escape sequence data error */
+
+ /* Optional return codes - use cases are explicitly described for each function*/
+ OMX_Sts_BadArgErr = -5, /* Bad Arguments */
+
+ OMX_StsACAAC_TnsNumFiltErr = -157, /* AAC: Invalid number of TNS filters */
+ OMX_StsACAAC_TnsLenErr = -156, /* AAC: Invalid TNS region length */
+ OMX_StsACAAC_TnsOrderErr = -155, /* AAC: Invalid order of TNS filter */
+ OMX_StsACAAC_TnsCoefResErr = -154, /* AAC: Invalid bit-resolution for TNS filter coefficients */
+ OMX_StsACAAC_TnsCoefErr = -153, /* AAC: Invalid TNS filter coefficients */
+ OMX_StsACAAC_TnsDirectErr = -152, /* AAC: Invalid TNS filter direction */
+
+ OMX_StsICJP_JPEGMarkerErr = -183, /* JPEG marker encountered within an entropy-coded block; */
+ /* Huffman decoding operation terminated early. */
+ OMX_StsICJP_JPEGMarker = -181, /* JPEG marker encountered; Huffman decoding */
+ /* operation terminated early. */
+ OMX_StsIPPP_ContextMatchErr = -17, /* Context parameter doesn't match to the operation */
+
+ OMX_StsSP_EvenMedianMaskSizeErr = -180, /* Even size of the Median Filter mask was replaced by the odd one */
+
+ OMX_Sts_MaximumEnumeration = INT_MAX /*Placeholder, forces enum of size OMX_INT*/
+
+ } OMXResult; /** Return value or error value returned from a function. Identical to OMX_INT */
+
+
+/* OMX_U8 */
+#if UCHAR_MAX == 0xff
+typedef unsigned char OMX_U8;
+#elif USHRT_MAX == 0xff
+typedef unsigned short int OMX_U8;
+#else
+#error OMX_U8 undefined
+#endif
+
+
+/* OMX_S8 */
+#if SCHAR_MAX == 0x7f
+typedef signed char OMX_S8;
+#elif SHRT_MAX == 0x7f
+typedef signed short int OMX_S8;
+#else
+#error OMX_S8 undefined
+#endif
+
+
+/* OMX_U16 */
+#if USHRT_MAX == 0xffff
+typedef unsigned short int OMX_U16;
+#elif UINT_MAX == 0xffff
+typedef unsigned int OMX_U16;
+#else
+#error OMX_U16 undefined
+#endif
+
+
+/* OMX_S16 */
+#if SHRT_MAX == 0x7fff
+typedef signed short int OMX_S16;
+#elif INT_MAX == 0x7fff
+typedef signed int OMX_S16;
+#else
+#error OMX_S16 undefined
+#endif
+
+
+/* OMX_U32: unsigned 32-bit integer.
+ * Uses unsigned int when it is exactly 32 bits wide, otherwise unsigned long.
+ * The fallback is tested against ULONG_MAX, the limit of the type being
+ * selected; LONG_MAX can never equal 0xffffffff for a 32-bit long. */
+#if UINT_MAX == 0xffffffff
+typedef unsigned int OMX_U32;
+#elif ULONG_MAX == 0xffffffff
+typedef unsigned long int OMX_U32;
+#else
+#error OMX_U32 undefined
+#endif
+
+
+/* OMX_S32 */
+#if INT_MAX == 0x7fffffff
+typedef signed int OMX_S32;
+#elif LONG_MAX == 0x7fffffff
+typedef long signed int OMX_S32;
+#else
+#error OMX_S32 undefined
+#endif
+
+
+/* OMX_U64 & OMX_S64 */
+#if defined( _WIN32 ) || defined ( _WIN64 )
+ typedef __int64 OMX_S64; /** Signed 64-bit integer */
+ typedef unsigned __int64 OMX_U64; /** Unsigned 64-bit integer */
+ #define OMX_MIN_S64 (0x8000000000000000i64)
+ #define OMX_MIN_U64 (0x0000000000000000i64)
+ #define OMX_MAX_S64 (0x7FFFFFFFFFFFFFFFi64)
+ #define OMX_MAX_U64 (0xFFFFFFFFFFFFFFFFi64)
+#else
+ typedef long long OMX_S64; /** Signed 64-bit integer */
+ typedef unsigned long long OMX_U64; /** Unsigned 64-bit integer */
+ #define OMX_MIN_S64 (0x8000000000000000LL)
+ #define OMX_MIN_U64 (0x0000000000000000LL)
+ #define OMX_MAX_S64 (0x7FFFFFFFFFFFFFFFLL)
+ #define OMX_MAX_U64 (0xFFFFFFFFFFFFFFFFLL)
+#endif
+
+
+/* OMX_SC8 */
+typedef struct
+{
+ OMX_S8 Re; /** Real part */
+ OMX_S8 Im; /** Imaginary part */
+
+} OMX_SC8; /** Signed 8-bit complex number */
+
+
+/* OMX_SC16 */
+typedef struct
+{
+ OMX_S16 Re; /** Real part */
+ OMX_S16 Im; /** Imaginary part */
+
+} OMX_SC16; /** Signed 16-bit complex number */
+
+
+/* OMX_SC32 */
+typedef struct
+{
+ OMX_S32 Re; /** Real part */
+ OMX_S32 Im; /** Imaginary part */
+
+} OMX_SC32; /** Signed 32-bit complex number */
+
+
+/* OMX_SC64 */
+typedef struct
+{
+ OMX_S64 Re; /** Real part */
+ OMX_S64 Im; /** Imaginary part */
+
+} OMX_SC64; /** Signed 64-bit complex number */
+
+
+/* OMX_F32 */
+typedef float OMX_F32; /** Single precision floating point,IEEE 754 */
+
+
+/* OMX_F64 */
+typedef double OMX_F64; /** Double precision floating point,IEEE 754 */
+
+
+/* OMX_INT */
+typedef int OMX_INT; /** signed integer corresponding to machine word length, has maximum signed value INT_MAX*/
+
+
+#define OMX_MIN_S8 (-128)
+#define OMX_MIN_U8 0
+#define OMX_MIN_S16 (-32768)
+#define OMX_MIN_U16 0
+#define OMX_MIN_S32 (-2147483647-1)
+#define OMX_MIN_U32 0
+
+#define OMX_MAX_S8 (127)
+#define OMX_MAX_U8 (255)
+#define OMX_MAX_S16 (32767)
+#define OMX_MAX_U16 (0xFFFF)
+#define OMX_MAX_S32 (2147483647)
+#define OMX_MAX_U32 (0xFFFFFFFF)
+
+typedef void OMXVoid;
+
+#ifndef NULL
+#define NULL ((void*)0)
+#endif
+
+/** Defines the geometric position and size of a rectangle,
+ * where x,y defines the coordinates of the top left corner
+ * of the rectangle, with dimensions width in the x-direction
+ * and height in the y-direction */
+typedef struct {
+ OMX_INT x; /** x-coordinate of top left corner of rectangle */
+ OMX_INT y; /** y-coordinate of top left corner of rectangle */
+ OMX_INT width; /** Width in the x-direction. */
+ OMX_INT height; /** Height in the y-direction. */
+}OMXRect;
+
+
+/** Defines the geometric position of a point, */
+typedef struct
+{
+ OMX_INT x; /** x-coordinate */
+ OMX_INT y; /** y-coordinate */
+
+} OMXPoint;
+
+
+/** Defines the dimensions of a rectangle, or region of interest in an image */
+typedef struct
+{
+ OMX_INT width; /** Width of the rectangle, in the x-direction */
+ OMX_INT height; /** Height of the rectangle, in the y-direction */
+
+} OMXSize;
+
+#endif /* _OMXTYPES_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
new file mode 100755
index 0000000..48703d1
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/api/omxtypes_s.h
@@ -0,0 +1,77 @@
+;//
+;//
+;// File Name: omxtypes_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+;// Mandatory return codes - use cases are explicitly described for each function
+OMX_Sts_NoErr EQU 0 ;// No error the function completed successfully
+OMX_Sts_Err EQU -2 ;// Unknown/unspecified error
+OMX_Sts_InvalidBitstreamValErr EQU -182 ;// Invalid value detected during bitstream processing
+OMX_Sts_MemAllocErr EQU -9 ;// Not enough memory allocated for the operation
+OMX_StsACAAC_GainCtrErr EQU -159 ;// AAC: Unsupported gain control data detected
+OMX_StsACAAC_PrgNumErr EQU -167 ;// AAC: Invalid number of elements for one program
+OMX_StsACAAC_CoefValErr EQU -163 ;// AAC: Invalid quantized coefficient value
+OMX_StsACAAC_MaxSfbErr EQU -162 ;// AAC: Invalid maxSfb value in relation to numSwb
+OMX_StsACAAC_PlsDataErr EQU -160 ;// AAC: pulse escape sequence data error
+
+;// Optional return codes - use cases are explicitly described for each function
+OMX_Sts_BadArgErr EQU -5 ;// Bad Arguments
+
+OMX_StsACAAC_TnsNumFiltErr EQU -157 ;// AAC: Invalid number of TNS filters
+OMX_StsACAAC_TnsLenErr EQU -156 ;// AAC: Invalid TNS region length
+OMX_StsACAAC_TnsOrderErr EQU -155 ;// AAC: Invalid order of TNS filter
+OMX_StsACAAC_TnsCoefResErr EQU -154 ;// AAC: Invalid bit-resolution for TNS filter coefficients
+OMX_StsACAAC_TnsCoefErr EQU -153 ;// AAC: Invalid TNS filter coefficients
+OMX_StsACAAC_TnsDirectErr EQU -152 ;// AAC: Invalid TNS filter direction
+
+OMX_StsICJP_JPEGMarkerErr EQU -183 ;// JPEG marker encountered within an entropy-coded block;
+ ;// Huffman decoding operation terminated early.
+OMX_StsICJP_JPEGMarker EQU -181 ;// JPEG marker encountered; Huffman decoding
+ ;// operation terminated early.
+OMX_StsIPPP_ContextMatchErr EQU -17 ;// Context parameter doesn't match to the operation
+
+OMX_StsSP_EvenMedianMaskSizeErr EQU -180 ;// Even size of the Median Filter mask was replaced by the odd one
+
+OMX_Sts_MaximumEnumeration EQU 0x7FFFFFFF
+
+
+
+;// Limits of the 8/16/32-bit integer types for use from assembly.
+;// The values mirror the OMX_MIN_*/OMX_MAX_* macros of omxtypes.h.
+OMX_MIN_S8 EQU (-128)
+OMX_MIN_U8 EQU 0
+OMX_MIN_S16 EQU (-32768)
+OMX_MIN_U16 EQU 0
+
+
+OMX_MIN_S32 EQU (-2147483647-1)
+OMX_MIN_U32 EQU 0
+
+OMX_MAX_S8 EQU (127)
+OMX_MAX_U8 EQU (255)
+OMX_MAX_S16 EQU (32767)
+OMX_MAX_U16 EQU (0xFFFF)
+OMX_MAX_S32 EQU (2147483647)
+OMX_MAX_U32 EQU (0xFFFFFFFF)
+
+OMX_VC_UPPER EQU 0x1 ;// Used by the PredictIntra functions
+OMX_VC_LEFT EQU 0x2 ;// Used by the PredictIntra functions
+OMX_VC_UPPER_RIGHT EQU 0x40 ;// Used by the PredictIntra functions
+
+NULL EQU 0
+
+;// Structures
+;//
+;// Assembly-side description of OMXPoint: fields x and y, each declared
+;// with a size of 4. M_STRUCT/M_FIELD/M_ENDSTRUCT are presumably the
+;// macros supplied by armCOMM_s.h (included below) -- confirm there.
+
+ INCLUDE armCOMM_s.h
+
+ M_STRUCT OMXPoint
+ M_FIELD x, 4
+ M_FIELD y, 4
+ M_ENDSTRUCT
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/build_vc.pl b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/build_vc.pl
new file mode 100755
index 0000000..649e74c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/build_vc.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/perl
+#
+#
+# File Name: build_vc.pl
+# OpenMAX DL: v1.0.2
+# Revision: 12290
+# Date: Wednesday, April 9, 2008
+#
+# (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+#
+#
+#
+# This file builds the OpenMAX DL vc domain library omxVC.o.
+#
+
+use File::Spec;
+use strict;
+
+# Tool names and options. The archiver (armar) variant is active; the
+# commented-out armlink settings produce a partially linked object instead.
+my ($CC, $CC_OPTS, $AS, $AS_OPTS, $LIB, $LIB_OPTS, $LIB_TYPE);
+
+$CC = 'armcc';
+$CC_OPTS = '--no_unaligned_access --cpu Cortex-A8 -c';
+$AS = 'armasm';
+$AS_OPTS = '--no_unaligned_access --cpu Cortex-A8';
+# $LIB = 'armlink';
+# $LIB_OPTS = '--partial -o';
+# $LIB_TYPE = '.o';
+$LIB = 'armar';
+$LIB_OPTS = '--create -r';
+$LIB_TYPE = '.a';
+
+#------------------------
+
+my (@headerlist, @filelist, $hd, $file, $command, $objlist, $libfile, $h);
+my @failed;   # source files whose compile/assemble step returned non-zero
+
+# Define the list of directories containing included header files.
+@headerlist = qw(api vc/api vc/m4p2/api vc/m4p10/api);
+
+# Define the list of source files to compile.
+# Three-argument open with a lexical handle; report the OS error on failure.
+open(my $files, '<', 'filelist_vc.txt') or die("Can't open source file list: $!\n");
+@filelist = <$files>;
+close($files);
+
+# Fix the file separators in the header paths
+foreach $h (@headerlist)
+{
+    $h = File::Spec->canonpath($h);
+}
+
+# Create the include path to be passed to the compiler
+$hd = '-I' . join(' -I', @headerlist);
+
+# Create the build directories "obj" and "lib" (if they are not there already)
+foreach my $dir ('obj', 'lib')
+{
+    next if (-d $dir);
+    mkdir($dir, 0777) or die("Can't create directory '$dir': $!\n");
+}
+
+$objlist = '';
+
+# Compile each file
+foreach $file (@filelist)
+{
+    my ($f, $base, $ext, $objfile, $tool, $opts);
+
+    # Strip every end-of-line character from the path itself (not only from
+    # the base name), so a CRLF file list does not leak "\r" into the command.
+    $file =~ s/[\n\f\r]//g;
+    next if ($file eq '');   # tolerate blank lines in the file list
+    $file = File::Spec->canonpath($file);
+
+    (undef, undef, $f) = File::Spec->splitpath($file);
+
+    # Accept extensions of any length so that e.g. "foo.txt" is reported as
+    # ignored instead of as having no extension.
+    if(($base, $ext) = $f =~ /(.+)\.(\w+)$/)
+    {
+        $objfile = File::Spec->catfile('obj', $base.'.o');
+
+        if($ext eq 'c')
+        {
+            ($tool, $opts) = ($CC, $CC_OPTS);
+        }
+        elsif($ext eq 's')
+        {
+            ($tool, $opts) = ($AS, $AS_OPTS);
+        }
+        else
+        {
+            print "Ignoring file: $f\n";
+            next;
+        }
+
+        $command = $tool.' '.$opts.' '.$hd.' -o '.$objfile.' '.$file;
+        print "$command\n";
+
+        # Only objects that were actually produced go into the library;
+        # failures are collected so that all of them are reported together.
+        if (system($command) == 0)
+        {
+            $objlist .= "$objfile ";
+        }
+        else
+        {
+            push(@failed, $file);
+        }
+    }
+    else
+    {
+        die "No file extension found: $f\n";
+    }
+}
+
+# Refuse to create a library from an incomplete set of objects.
+if (@failed)
+{
+    die "Build failed for:\n  " . join("\n  ", @failed) . "\n";
+}
+
+# Do the final link stage to create the libraries.
+$libfile = File::Spec->catfile('lib', 'omxVC'.$LIB_TYPE);
+$command = $LIB.' '.$LIB_OPTS.' '.$libfile.' '.$objlist;
+print "$command\n";
+(system($command) == 0) or die "Library creation failed: $command\n";
+print "Build successful\n";
+
+
+
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/filelist_vc.txt b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/filelist_vc.txt
new file mode 100755
index 0000000..8db8eeb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/filelist_vc.txt
@@ -0,0 +1,75 @@
+./api/armCOMM.h
+./api/armCOMM_BitDec_s.h
+./api/armCOMM_Bitstream.h
+./api/armCOMM_IDCT_s.h
+./api/armCOMM_IDCTTable.h
+./api/armCOMM_MaskTable.h
+./api/armCOMM_s.h
+./api/armCOMM_Version.h
+./api/armOMX_ReleaseVersion.h
+./api/omxtypes.h
+./api/omxtypes_s.h
+./src/armCOMM_IDCTTable.c
+./src/armCOMM_MaskTable.c
+./vc/api/armVC.h
+./vc/api/armVCCOMM_s.h
+./vc/api/omxVC.h
+./vc/api/omxVC_s.h
+./vc/comm/src/omxVCCOMM_Copy16x16_s.s
+./vc/comm/src/omxVCCOMM_Copy8x8_s.s
+./vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
+./vc/m4p10/api/armVCM4P10_CAVLCTables.h
+./vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_CAVLCTables.c
+./vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
+./vc/m4p10/src/armVCM4P10_DequantTables_s.s
+./vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
+./vc/m4p10/src/armVCM4P10_QuantTables_s.s
+./vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
+./vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
+./vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
+./vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
+./vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
+./vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
+./vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
+./vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
+./vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
+./vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
+./vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
+./vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
+./vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
+./vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+./vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
+./vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
+./vc/m4p2/src/armVCM4P2_Clip8_s.s
+./vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
+./vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
+./vc/m4p2/src/armVCM4P2_Lookup_Tables.c
+./vc/m4p2/src/armVCM4P2_SetPredDir_s.s
+./vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
+./vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
+./vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
+./vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
+./vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
+./vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
+./vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
+./vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
+./vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
+./vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
+./vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
+./vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
+./vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
+./vc/src/armVC_Version.c \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM.c
new file mode 100755
index 0000000..e572a89
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM.c
@@ -0,0 +1,936 @@
+/**
+ *
+ * File Name: armCOMM.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 9641
+ * Date: Thursday, February 7, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Defines Common APIs used across OpenMAX API's
+ */
+
+#include "omxtypes.h"
+#include "armCOMM.h"
+
+/***********************************************************************/
+ /* Miscellaneous Arithmetic operations */
+
+/**
+ * Function: armRoundFloatToS16
+ *
+ * Description:
+ * Rounds a double precision value to the nearest integer (halves go away
+ * from zero) and converts it to OMX_S16. No saturation is performed.
+ *
+ * Parameters:
+ * [in] Value Value to be converted
+ *
+ * Return Value:
+ * [out] rounded value in OMX_S16 format
+ *
+ */
+
+OMX_S16 armRoundFloatToS16 (OMX_F64 Value)
+{
+    /* Add half a unit in the direction of the sign; the cast then truncates. */
+    const OMX_F64 bias = (Value > 0) ? .5 : -.5;
+
+    return (OMX_S16)(Value + bias);
+}
+
+/**
+ * Function: armRoundFloatToS32
+ *
+ * Description:
+ * Rounds a double precision value to the nearest integer (halves go away
+ * from zero) and converts it to OMX_S32. No saturation is performed.
+ *
+ * Parameters:
+ * [in] Value Value to be converted
+ *
+ * Return Value:
+ * [out] rounded value in OMX_S32 format
+ *
+ */
+
+OMX_S32 armRoundFloatToS32 (OMX_F64 Value)
+{
+    /* Add half a unit in the direction of the sign; the cast then truncates. */
+    const OMX_F64 bias = (Value > 0) ? .5 : -.5;
+
+    return (OMX_S32)(Value + bias);
+}
+/**
+ * Function: armSatRoundFloatToS16
+ *
+ * Description:
+ * Rounds a double precision value to the nearest integer (halves go away
+ * from zero) and clamps the result to [OMX_MIN_S16, OMX_MAX_S16].
+ *
+ * Parameters:
+ * [in] Value Value to be converted
+ *
+ * Return Value:
+ * [out] rounded, saturated value in OMX_S16 format
+ *
+ */
+
+OMX_S16 armSatRoundFloatToS16 (OMX_F64 Value)
+{
+    if (Value > 0)
+    {
+        /* Positive side: only the upper limit can be exceeded. */
+        Value += 0.5;
+        return (Value > (OMX_S16)OMX_MAX_S16) ? (OMX_S16)OMX_MAX_S16
+                                              : (OMX_S16)Value;
+    }
+
+    /* Zero or negative side: only the lower limit can be exceeded. */
+    Value -= 0.5;
+    return (Value < (OMX_S16)OMX_MIN_S16) ? (OMX_S16)OMX_MIN_S16
+                                          : (OMX_S16)Value;
+}
+
+/**
+ * Function: armSatRoundFloatToS32
+ *
+ * Description:
+ * Rounds a double precision value to the nearest integer (halves go away
+ * from zero) and clamps the result to [OMX_MIN_S32, OMX_MAX_S32].
+ *
+ * Parameters:
+ * [in] Value Value to be converted
+ *
+ * Return Value:
+ * [out] rounded, saturated value in OMX_S32 format
+ *
+ */
+
+OMX_S32 armSatRoundFloatToS32 (OMX_F64 Value)
+{
+    if (Value > 0)
+    {
+        /* Positive side: only the upper limit can be exceeded. */
+        Value += 0.5;
+        return (Value > (OMX_S32)OMX_MAX_S32) ? (OMX_S32)OMX_MAX_S32
+                                              : (OMX_S32)Value;
+    }
+
+    /* Zero or negative side: only the lower limit can be exceeded. */
+    Value -= 0.5;
+    return (Value < (OMX_S32)OMX_MIN_S32) ? (OMX_S32)OMX_MIN_S32
+                                          : (OMX_S32)Value;
+}
+
+/**
+ * Function: armSatRoundFloatToU16
+ *
+ * Description:
+ * Converts a double precision value into an unsigned short int after
+ * rounding (0.5 is added, then the value is truncated) and saturation to
+ * [0, OMX_MAX_U16]. Negative inputs and NaN yield 0.
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U16 format
+ *
+ */
+
+OMX_U16 armSatRoundFloatToU16 (OMX_F64 Value)
+{
+    Value += 0.5;
+
+    if(Value > (OMX_U16)OMX_MAX_U16 )
+    {
+        return (OMX_U16)OMX_MAX_U16;
+    }
+
+    /* Saturate at the lower bound too: converting a value <= -1 (or NaN) to
+     * an unsigned type is undefined. The negated test also catches NaN. */
+    if(!(Value > 0))
+    {
+        return 0;
+    }
+
+    return (OMX_U16)Value;
+}
+
+/**
+ * Function: armSatRoundFloatToU32
+ *
+ * Description:
+ * Converts a double precision value into an unsigned int after rounding
+ * (0.5 is added, then the value is truncated) and saturation to
+ * [0, OMX_MAX_U32]. Negative inputs and NaN yield 0.
+ *
+ * Parameters:
+ * [in] Value Float value to be converted
+ *
+ * Return Value:
+ * [out] converted value in OMX_U32 format
+ *
+ */
+
+OMX_U32 armSatRoundFloatToU32 (OMX_F64 Value)
+{
+    Value += 0.5;
+
+    if(Value > (OMX_U32)OMX_MAX_U32 )
+    {
+        return (OMX_U32)OMX_MAX_U32;
+    }
+
+    /* Saturate at the lower bound too: converting a value <= -1 (or NaN) to
+     * an unsigned type is undefined. The negated test also catches NaN. */
+    if(!(Value > 0))
+    {
+        return 0;
+    }
+
+    return (OMX_U32)Value;
+}
+
+/**
+ * Function: armRoundFloatToS64
+ *
+ * Description:
+ * Rounds a double precision value to the nearest integer (halves go away
+ * from zero) and converts it to OMX_S64. No saturation is performed.
+ *
+ * Parameters:
+ * [in] Value Value to be converted
+ *
+ * Return Value:
+ * [out] rounded value in OMX_S64 format
+ *
+ */
+
+OMX_S64 armRoundFloatToS64 (OMX_F64 Value)
+{
+    /* Add half a unit in the direction of the sign; the cast then truncates. */
+    const OMX_F64 bias = (Value > 0) ? .5 : -.5;
+
+    return (OMX_S64)(Value + bias);
+}
+
+/**
+ * Function: armSignCheck
+ *
+ * Description:
+ * Reports the sign of a 16-bit value.
+ *
+ * Parameters:
+ * [in] var Variable to be checked
+ *
+ * Return Value:
+ * OMX_INT -- 1 if var is positive
+ *            0 if var is zero
+ *           -1 if var is negative
+ */
+
+OMX_INT armSignCheck (
+    OMX_S16 var
+)
+
+{
+    /* Each comparison evaluates to 0 or 1, so the difference is -1, 0 or 1. */
+    return (var > 0) - (var < 0);
+}
+
+/**
+ * Function: armClip
+ *
+ * Description: Limits src to the range [min, max]. The upper bound is
+ * tested first, so max wins when the bounds are inverted and src exceeds
+ * both.
+ *
+ * Parameters:
+ * [in] min lower bound
+ * [in] max upper bound
+ * [in] src value to be clipped
+ *
+ * Return Value:
+ * OMX_S32 -- returns clipped value
+ */
+
+OMX_S32 armClip (
+    OMX_INT min,
+    OMX_INT max,
+    OMX_S32 src
+)
+
+{
+    if (src > max)
+    {
+        return max;
+    }
+    if (src < min)
+    {
+        return min;
+    }
+    return src;
+}
+
+/**
+ * Function: armClip_F32
+ *
+ * Description: Limits a single precision value to the range [min, max].
+ * The upper bound is tested first.
+ *
+ * Parameters:
+ * [in] min lower bound
+ * [in] max upper bound
+ * [in] src value to be clipped
+ *
+ * Return Value:
+ * OMX_F32 -- returns clipped value
+ */
+
+OMX_F32 armClip_F32 (
+    OMX_F32 min,
+    OMX_F32 max,
+    OMX_F32 src
+)
+
+{
+    if (src > max)
+    {
+        return max;
+    }
+    if (src < min)
+    {
+        return min;
+    }
+    return src;
+}
+
+/**
+ * Function: armShiftSat_F32
+ *
+ * Description: Divides a float value by 2^shift, rounds it (adds 0.5 and
+ * truncates) and saturates it to the unsigned range of satBits bits.
+ * The second parameter acts like a right shift of the corresponding
+ * integer value.
+ *
+ * Parameters:
+ * [in] v Number to be operated upon; values <= 0 and NaN give 0
+ * [in] shift Divides the input "v" by "2^shift"; must be a valid
+ *            shift count for "1 << shift" (0..30)
+ * [in] satBits Final range is [0, 2^satBits); values <= 0 give 0 and
+ *              values >= 32 select the full OMX_U32 range
+ *
+ * Return Value:
+ * OMX_U32 -- returns "shifted" saturated value
+ */
+
+OMX_U32 armShiftSat_F32(OMX_F32 v, OMX_INT shift, OMX_INT satBits)
+{
+    OMX_U32 maxV;
+    OMX_F32 vShifted, vRounded;
+
+    /* Written as a negated test so that NaN is rejected as well. */
+    if(!(v > 0))
+        return 0;
+
+    /* [0, 2^satBits) contains only 0 when satBits <= 0; this also avoids
+     * shifting a 32-bit value by 32 or more. */
+    if(satBits <= 0)
+        return 0;
+
+    maxV = (satBits >= 32) ? (OMX_U32)OMX_MAX_U32
+                           : ((((OMX_U32)1) << satBits) - 1);
+
+    vShifted = v / (OMX_F32)(1 << shift);
+    vRounded = (OMX_F32)(vShifted + 0.5);
+
+    /* Saturate while still in floating point: converting a float that does
+     * not fit in OMX_U32 is undefined. Every float strictly below
+     * (OMX_F32)maxV truncates to an integer <= maxV. */
+    if(vRounded >= (OMX_F32)maxV)
+        return maxV;
+
+    return (OMX_U32)vRounded;
+}
+
+/**
+ * Function: armSwapElem
+ *
+ * Description:
+ * Exchanges the contents of two elements byte by byte. Each element is
+ * <elemSize> bytes long; nothing is swapped when elemSize <= 0.
+ *
+ * Parameters:
+ * [in,out] pBuf1 first element
+ * [in,out] pBuf2 second element
+ * [in] elemSize size of one element in bytes
+ *
+ * Return Value:
+ * OMXResult -- OMX_Sts_NoErr after the swap; a NULL pointer is rejected
+ *              through armRetArgErrIf with OMX_Sts_BadArgErr
+ */
+OMXResult armSwapElem(
+    OMX_U8 *pBuf1,
+    OMX_U8 *pBuf2,
+    OMX_INT elemSize
+    )
+{
+    OMX_INT remaining;
+    OMX_U8 saved;
+    armRetArgErrIf(!pBuf1 || !pBuf2, OMX_Sts_BadArgErr);
+
+    /* Walk both buffers in step, from the first byte to the last. */
+    for(remaining = elemSize; remaining > 0; remaining--, pBuf1++, pBuf2++)
+    {
+        saved = *pBuf1;
+        *pBuf1 = *pBuf2;
+        *pBuf2 = saved;
+    }
+    return OMX_Sts_NoErr;
+}
+
+/**
+ * Function: armMedianOf3
+ *
+ * Description: Finds the median of three numbers as
+ * max(min(f, s), min(max(f, s), t)).
+ *
+ * Parameters:
+ * [in] fEntry First entry
+ * [in] sEntry Second entry
+ * [in] tEntry Third entry
+ *
+ * Return Value:
+ * OMX_S32 -- returns the median value
+ */
+
+OMX_S32 armMedianOf3 (
+    OMX_S32 fEntry,
+    OMX_S32 sEntry,
+    OMX_S32 tEntry
+)
+{
+    const OMX_S32 lowerOfPair = armMin (fEntry, sEntry);
+    const OMX_S32 upperOfPair = armMax (fEntry, sEntry);
+    /* The third entry can only lower the upper candidate. */
+    const OMX_S32 cappedUpper = armMin (upperOfPair, tEntry);
+
+    return (armMax (lowerOfPair, cappedUpper));
+}
+
+/**
+ * Function: armLogSize
+ *
+ * Description: Counts the bits needed to hold an unsigned 16-bit value.
+ *
+ * Parameters:
+ * [in] value Value to be measured
+ *
+ * Return Value:
+ * OMX_U8 -- the number of right shifts by one needed to reduce the value
+ *           to zero, i.e. the smallest k>=0 such that value < (1<<k).
+ *           Returns 0 for a value of 0.
+ */
+
+OMX_U8 armLogSize (
+    OMX_U16 value
+)
+{
+    OMX_U8 bits = 0;
+
+    while (value > 0)
+    {
+        value = value >> 1;
+        bits++;
+    }
+    return bits;
+}
+
+/***********************************************************************/
+ /* Saturating Arithmetic operations */
+
+/**
+ * Function :armSatAdd_S32()
+ *
+ * Description :
+ * Returns the result of saturated addition of the two inputs Value1, Value2.
+ * The sum is clamped to [OMX_MIN_S32, OMX_MAX_S32].
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ *
+ **/
+
+OMX_S32 armSatAdd_S32(OMX_S32 Value1,OMX_S32 Value2)
+{
+    /* Compare against the limits BEFORE adding: overflowing a signed
+     * addition is undefined, so it cannot be detected reliably afterwards.
+     * Neither subtraction below can overflow because of the sign test
+     * that guards it. */
+    if( (Value2 > 0) && (Value1 > OMX_MAX_S32 - Value2) )
+    {
+        /*Result saturates on the positive side*/
+        return OMX_MAX_S32;
+    }
+
+    if( (Value2 < 0) && (Value1 < OMX_MIN_S32 - Value2) )
+    {
+        /*Result saturates on the negative side*/
+        return OMX_MIN_S32;
+    }
+
+    return Value1 + Value2;
+}
+
+/**
+ * Function :armSatAdd_S64()
+ *
+ * Description :
+ * Returns the result of saturated addition of the two inputs Value1, Value2.
+ * The sum is clamped to [OMX_MIN_S64, OMX_MAX_S64].
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ *
+ **/
+
+OMX_S64 armSatAdd_S64(OMX_S64 Value1,OMX_S64 Value2)
+{
+    /* The limits are copied into OMX_S64 objects so that the comparisons
+     * below are carried out in the signed 64-bit type whatever the type of
+     * the macro constants is (their definition is not in this file). */
+    const OMX_S64 maxS64 = OMX_MAX_S64;
+    const OMX_S64 minS64 = OMX_MIN_S64;
+
+    /* Compare against the limits BEFORE adding: overflowing a signed
+     * addition is undefined, so it cannot be detected reliably afterwards. */
+    if( (Value2 > 0) && (Value1 > maxS64 - Value2) )
+    {
+        /*Result saturates on the positive side*/
+        return maxS64;
+    }
+
+    if( (Value2 < 0) && (Value1 < minS64 - Value2) )
+    {
+        /*Result saturates on the negative side*/
+        return minS64;
+    }
+
+    return Value1 + Value2;
+}
+
+/** Function :armSatSub_S32()
+ *
+ * Description :
+ * Returns the result of saturated subtraction Value1 - Value2.
+ * The difference is clamped to [OMX_MIN_S32, OMX_MAX_S32].
+ *
+ * Parameters:
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatSub_S32(OMX_S32 Value1,OMX_S32 Value2)
+{
+    /* Compare against the limits BEFORE subtracting: overflowing a signed
+     * subtraction is undefined, so it cannot be detected reliably
+     * afterwards. Neither addition below can overflow because of the sign
+     * test that guards it. */
+    if( (Value2 < 0) && (Value1 > OMX_MAX_S32 + Value2) )
+    {
+        /*Result saturates on the positive side*/
+        return OMX_MAX_S32;
+    }
+
+    if( (Value2 > 0) && (Value1 < OMX_MIN_S32 + Value2) )
+    {
+        /*Result saturates on the negative side*/
+        return OMX_MIN_S32;
+    }
+
+    return Value1 - Value2;
+}
+
+/**
+ * Function :armSatMac_S32()
+ *
+ * Description :
+ * Multiplies Value1 by Value2 and accumulates the product into Mac with
+ * saturation (armSatAdd_S32).
+ *
+ * Parameters:
+ * [in] Mac Accumulator
+ * [in] Value1 First Operand
+ * [in] Value2 Second Operand
+ *
+ * Return:
+ * [out] Result of operation
+ **/
+
+OMX_S32 armSatMac_S32(OMX_S32 Mac,OMX_S16 Value1,OMX_S16 Value2)
+{
+    const OMX_S32 product = (OMX_S32)(Value1*Value2);
+
+    return armSatAdd_S32( Mac , product );
+}
+
+/**
+ * Function :armSatMac_S16S32_S32
+ *
+ * Description :
+ * Returns the result of saturated MAC operation of the three inputs delayElem, filTap , mac
+ *
+ * mac = mac + Saturate_in_32Bits(delayElem * filTap)
+ *
+ * Parameters:
+ * [in] mac Accumulator
+ * [in] delayElem 32 bit Operand
+ * [in] filTap 16 bit Operand
+ *
+ * Return:
+ * [out] mac Result of operation
+ *
+ **/
+
+OMX_S32 armSatMac_S16S32_S32(OMX_S32 mac, OMX_S32 delayElem, OMX_S16 filTap )
+{
+    /* armSatMulS16S32_S32 gives the product scaled down by 16 bits; it is
+     * used only to decide whether the full product needs saturating. */
+    const OMX_S32 scaled = armSatMulS16S32_S32(filTap,delayElem);
+    OMX_S32 product;
+
+    if ( scaled > OMX_MAX_S16 )
+    {
+        product = OMX_MAX_S32;
+    }
+    else if( scaled < OMX_MIN_S16 )
+    {
+        product = OMX_MIN_S32;
+    }
+    else
+    {
+        product = delayElem * filTap;
+    }
+
+    return armSatAdd_S32(mac,product);
+}
+
+
+/**
+ * Function :armSatRoundRightShift_S32_S16
+ *
+ * Description :
+ * Shifts the input through armSatRoundLeftShift_S32 with the negated shift
+ * count (a rounded right shift for positive "shift") and saturates the
+ * result to 16 bits.
+ *
+ * Parameters:
+ * [in] input The input to be operated on
+ * [in] shift The shift number
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+
+OMX_S16 armSatRoundRightShift_S32_S16(OMX_S32 input, OMX_INT shift)
+{
+    const OMX_S32 shifted = armSatRoundLeftShift_S32(input,-shift);
+
+    if ( shifted > OMX_MAX_S16 )
+    {
+        return (OMX_S16)OMX_MAX_S16;
+    }
+    if ( shifted < OMX_MIN_S16 )
+    {
+        return (OMX_S16)OMX_MIN_S16;
+    }
+    return (OMX_S16)shifted;
+}
+
+/**
+ * Function :armSatRoundLeftShift_S32()
+ *
+ * Description :
+ * Returns the result of a saturating left shift of Value, or of a
+ * rounded right shift when Shift is negative.
+ *
+ * Parameters:
+ * [in] Value Operand
+ * [in] Shift Shift count; negative values shift right by -Shift
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatRoundLeftShift_S32(OMX_S32 Value, OMX_INT Shift)
+{
+    if (Shift < 0)
+    {
+        const OMX_INT rightShift = -Shift;
+
+        /* Add half of the final unit (saturating) before shifting right. */
+        Value = armSatAdd_S32(Value, (1 << (rightShift - 1)));
+        return Value >> rightShift;
+    }
+
+    /* Each saturating self-addition is one saturating left shift by one. */
+    while (Shift-- > 0)
+    {
+        Value = armSatAdd_S32(Value, Value);
+    }
+    return Value;
+}
+
+/**
+ * Function :armSatRoundLeftShift_S64()
+ *
+ * Description :
+ * Returns the result of a saturating left shift of Value, or of a
+ * rounded right shift when Shift is negative.
+ *
+ * Parameters:
+ * [in] Value Operand
+ * [in] Shift Shift count; negative values shift right by -Shift
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S64 armSatRoundLeftShift_S64(OMX_S64 Value, OMX_INT Shift)
+{
+    if (Shift < 0)
+    {
+        const OMX_INT rightShift = -Shift;
+
+        /* Add half of the final unit (saturating) before shifting right. */
+        Value = armSatAdd_S64(Value, ((OMX_S64)1 << (rightShift - 1)));
+        return Value >> rightShift;
+    }
+
+    /* Each saturating self-addition is one saturating left shift by one. */
+    while (Shift-- > 0)
+    {
+        Value = armSatAdd_S64(Value, Value);
+    }
+    return Value;
+}
+
+/**
+ * Function :armSatMulS16S32_S32()
+ *
+ * Description :
+ * Multiplies an S16 value by an S32 value and returns the product
+ * scaled down by 16 bits in an S32 container. The S32 operand is split
+ * into a signed high half and an unsigned low half; the two partial
+ * products are combined with a saturating addition.
+ *
+ * Parameters:
+ * [in] input1 Operand 1 (16 bit)
+ * [in] input2 Operand 2 (32 bit)
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+
+OMX_S32 armSatMulS16S32_S32(OMX_S16 input1,OMX_S32 input2)
+{
+    OMX_S16 hi2,lo1;
+    OMX_U16 lo2;
+
+    OMX_S32 temp1,temp2;
+    OMX_S32 result;
+
+    lo1 = input1;
+
+    hi2 = ( input2 >> 16 );
+    /* Take the low half with a mask on the unsigned value; left-shifting
+     * the signed operand is undefined when it is negative or overflows. */
+    lo2 = (OMX_U16)( (OMX_U32)input2 & 0xFFFF );
+
+    temp1 = hi2 * lo1;
+    temp2 = ( lo2* lo1 ) >> 16;
+
+    result = armSatAdd_S32(temp1,temp2);
+
+    return result;
+}
+
+/**
+ * Function :armSatMulS32S32_S32()
+ *
+ * Description :
+ * Multiplies two S32 values and returns an approximation of the upper
+ * 32 bits of the product in an S32 container. Each operand is split into
+ * a signed high half and an unsigned low half; the hi*hi product and the
+ * two cross products (each scaled down by 16 bits) are combined with
+ * saturating additions. The lo*lo partial product is not included.
+ *
+ * Parameters:
+ * [in] input1 Operand 1
+ * [in] input2 Operand 2
+ *
+ * Return:
+ * [out] Result of operation
+ *
+ **/
+
+OMX_S32 armSatMulS32S32_S32(OMX_S32 input1,OMX_S32 input2)
+{
+    OMX_S16 hi1,hi2;
+    OMX_U16 lo1,lo2;
+
+    OMX_S32 temp1,temp2,temp3;
+    OMX_S32 result;
+
+    /* The low halves are taken with a mask on the unsigned value;
+     * left-shifting a signed operand is undefined when it is negative or
+     * overflows. */
+    hi1 = ( input1 >> 16 );
+    lo1 = (OMX_U16)( (OMX_U32)input1 & 0xFFFF );
+
+    hi2 = ( input2 >> 16 );
+    lo2 = (OMX_U16)( (OMX_U32)input2 & 0xFFFF );
+
+    temp1 = hi1 * hi2;
+    temp2 = ( hi1* lo2 ) >> 16;
+    temp3 = ( hi2* lo1 ) >> 16;
+
+    result = armSatAdd_S32(temp1,temp2);
+    result = armSatAdd_S32(result,temp3);
+
+    return result;
+}
+
+/**
+ * Function :armIntDivAwayFromZero()
+ *
+ * Description : Integer division with rounding to the nearest integer.
+ *               Half-integer values are rounded away from zero.
+ *               For example 3//2 is rounded to 2, and -3//2 is
+ *               rounded to -2. The quotient is formed in double
+ *               precision.
+ *
+ * Parameters:
+ * [in] Num Numerator
+ * [in] Deno Denominator
+ *
+ * Return:
+ * [out] Result of operation Num//Deno
+ *
+ **/
+
+OMX_S32 armIntDivAwayFromZero (OMX_S32 Num, OMX_S32 Deno)
+{
+    OMX_F64 quotient = ((OMX_F64)Num)/((OMX_F64)Deno);
+
+    /* Push the quotient half a unit away from zero; the cast truncates. */
+    quotient += (quotient >= 0) ? 0.5 : -0.5;
+
+    return (OMX_S32)(quotient);
+}
+
+
+/*End of File*/
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_Bitstream.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_Bitstream.c
new file mode 100755
index 0000000..9ef9319
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_Bitstream.c
@@ -0,0 +1,329 @@
+/**
+ *
+ * File Name: armCOMM_Bitstream.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 9641
+ * Date: Thursday, February 7, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Defines bitstream encode and decode functions common to all codecs
+ */
+
+#include "omxtypes.h"
+#include "armCOMM.h"
+#include "armCOMM_Bitstream.h"
+
+/***************************************
+ * Fixed bit length Decode
+ ***************************************/
+
+/**
+ * Function: armLookAheadBits()
+ *
+ * Description:
+ * Get the next N bits from the bitstream without advancing the bitstream pointer.
+ * Five bytes starting at *ppBitStream are always read, whatever N is.
+ *
+ * Parameters:
+ * [in] **ppBitStream pointer to the pointer to the current byte
+ * [in] *pOffset bit position within that byte, 0...7
+ * [in] N=1...32 number of bits to return
+ *
+ * Returns Value: the N bits, right-aligned
+ */
+
+OMX_U32 armLookAheadBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N)
+{
+    const OMX_U8 *pBitStream = *ppBitStream;
+    OMX_INT Offset = *pOffset;
+    OMX_U32 Value;
+
+    armAssert(Offset>=0 && Offset<=7);
+    armAssert(N>=1 && N<=32);
+
+    /* Read next 32 bits from stream. Each byte is widened to OMX_U32 before
+     * shifting: a byte >= 0x80 promoted to signed int and shifted left by
+     * 24 would overflow. */
+    Value = ((OMX_U32)pBitStream[0] << 24) | ((OMX_U32)pBitStream[1] << 16) |
+            ((OMX_U32)pBitStream[2] << 8 ) |  (OMX_U32)pBitStream[3];
+    /* Drop the bits already consumed and refill from the fifth byte. */
+    Value = (Value << Offset ) | ((OMX_U32)pBitStream[4] >> (8-Offset));
+
+    /* Return N bits */
+    return Value >> (32-N);
+}
+
+
+/**
+ * Function: armGetBits()
+ *
+ * Description:
+ * Read N bits from the bitstream and advance the bitstream position by N
+ * bits. N == 0 returns 0 without touching the stream. Otherwise five bytes
+ * starting at *ppBitStream are always read, whatever N is.
+ *
+ * Parameters:
+ * [in] *ppBitStream pointer to the current byte
+ * [in] *pOffset bit position within that byte, 0...7
+ * [in] N=1..32 number of bits to read
+ *
+ * [out] *ppBitStream advanced to the byte holding the next unread bit
+ * [out] *pOffset bit position within that byte, 0...7
+ * Returns Value: the N bits, right-aligned
+ */
+
+
+OMX_U32 armGetBits(const OMX_U8 **ppBitStream, OMX_INT *pOffset, OMX_INT N)
+{
+    const OMX_U8 *pBitStream = *ppBitStream;
+    OMX_INT Offset = *pOffset;
+    OMX_U32 Value;
+
+    if(N == 0)
+    {
+        return 0;
+    }
+
+    armAssert(Offset>=0 && Offset<=7);
+    armAssert(N>=1 && N<=32);
+
+    /* Read next 32 bits from stream. Each byte is widened to OMX_U32 before
+     * shifting: a byte >= 0x80 promoted to signed int and shifted left by
+     * 24 would overflow. */
+    Value = ((OMX_U32)pBitStream[0] << 24) | ((OMX_U32)pBitStream[1] << 16) |
+            ((OMX_U32)pBitStream[2] << 8 ) |  (OMX_U32)pBitStream[3];
+    /* Drop the bits already consumed and refill from the fifth byte. */
+    Value = (Value << Offset ) | ((OMX_U32)pBitStream[4] >> (8-Offset));
+
+    /* Advance bitstream pointer by N bits */
+    Offset += N;
+    *ppBitStream = pBitStream + (Offset>>3);
+    *pOffset = Offset & 7;
+
+    /* Return N bits */
+    return Value >> (32-N);
+}
+
+/**
+ * Function: armByteAlign()
+ *
+ * Description:
+ * Align the pointer *ppBitStream to the next byte boundary. A position
+ * whose bit offset is not positive is left unchanged.
+ *
+ * Parameters:
+ * [in] *ppBitStream pointer to the current byte
+ * [in] *pOffset bit position within that byte
+ *
+ * [out] *ppBitStream advanced by one byte if *pOffset was positive
+ * [out] *pOffset reset to 0 in that case
+ *
+ **/
+
+OMXVoid armByteAlign(const OMX_U8 **ppBitStream,OMX_INT *pOffset)
+{
+    if(*pOffset <= 0)
+    {
+        return;
+    }
+
+    /* Part of the current byte is consumed: move on to the next one. */
+    *ppBitStream += 1;
+    *pOffset = 0;
+}
+
+/**
+ * Function: armSkipBits()
+ *
+ * Description:
+ * Advance the bitstream position by N bits without reading any data.
+ *
+ * Parameters:
+ * [in] *ppBitStream pointer to the current byte
+ * [in] *pOffset bit position within that byte
+ * [in] N number of bits to skip
+ *
+ * [out] *ppBitStream advanced by the whole bytes skipped
+ * [out] *pOffset remaining bit position, 0...7
+ *
+ **/
+
+
+OMXVoid armSkipBits(const OMX_U8 **ppBitStream,OMX_INT *pOffset,OMX_INT N)
+{
+    /* Bit position measured from the start of the current byte. */
+    const OMX_INT bitPos = *pOffset + N;
+
+    *ppBitStream = *ppBitStream + (bitPos>>3);
+    *pOffset = bitPos & 7;
+}
+
+/***************************************
+ * Variable bit length Decode
+ ***************************************/
+
+/**
+ * Function: armUnPackVLC32()
+ *
+ * Description:
+ * Variable length decode of variable length symbol (max size 32 bits) read from
+ * the bit stream pointed by *ppBitStream at *pOffset by using the table
+ * pointed by pCodeBook. The table is scanned in order and must end with an
+ * entry whose codeLen is 0; the first entry whose code word equals the
+ * leading codeLen bits of the stream wins. Five bytes starting at
+ * *ppBitStream are always read.
+ *
+ * Parameters:
+ * [in] *ppBitStream pointer to the current byte
+ * [in] *pOffset bit position within that byte, 0...7
+ * [in] pCodeBook code book, terminated by an entry with codeLen == 0
+ *
+ * [out] *ppBitStream advanced past the matched code word (unchanged on failure)
+ * [out] *pOffset bit position after the matched code word (unchanged on failure)
+ *
+ * Returns : Code Book Index if successful.
+ *         : ARM_NO_CODEBOOK_INDEX if search fails (returned through the
+ *           OMX_U16 return type).
+ **/
+#ifndef C_OPTIMIZED_IMPLEMENTATION
+
+OMX_U16 armUnPackVLC32(
+    const OMX_U8 **ppBitStream,
+    OMX_INT *pOffset,
+    const ARM_VLC32 *pCodeBook
+)
+{
+    const OMX_U8 *pBitStream = *ppBitStream;
+    OMX_INT Offset = *pOffset;
+    OMX_U32 Value;
+    OMX_INT Index;
+
+    armAssert(Offset>=0 && Offset<=7);
+
+    /* Read next 32 bits from stream. Each byte is widened to OMX_U32 before
+     * shifting: a byte >= 0x80 promoted to signed int and shifted left by
+     * 24 would overflow. */
+    Value = ((OMX_U32)pBitStream[0] << 24) | ((OMX_U32)pBitStream[1] << 16) |
+            ((OMX_U32)pBitStream[2] << 8 ) |  (OMX_U32)pBitStream[3];
+    /* Drop the bits already consumed and refill from the fifth byte. */
+    Value = (Value << Offset ) | ((OMX_U32)pBitStream[4] >> (8-Offset));
+
+    /* Search through the codebook */
+    for (Index=0; pCodeBook->codeLen != 0; Index++)
+    {
+        if (pCodeBook->codeWord == (Value >> (32 - pCodeBook->codeLen)))
+        {
+            /* Consume exactly the bits of the matched code word. */
+            Offset = Offset + pCodeBook->codeLen;
+            *ppBitStream = pBitStream + (Offset >> 3) ;
+            *pOffset = Offset & 7;
+
+            return Index;
+        }
+        pCodeBook++;
+    }
+
+    /* No code match found */
+    return ARM_NO_CODEBOOK_INDEX;
+}
+
+#endif
+
+/***************************************
+ * Fixed bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackBits
+ *
+ * Description:
+ * Append the low codeLength bits of codeWord to the bitstream, most
+ * significant bit first. The bits already present in the current byte
+ * above the write position are preserved.
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte
+ *                  in the bit stream.
+ * [in] pOffset pointer to the bit position in the byte
+ *              pointed by *ppBitStream. Valid within 0
+ *              to 7.
+ * [in] codeWord Code word that needs to be inserted into the
+ *               bitstream
+ * [in] codeLength Length of the code word, valid range 1...32
+ *
+ * [out] ppBitStream *ppBitStream is updated so that it points to the
+ *                   current (partially filled) byte in the bit
+ *                   stream buffer.
+ * [out] pOffset *pOffset is updated so that it holds the
+ *               current bit position in the byte pointed by
+ *               *ppBitStream.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr on success; an out-of-range offset or length is rejected
+ * through armRetArgErrIf with OMX_Sts_BadArgErr.
+ *
+ */
+
+OMXResult armPackBits (
+    OMX_U8 **ppBitStream,
+    OMX_INT *pOffset,
+    OMX_U32 codeWord,
+    OMX_INT codeLength
+)
+{
+    OMX_U8 *pOut = *ppBitStream;
+    OMX_INT bitPos = *pOffset;
+    OMX_INT freeBits;   /* unwritten bits left in the current byte */
+    OMX_U32 pending;    /* byte being assembled */
+
+    /* checking argument validity */
+    armRetArgErrIf(bitPos < 0, OMX_Sts_BadArgErr);
+    armRetArgErrIf(bitPos > 7, OMX_Sts_BadArgErr);
+    armRetArgErrIf(codeLength < 1, OMX_Sts_BadArgErr);
+    armRetArgErrIf(codeLength > 32, OMX_Sts_BadArgErr);
+
+    freeBits = 8 - bitPos;
+
+    /* Left-align the code word, keep the bits already written in the
+     * first byte and merge the leading code bits in below them. */
+    codeWord = codeWord << (32-codeLength);
+    pending = (pOut[0] >> freeBits) << freeBits;
+    pending = pending | (codeWord >> (32-freeBits));
+
+    /* Write out whole bytes; after the first one every byte starts empty. */
+    for ( ; freeBits <= codeLength; freeBits = 8)
+    {
+        *pOut++ = (OMX_U8)pending;
+        codeWord = codeWord << freeBits;
+        codeLength = codeLength - freeBits;
+        bitPos = 0;
+        pending = codeWord >> 24;
+    }
+
+    /* Write out final partial byte */
+    *pOut = (OMX_U8)pending;
+    *ppBitStream = pOut;
+    *pOffset = bitPos + codeLength;
+
+    return OMX_Sts_NoErr;
+}
+
+/***************************************
+ * Variable bit length Encode
+ ***************************************/
+
+/**
+ * Function: armPackVLC32
+ *
+ * Description:
+ * Pack a VLC code word into the bitstream by forwarding its code word
+ * and code length to armPackBits.
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte
+ *                  in the bit stream.
+ * [in] pBitOffset pointer to the bit position in the byte
+ *                 pointed by *ppBitStream. Valid within 0
+ *                 to 7.
+ * [in] code VLC code word that needs to be inserted into the
+ *           bitstream
+ *
+ * [out] ppBitStream *ppBitStream is updated so that it points to the
+ *                   current byte in the bit stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it holds the
+ *                  current bit position in the byte pointed by
+ *                  *ppBitStream.
+ *
+ * Return Value:
+ * The result of armPackBits.
+ *
+ */
+
+OMXResult armPackVLC32 (
+    OMX_U8 **ppBitStream,
+    OMX_INT *pBitOffset,
+    ARM_VLC32 code
+)
+{
+    const OMXResult status =
+        armPackBits(ppBitStream, pBitOffset, code.codeWord, code.codeLen);
+
+    return status;
+}
+
+/*End of File*/
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_IDCTTable.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_IDCTTable.c
new file mode 100755
index 0000000..3f5e279
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_IDCTTable.c
@@ -0,0 +1,60 @@
+/**
+ *
+ * File Name: armCOMM_IDCTTable.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armCOMM_IDCTTable.c
+ * Brief: Defines Tables used in IDCT computation
+ *
+ */
+
+#include "armCOMM_IDCTTable.h"
+
+ /* Table of s(u)*A(u)*A(v)/16 at Q15
+ * s(u)=1.0 0 <= u <= 5
+ * s(6)=2.0
+ * s(7)=4.0
+ * A(0) = 2*sqrt(2)
+ * A(u) = 4*cos(u*pi/16) for (u!=0)
+ */
+
+__align(4) const OMX_U16 armCOMM_IDCTPreScale [64] = /* 8 rows of 8 entries: row index is v, column index is u */
+{
+ 0x4000, 0x58c5, 0x539f, 0x4b42, 0x4000, 0x3249, 0x4546, 0x46a1, /* v = 0: 0x4000 is 0.5 in Q15 = A(0)*A(0)/16 */
+ 0x58c5, 0x7b21, 0x73fc, 0x6862, 0x58c5, 0x45bf, 0x6016, 0x61f8, /* v = 1 */
+ 0x539f, 0x73fc, 0x6d41, 0x6254, 0x539f, 0x41b3, 0x5a82, 0x5c48, /* v = 2 */
+ 0x4b42, 0x6862, 0x6254, 0x587e, 0x4b42, 0x3b21, 0x5175, 0x530d, /* v = 3 */
+ 0x4000, 0x58c5, 0x539f, 0x4b42, 0x4000, 0x3249, 0x4546, 0x46a1, /* v = 4: identical to the v = 0 row */
+ 0x3249, 0x45bf, 0x41b3, 0x3b21, 0x3249, 0x2782, 0x366d, 0x377e, /* v = 5 */
+ 0x22a3, 0x300b, 0x2d41, 0x28ba, 0x22a3, 0x1b37, 0x257e, 0x263a, /* v = 6: no s() factor on rows; compare 0x22a3 here with 0x4546 at [0][6] */
+ 0x11a8, 0x187e, 0x1712, 0x14c3, 0x11a8, 0x0de0, 0x131d, 0x137d /* v = 7: the s(u) = 2 and 4 factors appear only in columns u = 6 and u = 7 */
+};
+ /* Above array armCOMM_IDCTPreScale, in Q23 format */
+const OMX_U32 armCOMM_IDCTPreScaleU32 [64] = /* same 64 entries, in the same order, as armCOMM_IDCTPreScale but with 8 more fractional bits */
+{
+ 0x400000, 0x58c543, 0x539eba, 0x4b418c, 0x400000, 0x3248d4, 0x4545ea, 0x46a157, /* 0x400000 is 0.5 in Q23 */
+ 0x58c543, 0x7b20d8, 0x73fbfc, 0x686214, 0x58c543, 0x45bf1f, 0x6015a5, 0x61f78b,
+ 0x539eba, 0x73fbfc, 0x6d413d, 0x6253a6, 0x539eba, 0x41b328, 0x5a827a, 0x5c4869,
+ 0x4b418c, 0x686214, 0x6253a6, 0x587de3, 0x4b418c, 0x3b20d8, 0x5174e0, 0x530d69,
+ 0x400000, 0x58c543, 0x539eba, 0x4b418c, 0x400000, 0x3248d4, 0x4545ea, 0x46a157,
+ 0x3248d4, 0x45bf1f, 0x41b328, 0x3b20d8, 0x3248d4, 0x27821d, 0x366d72, 0x377e6b,
+ 0x22a2f5, 0x300ad3, 0x2d413d, 0x28ba70, 0x22a2f5, 0x1b36b9, 0x257d86, 0x26398d,
+ 0x11a856, 0x187de3, 0x17121a, 0x14c35a, 0x11a856, 0x0ddf9b, 0x131cc7, 0x137ca2 /* the Q15 table holds these values rounded, e.g. 0x0ddf9b -> 0x0de0 */
+};
+
+const OMX_U16 armCOMM_IDCTCoef [4] = /* trigonometric constants; the hex values correspond to Q15 (value * 32768) */
+{
+ 0x5a82, /* InvSqrt2: 1/sqrt(2) ~= 0.70711 */
+ 0x30fc, /* SinPIBy8: sin(pi/8) ~= 0.38268 */
+ 0x7642, /* CosPIBy8: cos(pi/8) ~= 0.92388 */
+ 0x0000 /* NOTE(review): fourth entry is undocumented - presumably padding to 4 elements; confirm */
+};
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_MaskTable.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_MaskTable.c
new file mode 100755
index 0000000..09f88c3
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/src/armCOMM_MaskTable.c
@@ -0,0 +1,45 @@
+/* ----------------------------------------------------------------
+ *
+ *
+ * File Name: armCOMM_MaskTable.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Mask Table to mask the end of array.
+ *
+ */
+
+#include "omxtypes.h"
+
+#define MaskTableSize 72 /* 9 rows of 8 entries each */
+
+const OMX_U16 armCOMM_qMaskTable16[MaskTableSize] = /* row k (k = 0..8): first k entries 0xFFFF, remaining 8-k entries 0 */
+{
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* k = 0: nothing kept */
+ 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* k = 1 */
+ 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* k = 2 */
+ 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* k = 3 */
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, /* k = 4 */
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, /* k = 5 */
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, /* k = 6 */
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, /* k = 7 */
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF /* k = 8: everything kept */
+};
+
+const OMX_U8 armCOMM_qMaskTable8[MaskTableSize] = /* byte-wide mask table, 9 rows of 8: row k has its first k entries 0xFF, the rest 0 */
+{
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* k = 0: nothing kept */
+ 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* k = 1 */
+ 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* k = 2 */
+ 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, /* k = 3 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, /* k = 4 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, /* k = 5 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, /* k = 6 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, /* k = 7 */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF /* k = 8: everything kept */
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h
new file mode 100755
index 0000000..35b510b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVC.h
@@ -0,0 +1,1153 @@
+/**
+ *
+ * File Name: armVC.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armVC.h
+ * Brief: Declares API's/Basic Data types used across the OpenMAX Video domain
+ *
+ */
+
+
+#ifndef _armVideo_H_
+#define _armVideo_H_
+
+#include "omxVC.h"
+#include "armCOMM_Bitstream.h"
+
+/**
+ * ARM specific state structure to hold Motion Estimation information.
+ */
+
+struct m4p2_MESpec /* motion estimation state paired with the armVCM4P2_* (ISO/IEC 14496-2) declarations below */
+{
+ OMXVCM4P2MEParams MEParams; /* presumably the motion estimation parameters - type is not declared in this header; TODO confirm */
+ OMXVCM4P2MEMode MEMode; /* presumably the selected motion estimation mode - TODO confirm */
+};
+
+struct m4p10_MESpec /* counterpart of m4p2_MESpec built from the M4P10 (H.264 section of this header) types */
+{
+ OMXVCM4P10MEParams MEParams; /* presumably the motion estimation parameters - TODO confirm */
+ OMXVCM4P10MEMode MEMode; /* presumably the selected motion estimation mode - TODO confirm */
+};
+
+typedef struct m4p2_MESpec ARMVCM4P2_MESpec; /* typedef name for struct m4p2_MESpec */
+typedef struct m4p10_MESpec ARMVCM4P10_MESpec; /* typedef name for struct m4p10_MESpec */
+
+/**
+ * Function: armVCM4P2_CompareMV
+ *
+ * Description:
+ * Performs comparison of motion vectors and SADs to decide the
+ * best MV and SAD
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] mvX x coordinate of the candidate motion vector
+ * [in] mvY y coordinate of the candidate motion vector
+ * [in] candSAD Candidate SAD
+ * [in] bestMVX x coordinate of the best motion vector
+ * [in] bestMVY y coordinate of the best motion vector
+ * [in] bestSAD best SAD
+ *
+ * Return Value:
+ * OMX_INT -- 1 to indicate that the current sad is the best
+ * 0 to indicate that it is NOT the best SAD
+ */
+
+OMX_INT armVCM4P2_CompareMV (
+ OMX_S16 mvX,
+ OMX_S16 mvY,
+ OMX_INT candSAD,
+ OMX_S16 bestMVX,
+ OMX_S16 bestMVY,
+ OMX_INT bestSAD);
+
+/**
+ * Function: armVCM4P2_ACDCPredict
+ *
+ * Description:
+ * Performs adaptive DC/AC coefficient prediction for an intra block. Prior
+ * to the function call, prediction direction (predDir) should be selected
+ * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcDst pointer to the coefficient buffer which contains
+ * the quantized coefficient residuals (PQF) of the
+ * current block
+ * [in] pPredBufRow pointer to the coefficient row buffer
+ * [in] pPredBufCol pointer to the coefficient column buffer
+ * [in] curQP quantization parameter of the current block. curQP
+ * may equal to predQP especially when the current
+ * block and the predictor block are in the same
+ * macroblock.
+ * [in] predQP quantization parameter of the predictor block
+ * [in] predDir indicates the prediction direction which takes one
+ * of the following values:
+ * OMX_VIDEO_HORIZONTAL predict horizontally
+ * OMX_VIDEO_VERTICAL predict vertically
+ * [in] ACPredFlag a flag indicating if AC prediction should be
+ * performed. It is equal to ac_pred_flag in the bit
+ * stream syntax of MPEG-4
+ * [in] videoComp video component type (luminance, chrominance or
+ * alpha) of the current block
+ * [in] flag selects whether this function is used to
+ * calculate PQF (set 1, prediction) or QF (set 0, reconstruction)
+ * [out] pPreACPredict pointer to the predicted coefficients buffer.
+ * Filled ONLY if it is not NULL
+ * [out] pSrcDst pointer to the coefficient buffer which contains
+ * the quantized coefficients (QF) of the current
+ * block
+ * [out] pPredBufRow pointer to the updated coefficient row buffer
+ * [out] pPredBufCol pointer to the updated coefficient column buffer
+ * [out] pSumErr pointer to the updated sum of the difference
+ * between predicted and unpredicted coefficients
+ * If this is NULL, do not update
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_ACDCPredict(
+ OMX_S16 * pSrcDst,
+ OMX_S16 * pPreACPredict,
+ OMX_S16 * pPredBufRow,
+ OMX_S16 * pPredBufCol,
+ OMX_INT curQP,
+ OMX_INT predQP,
+ OMX_INT predDir,
+ OMX_INT ACPredFlag,
+ OMXVCM4P2VideoComponent videoComp,
+ OMX_U8 flag,
+ OMX_INT *pSumErr
+);
+
+/**
+ * Function: armVCM4P2_SetPredDir
+ *
+ * Description:
+ * Detects the prediction direction
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] blockIndex block index indicating the component type and
+ * position as defined in subclause 6.1.3.8, of ISO/IEC
+ * 14496-2. Furthermore, indexes 6 to 9 indicate the
+ * alpha blocks spatially corresponding to luminance
+ * blocks 0 to 3 in the same macroblock.
+ * [in] pCoefBufRow pointer to the coefficient row buffer
+ * [in] pQpBuf pointer to the quantization parameter buffer
+ * [out] predQP quantization parameter of the predictor block
+ * [out] predDir indicates the prediction direction which takes one
+ * of the following values:
+ * OMX_VIDEO_HORIZONTAL predict horizontally
+ * OMX_VIDEO_VERTICAL predict vertically
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_SetPredDir(
+ OMX_INT blockIndex,
+ OMX_S16 *pCoefBufRow,
+ OMX_S16 *pCoefBufCol,
+ OMX_INT *predDir,
+ OMX_INT *predQP,
+ const OMX_U8 *pQpBuf
+);
+
+/**
+ * Function: armVCM4P2_EncodeVLCZigzag_Intra
+ *
+ * Description:
+ * Performs zigzag scanning and VLC encoding for one intra block.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte in
+ * the bit stream
+ * [in] pBitOffset pointer to the bit position in the byte pointed
+ * by *ppBitStream. Valid within 0 to 7.
+ * [in] pQDctBlkCoef pointer to the quantized DCT coefficient
+ * [in] predDir AC prediction direction, which is used to decide
+ * the zigzag scan pattern. This takes one of the
+ * following values:
+ * OMX_VIDEO_NONE AC prediction not used.
+ * Performs classical zigzag
+ * scan.
+ * OMX_VIDEO_HORIZONTAL Horizontal prediction.
+ * Performs alternate-vertical
+ * zigzag scan.
+ * OMX_VIDEO_VERTICAL Vertical prediction.
+ * Performs alternate-horizontal
+ * zigzag scan.
+ * [in] pattern block pattern which is used to decide whether
+ * this block is encoded
+ * [in] start start indicates whether the encoding begins with 0th element
+ * or 1st.
+ * [out] ppBitStream *ppBitStream is updated after the block is encoded,
+ * so that it points to the current byte in the bit
+ * stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_EncodeVLCZigzag_Intra(
+ OMX_U8 **ppBitStream,
+ OMX_INT *pBitOffset,
+ const OMX_S16 *pQDctBlkCoef,
+ OMX_U8 predDir,
+ OMX_U8 pattern,
+ OMX_INT shortVideoHeader,
+ OMX_U8 start
+);
+
+/**
+ * Function: armVCM4P2_DecodeVLCZigzag_Intra
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan for one intra coded block.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte in
+ * the bitstream buffer
+ * [in] pBitOffset pointer to the bit position in the byte pointed
+ * to by *ppBitStream. *pBitOffset is valid within
+ * [0-7].
+ * [in] predDir AC prediction direction which is used to decide
+ * the zigzag scan pattern. It takes one of the
+ * following values:
+ * OMX_VIDEO_NONE AC prediction not used;
+ * perform classical zigzag scan;
+ * OMX_VIDEO_HORIZONTAL Horizontal prediction;
+ * perform alternate-vertical
+ * zigzag scan;
+ * OMX_VIDEO_VERTICAL Vertical prediction;
+ * thus perform
+ * alternate-horizontal
+ * zigzag scan.
+ * [in] videoComp video component type (luminance, chrominance or
+ * alpha) of the current block
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ * and escape mode 4 is used when shortVideoHeader==1.
+ * [in] start start indicates whether the decoding begins with the 0th element
+ * or the 1st.
+ * [out] ppBitStream *ppBitStream is updated after the block is
+ * decoded, so that it points to the current byte
+ * in the bit stream buffer
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream
+ * [out] pDst pointer to the coefficient buffer of current
+ * block. Should be 32-bit aligned
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_DecodeVLCZigzag_Intra(
+ const OMX_U8 ** ppBitStream,
+ OMX_INT * pBitOffset,
+ OMX_S16 * pDst,
+ OMX_U8 predDir,
+ OMX_INT shortVideoHeader,
+ OMX_U8 start
+);
+
+/**
+ * Function: armVCM4P2_FillVLDBuffer
+ *
+ * Description:
+ * Performs filling of the coefficient buffer according to the run, level
+ * and sign, also updates the index
+ *
+ * Parameters:
+ * [in] storeRun Stored Run value (count of zeros)
+ * [in] storeLevel Stored Level value (non-zero value)
+ * [in] sign Flag indicating the sign of level
+ * [in] last status of the last flag
+ * [in] index pointer to coefficient index in 8x8 matrix
+ * [out] index pointer to updated coefficient index in 8x8
+ * matrix
+ * [in] pZigzagTable pointer to the zigzag tables
+ * [out] pDst pointer to the coefficient buffer of current
+ * block. Should be 32-bit aligned
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_FillVLDBuffer(
+ OMX_U32 storeRun,
+ OMX_S16 * pDst,
+ OMX_S16 storeLevel,
+ OMX_U8 sign,
+ OMX_U8 last,
+ OMX_U8 * index,
+ const OMX_U8 * pZigzagTable
+);
+
+/**
+ * Function: armVCM4P2_GetVLCBits
+ *
+ * Description:
+ * Performs escape mode decision based on the run, run+, level, level+ and
+ * last combinations.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte in
+ * the bit stream
+ * [in] pBitOffset pointer to the bit position in the byte pointed
+ * by *ppBitStream. Valid within 0 to 7
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ * and escape mode 4 is used when shortVideoHeader==1.
+ * [in] start start indicates whether the encoding begins with
+ * 0th element or 1st.
+ * [in/out] pLast pointer to last status flag
+ * [in] runBeginSingleLevelEntriesL0 The run value from which level
+ * will be equal to 1: last == 0
+ * [in] IndexBeginSingleLevelEntriesL0 Array index in the VLC table
+ * pointing to the
+ * runBeginSingleLevelEntriesL0
+ * [in] runBeginSingleLevelEntriesL1 The run value from which level
+ * will be equal to 1: last == 1
+ * [in] IndexBeginSingleLevelEntriesL1 Array index in the VLC table
+ * pointing to the
+ * runBeginSingleLevelEntriesL1
+ * [in] pRunIndexTableL0 Run Index table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pVlcTableL0 VLC table for last == 0
+ * [in] pRunIndexTableL1 Run Index table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in] pVlcTableL1 VLC table for last == 1
+ * [in] pLMAXTableL0 Level MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pLMAXTableL1 Level MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in] pRMAXTableL0 Run MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pRMAXTableL1 Run MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [out]pDst pointer to the coefficient buffer of current
+ * block. Should be 32-bit aligned
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_GetVLCBits (
+ const OMX_U8 **ppBitStream,
+ OMX_INT * pBitOffset,
+ OMX_S16 * pDst,
+ OMX_INT shortVideoHeader,
+ OMX_U8 start,
+ OMX_U8 * pLast,
+ OMX_U8 runBeginSingleLevelEntriesL0,
+ OMX_U8 maxIndexForMultipleEntriesL0,
+ OMX_U8 maxRunForMultipleEntriesL1,
+ OMX_U8 maxIndexForMultipleEntriesL1,
+ const OMX_U8 * pRunIndexTableL0,
+ const ARM_VLC32 *pVlcTableL0,
+ const OMX_U8 * pRunIndexTableL1,
+ const ARM_VLC32 *pVlcTableL1,
+ const OMX_U8 * pLMAXTableL0,
+ const OMX_U8 * pLMAXTableL1,
+ const OMX_U8 * pRMAXTableL0,
+ const OMX_U8 * pRMAXTableL1,
+ const OMX_U8 * pZigzagTable
+);
+
+/**
+ * Function: armVCM4P2_PutVLCBits
+ *
+ * Description:
+ * Checks the type of Escape Mode and put encoded bits for
+ * quantized DCT coefficients.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte in
+ * the bit stream
+ * [in] pBitOffset pointer to the bit position in the byte pointed
+ * by *ppBitStream. Valid within 0 to 7
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ * and escape mode 4 is used when shortVideoHeader==1.
+ * [in] start start indicates whether the encoding begins with
+ * 0th element or 1st.
+ * [in] maxStoreRunL0 Max store possible (considering last and inter/intra)
+ * for last = 0
+ * [in] maxStoreRunL1 Max store possible (considering last and inter/intra)
+ * for last = 1
+ * [in] maxRunForMultipleEntriesL0
+ * The run value after which level
+ * will be equal to 1:
+ * (considering last and inter/intra status) for last = 0
+ * [in] maxRunForMultipleEntriesL1
+ * The run value after which level
+ * will be equal to 1:
+ * (considering last and inter/intra status) for last = 1
+ * [in] pRunIndexTableL0 Run Index table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pVlcTableL0 VLC table for last == 0
+ * [in] pRunIndexTableL1 Run Index table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in] pVlcTableL1 VLC table for last == 1
+ * [in] pLMAXTableL0 Level MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pLMAXTableL1 Level MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in] pRMAXTableL0 Run MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 0
+ * [in] pRMAXTableL1 Run MAX table defined in
+ * armVCM4P2_Huff_Tables_VLC.c for last == 1
+ * [in] pQDctBlkCoef pointer to the quantized DCT coefficient
+ * [out] ppBitStream *ppBitStream is updated after the block is encoded
+ * so that it points to the current byte in the bit
+ * stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+
+OMXResult armVCM4P2_PutVLCBits (
+ OMX_U8 **ppBitStream,
+ OMX_INT * pBitOffset,
+ const OMX_S16 *pQDctBlkCoef,
+ OMX_INT shortVideoHeader,
+ OMX_U8 start,
+ OMX_U8 maxStoreRunL0,
+ OMX_U8 maxStoreRunL1,
+ OMX_U8 maxRunForMultipleEntriesL0,
+ OMX_U8 maxRunForMultipleEntriesL1,
+ const OMX_U8 * pRunIndexTableL0,
+ const ARM_VLC32 *pVlcTableL0,
+ const OMX_U8 * pRunIndexTableL1,
+ const ARM_VLC32 *pVlcTableL1,
+ const OMX_U8 * pLMAXTableL0,
+ const OMX_U8 * pLMAXTableL1,
+ const OMX_U8 * pRMAXTableL0,
+ const OMX_U8 * pRMAXTableL1,
+ const OMX_U8 * pZigzagTable
+);
+/**
+ * Function: armVCM4P2_FillVLCBuffer
+ *
+ * Description:
+ * Calculates the VLC bits depending on the escape type and inserts
+ * them into the bitstream
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte in
+ * the bit stream
+ * [in] pBitOffset pointer to the bit position in the byte pointed
+ * by *ppBitStream. Valid within 0 to 7
+ * [in] run Run value (count of zeros) to be encoded
+ * [in] level Level value (non-zero value) to be encoded
+ * [in] runPlus Calculated as runPlus = run - (RMAX + 1)
+ * [in] levelPlus Calculated as
+ * levelPlus = sign(level)*[abs(level) - LMAX]
+ * [in] fMode Flag indicating the escape modes
+ * [in] last status of the last flag
+ * [in] maxRunForMultipleEntries
+ * The run value after which level will be equal to 1:
+ * (considering last and inter/intra status)
+ * [in] pRunIndexTable Run Index table defined in
+ * armVCM4P2_Huff_tables_VLC.h
+ * [in] pVlcTable VLC table defined in armVCM4P2_Huff_tables_VLC.h
+ * [out] ppBitStream *ppBitStream is updated after the block is encoded
+ * so that it points to the current byte in the bit
+ * stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream.
+ *
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_FillVLCBuffer (
+ OMX_U8 **ppBitStream,
+ OMX_INT * pBitOffset,
+ OMX_U32 run,
+ OMX_S16 level,
+ OMX_U32 runPlus,
+ OMX_S16 levelPlus,
+ OMX_U8 fMode,
+ OMX_U8 last,
+ OMX_U8 maxRunForMultipleEntries,
+ const OMX_U8 *pRunIndexTable,
+ const ARM_VLC32 *pVlcTable
+);
+
+/**
+ * Function: armVCM4P2_CheckVLCEscapeMode
+ *
+ * Description:
+ * Performs escape mode decision based on the run, run+, level, level+ and
+ * last combinations.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] run Run value (count of zeros) to be encoded
+ * [in] level Level value (non-zero value) to be encoded
+ * [in] runPlus Calculated as runPlus = run - (RMAX + 1)
+ * [in] levelPlus Calculated as
+ * levelPlus = sign(level)*[abs(level) - LMAX]
+ * [in] maxStoreRun Max store possible (considering last and inter/intra)
+ * [in] maxRunForMultipleEntries
+ * The run value after which level
+ * will be equal to 1:
+ * (considering last and inter/intra status)
+ * [in] shortVideoHeader binary flag indicating presence of short_video_header; escape modes 0-3 are used if shortVideoHeader==0,
+ * and escape mode 4 is used when shortVideoHeader==1.
+ * [in] pRunIndexTable Run Index table defined in
+ * armVCM4P2_Huff_Tables_VLC.c
+ * (considering last and inter/intra status)
+ *
+ *
+ * Return Value:
+ * Returns an Escape mode which can take values from 0 to 3
+ * 0 --> no escape mode, 1 --> escape type 1,
+ * 2 --> escape type 2, 3 --> escape type 3, check section 7.4.1.3
+ * in the MPEG ISO standard.
+ *
+ */
+
+OMX_U8 armVCM4P2_CheckVLCEscapeMode(
+ OMX_U32 run,
+ OMX_U32 runPlus,
+ OMX_S16 level,
+ OMX_S16 levelPlus,
+ OMX_U8 maxStoreRun,
+ OMX_U8 maxRunForMultipleEntries,
+ OMX_INT shortVideoHeader,
+ const OMX_U8 *pRunIndexTable
+);
+
+
+/**
+ * Function: armVCM4P2_BlockMatch_Integer
+ *
+ * Description:
+ * Performs a 16x16 block search; estimates motion vector and associated minimum SAD.
+ * Both the input and output motion vectors are represented using half-pixel units, and
+ * therefore a shift left or right by 1 bit may be required, respectively, to match the
+ * input or output MVs with other functions that either generate output MVs or expect
+ * input MVs represented using integer pixel units.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcRefBuf pointer to the reference Y plane; points to the reference MB that
+ * corresponds to the location of the current macroblock in the current
+ * plane.
+ * [in] refWidth width of the reference plane
+ * [in] pRefRect pointer to the valid rectangle in the reference plane, relative to the image origin.
+ * It is not limited to the image boundary, but depends on the padding. For example,
+ * if you pad 4 pixels outside the image border, then the value for the left border
+ * can be -4
+ * [in] pSrcCurrBuf pointer to the current macroblock extracted from original plane (linear array,
+ * 256 entries); must be aligned on an 8-byte boundary.
+ * [in] pCurrPointPos position of the current macroblock in the current plane
+ * [in] pSrcPreMV pointer to predicted motion vector; NULL indicates no predicted MV
+ * [in] pSrcPreSAD pointer to SAD associated with the predicted MV (referenced by pSrcPreMV)
+ * [in] searchRange search range for the 16x16 integer block, in full-pixel units; the search range
+ * is the same in all directions. It is inclusive of the boundary and specified in
+ * terms of integer pixel units.
+ * [in] pMESpec vendor-specific motion estimation specification structure; must have been allocated
+ * and then initialized using omxVCM4P2_MEInit prior to calling the block matching
+ * function.
+ * [in] BlockSize MacroBlock Size i.e either 16x16 or 8x8.
+ * [out] pDstMV pointer to estimated MV
+ * [out] pDstSAD pointer to minimum SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error.
+ * OMX_Sts_BadArgErr - bad arguments
+ *
+ */
+
+OMXResult armVCM4P2_BlockMatch_Integer(
+ const OMX_U8 *pSrcRefBuf,
+ OMX_INT refWidth,
+ const OMXRect *pRefRect,
+ const OMX_U8 *pSrcCurrBuf,
+ const OMXVCM4P2Coordinate *pCurrPointPos,
+ const OMXVCMotionVector *pSrcPreMV,
+ const OMX_INT *pSrcPreSAD,
+ void *pMESpec,
+ OMXVCMotionVector *pDstMV,
+ OMX_INT *pDstSAD,
+ OMX_U8 BlockSize
+);
+
+/**
+ * Function: armVCM4P2_BlockMatch_Half
+ *
+ * Description:
+ * Performs a 16x16 block match with half-pixel resolution. Returns the estimated
+ * motion vector and associated minimum SAD. This function estimates the half-pixel
+ * motion vector by interpolating the integer resolution motion vector referenced
+ * by the input parameter pSrcDstMV, i.e., the initial integer MV is generated
+ * externally. The input parameters pSrcRefBuf and pSearchPointRefPos should be
+ * shifted by the winning MV of 16x16 integer search prior to calling BlockMatch_Half_16x16.
+ * The function BlockMatch_Integer_16x16 may be used for integer motion estimation.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcRefBuf pointer to the reference Y plane; points to the reference MB
+ * that corresponds to the location of the current macroblock in
+ * the current plane.
+ * [in] refWidth width of the reference plane
+ * [in] pRefRect reference plane valid region rectangle
+ * [in] pSrcCurrBuf pointer to the current macroblock extracted from original plane
+ * (linear array, 256 entries); must be aligned on an 8-byte boundary.
+ * [in] pSearchPointRefPos position of the starting point for half pixel search (specified
+ * in terms of integer pixel units) in the reference plane.
+ * [in] rndVal rounding control bit for half pixel motion estimation;
+ * 0=rounding control disabled; 1=rounding control enabled
+ * [in] pSrcDstMV pointer to the initial MV estimate; typically generated during a prior
+ * 16X16 integer search and its unit is half pixel.
+ * [in] BlockSize MacroBlock Size i.e either 16x16 or 8x8.
+ * [out]pSrcDstMV pointer to estimated MV
+ * [out]pDstSAD pointer to minimum SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ *
+ */
+
+OMXResult armVCM4P2_BlockMatch_Half(
+ const OMX_U8 *pSrcRefBuf,
+ OMX_INT refWidth,
+ const OMXRect *pRefRect,
+ const OMX_U8 *pSrcCurrBuf,
+ const OMXVCM4P2Coordinate *pSearchPointRefPos,
+ OMX_INT rndVal,
+ OMXVCMotionVector *pSrcDstMV,
+ OMX_INT *pDstSAD,
+ OMX_U8 BlockSize
+);
+/**
+ * Function: armVCM4P2_PadMV
+ *
+ * Description:
+ * Performs motion vector padding for a macroblock.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcDstMV pointer to motion vector buffer of the current
+ * macroblock
+ * [in] pTransp pointer to transparent status buffer of the
+ * current macroblock
+ * [out] pSrcDstMV pointer to motion vector buffer in which the
+ * motion vectors have been padded
+ * Return Value:
+ * Standard OMXResult result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P2_PadMV(
+ OMXVCMotionVector * pSrcDstMV,
+ OMX_U8 * pTransp
+);
+
+/*
+ * H.264 Specific Declarations
+ */
+/* Defines */
+#define ARM_M4P10_Q_OFFSET (15)
+
+
+/* Dequant tables */
+
+extern const OMX_U8 armVCM4P10_PosToVCol4x4[16];
+extern const OMX_U8 armVCM4P10_PosToVCol2x2[4];
+extern const OMX_U8 armVCM4P10_VMatrix[6][3];
+extern const OMX_U32 armVCM4P10_MFMatrix[6][3];
+
+
+/*
+ * Description:
+ * This function perform the work required by the OpenMAX
+ * DecodeCoeffsToPair function and DecodeChromaDCCoeffsToPair.
+ * Since most of the code is common we share it here.
+ *
+ * Parameters:
+ * [in] ppBitStream Double pointer to current byte in bit stream buffer
+ * [in] pOffset Pointer to current bit position in the byte pointed
+ * to by *ppBitStream
+ * [in] sMaxNumCoeff Maximum number of non-zero coefficients in current
+ * block (4,15 or 16)
+ * [in] nTable Table number (0 to 4) according to the five columns
+ * of Table 9-5 in the H.264 spec
+ * [out] ppBitStream *ppBitStream is updated after each block is decoded
+ * [out] pOffset *pOffset is updated after each block is decoded
+ * [out] pNumCoeff Pointer to the number of nonzero coefficients in
+ * this block
+ * [out] ppPosCoefbuf Double pointer to destination residual
+ * coefficient-position pair buffer
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+
+ */
+
+OMXResult armVCM4P10_DecodeCoeffsToPair(
+ const OMX_U8** ppBitStream,
+ OMX_S32* pOffset,
+ OMX_U8* pNumCoeff,
+ OMX_U8**ppPosCoefbuf,
+ OMX_INT nTable,
+ OMX_INT sMaxNumCoeff
+ );
+
+/*
+ * Description:
+ * Perform DC style intra prediction, averaging upper and left block
+ *
+ * Parameters:
+ * [in] pSrcLeft Pointer to the buffer of 4 left coefficients:
+ * p[x, y] (x = -1, y = 0..3)
+ * [in] pSrcAbove Pointer to the buffer of 4 above coefficients:
+ * p[x,y] (x = 0..3, y = -1)
+ * [in] leftStep Step of left coefficient buffer
+ * [in] dstStep Step of the destination buffer
+ * [in] availability Neighboring 16x16 MB availability flag
+ * [out] pDst Pointer to the destination buffer
+ *
+ * Return Value:
+ * None
+ */
+
+void armVCM4P10_PredictIntraDC4x4(
+ const OMX_U8* pSrcLeft,
+ const OMX_U8 *pSrcAbove,
+ OMX_U8* pDst,
+ OMX_INT leftStep,
+ OMX_INT dstStep,
+ OMX_S32 availability
+);
+
+/*
+ * Description
+ * Unpack a 4x4 block of coefficient-residual pair values
+ *
+ * Parameters:
+ * [in] ppSrc Double pointer to residual coefficient-position pair
+ * buffer output by CAVLC decoding
+ * [out] ppSrc *ppSrc is updated to the start of next non empty block
+ * [out] pDst Pointer to unpacked 4x4 block
+ */
+
+void armVCM4P10_UnpackBlock4x4(
+ const OMX_U8 **ppSrc,
+ OMX_S16* pDst
+);
+
+/*
+ * Description
+ * Unpack a 2x2 block of coefficient-residual pair values
+ *
+ * Parameters:
+ * [in] ppSrc Double pointer to residual coefficient-position pair
+ * buffer output by CAVLC decoding
+ * [out] ppSrc *ppSrc is updated to the start of next non empty block
+ * [out] pDst Pointer to unpacked 2x2 block
+ */
+
+void armVCM4P10_UnpackBlock2x2(
+ const OMX_U8 **ppSrc,
+ OMX_S16* pDst
+);
+
+/*
+ * Description
+ * Deblock one boundary pixel
+ *
+ * Parameters:
+ * [in] pQ0 Pointer to pixel q0
+ * [in] Step Step between pixels q0 and q1
+ * [in] tC0 Edge threshold value
+ * [in] alpha alpha threshold value
+ * [in] beta beta threshold value
+ * [in] bS deblocking strength
+ * [in] ChromaFlag True for chroma blocks
+ * [out] pQ0 Deblocked pixels
+ *
+ */
+
+void armVCM4P10_DeBlockPixel(
+ OMX_U8 *pQ0, /* pointer to the pixel q0 */
+ int Step, /* step between pixels q0 and q1 */
+ int tC0, /* edge threshold value */
+ int alpha, /* alpha */
+ int beta, /* beta */
+ int bS, /* deblocking strength */
+ int ChromaFlag
+);
+
+/**
+ * Function: armVCM4P10_InterpolateHalfHor_Luma
+ *
+ * Description:
+ * This function performs interpolation for horizontal 1/2-pel positions
+ *
+ * Remarks:
+ *
+ * [in] pSrc Pointer to top-left corner of block used to interpolate
+ * in the reconstructed frame plane
+ * [in] iSrcStep Step of the source buffer.
+ * [in] iDstStep Step of the destination(interpolation) buffer.
+ * [in] iWidth Width of the current block
+ * [in] iHeight Height of the current block
+ * [out] pDst Pointer to the interpolation buffer of the 1/2-pel
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+OMXResult armVCM4P10_InterpolateHalfHor_Luma(
+ const OMX_U8* pSrc,
+ OMX_U32 iSrcStep,
+ OMX_U8* pDst,
+ OMX_U32 iDstStep,
+ OMX_U32 iWidth,
+ OMX_U32 iHeight
+);
+
+/**
+ * Function: armVCM4P10_InterpolateHalfVer_Luma
+ *
+ * Description:
+ * This function performs interpolation for vertical 1/2-pel positions
+ * around a full-pel position.
+ *
+ * Remarks:
+ *
+ * [in] pSrc Pointer to top-left corner of block used to interpolate
+ * in the reconstructed frame plane
+ * [in] iSrcStep Step of the source buffer.
+ * [in] iDstStep Step of the destination(interpolation) buffer.
+ * [in] iWidth Width of the current block
+ * [in] iHeight Height of the current block
+ * [out] pDst Pointer to the interpolation buffer of the 1/2-pel
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+OMXResult armVCM4P10_InterpolateHalfVer_Luma(
+ const OMX_U8* pSrc,
+ OMX_U32 iSrcStep,
+ OMX_U8* pDst,
+ OMX_U32 iDstStep,
+ OMX_U32 iWidth,
+ OMX_U32 iHeight
+);
+
+/**
+ * Function: armVCM4P10_InterpolateHalfDiag_Luma
+ *
+ * Description:
+ * This function performs interpolation for (1/2, 1/2) positions
+ * around a full-pel position.
+ *
+ * Remarks:
+ *
+ * [in] pSrc Pointer to top-left corner of block used to interpolate
+ * in the reconstructed frame plane
+ * [in] iSrcStep Step of the source buffer.
+ * [in] iDstStep Step of the destination(interpolation) buffer.
+ * [in] iWidth Width of the current block
+ * [in] iHeight Height of the current block
+ * [out] pDst Pointer to the interpolation buffer of the (1/2,1/2)-pel
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+OMXResult armVCM4P10_InterpolateHalfDiag_Luma(
+ const OMX_U8* pSrc,
+ OMX_U32 iSrcStep,
+ OMX_U8* pDst,
+ OMX_U32 iDstStep,
+ OMX_U32 iWidth,
+ OMX_U32 iHeight
+);
+
+/*
+ * Description:
+ * Transform Residual 4x4 Coefficients
+ *
+ * Parameters:
+ * [in] pSrc Source 4x4 block
+ * [out] pDst Destination 4x4 block
+ *
+ */
+
+void armVCM4P10_TransformResidual4x4(OMX_S16* pDst, OMX_S16 *pSrc);
+
+/*
+ * Description:
+ * Forward Transform Residual 4x4 Coefficients
+ *
+ * Parameters:
+ * [in] pSrc Source 4x4 block
+ * [out] pDst Destination 4x4 block
+ *
+ */
+
+void armVCM4P10_FwdTransformResidual4x4(OMX_S16* pDst, OMX_S16 *pSrc);
+
+OMX_INT armVCM4P10_CompareMotionCostToMV (
+ OMX_S16 mvX,
+ OMX_S16 mvY,
+ OMXVCMotionVector diffMV,
+ OMX_INT candSAD,
+ OMXVCMotionVector *bestMV,
+ OMX_U32 nLamda,
+ OMX_S32 *pBestCost);
+
+/**
+ * Function: armVCCOMM_SAD
+ *
+ * Description:
+ * This function calculates the SAD for NxM blocks.
+ *
+ * Remarks:
+ *
+ * [in] pSrcOrg Pointer to the original block
+ * [in] iStepOrg Step of the original block buffer
+ * [in] pSrcRef Pointer to the reference block
+ * [in] iStepRef Step of the reference block buffer
+ * [in] iHeight Height of the block
+ * [in] iWidth Width of the block
+ * [out] pDstSAD Pointer of result SAD
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+OMXResult armVCCOMM_SAD(
+ const OMX_U8* pSrcOrg,
+ OMX_U32 iStepOrg,
+ const OMX_U8* pSrcRef,
+ OMX_U32 iStepRef,
+ OMX_S32* pDstSAD,
+ OMX_U32 iHeight,
+ OMX_U32 iWidth);
+
+/**
+ * Function: armVCCOMM_Average
+ *
+ * Description:
+ * This function calculates the average of two blocks and stores the result.
+ *
+ * Remarks:
+ *
+ * [in] pPred0 Pointer to the top-left corner of reference block 0
+ * [in] pPred1 Pointer to the top-left corner of reference block 1
+ * [in] iPredStep0 Step of reference block 0
+ * [in] iPredStep1 Step of reference block 1
+ * [in] iDstStep Step of the destination buffer
+ * [in] iWidth Width of the blocks
+ * [in] iHeight Height of the blocks
+ * [out] pDstPred Pointer to the destination buffer
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+ OMXResult armVCCOMM_Average (
+ const OMX_U8* pPred0,
+ const OMX_U8* pPred1,
+ OMX_U32 iPredStep0,
+ OMX_U32 iPredStep1,
+ OMX_U8* pDstPred,
+ OMX_U32 iDstStep,
+ OMX_U32 iWidth,
+ OMX_U32 iHeight
+);
+
+/**
+ * Function: armVCM4P10_SADQuar
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the
+ * average of the other two (pSrcRef0 and pSrcRef1)
+ *
+ * Remarks:
+ *
+ * [in] pSrc Pointer to the original block
+ * [in] pSrcRef0 Pointer to reference block 0
+ * [in] pSrcRef1 Pointer to reference block 1
+ * [in] iSrcStep Step of the original block buffer
+ * [in] iRefStep0 Step of reference block 0
+ * [in] iRefStep1 Step of reference block 1
+ * [in] iHeight Height of the block
+ * [in] iWidth Width of the block
+ * [out] pDstSAD Pointer of result SAD
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+OMXResult armVCM4P10_SADQuar(
+ const OMX_U8* pSrc,
+ const OMX_U8* pSrcRef0,
+ const OMX_U8* pSrcRef1,
+ OMX_U32 iSrcStep,
+ OMX_U32 iRefStep0,
+ OMX_U32 iRefStep1,
+ OMX_U32* pDstSAD,
+ OMX_U32 iHeight,
+ OMX_U32 iWidth
+);
+
+/**
+ * Function: armVCM4P10_Interpolate_Chroma
+ *
+ * Description:
+ * This function performs interpolation for chroma components.
+ *
+ * Remarks:
+ *
+ * [in] pSrc Pointer to top-left corner of block used to
+ * interpolate in the reconstructed frame plane
+ * [in] iSrcStep Step of the source buffer.
+ * [in] iDstStep Step of the destination(interpolation) buffer.
+ * [in] iWidth Width of the current block
+ * [in] iHeight Height of the current block
+ * [in] dx Fractional part of horizontal motion vector
+ * component in 1/8 pixel unit (0~7)
+ * [in] dy Fractional part of vertical motion vector
+ * component in 1/8 pixel unit (0~7)
+ * [out] pDst Pointer to the interpolation buffer
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+ OMXResult armVCM4P10_Interpolate_Chroma(
+ OMX_U8 *pSrc,
+ OMX_U32 iSrcStep,
+ OMX_U8 *pDst,
+ OMX_U32 iDstStep,
+ OMX_U32 iWidth,
+ OMX_U32 iHeight,
+ OMX_U32 dx,
+ OMX_U32 dy
+);
+
+/**
+ * Function: armVCM4P10_Interpolate_Luma
+ *
+ * Description:
+ * This function performs interpolation for luma components.
+ *
+ * Remarks:
+ *
+ * [in] pSrc Pointer to top-left corner of block used to
+ * interpolate in the reconstructed frame plane
+ * [in] iSrcStep Step of the source buffer.
+ * [in] iDstStep Step of the destination(interpolation) buffer.
+ * [in] iWidth Width of the current block
+ * [in] iHeight Height of the current block
+ * [in] dx Fractional part of horizontal motion vector
+ * component in 1/4 pixel unit (0~3)
+ * [in] dy Fractional part of vertical motion vector
+ * component in 1/4 pixel unit (0~3)
+ * [out] pDst Pointer to the interpolation buffer
+ *
+ * Return Value:
+ * Standard OMXResult value.
+ *
+ */
+
+ OMXResult armVCM4P10_Interpolate_Luma(
+ const OMX_U8 *pSrc,
+ OMX_U32 iSrcStep,
+ OMX_U8 *pDst,
+ OMX_U32 iDstStep,
+ OMX_U32 iWidth,
+ OMX_U32 iHeight,
+ OMX_U32 dx,
+ OMX_U32 dy
+);
+
+/**
+ * Function: armVCM4P10_DequantTransformACFromPair_U8_S16_C1_DLx
+ *
+ * Description:
+ * Reconstruct the 4x4 residual block from coefficient-position pair buffer,
+ * perform dequantisation and integer inverse transformation for 4x4 block of
+ * residuals and update the pair buffer pointer to next non-empty block.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppSrc Double pointer to residual coefficient-position
+ * pair buffer output by CAVLC decoding
+ * [in] pDC Pointer to the DC coefficient of this block, NULL
+ * if it doesn't exist
+ * [in] QP Quantization parameter
+ * [in] AC Flag indicating if at least one non-zero coefficient exists
+ * [out] pDst pointer to the reconstructed 4x4 block data
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult armVCM4P10_DequantTransformACFromPair_U8_S16_C1_DLx(
+ OMX_U8 **ppSrc,
+ OMX_S16 *pDst,
+ OMX_INT QP,
+ OMX_S16* pDC,
+ int AC
+);
+
+#endif /*_armVideo_H_*/
+
+/*End of File*/
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h
new file mode 100755
index 0000000..32a0166
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/armVCCOMM_s.h
@@ -0,0 +1,72 @@
+;//
+;//
+;// File Name: armVCCOMM_s.h
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// ARM optimized OpenMAX VC header file
+;//
+;// Formula used:
+;// MACRO for calculating median for three values.
+
+
+
+ IF :LNOT::DEF:ARMVCCOMM_S_H
+ INCLUDE armCOMM_s.h
+ M_VARIANTS CortexA8, ARM1136JS
+
+ IF ARM1136JS :LOR: CortexA8
+
+ ;///*
+ ;// * Macro: M_MEDIAN3
+ ;// *
+ ;// * Description: Finds the median of three numbers
+ ;// *
+ ;// * Remarks:
+ ;// * Uses signed compares (LT/GT); condition flags are corrupted.
+ ;// * Parameters:
+ ;// * [in] x First entry for the list of three numbers.
+ ;// * [in] y Second entry for the list of three numbers.
+ ;// * Input value may be corrupted at the end of
+ ;// * the execution of this macro.
+ ;// * [in] z Third entry of the list of three numbers.
+ ;// * Input value corrupted at the end of the
+ ;// * execution of this macro.
+ ;// * [in] t Temporary scratch register; holds (y - z) on exit.
+ ;// * [out]z Median of the three numbers.
+ ;// */
+
+ MACRO
+
+ M_MEDIAN3 $x, $y, $z, $t
+
+ SUBS $t, $y, $z; // t = y - z; if (y < z)
+ ADDLT $z, $z, $t; // swap y and z: z' = z + t = y
+ SUBLT $y, $y, $t; // y' = y - t = z
+
+ ;// Now z' <= y', so there are three cases for the
+ ;// median value, depending on x.
+
+ ;// 1) x <= z' <= y' : median value is z'
+ ;// 2) z' <= x <= y' : median value is x
+ ;// 3) z' <= y' <= x : median value is y'
+
+ CMP $z, $x; // if ( x > min(y,z) )
+ MOVLT $z, $x; // ans = x
+
+ CMP $x, $y; // if ( x > max(y,z) )
+ MOVGT $z, $y; // ans = max(y,z)
+
+ MEND
+ ENDIF
+
+
+
+ ENDIF ;// ARMVCCOMM_S_H
+
+ END \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h
new file mode 100755
index 0000000..7b3cc72
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC.h
@@ -0,0 +1,4381 @@
+/**
+ * File: omxVC.h
+ * Brief: OpenMAX DL v1.0.2 - Video Coding library
+ *
+ * Copyright © 2005-2008 The Khronos Group Inc. All Rights Reserved.
+ *
+ * These materials are protected by copyright laws and contain material
+ * proprietary to the Khronos Group, Inc. You may use these materials
+ * for implementing Khronos specifications, without altering or removing
+ * any trademark, copyright or other notice from the specification.
+ *
+ * Khronos Group makes no, and expressly disclaims any, representations
+ * or warranties, express or implied, regarding these materials, including,
+ * without limitation, any implied warranties of merchantability or fitness
+ * for a particular purpose or non-infringement of any intellectual property.
+ * Khronos Group makes no, and expressly disclaims any, warranties, express
+ * or implied, regarding the correctness, accuracy, completeness, timeliness,
+ * and reliability of these materials.
+ *
+ * Under no circumstances will the Khronos Group, or any of its Promoters,
+ * Contributors or Members or their respective partners, officers, directors,
+ * employees, agents or representatives be liable for any damages, whether
+ * direct, indirect, special or consequential damages for lost revenues,
+ * lost profits, or otherwise, arising from or in connection with these
+ * materials.
+ *
+ * Khronos and OpenMAX are trademarks of the Khronos Group Inc.
+ *
+ */
+
+/* *****************************************************************************************/
+
+#ifndef _OMXVC_H_
+#define _OMXVC_H_
+
+#include "omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* 6.1.1.1 Motion Vectors */
+/* In omxVC, motion vectors are represented as follows: */
+
+typedef struct {
+ OMX_S16 dx;
+ OMX_S16 dy;
+} OMXVCMotionVector;
+
+
+
+/**
+ * Function: omxVCCOMM_Average_8x (6.1.3.1.1)
+ *
+ * Description:
+ * This function calculates the average of two 8x4, 8x8, or 8x16 blocks. The
+ * result is rounded according to (a+b+1)/2. The block average function can
+ * be used in conjunction with half-pixel interpolation to obtain quarter
+ * pixel motion estimates, as described in [ISO14496-10], subclause 8.4.2.2.1.
+ *
+ * Input Arguments:
+ *
+ * pPred0 - Pointer to the top-left corner of reference block 0
+ * pPred1 - Pointer to the top-left corner of reference block 1
+ * iPredStep0 - Step of reference block 0
+ * iPredStep1 - Step of reference block 1
+ * iDstStep - Step of the destination buffer.
+ * iHeight - Height of the blocks
+ *
+ * Output Arguments:
+ *
+ * pDstPred - Pointer to the destination buffer. 8-byte aligned.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned under any of the following
+ * conditions:
+ * - one or more of the following pointers is NULL: pPred0, pPred1, or
+ * pDstPred.
+ * - pDstPred is not aligned on an 8-byte boundary.
+ * - iPredStep0 <= 0 or iPredStep0 is not a multiple of 8.
+ * - iPredStep1 <= 0 or iPredStep1 is not a multiple of 8.
+ * - iDstStep <= 0 or iDstStep is not a multiple of 8.
+ * - iHeight is not 4, 8, or 16.
+ *
+ */
+OMXResult omxVCCOMM_Average_8x (
+ const OMX_U8 *pPred0,
+ const OMX_U8 *pPred1,
+ OMX_U32 iPredStep0,
+ OMX_U32 iPredStep1,
+ OMX_U8 *pDstPred,
+ OMX_U32 iDstStep,
+ OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function: omxVCCOMM_Average_16x (6.1.3.1.2)
+ *
+ * Description:
+ * This function calculates the average of two 16x16 or 16x8 blocks. The
+ * result is rounded according to (a+b+1)/2. The block average function can
+ * be used in conjunction with half-pixel interpolation to obtain quarter
+ * pixel motion estimates, as described in [ISO14496-10], subclause 8.4.2.2.1.
+ *
+ * Input Arguments:
+ *
+ * pPred0 - Pointer to the top-left corner of reference block 0
+ * pPred1 - Pointer to the top-left corner of reference block 1
+ * iPredStep0 - Step of reference block 0
+ * iPredStep1 - Step of reference block 1
+ * iDstStep - Step of the destination buffer
+ * iHeight - Height of the blocks
+ *
+ * Output Arguments:
+ *
+ * pDstPred - Pointer to the destination buffer. 16-byte aligned.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned under any of the following
+ * conditions:
+ * - one or more of the following pointers is NULL: pPred0, pPred1, or
+ * pDstPred.
+ * - pDstPred is not aligned on a 16-byte boundary.
+ * - iPredStep0 <= 0 or iPredStep0 is not a multiple of 16.
+ * - iPredStep1 <= 0 or iPredStep1 is not a multiple of 16.
+ * - iDstStep <= 0 or iDstStep is not a multiple of 16.
+ * - iHeight is not 8 or 16.
+ *
+ */
+OMXResult omxVCCOMM_Average_16x (
+ const OMX_U8 *pPred0,
+ const OMX_U8 *pPred1,
+ OMX_U32 iPredStep0,
+ OMX_U32 iPredStep1,
+ OMX_U8 *pDstPred,
+ OMX_U32 iDstStep,
+ OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function: omxVCCOMM_ExpandFrame_I (6.1.3.2.1)
+ *
+ * Description:
+ * This function expands a reconstructed frame in-place. The unexpanded
+ * source frame should be stored in a plane buffer with sufficient space
+ * pre-allocated for edge expansion, and the input frame should be located in
+ * the plane buffer center. This function executes the pixel expansion by
+ * replicating source frame edge pixel intensities in the empty pixel
+ * locations (expansion region) between the source frame edge and the plane
+ * buffer edge. The width/height of the expansion regions on the
+ * horizontal/vertical edges is controlled by the parameter iExpandPels.
+ *
+ * Input Arguments:
+ *
+ * pSrcDstPlane - pointer to the top-left corner of the frame to be
+ * expanded; must be aligned on an 8-byte boundary.
+ * iFrameWidth - frame width; must be a multiple of 8.
+ * iFrameHeight - frame height; must be a multiple of 8.
+ * iExpandPels - number of pixels to be expanded in the horizontal and
+ * vertical directions; must be a multiple of 8.
+ * iPlaneStep - distance, in bytes, between the start of consecutive lines
+ * in the plane buffer; must be larger than or equal to
+ * (iFrameWidth + 2 * iExpandPels).
+ *
+ * Output Arguments:
+ *
+ * pSrcDstPlane - Pointer to the top-left corner of the frame (NOT the
+ * top-left corner of the plane); must be aligned on an 8-byte
+ * boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned under any of the following
+ * conditions:
+ * - pSrcDstPlane is NULL.
+ * - pSrcDstPlane is not aligned on an 8-byte boundary.
+ * - one of the following parameters is either equal to zero or is a
+ * non-multiple of 8: iFrameHeight, iFrameWidth, iPlaneStep, or
+ * iExpandPels.
+ * - iPlaneStep < (iFrameWidth + 2 * iExpandPels).
+ *
+ */
+OMXResult omxVCCOMM_ExpandFrame_I (
+ OMX_U8 *pSrcDstPlane,
+ OMX_U32 iFrameWidth,
+ OMX_U32 iFrameHeight,
+ OMX_U32 iExpandPels,
+ OMX_U32 iPlaneStep
+);
+
+
+
+/**
+ * Function: omxVCCOMM_Copy8x8 (6.1.3.3.1)
+ *
+ * Description:
+ * Copies the reference 8x8 block to the current block.
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the reference block in the source frame; must be
+ * aligned on an 8-byte boundary.
+ * step - distance between the starts of consecutive lines in the reference
+ * frame, in bytes; must be a multiple of 8 and must be larger than
+ * or equal to 8.
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the destination block; must be aligned on an 8-byte
+ * boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned under any of the following
+ * conditions:
+ * - one or more of the following pointers is NULL: pSrc, pDst
+ * - one or more of the following pointers is not aligned on an 8-byte
+ * boundary: pSrc, pDst
+ * - step <8 or step is not a multiple of 8.
+ *
+ */
+OMXResult omxVCCOMM_Copy8x8 (
+ const OMX_U8 *pSrc,
+ OMX_U8 *pDst,
+ OMX_INT step
+);
+
+
+
+/**
+ * Function: omxVCCOMM_Copy16x16 (6.1.3.3.2)
+ *
+ * Description:
+ * Copies the reference 16x16 macroblock to the current macroblock.
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the reference macroblock in the source frame; must be
+ * aligned on a 16-byte boundary.
+ * step - distance between the starts of consecutive lines in the reference
+ * frame, in bytes; must be a multiple of 16 and must be larger
+ * than or equal to 16.
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the destination macroblock; must be aligned on a
+ * 16-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned under any of the following
+ * conditions:
+ * - one or more of the following pointers is NULL: pSrc, pDst
+ * - one or more of the following pointers is not aligned on a 16-byte
+ * boundary: pSrc, pDst
+ * - step <16 or step is not a multiple of 16.
+ *
+ */
+OMXResult omxVCCOMM_Copy16x16 (
+ const OMX_U8 *pSrc,
+ OMX_U8 *pDst,
+ OMX_INT step
+);
+
+
+
+/**
+ * Function: omxVCCOMM_ComputeTextureErrorBlock_SAD (6.1.4.1.1)
+ *
+ * Description:
+ * Computes texture error of the block; also returns SAD.
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the source plane; must be aligned on an 8-byte
+ * boundary.
+ * srcStep - step of the source plane
+ * pSrcRef - pointer to the reference buffer, an 8x8 block; must be aligned
+ * on an 8-byte boundary.
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the destination buffer, an 8x8 block; must be aligned
+ * on an 8-byte boundary.
+ * pDstSAD - pointer to the Sum of Absolute Differences (SAD) value
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - At least one of the following
+ * pointers is NULL: pSrc, pSrcRef, pDst and pDstSAD.
+ * - pSrc is not 8-byte aligned.
+ * - srcStep <= 0 or srcStep is not a multiple of 8.
+ * - pSrcRef is not 8-byte aligned.
+ * - pDst is not 8-byte aligned.
+ *
+ */
+OMXResult omxVCCOMM_ComputeTextureErrorBlock_SAD (
+ const OMX_U8 *pSrc,
+ OMX_INT srcStep,
+ const OMX_U8 *pSrcRef,
+ OMX_S16 *pDst,
+ OMX_INT *pDstSAD
+);
+
+
+
+/**
+ * Function: omxVCCOMM_ComputeTextureErrorBlock (6.1.4.1.2)
+ *
+ * Description:
+ * Computes the texture error of the block.
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the source plane. This should be aligned on an 8-byte
+ * boundary.
+ * srcStep - step of the source plane
+ * pSrcRef - pointer to the reference buffer, an 8x8 block. This should be
+ * aligned on an 8-byte boundary.
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the destination buffer, an 8x8 block. This should be
+ * aligned on an 8-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments:
+ * - At least one of the following pointers is NULL:
+ * pSrc, pSrcRef, pDst.
+ * - pSrc is not 8-byte aligned.
+ * - srcStep <= 0 or srcStep is not a multiple of 8.
+ * - pSrcRef is not 8-byte aligned.
+ * - pDst is not 8-byte aligned
+ *
+ */
+OMXResult omxVCCOMM_ComputeTextureErrorBlock (
+ const OMX_U8 *pSrc,
+ OMX_INT srcStep,
+ const OMX_U8 *pSrcRef,
+ OMX_S16 *pDst
+);
+
+
+
+/**
+ * Function: omxVCCOMM_LimitMVToRect (6.1.4.1.3)
+ *
+ * Description:
+ * Limits the motion vector associated with the current block/macroblock to
+ * prevent the motion compensated block/macroblock from moving outside a
+ * bounding rectangle as shown in Figure 6-1.
+ *
+ * Input Arguments:
+ *
+ * pSrcMV - pointer to the motion vector associated with the current block
+ * or macroblock
+ * pRectVOPRef - pointer to the bounding rectangle
+ * Xcoord, Ycoord - coordinates of the current block or macroblock
+ * size - size of the current block or macroblock; must be equal to 8 or
+ * 16.
+ *
+ * Output Arguments:
+ *
+ * pDstMV - pointer to the limited motion vector
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the
+ * following conditions is true:
+ * - at least one of the following pointers is NULL:
+ * pSrcMV, pDstMV, or pRectVOPRef.
+ * - size is not equal to either 8 or 16.
+ * - the width or height of the bounding rectangle is less than
+ * twice the block size.
+ */
+OMXResult omxVCCOMM_LimitMVToRect (
+ const OMXVCMotionVector *pSrcMV,
+ OMXVCMotionVector *pDstMV,
+ const OMXRect *pRectVOPRef,
+ OMX_INT Xcoord,
+ OMX_INT Ycoord,
+ OMX_INT size
+);
+
+
+
+/**
+ * Function: omxVCCOMM_SAD_16x (6.1.4.1.4)
+ *
+ * Description:
+ * This function calculates the SAD for 16x16 and 16x8 blocks.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrg - Pointer to the original block; must be aligned on a 16-byte
+ * boundary.
+ * iStepOrg - Step of the original block buffer
+ * pSrcRef - Pointer to the reference block
+ * iStepRef - Step of the reference block buffer
+ * iHeight - Height of the block
+ *
+ * Output Arguments:
+ *
+ * pDstSAD - Pointer of result SAD
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the
+ * following conditions is true:
+ * - at least one of the following pointers is NULL:
+ * pSrcOrg, pDstSAD, or pSrcRef
+ * - pSrcOrg is not 16-byte aligned.
+ * - iStepOrg <= 0 or iStepOrg is not a multiple of 16
+ * - iStepRef <= 0 or iStepRef is not a multiple of 16
+ * - iHeight is not 8 or 16
+ *
+ */
+OMXResult omxVCCOMM_SAD_16x (
+ const OMX_U8 *pSrcOrg,
+ OMX_U32 iStepOrg,
+ const OMX_U8 *pSrcRef,
+ OMX_U32 iStepRef,
+ OMX_S32 *pDstSAD,
+ OMX_U32 iHeight
+);
+
+
+
+/**
+ * Function: omxVCCOMM_SAD_8x (6.1.4.1.5)
+ *
+ * Description:
+ * This function calculates the SAD for 8x16, 8x8, 8x4 blocks.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrg - Pointer to the original block; must be aligned on an 8-byte
+ * boundary.
+ * iStepOrg - Step of the original block buffer
+ * pSrcRef - Pointer to the reference block
+ * iStepRef - Step of the reference block buffer
+ * iHeight - Height of the block
+ *
+ * Output Arguments:
+ *
+ * pDstSAD - Pointer of result SAD
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the
+ * following conditions is true:
+ * - at least one of the following pointers is NULL:
+ * pSrcOrg, pDstSAD, or pSrcRef
+ * - pSrcOrg is not 8-byte aligned.
+ * - iStepOrg <= 0 or iStepOrg is not a multiple of 8
+ * - iStepRef <= 0 or iStepRef is not a multiple of 8
+ * - iHeight is not 4, 8 or 16
+ *
+ */
+OMXResult omxVCCOMM_SAD_8x (
+ const OMX_U8 *pSrcOrg,
+ OMX_U32 iStepOrg,
+ const OMX_U8 *pSrcRef,
+ OMX_U32 iStepRef,
+ OMX_S32*pDstSAD,
+ OMX_U32 iHeight
+);
+
+
+
+/* 6.2.1.1 Direction */
+/* The direction enumerator is used with functions that perform AC/DC prediction and zig-zag scan. */
+
+enum {
+ OMX_VC_NONE = 0,
+ OMX_VC_HORIZONTAL = 1,
+ OMX_VC_VERTICAL = 2
+};
+
+
+
+/* 6.2.1.2 Bilinear Interpolation */
+/* The bilinear interpolation enumerator is used with motion estimation, motion compensation, and reconstruction functions. */
+
+enum {
+ OMX_VC_INTEGER_PIXEL = 0, /* case a */
+ OMX_VC_HALF_PIXEL_X = 1, /* case b */
+ OMX_VC_HALF_PIXEL_Y = 2, /* case c */
+ OMX_VC_HALF_PIXEL_XY = 3 /* case d */
+};
+
+
+
+/* 6.2.1.3 Neighboring Macroblock Availability */
+/* Neighboring macroblock availability is indicated using the following flags: */
+
+enum {
+ OMX_VC_UPPER = 1, /** above macroblock is available */
+ OMX_VC_LEFT = 2, /** left macroblock is available */
+ OMX_VC_CENTER = 4,
+ OMX_VC_RIGHT = 8,
+ OMX_VC_LOWER = 16,
+ OMX_VC_UPPER_LEFT = 32, /** above-left macroblock is available */
+ OMX_VC_UPPER_RIGHT = 64, /** above-right macroblock is available */
+ OMX_VC_LOWER_LEFT = 128,
+ OMX_VC_LOWER_RIGHT = 256
+};
+
+
+
+/* 6.2.1.4 Video Components */
+/* A data type that enumerates video components is defined as follows: */
+
+typedef enum {
+ OMX_VC_LUMINANCE, /** Luminance component */
+ OMX_VC_CHROMINANCE /** chrominance component */
+} OMXVCM4P2VideoComponent;
+
+
+
+/* 6.2.1.5 MacroblockTypes */
+/* A data type that enumerates macroblock types is defined as follows: */
+
+typedef enum {
+ OMX_VC_INTER = 0, /** P picture or P-VOP */
+ OMX_VC_INTER_Q = 1, /** P picture or P-VOP */
+ OMX_VC_INTER4V = 2, /** P picture or P-VOP */
+ OMX_VC_INTRA = 3, /** I and P picture, I- and P-VOP */
+ OMX_VC_INTRA_Q = 4, /** I and P picture, I- and P-VOP */
+ OMX_VC_INTER4V_Q = 5 /** P picture or P-VOP (H.263)*/
+} OMXVCM4P2MacroblockType;
+
+
+
+/* 6.2.1.6 Coordinates */
+/* Coordinates are represented as follows: */
+
+typedef struct {
+ OMX_INT x;
+ OMX_INT y;
+} OMXVCM4P2Coordinate;
+
+
+
+/* 6.2.1.7 Motion Estimation Algorithms */
+/* A data type that enumerates motion estimation search methods is defined as follows: */
+
+typedef enum {
+ OMX_VC_M4P2_FAST_SEARCH = 0, /** Fast motion search */
+ OMX_VC_M4P2_FULL_SEARCH = 1 /** Full motion search */
+} OMXVCM4P2MEMode;
+
+
+
+/* 6.2.1.8 Motion Estimation Parameters */
+/* A data structure containing control parameters for
+ * motion estimation functions is defined as follows:
+ */
+
+typedef struct {
+ OMX_INT searchEnable8x8; /** enables 8x8 search */
+ OMX_INT halfPelSearchEnable; /** enables half-pel resolution */
+ OMX_INT searchRange; /** search range */
+ OMX_INT rndVal; /** rounding control; 0-disabled, 1-enabled*/
+} OMXVCM4P2MEParams;
+
+
+
+/* 6.2.1.9 Macroblock Information */
+/* A data structure containing macroblock parameters for
+ * motion estimation functions is defined as follows:
+ */
+
+typedef struct {
+ OMX_S32 sliceId; /* slice number */
+ OMXVCM4P2MacroblockType mbType; /* MB type: OMX_VC_INTRA, OMX_VC_INTER, or OMX_VC_INTER4V */
+ OMX_S32 qp; /* quantization parameter*/
+ OMX_U32 cbpy; /* CBP Luma */
+ OMX_U32 cbpc; /* CBP Chroma */
+ OMXVCMotionVector pMV0[2][2]; /* motion vector, represented using 1/2-pel units,
+ * pMV0[blocky][blockx] (blocky = 0~1, blockx =0~1)
+ */
+ OMXVCMotionVector pMVPred[2][2]; /* motion vector prediction, represented using 1/2-pel units,
+ * pMVPred[blocky][blockx] (blocky = 0~1, blockx = 0~1)
+ */
+ OMX_U8 pPredDir[2][2]; /* AC prediction direction:
+ * OMX_VC_NONE, OMX_VC_VERTICAL, OMX_VC_HORIZONTAL
+ */
+} OMXVCM4P2MBInfo, *OMXVCM4P2MBInfoPtr;
+
+
+
+/**
+ * Function: omxVCM4P2_FindMVpred (6.2.3.1.1)
+ *
+ * Description:
+ * Predicts a motion vector for the current block using the procedure
+ * specified in [ISO14496-2], subclause 7.6.5. The resulting predicted MV is
+ * returned in pDstMVPred. If the parameter pDstMVPredME is not NULL then
+ * the set of three MV candidates used for prediction is also returned,
+ * otherwise pDstMVPredME is NULL upon return.
+ *
+ * Input Arguments:
+ *
+ * pSrcMVCurMB - pointer to the MV buffer associated with the current Y
+ * macroblock; a value of NULL indicates unavailability.
+ * pSrcCandMV1 - pointer to the MV buffer containing the 4 MVs associated
+ * with the MB located to the left of the current MB; set to NULL
+ * if there is no MB to the left.
+ * pSrcCandMV2 - pointer to the MV buffer containing the 4 MVs associated
+ * with the MB located above the current MB; set to NULL if there
+ * is no MB located above the current MB.
+ * pSrcCandMV3 - pointer to the MV buffer containing the 4 MVs associated
+ * with the MB located to the right and above the current MB; set
+ * to NULL if there is no MB located to the above-right.
+ * iBlk - the index of block in the current macroblock
+ * pDstMVPredME - MV candidate return buffer; if set to NULL then
+ * prediction candidate MVs are not returned and pDstMVPredME will
+ * be NULL upon function return; if pDstMVPredME is non-NULL then it
+ * must point to a buffer containing sufficient space for three
+ * return MVs.
+ *
+ * Output Arguments:
+ *
+ * pDstMVPred - pointer to the predicted motion vector
+ * pDstMVPredME - if non-NULL upon input then pDstMVPredME points upon
+ * return to a buffer containing the three motion vector candidates
+ * used for prediction as specified in [ISO14496-2], subclause
+ * 7.6.5, otherwise if NULL upon input then pDstMVPredME is NULL
+ * upon output.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned under any of the following
+ * conditions:
+ * - the pointer pDstMVPred is NULL
+ * - the parameter iBlk does not fall into the range 0 <= iBlk<=3
+ *
+ */
+OMXResult omxVCM4P2_FindMVpred (
+ const OMXVCMotionVector *pSrcMVCurMB,
+ const OMXVCMotionVector *pSrcCandMV1,
+ const OMXVCMotionVector *pSrcCandMV2,
+ const OMXVCMotionVector *pSrcCandMV3,
+ OMXVCMotionVector *pDstMVPred,
+ OMXVCMotionVector *pDstMVPredME,
+ OMX_INT iBlk
+);
+
+
+
+/**
+ * Function: omxVCM4P2_IDCT8x8blk (6.2.3.2.1)
+ *
+ * Description:
+ * Computes a 2D inverse DCT for a single 8x8 block, as defined in
+ * [ISO14496-2].
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the start of the linearly arranged IDCT input buffer;
+ * must be aligned on a 16-byte boundary. According to
+ * [ISO14496-2], the input coefficient values should lie within the
+ * range [-2048, 2047].
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the start of the linearly arranged IDCT output buffer;
+ * must be aligned on a 16-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments:
+ * - pSrc or pDst is NULL.
+ * - pSrc or pDst is not 16-byte aligned.
+ *
+ */
+OMXResult omxVCM4P2_IDCT8x8blk (
+ const OMX_S16 *pSrc,
+ OMX_S16 *pDst
+);
+
+
+
+/**
+ * Function: omxVCM4P2_MEGetBufSize (6.2.4.1.1)
+ *
+ * Description:
+ * Computes the size, in bytes, of the vendor-specific specification
+ * structure for the following motion estimation functions:
+ * BlockMatch_Integer_8x8, BlockMatch_Integer_16x16, and MotionEstimationMB.
+ *
+ * Input Arguments:
+ *
+ * MEmode - motion estimation mode; available modes are defined by the
+ * enumerated type OMXVCM4P2MEMode
+ * pMEParams - motion estimation parameters
+ *
+ * Output Arguments:
+ *
+ * pSize - pointer to the number of bytes required for the specification
+ * structure
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - one or more of the following is true:
+ * - an invalid value was specified for the parameter MEmode
+ * - a negative or zero value was specified for the
+ * parameter pMEParams->searchRange
+ *
+ */
+OMXResult omxVCM4P2_MEGetBufSize (
+ OMXVCM4P2MEMode MEmode,
+ const OMXVCM4P2MEParams *pMEParams,
+ OMX_U32 *pSize
+);
+
+
+
+/**
+ * Function: omxVCM4P2_MEInit (6.2.4.1.2)
+ *
+ * Description:
+ * Initializes the vendor-specific specification structure required for the
+ * following motion estimation functions: BlockMatch_Integer_8x8,
+ * BlockMatch_Integer_16x16, and MotionEstimationMB. Memory for the
+ * specification structure *pMESpec must be allocated prior to calling the
+ * function, and should be aligned on a 4-byte boundary. Following
+ * initialization by this function, the vendor-specific structure *pMESpec
+ * should contain an implementation-specific representation of all motion
+ * estimation parameters received via the structure pMEParams, for example
+ * rndVal, searchRange, etc. The number of bytes required for the
+ * specification structure can be determined using the function
+ * omxVCM4P2_MEGetBufSize.
+ *
+ * Input Arguments:
+ *
+ * MEmode - motion estimation mode; available modes are defined by the
+ * enumerated type OMXVCM4P2MEMode
+ * pMEParams - motion estimation parameters
+ * pMESpec - pointer to the uninitialized ME specification structure
+ *
+ * Output Arguments:
+ *
+ * pMESpec - pointer to the initialized ME specification structure
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - one or more of the following is true:
+ * - an invalid value was specified for the parameter MEmode
+ * - a negative or zero value was specified for the
+ * parameter pMEParams->searchRange
+ *
+ */
+OMXResult omxVCM4P2_MEInit(
+    OMXVCM4P2MEMode MEmode,               /* in:     motion estimation mode */
+    const OMXVCM4P2MEParams *pMEParams,   /* in:     motion estimation parameters */
+    void *pMESpec                         /* in/out: caller-allocated ME specification structure */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_BlockMatch_Integer_16x16 (6.2.4.2.1)
+ *
+ * Description:
+ * Performs a 16x16 block search; estimates motion vector and associated
+ * minimum SAD. Both the input and output motion vectors are represented using
+ * half-pixel units, and therefore a shift left or right by 1 bit may be
+ * required, respectively, to match the input or output MVs with other
+ * functions that either generate output MVs or expect input MVs represented
+ * using integer pixel units.
+ *
+ * Input Arguments:
+ *
+ * pSrcRefBuf - pointer to the reference Y plane; points to the reference
+ * MB that corresponds to the location of the current macroblock in
+ * the current plane.
+ * refWidth - width of the reference plane
+ * pRefRect - pointer to the valid reference plane rectangle; coordinates
+ * are specified relative to the image origin. Rectangle
+ * boundaries may extend beyond image boundaries if the image has
+ * been padded. For example, if padding extends 4 pixels beyond
+ * frame border, then the value for the left border could be set to
+ * -4.
+ * pSrcCurrBuf - pointer to the current block in the current macroblock
+ * buffer extracted from the original plane (linear array, 256
+ * entries); must be aligned on a 16-byte boundary. The number of
+ * bytes between lines (step) is 16.
+ * pCurrPointPos - position of the current macroblock in the current plane
+ * pSrcPreMV - pointer to predicted motion vector; NULL indicates no
+ * predicted MV
+ * pSrcPreSAD - pointer to SAD associated with the predicted MV (referenced
+ * by pSrcPreMV); may be set to NULL if unavailable.
+ * pMESpec - vendor-specific motion estimation specification structure;
+ * must have been allocated and then initialized using
+ * omxVCM4P2_MEInit prior to calling the block matching function.
+ *
+ * Output Arguments:
+ *
+ * pDstMV - pointer to estimated MV
+ * pDstSAD - pointer to minimum SAD
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one of the following
+ * conditions is true:
+ * - at least one of the following pointers is NULL: pSrcRefBuf,
+ * pRefRect, pSrcCurrBuf, pCurrPointPos, pDstMV, pDstSAD or
+ * pMESpec, or
+ * - pSrcCurrBuf is not 16-byte aligned
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Integer_16x16(
+    const OMX_U8 *pSrcRefBuf,                   /* in:  reference Y plane, at the current MB location */
+    OMX_INT refWidth,                           /* in:  width of the reference plane */
+    const OMXRect *pRefRect,                    /* in:  valid reference plane rectangle */
+    const OMX_U8 *pSrcCurrBuf,                  /* in:  current MB buffer, 16-byte aligned, step 16 */
+    const OMXVCM4P2Coordinate *pCurrPointPos,   /* in:  position of the current macroblock */
+    const OMXVCMotionVector *pSrcPreMV,         /* in:  predicted MV; NULL if none */
+    const OMX_INT *pSrcPreSAD,                  /* in:  SAD of the predicted MV; may be NULL */
+    void *pMESpec,                              /* in:  ME spec initialized by omxVCM4P2_MEInit */
+    OMXVCMotionVector *pDstMV,                  /* out: estimated MV */
+    OMX_INT *pDstSAD                            /* out: minimum SAD */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_BlockMatch_Integer_8x8 (6.2.4.2.2)
+ *
+ * Description:
+ * Performs an 8x8 block search; estimates motion vector and associated
+ * minimum SAD. Both the input and output motion vectors are represented
+ * using half-pixel units, and therefore a shift left or right by 1 bit may be
+ * required, respectively, to match the input or output MVs with other
+ * functions that either generate output MVs or expect input MVs represented
+ * using integer pixel units.
+ *
+ * Input Arguments:
+ *
+ * pSrcRefBuf - pointer to the reference Y plane; points to the reference
+ * block that corresponds to the location of the current 8x8 block
+ * in the current plane.
+ * refWidth - width of the reference plane
+ * pRefRect - pointer to the valid reference plane rectangle; coordinates
+ * are specified relative to the image origin. Rectangle
+ * boundaries may extend beyond image boundaries if the image has
+ * been padded.
+ * pSrcCurrBuf - pointer to the current block in the current macroblock
+ * buffer extracted from the original plane (linear array, 128
+ * entries); must be aligned on an 8-byte boundary. The number of
+ * bytes between lines (step) is 16 bytes.
+ * pCurrPointPos - position of the current block in the current plane
+ * pSrcPreMV - pointer to predicted motion vector; NULL indicates no
+ * predicted MV
+ * pSrcPreSAD - pointer to SAD associated with the predicted MV (referenced
+ * by pSrcPreMV); may be set to NULL if unavailable.
+ * pMESpec - vendor-specific motion estimation specification structure;
+ * must have been allocated and then initialized using
+ * omxVCM4P2_MEInit prior to calling the block matching function.
+ *
+ * Output Arguments:
+ *
+ * pDstMV - pointer to estimated MV
+ * pDstSAD - pointer to minimum SAD
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one of the following
+ * conditions is true:
+ * - at least one of the following pointers is NULL: pSrcRefBuf,
+ * pRefRect, pSrcCurrBuf, pCurrPointPos, pDstMV, pDstSAD or
+ * pMESpec, or
+ * - pSrcCurrBuf is not 8-byte aligned
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Integer_8x8(
+    const OMX_U8 *pSrcRefBuf,                   /* in:  reference Y plane, at the current 8x8 block location */
+    OMX_INT refWidth,                           /* in:  width of the reference plane */
+    const OMXRect *pRefRect,                    /* in:  valid reference plane rectangle */
+    const OMX_U8 *pSrcCurrBuf,                  /* in:  current block buffer, 8-byte aligned, step 16 */
+    const OMXVCM4P2Coordinate *pCurrPointPos,   /* in:  position of the current block */
+    const OMXVCMotionVector *pSrcPreMV,         /* in:  predicted MV; NULL if none */
+    const OMX_INT *pSrcPreSAD,                  /* in:  SAD of the predicted MV; may be NULL */
+    void *pMESpec,                              /* in:  ME spec initialized by omxVCM4P2_MEInit */
+    OMXVCMotionVector *pDstMV,                  /* out: estimated MV */
+    OMX_INT *pDstSAD                            /* out: minimum SAD */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_BlockMatch_Half_16x16 (6.2.4.2.3)
+ *
+ * Description:
+ * Performs a 16x16 block match with half-pixel resolution. Returns the
+ * estimated motion vector and associated minimum SAD. This function
+ * estimates the half-pixel motion vector by interpolating the integer
+ * resolution motion vector referenced by the input parameter pSrcDstMV, i.e.,
+ * the initial integer MV is generated externally. The input parameters
+ * pSrcRefBuf and pSearchPointRefPos should be shifted by the winning MV of
+ * 16x16 integer search prior to calling BlockMatch_Half_16x16. The function
+ * BlockMatch_Integer_16x16 may be used for integer motion estimation.
+ *
+ * Input Arguments:
+ *
+ * pSrcRefBuf - pointer to the reference Y plane; points to the reference
+ * macroblock that corresponds to the location of the current
+ * macroblock in the current plane.
+ * refWidth - width of the reference plane
+ * pRefRect - reference plane valid region rectangle
+ * pSrcCurrBuf - pointer to the current block in the current macroblock
+ * buffer extracted from the original plane (linear array, 256
+ * entries); must be aligned on a 16-byte boundary. The number of
+ * bytes between lines (step) is 16.
+ * pSearchPointRefPos - position of the starting point for half pixel
+ * search (specified in terms of integer pixel units) in the
+ * reference plane, i.e., the reference position pointed to by the
+ * predicted motion vector.
+ * rndVal - rounding control parameter: 0 - disabled; 1 - enabled.
+ * pSrcDstMV - pointer to the initial MV estimate; typically generated
+ * during a prior 16x16 integer search; specified in terms of
+ * half-pixel units.
+ *
+ * Output Arguments:
+ *
+ * pSrcDstMV - pointer to estimated MV
+ * pDstSAD - pointer to minimum SAD
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one of the following
+ * conditions is true:
+ * - at least one of the following pointers is NULL: pSrcRefBuf,
+ * pRefRect, pSrcCurrBuf, pSearchPointRefPos, pSrcDstMV.
+ * - pSrcCurrBuf is not 16-byte aligned.
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Half_16x16(
+    const OMX_U8 *pSrcRefBuf,                        /* in:     reference Y plane, at the current MB location */
+    OMX_INT refWidth,                                /* in:     width of the reference plane */
+    const OMXRect *pRefRect,                         /* in:     reference plane valid region rectangle */
+    const OMX_U8 *pSrcCurrBuf,                       /* in:     current MB buffer, 16-byte aligned, step 16 */
+    const OMXVCM4P2Coordinate *pSearchPointRefPos,   /* in:     half-pixel search start, integer pixel units */
+    OMX_INT rndVal,                                  /* in:     rounding control: 0 - disabled; 1 - enabled */
+    OMXVCMotionVector *pSrcDstMV,                    /* in/out: initial MV estimate; estimated MV on return */
+    OMX_INT *pDstSAD                                 /* out:    minimum SAD */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_BlockMatch_Half_8x8 (6.2.4.2.4)
+ *
+ * Description:
+ * Performs an 8x8 block match with half-pixel resolution. Returns the
+ * estimated motion vector and associated minimum SAD. This function
+ * estimates the half-pixel motion vector by interpolating the integer
+ * resolution motion vector referenced by the input parameter pSrcDstMV, i.e.,
+ * the initial integer MV is generated externally. The input parameters
+ * pSrcRefBuf and pSearchPointRefPos should be shifted by the winning MV of
+ * 8x8 integer search prior to calling BlockMatch_Half_8x8. The function
+ * BlockMatch_Integer_8x8 may be used for integer motion estimation.
+ *
+ * Input Arguments:
+ *
+ * pSrcRefBuf - pointer to the reference Y plane; points to the reference
+ * block that corresponds to the location of the current 8x8 block
+ * in the current plane.
+ * refWidth - width of the reference plane
+ * pRefRect - reference plane valid region rectangle
+ * pSrcCurrBuf - pointer to the current block in the current macroblock
+ * buffer extracted from the original plane (linear array, 128
+ * entries); must be aligned on an 8-byte boundary. The number of
+ * bytes between lines (step) is 16.
+ * pSearchPointRefPos - position of the starting point for half pixel
+ * search (specified in terms of integer pixel units) in the
+ * reference plane.
+ * rndVal - rounding control parameter: 0 - disabled; 1 - enabled.
+ * pSrcDstMV - pointer to the initial MV estimate; typically generated
+ * during a prior 8x8 integer search, specified in terms of
+ * half-pixel units.
+ *
+ * Output Arguments:
+ *
+ * pSrcDstMV - pointer to estimated MV
+ * pDstSAD - pointer to minimum SAD
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one of the following
+ * conditions is true:
+ * - at least one of the following pointers is NULL:
+ * pSrcRefBuf, pRefRect, pSrcCurrBuf, pSearchPointRefPos, pSrcDstMV
+ * - pSrcCurrBuf is not 8-byte aligned
+ *
+ */
+OMXResult omxVCM4P2_BlockMatch_Half_8x8(
+    const OMX_U8 *pSrcRefBuf,                        /* in:     reference Y plane, at the current 8x8 block location */
+    OMX_INT refWidth,                                /* in:     width of the reference plane */
+    const OMXRect *pRefRect,                         /* in:     reference plane valid region rectangle */
+    const OMX_U8 *pSrcCurrBuf,                       /* in:     current block buffer, 8-byte aligned, step 16 */
+    const OMXVCM4P2Coordinate *pSearchPointRefPos,   /* in:     half-pixel search start, integer pixel units */
+    OMX_INT rndVal,                                  /* in:     rounding control: 0 - disabled; 1 - enabled */
+    OMXVCMotionVector *pSrcDstMV,                    /* in/out: initial MV estimate; estimated MV on return */
+    OMX_INT *pDstSAD                                 /* out:    minimum SAD */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_MotionEstimationMB (6.2.4.3.1)
+ *
+ * Description:
+ * Performs motion search for a 16x16 macroblock. Selects best motion search
+ * strategy from among inter-1MV, inter-4MV, and intra modes. Supports
+ * integer and half pixel resolution.
+ *
+ * Input Arguments:
+ *
+ * pSrcCurrBuf - pointer to the top-left corner of the current MB in the
+ * original picture plane; must be aligned on a 16-byte boundary.
+ * The function does not expect source data outside the region
+ * bounded by the MB to be available; for example it is not
+ * necessary for the caller to guarantee the availability of
+ * pSrcCurrBuf[-SrcCurrStep], i.e., the row of pixels above the MB
+ * to be processed.
+ * srcCurrStep - width of the original picture plane, in terms of full
+ * pixels; must be a multiple of 16.
+ * pSrcRefBuf - pointer to the reference Y plane; points to the reference
+ * plane location corresponding to the location of the current
+ * macroblock in the current plane; must be aligned on a 16-byte
+ * boundary.
+ * srcRefStep - width of the reference picture plane, in terms of full
+ * pixels; must be a multiple of 16.
+ * pRefRect - reference plane valid region rectangle, specified relative to
+ * the image origin
+ * pCurrPointPos - position of the current macroblock in the current plane
+ * pMESpec - pointer to the vendor-specific motion estimation specification
+ * structure; must be allocated and then initialized using
+ * omxVCM4P2_MEInit prior to calling this function.
+ * pMBInfo - array, of dimension four, containing pointers to information
+ * associated with four nearby MBs:
+ * - pMBInfo[0] - pointer to left MB information
+ * - pMBInfo[1] - pointer to top MB information
+ * - pMBInfo[2] - pointer to top-left MB information
+ * - pMBInfo[3] - pointer to top-right MB information
+ * Any pointer in the array may be set equal to NULL if the
+ * corresponding MB doesn't exist. For each MB, the following structure
+ * members are used:
+ * - mbType - macroblock type, either OMX_VC_INTRA, OMX_VC_INTER, or
+ * OMX_VC_INTER4V
+ * - pMV0[2][2] - estimated motion vectors; represented
+ * in 1/2 pixel units
+ * - sliceID - number of the slice to which the MB belongs
+ * pSrcDstMBCurr - pointer to information structure for the current MB.
+ * The following entries should be set prior to calling the
+ * function: sliceID - the number of the slice to which the
+ * current MB belongs. The structure elements cbpy and cbpc are
+ * ignored.
+ *
+ * Output Arguments:
+ *
+ * pSrcDstMBCurr - pointer to updated information structure for the current
+ * MB after MB-level motion estimation has been completed. The
+ * following structure members are updated by the ME function:
+ * - mbType - macroblock type: OMX_VC_INTRA, OMX_VC_INTER, or
+ * OMX_VC_INTER4V.
+ * - pMV0[2][2] - estimated motion vectors; represented in
+ * terms of 1/2 pel units.
+ * - pMVPred[2][2] - predicted motion vectors; represented
+ * in terms of 1/2 pel units.
+ * The structure members cbpy and cbpc are not updated by the function.
+ * pDstSAD - pointer to the minimum SAD for INTER1V, or sum of minimum SADs
+ * for INTER4V
+ * pDstBlockSAD - pointer to an array of SAD values for each of the four
+ * 8x8 luma blocks in the MB. The block SADs are in scan order for
+ * each MB.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments. Returned if one or more of the
+ * following conditions is true:
+ * - at least one of the following pointers is NULL: pSrcCurrBuf,
+ * pSrcRefBuf, pRefRect, pCurrPointPos, pMBInter, pMBIntra,
+ * pSrcDstMBCurr, or pDstSAD.
+ *
+ */
+OMXResult omxVCM4P2_MotionEstimationMB(
+    const OMX_U8 *pSrcCurrBuf,                  /* in:     top-left corner of the current MB, 16-byte aligned */
+    OMX_S32 srcCurrStep,                        /* in:     width of the original plane; multiple of 16 */
+    const OMX_U8 *pSrcRefBuf,                   /* in:     reference Y plane location, 16-byte aligned */
+    OMX_S32 srcRefStep,                         /* in:     width of the reference plane; multiple of 16 */
+    const OMXRect *pRefRect,                    /* in:     reference plane valid region rectangle */
+    const OMXVCM4P2Coordinate *pCurrPointPos,   /* in:     position of the current macroblock */
+    void *pMESpec,                              /* in:     ME spec initialized by omxVCM4P2_MEInit */
+    const OMXVCM4P2MBInfoPtr *pMBInfo,          /* in:     info for left, top, top-left, top-right MBs */
+    OMXVCM4P2MBInfo *pSrcDstMBCurr,             /* in/out: information structure for the current MB */
+    OMX_U16 *pDstSAD,                           /* out:    minimum SAD (INTER1V) or sum of minimum SADs (INTER4V) */
+    OMX_U16 *pDstBlockSAD                       /* out:    SADs of the four 8x8 luma blocks */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_DCT8x8blk (6.2.4.4.1)
+ *
+ * Description:
+ * Computes a 2D forward DCT for a single 8x8 block, as defined in
+ * [ISO14496-2].
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the start of the linearly arranged input buffer; must
+ * be aligned on a 16-byte boundary. Input values (pixel
+ * intensities) are valid in the range [-255,255].
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the start of the linearly arranged output buffer; must
+ * be aligned on a 16-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, returned if:
+ * - pSrc or pDst is NULL.
+ * - pSrc or pDst is not 16-byte aligned.
+ *
+ */
+OMXResult omxVCM4P2_DCT8x8blk(
+    const OMX_S16 *pSrc,   /* in:  linearly arranged input buffer, 16-byte aligned */
+    OMX_S16 *pDst          /* out: linearly arranged output buffer, 16-byte aligned */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_QuantIntra_I (6.2.4.4.2)
+ *
+ * Description:
+ * Performs quantization on intra block coefficients. This function supports
+ * bits_per_pixel == 8.
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input intra block coefficients; must be aligned
+ * on a 16-byte boundary.
+ * QP - quantization parameter (quantizer_scale).
+ * blockIndex - block index indicating the component type and position,
+ * valid in the range 0 to 5, as defined in [ISO14496-2], subclause
+ * 6.1.3.8.
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; shortVideoHeader==1 selects linear intra DC
+ * mode, and shortVideoHeader==0 selects non linear intra DC mode.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - pointer to the output (quantized) intra block coefficients.
+ * When shortVideoHeader==1, AC coefficients are saturated on the
+ * interval [-127, 127], and DC coefficients are saturated on the
+ * interval [1, 254]. When shortVideoHeader==0, AC coefficients
+ * are saturated on the interval [-2047, 2047].
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments:
+ * - pSrcDst is NULL.
+ * - blockIndex < 0 or blockIndex >= 10
+ * - QP <= 0 or QP >= 32.
+ *
+ */
+OMXResult omxVCM4P2_QuantIntra_I(
+    OMX_S16 *pSrcDst,           /* in/out: intra block coefficients, quantized in place */
+    OMX_U8 QP,                  /* in:     quantization parameter (quantizer_scale) */
+    OMX_INT blockIndex,         /* in:     block index (component type and position) */
+    OMX_INT shortVideoHeader    /* in:     binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_QuantInter_I (6.2.4.4.3)
+ *
+ * Description:
+ * Performs quantization on an inter coefficient block; supports
+ * bits_per_pixel == 8.
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input inter block coefficients; must be aligned
+ * on a 16-byte boundary.
+ * QP - quantization parameter (quantizer_scale)
+ * shortVideoHeader - binary flag indicating presence of short_video_header;
+ * shortVideoHeader==1 selects linear intra DC mode, and
+ * shortVideoHeader==0 selects non linear intra DC mode.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - pointer to the output (quantized) interblock coefficients.
+ * When shortVideoHeader==1, AC coefficients are saturated on the
+ * interval [-127, 127], and DC coefficients are saturated on the
+ * interval [1, 254]. When shortVideoHeader==0, AC coefficients
+ * are saturated on the interval [-2047, 2047].
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments:
+ * - pSrcDst is NULL.
+ * - QP <= 0 or QP >= 32.
+ *
+ */
+OMXResult omxVCM4P2_QuantInter_I(
+    OMX_S16 *pSrcDst,           /* in/out: inter block coefficients, quantized in place */
+    OMX_U8 QP,                  /* in:     quantization parameter (quantizer_scale) */
+    OMX_INT shortVideoHeader    /* in:     binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_TransRecBlockCoef_intra (6.2.4.4.4)
+ *
+ * Description:
+ * Quantizes the DCT coefficients, implements intra block AC/DC coefficient
+ * prediction, and reconstructs the current intra block texture for prediction
+ * on the next frame. Quantized row and column coefficients are returned in
+ * the updated coefficient buffers.
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the pixels of current intra block; must be aligned on
+ * an 8-byte boundary.
+ * pPredBufRow - pointer to the coefficient row buffer containing
+ * ((num_mb_per_row * 2 + 1) * 8) elements of type OMX_S16.
+ * Coefficients are organized into blocks of eight as described
+ * below (Internal Prediction Coefficient Update Procedures). The
+ * DC coefficient is first, and the remaining buffer locations
+ * contain the quantized AC coefficients. Each group of eight row
+ * buffer elements combined with one element eight elements ahead
+ * contains the coefficient predictors of the neighboring block
+ * that is spatially above or to the left of the block currently to
+ * be decoded. A negative-valued DC coefficient indicates that this
+ * neighboring block is not INTRA-coded or out of bounds, and
+ * therefore the AC and DC coefficients are invalid. Pointer must
+ * be aligned on an 8-byte boundary.
+ * pPredBufCol - pointer to the prediction coefficient column buffer
+ * containing 16 elements of type OMX_S16. Coefficients are
+ * organized as described in section 6.2.2.5. Pointer must be
+ * aligned on an 8-byte boundary.
+ * pSumErr - pointer to a flag indicating whether or not AC prediction is
+ * required; AC prediction is enabled if *pSumErr >=0, but the
+ * value is not used for coefficient prediction, i.e., the sum of
+ * absolute differences starts from 0 for each call to this
+ * function. Otherwise AC prediction is disabled if *pSumErr < 0.
+ * blockIndex - block index indicating the component type and position, as
+ * defined in [ISO14496-2], subclause 6.1.3.8.
+ * curQp - quantization parameter of the macroblock to which the current
+ * block belongs
+ * pQpBuf - pointer to a 2-element quantization parameter buffer; pQpBuf[0]
+ * contains the quantization parameter associated with the 8x8
+ * block left of the current block (QPa), and pQpBuf[1] contains
+ * the quantization parameter associated with the 8x8 block above
+ * the current block (QPc). In the event that the corresponding
+ * block is outside of the VOP bound, the Qp value will not affect
+ * the intra prediction process, as described in [ISO14496-2],
+ * sub-clause 7.4.3.3, Adaptive AC Coefficient Prediction.
+ * srcStep - width of the source buffer; must be a multiple of 8.
+ * dstStep - width of the reconstructed destination buffer; must be a
+ * multiple of 16.
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; shortVideoHeader==1 selects linear intra DC
+ * mode, and shortVideoHeader==0 selects non linear intra DC mode.
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the quantized DCT coefficient buffer; pDst[0] contains
+ * the predicted DC coefficient; the remaining entries contain the
+ * quantized AC coefficients (without prediction). The pointer
+ * pDst must be aligned on a 16-byte boundary.
+ * pRec - pointer to the reconstructed texture; must be aligned on an
+ * 8-byte boundary.
+ * pPredBufRow - pointer to the updated coefficient row buffer
+ * pPredBufCol - pointer to the updated coefficient column buffer
+ * pPreACPredict - if prediction is enabled, the parameter points to the
+ * start of the buffer containing the coefficient differences for
+ * VLC encoding. The entry pPreACPredict[0] indicates prediction
+ * direction for the current block and takes one of the following
+ * values: OMX_VC_NONE (prediction disabled), OMX_VC_HORIZONTAL, or
+ * OMX_VC_VERTICAL. The entries
+ * pPreACPredict[1]-pPreACPredict[7] contain predicted AC
+ * coefficients. If prediction is disabled (*pSumErr<0) then the
+ * contents of this buffer are undefined upon return from the
+ * function
+ * pSumErr - pointer to the value of the accumulated AC coefficient errors,
+ * i.e., sum of the absolute differences between predicted and
+ * unpredicted AC coefficients
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - Bad arguments:
+ * - At least one of the following pointers is NULL: pSrc, pDst, pRec,
+ * pPredBufRow, pPredBufCol, pQpBuf, pPreACPredict, pSumErr.
+ * - blockIndex < 0 or blockIndex >= 10;
+ * - curQp <= 0 or curQp >= 32.
+ * - srcStep or dstStep is <= 0 or not a multiple of 8.
+ * - pDst is not 16-byte aligned.
+ * - At least one of the following pointers is not 8-byte aligned:
+ * pSrc, pRec.
+ *
+ * Note: The coefficient buffers must be updated in accordance with the
+ * update procedures defined in section 6.2.2.
+ *
+ */
+OMXResult omxVCM4P2_TransRecBlockCoef_intra(
+    const OMX_U8 *pSrc,         /* in:     pixels of the current intra block, 8-byte aligned */
+    OMX_S16 *pDst,              /* out:    quantized DCT coefficient buffer, 16-byte aligned */
+    OMX_U8 *pRec,               /* out:    reconstructed texture, 8-byte aligned */
+    OMX_S16 *pPredBufRow,       /* in/out: coefficient row buffer */
+    OMX_S16 *pPredBufCol,       /* in/out: coefficient column buffer */
+    OMX_S16 *pPreACPredict,     /* out:    coefficient differences for VLC encoding */
+    OMX_INT *pSumErr,           /* in/out: AC prediction flag; accumulated AC coefficient errors on return */
+    OMX_INT blockIndex,         /* in:     block index (component type and position) */
+    OMX_U8 curQp,               /* in:     quantization parameter of the current macroblock */
+    const OMX_U8 *pQpBuf,       /* in:     2-element QP buffer: [0] left block, [1] block above */
+    OMX_INT srcStep,            /* in:     width of the source buffer */
+    OMX_INT dstStep,            /* in:     width of the reconstructed destination buffer */
+    OMX_INT shortVideoHeader    /* in:     binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_TransRecBlockCoef_inter (6.2.4.4.5)
+ *
+ * Description:
+ * Implements DCT, and quantizes the DCT coefficients of the inter block
+ * while reconstructing the texture residual. There is no boundary check for
+ * the bit stream buffer.
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the residuals to be encoded; must be aligned on a
+ * 16-byte boundary.
+ * QP - quantization parameter.
+ * shortVideoHeader - binary flag indicating presence of short_video_header;
+ * shortVideoHeader==1 selects linear intra DC mode, and
+ * shortVideoHeader==0 selects non linear intra DC mode.
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the quantized DCT coefficients buffer; must be aligned
+ * on a 16-byte boundary.
+ * pRec - pointer to the reconstructed texture residuals; must be aligned
+ * on a 16-byte boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments:
+ * - At least one of the following pointers is either NULL or
+ * not 16-byte aligned:
+ * - pSrc
+ * - pDst
+ * - pRec
+ * - QP <= 0 or QP >= 32.
+ *
+ */
+OMXResult omxVCM4P2_TransRecBlockCoef_inter(
+    const OMX_S16 *pSrc,        /* in:  residuals to be encoded, 16-byte aligned */
+    OMX_S16 *pDst,              /* out: quantized DCT coefficients, 16-byte aligned */
+    OMX_S16 *pRec,              /* out: reconstructed texture residuals, 16-byte aligned */
+    OMX_U8 QP,                  /* in:  quantization parameter */
+    OMX_INT shortVideoHeader    /* in:  binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_EncodeVLCZigzag_IntraDCVLC (6.2.4.5.2)
+ *
+ * Description:
+ * Performs zigzag scan and VLC encoding of AC and DC coefficients for one
+ * intra block. Two versions of the function (DCVLC and ACVLC) are provided
+ * in order to support the two different methods of processing DC
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, "Intra DC
+ * Coefficient Decoding for the Case of Switched VLC Encoding".
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - double pointer to the current byte in the bitstream
+ * pBitOffset - pointer to the bit position in the byte pointed by
+ * *ppBitStream. Valid within 0 to 7.
+ * pQDctBlkCoef - pointer to the quantized DCT coefficient
+ * predDir - AC prediction direction, which is used to decide the zigzag
+ * scan pattern; takes one of the following values:
+ * - OMX_VC_NONE - AC prediction not used.
+ * Performs classical zigzag scan.
+ * - OMX_VC_HORIZONTAL - Horizontal prediction.
+ * Performs alternate-vertical zigzag scan.
+ * - OMX_VC_VERTICAL - Vertical prediction.
+ * Performs alternate-horizontal zigzag scan.
+ * pattern - block pattern which is used to decide whether this block is
+ * encoded
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; escape modes 0-3 are used if
+ * shortVideoHeader==0, and escape mode 4 is used when
+ * shortVideoHeader==1.
+ * videoComp - video component type (luminance, chrominance) of the current
+ * block
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is encoded, so
+ * that it points to the current byte in the bit stream buffer.
+ * pBitOffset - *pBitOffset is updated so that it points to the current bit
+ * position in the byte pointed by *ppBitStream.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - Bad arguments:
+ * - At least one of the following pointers is NULL: ppBitStream,
+ * *ppBitStream, pBitOffset, pQDctBlkCoef.
+ * - *pBitOffset < 0, or *pBitOffset >7.
+ * - predDir is not one of: OMX_VC_NONE, OMX_VC_HORIZONTAL, or
+ * OMX_VC_VERTICAL.
+ * - videoComp is not one component of enum OMXVCM4P2VideoComponent.
+ *
+ */
+OMXResult omxVCM4P2_EncodeVLCZigzag_IntraDCVLC(
+    OMX_U8 **ppBitStream,                 /* in/out: current byte in the bitstream */
+    OMX_INT *pBitOffset,                  /* in/out: bit position within **ppBitStream, 0 to 7 */
+    const OMX_S16 *pQDctBlkCoef,          /* in:     quantized DCT coefficient */
+    OMX_U8 predDir,                       /* in:     AC prediction direction; selects the zigzag scan pattern */
+    OMX_U8 pattern,                       /* in:     block pattern; decides whether this block is encoded */
+    OMX_INT shortVideoHeader,             /* in:     binary flag indicating presence of short_video_header */
+    OMXVCM4P2VideoComponent videoComp     /* in:     video component type of the current block */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_EncodeVLCZigzag_IntraACVLC (6.2.4.5.2)
+ *
+ * Description:
+ * Performs zigzag scan and VLC encoding of AC and DC coefficients for one
+ * intra block. Two versions of the function (DCVLC and ACVLC) are provided
+ * in order to support the two different methods of processing DC
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, Intra DC
+ * Coefficient Decoding for the Case of Switched VLC Encoding.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - double pointer to the current byte in the bitstream
+ * pBitOffset - pointer to the bit position in the byte pointed by
+ * *ppBitStream. Valid within 0 to 7.
+ * pQDctBlkCoef - pointer to the quantized DCT coefficient
+ * predDir - AC prediction direction, which is used to decide the zigzag
+ * scan pattern; takes one of the following values:
+ * - OMX_VC_NONE - AC prediction not used.
+ * Performs classical zigzag scan.
+ * - OMX_VC_HORIZONTAL - Horizontal prediction.
+ * Performs alternate-vertical zigzag scan.
+ * - OMX_VC_VERTICAL - Vertical prediction.
+ * Performs alternate-horizontal zigzag scan.
+ * pattern - block pattern which is used to decide whether this block is
+ * encoded
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; escape modes 0-3 are used if
+ * shortVideoHeader==0, and escape mode 4 is used when
+ * shortVideoHeader==1.
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is encoded, so
+ * that it points to the current byte in the bit stream buffer.
+ * pBitOffset - *pBitOffset is updated so that it points to the current bit
+ * position in the byte pointed by *ppBitStream.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - Bad arguments:
+ * - At least one of the following pointers is NULL: ppBitStream,
+ * *ppBitStream, pBitOffset, pQDctBlkCoef.
+ * - *pBitOffset < 0, or *pBitOffset >7.
+ * - predDir is not one of: OMX_VC_NONE, OMX_VC_HORIZONTAL, or
+ * OMX_VC_VERTICAL.
+ * - VideoComp is not one component of enum OMXVCM4P2VideoComponent.
+ *
+ */
+OMXResult omxVCM4P2_EncodeVLCZigzag_IntraACVLC(
+    OMX_U8 **ppBitStream,           /* in/out: current byte in the bitstream */
+    OMX_INT *pBitOffset,            /* in/out: bit position within **ppBitStream, 0 to 7 */
+    const OMX_S16 *pQDctBlkCoef,    /* in:     quantized DCT coefficient */
+    OMX_U8 predDir,                 /* in:     AC prediction direction; selects the zigzag scan pattern */
+    OMX_U8 pattern,                 /* in:     block pattern; decides whether this block is encoded */
+    OMX_INT shortVideoHeader        /* in:     binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_EncodeVLCZigzag_Inter (6.2.4.5.3)
+ *
+ * Description:
+ * Performs classical zigzag scanning and VLC encoding for one inter block.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - pointer to the pointer to the current byte in the bit
+ * stream
+ * pBitOffset - pointer to the bit position in the byte pointed by
+ * *ppBitStream. Valid within 0 to 7
+ * pQDctBlkCoef - pointer to the quantized DCT coefficient
+ * pattern - block pattern which is used to decide whether this block is
+ * encoded
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; escape modes 0-3 are used if
+ * shortVideoHeader==0, and escape mode 4 is used when
+ * shortVideoHeader==1.
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is encoded so that
+ * it points to the current byte in the bit stream buffer.
+ * pBitOffset - *pBitOffset is updated so that it points to the current bit
+ * position in the byte pointed by *ppBitStream.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - Bad arguments
+ * - At least one of the following pointers is NULL: ppBitStream, *ppBitStream,
+ * pBitOffset, pQDctBlkCoef
+ * - *pBitOffset < 0, or *pBitOffset >7.
+ *
+ */
+OMXResult omxVCM4P2_EncodeVLCZigzag_Inter(
+    OMX_U8 **ppBitStream,           /* in/out: current byte in the bit stream */
+    OMX_INT *pBitOffset,            /* in/out: bit position within **ppBitStream, 0 to 7 */
+    const OMX_S16 *pQDctBlkCoef,    /* in:     quantized DCT coefficient */
+    OMX_U8 pattern,                 /* in:     block pattern; decides whether this block is encoded */
+    OMX_INT shortVideoHeader        /* in:     binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_EncodeMV (6.2.4.5.4)
+ *
+ * Description:
+ * Predicts a motion vector for the current macroblock, encodes the
+ * difference, and writes the output to the stream buffer. The input MVs
+ * pMVCurMB, pSrcMVLeftMB, pSrcMVUpperMB, and pSrcMVUpperRightMB should lie
+ * within the ranges associated with the input parameter fcodeForward, as
+ * described in [ISO14496-2], subclause 7.6.3. This function provides a
+ * superset of the functionality associated with the function
+ * omxVCM4P2_FindMVpred.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - double pointer to the current byte in the bitstream buffer
+ * pBitOffset - index of the first free (next available) bit in the stream
+ * buffer referenced by *ppBitStream, valid in the range 0 to 7.
+ * pMVCurMB - pointer to the current macroblock motion vector; a value of
+ * NULL indicates unavailability.
+ * pSrcMVLeftMB - pointer to the source left macroblock motion vector; a
+ * value of NULL indicates unavailability.
+ * pSrcMVUpperMB - pointer to source upper macroblock motion vector; a
+ * value of NULL indicates unavailability.
+ * pSrcMVUpperRightMB - pointer to source upper right MB motion vector; a
+ * value of NULL indicates unavailability.
+ * fcodeForward - an integer with values from 1 to 7; used in encoding
+ * motion vectors related to search range, as described in
+ * [ISO14496-2], subclause 7.6.3.
+ * MBType - macro block type, valid in the range 0 to 5
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - updated pointer to the current byte in the bit stream
+ * buffer
+ * pBitOffset - updated index of the next available bit position in stream
+ * buffer referenced by *ppBitStream
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - At least one of the following pointers is NULL: ppBitStream,
+ * *ppBitStream, pBitOffset, pMVCurMB
+ * - *pBitOffset < 0, or *pBitOffset >7.
+ * - fcodeForward <= 0, or fcodeForward > 7, or MBType < 0.
+ *
+ */
+OMXResult omxVCM4P2_EncodeMV (
+ OMX_U8 **ppBitStream, /* in/out: current byte in the bitstream buffer; neither level may be NULL */
+ OMX_INT *pBitOffset, /* in/out: next available bit within the byte at *ppBitStream, 0..7 */
+ const OMXVCMotionVector *pMVCurMB, /* in: current macroblock MV; NULL is rejected with OMX_Sts_BadArgErr */
+ const OMXVCMotionVector *pSrcMVLeftMB, /* in: left macroblock MV; NULL indicates unavailability */
+ const OMXVCMotionVector *pSrcMVUpperMB, /* in: upper macroblock MV; NULL indicates unavailability */
+ const OMXVCMotionVector *pSrcMVUpperRightMB, /* in: upper-right macroblock MV; NULL indicates unavailability */
+ OMX_INT fcodeForward, /* in: 1..7 */
+ OMXVCM4P2MacroblockType MBType /* in: macroblock type, valid in the range 0..5 */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_DecodePadMV_PVOP (6.2.5.1.1)
+ *
+ * Description:
+ * Decodes and pads the four motion vectors associated with a non-intra P-VOP
+ * macroblock. For macroblocks of type OMX_VC_INTER4V, the output MV is
+ * padded as specified in [ISO14496-2], subclause 7.6.1.6. Otherwise, for
+ * macroblocks of types other than OMX_VC_INTER4V, the decoded MV is copied to
+ * all four output MV buffer entries.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - pointer to the pointer to the current byte in the bit
+ * stream buffer
+ * pBitOffset - pointer to the bit position in the byte pointed to by
+ * *ppBitStream. *pBitOffset is valid within [0-7].
+ * pSrcMVLeftMB, pSrcMVUpperMB, and pSrcMVUpperRightMB - pointers to the
+ * motion vector buffers of the macroblocks spatially at the left,
+ * upper, and upper-right side of the current macroblock,
+ * respectively; a value of NULL indicates unavailability. Note:
+ * Any neighborhood macroblock outside the current VOP or video
+ * packet or outside the current GOB (when short_video_header is
+ * 1 ) for which gob_header_empty is 0 is treated as
+ * transparent, according to [ISO14496-2], subclause 7.6.5.
+ * fcodeForward - a code equal to vop_fcode_forward in MPEG-4 bit stream
+ * syntax
+ * MBType - the type of the current macroblock. If MBType is not equal to
+ * OMX_VC_INTER4V, the destination motion vector buffer is still
+ * filled with the same decoded vector.
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is decoded, so
+ * that it points to the current byte in the bit stream buffer
+ * pBitOffset - *pBitOffset is updated so that it points to the current bit
+ * position in the byte pointed by *ppBitStream
+ * pDstMVCurMB - pointer to the motion vector buffer for the current
+ * macroblock; contains four decoded motion vectors
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments:
+ * - At least one of the following pointers is NULL:
+ * ppBitStream, *ppBitStream, pBitOffset, pDstMVCurMB
+ * - *pBitOffset exceeds [0,7]
+ * - fcodeForward exceeds (0,7]
+ * - MBType less than zero
+ * - motion vector buffer is not 4-byte aligned.
+ * OMX_Sts_Err - status error
+ *
+ */
+OMXResult omxVCM4P2_DecodePadMV_PVOP (
+ const OMX_U8 **ppBitStream, /* in/out: current byte in the bit stream buffer; neither level may be NULL */
+ OMX_INT *pBitOffset, /* in/out: bit position within the byte at *ppBitStream, 0..7 */
+ OMXVCMotionVector *pSrcMVLeftMB, /* in: MV buffer of the left macroblock; NULL indicates unavailability */
+ OMXVCMotionVector *pSrcMVUpperMB, /* in: MV buffer of the upper macroblock; NULL indicates unavailability */
+ OMXVCMotionVector *pSrcMVUpperRightMB, /* in: MV buffer of the upper-right macroblock; NULL indicates unavailability */
+ OMXVCMotionVector *pDstMVCurMB, /* out: receives four decoded motion vectors; must not be NULL */
+ OMX_INT fcodeForward, /* in: equal to vop_fcode_forward; valid range (0,7] */
+ OMXVCM4P2MacroblockType MBType /* in: type of the current macroblock; must not be negative */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_DecodeVLCZigzag_IntraDCVLC (6.2.5.2.2)
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan of AC and DC coefficients
+ * for one intra block. Two versions of the function (DCVLC and ACVLC) are
+ * provided in order to support the two different methods of processing DC
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, Intra DC
+ * Coefficient Decoding for the Case of Switched VLC Encoding.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - pointer to the pointer to the current byte in the
+ * bitstream buffer
+ * pBitOffset - pointer to the bit position in the current byte referenced
+ * by *ppBitStream. The parameter *pBitOffset is valid in the
+ * range [0-7].
+ * Bit Position in one byte: |Most Least|
+ * *pBitOffset |0 1 2 3 4 5 6 7|
+ * predDir - AC prediction direction; used to select the zigzag scan
+ * pattern; takes one of the following values:
+ * - OMX_VC_NONE - AC prediction not used;
+ * performs classical zigzag scan.
+ * - OMX_VC_HORIZONTAL - Horizontal prediction;
+ * performs alternate-vertical zigzag scan;
+ * - OMX_VC_VERTICAL - Vertical prediction;
+ * performs alternate-horizontal zigzag scan.
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; escape modes 0-3 are used if
+ * shortVideoHeader==0, and escape mode 4 is used when
+ * shortVideoHeader==1.
+ * videoComp - video component type (luminance or chrominance) of the
+ * current block
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is decoded such
+ * that it points to the current byte in the bit stream buffer
+ * pBitOffset - *pBitOffset is updated such that it points to the current
+ * bit position in the byte pointed by *ppBitStream
+ * pDst - pointer to the coefficient buffer of current block; must be
+ * 4-byte aligned.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, if:
+ * - At least one of the following pointers is NULL:
+ * ppBitStream, *ppBitStream, pBitOffset, pDst
+ * - *pBitOffset exceeds [0,7]
+ * - predDir exceeds [0,2]
+ * - pDst is not 4-byte aligned
+ * OMX_Sts_Err - if:
+ * - In DecodeVLCZigzag_IntraDCVLC, dc_size > 12
+ * - At least one of mark bits equals zero
+ * - Illegal stream encountered; code cannot be located in VLC table
+ * - Forbidden code encountered in the VLC FLC table.
+ * - The number of coefficients is greater than 64
+ *
+ */
+OMXResult omxVCM4P2_DecodeVLCZigzag_IntraDCVLC (
+ const OMX_U8 **ppBitStream, /* in/out: current byte in the bitstream buffer; neither level may be NULL */
+ OMX_INT *pBitOffset, /* in/out: bit position within the byte at *ppBitStream, 0..7 (0 = most significant bit) */
+ OMX_S16 *pDst, /* out: coefficient buffer of the current block; must be 4-byte aligned */
+ OMX_U8 predDir, /* in: OMX_VC_NONE, OMX_VC_HORIZONTAL or OMX_VC_VERTICAL; selects the zigzag scan pattern */
+ OMX_INT shortVideoHeader, /* in: 0 selects escape modes 0-3, 1 selects escape mode 4 */
+ OMXVCM4P2VideoComponent videoComp /* in: luminance or chrominance component of the current block */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_DecodeVLCZigzag_IntraACVLC (6.2.5.2.2)
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan of AC and DC coefficients
+ * for one intra block. Two versions of the function (DCVLC and ACVLC) are
+ * provided in order to support the two different methods of processing DC
+ * coefficients, as described in [ISO14496-2], subclause 7.4.1.4, Intra DC
+ * Coefficient Decoding for the Case of Switched VLC Encoding.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - pointer to the pointer to the current byte in the
+ * bitstream buffer
+ * pBitOffset - pointer to the bit position in the current byte referenced
+ * by *ppBitStream. The parameter *pBitOffset is valid in the
+ * range [0-7].
+ * Bit Position in one byte: |Most Least|
+ * *pBitOffset |0 1 2 3 4 5 6 7|
+ * predDir - AC prediction direction; used to select the zigzag scan
+ * pattern; takes one of the following values:
+ * - OMX_VC_NONE - AC prediction not used;
+ * performs classical zigzag scan.
+ * - OMX_VC_HORIZONTAL - Horizontal prediction;
+ * performs alternate-vertical zigzag scan;
+ * - OMX_VC_VERTICAL - Vertical prediction;
+ * performs alternate-horizontal zigzag scan.
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; escape modes 0-3 are used if
+ * shortVideoHeader==0, and escape mode 4 is used when
+ * shortVideoHeader==1.
+ * videoComp - video component type (luminance or chrominance) of the
+ * current block (DCVLC version only; the IntraACVLC prototype
+ * below takes no videoComp argument)
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is decoded such
+ * that it points to the current byte in the bit stream buffer
+ * pBitOffset - *pBitOffset is updated such that it points to the current
+ * bit position in the byte pointed by *ppBitStream
+ * pDst - pointer to the coefficient buffer of current block; must be
+ * 4-byte aligned.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, if:
+ * - At least one of the following pointers is NULL:
+ * ppBitStream, *ppBitStream, pBitOffset, pDst
+ * - *pBitOffset exceeds [0,7]
+ * - predDir exceeds [0,2]
+ * - pDst is not 4-byte aligned
+ * OMX_Sts_Err - if:
+ * - In DecodeVLCZigzag_IntraDCVLC, dc_size > 12
+ * - At least one of mark bits equals zero
+ * - Illegal stream encountered; code cannot be located in VLC table
+ * - Forbidden code encountered in the VLC FLC table
+ * - The number of coefficients is greater than 64
+ *
+ */
+OMXResult omxVCM4P2_DecodeVLCZigzag_IntraACVLC (
+ const OMX_U8 **ppBitStream, /* in/out: current byte in the bitstream buffer; neither level may be NULL */
+ OMX_INT *pBitOffset, /* in/out: bit position within the byte at *ppBitStream, 0..7 (0 = most significant bit) */
+ OMX_S16 *pDst, /* out: coefficient buffer of the current block; must be 4-byte aligned */
+ OMX_U8 predDir, /* in: OMX_VC_NONE, OMX_VC_HORIZONTAL or OMX_VC_VERTICAL; selects the zigzag scan pattern */
+ OMX_INT shortVideoHeader /* in: 0 selects escape modes 0-3, 1 selects escape mode 4 */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_DecodeVLCZigzag_Inter (6.2.5.2.3)
+ *
+ * Description:
+ * Performs VLC decoding and inverse zigzag scan for one inter-coded block.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - double pointer to the current byte in the stream buffer
+ * pBitOffset - pointer to the next available bit in the current stream
+ * byte referenced by *ppBitStream. The parameter *pBitOffset is
+ * valid within the range [0-7].
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; escape modes 0-3 are used if
+ * shortVideoHeader==0, and escape mode 4 is used when
+ * shortVideoHeader==1.
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is decoded such
+ * that it points to the current byte in the stream buffer
+ * pBitOffset - *pBitOffset is updated after decoding such that it points
+ * to the next available bit in the stream byte referenced by
+ * *ppBitStream
+ * pDst - pointer to the coefficient buffer of current block; must be
+ * 4-byte aligned.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments:
+ * - At least one of the following pointers is NULL:
+ * ppBitStream, *ppBitStream, pBitOffset, pDst
+ * - pDst is not 4-byte aligned
+ * - *pBitOffset exceeds [0,7]
+ * OMX_Sts_Err - status error, if:
+ * - At least one mark bit is equal to zero
+ * - Encountered an illegal stream code that cannot be found in the VLC table
+ * - Encountered an illegal code in the VLC FLC table
+ * - The number of coefficients is greater than 64
+ *
+ */
+OMXResult omxVCM4P2_DecodeVLCZigzag_Inter (
+ const OMX_U8 **ppBitStream, /* in/out: current byte in the stream buffer; neither level may be NULL */
+ OMX_INT *pBitOffset, /* in/out: next available bit within the byte at *ppBitStream, 0..7 */
+ OMX_S16 *pDst, /* out: coefficient buffer of the current block; must be 4-byte aligned */
+ OMX_INT shortVideoHeader /* in: 0 selects escape modes 0-3, 1 selects escape mode 4 */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_QuantInvIntra_I (6.2.5.3.2)
+ *
+ * Description:
+ * Performs the second inverse quantization mode on an intra/inter coded
+ * block. Supports bits_per_pixel = 8. The output coefficients are clipped to
+ * the range [-2048, 2047].
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input (quantized) intra/inter block; must be
+ * aligned on a 16-byte boundary.
+ * QP - quantization parameter (quantizer_scale)
+ * videoComp - video component type of the current block. Takes one of the
+ * following flags: OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE (intra
+ * version only).
+ * shortVideoHeader - binary flag indicating presence of short_video_header
+ * (intra version only).
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - pointer to the output (dequantized) intra/inter block
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; one or more of the following is
+ * true:
+ * - pSrcDst is NULL
+ * - QP <= 0 or QP >=31
+ * - videoComp is neither OMX_VC_LUMINANCE nor OMX_VC_CHROMINANCE.
+ *
+ */
+OMXResult omxVCM4P2_QuantInvIntra_I (
+ OMX_S16 *pSrcDst, /* in/out: quantized block on entry, dequantized block on return; 16-byte aligned, must not be NULL */
+ OMX_INT QP, /* in: quantization parameter (quantizer_scale) */
+ OMXVCM4P2VideoComponent videoComp, /* in: OMX_VC_LUMINANCE or OMX_VC_CHROMINANCE */
+ OMX_INT shortVideoHeader /* in: binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_QuantInvInter_I (6.2.5.3.2)
+ *
+ * Description:
+ * Performs the second inverse quantization mode on an intra/inter coded
+ * block. Supports bits_per_pixel = 8. The output coefficients are clipped to
+ * the range [-2048, 2047].
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input (quantized) intra/inter block; must be
+ * aligned on a 16-byte boundary.
+ * QP - quantization parameter (quantizer_scale)
+ * videoComp - video component type of the current block. Takes one of the
+ * following flags: OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE (intra
+ * version only).
+ * shortVideoHeader - binary flag indicating presence of short_video_header
+ * (intra version only).
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - pointer to the output (dequantized) intra/inter block
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; one or more of the following is
+ * true:
+ * - pSrcDst is NULL
+ * - QP <= 0 or QP >=31
+ * - videoComp is neither OMX_VC_LUMINANCE nor OMX_VC_CHROMINANCE (intra version only).
+ *
+ */
+OMXResult omxVCM4P2_QuantInvInter_I (
+ OMX_S16 *pSrcDst, /* in/out: quantized block on entry, dequantized block on return; 16-byte aligned, must not be NULL */
+ OMX_INT QP /* in: quantization parameter (quantizer_scale) */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_DecodeBlockCoef_Intra (6.2.5.4.1)
+ *
+ * Description:
+ * Decodes the INTRA block coefficients. Inverse quantization, inversely
+ * zigzag positioning, and IDCT, with appropriate clipping on each step, are
+ * performed on the coefficients. The results are then placed in the output
+ * frame/plane on a pixel basis. Note: This function will be used only when
+ * at least one non-zero AC coefficient of current block exists in the bit
+ * stream. The DC only condition will be handled in another function.
+ *
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - pointer to the pointer to the current byte in the bit
+ * stream buffer. There is no boundary check for the bit stream
+ * buffer.
+ * pBitOffset - pointer to the bit position in the byte pointed to by
+ * *ppBitStream. *pBitOffset is valid within [0-7].
+ * step - width of the destination plane
+ * pCoefBufRow - pointer to the coefficient row buffer; must be aligned on
+ * an 8-byte boundary.
+ * pCoefBufCol - pointer to the coefficient column buffer; must be aligned
+ * on an 8-byte boundary.
+ * curQP - quantization parameter of the macroblock which the current block
+ * belongs to
+ * pQPBuf - pointer to the quantization parameter buffer
+ * blockIndex - block index indicating the component type and position as
+ * defined in [ISO14496-2], subclause 6.1.3.8, Figure 6-5.
+ * intraDCVLC - a code determined by intra_dc_vlc_thr and QP. This allows a
+ * mechanism to switch between two VLC for coding of Intra DC
+ * coefficients as per [ISO14496-2], Table 6-21.
+ * ACPredFlag - a flag equal to ac_pred_flag (of luminance) indicating if
+ * the ac coefficients of the first row or first column are
+ * differentially coded for intra coded macroblock.
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; shortVideoHeader==1 selects linear intra DC
+ * mode, and shortVideoHeader==0 selects non linear intra DC mode.
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is decoded, so
+ * that it points to the current byte in the bit stream buffer
+ * pBitOffset - *pBitOffset is updated so that it points to the current bit
+ * position in the byte pointed by *ppBitStream
+ * pDst - pointer to the block in the destination plane; must be aligned on
+ * an 8-byte boundary.
+ * pCoefBufRow - pointer to the updated coefficient row buffer.
+ * pCoefBufCol - pointer to the updated coefficient column buffer Note:
+ * The coefficient buffers must be updated in accordance with the
+ * update procedure defined in section 6.2.2.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, if:
+ * - At least one of the following pointers is NULL:
+ * ppBitStream, *ppBitStream, pBitOffset, pCoefBufRow, pCoefBufCol,
+ * pQPBuf, pDst.
+ * - *pBitOffset exceeds [0,7]
+ * - curQP exceeds (1, 31)
+ * - blockIndex exceeds [0,5]
+ * - step is not a multiple of 8
+ * - a pointer alignment requirement was violated.
+ * OMX_Sts_Err - status error. Refer to OMX_Sts_Err of DecodeVLCZigzag_Intra.
+ *
+ */
+OMXResult omxVCM4P2_DecodeBlockCoef_Intra (
+ const OMX_U8 **ppBitStream, /* in/out: current byte in the bit stream buffer; no boundary check is performed on the buffer */
+ OMX_INT *pBitOffset, /* in/out: bit position within the byte at *ppBitStream, 0..7 */
+ OMX_U8 *pDst, /* out: block in the destination plane; must be 8-byte aligned */
+ OMX_INT step, /* in: width of the destination plane; must be a multiple of 8 */
+ OMX_S16 *pCoefBufRow, /* in/out: coefficient row buffer; must be 8-byte aligned */
+ OMX_S16 *pCoefBufCol, /* in/out: coefficient column buffer; must be 8-byte aligned */
+ OMX_U8 curQP, /* in: quantization parameter of the macroblock the current block belongs to */
+ const OMX_U8 *pQPBuf, /* in: quantization parameter buffer; must not be NULL */
+ OMX_INT blockIndex, /* in: component type and position of the block, 0..5 */
+ OMX_INT intraDCVLC, /* in: code determined by intra_dc_vlc_thr and QP; switches between the two intra DC VLCs */
+ OMX_INT ACPredFlag, /* in: equal to ac_pred_flag (of luminance) */
+ OMX_INT shortVideoHeader /* in: 1 selects linear intra DC mode, 0 selects non-linear intra DC mode */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_DecodeBlockCoef_Inter (6.2.5.4.2)
+ *
+ * Description:
+ * Decodes the INTER block coefficients. This function performs inverse
+ * quantization, inverse zigzag positioning, and IDCT (with appropriate
+ * clipping on each step) on the coefficients. The results (residuals) are
+ * placed in a contiguous array of 64 elements. For INTER block, the output
+ * buffer holds the residuals for further reconstruction.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - pointer to the pointer to the current byte in the bit
+ * stream buffer. There is no boundary check for the bit stream
+ * buffer.
+ * pBitOffset - pointer to the bit position in the byte pointed to by
+ * *ppBitStream. *pBitOffset is valid within [0-7]
+ * QP - quantization parameter
+ * shortVideoHeader - binary flag indicating presence of
+ * short_video_header; shortVideoHeader==1 selects linear intra DC
+ * mode, and shortVideoHeader==0 selects non linear intra DC mode.
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after the block is decoded, so
+ * that it points to the current byte in the bit stream buffer
+ * pBitOffset - *pBitOffset is updated so that it points to the current bit
+ * position in the byte pointed by *ppBitStream
+ * pDst - pointer to the decoded residual buffer (a contiguous array of 64
+ * elements of OMX_S16 data type); must be aligned on a 16-byte
+ * boundary.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, if:
+ * - At least one of the following pointers is NULL:
+ * ppBitStream, *ppBitStream, pBitOffset, pDst
+ * - *pBitOffset exceeds [0,7]
+ * - QP <= 0.
+ * - pDst is not 16-byte aligned
+ * OMX_Sts_Err - status error. Refer to OMX_Sts_Err of DecodeVLCZigzag_Inter.
+ *
+ */
+OMXResult omxVCM4P2_DecodeBlockCoef_Inter (
+ const OMX_U8 **ppBitStream, /* in/out: current byte in the bit stream buffer; no boundary check is performed on the buffer */
+ OMX_INT *pBitOffset, /* in/out: bit position within the byte at *ppBitStream, 0..7 */
+ OMX_S16 *pDst, /* out: decoded residuals, a contiguous array of 64 OMX_S16 elements; must be 16-byte aligned */
+ OMX_INT QP, /* in: quantization parameter; must be greater than 0 */
+ OMX_INT shortVideoHeader /* in: binary flag indicating presence of short_video_header */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_PredictReconCoefIntra (6.2.5.4.3)
+ *
+ * Description:
+ * Performs adaptive DC/AC coefficient prediction for an intra block. Prior
+ * to the function call, prediction direction (predDir) should be selected as
+ * specified in [ISO14496-2], subclause 7.4.3.1.
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the coefficient buffer which contains the quantized
+ * coefficient residuals (PQF) of the current block; must be
+ * aligned on a 4-byte boundary. The output coefficients are
+ * saturated to the range [-2048, 2047].
+ * pPredBufRow - pointer to the coefficient row buffer; must be aligned on
+ * a 4-byte boundary.
+ * pPredBufCol - pointer to the coefficient column buffer; must be aligned
+ * on a 4-byte boundary.
+ * curQP - quantization parameter of the current block. curQP may be equal
+ * to predQP, especially when the current block and the predictor
+ * block are in the same macroblock.
+ * predQP - quantization parameter of the predictor block
+ * predDir - indicates the prediction direction which takes one of the
+ * following values: OMX_VC_HORIZONTAL - predict horizontally;
+ * OMX_VC_VERTICAL - predict vertically
+ * ACPredFlag - a flag indicating if AC prediction should be performed. It
+ * is equal to ac_pred_flag in the bit stream syntax of MPEG-4
+ * videoComp - video component type (luminance or chrominance) of the
+ * current block
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - pointer to the coefficient buffer which contains the quantized
+ * coefficients (QF) of the current block
+ * pPredBufRow - pointer to the updated coefficient row buffer
+ * pPredBufCol - pointer to the updated coefficient column buffer Note:
+ * Buffer update: Update the AC prediction buffer (both row and
+ * column buffer).
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments, if:
+ * - At least one of the pointers is NULL:
+ * pSrcDst, pPredBufRow, or pPredBufCol.
+ * - curQP <= 0,
+ * - predQP <= 0,
+ * - curQP >31,
+ * - predQP > 31,
+ * - predDir exceeds [1,2]
+ * - pSrcDst, pPredBufRow, or pPredBufCol is not 4-byte aligned.
+ *
+ */
+OMXResult omxVCM4P2_PredictReconCoefIntra (
+ OMX_S16 *pSrcDst, /* in/out: quantized coefficient residuals (PQF) on entry, quantized coefficients (QF) on return; 4-byte aligned */
+ OMX_S16 *pPredBufRow, /* in/out: coefficient row buffer, updated on return; 4-byte aligned */
+ OMX_S16 *pPredBufCol, /* in/out: coefficient column buffer, updated on return; 4-byte aligned */
+ OMX_INT curQP, /* in: quantization parameter of the current block; must be in 1..31 */
+ OMX_INT predQP, /* in: quantization parameter of the predictor block; must be in 1..31 */
+ OMX_INT predDir, /* in: OMX_VC_HORIZONTAL or OMX_VC_VERTICAL */
+ OMX_INT ACPredFlag, /* in: equal to ac_pred_flag; indicates whether AC prediction is performed */
+ OMXVCM4P2VideoComponent videoComp /* in: luminance or chrominance component of the current block */
+);
+
+
+
+/**
+ * Function: omxVCM4P2_MCReconBlock (6.2.5.5.1)
+ *
+ * Description:
+ * Performs motion compensation prediction for an 8x8 block using
+ * interpolation described in [ISO14496-2], subclause 7.6.2.
+ *
+ * Input Arguments:
+ *
+ * pSrc - pointer to the block in the reference plane.
+ * srcStep - distance between the start of consecutive lines in the
+ * reference plane, in bytes; must be a multiple of 8.
+ * dstStep - distance between the start of consecutive lines in the
+ * destination plane, in bytes; must be a multiple of 8.
+ * pSrcResidue - pointer to a buffer containing the 16-bit prediction
+ * residuals; must be 16-byte aligned. If the pointer is NULL, then
+ * no prediction is done, only motion compensation, i.e., the block
+ * is moved with interpolation.
+ * predictType - bilinear interpolation type, as defined in section
+ * 6.2.1.2.
+ * rndVal - rounding control parameter: 0 - disabled; 1 - enabled.
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the destination buffer; must be 8-byte aligned. If
+ * prediction residuals are added then output intensities are
+ * clipped to the range [0,255].
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned under any of the following
+ * conditions:
+ * - pDst is not 8-byte aligned.
+ * - pSrcResidue is not 16-byte aligned.
+ * - one or more of the following pointers is NULL: pSrc or pDst.
+ * - either srcStep or dstStep is not a multiple of 8.
+ * - invalid type specified for the parameter predictType.
+ * - the parameter rndVal is not equal either to 0 or 1.
+ *
+ */
+OMXResult omxVCM4P2_MCReconBlock (
+ const OMX_U8 *pSrc, /* in: block in the reference plane; must not be NULL */
+ OMX_INT srcStep, /* in: distance in bytes between consecutive reference lines; must be a multiple of 8 */
+ const OMX_S16 *pSrcResidue, /* in: 16-bit prediction residuals, 16-byte aligned; NULL means motion compensation only */
+ OMX_U8 *pDst, /* out: destination buffer; must be 8-byte aligned and not NULL */
+ OMX_INT dstStep, /* in: distance in bytes between consecutive destination lines; must be a multiple of 8 */
+ OMX_INT predictType, /* in: bilinear interpolation type */
+ OMX_INT rndVal /* in: rounding control, 0 = disabled, 1 = enabled */
+);
+
+
+
+/* 6.3.1.1 Intra 16x16 Prediction Modes */
+/* Enumerates the intra_16x16 macroblock prediction modes; this is the type of the predMode argument of omxVCM4P10_PredictIntra_16x16. */
+
+typedef enum {
+ OMX_VC_16X16_VERT = 0, /** Intra_16x16_Vertical */
+ OMX_VC_16X16_HOR = 1, /** Intra_16x16_Horizontal */
+ OMX_VC_16X16_DC = 2, /** Intra_16x16_DC */
+ OMX_VC_16X16_PLANE = 3 /** Intra_16x16_Plane */
+} OMXVCM4P10Intra16x16PredMode;
+
+
+
+/* 6.3.1.2 Intra 4x4 Prediction Modes */
+/* Enumerates the intra_4x4 macroblock prediction modes; this is the type of the predMode argument of omxVCM4P10_PredictIntra_4x4. */
+
+typedef enum {
+ OMX_VC_4X4_VERT = 0, /** Intra_4x4_Vertical */
+ OMX_VC_4X4_HOR = 1, /** Intra_4x4_Horizontal */
+ OMX_VC_4X4_DC = 2, /** Intra_4x4_DC */
+ OMX_VC_4X4_DIAG_DL = 3, /** Intra_4x4_Diagonal_Down_Left */
+ OMX_VC_4X4_DIAG_DR = 4, /** Intra_4x4_Diagonal_Down_Right */
+ OMX_VC_4X4_VR = 5, /** Intra_4x4_Vertical_Right */
+ OMX_VC_4X4_HD = 6, /** Intra_4x4_Horizontal_Down */
+ OMX_VC_4X4_VL = 7, /** Intra_4x4_Vertical_Left */
+ OMX_VC_4X4_HU = 8 /** Intra_4x4_Horizontal_Up */
+} OMXVCM4P10Intra4x4PredMode;
+
+
+
+/* 6.3.1.3 Chroma Prediction Modes */
+/* Enumerates the intra chroma prediction modes. NOTE(review): presumably the predMode type of omxVCM4P10_PredictIntraChroma_8x8 - confirm against its prototype. */
+
+typedef enum {
+ OMX_VC_CHROMA_DC = 0, /** Intra_Chroma_DC */
+ OMX_VC_CHROMA_HOR = 1, /** Intra_Chroma_Horizontal */
+ OMX_VC_CHROMA_VERT = 2, /** Intra_Chroma_Vertical */
+ OMX_VC_CHROMA_PLANE = 3 /** Intra_Chroma_Plane */
+} OMXVCM4P10IntraChromaPredMode;
+
+
+
+/* 6.3.1.4 Motion Estimation Modes */
+/* Enumerates the H.264 motion estimation modes: a fast search and a full search. */
+
+typedef enum {
+ OMX_VC_M4P10_FAST_SEARCH = 0, /** Fast motion search */
+ OMX_VC_M4P10_FULL_SEARCH = 1 /** Full motion search */
+} OMXVCM4P10MEMode;
+
+
+
+/* 6.3.1.5 Macroblock Types */
+/* Enumerates the H.264 macroblock types; this is the type of the mbType field of OMXVCM4P10MBInfo. */
+
+typedef enum {
+ OMX_VC_P_16x16 = 0, /* defined by [ISO14496-10] */
+ OMX_VC_P_16x8 = 1,
+ OMX_VC_P_8x16 = 2,
+ OMX_VC_P_8x8 = 3,
+ OMX_VC_PREF0_8x8 = 4,
+ OMX_VC_INTER_SKIP = 5,
+ OMX_VC_INTRA_4x4 = 8, /* note: values 6 and 7 are not assigned in this enumeration */
+ OMX_VC_INTRA_16x16 = 9,
+ OMX_VC_INTRA_PCM = 10
+} OMXVCM4P10MacroblockType;
+
+
+
+/* 6.3.1.6 Sub-Macroblock Types */
+/* Enumerates the H.264 sub-macroblock types; this is the element type of the subMBType[4] field of OMXVCM4P10MBInfo. */
+
+typedef enum {
+ OMX_VC_SUB_P_8x8 = 0, /* defined by [ISO14496-10] */
+ OMX_VC_SUB_P_8x4 = 1,
+ OMX_VC_SUB_P_4x8 = 2,
+ OMX_VC_SUB_P_4x4 = 3
+} OMXVCM4P10SubMacroblockType;
+
+
+
+/* 6.3.1.7 Variable Length Coding (VLC) Information */
+
+typedef struct {
+ OMX_U8 uTrailing_Ones; /* Number of trailing ones; 3 at most */
+ OMX_U8 uTrailing_One_Signs; /* Trailing ones signal */
+ OMX_U8 uNumCoeffs; /* Total number of non-zero coefficients, including trailing ones */
+ OMX_U8 uTotalZeros; /* Total number of zero coefficients */
+ OMX_S16 iLevels[16]; /* Levels of non-zero coefficients, in reverse zig-zag order */
+ OMX_U8 uRuns[16]; /* Runs for levels and trailing ones, in reverse zig-zag order */
+} OMXVCM4P10VLCInfo;
+
+
+
+/* 6.3.1.8 Macroblock Information */
+
+typedef struct {
+ OMX_S32 sliceId; /* slice number */
+ OMXVCM4P10MacroblockType mbType; /* macroblock type */
+ OMXVCM4P10SubMacroblockType subMBType[4]; /* sub-block type, one entry per sub-macroblock */
+ OMX_S32 qpy; /* quantization parameter for luma */
+ OMX_S32 qpc; /* quantization parameter for chroma */
+ OMX_U32 cbpy; /* coded block pattern (CBP), luma */
+ OMX_U32 cbpc; /* coded block pattern (CBP), chroma */
+ OMXVCMotionVector pMV0[4][4]; /* motion vectors in 1/4-pel units, indexed pMV0[blocky][blockx] (blocky = 0~3, blockx = 0~3) */
+ OMXVCMotionVector pMVPred[4][4]; /* motion vector predictions in 1/4-pel units, indexed pMVPred[blocky][blockx] (blocky = 0~3, blockx = 0~3) */
+ OMX_U8 pRefL0Idx[4]; /* reference picture indices */
+ OMXVCM4P10Intra16x16PredMode Intra16x16PredMode; /* best intra 16x16 prediction mode */
+ OMXVCM4P10Intra4x4PredMode pIntra4x4PredMode[16]; /* best intra 4x4 prediction mode for each block; NOTE(review): original reads "pMV0 indexed as above" - presumably the 16 entries follow the pMV0 block order, confirm */
+} OMXVCM4P10MBInfo, *OMXVCM4P10MBInfoPtr;
+
+
+
+/* 6.3.1.9 Motion Estimation Parameters */
+
+typedef struct {
+ OMX_S32 blockSplitEnable8x8; /* enables 16x8, 8x16, 8x8 */
+ OMX_S32 blockSplitEnable4x4; /* enable splitting of 8x4, 4x8, 4x4 blocks */
+ OMX_S32 halfSearchEnable; /* NOTE(review): presumably 1=enable, 0=disable half-pel search - confirm */
+ OMX_S32 quarterSearchEnable; /* NOTE(review): presumably 1=enable, 0=disable quarter-pel search - confirm */
+ OMX_S32 intraEnable4x4; /* 1=enable, 0=disable */
+ OMX_S32 searchRange16x16; /* integer pixel units */
+ OMX_S32 searchRange8x8; /* NOTE(review): presumably integer pixel units like searchRange16x16 - confirm */
+ OMX_S32 searchRange4x4; /* NOTE(review): presumably integer pixel units like searchRange16x16 - confirm */
+} OMXVCM4P10MEParams;
+
+
+
+/**
+ * Function: omxVCM4P10_PredictIntra_4x4 (6.3.3.1.1)
+ *
+ * Description:
+ * Perform Intra_4x4 prediction for luma samples. If the upper-right block is
+ * not available, then duplication work should be handled inside the function.
+ * Users need not define them outside.
+ *
+ * Input Arguments:
+ *
+ * pSrcLeft - Pointer to the buffer of 4 left pixels:
+ * p[x, y] (x = -1, y = 0..3)
+ * pSrcAbove - Pointer to the buffer of 8 above pixels:
+ * p[x,y] (x = 0..7, y =-1);
+ * must be aligned on a 4-byte boundary.
+ * pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1)
+ * leftStep - Step of left pixel buffer; must be a multiple of 4.
+ * dstStep - Step of the destination buffer; must be a multiple of 4.
+ * predMode - Intra_4x4 prediction mode.
+ * availability - Neighboring 4x4 block availability flag, refer to
+ * "Neighboring Macroblock Availability" .
+ *
+ * Output Arguments:
+ *
+ * pDst - Pointer to the destination buffer; must be aligned on a 4-byte
+ * boundary.
+ *
+ * Return Value:
+ * If the function runs without error, it returns OMX_Sts_NoErr.
+ * If one of the following cases occurs, the function returns
+ * OMX_Sts_BadArgErr:
+ * pDst is NULL.
+ * dstStep < 4, or dstStep is not a multiple of 4.
+ * leftStep is not a multiple of 4.
+ * predMode is not in the valid range of enumeration
+ * OMXVCM4P10Intra4x4PredMode.
+ * predMode is OMX_VC_4X4_VERT, but availability doesn't set OMX_VC_UPPER
+ * indicating p[x,-1] (x = 0..3) is not available.
+ * predMode is OMX_VC_4X4_HOR, but availability doesn't set OMX_VC_LEFT
+ * indicating p[-1,y] (y = 0..3) is not available.
+ * predMode is OMX_VC_4X4_DIAG_DL, but availability doesn't set
+ * OMX_VC_UPPER indicating p[x, -1] (x = 0..3) is not available.
+ * predMode is OMX_VC_4X4_DIAG_DR, but availability doesn't set
+ * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating
+ * p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not
+ * available.
+ * predMode is OMX_VC_4X4_VR, but availability doesn't set
+ * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating
+ * p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not
+ * available.
+ * predMode is OMX_VC_4X4_HD, but availability doesn't set
+ * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating
+ * p[x,-1] (x = 0..3), or p[-1,y] (y = 0..3) or p[-1,-1] is not
+ * available.
+ * predMode is OMX_VC_4X4_VL, but availability doesn't set OMX_VC_UPPER
+ * indicating p[x,-1] (x = 0..3) is not available.
+ * predMode is OMX_VC_4X4_HU, but availability doesn't set OMX_VC_LEFT
+ * indicating p[-1,y] (y = 0..3) is not available.
+ * availability sets OMX_VC_UPPER, but pSrcAbove is NULL.
+ * availability sets OMX_VC_LEFT, but pSrcLeft is NULL.
+ * availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL.
+ * either pSrcAbove or pDst is not aligned on a 4-byte boundary.
+ *
+ * Note:
+ * pSrcLeft, pSrcAbove, pSrcAboveLeft may be invalid pointers if
+ * they are not used by intra prediction as implied in predMode.
+ *
+ */
+OMXResult omxVCM4P10_PredictIntra_4x4 (
+ const OMX_U8 *pSrcLeft, /* in: 4 left pixels p[-1, y], y = 0..3 */
+ const OMX_U8 *pSrcAbove, /* in: 8 above pixels p[x, -1], x = 0..7; must be 4-byte aligned */
+ const OMX_U8 *pSrcAboveLeft, /* in: above-left pixel p[-1, -1] */
+ OMX_U8 *pDst, /* out: destination buffer; must be 4-byte aligned and not NULL */
+ OMX_INT leftStep, /* in: step of the left pixel buffer; must be a multiple of 4 */
+ OMX_INT dstStep, /* in: step of the destination buffer; at least 4 and a multiple of 4 */
+ OMXVCM4P10Intra4x4PredMode predMode, /* in: Intra_4x4 prediction mode */
+ OMX_S32 availability /* in: neighboring 4x4 block availability flag */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_PredictIntra_16x16 (6.3.3.1.2)
+ *
+ * Description:
+ * Perform Intra_16x16 prediction for luma samples. If the upper-right block
+ * is not available, then duplication work should be handled inside the
+ * function. Users need not define them outside.
+ *
+ * Input Arguments:
+ *
+ * pSrcLeft - Pointer to the buffer of 16 left pixels: p[x, y] (x = -1, y =
+ * 0..15)
+ * pSrcAbove - Pointer to the buffer of 16 above pixels: p[x,y] (x = 0..15,
+ * y= -1); must be aligned on a 16-byte boundary.
+ * pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1)
+ * leftStep - Step of left pixel buffer; must be a multiple of 16.
+ * dstStep - Step of the destination buffer; must be a multiple of 16.
+ * predMode - Intra_16x16 prediction mode, please refer to section 3.4.1.
+ * availability - Neighboring 16x16 MB availability flag. Refer to
+ * section 3.4.4.
+ *
+ * Output Arguments:
+ *
+ * pDst - Pointer to the destination buffer; must be aligned on a 16-byte
+ * boundary.
+ *
+ * Return Value:
+ * If the function runs without error, it returns OMX_Sts_NoErr.
+ * If one of the following cases occurs, the function returns
+ * OMX_Sts_BadArgErr:
+ * pDst is NULL.
+ * dstStep < 16, or dstStep is not a multiple of 16.
+ * leftStep is not a multiple of 16.
+ * predMode is not in the valid range of enumeration
+ * OMXVCM4P10Intra16x16PredMode
+ * predMode is OMX_VC_16X16_VERT, but availability doesn't set
+ * OMX_VC_UPPER indicating p[x,-1] (x = 0..15) is not available.
+ * predMode is OMX_VC_16X16_HOR, but availability doesn't set OMX_VC_LEFT
+ * indicating p[-1,y] (y = 0..15) is not available.
+ * predMode is OMX_VC_16X16_PLANE, but availability doesn't set
+ * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating
+ * p[x,-1](x = 0..15), or p[-1,y] (y = 0..15), or p[-1,-1] is not
+ * available.
+ * availability sets OMX_VC_UPPER, but pSrcAbove is NULL.
+ * availability sets OMX_VC_LEFT, but pSrcLeft is NULL.
+ * availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL.
+ * either pSrcAbove or pDst is not aligned on a 16-byte boundary.
+ *
+ * Note:
+ * pSrcAbove, pSrcAbove, pSrcAboveLeft may be invalid pointers if
+ * they are not used by intra prediction implied in predMode.
+ * Note:
+ * OMX_VC_UPPER_RIGHT is not used in intra_16x16 luma prediction.
+ *
+ */
+OMXResult omxVCM4P10_PredictIntra_16x16 (
+ const OMX_U8 *pSrcLeft,
+ const OMX_U8 *pSrcAbove,
+ const OMX_U8 *pSrcAboveLeft,
+ OMX_U8 *pDst,
+ OMX_INT leftStep,
+ OMX_INT dstStep,
+ OMXVCM4P10Intra16x16PredMode predMode,
+ OMX_S32 availability
+);
+
+
+
+/**
+ * Function: omxVCM4P10_PredictIntraChroma_8x8 (6.3.3.1.3)
+ *
+ * Description:
+ * Performs intra prediction for chroma samples.
+ *
+ * Input Arguments:
+ *
+ * pSrcLeft - Pointer to the buffer of 8 left pixels: p[x, y] (x = -1, y =
+ * 0..7).
+ * pSrcAbove - Pointer to the buffer of 8 above pixels: p[x,y] (x = 0..7, y
+ * = -1); must be aligned on an 8-byte boundary.
+ * pSrcAboveLeft - Pointer to the above left pixels: p[x,y] (x = -1, y = -1)
+ * leftStep - Step of left pixel buffer; must be a multiple of 8.
+ * dstStep - Step of the destination buffer; must be a multiple of 8.
+ * predMode - Intra chroma prediction mode, please refer to section 3.4.3.
+ * availability - Neighboring chroma block availability flag, please refer
+ * to "Neighboring Macroblock Availability".
+ *
+ * Output Arguments:
+ *
+ * pDst - Pointer to the destination buffer; must be aligned on an 8-byte
+ * boundary.
+ *
+ * Return Value:
+ * If the function runs without error, it returns OMX_Sts_NoErr.
+ * If any of the following cases occurs, the function returns
+ * OMX_Sts_BadArgErr:
+ * pDst is NULL.
+ * dstStep < 8 or dstStep is not a multiple of 8.
+ * leftStep is not a multiple of 8.
+ * predMode is not in the valid range of enumeration
+ * OMXVCM4P10IntraChromaPredMode.
+ * predMode is OMX_VC_CHROMA_VERT, but availability doesn't set
+ * OMX_VC_UPPER indicating p[x,-1] (x = 0..7) is not available.
+ * predMode is OMX_VC_CHROMA_HOR, but availability doesn't set OMX_VC_LEFT
+ * indicating p[-1,y] (y = 0..7) is not available.
+ * predMode is OMX_VC_CHROMA_PLANE, but availability doesn't set
+ * OMX_VC_UPPER_LEFT or OMX_VC_UPPER or OMX_VC_LEFT indicating
+ * p[x,-1] (x = 0..7), or p[-1,y] (y = 0..7), or p[-1,-1] is not
+ * available.
+ * availability sets OMX_VC_UPPER, but pSrcAbove is NULL.
+ * availability sets OMX_VC_LEFT, but pSrcLeft is NULL.
+ * availability sets OMX_VC_UPPER_LEFT, but pSrcAboveLeft is NULL.
+ * either pSrcAbove or pDst is not aligned on an 8-byte boundary.
+ *
+ * Note: pSrcLeft, pSrcAbove, pSrcAboveLeft may be invalid pointers if
+ * they are not used by intra prediction implied in predMode.
+ *
+ * Note: OMX_VC_UPPER_RIGHT is not used in intra chroma prediction.
+ *
+ */
+OMXResult omxVCM4P10_PredictIntraChroma_8x8 (
+ const OMX_U8 *pSrcLeft,
+ const OMX_U8 *pSrcAbove,
+ const OMX_U8 *pSrcAboveLeft,
+ OMX_U8 *pDst,
+ OMX_INT leftStep,
+ OMX_INT dstStep,
+ OMXVCM4P10IntraChromaPredMode predMode,
+ OMX_S32 availability
+);
+
+
+
+/**
+ * Function: omxVCM4P10_InterpolateLuma (6.3.3.2.1)
+ *
+ * Description:
+ * Performs quarter-pixel interpolation for inter luma MB. It is assumed that
+ * the frame is already padded when calling this function.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the source reference frame buffer
+ * srcStep - Reference frame step, in bytes; must be a multiple of roi.width
+ * dstStep - Destination frame step, in bytes; must be a multiple of
+ * roi.width
+ * dx - Fractional part of horizontal motion vector component in 1/4 pixel
+ * unit; valid in the range [0,3]
+ * dy - Fractional part of vertical motion vector component in 1/4 pixel
+ * unit; valid in the range [0,3]
+ * roi - Dimension of the interpolation region; the parameters roi.width and
+ * roi.height must be equal to either 4, 8, or 16.
+ *
+ * Output Arguments:
+ *
+ * pDst - Pointer to the destination frame buffer:
+ * if roi.width==4, 4-byte alignment required
+ * if roi.width==8, 8-byte alignment required
+ * if roi.width==16, 16-byte alignment required
+ *
+ * Return Value:
+ * If the function runs without error, it returns OMX_Sts_NoErr.
+ * If one of the following cases occurs, the function returns
+ * OMX_Sts_BadArgErr:
+ * pSrc or pDst is NULL.
+ * srcStep or dstStep < roi.width.
+ * dx or dy is out of range [0,3].
+ * roi.width or roi.height is out of range {4, 8, 16}.
+ * roi.width is equal to 4, but pDst is not 4-byte aligned.
+ * roi.width is equal to 8 or 16, but pDst is not 8-byte aligned.
+ * srcStep or dstStep is not a multiple of 8.
+ *
+ */
+OMXResult omxVCM4P10_InterpolateLuma (
+ const OMX_U8 *pSrc,
+ OMX_S32 srcStep,
+ OMX_U8 *pDst,
+ OMX_S32 dstStep,
+ OMX_S32 dx,
+ OMX_S32 dy,
+ OMXSize roi
+);
+
+
+
+/**
+ * Function: omxVCM4P10_InterpolateChroma (6.3.3.2.2)
+ *
+ * Description:
+ * Performs 1/8-pixel interpolation for inter chroma MB.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the source reference frame buffer
+ * srcStep - Reference frame step in bytes
+ * dstStep - Destination frame step in bytes; must be a multiple of
+ * roi.width.
+ * dx - Fractional part of horizontal motion vector component in 1/8 pixel
+ * unit; valid in the range [0,7]
+ * dy - Fractional part of vertical motion vector component in 1/8 pixel
+ * unit; valid in the range [0,7]
+ * roi - Dimension of the interpolation region; the parameters roi.width and
+ * roi.height must be equal to either 2, 4, or 8.
+ *
+ * Output Arguments:
+ *
+ * pDst - Pointer to the destination frame buffer:
+ * if roi.width==2, 2-byte alignment required
+ * if roi.width==4, 4-byte alignment required
+ * if roi.width==8, 8-byte alignment required
+ *
+ * Return Value:
+ * If the function runs without error, it returns OMX_Sts_NoErr.
+ * If one of the following cases occurs, the function returns
+ * OMX_Sts_BadArgErr:
+ * pSrc or pDst is NULL.
+ * srcStep or dstStep < 8.
+ * dx or dy is out of range [0,7].
+ * roi.width or roi.height is out of range {2,4,8}.
+ * roi.width is equal to 2, but pDst is not 2-byte aligned.
+ * roi.width is equal to 4, but pDst is not 4-byte aligned.
+ * roi.width is equal to 8, but pDst is not 8-byte aligned.
+ * srcStep or dstStep is not a multiple of 8.
+ *
+ */
+OMXResult omxVCM4P10_InterpolateChroma (
+ const OMX_U8 *pSrc,
+ OMX_S32 srcStep,
+ OMX_U8 *pDst,
+ OMX_S32 dstStep,
+ OMX_S32 dx,
+ OMX_S32 dy,
+ OMXSize roi
+);
+
+
+
+/**
+ * Function: omxVCM4P10_FilterDeblockingLuma_VerEdge_I (6.3.3.3.1)
+ *
+ * Description:
+ * Performs in-place deblock filtering on four vertical edges of the luma
+ * macroblock (16x16).
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - Pointer to the input macroblock; must be 16-byte aligned.
+ * srcdstStep - Step of the arrays; must be a multiple of 16.
+ * pAlpha - Array of size 2 of alpha thresholds (the first item is the alpha
+ * threshold for the external vertical edge, and the second item is
+ * for the internal vertical edge); per [ISO14496-10] alpha values
+ * must be in the range [0,255].
+ * pBeta - Array of size 2 of beta thresholds (the first item is the beta
+ * threshold for the external vertical edge, and the second item is
+ * for the internal vertical edge); per [ISO14496-10] beta values
+ * must be in the range [0,18].
+ * pThresholds - Array of size 16 of Thresholds (TC0) (values for the left
+ * edge of each 4x4 block, arranged in vertical block order); must
+ * be aligned on a 4-byte boundary. Per [ISO14496-10] values must
+ * be in the range [0,25].
+ * pBS - Array of size 16 of BS parameters (arranged in vertical block
+ * order); valid in the range [0,4] with the following
+ * restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii)
+ * pBS[i]== 4 if and only if pBS[i^3]== 4. Must be 4-byte aligned.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - Pointer to filtered output macroblock.
+ *
+ * Return Value:
+ * If the function runs without error, it returns OMX_Sts_NoErr.
+ * If one of the following cases occurs, the function returns
+ * OMX_Sts_BadArgErr:
+ * Any of the pointers pSrcDst, pAlpha, pBeta, pThresholds, or pBS
+ * is NULL.
+ * Either pThresholds or pBS is not aligned on a 4-byte boundary.
+ * pSrcDst is not 16-byte aligned.
+ * srcdstStep is not a multiple of 16.
+ * pAlpha[0] and/or pAlpha[1] is outside the range [0,255].
+ * pBeta[0] and/or pBeta[1] is outside the range [0,18].
+ * One or more entries in the table pThresholds[0..15] is outside of the
+ * range [0,25].
+ * pBS is out of range, i.e., one of the following conditions is true:
+ * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 &&
+ * pBS[i^3]!=4) for 0<=i<=3.
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingLuma_VerEdge_I (
+ OMX_U8 *pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8 *pAlpha,
+ const OMX_U8 *pBeta,
+ const OMX_U8 *pThresholds,
+ const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function: omxVCM4P10_FilterDeblockingLuma_HorEdge_I (6.3.3.3.2)
+ *
+ * Description:
+ * Performs in-place deblock filtering on four horizontal edges of the luma
+ * macroblock (16x16).
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input macroblock; must be 16-byte aligned.
+ * srcdstStep - step of the arrays; must be a multiple of 16.
+ * pAlpha - array of size 2 of alpha thresholds (the first item is the alpha
+ * threshold for the external horizontal edge, and the second item
+ * is for the internal horizontal edge); per [ISO14496-10] alpha
+ * values must be in the range [0,255].
+ * pBeta - array of size 2 of beta thresholds (the first item is the beta
+ * threshold for the external horizontal edge, and the second item
+ * is for the internal horizontal edge). Per [ISO14496-10] beta
+ * values must be in the range [0,18].
+ * pThresholds - array of size 16 containing thresholds, TC0, for the top
+ * horizontal edge of each 4x4 block, arranged in horizontal block
+ * order; must be aligned on a 4-byte boundary. Per [ISO14496-10]
+ * values must be in the range [0,25].
+ * pBS - array of size 16 of BS parameters (arranged in horizontal block
+ * order); valid in the range [0,4] with the following
+ * restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii)
+ * pBS[i]== 4 if and only if pBS[i^3]== 4. Must be 4-byte aligned.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - Pointer to filtered output macroblock.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr, if the function runs without error.
+ *
+ * OMX_Sts_BadArgErr, if one of the following cases occurs:
+ * - one or more of the following pointers is NULL: pSrcDst, pAlpha,
+ * pBeta, pThresholds, or pBS.
+ * - either pThresholds or pBS is not aligned on a 4-byte boundary.
+ * - pSrcDst is not 16-byte aligned.
+ * - srcdstStep is not a multiple of 16.
+ * - pAlpha[0] and/or pAlpha[1] is outside the range [0,255].
+ * - pBeta[0] and/or pBeta[1] is outside the range [0,18].
+ * - One or more entries in the table pThresholds[0..15] is
+ * outside of the range [0,25].
+ * - pBS is out of range, i.e., one of the following conditions is true:
+ * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or
+ * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3.
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingLuma_HorEdge_I (
+ OMX_U8 *pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8 *pAlpha,
+ const OMX_U8 *pBeta,
+ const OMX_U8 *pThresholds,
+ const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function: omxVCM4P10_FilterDeblockingChroma_VerEdge_I (6.3.3.3.3)
+ *
+ * Description:
+ * Performs in-place deblock filtering on four vertical edges of the chroma
+ * macroblock (8x8).
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - Pointer to the input macroblock; must be 8-byte aligned.
+ * srcdstStep - Step of the arrays; must be a multiple of 8.
+ * pAlpha - Array of size 2 of alpha thresholds (the first item is the alpha
+ * threshold for the external vertical edge, and the second item is
+ * for the internal vertical edge); per [ISO14496-10] alpha values
+ * must be in the range [0,255].
+ * pBeta - Array of size 2 of beta thresholds (the first item is the beta
+ * threshold for the external vertical edge, and the second item is
+ * for the internal vertical edge); per [ISO14496-10] beta values
+ * must be in the range [0,18].
+ * pThresholds - Array of size 8 containing thresholds, TC0, for the left
+ * vertical edge of each 4x2 chroma block, arranged in vertical
+ * block order; must be aligned on a 4-byte boundary. Per
+ * [ISO14496-10] values must be in the range [0,25].
+ * pBS - Array of size 16 of BS parameters (values for each 2x2 chroma
+ * block, arranged in vertical block order). This parameter is the
+ * same as the pBS parameter passed into
+ * omxVCM4P10_FilterDeblockingLuma_VerEdge_I; valid in the range
+ * [0,4] with the following restrictions: i) pBS[i]== 4 may occur
+ * only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== 4.
+ * Must be 4-byte aligned.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - Pointer to filtered output macroblock.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ *
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - one or more of the following pointers is NULL: pSrcDst, pAlpha,
+ * pBeta, pThresholds, or pBS.
+ * - pSrcDst is not 8-byte aligned.
+ * - srcdstStep is not a multiple of 8.
+ * - pThresholds is not 4-byte aligned.
+ * - pAlpha[0] and/or pAlpha[1] is outside the range [0,255].
+ * - pBeta[0] and/or pBeta[1] is outside the range [0,18].
+ * - One or more entries in the table pThresholds[0..7] is outside
+ * of the range [0,25].
+ * - pBS is out of range, i.e., one of the following conditions is true:
+ * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or
+ * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3.
+ * - pBS is not 4-byte aligned.
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingChroma_VerEdge_I (
+ OMX_U8 *pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8 *pAlpha,
+ const OMX_U8 *pBeta,
+ const OMX_U8 *pThresholds,
+ const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function: omxVCM4P10_FilterDeblockingChroma_HorEdge_I (6.3.3.3.4)
+ *
+ * Description:
+ * Performs in-place deblock filtering on the horizontal edges of the chroma
+ * macroblock (8x8).
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input macroblock; must be 8-byte aligned.
+ * srcdstStep - array step; must be a multiple of 8.
+ * pAlpha - array of size 2 containing alpha thresholds; the first element
+ * contains the threshold for the external horizontal edge, and the
+ * second element contains the threshold for the internal
+ * horizontal edge. Per [ISO14496-10] alpha values must be in the
+ * range [0,255].
+ * pBeta - array of size 2 containing beta thresholds; the first element
+ * contains the threshold for the external horizontal edge, and the
+ * second element contains the threshold for the internal
+ * horizontal edge. Per [ISO14496-10] beta values must be in the
+ * range [0,18].
+ * pThresholds - array of size 8 containing thresholds, TC0, for the top
+ * horizontal edge of each 2x4 chroma block, arranged in horizontal
+ * block order; must be aligned on a 4-byte boundary. Per
+ * [ISO14496-10] values must be in the range [0,25].
+ * pBS - array of size 16 containing BS parameters for each 2x2 chroma
+ * block, arranged in horizontal block order; valid in the range
+ * [0,4] with the following restrictions: i) pBS[i]== 4 may occur
+ * only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]== 4.
+ * Must be 4-byte aligned.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - Pointer to filtered output macroblock.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr, if the function runs without error.
+ *
+ * OMX_Sts_BadArgErr, if one of the following cases occurs:
+ * - any of the following pointers is NULL:
+ * pSrcDst, pAlpha, pBeta, pThresholds, or pBS.
+ * - pSrcDst is not 8-byte aligned.
+ * - srcdstStep is not a multiple of 8.
+ * - pThresholds is not 4-byte aligned.
+ * - pAlpha[0] and/or pAlpha[1] is outside the range [0,255].
+ * - pBeta[0] and/or pBeta[1] is outside the range [0,18].
+ * - One or more entries in the table pThresholds[0..7] is outside
+ * of the range [0,25].
+ * - pBS is out of range, i.e., one of the following conditions is true:
+ * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or
+ * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3.
+ * - pBS is not 4-byte aligned.
+ *
+ */
+OMXResult omxVCM4P10_FilterDeblockingChroma_HorEdge_I (
+ OMX_U8 *pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8 *pAlpha,
+ const OMX_U8 *pBeta,
+ const OMX_U8 *pThresholds,
+ const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function: omxVCM4P10_DeblockLuma_I (6.3.3.3.5)
+ *
+ * Description:
+ * This function performs in-place deblock filtering on the horizontal and
+ * vertical edges of a luma macroblock (16x16).
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input macroblock; must be 16-byte aligned.
+ * srcdstStep - image width; must be a multiple of 16.
+ * pAlpha - pointer to a 2x2 table of alpha thresholds, organized as
+ * follows: {external vertical edge, internal vertical edge,
+ * external horizontal edge, internal horizontal edge }. Per
+ * [ISO14496-10] alpha values must be in the range [0,255].
+ * pBeta - pointer to a 2x2 table of beta thresholds, organized as follows:
+ * {external vertical edge, internal vertical edge, external
+ * horizontal edge, internal horizontal edge }. Per [ISO14496-10]
+ * beta values must be in the range [0,18].
+ * pThresholds - pointer to a 16x2 table of thresholds (TC0), organized as
+ * follows: {values for the left or above edge of each 4x4 block,
+ * arranged in vertical block order and then in horizontal block
+ * order}; must be aligned on a 4-byte boundary. Per [ISO14496-10]
+ * values must be in the range [0,25].
+ * pBS - pointer to a 16x2 table of BS parameters arranged in scan block
+ * order for vertical edges and then horizontal edges; valid in the
+ * range [0,4] with the following restrictions: i) pBS[i]== 4 may
+ * occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]==
+ * 4. Must be 4-byte aligned.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - pointer to filtered output macroblock.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - one or more of the following pointers is NULL: pSrcDst, pAlpha,
+ * pBeta, pThresholds or pBS.
+ * - pSrcDst is not 16-byte aligned.
+ * - either pThresholds or pBS is not aligned on a 4-byte boundary.
+ * - one or more entries in the table pAlpha[0..3] is outside the range
+ * [0,255].
+ * - one or more entries in the table pBeta[0..3] is outside the range
+ * [0,18].
+ * - one or more entries in the table pThresholds[0..31] is outside of
+ * the range [0,25].
+ * - pBS is out of range, i.e., one of the following conditions is true:
+ * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or
+ * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3.
+ * - srcdstStep is not a multiple of 16.
+ *
+ */
+OMXResult omxVCM4P10_DeblockLuma_I (
+ OMX_U8 *pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8 *pAlpha,
+ const OMX_U8 *pBeta,
+ const OMX_U8 *pThresholds,
+ const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function: omxVCM4P10_DeblockChroma_I (6.3.3.3.6)
+ *
+ * Description:
+ * Performs in-place deblocking filtering on all edges of the chroma
+ * macroblock (8x8).
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - pointer to the input macroblock; must be 8-byte aligned.
+ * srcdstStep - step of the arrays; must be a multiple of 8.
+ * pAlpha - pointer to a 2x2 array of alpha thresholds, organized as
+ * follows: {external vertical edge, internal vertical edge,
+ * external horizontal edge, internal horizontal edge }. Per
+ * [ISO14496-10] alpha values must be in the range [0,255].
+ * pBeta - pointer to a 2x2 array of beta thresholds, organized as follows:
+ * { external vertical edge, internal vertical edge, external
+ * horizontal edge, internal horizontal edge }. Per [ISO14496-10]
+ * beta values must be in the range [0,18].
+ * pThresholds - array of size 8x2 of Thresholds (TC0) (values for the left
+ * or above edge of each 4x2 or 2x4 block, arranged in vertical
+ * block order and then in horizontal block order); must be aligned
+ * on a 4-byte boundary. Per [ISO14496-10] values must be in the
+ * range [0,25].
+ * pBS - array of size 16x2 of BS parameters (arranged in scan block order
+ * for vertical edges and then horizontal edges); valid in the
+ * range [0,4] with the following restrictions: i) pBS[i]== 4 may
+ * occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^3]==
+ * 4. Must be 4-byte aligned.
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - pointer to filtered output macroblock.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - one or more of the following pointers is NULL: pSrcDst, pAlpha,
+ * pBeta, pThresholds, or pBS.
+ * - pSrcDst is not 8-byte aligned.
+ * - either pThresholds or pBS is not 4-byte aligned.
+ * - one or more entries in the table pAlpha[0..3] is outside the range
+ * [0,255].
+ * - one or more entries in the table pBeta[0..3] is outside the range
+ * [0,18].
+ * - one or more entries in the table pThresholds[0..15] is outside of
+ * the range [0,25].
+ * - pBS is out of range, i.e., one of the following conditions is true:
+ * pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or
+ * (pBS[i]==4 && pBS[i^3]!=4) for 0<=i<=3.
+ * - srcdstStep is not a multiple of 8.
+ *
+ */
+OMXResult omxVCM4P10_DeblockChroma_I (
+ OMX_U8 *pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8 *pAlpha,
+ const OMX_U8 *pBeta,
+ const OMX_U8 *pThresholds,
+ const OMX_U8 *pBS
+);
+
+
+
+/**
+ * Function: omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC (6.3.4.1.1)
+ *
+ * Description:
+ * Performs CAVLC decoding and inverse raster scan for a 2x2 block of
+ * ChromaDCLevel. The decoded coefficients in the packed position-coefficient
+ * buffer are stored in reverse zig-zag order, i.e., the first buffer element
+ * contains the last non-zero position-coefficient pair of the block. Within
+ * each position-coefficient pair, the position entry indicates the
+ * raster-scan position of the coefficient, while the coefficient entry
+ * contains the coefficient value.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - Double pointer to current byte in bit stream buffer
+ * pOffset - Pointer to current bit position in the byte pointed to by
+ * *ppBitStream; valid in the range [0,7].
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after each block is decoded
+ * pOffset - *pOffset is updated after each block is decoded
+ * pNumCoeff - Pointer to the number of nonzero coefficients in this block
+ * ppPosCoefBuf - Double pointer to destination residual
+ * coefficient-position pair buffer. Buffer position
+ * (*ppPosCoefBuf) is updated upon return, unless there are only
+ * zero coefficients in the currently decoded block. In this case
+ * the caller is expected to bypass the transform/dequantization of
+ * the empty blocks.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr, if the function runs without error.
+ *
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - ppBitStream or pOffset is NULL.
+ * - ppPosCoefBuf or pNumCoeff is NULL.
+ * OMX_Sts_Err - if one of the following is true:
+ * - an illegal code is encountered in the bitstream
+ *
+ */
+OMXResult omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC (
+ const OMX_U8 **ppBitStream,
+ OMX_S32*pOffset,
+ OMX_U8 *pNumCoeff,
+ OMX_U8 **ppPosCoefbuf
+);
+
+
+
+/**
+ * Function: omxVCM4P10_DecodeCoeffsToPairCAVLC (6.3.4.1.2)
+ *
+ * Description:
+ * Performs CAVLC decoding and inverse zigzag scan for 4x4 block of
+ * Intra16x16DCLevel, Intra16x16ACLevel, LumaLevel, and ChromaACLevel. Inverse
+ * field scan is not supported. The decoded coefficients in the packed
+ * position-coefficient buffer are stored in reverse zig-zag order, i.e., the
+ * first buffer element contains the last non-zero position-coefficient pair
+ * of the block. Within each position-coefficient pair, the position entry
+ * indicates the raster-scan position of the coefficient, while the
+ * coefficient entry contains the coefficient value.
+ *
+ * Input Arguments:
+ *
+ * ppBitStream - Double pointer to current byte in bit stream buffer
+ * pOffset - Pointer to current bit position in the byte pointed to by
+ * *ppBitStream; valid in the range [0,7].
+ * sMaxNumCoeff - Maximum number of non-zero coefficients in current
+ * block
+ * sVLCSelect - VLC table selector, obtained from the number of non-zero
+ * coefficients contained in the above and left 4x4 blocks. It is
+ * equivalent to the variable nC described in H.264 standard Table
+ * 9-5, except its value cannot be less than zero.
+ *
+ * Output Arguments:
+ *
+ * ppBitStream - *ppBitStream is updated after each block is decoded.
+ * pOffset - *pOffset is updated after each block is decoded
+ * pNumCoeff - Pointer to the number of nonzero coefficients in this block
+ * ppPosCoefBuf - Double pointer to destination residual
+ * coefficient-position pair buffer. Buffer position
+ * (*ppPosCoefBuf) is updated upon return, unless there are only
+ * zero coefficients in the currently decoded block. In this case
+ * the caller is expected to bypass the transform/dequantization of
+ * the empty blocks.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ *
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - ppBitStream or pOffset is NULL.
+ * - ppPosCoefBuf or pNumCoeff is NULL.
+ * - sMaxNumCoeff is not equal to either 15 or 16.
+ * - sVLCSelect is less than 0.
+ *
+ * OMX_Sts_Err - if one of the following is true:
+ * - an illegal code is encountered in the bitstream
+ *
+ */
+OMXResult omxVCM4P10_DecodeCoeffsToPairCAVLC (
+ const OMX_U8 **ppBitStream,
+ OMX_S32 *pOffset,
+ OMX_U8 *pNumCoeff,
+ OMX_U8 **ppPosCoefbuf,
+ OMX_INT sVLCSelect,
+ OMX_INT sMaxNumCoeff
+);
+
+
+
+/**
+ * Function: omxVCM4P10_TransformDequantLumaDCFromPair (6.3.4.2.1)
+ *
+ * Description:
+ * Reconstructs the 4x4 LumaDC block from the coefficient-position pair
+ * buffer, performs integer inverse transformation and dequantization for 4x4
+ * LumaDC coefficients, and updates the pair buffer pointer to the next
+ * non-empty block.
+ *
+ * Input Arguments:
+ *
+ * ppSrc - Double pointer to residual coefficient-position pair buffer
+ * output by CAVLC decoding
+ * QP - Quantization parameter QpY
+ *
+ * Output Arguments:
+ *
+ * ppSrc - *ppSrc is updated to the start of next non-empty block
+ * pDst - Pointer to the reconstructed 4x4 LumaDC coefficients buffer; must
+ * be aligned on an 8-byte boundary.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - ppSrc or pDst is NULL.
+ * - pDst is not 8-byte aligned.
+ * - QP is not in the range of [0-51].
+ *
+ */
+OMXResult omxVCM4P10_TransformDequantLumaDCFromPair (
+ const OMX_U8 **ppSrc,
+ OMX_S16 *pDst,
+ OMX_INT QP
+);
+
+
+
+/**
+ * Function: omxVCM4P10_TransformDequantChromaDCFromPair (6.3.4.2.2)
+ *
+ * Description:
+ * Reconstructs the 2x2 ChromaDC block from the coefficient-position pair
+ * buffer, performs integer inverse transformation and dequantization for 2x2
+ * chroma DC coefficients, and updates the pair buffer pointer to the next
+ * non-empty block.
+ *
+ * Input Arguments:
+ *
+ * ppSrc - Double pointer to residual coefficient-position pair buffer
+ * output by CAVLC decoding
+ * QP - Quantization parameter QpC
+ *
+ * Output Arguments:
+ *
+ * ppSrc - *ppSrc is updated to the start of next non-empty block
+ * pDst - Pointer to the reconstructed 2x2 ChromaDC coefficients buffer;
+ * must be aligned on a 4-byte boundary.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - ppSrc or pDst is NULL.
+ * - pDst is not 4-byte aligned.
+ * - QP is not in the range of [0-51].
+ *
+ */
+OMXResult omxVCM4P10_TransformDequantChromaDCFromPair (
+ const OMX_U8 **ppSrc,
+ OMX_S16 *pDst,
+ OMX_INT QP
+);
+
+
+
+/**
+ * Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd (6.3.4.2.3)
+ *
+ * Description:
+ * Reconstruct the 4x4 residual block from coefficient-position pair buffer,
+ * perform dequantization and integer inverse transformation for 4x4 block of
+ * residuals with previous intra prediction or motion compensation data, and
+ * update the pair buffer pointer to next non-empty block. If pDC == NULL,
+ * there are 16 non-zero AC coefficients at most in the packed buffer starting
+ * from 4x4 block position 0; If pDC != NULL, there are 15 non-zero AC
+ * coefficients at most in the packed buffer starting from 4x4 block position
+ * 1.
+ *
+ * Input Arguments:
+ *
+ * ppSrc - Double pointer to residual coefficient-position pair buffer
+ * output by CAVLC decoding
+ * pPred - Pointer to the predicted 4x4 block; must be aligned on a 4-byte
+ * boundary
+ * predStep - Predicted frame step size in bytes; must be a multiple of 4
+ * dstStep - Destination frame step in bytes; must be a multiple of 4
+ * pDC - Pointer to the DC coefficient of this block, NULL if it doesn't
+ * exist
+ * QP - Quantization parameter. It should be QpC in chroma 4x4 block
+ * decoding, otherwise it should be QpY.
+ * AC - Flag indicating if at least one non-zero AC coefficient exists
+ *
+ * Output Arguments:
+ *
+ * pDst - pointer to the reconstructed 4x4 block data; must be aligned on a
+ * 4-byte boundary
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - pPred or pDst is NULL.
+ * - pPred or pDst is not 4-byte aligned.
+ * - predStep or dstStep is not a multiple of 4.
+ * - AC !=0 and QP is not in the range of [0-51] or ppSrc == NULL.
+ * - AC ==0 && pDC ==NULL.
+ *
+ */
+OMXResult omxVCM4P10_DequantTransformResidualFromPairAndAdd (
+ const OMX_U8 **ppSrc,
+ const OMX_U8 *pPred,
+ const OMX_S16 *pDC,
+ OMX_U8 *pDst,
+ OMX_INT predStep,
+ OMX_INT dstStep,
+ OMX_INT QP,
+ OMX_INT AC
+);
+
+
+
+/**
+ * Function: omxVCM4P10_MEGetBufSize (6.3.5.1.1)
+ *
+ * Description:
+ * Computes the size, in bytes, of the vendor-specific specification
+ * structure for the omxVCM4P10 motion estimation functions BlockMatch_Integer
+ * and MotionEstimationMB.
+ *
+ * Input Arguments:
+ *
+ * MEmode - motion estimation mode; available modes are defined by the
+ * enumerated type OMXVCM4P10MEMode
+ * pMEParams - motion estimation parameters
+ *
+ * Output Arguments:
+ *
+ * pSize - pointer to the number of bytes required for the motion
+ * estimation specification structure
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - pMEParams or pSize is NULL.
+ * - an invalid value was specified for the parameter MEmode.
+ *
+ */
+OMXResult omxVCM4P10_MEGetBufSize (
+ OMXVCM4P10MEMode MEmode,
+ const OMXVCM4P10MEParams *pMEParams,
+ OMX_U32 *pSize
+);
+
+
+
+/**
+ * Function: omxVCM4P10_MEInit (6.3.5.1.2)
+ *
+ * Description:
+ * Initializes the vendor-specific specification structure required for the
+ * omxVCM4P10 motion estimation functions: BlockMatch_Integer and
+ * MotionEstimationMB. Memory for the specification structure *pMESpec must be
+ * allocated prior to calling the function, and should be aligned on a 4-byte
+ * boundary. The number of bytes required for the specification structure can
+ * be determined using the function omxVCM4P10_MEGetBufSize. Following
+ * initialization by this function, the vendor-specific structure *pMESpec
+ * should contain an implementation-specific representation of all motion
+ * estimation parameters received via the structure pMEParams, for example
+ * searchRange16x16, searchRange8x8, etc.
+ *
+ * Input Arguments:
+ *
+ * MEmode - motion estimation mode; available modes are defined by the
+ * enumerated type OMXVCM4P10MEMode
+ * pMEParams - motion estimation parameters
+ * pMESpec - pointer to the uninitialized ME specification structure
+ *
+ * Output Arguments:
+ *
+ * pMESpec - pointer to the initialized ME specification structure
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - pMEParams or pMESpec is NULL.
+ * - an invalid value was specified for the parameter MEmode
+ * - a negative or zero value was specified for one of the search ranges
+ * (e.g., pMEParams->searchRange8x8, pMEParams->searchRange16x16, etc.)
+ * - either in isolation or in combination, one or more of the enables or
+ * search ranges in the structure *pMEParams were configured such
+ * that the requested behavior fails to comply with [ISO14496-10].
+ *
+ */
+OMXResult omxVCM4P10_MEInit (
+ OMXVCM4P10MEMode MEmode, /* in: motion estimation mode */
+ const OMXVCM4P10MEParams *pMEParams, /* in: motion estimation parameters */
+ void *pMESpec /* in/out: caller-allocated ME specification structure; initialized on return */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_BlockMatch_Integer (6.3.5.2.1)
+ *
+ * Description:
+ * Performs integer block match. Returns best MV and associated cost.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrgY - Pointer to the top-left corner of the current block:
+ * If iBlockWidth==4, 4-byte alignment required.
+ * If iBlockWidth==8, 8-byte alignment required.
+ * If iBlockWidth==16, 16-byte alignment required.
+ * pSrcRefY - Pointer to the top-left corner of the co-located block in the
+ * reference picture:
+ * If iBlockWidth==4, 4-byte alignment required.
+ * If iBlockWidth==8, 8-byte alignment required.
+ * If iBlockWidth==16, 16-byte alignment required.
+ * nSrcOrgStep - Stride of the original picture plane, expressed in terms
+ * of integer pixels; must be a multiple of iBlockWidth.
+ * nSrcRefStep - Stride of the reference picture plane, expressed in terms
+ * of integer pixels
+ * pRefRect - pointer to the valid reference rectangle inside the reference
+ * picture plane
+ * pCurrPointPos - pointer to the position of the current block in the current plane
+ * iBlockWidth - Width of the current block, expressed in terms of integer
+ * pixels; must be equal to either 4, 8, or 16.
+ * iBlockHeight - Height of the current block, expressed in terms of
+ * integer pixels; must be equal to either 4, 8, or 16.
+ * nLamda - Lamda factor; used to compute motion cost
+ * pMVPred - Predicted MV; used to compute motion cost, expressed in terms
+ * of 1/4-pel units
+ * pMVCandidate - Candidate MV; used to initialize the motion search,
+ * expressed in terms of integer pixels
+ * pMESpec - pointer to the ME specification structure
+ *
+ * Output Arguments:
+ *
+ * pBestMV - Best MV resulting from integer search, expressed in terms
+ * of 1/4-pel units
+ * pBestCost - Motion cost associated with the best MV; computed as
+ * SAD+Lamda*BitsUsedByMV
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - any of the following pointers are NULL:
+ * pSrcOrgY, pSrcRefY, pRefRect, pMVPred, pMVCandidate, or pMESpec.
+ * - Either iBlockWidth or iBlockHeight are values other than 4, 8, or 16.
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_BlockMatch_Integer (
+ const OMX_U8 *pSrcOrgY, /* in: top-left corner of the current block */
+ OMX_S32 nSrcOrgStep, /* in: stride of the original picture plane; multiple of iBlockWidth */
+ const OMX_U8 *pSrcRefY, /* in: top-left corner of the co-located block in the reference picture */
+ OMX_S32 nSrcRefStep, /* in: stride of the reference picture plane */
+ const OMXRect *pRefRect, /* in: valid reference rectangle inside the reference picture plane */
+ const OMXVCM4P2Coordinate *pCurrPointPos, /* in: position of the current block in the current plane */
+ OMX_U8 iBlockWidth, /* in: block width in pixels; 4, 8, or 16 */
+ OMX_U8 iBlockHeight, /* in: block height in pixels; 4, 8, or 16 */
+ OMX_U32 nLamda, /* in: Lamda factor used to compute motion cost */
+ const OMXVCMotionVector *pMVPred, /* in: predicted MV, 1/4-pel units */
+ const OMXVCMotionVector *pMVCandidate, /* in: candidate MV that seeds the search, integer pixels */
+ OMXVCMotionVector *pBestMV, /* out: best MV from the integer search, 1/4-pel units */
+ OMX_S32 *pBestCost, /* out: motion cost of the best MV (SAD+Lamda*BitsUsedByMV) */
+ void *pMESpec /* in: ME specification structure */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_BlockMatch_Half (6.3.5.2.2)
+ *
+ * Description:
+ * Performs a half-pel block match using results from a prior integer search.
+ * Returns the best MV and associated cost. This function estimates the
+ * half-pixel motion vector by interpolating the integer resolution motion
+ * vector referenced by the input parameter pSrcDstBestMV, i.e., the initial
+ * integer MV is generated externally. The function
+ * omxVCM4P10_BlockMatch_Integer may be used for integer motion estimation.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrgY - Pointer to the current position in original picture plane:
+ * If iBlockWidth==4, 4-byte alignment required.
+ * If iBlockWidth==8, 8-byte alignment required.
+ * If iBlockWidth==16, 16-byte alignment required.
+ * pSrcRefY - Pointer to the top-left corner of the co-located block in the
+ * reference picture:
+ * If iBlockWidth==4, 4-byte alignment required.
+ * If iBlockWidth==8, 8-byte alignment required.
+ * If iBlockWidth==16, 16-byte alignment required.
+ * nSrcOrgStep - Stride of the original picture plane in terms of full
+ * pixels; must be a multiple of iBlockWidth.
+ * nSrcRefStep - Stride of the reference picture plane in terms of full
+ * pixels
+ * iBlockWidth - Width of the current block in terms of full pixels; must
+ * be equal to either 4, 8, or 16.
+ * iBlockHeight - Height of the current block in terms of full pixels; must
+ * be equal to either 4, 8, or 16.
+ * nLamda - Lamda factor, used to compute motion cost
+ * pMVPred - Predicted MV, represented in terms of 1/4-pel units; used to
+ * compute motion cost
+ * pSrcDstBestMV - The best MV resulting from a prior integer search,
+ * represented in terms of 1/4-pel units
+ *
+ * Output Arguments:
+ *
+ * pSrcDstBestMV - Best MV resulting from the half-pel search, expressed in
+ * terms of 1/4-pel units
+ * pBestCost - Motion cost associated with the best MV; computed as
+ * SAD+Lamda*BitsUsedByMV
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - any of the following pointers is NULL: pSrcOrgY, pSrcRefY,
+ * pSrcDstBestMV, pMVPred, pBestCost
+ * - iBlockWidth or iBlockHeight are equal to values other than 4, 8, or 16.
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_BlockMatch_Half (
+ const OMX_U8 *pSrcOrgY, /* in: current position in the original picture plane */
+ OMX_S32 nSrcOrgStep, /* in: stride of the original picture plane; multiple of iBlockWidth */
+ const OMX_U8 *pSrcRefY, /* in: top-left corner of the co-located block in the reference picture */
+ OMX_S32 nSrcRefStep, /* in: stride of the reference picture plane */
+ OMX_U8 iBlockWidth, /* in: block width in pixels; 4, 8, or 16 */
+ OMX_U8 iBlockHeight, /* in: block height in pixels; 4, 8, or 16 */
+ OMX_U32 nLamda, /* in: Lamda factor used to compute motion cost */
+ const OMXVCMotionVector *pMVPred, /* in: predicted MV, 1/4-pel units */
+ OMXVCMotionVector *pSrcDstBestMV, /* in/out: integer-search MV on entry, half-pel best MV on return; 1/4-pel units */
+ OMX_S32 *pBestCost /* out: motion cost of the best MV (SAD+Lamda*BitsUsedByMV) */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_BlockMatch_Quarter (6.3.5.2.3)
+ *
+ * Description:
+ * Performs a quarter-pel block match using results from a prior half-pel
+ * search. Returns the best MV and associated cost. This function estimates
+ * the quarter-pixel motion vector by interpolating the half-pel resolution
+ * motion vector referenced by the input parameter pSrcDstBestMV, i.e., the
+ * initial half-pel MV is generated externally. The function
+ * omxVCM4P10_BlockMatch_Half may be used for half-pel motion estimation.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrgY - Pointer to the current position in original picture plane:
+ * If iBlockWidth==4, 4-byte alignment required.
+ * If iBlockWidth==8, 8-byte alignment required.
+ * If iBlockWidth==16, 16-byte alignment required.
+ * pSrcRefY - Pointer to the top-left corner of the co-located block in the
+ * reference picture:
+ * If iBlockWidth==4, 4-byte alignment required.
+ * If iBlockWidth==8, 8-byte alignment required.
+ * If iBlockWidth==16, 16-byte alignment required.
+ * nSrcOrgStep - Stride of the original picture plane in terms of full
+ * pixels; must be a multiple of iBlockWidth.
+ * nSrcRefStep - Stride of the reference picture plane in terms of full
+ * pixels
+ * iBlockWidth - Width of the current block in terms of full pixels; must
+ * be equal to either 4, 8, or 16.
+ * iBlockHeight - Height of the current block in terms of full pixels; must
+ * be equal to either 4, 8, or 16.
+ * nLamda - Lamda factor, used to compute motion cost
+ * pMVPred - Predicted MV, represented in terms of 1/4-pel units; used to
+ * compute motion cost
+ * pSrcDstBestMV - The best MV resulting from a prior half-pel search,
+ * represented in terms of 1/4 pel units
+ *
+ * Output Arguments:
+ *
+ * pSrcDstBestMV - Best MV resulting from the quarter-pel search, expressed
+ * in terms of 1/4-pel units
+ * pBestCost - Motion cost associated with the best MV; computed as
+ * SAD+Lamda*BitsUsedByMV
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - One or more of the following pointers is NULL:
+ * pSrcOrgY, pSrcRefY, pSrcDstBestMV, pMVPred, pBestCost
+ * - iBlockWidth or iBlockHeight are equal to values other than 4, 8, or 16.
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_BlockMatch_Quarter (
+ const OMX_U8 *pSrcOrgY, /* in: current position in the original picture plane */
+ OMX_S32 nSrcOrgStep, /* in: stride of the original picture plane; multiple of iBlockWidth */
+ const OMX_U8 *pSrcRefY, /* in: top-left corner of the co-located block in the reference picture */
+ OMX_S32 nSrcRefStep, /* in: stride of the reference picture plane */
+ OMX_U8 iBlockWidth, /* in: block width in pixels; 4, 8, or 16 */
+ OMX_U8 iBlockHeight, /* in: block height in pixels; 4, 8, or 16 */
+ OMX_U32 nLamda, /* in: Lamda factor used to compute motion cost */
+ const OMXVCMotionVector *pMVPred, /* in: predicted MV, 1/4-pel units */
+ OMXVCMotionVector *pSrcDstBestMV, /* in/out: half-pel MV on entry, quarter-pel best MV on return; 1/4-pel units */
+ OMX_S32 *pBestCost /* out: motion cost of the best MV (SAD+Lamda*BitsUsedByMV) */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_MotionEstimationMB (6.3.5.3.1)
+ *
+ * Description:
+ * Performs MB-level motion estimation and selects best motion estimation
+ * strategy from the set of modes supported in baseline profile [ISO14496-10].
+ *
+ * Input Arguments:
+ *
+ * pSrcCurrBuf - Pointer to the current position in original picture plane;
+ * 16-byte alignment required
+ * pSrcRefBufList - Pointer to an array with 16 entries. Each entry points
+ * to the top-left corner of the co-located MB in a reference
+ * picture. The array is filled from low-to-high with valid
+ * reference frame pointers; the unused high entries should be set
+ * to NULL. Ordering of the reference frames should follow
+ * [ISO14496-10] subclause 8.2.4 Decoding Process for Reference
+ * Picture Lists. The entries must be 16-byte aligned.
+ * pSrcRecBuf - Pointer to the top-left corner of the co-located MB in the
+ * reconstructed picture; must be 16-byte aligned.
+ * SrcCurrStep - Width of the original picture plane in terms of full
+ * pixels; must be a multiple of 16.
+ * SrcRefStep - Width of the reference picture plane in terms of full
+ * pixels; must be a multiple of 16.
+ * SrcRecStep - Width of the reconstructed picture plane in terms of full
+ * pixels; must be a multiple of 16.
+ * pRefRect - Pointer to the valid reference rectangle; relative to the
+ * image origin.
+ * pCurrPointPos - Position of the current macroblock in the current plane.
+ * Lambda - Lagrange factor for computing the cost function
+ * pMESpec - Pointer to the motion estimation specification structure; must
+ * have been allocated and initialized prior to calling this
+ * function.
+ * pMBInter - Array, of dimension four, containing pointers to information
+ * associated with four adjacent type INTER MBs (Left, Top,
+ * Top-Left, Top-Right). Any pointer in the array may be set equal
+ * to NULL if the corresponding MB doesn't exist or is not of type
+ * INTER.
+ * - pMBInter[0] - Pointer to left MB information
+ * - pMBInter[1] - Pointer to top MB information
+ * - pMBInter[2] - Pointer to top-left MB information
+ * - pMBInter[3] - Pointer to top-right MB information
+ * pMBIntra - Array, of dimension four, containing pointers to information
+ * associated with four adjacent type INTRA MBs (Left, Top,
+ * Top-Left, Top-Right). Any pointer in the array may be set equal
+ * to NULL if the corresponding MB doesn't exist or is not of type
+ * INTRA.
+ * - pMBIntra[0] - Pointer to left MB information
+ * - pMBIntra[1] - Pointer to top MB information
+ * - pMBIntra[2] - Pointer to top-left MB information
+ * - pMBIntra[3] - Pointer to top-right MB information
+ * pSrcDstMBCurr - Pointer to information structure for the current MB.
+ * The following entries should be set prior to calling the
+ * function: sliceID - the number of the slice to which the
+ * current MB belongs.
+ *
+ * Output Arguments:
+ *
+ * pDstCost - Pointer to the minimum motion cost for the current MB.
+ * pDstBlockSAD - Pointer to the array of SADs for each of the sixteen luma
+ * 4x4 blocks in each MB. The block SADs are in scan order for
+ * each MB. For implementations that cannot compute the SAD values
+ * individually, the maximum possible value (0xffff) is returned
+ * for each of the 16 block SAD entries.
+ * pSrcDstMBCurr - Pointer to updated information structure for the current
+ * MB after MB-level motion estimation has been completed. The
+ * following fields are updated by the ME function. The following
+ * parameter set quantifies the MB-level ME search results:
+ * - MbType
+ * - subMBType[4]
+ * - pMV0[4][4]
+ * - pMVPred[4][4]
+ * - pRefL0Idx[4]
+ * - Intra16x16PredMode
+ * - pIntra4x4PredMode[4][4]
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - One or more of the following pointers is NULL: pSrcCurrBuf,
+ * pSrcRefBufList, pSrcRecBuf, pRefRect, pCurrPointPos, pMESpec,
+ * pMBInter, pMBIntra,pSrcDstMBCurr, pDstCost, pSrcRefBufList[0]
+ * - SrcRefStep, SrcRecStep are not multiples of 16
+ * - iBlockWidth or iBlockHeight are values other than 4, 8, or 16.
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_MotionEstimationMB (
+ const OMX_U8 *pSrcCurrBuf, /* in: current position in the original picture plane; 16-byte aligned */
+ OMX_S32 SrcCurrStep, /* in: width of the original picture plane; multiple of 16 */
+ const OMX_U8 *pSrcRefBufList[15], /* in: reference MB pointers, filled low-to-high, unused entries NULL. NOTE(review): the bound here is 15 but the documentation describes 16 entries; C does not enforce a parameter array bound - confirm the intended count */
+ OMX_S32 SrcRefStep, /* in: width of the reference picture plane; multiple of 16 */
+ const OMX_U8 *pSrcRecBuf, /* in: co-located MB in the reconstructed picture; 16-byte aligned */
+ OMX_S32 SrcRecStep, /* in: width of the reconstructed picture plane; multiple of 16 */
+ const OMXRect *pRefRect, /* in: valid reference rectangle, relative to the image origin */
+ const OMXVCM4P2Coordinate *pCurrPointPos, /* in: position of the current macroblock in the current plane */
+ OMX_U32 Lambda, /* in: Lagrange factor for the cost function */
+ void *pMESpec, /* in: allocated and initialized ME specification structure */
+ const OMXVCM4P10MBInfoPtr *pMBInter, /* in: four neighbouring INTER MB infos (left, top, top-left, top-right); entries may be NULL */
+ const OMXVCM4P10MBInfoPtr *pMBIntra, /* in: four neighbouring INTRA MB infos (left, top, top-left, top-right); entries may be NULL */
+ OMXVCM4P10MBInfoPtr pSrcDstMBCurr, /* in/out: current MB info; sliceID set by caller, ME results written on return */
+ OMX_INT *pDstCost, /* out: minimum motion cost for the current MB */
+ OMX_U16 *pDstBlockSAD /* out: SADs of the sixteen luma 4x4 blocks, in scan order */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_SAD_4x (6.3.5.4.1)
+ *
+ * Description:
+ * This function calculates the SAD for 4x8 and 4x4 blocks.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrg -Pointer to the original block; must be aligned on a 4-byte
+ * boundary.
+ * iStepOrg -Step of the original block buffer; must be a multiple of 4.
+ * pSrcRef -Pointer to the reference block
+ * iStepRef -Step of the reference block buffer
+ * iHeight -Height of the block; must be equal to either 4 or 8.
+ *
+ * Output Arguments:
+ *
+ * pDstSAD -Pointer of result SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - One or more of the following pointers is NULL:
+ * pSrcOrg, pSrcRef, or pDstSAD
+ * - iHeight is not equal to either 4 or 8.
+ * - iStepOrg is not a multiple of 4
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_SAD_4x (
+ const OMX_U8 *pSrcOrg, /* in: original block; 4-byte aligned */
+ OMX_U32 iStepOrg, /* in: step of the original block buffer; multiple of 4 */
+ const OMX_U8 *pSrcRef, /* in: reference block */
+ OMX_U32 iStepRef, /* in: step of the reference block buffer */
+ OMX_S32 *pDstSAD, /* out: resulting SAD */
+ OMX_U32 iHeight /* in: block height; 4 or 8 */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_SADQuar_4x (6.3.5.4.2)
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the average
+ * of the other two (pSrcRef0 and pSrcRef1) for 4x8 or 4x4 blocks. Rounding
+ * is applied according to the convention (a+b+1)>>1.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the original block; must be aligned on a 4-byte
+ * boundary.
+ * pSrcRef0 - Pointer to reference block 0
+ * pSrcRef1 - Pointer to reference block 1
+ * iSrcStep - Step of the original block buffer; must be a multiple of 4.
+ * iRefStep0 - Step of reference block 0
+ * iRefStep1 - Step of reference block 1
+ * iHeight - Height of the block; must be equal to either 4 or 8.
+ *
+ * Output Arguments:
+ *
+ * pDstSAD - Pointer of result SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - iHeight is not equal to either 4 or 8.
+ * - One or more of the following pointers is NULL: pSrc, pSrcRef0,
+ * pSrcRef1, pDstSAD.
+ * - iSrcStep is not a multiple of 4
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_SADQuar_4x (
+ const OMX_U8 *pSrc, /* in: original block; 4-byte aligned */
+ const OMX_U8 *pSrcRef0, /* in: reference block 0 */
+ const OMX_U8 *pSrcRef1, /* in: reference block 1 */
+ OMX_U32 iSrcStep, /* in: step of the original block buffer; multiple of 4 */
+ OMX_U32 iRefStep0, /* in: step of reference block 0 */
+ OMX_U32 iRefStep1, /* in: step of reference block 1 */
+ OMX_U32 *pDstSAD, /* out: resulting SAD */
+ OMX_U32 iHeight /* in: block height; 4 or 8 */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_SADQuar_8x (6.3.5.4.3)
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the average
+ * of the other two (pSrcRef0 and pSrcRef1) for 8x16, 8x8, or 8x4 blocks.
+ * Rounding is applied according to the convention (a+b+1)>>1.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the original block; must be aligned on an 8-byte
+ * boundary.
+ * pSrcRef0 - Pointer to reference block 0
+ * pSrcRef1 - Pointer to reference block 1
+ * iSrcStep - Step of the original block buffer; must be a multiple of 8.
+ * iRefStep0 - Step of reference block 0
+ * iRefStep1 - Step of reference block 1
+ * iHeight - Height of the block; must be equal either 4, 8, or 16.
+ *
+ * Output Arguments:
+ *
+ * pDstSAD - Pointer of result SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - iHeight is not equal to either 4, 8, or 16.
+ * - One or more of the following pointers is NULL: pSrc, pSrcRef0,
+ * pSrcRef1, pDstSAD.
+ * - iSrcStep is not a multiple of 8
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_SADQuar_8x (
+ const OMX_U8 *pSrc, /* in: original block; 8-byte aligned */
+ const OMX_U8 *pSrcRef0, /* in: reference block 0 */
+ const OMX_U8 *pSrcRef1, /* in: reference block 1 */
+ OMX_U32 iSrcStep, /* in: step of the original block buffer; multiple of 8 */
+ OMX_U32 iRefStep0, /* in: step of reference block 0 */
+ OMX_U32 iRefStep1, /* in: step of reference block 1 */
+ OMX_U32 *pDstSAD, /* out: resulting SAD */
+ OMX_U32 iHeight /* in: block height; 4, 8, or 16 */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_SADQuar_16x (6.3.5.4.4)
+ *
+ * Description:
+ * This function calculates the SAD between one block (pSrc) and the average
+ * of the other two (pSrcRef0 and pSrcRef1) for 16x16 or 16x8 blocks.
+ * Rounding is applied according to the convention (a+b+1)>>1.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the original block; must be aligned on a 16-byte
+ * boundary.
+ * pSrcRef0 - Pointer to reference block 0
+ * pSrcRef1 - Pointer to reference block 1
+ * iSrcStep - Step of the original block buffer; must be a multiple of 16
+ * iRefStep0 - Step of reference block 0
+ * iRefStep1 - Step of reference block 1
+ * iHeight - Height of the block; must be equal to either 8 or 16
+ *
+ * Output Arguments:
+ *
+ * pDstSAD -Pointer of result SAD
+ *
+ * Return Value:
+ * OMX_Sts_NoErr, if the function runs without error.
+ * OMX_Sts_BadArgErr - bad arguments: if one of the following cases occurs:
+ * - iHeight is not equal to either 8 or 16.
+ * - One or more of the following pointers is NULL: pSrc, pSrcRef0,
+ * pSrcRef1, pDstSAD.
+ * - iSrcStep is not a multiple of 16
+ * - Any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_SADQuar_16x (
+ const OMX_U8 *pSrc, /* in: original block; 16-byte aligned */
+ const OMX_U8 *pSrcRef0, /* in: reference block 0 */
+ const OMX_U8 *pSrcRef1, /* in: reference block 1 */
+ OMX_U32 iSrcStep, /* in: step of the original block buffer; multiple of 16 */
+ OMX_U32 iRefStep0, /* in: step of reference block 0 */
+ OMX_U32 iRefStep1, /* in: step of reference block 1 */
+ OMX_U32 *pDstSAD, /* out: resulting SAD */
+ OMX_U32 iHeight /* in: block height; 8 or 16 */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_SATD_4x4 (6.3.5.4.5)
+ *
+ * Description:
+ * This function calculates the sum of absolute transform differences (SATD)
+ * for a 4x4 block by applying a Hadamard transform to the difference block
+ * and then calculating the sum of absolute coefficient values.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrg - Pointer to the original block; must be aligned on a 4-byte
+ * boundary
+ * iStepOrg - Step of the original block buffer; must be a multiple of 4
+ * pSrcRef - Pointer to the reference block; must be aligned on a 4-byte
+ * boundary
+ * iStepRef - Step of the reference block buffer; must be a multiple of 4
+ *
+ * Output Arguments:
+ *
+ * pDstSAD - pointer to the resulting SAD
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pSrcOrg, pSrcRef, or pDstSAD
+ * - either pSrcOrg or pSrcRef is not aligned on a 4-byte boundary
+ * - iStepOrg <= 0 or iStepOrg is not a multiple of 4
+ * - iStepRef <= 0 or iStepRef is not a multiple of 4
+ *
+ */
+OMXResult omxVCM4P10_SATD_4x4 (
+ const OMX_U8 *pSrcOrg, /* in: original block; 4-byte aligned */
+ OMX_U32 iStepOrg, /* in: step of the original block buffer; multiple of 4 */
+ const OMX_U8 *pSrcRef, /* in: reference block; 4-byte aligned */
+ OMX_U32 iStepRef, /* in: step of the reference block buffer; multiple of 4 */
+ OMX_U32 *pDstSAD /* out: resulting sum of absolute transform differences */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_InterpolateHalfHor_Luma (6.3.5.5.1)
+ *
+ * Description:
+ * This function performs interpolation for two horizontal 1/2-pel positions -
+ * (-1/2, 0) and (1/2, 0) - around a full-pel position.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the top-left corner of the block used to interpolate in
+ * the reconstruction frame plane.
+ * iSrcStep - Step of the source buffer.
+ * iDstStep - Step of the destination(interpolation) buffer; must be a
+ * multiple of iWidth.
+ * iWidth - Width of the current block; must be equal to either 4, 8, or 16
+ * iHeight - Height of the current block; must be equal to 4, 8, or 16
+ *
+ * Output Arguments:
+ *
+ * pDstLeft - Pointer to the interpolation buffer of the left 1/2-pel position
+ * (-1/2, 0)
+ * If iWidth==4, 4-byte alignment required.
+ * If iWidth==8, 8-byte alignment required.
+ * If iWidth==16, 16-byte alignment required.
+ * pDstRight - Pointer to the interpolation buffer of the right 1/2-pel
+ * position (1/2, 0)
+ * If iWidth==4, 4-byte alignment required.
+ * If iWidth==8, 8-byte alignment required.
+ * If iWidth==16, 16-byte alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pSrc, pDstLeft, or pDstRight
+ * - iWidth or iHeight have values other than 4, 8, or 16
+ * - iWidth==4 but pDstLeft and/or pDstRight is/are not aligned on a 4-byte boundary
+ * - iWidth==8 but pDstLeft and/or pDstRight is/are not aligned on a 8-byte boundary
+ * - iWidth==16 but pDstLeft and/or pDstRight is/are not aligned on a 16-byte boundary
+ * - any alignment restrictions are violated
+ *
+ */
+OMXResult omxVCM4P10_InterpolateHalfHor_Luma (
+ const OMX_U8 *pSrc, /* in: top-left corner of the block to interpolate, in the reconstruction frame plane */
+ OMX_U32 iSrcStep, /* in: step of the source buffer */
+ OMX_U8 *pDstLeft, /* out: interpolation buffer for position (-1/2, 0); aligned to iWidth bytes */
+ OMX_U8 *pDstRight, /* out: interpolation buffer for position (1/2, 0); aligned to iWidth bytes */
+ OMX_U32 iDstStep, /* in: step of the destination buffers; multiple of iWidth */
+ OMX_U32 iWidth, /* in: block width; 4, 8, or 16 */
+ OMX_U32 iHeight /* in: block height; 4, 8, or 16 */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_InterpolateHalfVer_Luma (6.3.5.5.2)
+ *
+ * Description:
+ * This function performs interpolation for two vertical 1/2-pel positions -
+ * (0, -1/2) and (0, 1/2) - around a full-pel position.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to top-left corner of block used to interpolate in the
+ * reconstructed frame plane
+ * iSrcStep - Step of the source buffer.
+ * iDstStep - Step of the destination (interpolation) buffer; must be a
+ * multiple of iWidth.
+ * iWidth - Width of the current block; must be equal to either 4, 8, or 16
+ * iHeight - Height of the current block; must be equal to either 4, 8, or 16
+ *
+ * Output Arguments:
+ *
+ * pDstUp - Pointer to the interpolation buffer of the 1/2-pel position above
+ * the current full-pel position (0, -1/2)
+ * If iWidth==4, 4-byte alignment required.
+ * If iWidth==8, 8-byte alignment required.
+ * If iWidth==16, 16-byte alignment required.
+ * pDstDown - Pointer to the interpolation buffer of the 1/2-pel position below
+ * the current full-pel position (0, 1/2)
+ * If iWidth==4, 4-byte alignment required.
+ * If iWidth==8, 8-byte alignment required.
+ * If iWidth==16, 16-byte alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pSrc, pDstUp, or pDstDown
+ * - iWidth or iHeight have values other than 4, 8, or 16
+ * - iWidth==4 but pDstUp and/or pDstDown is/are not aligned on a 4-byte boundary
+ * - iWidth==8 but pDstUp and/or pDstDown is/are not aligned on a 8-byte boundary
+ * - iWidth==16 but pDstUp and/or pDstDown is/are not aligned on a 16-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_InterpolateHalfVer_Luma (
+ const OMX_U8 *pSrc, /* in: top-left corner of the block to interpolate, in the reconstructed frame plane */
+ OMX_U32 iSrcStep, /* in: step of the source buffer */
+ OMX_U8 *pDstUp, /* out: interpolation buffer for position (0, -1/2); aligned to iWidth bytes */
+ OMX_U8 *pDstDown, /* out: interpolation buffer for position (0, 1/2); aligned to iWidth bytes */
+ OMX_U32 iDstStep, /* in: step of the destination buffers; multiple of iWidth */
+ OMX_U32 iWidth, /* in: block width; 4, 8, or 16 */
+ OMX_U32 iHeight /* in: block height; 4, 8, or 16 */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_Average_4x (6.3.5.5.3)
+ *
+ * Description:
+ * This function calculates the average of two 4x4 or 4x8 blocks. The result
+ * is rounded according to (a+b+1)/2.
+ *
+ * Input Arguments:
+ *
+ * pPred0 - Pointer to the top-left corner of reference block 0
+ * pPred1 - Pointer to the top-left corner of reference block 1
+ * iPredStep0 - Step of reference block 0; must be a multiple of 4.
+ * iPredStep1 - Step of reference block 1; must be a multiple of 4.
+ * iDstStep - Step of the destination buffer; must be a multiple of 4.
+ * iHeight - Height of the blocks; must be either 4 or 8.
+ *
+ * Output Arguments:
+ *
+ * pDstPred - Pointer to the destination buffer. 4-byte alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pPred0, pPred1, or pDstPred
+ * - pDstPred is not aligned on a 4-byte boundary
+ * - iPredStep0 <= 0 or iPredStep0 is not a multiple of 4
+ * - iPredStep1 <= 0 or iPredStep1 is not a multiple of 4
+ * - iDstStep <= 0 or iDstStep is not a multiple of 4
+ * - iHeight is not equal to either 4 or 8
+ *
+ */
+OMXResult omxVCM4P10_Average_4x (
+ const OMX_U8 *pPred0, /* in: top-left corner of reference block 0 */
+ const OMX_U8 *pPred1, /* in: top-left corner of reference block 1 */
+ OMX_U32 iPredStep0, /* in: step of reference block 0; multiple of 4 */
+ OMX_U32 iPredStep1, /* in: step of reference block 1; multiple of 4 */
+ OMX_U8 *pDstPred, /* out: destination buffer; 4-byte aligned */
+ OMX_U32 iDstStep, /* in: step of the destination buffer; multiple of 4 */
+ OMX_U32 iHeight /* in: block height; 4 or 8 */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_TransformQuant_ChromaDC (6.3.5.6.1)
+ *
+ * Description:
+ * This function performs 2x2 Hadamard transform of chroma DC coefficients
+ * and then quantizes the coefficients.
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - Pointer to the 2x2 array of chroma DC coefficients. 8-byte
+ * alignment required.
+ * iQP - Quantization parameter; must be in the range [0,51].
+ * bIntra - Indicate whether this is an INTRA block. 1-INTRA, 0-INTER
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - Pointer to transformed and quantized coefficients. 8-byte
+ * alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pSrcDst
+ * - pSrcDst is not aligned on an 8-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_TransformQuant_ChromaDC (
+ OMX_S16 *pSrcDst, /* in/out: 2x2 chroma DC coefficients, transformed and quantized in place; 8-byte aligned */
+ OMX_U32 iQP, /* in: quantization parameter; range [0,51] */
+ OMX_U8 bIntra /* in: 1 = INTRA block, 0 = INTER block */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_TransformQuant_LumaDC (6.3.5.6.2)
+ *
+ * Description:
+ * This function performs a 4x4 Hadamard transform of luma DC coefficients
+ * and then quantizes the coefficients.
+ *
+ * Input Arguments:
+ *
+ * pSrcDst - Pointer to the 4x4 array of luma DC coefficients. 16-byte
+ * alignment required.
+ * iQP - Quantization parameter; must be in the range [0,51].
+ *
+ * Output Arguments:
+ *
+ * pSrcDst - Pointer to transformed and quantized coefficients. 16-byte
+ * alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL: pSrcDst
+ * - pSrcDst is not aligned on a 16-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_TransformQuant_LumaDC (
+ OMX_S16 *pSrcDst, /* in/out: 4x4 luma DC coefficients, transformed and quantized in place; 16-byte aligned */
+ OMX_U32 iQP /* in: quantization parameter; range [0,51] */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_InvTransformDequant_LumaDC (6.3.5.6.3)
+ *
+ * Description:
+ * This function performs inverse 4x4 Hadamard transform and then dequantizes
+ * the coefficients.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the 4x4 array of the 4x4 Hadamard-transformed and
+ * quantized coefficients. 16 byte alignment required.
+ * iQP - Quantization parameter; must be in the range [0,51].
+ *
+ * Output Arguments:
+ *
+ * pDst - Pointer to inverse-transformed and dequantized coefficients.
+ * 16-byte alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL: pSrc
+ * - pSrc or pDst is not aligned on a 16-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_InvTransformDequant_LumaDC (
+ const OMX_S16 *pSrc, /* in: 4x4 Hadamard-transformed, quantized coefficients; 16-byte aligned */
+ OMX_S16 *pDst, /* out: inverse-transformed, dequantized coefficients; 16-byte aligned */
+ OMX_U32 iQP /* in: quantization parameter; range [0,51] */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_InvTransformDequant_ChromaDC (6.3.5.6.4)
+ *
+ * Description:
+ * This function performs inverse 2x2 Hadamard transform and then dequantizes
+ * the coefficients.
+ *
+ * Input Arguments:
+ *
+ * pSrc - Pointer to the 2x2 array of the 2x2 Hadamard-transformed and
+ * quantized coefficients. 8 byte alignment required.
+ * iQP - Quantization parameter; must be in the range [0,51].
+ *
+ * Output Arguments:
+ *
+ * pDst - Pointer to inverse-transformed and dequantized coefficients.
+ * 8-byte alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL: pSrc
+ * - pSrc or pDst is not aligned on an 8-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_InvTransformDequant_ChromaDC (
+ const OMX_S16 *pSrc, /* in: 2x2 Hadamard-transformed, quantized coefficients; 8-byte aligned */
+ OMX_S16 *pDst, /* out: inverse-transformed, dequantized coefficients; 8-byte aligned */
+ OMX_U32 iQP /* in: quantization parameter; range [0,51] */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_InvTransformResidualAndAdd (6.3.5.7.1)
+ *
+ * Description:
+ * This function performs an inverse 4x4 integer transformation to produce
+ * the difference signal and then adds the difference to the prediction to get
+ * the reconstructed signal.
+ *
+ * Input Arguments:
+ *
+ * pSrcPred - Pointer to prediction signal. 4-byte alignment required.
+ * pDequantCoeff - Pointer to the transformed coefficients. 8-byte
+ * alignment required.
+ * iSrcPredStep - Step of the prediction buffer; must be a multiple of 4.
+ * iDstReconStep - Step of the destination reconstruction buffer; must be a
+ * multiple of 4.
+ * bAC - Indicates whether there are AC coefficients in the coefficients
+ * matrix.
+ *
+ * Output Arguments:
+ *
+ * pDstRecon -Pointer to the destination reconstruction buffer. 4-byte
+ * alignment required.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pSrcPred, pDequantCoeff, pDstRecon
+ * - pSrcPred is not aligned on a 4-byte boundary
+ * - iSrcPredStep or iDstReconStep is not a multiple of 4.
+ * - pDequantCoeff is not aligned on an 8-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_InvTransformResidualAndAdd (
+ const OMX_U8 *pSrcPred, /* in: prediction signal; 4-byte aligned */
+ const OMX_S16 *pDequantCoeff, /* in: transformed coefficients; 8-byte aligned */
+ OMX_U8 *pDstRecon, /* out: destination reconstruction buffer; 4-byte aligned */
+ OMX_U32 iSrcPredStep, /* in: step of the prediction buffer; multiple of 4 */
+ OMX_U32 iDstReconStep, /* in: step of the reconstruction buffer; multiple of 4 */
+ OMX_U8 bAC /* in: indicates whether the coefficient matrix contains AC coefficients */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_SubAndTransformQDQResidual (6.3.5.8.1)
+ *
+ * Description:
+ * This function subtracts the prediction signal from the original signal to
+ * produce the difference signal and then performs a 4x4 integer transform and
+ * quantization. The quantized transformed coefficients are stored as
+ * pDstQuantCoeff. This function can also output dequantized coefficients or
+ * unquantized DC coefficients optionally by setting the pointers
+ * pDstDeQuantCoeff, pDCCoeff.
+ *
+ * Input Arguments:
+ *
+ * pSrcOrg - Pointer to original signal. 4-byte alignment required.
+ * pSrcPred - Pointer to prediction signal. 4-byte alignment required.
+ * iSrcOrgStep - Step of the original signal buffer; must be a multiple of
+ * 4.
+ * iSrcPredStep - Step of the prediction signal buffer; must be a multiple
+ * of 4.
+ * pNumCoeff -Number of non-zero coefficients after quantization. If this
+ * parameter is not required, it is set to NULL.
+ * nThreshSAD - Zero-block early detection threshold. If this parameter is
+ * not required, it is set to 0.
+ * iQP - Quantization parameter; must be in the range [0,51].
+ * bIntra - Indicates whether this is an INTRA block, either 1-INTRA or
+ * 0-INTER
+ *
+ * Output Arguments:
+ *
+ * pDstQuantCoeff - Pointer to the quantized transformed coefficients.
+ * 8-byte alignment required.
+ * pDstDeQuantCoeff - Pointer to the dequantized transformed coefficients
+ * if this parameter is not equal to NULL. 8-byte alignment
+ * required.
+ * pDCCoeff - Pointer to the unquantized DC coefficient if this parameter
+ * is not equal to NULL.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pSrcOrg, pSrcPred, pNumCoeff, pDstQuantCoeff,
+ * pDstDeQuantCoeff, pDCCoeff
+ * - pSrcOrg is not aligned on a 4-byte boundary
+ * - pSrcPred is not aligned on a 4-byte boundary
+ * - iSrcOrgStep is not a multiple of 4
+ * - iSrcPredStep is not a multiple of 4
+ * - pDstQuantCoeff or pDstDeQuantCoeff is not aligned on an 8-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_SubAndTransformQDQResidual (
+ const OMX_U8 *pSrcOrg, /* in: original signal; 4-byte aligned */
+ const OMX_U8 *pSrcPred, /* in: prediction signal; 4-byte aligned */
+ OMX_U32 iSrcOrgStep, /* in: step of the original signal buffer; multiple of 4 */
+ OMX_U32 iSrcPredStep, /* in: step of the prediction signal buffer; multiple of 4 */
+ OMX_S16 *pDstQuantCoeff, /* out: quantized transformed coefficients; 8-byte aligned */
+ OMX_S16 *pDstDeQuantCoeff, /* out: dequantized transformed coefficients; 8-byte aligned */
+ OMX_S16 *pDCCoeff, /* out: unquantized DC coefficient */
+ OMX_S8 *pNumCoeff, /* number of non-zero coefficients after quantization */
+ OMX_U32 nThreshSAD, /* in: zero-block early detection threshold; 0 if not required */
+ OMX_U32 iQP, /* in: quantization parameter; range [0,51] */
+ OMX_U8 bIntra /* in: 1 = INTRA block, 0 = INTER block */
+);
+
+
+
+/**
+ * Function: omxVCM4P10_GetVLCInfo (6.3.5.9.1)
+ *
+ * Description:
+ * This function extracts run-length encoding (RLE) information from the
+ * coefficient matrix. The results are returned in an OMXVCM4P10VLCInfo
+ * structure.
+ *
+ * Input Arguments:
+ *
+ * pSrcCoeff - pointer to the transform coefficient matrix. 8-byte
+ * alignment required.
+ * pScanMatrix - pointer to the scan order definition matrix. For a luma
+ * block the scan matrix should follow [ISO14496-10] section 8.5.4,
+ * and should contain the values 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13,
+ * 10, 7, 11, 14, 15. For a chroma block, the scan matrix should
+ * contain the values 0, 1, 2, 3.
+ * bAC - indicates presence of a DC coefficient; 0 = DC coefficient
+ * present, 1= DC coefficient absent.
+ * MaxNumCoef - specifies the number of coefficients contained in the
+ * transform coefficient matrix, pSrcCoeff. The value should be 16
+ * for blocks of type LUMADC, LUMAAC, LUMALEVEL, and CHROMAAC. The
+ * value should be 4 for blocks of type CHROMADC.
+ *
+ * Output Arguments:
+ *
+ * pDstVLCInfo - pointer to structure that stores information for
+ * run-length coding.
+ *
+ * Return Value:
+ *
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments; returned if any of the following
+ * conditions are true:
+ * - at least one of the following pointers is NULL:
+ * pSrcCoeff, pScanMatrix, pDstVLCInfo
+ * - pSrcCoeff is not aligned on an 8-byte boundary
+ *
+ */
+OMXResult omxVCM4P10_GetVLCInfo (
+ const OMX_S16 *pSrcCoeff, /* in: transform coefficient matrix; 8-byte aligned */
+ const OMX_U8 *pScanMatrix, /* in: scan order definition matrix */
+ OMX_U8 bAC, /* in: 0 = DC coefficient present, 1 = DC coefficient absent */
+ OMX_U32 MaxNumCoef, /* in: number of coefficients in pSrcCoeff; 16, or 4 for CHROMADC */
+ OMXVCM4P10VLCInfo*pDstVLCInfo /* out: structure receiving the run-length coding information */
+);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /** end of #define _OMXVC_H_ */
+
+/** EOF */
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h
new file mode 100755
index 0000000..89f3040
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/api/omxVC_s.h
@@ -0,0 +1,129 @@
+;/******************************************************************************
+;// Copyright (c) 1999-2005 The Khronos Group Inc. All Rights Reserved
+;//
+;//
+;//
+;//
+;//
+;//
+;//
+;//
+;******************************************************************************/
+
+;/** =============== Structure Definition for Sample Generation ============== */
+;/** transparent status */
+;// Each "NAME EQU n" line defines an assembly-time constant; text after ';' on a line is a comment.
+;enum {
+OMX_VIDEO_TRANSPARENT EQU 0; /** Wholly transparent */
+OMX_VIDEO_PARTIAL EQU 1; /** Partially transparent */
+OMX_VIDEO_OPAQUE EQU 2; /** Opaque */
+;}
+
+;/** direction */
+;enum {
+OMX_VIDEO_NONE EQU 0;
+OMX_VIDEO_HORIZONTAL EQU 1;
+OMX_VIDEO_VERTICAL EQU 2;
+;}
+
+;/** bilinear interpolation type */
+;enum {
+OMX_VIDEO_INTEGER_PIXEL EQU 0; /** case a */
+OMX_VIDEO_HALF_PIXEL_X EQU 1; /** case b */
+OMX_VIDEO_HALF_PIXEL_Y EQU 2; /** case c */
+OMX_VIDEO_HALF_PIXEL_XY EQU 3; /** case d */
+;}
+;/** neighbour-position flags: every value is a distinct power of two */
+;enum {
+OMX_UPPER EQU 1; /** set if the above macroblock is available */
+OMX_LEFT EQU 2; /** set if the left macroblock is available */
+OMX_CENTER EQU 4;
+OMX_RIGHT EQU 8;
+OMX_LOWER EQU 16;
+OMX_UPPER_LEFT EQU 32; /** set if the above-left macroblock is available */
+OMX_UPPER_RIGHT EQU 64; /** set if the above-right macroblock is available */
+OMX_LOWER_LEFT EQU 128;
+OMX_LOWER_RIGHT EQU 256 ; /** 256 = 1<<8, one bit above an 8-bit range */
+;}
+;/** video component */
+;enum {
+OMX_VIDEO_LUMINANCE EQU 0; /** Luminance component */
+OMX_VIDEO_CHROMINANCE EQU 1; /** chrominance component */
+OMX_VIDEO_ALPHA EQU 2; /** Alpha component */
+;}
+
+;enum {
+OMX_VIDEO_INTER EQU 0; /** P picture or P-VOP */
+OMX_VIDEO_INTER_Q EQU 1; /** P picture or P-VOP */
+OMX_VIDEO_INTER4V EQU 2; /** P picture or P-VOP */
+OMX_VIDEO_INTRA EQU 3; /** I and P picture; I- and P-VOP */
+OMX_VIDEO_INTRA_Q EQU 4; /** I and P picture; I- and P-VOP */
+OMX_VIDEO_INTER4V_Q EQU 5; /** P picture or P-VOP (H.263)*/
+OMX_VIDEO_DIRECT EQU 6; /** B picture or B-VOP (MPEG-4 only) */
+OMX_VIDEO_INTERPOLATE EQU 7; /** B picture or B-VOP */
+OMX_VIDEO_BACKWARD EQU 8; /** B picture or B-VOP */
+OMX_VIDEO_FORWARD EQU 9; /** B picture or B-VOP */
+OMX_VIDEO_NOTCODED EQU 10; /** B picture or B-VOP */
+;}
+;/** Intra 16x16 prediction modes */
+;enum {
+OMX_16X16_VERT EQU 0; /** Intra_16x16_Vertical (prediction mode) */
+OMX_16X16_HOR EQU 1; /** Intra_16x16_Horizontal (prediction mode) */
+OMX_16X16_DC EQU 2; /** Intra_16x16_DC (prediction mode) */
+OMX_16X16_PLANE EQU 3; /** Intra_16x16_Plane (prediction mode) */
+;}
+;/** Intra 4x4 prediction modes */
+;enum {
+OMX_4x4_VERT EQU 0; /** Intra_4x4_Vertical (prediction mode) */
+OMX_4x4_HOR EQU 1; /** Intra_4x4_Horizontal (prediction mode) */
+OMX_4x4_DC EQU 2; /** Intra_4x4_DC (prediction mode) */
+OMX_4x4_DIAG_DL EQU 3; /** Intra_4x4_Diagonal_Down_Left (prediction mode) */
+OMX_4x4_DIAG_DR EQU 4; /** Intra_4x4_Diagonal_Down_Right (prediction mode) */
+OMX_4x4_VR EQU 5; /** Intra_4x4_Vertical_Right (prediction mode) */
+OMX_4x4_HD EQU 6; /** Intra_4x4_Horizontal_Down (prediction mode) */
+OMX_4x4_VL EQU 7; /** Intra_4x4_Vertical_Left (prediction mode) */
+OMX_4x4_HU EQU 8; /** Intra_4x4_Horizontal_Up (prediction mode) */
+;}
+;/** Intra chroma prediction modes */
+;enum {
+OMX_CHROMA_DC EQU 0; /** Intra_Chroma_DC (prediction mode) */
+OMX_CHROMA_HOR EQU 1; /** Intra_Chroma_Horizontal (prediction mode) */
+OMX_CHROMA_VERT EQU 2; /** Intra_Chroma_Vertical (prediction mode) */
+OMX_CHROMA_PLANE EQU 3; /** Intra_Chroma_Plane (prediction mode) */
+;}
+;/** One constant per struct field below. NOTE(review): the values look like byte offsets into the matching C structs -- confirm. */
+;typedef struct {
+x EQU 0;
+y EQU 4;
+;}OMXCoordinate;
+
+;typedef struct {
+dx EQU 0;
+dy EQU 2;
+;}OMXMotionVector;
+
+;typedef struct {
+xx EQU 0;
+yy EQU 4;
+width EQU 8;
+height EQU 12;
+;}OMXiRect;
+;/** Same numeric values (0..5) as OMX_VIDEO_INTER .. OMX_VIDEO_INTER4V_Q above. */
+;typedef enum {
+OMX_VC_INTER EQU 0; /** P picture or P-VOP */
+OMX_VC_INTER_Q EQU 1; /** P picture or P-VOP */
+OMX_VC_INTER4V EQU 2; /** P picture or P-VOP */
+OMX_VC_INTRA EQU 3; /** I and P picture, I- and P-VOP */
+OMX_VC_INTRA_Q EQU 4; /** I and P picture, I- and P-VOP */
+OMX_VC_INTER4V_Q EQU 5; /** P picture or P-VOP (H.263)*/
+;} OMXVCM4P2MacroblockType;
+;/** Same numeric values (0..2) as OMX_VIDEO_NONE / OMX_VIDEO_HORIZONTAL / OMX_VIDEO_VERTICAL above. */
+;enum {
+OMX_VC_NONE EQU 0
+OMX_VC_HORIZONTAL EQU 1
+OMX_VC_VERTICAL EQU 2
+;};
+
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s
new file mode 100755
index 0000000..296d59d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy16x16_s.s
@@ -0,0 +1,95 @@
+ ;/**
+ ; * Function: omxVCCOMM_Copy16x16
+ ; *
+ ; * Description:
+ ; * Copies the reference 16x16 block to the current block.
+ ; * Parameters:
+ ; * [in] pSrc - pointer to the reference block in the source frame; must be aligned on a 16-byte boundary.
+ ; * [in] step - distance between the starts of consecutive lines in the reference frame, in bytes;
+ ; * must be a multiple of 16 and must be larger than or equal to 16.
+ ; * [out] pDst - pointer to the destination block; must be aligned on a 16-byte boundary (the stores use a 128-bit alignment hint).
+ ; * Return Value:
+ ; * OMX_Sts_NoErr - no error
+ ; * OMX_Sts_BadArgErr - bad arguments; returned under any of the following conditions:
+ ; * - one or more of the following pointers is NULL: pSrc, pDst
+ ; * - one or more of the following pointers is not aligned on a 16-byte boundary: pSrc, pDst
+ ; * - step <16 or step is not a multiple of 16.
+ ; */
+
+ INCLUDE omxtypes_s.h
+
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+ ;// NOTE: no argument checks are performed below; the routine always returns OMX_Sts_NoErr.
+
+ ;//Input Arguments
+pSrc RN 0 ;// r0: source pointer, advanced by step after every row load
+pDst RN 1 ;// r1: destination pointer, advanced by 32 bytes after every store
+step RN 2 ;// r2: source row stride in bytes
+
+;//Local Variables
+Return RN 0 ;// shares r0 with pSrc; only written after the last load
+;// Neon Registers
+;// Eight 8-byte D registers: two per 16-byte row, i.e. four rows per load/store batch.
+X0 DN D0.S8
+X1 DN D1.S8
+X2 DN D2.S8
+X3 DN D3.S8
+X4 DN D4.S8
+X5 DN D5.S8
+X6 DN D6.S8
+X7 DN D7.S8
+
+ M_START omxVCCOMM_Copy16x16
+ ;// Rows 0-3: four strided loads, then two 32-byte stores (destination rows are written back-to-back)
+
+ VLD1 {X0,X1},[pSrc@128],step ;// Load 16 bytes from 16 byte aligned pSrc and pSrc=pSrc + step after loading
+ VLD1 {X2,X3},[pSrc@128],step
+ VLD1 {X4,X5},[pSrc@128],step
+ VLD1 {X6,X7},[pSrc@128],step
+
+ VST1 {X0,X1,X2,X3},[pDst@128]! ;// Store 32 bytes to 16 byte aligned pDst
+ VST1 {X4,X5,X6,X7},[pDst@128]!
+ ;// Rows 4-7
+
+ VLD1 {X0,X1},[pSrc@128],step
+ VLD1 {X2,X3},[pSrc@128],step
+ VLD1 {X4,X5},[pSrc@128],step
+ VLD1 {X6,X7},[pSrc@128],step
+
+ VST1 {X0,X1,X2,X3},[pDst@128]!
+ VST1 {X4,X5,X6,X7},[pDst@128]!
+ ;// Rows 8-11
+
+ VLD1 {X0,X1},[pSrc@128],step
+ VLD1 {X2,X3},[pSrc@128],step
+ VLD1 {X4,X5},[pSrc@128],step
+ VLD1 {X6,X7},[pSrc@128],step
+
+ VST1 {X0,X1,X2,X3},[pDst@128]!
+ VST1 {X4,X5,X6,X7},[pDst@128]!
+ ;// Rows 12-15
+
+ VLD1 {X0,X1},[pSrc@128],step
+ VLD1 {X2,X3},[pSrc@128],step
+ VLD1 {X4,X5},[pSrc@128],step
+ VLD1 {X6,X7},[pSrc@128],step
+
+ VST1 {X0,X1,X2,X3},[pDst@128]!
+ VST1 {X4,X5,X6,X7},[pDst@128]!
+
+
+ MOV Return,#OMX_Sts_NoErr
+
+
+
+ M_END
+ ENDIF
+
+
+
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s
new file mode 100755
index 0000000..db9e5ef
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_Copy8x8_s.s
@@ -0,0 +1,70 @@
+ ;/**
+ ; * Function: omxVCCOMM_Copy8x8
+ ; *
+ ; * Description:
+ ; * Copies the reference 8x8 block to the current block.
+ ; * Parameters:
+ ; * [in] pSrc - pointer to the reference block in the source frame; must be aligned on an 8-byte boundary.
+ ; * [in] step - distance between the starts of consecutive lines in the reference frame, in bytes;
+ ; * must be a multiple of 8 and must be larger than or equal to 8.
+ ; * [out] pDst - pointer to the destination block; must be aligned on an 8-byte boundary.
+ ; * Return Value:
+ ; * OMX_Sts_NoErr - no error
+ ; * OMX_Sts_BadArgErr - bad arguments; returned under any of the following conditions:
+ ; * - one or more of the following pointers is NULL: pSrc, pDst
+ ; * - one or more of the following pointers is not aligned on an 8-byte boundary: pSrc, pDst
+ ; * - step <8 or step is not a multiple of 8.
+ ; */
+
+ INCLUDE omxtypes_s.h
+
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+ ;// NOTE: no argument checks are performed below; the routine always returns OMX_Sts_NoErr.
+
+ ;//Input Arguments
+pSrc RN 0 ;// r0: source pointer, advanced by step after every row load
+pDst RN 1 ;// r1: destination pointer, advanced by 16 bytes after every store
+step RN 2 ;// r2: source row stride in bytes
+
+;//Local Variables
+;// (r3 is not used by this routine; the unreferenced alias "Count" was removed)
+Return RN 0 ;// shares r0 with pSrc; only written after the last load
+;// Neon Registers
+;// Four 8-byte D registers: one per 8-byte row, i.e. four rows per load/store batch.
+X0 DN D0.S8
+X1 DN D1.S8
+X2 DN D2.S8
+X3 DN D3.S8
+ M_START omxVCCOMM_Copy8x8
+
+ ;// Rows 0-3: four strided loads, then two 16-byte stores (destination rows are written back-to-back)
+
+ VLD1 {X0},[pSrc],step ;// Load 8 bytes from 8 byte aligned pSrc, pSrc=pSrc+step after load
+ VLD1 {X1},[pSrc],step
+ VLD1 {X2},[pSrc],step
+ VLD1 {X3},[pSrc],step
+
+ VST1 {X0,X1},[pDst]! ;// Store 16 bytes to 8 byte aligned pDst
+ VST1 {X2,X3},[pDst]!
+ ;// Rows 4-7
+ VLD1 {X0},[pSrc],step
+ VLD1 {X1},[pSrc],step
+ VLD1 {X2},[pSrc],step
+ VLD1 {X3},[pSrc],step
+
+ VST1 {X0,X1},[pDst]!
+ VST1 {X2,X3},[pDst]!
+
+ MOV Return,#OMX_Sts_NoErr
+
+ M_END
+ ENDIF
+
+
+
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
new file mode 100755
index 0000000..5c5b7d8
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/comm/src/omxVCCOMM_ExpandFrame_I_s.s
@@ -0,0 +1,236 @@
+;//
+;//
+;// File Name: omxVCCOMM_ExpandFrame_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// This function will Expand Frame boundary pixels into Plane
+;//
+;//
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+;// Set debugging level
+DEBUG_ON SETL {FALSE}
+
+
+
+ IF CortexA8
+
+ M_START omxVCCOMM_ExpandFrame_I,r11
+ ;// Register aliases. Several names share one core register; the sharing is noted on each line.
+;//Input registers
+
+pSrcDstPlane RN 0
+iFrameWidth RN 1
+iFrameHeight RN 2
+iExpandPels RN 3
+iPlaneStep RN 4 ;// loaded from the stack below (M_ARG / M_LDR)
+pTop RN 5 ;// read pointer into the first frame row (top/bottom pass)
+pBot RN 6 ;// read pointer into the last frame row (top/bottom pass)
+pDstTop RN 7
+pDstBot RN 8
+pLeft RN 5 ;// same register as pTop; used once the top/bottom pass is finished
+pRight RN 6 ;// same register as pBot; used once the top/bottom pass is finished
+pDstLeft RN 9
+pDstRight RN 10
+Offset RN 11 ;// first iExpandPels*iPlaneStep, later iPlaneStep-iExpandPels
+Temp RN 14 ;// r14 used as scratch / loop counter
+Counter RN 12
+Tmp RN 7 ;// not referenced in this routine
+;//Output registers
+
+result RN 0 ;// not referenced; the return value is written through r0 directly
+;// Neon registers
+qData0 QN 0.U8 ;// 16 bytes of the first frame row
+qData1 QN 1.U8 ;// 16 bytes of the last frame row
+dData0 DN 0.U8 ;// D0 overlaps the low half of Q0; reused for the left-edge pixel after the top/bottom pass
+dData1 DN 1.U8 ;// D1 overlaps the high half of Q0; reused for the right-edge pixel
+dData2 DN 2.U8 ;// not referenced in this routine
+dData3 DN 3.U8 ;// not referenced in this routine
+
+ ;// Define stack arguments
+ M_ARG pPlaneStep, 4
+
+ ;// Load argument from the stack
+ M_LDR iPlaneStep, pPlaneStep
+
+ SUB pTop, pSrcDstPlane, #0 ;// Top row pointer of the frame
+ MUL Offset, iExpandPels, iPlaneStep ;// E*Step
+ SUB Temp, iFrameHeight, #1 ;// H-1
+ MUL Temp, iPlaneStep, Temp ;// (H-1)*Step
+ ADD pBot, Temp, pSrcDstPlane ;// BPtr = TPtr + (H-1)*Step
+ MOV Temp, iFrameWidth ;// Outer loop counter
+
+ ;// Check if pSrcDstPlane and iPlaneStep are 16 byte aligned
+ TST pSrcDstPlane, #0xf
+ TSTEQ iPlaneStep, #0xf ;// only evaluated when the pointer test passed
+ BNE Hor8Loop00 ;// either one misaligned: take the @64 path
+
+ ;//
+ ;// Copy top and bottom region of the plane as follows
+ ;// top region = top row elements from the frame
+ ;// bottom region = last row elements from the frame
+ ;//
+
+ ;// Case for 16 byte alignment
+Hor16Loop00
+ SUB pDstTop, pTop, Offset ;// iExpandPels rows above the current 16-column strip
+ VLD1 qData0, [pTop @128]!
+ MOV Counter, iExpandPels ;// Inner loop counter
+ ADD pDstBot, pBot, iPlaneStep ;// first row below the frame for this strip
+ VLD1 qData1, [pBot @128]!
+Ver16Loop0
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ VST1 qData0, [pDstTop @128], iPlaneStep
+ SUBS Counter, Counter, #8 ;// eight rows are written per pass
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ VST1 qData1, [pDstBot @128], iPlaneStep
+ BGT Ver16Loop0
+ ;// Next 16-column strip; pTop/pBot were already advanced by the post-incrementing loads
+ SUBS Temp, Temp, #16
+ BGT Hor16Loop00
+ B EndAlignedLoop
+
+ ;// Case for 8 byte alignment
+Hor8Loop00
+ SUB pDstTop, pTop, Offset ;// iExpandPels rows above the current 16-column strip
+ VLD1 qData0, [pTop @64]!
+ MOV Counter, iExpandPels ;// Inner loop counter
+ ADD pDstBot, pBot, iPlaneStep ;// first row below the frame for this strip
+ VLD1 qData1, [pBot @64]!
+Ver8Loop0
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ VST1 qData0, [pDstTop @64], iPlaneStep
+ SUBS Counter, Counter, #8 ;// eight rows are written per pass
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ VST1 qData1, [pDstBot @64], iPlaneStep
+ BGT Ver8Loop0
+ ;// Still 16 columns per pass on this path; only the alignment hint differs
+ SUBS Temp, Temp, #16
+ BGT Hor8Loop00
+
+EndAlignedLoop
+ ADD Temp, pSrcDstPlane, iFrameWidth
+ SUB pDstRight, Temp, Offset ;// iExpandPels rows above, first column right of the frame
+ SUB pRight, Temp, #1 ;// last pixel of frame row 0
+ SUB pDstLeft, pSrcDstPlane, Offset
+ SUB pDstLeft, pDstLeft, iExpandPels ;// iExpandPels rows above and iExpandPels columns left of the frame
+ ADD pLeft, pSrcDstPlane, #0
+
+ VLD1 {dData0 []}, [pLeft], iPlaneStep ;// Top-Left corner pixel from frame duplicated in dData0
+ SUB Offset, iPlaneStep, iExpandPels ;// Offset now steps from the end of one margin row to the start of the next
+ VLD1 {dData1 []}, [pRight], iPlaneStep ;// Top-Right corner pixel from frame duplicated in dData1
+ MOV Temp, iExpandPels
+
+ ;//
+ ;// Copy top-left and top-right region of the plane as follows
+ ;// top-left region = top-left corner pixel from the frame
+ ;// top-right region = top-right corner pixel from the frame
+ ;//
+HorLoop11
+ MOV Counter, iExpandPels
+VerLoop1
+ VST1 dData0, [pDstLeft], #8
+ SUBS Counter, Counter, #8 ;// eight margin bytes are written per pass
+ VST1 dData1, [pDstRight], #8
+ BGT VerLoop1
+
+ SUBS Temp, Temp, #1
+ ADD pDstLeft, pDstLeft, Offset
+ ADD pDstRight, pDstRight, Offset
+ BPL HorLoop11 ;// BPL (not BGT): runs iExpandPels+1 times, i.e. the rows above the frame plus the margins of frame row 0
+
+ SUB iFrameHeight, iFrameHeight, #1 ;// the margins of frame row 0 were written by HorLoop11
+ ;//
+ ;// Copy left and right region of the plane as follows
+ ;// Left region = copy the row with left start pixel from the frame
+ ;// Right region = copy the row with right end pixel from the frame
+ ;//
+HorLoop22
+ VLD1 {dData0 []}, [pLeft], iPlaneStep
+ MOV Counter, iExpandPels
+ VLD1 {dData1 []}, [pRight], iPlaneStep
+VerLoop2
+ VST1 dData0, [pDstLeft], #8
+ SUBS Counter, Counter, #8
+ VST1 dData1, [pDstRight], #8
+ BGT VerLoop2
+
+ SUBS iFrameHeight, iFrameHeight, #1
+ ADD pDstLeft, pDstLeft, Offset
+ ADD pDstRight, pDstRight, Offset
+ BGT HorLoop22
+
+ MOV Temp, iExpandPels
+ ;//
+ ;// Copy bottom-left and bottom-right region of the plane as follows
+ ;// bottom-left region = bottom-left corner pixel from the frame
+ ;// bottom-right region = bottom-right corner pixel from the frame
+ ;//
+HorLoop33 ;// dData0/dData1 still hold the edge pixels loaded by the final HorLoop22 pass
+ MOV Counter, iExpandPels
+VerLoop3
+ VST1 dData0, [pDstLeft], #8
+ SUBS Counter, Counter, #8
+ VST1 dData1, [pDstRight], #8
+ BGT VerLoop3
+
+ SUBS Temp, Temp, #1
+ ADD pDstLeft, pDstLeft, Offset
+ ADD pDstRight, pDstRight, Offset
+ BGT HorLoop33
+End
+ MOV r0, #OMX_Sts_NoErr
+
+ M_END
+
+ ENDIF
+
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+ END \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h
new file mode 100755
index 0000000..547a2d9
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/api/armVCM4P10_CAVLCTables.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------
+ *
+ *
+ * File Name: armVCM4P10_CAVLCTables.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Header file for optimized H.264 CAVLC tables
+ *
+ */
+
+#ifndef ARMVCM4P10_CAVLCTABLES_H
+#define ARMVCM4P10_CAVLCTABLES_H
+/* NOTE: this header includes nothing itself; the OMX_U8/OMX_S8/OMX_U16 types must already be declared by the includer. */
+/* CAVLC tables */
+
+extern const OMX_U16 *armVCM4P10_CAVLCCoeffTokenTables[18]; /* one table pointer per nC slot: nC=0..16, then nC=-1 last */
+extern const OMX_U16 *armVCM4P10_CAVLCTotalZeroTables[15];
+extern const OMX_U16 *armVCM4P10_CAVLCTotalZeros2x2Tables[3];
+extern const OMX_U16 *armVCM4P10_CAVLCRunBeforeTables[15];
+extern const OMX_U8 armVCM4P10_ZigZag_4x4[16]; /* 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 */
+extern const OMX_U8 armVCM4P10_ZigZag_2x2[4]; /* 0,1,2,3 */
+extern const OMX_S8 armVCM4P10_SuffixToLevel[7]; /* six values of the form (3<<k)+2, then -1 */
+
+#endif /* ARMVCM4P10_CAVLCTABLES_H */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
new file mode 100755
index 0000000..4f0892d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
@@ -0,0 +1,222 @@
+;//
+;//
+;// File Name: armVCM4P10_Average_4x_Align_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+;// Functions:
+;// armVCM4P10_Average_4x4_Align<ALIGNMENT>_unsafe
+;//
+;// Implements Average of 4x4 with equation c = (a+b+1)>>1.
+;// First operand will be at offset ALIGNMENT from aligned address
+;// Second operand will be at aligned location and will be used as output.
+;// destination pointed by (pDst) for vertical interpolation.
+;// This function needs to copy 4 bytes in horizontal direction
+;//
+;// Registers used as input for this function
+;// r0,r1,r2,r3 where r2 contains the aligned memory pointer and r3 its step size
+;//
+;// Registers preserved for top level function
+;// r4,r5,r6,r8,r9,r14
+;//
+;// Registers modified by the function
+;// r7,r10,r11,r12
+;//
+;// Output registers
+;// r2 - pointer to the aligned location
+;// r3 - step size to this aligned location
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS ARM1136JS
+
+ EXPORT armVCM4P10_Average_4x4_Align0_unsafe
+ EXPORT armVCM4P10_Average_4x4_Align2_unsafe
+ EXPORT armVCM4P10_Average_4x4_Align3_unsafe
+
+DEBUG_ON SETL {FALSE}
+
+;// Declare input registers
+pPred0 RN 0 ;// r0: first operand pointer
+iPredStep0 RN 1 ;// r1: row step applied to pPred0
+pPred1 RN 2 ;// r2: second operand pointer
+iPredStep1 RN 3 ;// r3: row step for pPred1
+pDstPred RN 2 ;// same register as pPred1: results are stored over the second operand
+iDstStep RN 3 ;// same register as iPredStep1
+
+;// Declare other intermediate registers
+iPredA0 RN 10
+iPredA1 RN 11
+iPredB0 RN 12
+iPredB1 RN 14
+Temp1 RN 4 ;// shares r4 with ResultB
+Temp2 RN 5 ;// shares r5 with ResultA
+ResultA RN 5 ;// may only be written after Temp2 has been consumed
+ResultB RN 4 ;// may only be written after Temp1 has been consumed
+r0x80808080 RN 7 ;// loaded with the constant 0x80808080 at the start of each function
+
+ IF ARM1136JS
+
+ ;// This function calculates average of 4x4 block
+ ;// pPred0 is at alignment offset 0 and pPred1 is alignment 4
+ ;// Four rows are handled as two pairs; each row is one 32-bit word (4 bytes).
+ ;// Function header
+ M_START armVCM4P10_Average_4x4_Align0_unsafe, r6
+
+ ;// Code start
+ LDR r0x80808080, =0x80808080
+
+ ;// 1st load
+ M_LDR iPredB0, [pPred1]
+ M_LDR iPredA0, [pPred0], iPredStep0
+ M_LDR iPredB1, [pPred1, iPredStep1]
+ M_LDR iPredA1, [pPred0], iPredStep0
+
+ ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
+ MVN iPredB0, iPredB0 ;// 255-b in every byte
+ MVN iPredB1, iPredB1
+ UHSUB8 ResultA, iPredA0, iPredB0 ;// per byte: (a-(255-b))/2
+ UHSUB8 ResultB, iPredA1, iPredB1
+ EOR ResultA, ResultA, r0x80808080 ;// toggling bit 7 of each byte supplies the +128 term
+ M_STR ResultA, [pDstPred], iDstStep ;// pDstPred is r2 (= pPred1), so the later [pPred1] loads see the advanced pointer
+ EOR ResultB, ResultB, r0x80808080
+ M_STR ResultB, [pDstPred], iDstStep
+
+ ;// 2nd load
+ M_LDR iPredA0, [pPred0], iPredStep0
+ M_LDR iPredB0, [pPred1]
+ M_LDR iPredA1, [pPred0], iPredStep0
+ M_LDR iPredB1, [pPred1, iPredStep1]
+
+ MVN iPredB0, iPredB0
+ UHSUB8 ResultA, iPredA0, iPredB0
+ MVN iPredB1, iPredB1
+ UHSUB8 ResultB, iPredA1, iPredB1
+ EOR ResultA, ResultA, r0x80808080
+ M_STR ResultA, [pDstPred], iDstStep
+ EOR ResultB, ResultB, r0x80808080
+ M_STR ResultB, [pDstPred], iDstStep
+End0
+ M_END
+
+ ;// This function calculates average of 4x4 block
+ ;// pPred0 is at alignment offset 2 and pPred1 is alignment 4
+ ;// Each operand-A row is rebuilt from two words: ([pPred0] >> 16) | ([pPred0+4] << 16).
+ ;// Function header
+ M_START armVCM4P10_Average_4x4_Align2_unsafe, r6
+
+ ;// Code start
+ LDR r0x80808080, =0x80808080
+
+ ;// 1st load
+ LDR Temp1, [pPred0, #4] ;// following word, read before pPred0 is stepped to the next row
+ M_LDR iPredA0, [pPred0], iPredStep0
+ M_LDR iPredB0, [pPred1]
+ M_LDR iPredB1, [pPred1, iPredStep1]
+ M_LDR Temp2, [pPred0, #4]
+ M_LDR iPredA1, [pPred0], iPredStep0
+ MVN iPredB0, iPredB0 ;// 255-b in every byte
+ MVN iPredB1, iPredB1
+ MOV iPredA0, iPredA0, LSR #16 ;// keep the upper 16 bits of the first word
+ ORR iPredA0, iPredA0, Temp1, LSL #16 ;// append the lower 16 bits of the following word
+ MOV iPredA1, iPredA1, LSR #16
+ ORR iPredA1, iPredA1, Temp2, LSL #16 ;// last read of Temp2 (r5) before ResultA (also r5) is written
+
+ ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
+ UHSUB8 ResultA, iPredA0, iPredB0
+ UHSUB8 ResultB, iPredA1, iPredB1
+ EOR ResultA, ResultA, r0x80808080 ;// toggling bit 7 of each byte supplies the +128 term
+ M_STR ResultA, [pDstPred], iDstStep ;// pDstPred is r2 (= pPred1), so the later [pPred1] loads see the advanced pointer
+ EOR ResultB, ResultB, r0x80808080
+ M_STR ResultB, [pDstPred], iDstStep
+
+ ;// 2nd load
+ LDR Temp1, [pPred0, #4]
+ M_LDR iPredA0, [pPred0], iPredStep0
+ LDR iPredB0, [pPred1]
+ LDR iPredB1, [pPred1, iPredStep1]
+ LDR Temp2, [pPred0, #4]
+ M_LDR iPredA1, [pPred0], iPredStep0
+ MVN iPredB0, iPredB0
+ MVN iPredB1, iPredB1
+ MOV iPredA0, iPredA0, LSR #16
+ ORR iPredA0, iPredA0, Temp1, LSL #16
+ MOV iPredA1, iPredA1, LSR #16
+ ORR iPredA1, iPredA1, Temp2, LSL #16
+
+ UHSUB8 ResultA, iPredA0, iPredB0
+ UHSUB8 ResultB, iPredA1, iPredB1
+ EOR ResultA, ResultA, r0x80808080
+ M_STR ResultA, [pDstPred], iDstStep
+ EOR ResultB, ResultB, r0x80808080
+ M_STR ResultB, [pDstPred], iDstStep
+End2
+ M_END
+
+
+ ;// This function calculates average of 4x4 block
+ ;// pPred0 is at alignment offset 3 and pPred1 is alignment 4
+ ;// Each operand-A row is rebuilt from two words: ([pPred0] >> 24) | ([pPred0+4] << 8).
+ ;// Function header
+ M_START armVCM4P10_Average_4x4_Align3_unsafe, r6
+
+ ;// Code start
+ LDR r0x80808080, =0x80808080
+
+ ;// 1st load
+ LDR Temp1, [pPred0, #4] ;// following word, read before pPred0 is stepped to the next row
+ M_LDR iPredA0, [pPred0], iPredStep0
+ LDR iPredB0, [pPred1]
+ LDR iPredB1, [pPred1, iPredStep1]
+ LDR Temp2, [pPred0, #4]
+ M_LDR iPredA1, [pPred0], iPredStep0
+ ;// (a+b+1)/2 is computed per byte as (a-(255-b))/2 + 128: MVN, UHSUB8, then EOR with 0x80808080
+ MVN iPredB0, iPredB0 ;// 255-b in every byte
+ MVN iPredB1, iPredB1
+ MOV iPredA0, iPredA0, LSR #24 ;// keep the top byte of the first word
+ ORR iPredA0, iPredA0, Temp1, LSL #8 ;// append the lower 24 bits of the following word
+ MOV iPredA1, iPredA1, LSR #24
+ ORR iPredA1, iPredA1, Temp2, LSL #8 ;// last read of Temp2 (r5) before ResultA (also r5) is written
+ UHSUB8 ResultA, iPredA0, iPredB0
+ UHSUB8 ResultB, iPredA1, iPredB1
+ EOR ResultA, ResultA, r0x80808080 ;// toggling bit 7 of each byte supplies the +128 term
+ M_STR ResultA, [pDstPred], iDstStep ;// pDstPred is r2 (= pPred1), so the later [pPred1] loads see the advanced pointer
+ EOR ResultB, ResultB, r0x80808080
+ M_STR ResultB, [pDstPred], iDstStep
+
+ ;// 2nd load
+ LDR Temp1, [pPred0, #4]
+ M_LDR iPredA0, [pPred0], iPredStep0
+ LDR iPredB0, [pPred1]
+ LDR iPredB1, [pPred1, iPredStep1]
+ LDR Temp2, [pPred0, #4]
+ M_LDR iPredA1, [pPred0], iPredStep0
+
+ MVN iPredB0, iPredB0
+ MVN iPredB1, iPredB1
+ MOV iPredA0, iPredA0, LSR #24
+ ORR iPredA0, iPredA0, Temp1, LSL #8
+ MOV iPredA1, iPredA1, LSR #24
+ ORR iPredA1, iPredA1, Temp2, LSL #8
+
+ UHSUB8 ResultA, iPredA0, iPredB0
+ UHSUB8 ResultB, iPredA1, iPredB1
+ EOR ResultA, ResultA, r0x80808080
+ M_STR ResultA, [pDstPred], iDstStep
+ EOR ResultB, ResultB, r0x80808080
+ M_STR ResultB, [pDstPred], iDstStep
+End3
+ M_END
+
+ ENDIF
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c
new file mode 100755
index 0000000..137495d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_CAVLCTables.c
@@ -0,0 +1,327 @@
+/* ----------------------------------------------------------------
+ *
+ *
+ * File Name: armVCM4P10_CAVLCTables.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Optimized CAVLC tables for H.264
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+
+#include "armVCM4P10_CAVLCTables.h"
+
+/* De-zigzag lookup for a 4x4 block. */
+/* 16 entries; every value 0..15 appears exactly once. */
+const OMX_U8 armVCM4P10_ZigZag_4x4[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+/* De-zigzag lookup for a 2x2 block. */
+/* Identity order: entry i holds i. */
+const OMX_U8 armVCM4P10_ZigZag_2x2[4] = {
+ 0, 1,
+ 2, 3
+};
+
+
+/*
+ * Suffix To Level table
+ * We increment the suffix length if (each line below rewrites the one before it)
+ * ((LevelCode>>1)+1)>(3<<(SuffixLength-1)) && SuffixLength<6
+ * (LevelCode>>1)>=(3<<(SuffixLength-1)) && SuffixLength<6
+ * LevelCode >= 3<<SuffixLength && SuffixLength<6
+ * (LevelCode+2) >= (3<<SuffixLength)+2 && SuffixLength<6 <- the entries hold this right-hand side
+ */
+const OMX_S8 armVCM4P10_SuffixToLevel[7] =
+{
+ (3<<1)+2, /* first entry (index 0): same value as the SuffixLength=1 entry */
+ (3<<1)+2, /* SuffixLength=1 */
+ (3<<2)+2, /* SuffixLength=2 */
+ (3<<3)+2, /* SuffixLength=3 */
+ (3<<4)+2, /* SuffixLength=4 */
+ (3<<5)+2, /* SuffixLength=5 */
+ -1 /* SuffixLength=6 - never increment */
+};
+/* NOTE(review): in these tables even entries (0x0020, 0x0028, ...) look like byte offsets to a later 4-entry group of the same array, odd entries like terminal codes -- confirm against the decoder. */
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_0[132] = { /* selected for nC=0 and nC=1 */
+ 0x0020, 0x0100, 0x2015, 0x2015, 0x400b, 0x400b, 0x400b, 0x400b,
+ 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001,
+ 0x0028, 0x00f0, 0x00f8, 0x0027, 0x0030, 0x00d8, 0x00e0, 0x00e8,
+ 0x0038, 0x00a0, 0x00c8, 0x00d0, 0x0040, 0x0068, 0x0090, 0x0098,
+ 0x0048, 0x0050, 0x0058, 0x0060, 0x27ff, 0x27ff, 0x206b, 0x206b,
+ 0x0081, 0x0085, 0x0083, 0x0079, 0x0087, 0x007d, 0x007b, 0x0071,
+ 0x007f, 0x0075, 0x0073, 0x0069, 0x0070, 0x0078, 0x0080, 0x0088,
+ 0x2077, 0x2077, 0x206d, 0x206d, 0x2063, 0x2063, 0x2061, 0x2061,
+ 0x206f, 0x206f, 0x2065, 0x2065, 0x205b, 0x205b, 0x2059, 0x2059,
+ 0x0067, 0x005d, 0x0053, 0x0051, 0x005f, 0x0055, 0x004b, 0x0049,
+ 0x00a8, 0x00b0, 0x00b8, 0x00c0, 0x2041, 0x2041, 0x204d, 0x204d,
+ 0x2043, 0x2043, 0x2039, 0x2039, 0x2057, 0x2057, 0x2045, 0x2045,
+ 0x203b, 0x203b, 0x2031, 0x2031, 0x204f, 0x204f, 0x203d, 0x203d,
+ 0x2033, 0x2033, 0x2029, 0x2029, 0x0047, 0x0035, 0x002b, 0x0021,
+ 0x203f, 0x203f, 0x202d, 0x202d, 0x2023, 0x2023, 0x2019, 0x2019,
+ 0x0037, 0x0025, 0x001b, 0x0011, 0x202f, 0x202f, 0x201d, 0x201d,
+ 0x0013, 0x0009, 0x201f, 0x201f
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_1[128] = { /* selected for nC=2 and nC=3 */
+ 0x0020, 0x00e8, 0x00f0, 0x00f8, 0x0027, 0x001f, 0x2015, 0x2015,
+ 0x400b, 0x400b, 0x400b, 0x400b, 0x4001, 0x4001, 0x4001, 0x4001,
+ 0x0028, 0x00d0, 0x00d8, 0x00e0, 0x0030, 0x0098, 0x00c0, 0x00c8,
+ 0x0038, 0x0060, 0x0088, 0x0090, 0x0040, 0x0048, 0x0050, 0x0058,
+ 0x27ff, 0x27ff, 0x207f, 0x207f, 0x0087, 0x0085, 0x0083, 0x0081,
+ 0x007b, 0x0079, 0x007d, 0x0073, 0x2075, 0x2075, 0x2071, 0x2071,
+ 0x0068, 0x0070, 0x0078, 0x0080, 0x2077, 0x2077, 0x206d, 0x206d,
+ 0x206b, 0x206b, 0x2069, 0x2069, 0x206f, 0x206f, 0x2065, 0x2065,
+ 0x2063, 0x2063, 0x2061, 0x2061, 0x0059, 0x005d, 0x005b, 0x0051,
+ 0x0067, 0x0055, 0x0053, 0x0049, 0x00a0, 0x00a8, 0x00b0, 0x00b8,
+ 0x205f, 0x205f, 0x204d, 0x204d, 0x204b, 0x204b, 0x2041, 0x2041,
+ 0x2057, 0x2057, 0x2045, 0x2045, 0x2043, 0x2043, 0x2039, 0x2039,
+ 0x204f, 0x204f, 0x203d, 0x203d, 0x203b, 0x203b, 0x2031, 0x2031,
+ 0x0029, 0x0035, 0x0033, 0x0021, 0x2047, 0x2047, 0x202d, 0x202d,
+ 0x202b, 0x202b, 0x2019, 0x2019, 0x003f, 0x0025, 0x0023, 0x0011,
+ 0x0037, 0x001d, 0x001b, 0x0009, 0x202f, 0x202f, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_2[112] = { /* selected for nC=4 .. nC=7 */
+ 0x0020, 0x0088, 0x00b0, 0x00b8, 0x00c0, 0x00c8, 0x00d0, 0x00d8,
+ 0x003f, 0x0037, 0x002f, 0x0027, 0x001f, 0x0015, 0x000b, 0x0001,
+ 0x0028, 0x0050, 0x0078, 0x0080, 0x0030, 0x0038, 0x0040, 0x0048,
+ 0x07ff, 0x0081, 0x0087, 0x0085, 0x0083, 0x0079, 0x007f, 0x007d,
+ 0x007b, 0x0071, 0x0077, 0x0075, 0x0073, 0x0069, 0x206b, 0x206b,
+ 0x0058, 0x0060, 0x0068, 0x0070, 0x2061, 0x2061, 0x206d, 0x206d,
+ 0x2063, 0x2063, 0x2059, 0x2059, 0x206f, 0x206f, 0x2065, 0x2065,
+ 0x205b, 0x205b, 0x2051, 0x2051, 0x0067, 0x005d, 0x0053, 0x0049,
+ 0x005f, 0x0055, 0x004b, 0x0041, 0x0090, 0x0098, 0x00a0, 0x00a8,
+ 0x2039, 0x2039, 0x2031, 0x2031, 0x204d, 0x204d, 0x2029, 0x2029,
+ 0x2057, 0x2057, 0x2045, 0x2045, 0x2043, 0x2043, 0x2021, 0x2021,
+ 0x0019, 0x003d, 0x003b, 0x0011, 0x004f, 0x0035, 0x0033, 0x0009,
+ 0x202b, 0x202b, 0x202d, 0x202d, 0x2023, 0x2023, 0x2025, 0x2025,
+ 0x201b, 0x201b, 0x2047, 0x2047, 0x201d, 0x201d, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_3[80] = { /* selected for nC=8 .. nC=16 */
+ 0x0020, 0x0028, 0x0030, 0x0038, 0x0040, 0x0048, 0x0050, 0x0058,
+ 0x0060, 0x0068, 0x0070, 0x0078, 0x0080, 0x0088, 0x0090, 0x0098,
+ 0x0009, 0x000b, 0x07ff, 0x0001, 0x0011, 0x0013, 0x0015, 0x07ff,
+ 0x0019, 0x001b, 0x001d, 0x001f, 0x0021, 0x0023, 0x0025, 0x0027,
+ 0x0029, 0x002b, 0x002d, 0x002f, 0x0031, 0x0033, 0x0035, 0x0037,
+ 0x0039, 0x003b, 0x003d, 0x003f, 0x0041, 0x0043, 0x0045, 0x0047,
+ 0x0049, 0x004b, 0x004d, 0x004f, 0x0051, 0x0053, 0x0055, 0x0057,
+ 0x0059, 0x005b, 0x005d, 0x005f, 0x0061, 0x0063, 0x0065, 0x0067,
+ 0x0069, 0x006b, 0x006d, 0x006f, 0x0071, 0x0073, 0x0075, 0x0077,
+ 0x0079, 0x007b, 0x007d, 0x007f, 0x0081, 0x0083, 0x0085, 0x0087
+};
+
+static const OMX_U16 armVCM4P10_CAVLCCoeffTokenTables_4[32] = { /* selected for nC=-1 (last slot of the pointer array) */
+ 0x0020, 0x0038, 0x2015, 0x2015, 0x4001, 0x4001, 0x4001, 0x4001,
+ 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b, 0x600b,
+ 0x0028, 0x0030, 0x0021, 0x0019, 0x2027, 0x2027, 0x0025, 0x0023,
+ 0x201d, 0x201d, 0x201b, 0x201b, 0x0011, 0x001f, 0x0013, 0x0009
+};
+/* Pointer table: slots 0..16 correspond to nC=0..16, slot 17 to nC=-1. */
+const OMX_U16 * armVCM4P10_CAVLCCoeffTokenTables[18] = {
+ armVCM4P10_CAVLCCoeffTokenTables_0, /* nC=0 */
+ armVCM4P10_CAVLCCoeffTokenTables_0, /* nC=1 */
+ armVCM4P10_CAVLCCoeffTokenTables_1, /* nC=2 */
+ armVCM4P10_CAVLCCoeffTokenTables_1, /* nC=3 */
+ armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=4 */
+ armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=5 */
+ armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=6 */
+ armVCM4P10_CAVLCCoeffTokenTables_2, /* nC=7 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=8 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=9 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=10 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=11 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=12 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=13 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=14 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=15 */
+ armVCM4P10_CAVLCCoeffTokenTables_3, /* nC=16 */
+ armVCM4P10_CAVLCCoeffTokenTables_4 /* nC=-1 */
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_0[40] = {
+ 0x0020, 0x0048, 0x0009, 0x0007, 0x2005, 0x2005, 0x2003, 0x2003,
+ 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001,
+ 0x0028, 0x0040, 0x0011, 0x000f, 0x0030, 0x0038, 0x0019, 0x0017,
+ 0x27ff, 0x27ff, 0x201f, 0x201f, 0x201d, 0x201d, 0x201b, 0x201b,
+ 0x2015, 0x2015, 0x2013, 0x2013, 0x200d, 0x200d, 0x200b, 0x200b
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_1[24] = {
+ 0x0020, 0x0028, 0x0011, 0x000f, 0x000d, 0x000b, 0x2009, 0x2009,
+ 0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001,
+ 0x001d, 0x001b, 0x0019, 0x0017, 0x2015, 0x2015, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_2[24] = {
+ 0x0020, 0x0028, 0x0011, 0x000b, 0x0009, 0x0001, 0x200f, 0x200f,
+ 0x200d, 0x200d, 0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003,
+ 0x001b, 0x0017, 0x2019, 0x2019, 0x2015, 0x2015, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_3[24] = {
+ 0x0020, 0x0028, 0x0013, 0x000f, 0x0007, 0x0005, 0x2011, 0x2011,
+ 0x200d, 0x200d, 0x200b, 0x200b, 0x2009, 0x2009, 0x2003, 0x2003,
+ 0x2019, 0x2019, 0x2017, 0x2017, 0x2015, 0x2015, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_4[20] = {
+ 0x0020, 0x0015, 0x0011, 0x0005, 0x0003, 0x0001, 0x200f, 0x200f,
+ 0x200d, 0x200d, 0x200b, 0x200b, 0x2009, 0x2009, 0x2007, 0x2007,
+ 0x2017, 0x2017, 0x2013, 0x2013
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_5[20] = {
+ 0x0020, 0x0011, 0x2013, 0x2013, 0x200f, 0x200f, 0x200d, 0x200d,
+ 0x200b, 0x200b, 0x2009, 0x2009, 0x2007, 0x2007, 0x2005, 0x2005,
+ 0x0015, 0x0001, 0x2003, 0x2003
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_6[20] = {
+ 0x0020, 0x000f, 0x2011, 0x2011, 0x200d, 0x200d, 0x2009, 0x2009,
+ 0x2007, 0x2007, 0x2005, 0x2005, 0x400b, 0x400b, 0x400b, 0x400b,
+ 0x0013, 0x0001, 0x2003, 0x2003
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_7[20] = {
+ 0x0020, 0x0003, 0x200f, 0x200f, 0x200d, 0x200d, 0x2007, 0x2007,
+ 0x400b, 0x400b, 0x400b, 0x400b, 0x4009, 0x4009, 0x4009, 0x4009,
+ 0x0011, 0x0001, 0x2005, 0x2005
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_8[20] = {
+ 0x0020, 0x0005, 0x200b, 0x200b, 0x400d, 0x400d, 0x400d, 0x400d,
+ 0x4009, 0x4009, 0x4009, 0x4009, 0x4007, 0x4007, 0x4007, 0x4007,
+ 0x0003, 0x0001, 0x200f, 0x200f
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_9[20] = {
+ 0x0020, 0x000d, 0x2005, 0x2005, 0x400b, 0x400b, 0x400b, 0x400b,
+ 0x4009, 0x4009, 0x4009, 0x4009, 0x4007, 0x4007, 0x4007, 0x4007,
+ 0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_10[16] = {
+ 0x0001, 0x0003, 0x2005, 0x2005, 0x2007, 0x2007, 0x200b, 0x200b,
+ 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009, 0x6009
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_11[16] = {
+ 0x0001, 0x0003, 0x2009, 0x2009, 0x4005, 0x4005, 0x4005, 0x4005,
+ 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007, 0x6007
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_12[16] = {
+ 0x2001, 0x2001, 0x2003, 0x2003, 0x4007, 0x4007, 0x4007, 0x4007,
+ 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_13[16] = {
+ 0x4001, 0x4001, 0x4001, 0x4001, 0x4003, 0x4003, 0x4003, 0x4003,
+ 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005, 0x6005
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeroTables_14[16] = {
+ 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001,
+ 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003
+};
+
+/*
+ * Selector for the VLD tables used to decode the total number of zeros
+ * when sMaxNumCoeff != 4. Entry [k] is used for TotalCoeff == k + 1:
+ * armVCM4P10_DecodeCoeffsToPair_s.s loads the table pointer from
+ * (this array - 4) + TotalCoeff * 4.
+ */
+const OMX_U16 * armVCM4P10_CAVLCTotalZeroTables[15] = {
+ armVCM4P10_CAVLCTotalZeroTables_0, /* TotalCoeff=1 */
+ armVCM4P10_CAVLCTotalZeroTables_1,
+ armVCM4P10_CAVLCTotalZeroTables_2,
+ armVCM4P10_CAVLCTotalZeroTables_3,
+ armVCM4P10_CAVLCTotalZeroTables_4,
+ armVCM4P10_CAVLCTotalZeroTables_5,
+ armVCM4P10_CAVLCTotalZeroTables_6,
+ armVCM4P10_CAVLCTotalZeroTables_7,
+ armVCM4P10_CAVLCTotalZeroTables_8,
+ armVCM4P10_CAVLCTotalZeroTables_9,
+ armVCM4P10_CAVLCTotalZeroTables_10,
+ armVCM4P10_CAVLCTotalZeroTables_11,
+ armVCM4P10_CAVLCTotalZeroTables_12,
+ armVCM4P10_CAVLCTotalZeroTables_13,
+ armVCM4P10_CAVLCTotalZeroTables_14 /* TotalCoeff=15 */
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_0[16] = {
+ 0x2007, 0x2007, 0x2005, 0x2005, 0x4003, 0x4003, 0x4003, 0x4003,
+ 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_1[16] = {
+ 0x4005, 0x4005, 0x4005, 0x4005, 0x4003, 0x4003, 0x4003, 0x4003,
+ 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCTotalZeros2x2Tables_2[16] = {
+ 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003, 0x6003,
+ 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001, 0x6001
+};
+
+/*
+ * Selector for the VLD tables used to decode the total number of zeros
+ * when sMaxNumCoeff == 4. Entry [k] is used for TotalCoeff == k + 1:
+ * armVCM4P10_DecodeCoeffsToPair_s.s loads the table pointer from
+ * (this array - 4) + TotalCoeff * 4.
+ */
+const OMX_U16 * armVCM4P10_CAVLCTotalZeros2x2Tables[3] = {
+ armVCM4P10_CAVLCTotalZeros2x2Tables_0, /* TotalCoeff=1 */
+ armVCM4P10_CAVLCTotalZeros2x2Tables_1, /* TotalCoeff=2 */
+ armVCM4P10_CAVLCTotalZeros2x2Tables_2 /* TotalCoeff=3 */
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_0[8] = {
+ 0x4003, 0x4003, 0x4003, 0x4003, 0x4001, 0x4001, 0x4001, 0x4001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_1[8] = {
+ 0x2005, 0x2005, 0x2003, 0x2003, 0x4001, 0x4001, 0x4001, 0x4001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_2[8] = {
+ 0x2007, 0x2007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_3[8] = {
+ 0x0009, 0x0007, 0x2005, 0x2005, 0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_4[8] = {
+ 0x000b, 0x0009, 0x0007, 0x0005, 0x2003, 0x2003, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_5[8] = {
+ 0x0003, 0x0005, 0x0009, 0x0007, 0x000d, 0x000b, 0x2001, 0x2001
+};
+
+static const OMX_U16 armVCM4P10_CAVLCRunBeforeTables_6[24] = {
+ 0x0010, 0x000d, 0x000b, 0x0009, 0x0007, 0x0005, 0x0003, 0x0001,
+ 0x0018, 0x0011, 0x200f, 0x200f, 0x0020, 0x0015, 0x2013, 0x2013,
+ 0x0028, 0x0019, 0x2017, 0x2017, 0x07ff, 0x001d, 0x201b, 0x201b
+};
+
+/* Tables 7 to 14 are duplicates of table 6 */
+
+/*
+ * Selector for the VLD tables used to decode each run of zeros.
+ * Entry [k] is used for ZerosLeft == k + 1: the run loop in
+ * armVCM4P10_DecodeCoeffsToPair_s.s loads the table pointer from
+ * (this array - 4) + ZerosLeft * 4. Entries 6..14 all share table 6.
+ */
+const OMX_U16 * armVCM4P10_CAVLCRunBeforeTables[15] = {
+ armVCM4P10_CAVLCRunBeforeTables_0, /* ZerosLeft=1 */
+ armVCM4P10_CAVLCRunBeforeTables_1, /* ZerosLeft=2 */
+ armVCM4P10_CAVLCRunBeforeTables_2, /* ZerosLeft=3 */
+ armVCM4P10_CAVLCRunBeforeTables_3, /* ZerosLeft=4 */
+ armVCM4P10_CAVLCRunBeforeTables_4, /* ZerosLeft=5 */
+ armVCM4P10_CAVLCRunBeforeTables_5, /* ZerosLeft=6 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=7 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=8 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=9 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=10 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=11 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=12 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=13 */
+ armVCM4P10_CAVLCRunBeforeTables_6, /* ZerosLeft=14 */
+ armVCM4P10_CAVLCRunBeforeTables_6 /* ZerosLeft=15 */
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
new file mode 100755
index 0000000..4c3a77c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingChroma_unsafe_s.s
@@ -0,0 +1,198 @@
+;//
+;//
+;// File Name: armVCM4P10_DeblockingChroma_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+
+ IF CortexA8
+
+pAlpha RN 2
+pBeta RN 3
+
+pThresholds RN 5
+pBS RN 4
+bS3210 RN 6
+
+;// Pixels
+dP_0 DN D4.U8
+dP_1 DN D5.U8
+dP_2 DN D6.U8
+dP_3 DN D7.U8
+dQ_0 DN D8.U8
+dQ_1 DN D9.U8
+dQ_2 DN D10.U8
+dQ_3 DN D11.U8
+
+
+;// Filtering Decision
+dAlpha DN D0.U8
+dBeta DN D2.U8
+
+dFilt DN D16.U8
+dAqflg DN D12.U8
+dApflg DN D17.U8
+
+dAp0q0 DN D13.U8
+
+;// bSLT4
+dTC3210 DN D18.U8
+dTCs DN D31.S8
+dTC DN D31.U8
+
+dMask_0 DN D14.U8
+dMask_1 DN D15.U8
+dMask_4 DN D26.U16
+
+dTemp DN D28.U8
+dDummy DN D17.U8
+
+;// Computing P0,Q0
+qDq0p0 QN Q10.S16
+qDp1q1 QN Q11.S16
+qDelta QN Q10.S16 ; reuse qDq0p0
+dDelta DN D20.S8
+
+
+;// Computing P1,Q1
+qP_0n QN Q14.S16
+qQ_0n QN Q12.S16
+
+dQ_0n DN D24.U8
+dP_0n DN D29.U8
+
+;// bSGE4
+
+dHSp0q1 DN D13.U8
+dHSq0p1 DN D31.U8
+
+dBS3210 DN D28.U16
+
+dP_0t DN D13.U8 ;dHSp0q1
+dQ_0t DN D31.U8 ;Temp1
+
+dP_0n DN D29.U8
+dQ_0n DN D24.U8 ;Temp2
+
+;// Register usage for - armVCM4P10_DeblockingChromabSLT4_unsafe
+;//
+;// Inputs - Pixels - p0-p3: D4-D7, q0-q3: D8-D11 (only p0, p1, q0, q1 are read here)
+;// - Filter masks - filt: D16, dMask_0: D14, dMask_1: D15
+;// - Additional Params - pThresholds: r5, pAlpha: r2, pBeta: r3
+;//
+;// Outputs - Pixels - P0: D29, Q0: D24 (not merged with the unfiltered pixels in this routine)
+;// - Additional Params - pThresholds: r5 (advanced by the 4-byte load), alpha: D0, beta: D2 (reloaded)
+
+;// Registers Corrupted - D18-D31 (D0 and D2 are also rewritten, see Outputs)
+
+
+ M_START armVCM4P10_DeblockingChromabSLT4_unsafe
+
+
+ ;dTC3210 -18
+ ;dTemp-28
+
+ VLD1 d18.U32[0], [pThresholds]! ;// load four threshold bytes into lane 0 of dTC3210, post-increment pThresholds
+
+ ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+ ;// dDelta = ((qDp1q1 >> 2) + qDq0p0 + 1) >> 1
+
+ ;// qDp1q1-11
+ ;// qDq0p0-10
+ VSUBL qDp1q1, dP_1, dQ_1
+ VMOV dTemp, dTC3210
+ VSUBL qDq0p0, dQ_0, dP_0
+ VSHR qDp1q1, qDp1q1, #2
+ VZIP.8 dTC3210, dTemp ;// interleave with its copy: each threshold byte now fills two adjacent lanes
+
+ ;// qDelta-qDq0p0-10
+
+ ;// dTC = dTC3210 + dMask_1
+
+ ;// dTC3210-18
+ ;// dTemp-28
+ ;// dTC-31
+ VBIF dTC3210, dMask_0, dFilt ;// lanes where dFilt is clear take dMask_0 instead
+ VRHADD qDelta, qDp1q1, qDq0p0
+ VADD dTC, dTC3210, dMask_1
+ VQMOVN dDelta, qDelta
+ ;// dDelta-d20
+
+ ;// dDelta = armClip(-dTC, dTC, dDelta);
+ VLD1 {dAlpha[]}, [pAlpha]
+ VMIN dDelta, dDelta, dTCs
+ VNEG dTCs, dTCs
+ VLD1 {dBeta[]}, [pBeta]
+ ;1
+ VMAX dDelta, dDelta, dTCs
+
+ ;// dP_0n - 29
+ ;// dQ_0n - 24
+
+ ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 + delta);
+ ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta);
+
+ ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 + dDelta);
+ ;// dQ_0n = (OMX_U8)armClip(0, 255, dQ_0 - dDelta);
+
+ ;// qP_0n - 14
+ ;// qQ_0n - 12
+
+ VMOVL qP_0n, dP_0
+ VMOVL qQ_0n, dQ_0
+
+ ;1
+ VADDW qP_0n, qP_0n, dDelta
+ VSUBW qQ_0n, qQ_0n, dDelta
+
+ VQMOVUN dP_0n, qP_0n
+ VQMOVUN dQ_0n, qQ_0n
+
+ M_END
+
+;// Register usage for - armVCM4P10_DeblockingChromabSGE4_unsafe()
+;//
+;// Inputs - Pixels - p0-p3: D4-D7, q0-q3: D8-D11 (only p0, p1, q0, q1 are read here)
+;// - Additional Params - pThresholds: r5, pAlpha: r2, pBeta: r3
+;//
+;// Outputs - Pixels - P0: D13 (dP_0t), Q0: D31 (dQ_0t)
+;// - Additional Params - pThresholds: r5 (advanced by 4), alpha: D0, beta: D2 (reloaded)
+
+;// Registers Corrupted - D13, D31 (D0 and D2 are also rewritten, see Outputs)
+
+ M_START armVCM4P10_DeblockingChromabSGE4_unsafe
+
+ ;dHSq0p1 - 31
+ ;dHSp0q1 - 13
+ ;// dHSp0q1 = (p0 + q1) >> 1, dHSq0p1 = (q0 + p1) >> 1
+ VHADD dHSp0q1, dP_0, dQ_1
+ VHADD dHSq0p1, dQ_0, dP_1
+
+ ;// Prepare the bS mask
+
+ ;// dHSp0q1-13
+ ;// dP_0t-dHSp0q1-13
+ ;// dHSq0p1-31
+ ;// dQ_0t-Temp1-31
+ VLD1 {dAlpha[]}, [pAlpha]
+ ADD pThresholds, pThresholds, #4 ;// step over four threshold bytes without reading them
+ VLD1 {dBeta[]}, [pBeta]
+
+ ;// dP_0t = (dHSp0q1 + p1 + 1) >> 1, dQ_0t = (dHSq0p1 + q1 + 1) >> 1
+ VRHADD dP_0t, dHSp0q1, dP_1
+ VRHADD dQ_0t, dHSq0p1, dQ_1
+
+ M_END
+
+ ENDIF
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
new file mode 100755
index 0000000..0afe4fd
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s
@@ -0,0 +1,396 @@
+;//
+;//
+;// File Name: armVCM4P10_DeblockingLuma_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+
+ IF CortexA8
+
+pThresholds RN 5
+
+;// Pixels
+dP_0 DN D4.U8
+dP_1 DN D5.U8
+dP_2 DN D6.U8
+dP_3 DN D7.U8
+dQ_0 DN D8.U8
+dQ_1 DN D9.U8
+dQ_2 DN D10.U8
+dQ_3 DN D11.U8
+
+
+;// Filtering Decision
+dAlpha DN D0.U8
+
+dFilt DN D16.U8
+dAqflg DN D12.U8
+dApflg DN D17.U8
+
+dAp0q0 DN D13.U8
+
+;// bSLT4
+dTC0 DN D18.U8
+dTC1 DN D19.U8
+dTC01 DN D18.U8
+
+dTCs DN D31.S8
+dTC DN D31.U8
+
+dMask_0 DN D14.U8
+dMask_1 DN D15.U8
+
+dTemp DN D19.U8
+
+;// Computing P0,Q0
+qDq0p0 QN Q10.S16
+qDp1q1 QN Q11.S16
+qDelta QN Q10.S16 ; reuse qDq0p0
+dDelta DN D20.S8
+
+
+;// Computing P1,Q1
+dRp0q0 DN D24.U8
+
+dMaxP DN D23.U8
+dMinP DN D22.U8
+
+dMaxQ DN D19.U8
+dMinQ DN D21.U8
+
+dDeltaP DN D26.U8
+dDeltaQ DN D27.U8
+
+qP_0n QN Q14.S16
+qQ_0n QN Q12.S16
+
+dQ_0n DN D24.U8
+dQ_1n DN D25.U8
+dP_0n DN D29.U8
+dP_1n DN D30.U8
+
+;// bSGE4
+
+qSp0q0 QN Q10.U16
+
+qSp2q1 QN Q11.U16
+qSp0q0p1 QN Q12.U16
+qSp3p2 QN Q13.U16
+dHSp0q1 DN D28.U8
+
+qSq2p1 QN Q11.U16
+qSp0q0q1 QN Q12.U16
+qSq3q2 QN Q13.U16 ;!!
+dHSq0p1 DN D28.U8 ;!!
+
+qTemp1 QN Q11.U16 ;!!;qSp2q1
+qTemp2 QN Q12.U16 ;!!;qSp0q0p1
+
+dP_0t DN D28.U8 ;!!;dHSp0q1
+dQ_0t DN D22.U8 ;!!;Temp1
+
+dP_0n DN D29.U8
+dP_1n DN D30.U8
+dP_2n DN D31.U8
+
+dQ_0n DN D24.U8 ;!!;Temp2
+dQ_1n DN D25.U8 ;!!;Temp2
+dQ_2n DN D28.U8 ;!!;dQ_0t
+
+;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe
+;//
+;// Inputs - Pixels - p0-p3: D4-D7, q0-q3: D8-D11 (p3 and q3 are not read here)
+;// - Filter masks - filt: D16, aqflg: D12, apflg: D17, dMask_0: D14, dMask_1: D15
+;// - Additional Params - pThresholds: r5
+;//
+;// Outputs - Pixels - P0-P1: D29-D30, Q0-Q1: D24-D25
+;// - Additional Params - pThresholds: r5 (advanced by the two 1-byte loads)
+
+;// Registers Corrupted - D18-D31
+
+
+ M_START armVCM4P10_DeblockingLumabSLT4_unsafe
+
+
+ ;// qDp1q1-11
+ VSUBL qDp1q1, dP_1, dQ_1
+ VLD1 {dTC0[]}, [pThresholds]!
+ ;// qDq0p0-10
+ VSUBL qDq0p0, dQ_0, dP_0
+ VLD1 {dTC1[]}, [pThresholds]!
+
+ ;// dRp0q0-24
+ VSHR qDp1q1, qDp1q1, #2
+
+ ;// dTC01 = (dTC1 << 4) | dTC0
+ ;// dTC01-18
+ VEXT dTC01, dTC0, dTC1, #4 ;// lanes 0-3 come from dTC0, lanes 4-7 from dTC1
+ ;// dTemp-19
+ VAND dTemp, dApflg, dMask_1
+
+ VBIF dTC01, dMask_0, dFilt ;// lanes where dFilt is clear take dMask_0 instead
+
+
+ ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
+ ;// dDelta = ((qDp1q1 >> 2) + qDq0p0 + 1) >> 1
+
+ ;// qDelta-qDq0p0-10
+ VRHADD qDelta, qDp1q1, qDq0p0
+ VRHADD dRp0q0, dP_0, dQ_0
+ VADD dTC, dTC01, dTemp
+
+ ;// dTC = dTC01 + (dApflg & 1) + (dAqflg & 1)
+
+ VAND dTemp, dAqflg, dMask_1
+ VQADD dMaxP, dP_1, dTC01
+ VQMOVN dDelta, qDelta
+ VADD dTC, dTC, dTemp
+
+ ;// dMaxP = QADD(dP_1, dTC01)
+ ;// dMinP = QSUB(dP_1, dTC01)
+
+ ;// dMaxP-d23
+ ;// dMinP-d22
+ VQSUB dMinP, dP_1, dTC01
+
+ ;// dDelta-d20
+
+ ;// dMaxQ = QADD(dQ_1, dTC01)
+ ;// dMinQ = QSUB(dQ_1, dTC01)
+
+ ;// dMaxQ-19
+ ;// dMinQ-21
+ VQADD dMaxQ, dQ_1, dTC01
+ VHADD dDeltaP, dRp0q0, dP_2
+ VMIN dDelta, dDelta, dTCs
+
+ ;// dDelta = armClip(-dTC, dTC, dDelta); the VMAX half of the clip follows below
+ VNEG dTCs, dTCs
+
+ VQSUB dMinQ, dQ_1, dTC01
+
+ ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
+ ;// delta = armClip(-tC0, tC0, delta);
+ ;// pQ0[-2*Step] = (OMX_U8)(p1 + delta);
+
+ ;// dDeltaP = (dP_2 + dRp0q0)>>1;
+ ;// dP_1n = armClip(dP_1 - dTC01, dP_1 + dTC01, dDeltaP);
+ ;// dP_1n = armClip(MinP, MaxP, dDeltaP);
+
+ ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
+ ;// delta = armClip(-tC0, tC0, delta);
+ ;// pQ0[1*Step] = (OMX_U8)(q1 + delta);
+
+ ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;
+ ;// dQ_1n = armClip(dQ_1 - dTC01, dQ_1 + dTC01, dDeltaQ);
+ ;// dQ_1n = armClip(MinQ, MaxQ, dDeltaQ);
+
+ ;// dDeltaP-26
+ VHADD dDeltaQ, dRp0q0, dQ_2
+
+ ;// dDeltaQ-27
+
+ ;// dP_0n - 29
+ ;// dP_1n - 30
+ ;// dQ_0n - 24
+ ;// dQ_1n - 25
+
+ ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
+ ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;
+
+ VMAX dP_1n, dDeltaP, dMinP
+ VMAX dDelta, dDelta, dTCs
+
+ ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 + delta);
+ ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta);
+
+ ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 + dDelta);
+ ;// dQ_0n = (OMX_U8)armClip(0, 255, dQ_0 - dDelta);
+
+ ;// qP_0n - 14
+ ;// qQ_0n - 12
+
+ VMOVL qP_0n, dP_0
+ VMOVL qQ_0n, dQ_0
+
+ VADDW qP_0n, qP_0n, dDelta
+ VSUBW qQ_0n, qQ_0n, dDelta
+
+ VQMOVUN dP_0n, qP_0n
+ VQMOVUN dQ_0n, qQ_0n
+
+ VMAX dQ_1n, dDeltaQ, dMinQ
+
+ VMIN dP_1n, dP_1n, dMaxP
+ VMIN dQ_1n, dQ_1n, dMaxQ
+ ;// Keep the original pixel wherever the corresponding mask is clear
+ VBIF dP_0n, dP_0, dFilt
+
+ VBIF dP_1n, dP_1, dApflg
+ VBIF dQ_0n, dQ_0, dFilt
+ VBIF dQ_1n, dQ_1, dAqflg
+
+ M_END
+
+;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
+;//
+;// Inputs - Pixels - p0-p3: D4-D7, q0-q3: D8-D11
+;// - Filter masks - filt: D16, aqflg: D12, apflg: D17
+;// - Additional Params - alpha: D0, dMask_1: D15, dAp0q0: D13
+;//
+;// Outputs - Pixels - P0-P2: D29-D31, Q0-Q2: D24,D25,D28
+
+;// Registers Corrupted - D18-D31 (aqflg D12 and apflg D17 are also narrowed in place)
+
+ M_START armVCM4P10_DeblockingLumabSGE4_unsafe
+
+
+ ;// ap<beta && armAbs(p0-q0)<((alpha>>2)+2)
+ ;// aq<beta && armAbs(p0-q0)<((alpha>>2)+2)
+
+ ;// ( dApflg & dAp0q0 < (dAlpha >> 2 + 2) )
+ ;// ( dAqflg & dAp0q0 < (dAlpha >> 2 + 2) )
+
+ ;// ( dApflg = dApflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
+ ;// ( dAqflg = dAqflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
+
+ ;// P Filter
+
+ VSHR dTemp, dAlpha, #2
+ VADD dTemp, dTemp, dMask_1
+
+ ;// qSp0q0-10
+ VADDL qSp0q0, dQ_0, dP_0
+ VADD dTemp, dTemp, dMask_1
+
+ ;// qSp2q1-11
+ ;// qSp0q0p1-12
+ VADDL qSp2q1, dP_2, dQ_1
+ VADDW qSp0q0p1, qSp0q0, dP_1
+
+ VCGT dTemp, dTemp, dAp0q0
+ VSHR qSp2q1, #1
+
+ ;// pQ0[-1*Step] = (OMX_U8)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3);
+ ;// pQ0[-1*Step] = ( ( (p0 + q0 + p1) + (p2 + q1)>>1 ) >> 1 + 1 ) >> 1
+
+ ;// dP_0n = ( ( (qSp0q0 + dP_1) + qSp2q1>>1 ) >> 1 + 1 ) >> 1
+ ;// dP_0n = ( ( qSp0q0p1 + qSp2q1>>1 ) >> 1 + 1 ) >> 1
+ ;// dP_0n = ( qTemp1 + 1 ) >> 1
+
+ ;// pQ0[-2*Step] = (OMX_U8)((p2 + p1 + p0 + q0 + 2)>>2);
+
+ ;// dP_1n = (OMX_U8)((dP_2 + qSp0q0p1 + 2)>>2);
+ ;// dP_1n = (OMX_U8)((qTemp2 + 2)>>2);
+
+ ;// pQ0[-3*Step] = (OMX_U8)((2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
+ ;// pQ0[-3*Step] = (OMX_U8)(( (p3 + p2) + (p1 + p0 + q0 + p2) >> 1 + 2)>>2);
+
+ ;// dP_2n = (OMX_U8)(( qSp3p2 + (dP_2 + qSp0q0p1) >> 1 + 2) >> 2);
+ ;// dP_2n = (OMX_U8)(( qSp3p2 + qTemp2 >> 1 + 2) >> 2);
+
+ ;// qTemp1-qSp2q1-11
+ ;// qTemp2-qSp0q0p1-12
+ VHADD qTemp1, qSp0q0p1, qSp2q1
+ VADDW qTemp2, qSp0q0p1, dP_2
+
+ ;// qSp3p2-13
+ VADDL qSp3p2, dP_3, dP_2
+
+ VAND dApflg, dApflg, dTemp
+ VHADD dHSp0q1, dP_0, dQ_1
+ VSRA qSp3p2, qTemp2, #1
+ ;// dHSp0q1-28
+ VAND dAqflg, dAqflg, dTemp
+
+ ;// dP_0n-29
+ ;// dP_0t-dHSp0q1-28
+ VQRSHRN dP_0n, qTemp1, #1
+ VRHADD dP_0t, dHSp0q1, dP_1
+
+ ;// dP_1n-30
+ VQRSHRN dP_1n, qTemp2, #2
+
+ VADDL qSq2p1, dQ_2, dP_1
+ VADDW qSp0q0q1, qSp0q0, dQ_1
+
+ ;// Where apflg is clear, P0 takes dP_0t instead
+ VBIF dP_0n, dP_0t, dApflg
+
+ ;// Q Filter
+
+ ;// pQ0[0*Step] = (OMX_U8)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3);
+ ;// pQ0[0*Step] = ( ( (p0 + q0 + q1) + (q2 + p1)>>1 ) >> 1 + 1 ) >> 1
+
+ ;// dQ_0n = ( ( (qSp0q0 + dQ_1) + qSq2p1>>1 ) >> 1 + 1 ) >> 1
+ ;// dQ_0n = ( ( qSp0q0q1 + qSq2p1>>1 ) >> 1 + 1 ) >> 1
+ ;// dQ_0n = ( qTemp1 + 1 ) >> 1
+
+ ;// pQ0[1*Step] = (OMX_U8)((q2 + q1 + q0 + p0 + 2)>>2);
+
+ ;// dQ_1n = (OMX_U8)((dQ_2 + qSp0q0q1 + 2)>>2);
+ ;// dQ_1n = (OMX_U8)((qTemp2 + 2)>>2);
+
+ ;// pQ0[2*Step] = (OMX_U8)((2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3);
+ ;// pQ0[2*Step] = (OMX_U8)(( (q3 + q2) + (q1 + p0 + q0 + q2) >> 1 + 2)>>2);
+
+ ;// dQ_2n = (OMX_U8)(( qSq3q2 + (dQ_2 + qSp0q0q1) >> 1 + 2) >> 2);
+ ;// dQ_2n = (OMX_U8)(( qSq3q2 + qTemp2 >> 1 + 2) >> 2);
+
+ ;// qTemp1-qSp2q1-11
+ ;// qTemp2-qSp0q0p1-12
+ ;// qSq2p1-11
+ ;// qSp0q0q1-12
+
+
+ ;// qTemp2-qSp0q0p1-12
+ ;// qTemp1-qSq2p1-11
+ ;// qSq3q2-13
+ ;// dP_2n-31
+
+ VQRSHRN dP_2n, qSp3p2, #2
+ VADDL qSq3q2, dQ_3, dQ_2
+
+ VSHR qSq2p1, #1
+
+ VHADD qTemp1, qSp0q0q1, qSq2p1
+ VADDW qTemp2, qSp0q0q1, dQ_2
+
+ ;// dHSq0p1-28
+ VHADD dHSq0p1, dQ_0, dP_1
+
+ ;// Keep the original pixel wherever the corresponding mask is clear
+ VBIF dP_0n, dP_0, dFilt
+ VBIF dP_1n, dP_1, dApflg
+
+ VSRA qSq3q2, qTemp2, #1
+
+ ;// dQ_1-Temp2-25
+ ;// dQ_0-Temp2-24
+ VQRSHRN dQ_1n, qTemp2, #2
+ VQRSHRN dQ_0n, qTemp1, #1
+
+ ;// dQ_0t-Temp1-22
+ VRHADD dQ_0t, dHSq0p1, dQ_1
+ VBIF dQ_1n, dQ_1, dAqflg
+
+ VBIF dP_2n, dP_2, dApflg
+ VBIF dQ_0n, dQ_0t, dAqflg
+ VQRSHRN dQ_2n, qSq3q2, #2
+ VBIF dQ_0n, dQ_0, dFilt
+ VBIF dQ_2n, dQ_2, dAqflg
+
+ M_END
+
+ ENDIF
+
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
new file mode 100755
index 0000000..10a89e9
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DecodeCoeffsToPair_s.s
@@ -0,0 +1,325 @@
+;//
+;//
+;// File Name: armVCM4P10_DecodeCoeffsToPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armCOMM_BitDec_s.h
+
+ IMPORT armVCM4P10_CAVLCCoeffTokenTables
+ IMPORT armVCM4P10_CAVLCTotalZeroTables
+ IMPORT armVCM4P10_CAVLCTotalZeros2x2Tables
+ IMPORT armVCM4P10_CAVLCRunBeforeTables
+ IMPORT armVCM4P10_SuffixToLevel
+ IMPORT armVCM4P10_ZigZag_4x4
+ IMPORT armVCM4P10_ZigZag_2x2
+
+ M_VARIANTS ARM1136JS
+
+;//DEBUG_ON SETL {TRUE}
+
+LAST_COEFF EQU 0x20 ;// End of block flag
+TWO_BYTE_COEFF EQU 0x10
+
+;// Declare input registers
+
+ppBitStream RN 0
+pOffset RN 1
+pNumCoeff RN 2
+ppPosCoefbuf RN 3
+nC RN 4 ;// number of coeffs or 17 for chroma
+sMaxNumCoeff RN 5
+
+;// Declare inner loop registers
+
+;// Level loop
+Count RN 0
+TrailingOnes RN 1
+pLevel RN 2
+LevelSuffix RN 3
+SuffixLength RN 4
+TotalCoeff RN 5
+
+pVLDTable RN 6
+Symbol RN 7
+T1 RN 8
+T2 RN 9
+RBitStream RN 10
+RBitBuffer RN 11
+RBitCount RN 12
+lr RN 14
+
+;// Run loop
+Count RN 0
+ZerosLeft RN 1
+pLevel RN 2
+ppRunTable RN 3
+pRun RN 4
+TotalCoeff RN 5
+
+pVLDTable RN 6
+Symbol RN 7
+T1 RN 8
+T2 RN 9
+RBitStream RN 10
+RBitBuffer RN 11
+RBitCount RN 12
+lr RN 14
+
+;// Fill in coefficients loop
+pPosCoefbuf RN 0
+temp RN 1
+pLevel RN 2
+ppPosCoefbuf RN 3
+pRun RN 4
+TotalCoeff RN 5
+pZigZag RN 6
+
+T1 RN 8
+T2 RN 9
+RBitStream RN 10
+RBitBuffer RN 11
+RBitCount RN 12
+CoeffNum RN 14
+
+
+
+ IF ARM1136JS
+
+ ;// armVCM4P10_DecodeCoeffsToPair
+ ;//
+ ;// Decodes one block of coefficients from the bitstream and appends them
+ ;// to the buffer at *ppPosCoefbuf as (position/flags byte, level) records.
+ ;// The level is written as one byte, or two when TWO_BYTE_COEFF is set.
+ ;//
+ ;// Register args: r0 ppBitStream, r1 pOffset, r2 pNumCoeff, r3 ppPosCoefbuf
+ ;// Stack args: nC (index into armVCM4P10_CAVLCCoeffTokenTables), sMaxNumCoeff
+ ;// Returns: r0 = OMX_Sts_NoErr, or OMX_Sts_Err when a decoded symbol is out of range
+ ;//
+ ;// NOTE(review): the EndBadSymbol path branches past M_BD_FINI, so the
+ ;// bitstream position appears not to be written back on error - confirm
+ ;// that callers do not rely on it.
+
+ ;// Allocate stack memory required by the function
+ M_ALLOC4 pppBitStream, 4
+ M_ALLOC4 ppOffset, 4
+ M_ALLOC4 pppPosCoefbuf, 4
+ M_ALLOC4 ppLevel, 16*2
+ M_ALLOC4 ppRun, 16
+
+ ;// Write function header
+ M_START armVCM4P10_DecodeCoeffsToPair, r11
+
+ ;// Define stack arguments
+ M_ARG pNC, 4
+ M_ARG pSMaxNumCoeff,4
+
+ ;// Code start
+ M_BD_INIT0 ppBitStream, pOffset, RBitStream, RBitBuffer, RBitCount
+ LDR pVLDTable, =armVCM4P10_CAVLCCoeffTokenTables
+ M_LDR nC, pNC
+
+ M_BD_INIT1 T1, T2, lr
+ LDR pVLDTable, [pVLDTable, nC, LSL #2] ;// Find VLD table
+
+ M_BD_INIT2 T1, T2, lr
+
+ ;// Decode Symbol = TotalCoeff*4 + TrailingOnes
+ M_BD_VLD Symbol, T1, T2, pVLDTable, 4, 2
+
+ MOVS TotalCoeff, Symbol, LSR #2
+ STRB TotalCoeff, [pNumCoeff]
+ M_PRINTF "TotalCoeff=%d\n", TotalCoeff
+ BEQ.W EndNoError ;// Finished if no coefficients
+
+ CMP Symbol, #17*4 ;// TotalCoeff must be at most 16 (ppLevel/ppRun hold 16 entries)
+ BGE.W EndBadSymbol ;// Error if bad symbol
+
+ ;// Save bitstream pointers
+ M_STR ppBitStream, pppBitStream
+ M_STR pOffset, ppOffset
+ M_STR ppPosCoefbuf, pppPosCoefbuf
+
+ ;// Decode Trailing Ones
+ ANDS TrailingOnes, Symbol, #3
+ M_ADR pLevel, ppLevel
+ M_PRINTF "TrailingOnes=%d\n", TrailingOnes
+ BEQ TrailingOnesDone
+ MOV Count, TrailingOnes
+TrailingOnesLoop
+ M_BD_READ8 Symbol, 1, T1
+ SUBS Count, Count, #1
+ MOV T1, #1
+ SUB T1, T1, Symbol, LSL #1 ;// Level = 1 - 2*sign bit, i.e. +1 or -1
+ M_PRINTF "Level=%d\n", T1
+ STRH T1, [pLevel], #2
+ BGT TrailingOnesLoop
+TrailingOnesDone
+
+ ;// Decode level values
+ SUBS Count, TotalCoeff, TrailingOnes ;// Number of levels to read
+ BEQ DecodeRuns ;// None left
+
+ MOV SuffixLength, #1
+ CMP TotalCoeff, #10
+ MOVLE SuffixLength, #0
+ CMP TrailingOnes, #3 ;// if (TrailingOnes<3)
+ MOVLT TrailingOnes, #4 ;// then TrailingOnes = +4
+ MOVGE TrailingOnes, #2 ;// else TrailingOnes = +2
+ MOVGE SuffixLength, #0 ;// SuffixLength = 0
+
+LevelLoop
+ M_BD_CLZ16 Symbol, T1, T2 ;// Symbol=LevelPrefix
+ CMP Symbol,#16
+ BGE EndBadSymbol
+
+ MOVS lr, SuffixLength ;// if LevelSuffixSize==0
+ TEQEQ Symbol, #14 ;// and LevelPrefix==14
+ MOVEQ lr, #4 ;// then LevelSuffixSize=4
+ TEQ Symbol, #15 ;// if LevelPrefix==15
+ MOVEQ lr, #12 ;// then LevelSuffixSize=12
+
+ TEQEQ SuffixLength,#0 ;// if LevelPrefix==15 and SuffixLength==0
+ ADDEQ Symbol,Symbol,#15 ;// then add 15 to the prefix part
+
+ TEQ lr, #0 ;// if LevelSuffixSize==0
+ BEQ LevelCodeRead ;// LevelCode = LevelPrefix
+
+ M_BD_VREAD16 LevelSuffix, lr, T1, T2 ;// Read Level Suffix
+
+ MOV Symbol, Symbol, LSL SuffixLength
+ ADD Symbol, LevelSuffix, Symbol
+
+LevelCodeRead
+ ;// Symbol = LevelCode
+ ADD Symbol, Symbol, TrailingOnes ;// +4 if level cannot be +/-1, +2 o/w
+ MOV TrailingOnes, #2
+ MOVS T1, Symbol, LSR #1
+ RSBCS T1, T1, #0 ;// If Symbol odd then negate
+ M_PRINTF "Level=%d\n", T1
+ STRH T1, [pLevel], #2 ;// Store level.
+
+ LDR T2, =armVCM4P10_SuffixToLevel
+ LDRSB T1, [T2, SuffixLength] ;// Find increment level
+ TEQ SuffixLength, #0
+ MOVEQ SuffixLength, #1
+ CMP Symbol, T1
+ ADDCS SuffixLength, SuffixLength, #1
+ SUBS Count, Count, #1
+ BGT LevelLoop
+
+DecodeRuns
+ ;// Find number of zeros
+ M_LDR T1, pSMaxNumCoeff ;// sMaxNumCoeff
+ SUB Count, TotalCoeff, #1 ;// Number of runs excluding last
+ SUBS ZerosLeft, T1, TotalCoeff ;// Maximum number of zeros there could be
+ M_ADR pRun, ppRun
+ MOV CoeffNum,TotalCoeff
+ SUB CoeffNum,CoeffNum,#1
+ BEQ NoZerosLeft
+
+ ;// Unpack number of zeros from bitstream
+ ;// The table arrays are indexed from (base - 4) so that TotalCoeff==1 selects entry 0
+ TEQ T1, #4
+ LDREQ pVLDTable, =(armVCM4P10_CAVLCTotalZeros2x2Tables-4)
+ LDRNE pVLDTable, =(armVCM4P10_CAVLCTotalZeroTables-4)
+ LDR pVLDTable, [pVLDTable, TotalCoeff, LSL #2]
+
+ M_BD_VLD Symbol, T1, T2, pVLDTable, 4, 2 ;// Symbol = ZerosLeft
+ CMP Symbol,#16
+ BGE EndBadSymbol
+
+ ;// NOTE(review): the decoded zero count is only checked against 16, not
+ ;// against sMaxNumCoeff - TotalCoeff, and a run larger than ZerosLeft is
+ ;// not rejected in RunLoop; CoeffNum is later used as an index into the
+ ;// zig-zag table - confirm callers/tables guarantee it stays in range.
+ LDR ppRunTable, =(armVCM4P10_CAVLCRunBeforeTables-4)
+ M_ADR pRun, ppRun
+ MOVS ZerosLeft, Symbol
+
+ ADD CoeffNum,CoeffNum,ZerosLeft
+
+ BEQ NoZerosLeft
+
+ ;// Decode runs while zeros are left and more than one coefficient
+RunLoop
+ SUBS Count, Count, #1
+ LDR pVLDTable, [ppRunTable, ZerosLeft, LSL#2]
+ BLT LastRun
+ M_BD_VLD Symbol, T1, T2, pVLDTable, 3, 2 ;// Symbol = Run
+ CMP Symbol,#15
+ BGE EndBadSymbol
+
+ SUBS ZerosLeft, ZerosLeft, Symbol
+ M_PRINTF "Run=%d\n", Symbol
+ STRB Symbol, [pRun], #1
+ BGT RunLoop
+
+ ;// Decode runs while no zeros are left
+NoZerosLeft
+ SUBS Count, Count, #1
+ M_PRINTF "Run=%d\n", ZerosLeft
+ STRGEB ZerosLeft, [pRun], #1
+ BGT NoZerosLeft
+
+LastRun
+ ;// Final run length is remaining zeros
+ M_PRINTF "LastRun=%d\n", ZerosLeft
+ STRB ZerosLeft, [pRun], #1
+
+ ;// Write coefficients to output array
+ M_LDR T1, pSMaxNumCoeff ;// sMaxNumCoeff
+ TEQ T1, #15
+ ADDEQ CoeffNum,CoeffNum,#1 ;// shift positions up by one when sMaxNumCoeff==15
+
+
+ ;// Rewind pRun (1 byte per entry) and pLevel (2 bytes per entry) to their first entries
+ SUB pRun,pRun,TotalCoeff
+ SUB pLevel,pLevel,TotalCoeff
+ SUB pLevel,pLevel,TotalCoeff
+
+ M_LDR ppPosCoefbuf, pppPosCoefbuf
+ LDR pPosCoefbuf, [ppPosCoefbuf]
+ TEQ T1, #4
+ LDREQ pZigZag, =armVCM4P10_ZigZag_2x2
+ LDRNE pZigZag, =armVCM4P10_ZigZag_4x4
+
+
+
+OutputLoop
+
+ LDRB T2, [pRun],#1
+ LDRB T1, [pZigZag, CoeffNum]
+ SUB CoeffNum, CoeffNum, #1 ;// Skip Non zero
+ SUB CoeffNum, CoeffNum, T2 ;// Skip Zero run
+
+ LDRSH T2, [pLevel],#2
+
+ SUBS TotalCoeff, TotalCoeff, #1
+ ORREQ T1, T1, #LAST_COEFF
+
+ ;// Carry set when the level is outside -128..127, i.e. needs two bytes
+ ADD temp, T2, #128
+ CMP temp, #256
+ ORRCS T1, T1, #TWO_BYTE_COEFF
+
+
+ TEQ TotalCoeff, #0 ;// Preserves carry
+
+ M_PRINTF "Output=%02x %04x\n", T1, T2
+ STRB T1, [pPosCoefbuf], #1
+ STRB T2, [pPosCoefbuf], #1
+ MOV T2, T2, LSR #8
+ STRCSB T2, [pPosCoefbuf], #1
+ BNE OutputLoop
+
+ ;// Finished
+ STR pPosCoefbuf, [ppPosCoefbuf]
+ M_LDR ppBitStream, pppBitStream
+ M_LDR pOffset, ppOffset
+ B EndNoError
+
+EndBadSymbol
+ MOV r0, #OMX_Sts_Err
+ B End
+
+EndNoError
+ ;// Finished reading from the bitstream
+ M_BD_FINI ppBitStream, pOffset
+
+ ;// Set return value
+ MOV r0, #OMX_Sts_NoErr
+End
+ M_END
+
+ ENDIF
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s
new file mode 100755
index 0000000..2761600
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_DequantTables_s.s
@@ -0,0 +1,123 @@
+;//
+;//
+;// File Name: armVCM4P10_DequantTables_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ EXPORT armVCM4P10_QPDivTable
+ EXPORT armVCM4P10_VMatrixQPModTable
+ EXPORT armVCM4P10_PosToVCol4x4
+ EXPORT armVCM4P10_PosToVCol2x2
+ EXPORT armVCM4P10_VMatrix
+ EXPORT armVCM4P10_QPModuloTable
+ EXPORT armVCM4P10_VMatrixU16
+
+;// Define the processor variants supported by this file
+
+ M_VARIANTS CortexA8
+
+
+;// Guarding implementation by the processor name
+
+
+ IF CortexA8
+
+
+ M_TABLE armVCM4P10_PosToVCol4x4
+ DCB 0, 2, 0, 2
+ DCB 2, 1, 2, 1
+ DCB 0, 2, 0, 2
+ DCB 2, 1, 2, 1
+
+
+ M_TABLE armVCM4P10_PosToVCol2x2
+ DCB 0, 2
+ DCB 2, 1
+
+
+ M_TABLE armVCM4P10_VMatrix
+ DCB 10, 16, 13
+ DCB 11, 18, 14
+ DCB 13, 20, 16
+ DCB 14, 23, 18
+ DCB 16, 25, 20
+ DCB 18, 29, 23
+
+;//-------------------------------------------------------
+;// This table evaluates the expression [(INT)(QP/6)],
+;// for values of QP from 0 to 51 (inclusive).
+;//-------------------------------------------------------
+
+ M_TABLE armVCM4P10_QPDivTable
+ DCB 0, 0, 0, 0, 0, 0
+ DCB 1, 1, 1, 1, 1, 1
+ DCB 2, 2, 2, 2, 2, 2
+ DCB 3, 3, 3, 3, 3, 3
+ DCB 4, 4, 4, 4, 4, 4
+ DCB 5, 5, 5, 5, 5, 5
+ DCB 6, 6, 6, 6, 6, 6
+ DCB 7, 7, 7, 7, 7, 7
+ DCB 8, 8, 8, 8, 8, 8
+
+;//----------------------------------------------------
+;// This table contains armVCM4P10_VMatrix[QP%6][0] entries,
+;// for values of QP from 0 to 51 (inclusive).
+;//----------------------------------------------------
+
+ M_TABLE armVCM4P10_VMatrixQPModTable
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+ DCB 10, 11, 13, 14, 16, 18
+
+;//-------------------------------------------------------
+;// This table evaluates the modulus expression [QP%6]*6,
+;// for values of QP from 0 to 51 (inclusive).
+;//-------------------------------------------------------
+
+ M_TABLE armVCM4P10_QPModuloTable
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+ DCB 0, 6, 12, 18, 24, 30
+
+;//-------------------------------------------------------
+;// This table contains the individual byte values stored as
+;// halfwords. This avoids unpacking inside the function
+;//-------------------------------------------------------
+
+ M_TABLE armVCM4P10_VMatrixU16
+ DCW 10, 16, 13
+ DCW 11, 18, 14
+ DCW 13, 20, 16
+ DCW 14, 23, 18
+ DCW 16, 25, 20
+ DCW 18, 29, 23
+
+ ENDIF ;//CortexA8
+
+
+
+
+ END \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
new file mode 100755
index 0000000..6e912d7
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Align_unsafe_s.s
@@ -0,0 +1,236 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_Align_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS ARM1136JS
+
+ EXPORT armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+ EXPORT armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+
+DEBUG_ON SETL {FALSE}
+
+ IF ARM1136JS
+
+;// Declare input registers
+pSrc RN 0
+srcStep RN 1
+pDst RN 8
+iHeight RN 9
+
+;// Declare inner loop registers
+x RN 7
+x0 RN 7
+x1 RN 10
+x2 RN 11
+Scratch RN 12
+
+;// Function:
+;// armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+;//
+;// Copies rows from an arbitrarily aligned source location (pSrc) to a 4-byte aligned
+;// destination (pDst) for horizontal interpolation.
+;// Horizontal interpolation needs 9 bytes per row; each row is stored as three words
+;// (12 bytes), which is the step returned in r1.
+;//
+;// Registers used as input for this function
+;// r0,r1,r8,r9 where r8 contains the aligned memory pointer and r9 the number of rows to copy
+;//
+;// Registers preserved for top level function
+;// r2,r3,r4,r5,r6
+;//
+;// Registers modified by the function
+;// r7,r8,r9,r10,r11,r12
+;//
+;// Output registers
+;// r0 - pointer to the new aligned location which will be used as pSrc
+;// r1 - step size to this aligned location
+
+ ;// Function header
+ M_START armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+
+ ;// Copy pDst to scratch
+ MOV Scratch, pDst
+
+StartAlignedStackCopy
+ ;// x = byte offset of pSrc within its word; pSrc is rounded down to a word boundary
+ AND x, pSrc, #3
+ BIC pSrc, pSrc, #3
+
+ M_SWITCH x
+ M_CASE Copy0toAligned
+ M_CASE Copy1toAligned
+ M_CASE Copy2toAligned
+ M_CASE Copy3toAligned
+ M_ENDSWITCH
+
+Copy0toAligned
+ LDM pSrc, {x0, x1, x2}
+ SUBS iHeight, iHeight, #1
+ ADD pSrc, pSrc, srcStep
+
+ ;// One cycle stall
+
+ STM pDst!, {x0, x1, x2} ;// Store aligned output row
+ BGT Copy0toAligned
+ B CopyEnd
+
+Copy1toAligned
+ LDM pSrc, {x0, x1, x2}
+ SUBS iHeight, iHeight, #1
+ ADD pSrc, pSrc, srcStep
+
+ ;// One cycle stall
+
+ ;// Shift the three words down by one byte, carrying the low byte of the next word in
+ MOV x0, x0, LSR #8
+ ORR x0, x0, x1, LSL #24
+ MOV x1, x1, LSR #8
+ ORR x1, x1, x2, LSL #24
+ MOV x2, x2, LSR #8
+ STM pDst!, {x0, x1, x2} ;// Store aligned output row
+ BGT Copy1toAligned
+ B CopyEnd
+
+Copy2toAligned
+ LDM pSrc, {x0, x1, x2}
+ SUBS iHeight, iHeight, #1
+ ADD pSrc, pSrc, srcStep
+
+ ;// One cycle stall
+
+ ;// Shift the three words down by two bytes, carrying the low halfword of the next word in
+ MOV x0, x0, LSR #16
+ ORR x0, x0, x1, LSL #16
+ MOV x1, x1, LSR #16
+ ORR x1, x1, x2, LSL #16
+ MOV x2, x2, LSR #16
+ STM pDst!, {x0, x1, x2} ;// Store aligned output row
+ BGT Copy2toAligned
+ B CopyEnd
+
+Copy3toAligned
+ LDM pSrc, {x0, x1, x2}
+ SUBS iHeight, iHeight, #1
+ ADD pSrc, pSrc, srcStep
+
+ ;// One cycle stall
+
+ ;// Shift the three words down by three bytes, carrying the low three bytes of the next word in
+ MOV x0, x0, LSR #24
+ ORR x0, x0, x1, LSL #8
+ MOV x1, x1, LSR #24
+ ORR x1, x1, x2, LSL #8
+ MOV x2, x2, LSR #24
+ STM pDst!, {x0, x1, x2} ;// Store aligned output row
+ BGT Copy3toAligned
+
+CopyEnd
+
+ ;// Return the start of the aligned copy and its 12-byte row step
+ MOV pSrc, Scratch
+ MOV srcStep, #12
+
+ M_END
+
+
+;// Function:
+;// armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+;//
+;// Copies rows from an arbitrarily aligned source location (pSrc) to an aligned
+;// destination (pDst) for vertical interpolation.
+;// Each row copied is 4 bytes (one word) wide.
+;//
+;// Registers used as input for this function
+;// r0,r1,r8,r9 where r8 contains the aligned memory pointer and r9 the number of rows to copy
+;//
+;// Registers preserved for top level function
+;// r2,r3,r4,r5,r6
+;//
+;// Registers modified by the function
+;// r7,r8,r9,r10,r11,r12
+;//
+;// Output registers
+;// r0 - pointer to the new aligned location which will be used as pSrc
+;// r1 - step size to this aligned location
+
+ ;// Function header
+ M_START armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+
+ ;// Copy rows from pSrc to the aligned buffer at pDst
+StartVAlignedStackCopy
+ ;// x = byte offset of pSrc within its word; pSrc is rounded down to a word boundary
+ AND x, pSrc, #3
+ BIC pSrc, pSrc, #3
+
+
+ M_SWITCH x
+ M_CASE Copy0toVAligned
+ M_CASE Copy1toVAligned
+ M_CASE Copy2toVAligned
+ M_CASE Copy3toVAligned
+ M_ENDSWITCH
+
+Copy0toVAligned
+ M_LDR x0, [pSrc], srcStep
+ SUBS iHeight, iHeight, #1
+
+ ;// One cycle stall
+
+ STR x0, [pDst], #4 ;// Store aligned output row
+ BGT Copy0toVAligned
+ B CopyVEnd
+
+Copy1toVAligned
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ SUBS iHeight, iHeight, #1
+
+ ;// One cycle stall
+
+ ;// Output word = upper three bytes of x0 followed by the low byte of x1
+ MOV x1, x1, LSL #24
+ ORR x0, x1, x0, LSR #8
+ STR x0, [pDst], #4 ;// Store aligned output row
+ BGT Copy1toVAligned
+ B CopyVEnd
+
+Copy2toVAligned
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ SUBS iHeight, iHeight, #1
+
+ ;// One cycle stall
+
+ ;// Output word = upper halfword of x0 followed by the low halfword of x1
+ MOV x1, x1, LSL #16
+ ORR x0, x1, x0, LSR #16
+ STR x0, [pDst], #4 ;// Store aligned output row
+ BGT Copy2toVAligned
+ B CopyVEnd
+
+Copy3toVAligned
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ SUBS iHeight, iHeight, #1
+
+ ;// One cycle stall
+
+ ;// Output word = top byte of x0 followed by the low three bytes of x1
+ MOV x1, x1, LSL #8
+ ORR x0, x1, x0, LSR #24
+ STR x0, [pDst], #4 ;// Store aligned output row
+ BGT Copy3toVAligned
+
+CopyVEnd
+
+ ;// Returned pSrc is 28 bytes (7 rows of 4 bytes) before the end of the copied data.
+ ;// NOTE(review): this constant is independent of iHeight; presumably callers always
+ ;// copy 9 rows so that pSrc lands on the third row - confirm against the caller.
+ SUB pSrc, pDst, #28
+ MOV srcStep, #4
+
+ M_END
+
+
+ ENDIF
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
new file mode 100755
index 0000000..d275891
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
@@ -0,0 +1,149 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_Copy_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+;// Function:
+;// armVCM4P10_InterpolateLuma_Copy4x4_unsafe
+;//
+;// Implements copy from an arbitrarily aligned source memory location (pSrc) to an aligned
+;// destination pointed to by (pDst)
+;//
+;// Registers preserved for top level function
+;// r1,r3,r4,r5,r6,r7,r10,r11,r14
+;//
+;// Registers modified by the function
+;// r0,r2,r8,r9,r12
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS ARM1136JS
+
+ EXPORT armVCM4P10_InterpolateLuma_Copy4x4_unsafe
+
+;// Declare input registers
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+
+;// Declare other intermediate registers
+x0 RN 4
+x1 RN 5
+x2 RN 8
+x3 RN 9
+Temp RN 12
+
+ IF ARM1136JS
+
+ M_START armVCM4P10_InterpolateLuma_Copy4x4_unsafe, r6
+
+Copy4x4Start
+ ;// Copy four 4-byte rows from pSrc (any byte alignment, stepping by srcStep) to pDst (stepping by dstStep): Temp = pSrc & 3 selects the variant and pSrc is rounded down to a word boundary; for offset N each row is rebuilt as (word at pSrc) >> 8*N | (word at pSrc+4) << (32-8*N); every variant ends at Copy4x4End
+ AND Temp, pSrc, #3
+ BIC pSrc, pSrc, #3
+
+ M_SWITCH Temp
+ M_CASE Copy4x4Align0
+ M_CASE Copy4x4Align1
+ M_CASE Copy4x4Align2
+ M_CASE Copy4x4Align3
+ M_ENDSWITCH
+
+Copy4x4Align0
+ M_LDR x0, [pSrc], srcStep
+ M_LDR x1, [pSrc], srcStep
+ M_STR x0, [pDst], dstStep
+ M_LDR x2, [pSrc], srcStep
+ M_STR x1, [pDst], dstStep
+ M_LDR x3, [pSrc], srcStep
+ M_STR x2, [pDst], dstStep
+ M_STR x3, [pDst], dstStep
+ B Copy4x4End
+
+Copy4x4Align1
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ LDR x3, [pSrc, #4]
+ M_LDR x2, [pSrc], srcStep
+ MOV x0, x0, LSR #8
+ ORR x0, x0, x1, LSL #24
+ M_STR x0, [pDst], dstStep
+ MOV x2, x2, LSR #8
+ ORR x2, x2, x3, LSL #24
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ M_STR x2, [pDst], dstStep
+ LDR x3, [pSrc, #4]
+ M_LDR x2, [pSrc], srcStep
+ MOV x0, x0, LSR #8
+ ORR x0, x0, x1, LSL #24
+ M_STR x0, [pDst], dstStep
+ MOV x2, x2, LSR #8
+ ORR x2, x2, x3, LSL #24
+ M_STR x2, [pDst], dstStep
+ B Copy4x4End
+
+Copy4x4Align2
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ LDR x3, [pSrc, #4]
+ M_LDR x2, [pSrc], srcStep
+ MOV x0, x0, LSR #16
+ ORR x0, x0, x1, LSL #16
+ M_STR x0, [pDst], dstStep
+ MOV x2, x2, LSR #16
+ ORR x2, x2, x3, LSL #16
+ M_STR x2, [pDst], dstStep
+
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ LDR x3, [pSrc, #4]
+ M_LDR x2, [pSrc], srcStep
+ MOV x0, x0, LSR #16
+ ORR x0, x0, x1, LSL #16
+ M_STR x0, [pDst], dstStep
+ MOV x2, x2, LSR #16
+ ORR x2, x2, x3, LSL #16
+ M_STR x2, [pDst], dstStep
+ B Copy4x4End
+
+Copy4x4Align3
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ LDR x3, [pSrc, #4]
+ M_LDR x2, [pSrc], srcStep
+ MOV x0, x0, LSR #24
+ ORR x0, x0, x1, LSL #8
+ M_STR x0, [pDst], dstStep
+ MOV x2, x2, LSR #24
+ ORR x2, x2, x3, LSL #8
+ M_STR x2, [pDst], dstStep
+
+ LDR x1, [pSrc, #4]
+ M_LDR x0, [pSrc], srcStep
+ LDR x3, [pSrc, #4]
+ M_LDR x2, [pSrc], srcStep
+ MOV x0, x0, LSR #24
+ ORR x0, x0, x1, LSL #8
+ M_STR x0, [pDst], dstStep
+ MOV x2, x2, LSR #24
+ ORR x2, x2, x3, LSL #8
+ M_STR x2, [pDst], dstStep
+ B Copy4x4End
+
+Copy4x4End
+ M_END
+
+ ENDIF
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
new file mode 100755
index 0000000..4e5a39d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
@@ -0,0 +1,178 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS ARM1136JS
+
+ EXPORT armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe
+ EXPORT armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe
+
+;// Functions:
+;// armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe and
+;// armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe
+;//
+;// Implements re-arrangement of data from temporary buffer to a buffer pointed to by pBuf.
+;// This will do the conversion of data from 16 bit to 8 bit and it also
+;// removes offset and checks for saturation.
+;//
+;// Registers used as input for this function
+;// r0,r1,r7 where r0 is input pointer and r1 its step size, r7 is output pointer
+;//
+;// Registers preserved for top level function
+;// r4,r5,r6,r8,r9,r14
+;//
+;// Registers modified by the function
+;// r7,r10,r11,r12
+;//
+;// Output registers
+;// r0 - pointer to the destination location
+;// r1 - step size to this destination location
+
+
+DEBUG_ON SETL {FALSE}
+
+MASK EQU 0x80808080 ;// Mask is used to implement (a+b+1)/2
+
+;// Declare input registers
+
+pSrc0 RN 0
+srcStep0 RN 1
+
+;// Declare other intermediate registers
+Temp1 RN 4
+Temp2 RN 5
+Temp3 RN 10
+Temp4 RN 11
+pBuf RN 7
+r0x0fe00fe0 RN 6
+r0x00ff00ff RN 12
+Count RN 14
+ValueA0 RN 10
+ValueA1 RN 11
+
+ IF ARM1136JS
+
+
+ ;// Function header: reads 4 rows of 8 halfwords from pSrc0 (stepping by srcStep0), reduces every halfword to a byte and writes 8 bytes per row at pBuf; on exit pSrc0 = start of the written data (pBuf - 32) and srcStep0 = 8
+ M_START armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe, r6
+
+ ;// Code start: Count = 4 rows; per halfword the loop does a saturating subtract of 0x0fe0 (UQSUB16), clamps to 0..8191 (USAT16 #13), then shifts right by 5 and masks to 8 bits; ValueA1 must be formed before ValueA0 because ValueA0/ValueA1 share r10/r11 with Temp3/Temp4
+ MOV Count, #4
+ LDR r0x0fe00fe0, =0x0fe00fe0
+ LDR r0x00ff00ff, =0x00ff00ff
+LoopStart1
+ LDR Temp4, [pSrc0, #12]
+ LDR Temp3, [pSrc0, #8]
+ LDR Temp2, [pSrc0, #4]
+ M_LDR Temp1, [pSrc0], srcStep0
+ UQSUB16 Temp4, Temp4, r0x0fe00fe0
+ UQSUB16 Temp3, Temp3, r0x0fe00fe0
+ UQSUB16 Temp2, Temp2, r0x0fe00fe0
+ UQSUB16 Temp1, Temp1, r0x0fe00fe0
+ USAT16 Temp4, #13, Temp4
+ USAT16 Temp3, #13, Temp3
+ USAT16 Temp2, #13, Temp2
+ USAT16 Temp1, #13, Temp1
+ AND Temp4, r0x00ff00ff, Temp4, LSR #5
+ AND Temp3, r0x00ff00ff, Temp3, LSR #5
+ AND Temp2, r0x00ff00ff, Temp2, LSR #5
+ AND Temp1, r0x00ff00ff, Temp1, LSR #5
+ ORR ValueA1, Temp3, Temp4, LSL #8
+ ORR ValueA0, Temp1, Temp2, LSL #8
+ SUBS Count, Count, #1
+ STRD ValueA0, [pBuf], #8
+ BGT LoopStart1
+End1
+ SUB pSrc0, pBuf, #32
+ MOV srcStep0, #8
+
+ M_END
+
+
+ ;// Function header: same 16-bit to 8-bit reduction as the HorDiagCopy routine in this file, but each source row is split into two output words; per loop pass two source rows are read and words are written at offsets 0 and 8 (first row) then 4 and 12 (second row) of a 16-byte group; on exit pSrc0 = pBuf - 24, i.e. 8 bytes past the start of the written data, and srcStep0 = 4
+ M_START armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe, r6
+
+ ;// Code start: load the subtract/mask constants; Count = 2 loop passes of two source rows each
+ LDR r0x0fe00fe0, =0x0fe00fe0
+ LDR r0x00ff00ff, =0x00ff00ff
+ MOV Count, #2
+
+LoopStart
+ LDR Temp4, [pSrc0, #12]
+ LDR Temp3, [pSrc0, #8]
+ LDR Temp2, [pSrc0, #4]
+ M_LDR Temp1, [pSrc0], srcStep0
+
+ UQSUB16 Temp4, Temp4, r0x0fe00fe0
+ UQSUB16 Temp3, Temp3, r0x0fe00fe0
+ UQSUB16 Temp2, Temp2, r0x0fe00fe0
+ UQSUB16 Temp1, Temp1, r0x0fe00fe0
+
+ USAT16 Temp4, #13, Temp4
+ USAT16 Temp3, #13, Temp3
+ USAT16 Temp2, #13, Temp2
+ USAT16 Temp1, #13, Temp1
+
+ AND Temp4, r0x00ff00ff, Temp4, LSR #5
+ AND Temp3, r0x00ff00ff, Temp3, LSR #5
+ AND Temp2, r0x00ff00ff, Temp2, LSR #5
+ AND Temp1, r0x00ff00ff, Temp1, LSR #5
+ ORR ValueA1, Temp3, Temp4, LSL #8 ;// [d2 c2 d0 c0] (must precede the next ORR: ValueA0 shares r10 with Temp3)
+ ORR ValueA0, Temp1, Temp2, LSL #8 ;// [b2 a2 b0 a0]
+
+ PKHBT Temp1, ValueA0, ValueA1, LSL #16 ;// [d0 c0 b0 a0] = low halves of both words
+
+ STR Temp1, [pBuf], #8
+ PKHTB Temp2, ValueA1, ValueA0, ASR #16 ;// [d2 c2 b2 a2] = high halves; stored 8 bytes after the word above, then pBuf steps back by 4
+ STR Temp2, [pBuf], #-4
+
+ LDR Temp4, [pSrc0, #12]
+ LDR Temp3, [pSrc0, #8]
+ LDR Temp2, [pSrc0, #4]
+ M_LDR Temp1, [pSrc0], srcStep0
+
+ UQSUB16 Temp4, Temp4, r0x0fe00fe0
+ UQSUB16 Temp3, Temp3, r0x0fe00fe0
+ UQSUB16 Temp2, Temp2, r0x0fe00fe0
+ UQSUB16 Temp1, Temp1, r0x0fe00fe0
+
+ USAT16 Temp4, #13, Temp4
+ USAT16 Temp3, #13, Temp3
+ USAT16 Temp2, #13, Temp2
+ USAT16 Temp1, #13, Temp1
+
+ AND Temp4, r0x00ff00ff, Temp4, LSR #5
+ AND Temp3, r0x00ff00ff, Temp3, LSR #5
+ AND Temp2, r0x00ff00ff, Temp2, LSR #5
+ AND Temp1, r0x00ff00ff, Temp1, LSR #5
+ ORR ValueA1, Temp3, Temp4, LSL #8 ;// [d2 c2 d0 c0]
+ ORR ValueA0, Temp1, Temp2, LSL #8 ;// [b2 a2 b0 a0]
+
+ PKHBT Temp1, ValueA0, ValueA1, LSL #16 ;// [d0 c0 b0 a0]
+ SUBS Count, Count, #1
+ STR Temp1, [pBuf], #8
+ PKHTB Temp2, ValueA1, ValueA0, ASR #16 ;// [d2 c2 b2 a2]; after this store pBuf advances by 4 to the next 16-byte group
+ STR Temp2, [pBuf], #4
+
+ BGT LoopStart
+End2
+ SUB pSrc0, pBuf, #32-8
+ MOV srcStep0, #4
+
+ M_END
+
+ ENDIF
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
new file mode 100755
index 0000000..d1684cb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
@@ -0,0 +1,313 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+
+ M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11
+
+;// Declare input registers (only pSrc and srcStep are used below: nine source rows are filtered horizontally with the 6-tap kernel a - 5b + 20c + 20d - 5e + f, the nine 16-bit results are then filtered with the same kernel across rows, and the four 8-bit result rows are left in dAcc0..dAcc3; nothing is stored through pDst here)
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+
+;// Declare Neon registers (dResN is the low half of qResN; qCoeff5/qCoeff20 = q14/q15 cover d28-d31 and therefore overlap dCoeff5/dCoeff20, which is why they are only loaded after the last row pass)
+dCoeff5 DN 30.S16
+dCoeff20 DN 31.S16
+qCoeff5 QN 14.S32
+qCoeff20 QN 15.S32
+
+qSrc01 QN 0.U8
+dSrc0 DN 0.U8
+dSrc1 DN 1.U8
+
+dSrcb DN 4.U8
+dSrcc DN 2.U8
+dSrcd DN 3.U8
+dSrce DN 5.U8
+dSrcf DN 1.U8
+
+qSrcb QN 2.S16
+qSrcc QN 1.S16
+dSrcB DN 4.S16
+dSrcC DN 2.S16
+
+qRes0 QN 5.S16
+qRes1 QN 6.S16
+qRes2 QN 7.S16
+qRes3 QN 8.S16
+qRes4 QN 9.S16
+qRes5 QN 10.S16
+qRes6 QN 11.S16
+qRes7 QN 12.S16
+qRes8 QN 13.S16
+
+dRes0 DN 10.S16
+dRes1 DN 12.S16
+dRes2 DN 14.S16
+dRes3 DN 16.S16
+dRes4 DN 18.S16
+dRes5 DN 20.S16
+dRes6 DN 22.S16
+dRes7 DN 24.S16
+dRes8 DN 26.S16
+
+qAcc01 QN 5.S32
+qAcc23 QN 6.S32
+qAcc45 QN 2.S32
+qAcc67 QN 3.S32
+qSumBE QN 0.S32
+qSumCD QN 1.S32
+
+dTempAcc0 DN 0.U16
+dTempAcc1 DN 2.U16
+dTempAcc2 DN 4.U16
+dTempAcc3 DN 6.U16
+
+qTAcc0 QN 0.U16
+qTAcc1 QN 1.U16
+qTAcc2 QN 2.U16
+qTAcc3 QN 3.U16
+
+dAcc0 DN 0.U8
+dAcc1 DN 2.U8
+dAcc2 DN 4.U8
+dAcc3 DN 6.U8
+
+dTmp0 DN 8.S16
+dTmp1 DN 9.S16
+qTmp0 QN 4.S32
+
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+ VMOV dCoeff20, #20
+ VMOV dCoeff5, #5
+
+ ;// Row0: horizontal 6-tap into dRes0; the -5*(b+e) term is computed into dTmp0 here and subtracted one row later (VMUL + VSUB replace the commented-out VMLS)
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes0, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+ VMLA dRes0, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes0, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row0's Acc during Row1
+
+ ;// Row1
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes1, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes0, dRes0, dTmp0 ;// Row0: Acc -= 5*(b+e)
+
+ VMLA dRes1, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes1, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row1's Acc during Row2
+
+ ;// Row2
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes2, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes1, dRes1, dTmp0
+
+ VMLA dRes2, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes2, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row2's Acc during Row3
+
+ ;// Row3
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes3, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes2, dRes2, dTmp0
+
+ VMLA dRes3, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes3, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row3's Acc during Row4
+
+ ;// Row4
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes4, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes3, dRes3, dTmp0
+
+ VMLA dRes4, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes4, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row4's Acc during Row5
+
+ ;// Row5
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes5, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes4, dRes4, dTmp0
+
+ VMLA dRes5, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes5, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row5's Acc during Row6
+
+ ;// Row6
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes6, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes5, dRes5, dTmp0
+
+ VMLA dRes6, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes6, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row6's Acc during Row7
+
+ ;// Row7
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes7, dSrc0, dSrcf ;// Acc=a+f
+ VLD1 qSrc01, [pSrc], srcStep ;// [a0 a1 a2 a3 ..]
+
+ VSUB dRes6, dRes6, dTmp0
+
+ VMLA dRes7, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes7, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row7's Acc during Row8
+
+ ;// Row8 (last source row, no further load)
+ VEXT dSrcb, dSrc0, dSrc1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrcc, dSrc0, dSrc1, #2
+ VEXT dSrcd, dSrc0, dSrc1, #3
+ VEXT dSrce, dSrc0, dSrc1, #4
+ VEXT dSrcf, dSrc0, dSrc1, #5 ;// [f0 f1 f2 f3 ..]
+ VADDL qSrcc, dSrcc, dSrcd ;// c+d
+ VADDL qSrcb, dSrcb, dSrce ;// b+e
+ VADDL qRes8, dSrc0, dSrcf ;// Acc=a+f
+
+ VSUB dRes7, dRes7, dTmp0
+
+ VMLA dRes8, dSrcC, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes8, dSrcB, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTmp0, dSrcB, dCoeff5 ;// Tmp = 5*(b+e), subtracted from Row8's Acc by the VSUB in Col0
+
+ VMOV qCoeff20, #20
+ VMOV qCoeff5, #5
+
+ ;// Col0: second 6-tap pass, across intermediate rows dRes0..dRes5 in 32-bit lanes; the -5*(b+e) term is again deferred via qTmp0
+ VADDL qAcc01, dRes0, dRes5 ;// Acc = a+f
+ VADDL qSumCD, dRes2, dRes3 ;// c+d
+ VADDL qSumBE, dRes1, dRes4 ;// b+e
+
+ VSUB dRes8, dRes8, dTmp0
+
+ VMLA qAcc01, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+; VMLS qAcc01, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+ VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted from Col0's Acc during Col1
+
+ ;// Col1: rows dRes1..dRes6
+ VADDL qAcc23, dRes1, dRes6 ;// Acc = a+f
+ VADDL qSumCD, dRes3, dRes4 ;// c+d
+ VADDL qSumBE, dRes2, dRes5 ;// b+e
+ VMLA qAcc23, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+
+ VSUB qAcc01, qAcc01, qTmp0
+
+; VMLS qAcc23, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+ VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted from Col1's Acc during Col2
+
+ ;// Col2: rows dRes2..dRes7
+ VADDL qAcc45, dRes2, dRes7 ;// Acc = a+f
+ VADDL qSumCD, dRes4, dRes5 ;// c+d
+ VADDL qSumBE, dRes3, dRes6 ;// b+e
+ VMLA qAcc45, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+
+ VSUB qAcc23, qAcc23, qTmp0
+
+; VMLS qAcc45, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+ VMUL qTmp0, qSumBE, qCoeff5 ;// Tmp = 5*(b+e), subtracted from Col2's Acc during Col3
+
+ ;// Col3: rows dRes3..dRes8; afterwards every accumulator is rounded and shifted right by 10 with unsigned saturation to 16 bits, then saturated down to 8 bits
+ VADDL qAcc67, dRes3, dRes8 ;// Acc = a+f
+ VADDL qSumCD, dRes5, dRes6 ;// c+d
+ VADDL qSumBE, dRes4, dRes7 ;// b+e
+ VMLA qAcc67, qSumCD, qCoeff20 ;// Acc += 20*(c+d)
+
+ VSUB qAcc45, qAcc45, qTmp0
+
+ VMLS qAcc67, qSumBE, qCoeff5 ;// Acc -= 5*(b+e)
+
+ VQRSHRUN dTempAcc0, qAcc01, #10
+ VQRSHRUN dTempAcc1, qAcc23, #10
+ VQRSHRUN dTempAcc2, qAcc45, #10
+ VQRSHRUN dTempAcc3, qAcc67, #10
+
+ VQMOVN dAcc0, qTAcc0
+ VQMOVN dAcc1, qTAcc1
+ VQMOVN dAcc2, qTAcc2
+ VQMOVN dAcc3, qTAcc3
+
+ M_END
+
+ ENDIF
+
+
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
new file mode 100755
index 0000000..7bc091f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
@@ -0,0 +1,266 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+ M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r11
+
+;// Declare input registers (only pSrc and srcStep are used below: nine 16-byte source rows A..I are filtered vertically with the 6-tap kernel a - 5b + 20c + 20d - 5e + f into four 16-bit rows P, Q, R, S, which are then filtered horizontally with the same kernel; the four 8-bit result rows are left in dAcc0..dAcc3 and nothing is stored through pDst here)
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+
+;// Declare Neon registers (many names alias the same d/q registers, e.g. qTempP23 = q0 = qSrcA01 and qAcc0 = q0, so a source is only overwritten after its last use; dTCoeff5/dTCoeff20 and dCoeff5/dCoeff20 are the same d30/d31 viewed as U8 and S16 lanes)
+dTCoeff5 DN 30.U8
+dTCoeff20 DN 31.U8
+dCoeff5 DN 30.S16
+dCoeff20 DN 31.S16
+
+qSrcA01 QN 0.U8
+qSrcB23 QN 1.U8
+qSrcC45 QN 2.U8
+qSrcD67 QN 3.U8
+qSrcE89 QN 4.U8
+qSrcF1011 QN 5.U8
+qSrcG1213 QN 6.U8
+qSrcH1415 QN 7.U8
+qSrcI1617 QN 8.U8
+
+dSrcA0 DN 0.U8
+dSrcB2 DN 2.U8
+dSrcC4 DN 4.U8
+dSrcD6 DN 6.U8
+dSrcE8 DN 8.U8
+dSrcF10 DN 10.U8
+dSrcG12 DN 12.U8
+dSrcH14 DN 14.U8
+dSrcI16 DN 16.U8
+
+dSrcA1 DN 1.U8
+dSrcB3 DN 3.U8
+dSrcC5 DN 5.U8
+dSrcD7 DN 7.U8
+dSrcE9 DN 9.U8
+dSrcF11 DN 11.U8
+dSrcG13 DN 13.U8
+dSrcH15 DN 15.U8
+dSrcI17 DN 17.U8
+
+qTempP01 QN 9.S16
+qTempQ01 QN 10.S16
+qTempR01 QN 11.S16
+qTempS01 QN 12.S16
+
+qTempP23 QN 0.S16
+qTempQ23 QN 1.S16
+qTempR23 QN 2.S16
+qTempS23 QN 3.S16
+
+dTempP0 DN 18.S16
+dTempP1 DN 19.S16
+dTempP2 DN 0.S16
+
+dTempQ0 DN 20.S16
+dTempQ1 DN 21.S16
+dTempQ2 DN 2.S16
+
+dTempR0 DN 22.S16
+dTempR1 DN 23.S16
+dTempR2 DN 4.S16
+
+dTempS0 DN 24.S16
+dTempS1 DN 25.S16
+dTempS2 DN 6.S16
+
+dTempB0 DN 26.S16
+dTempC0 DN 27.S16
+dTempD0 DN 28.S16
+dTempF0 DN 29.S16
+
+dTempAcc0 DN 0.U16
+dTempAcc1 DN 2.U16
+dTempAcc2 DN 4.U16
+dTempAcc3 DN 6.U16
+
+dAcc0 DN 0.U8
+dAcc1 DN 2.U8
+dAcc2 DN 4.U8
+dAcc3 DN 6.U8
+
+qAcc0 QN 0.S32
+qAcc1 QN 1.S32
+qAcc2 QN 2.S32
+qAcc3 QN 3.S32
+
+qTAcc0 QN 0.U16
+qTAcc1 QN 1.U16
+qTAcc2 QN 2.U16
+qTAcc3 QN 3.U16
+
+qTmp QN 4.S16
+dTmp DN 8.S16
+
+ VLD1 qSrcA01, [pSrc], srcStep ;// [a0 a1 a2 a3 .. a15]; r12 is then set 4 rows ahead of pSrc so rows F..I load through r12 while B..E load through pSrc
+ ADD r12, pSrc, srcStep, LSL #2
+ VMOV dTCoeff5, #5
+ VMOV dTCoeff20, #20
+ VLD1 qSrcF1011, [r12], srcStep
+ VLD1 qSrcB23, [pSrc], srcStep ;// [b0 b1 b2 b3 .. b15]
+
+ VLD1 qSrcG1213, [r12], srcStep
+ VADDL qTempP01, dSrcA0, dSrcF10
+ VLD1 qSrcC45, [pSrc], srcStep ;// [c0 c1 c2 c3 .. c15]
+ VADDL qTempP23, dSrcA1, dSrcF11
+ VLD1 qSrcD67, [pSrc], srcStep
+ VADDL qTempQ01, dSrcB2, dSrcG12
+ VLD1 qSrcE89, [pSrc], srcStep
+
+ ;//t0: vertical pass; each qTemp row starts as a+f (VADDL) and gains +20*c, +20*d (VMLAL) and -5*b, -5*e (VMLSL); rows P (from A..F), Q (B..G), R (C..H) and S (D..I) are interleaved from here on, so the t0..t3 tags are only approximate
+ VMLAL qTempP01, dSrcC4, dTCoeff20
+
+ VLD1 qSrcH1415, [r12], srcStep
+
+ VMLAL qTempP23, dSrcC5, dTCoeff20
+
+ VLD1 qSrcI1617, [r12], srcStep ;// [i0 i1 i2 i3 .. ]
+
+ VMLAL qTempP01, dSrcD6, dTCoeff20
+ VMLAL qTempQ01, dSrcD6, dTCoeff20
+ VMLSL qTempP23, dSrcB3, dTCoeff5
+
+ VADDL qTempR01, dSrcC4, dSrcH14
+
+ VMLSL qTempP01, dSrcB2, dTCoeff5
+
+ VADDL qTempQ23, dSrcB3, dSrcG13
+
+ VMLAL qTempP23, dSrcD7, dTCoeff20
+ VMLAL qTempQ01, dSrcE8, dTCoeff20
+
+ VMLSL qTempP01, dSrcE8, dTCoeff5
+ VMLAL qTempQ23, dSrcD7, dTCoeff20
+
+ VMLSL qTempP23, dSrcE9, dTCoeff5
+
+ ;//t1: row P is complete at this point; rows Q, R and S continue
+
+ VMLAL qTempR01, dSrcE8, dTCoeff20
+ VMLSL qTempQ01, dSrcC4, dTCoeff5
+ VMLSL qTempQ23, dSrcC5, dTCoeff5
+ VADDL qTempR23, dSrcC5, dSrcH15
+
+ VMLAL qTempR01, dSrcF10, dTCoeff20
+ VMLSL qTempQ01, dSrcF10, dTCoeff5
+ VMLAL qTempQ23, dSrcE9, dTCoeff20
+ VMLAL qTempR23, dSrcE9, dTCoeff20
+ VADDL qTempS01, dSrcD6, dSrcI16
+
+
+ VMLSL qTempR01, dSrcD6, dTCoeff5
+ VMLSL qTempQ23, dSrcF11, dTCoeff5
+ VMLSL qTempR23, dSrcD7, dTCoeff5
+
+ ;//t2: rows R and S continue; the first horizontal VEXT for row P is issued in between
+ VADDL qTempS23, dSrcD7, dSrcI17
+ VMLAL qTempS01, dSrcF10, dTCoeff20
+ VMLSL qTempR01, dSrcG12, dTCoeff5
+ VMLSL qTempR23, dSrcG13, dTCoeff5
+
+ VMLAL qTempS23, dSrcF11, dTCoeff20
+ VMLAL qTempS01, dSrcG12, dTCoeff20
+ VEXT dTempB0, dTempP0, dTempP1, #1
+ VMLAL qTempR23, dSrcF11, dTCoeff20
+
+
+ ;//t3: row S is finished; d31 is reloaded as S16 lanes of 20 only after the last dTCoeff20 use, while d30 keeps its U8 lanes of 5 until the remaining VMLSLs are done
+ VMLAL qTempS23, dSrcG13, dTCoeff20
+ VMLSL qTempS01, dSrcE8, dTCoeff5
+ VEXT dTempC0, dTempP0, dTempP1, #2
+ VMOV dCoeff20, #20
+ VMLSL qTempS23, dSrcE9, dTCoeff5
+ VMLSL qTempS01, dSrcH14, dTCoeff5
+ VEXT dTempF0, dTempP1, dTempP2, #1
+ VEXT dTempD0, dTempP0, dTempP1, #3
+ VMLSL qTempS23, dSrcH15, dTCoeff5
+
+ VADDL qAcc0, dTempP0, dTempF0
+ VADD dTempC0, dTempC0, dTempD0
+ ;//h: horizontal pass over the 16-bit rows; d30 is now reloaded as S16 lanes of 5; for each row Acc = a+f (VADDL), += 20*(c+d) (VMLAL), -= 5*(b+e) (VMLSL), with b..f taken by VEXT from the 16-bit row
+ VMOV dCoeff5, #5
+
+ ;// res0: finish row P into qAcc0 and start the shifted operands of row Q
+ VADD dTempB0, dTempB0, dTempP1
+ VMLAL qAcc0, dTempC0, dCoeff20
+ VEXT dTempC0, dTempQ0, dTempQ1, #2
+ VEXT dTempD0, dTempQ0, dTempQ1, #3
+ VEXT dTempF0, dTempQ1, dTempQ2, #1
+ VMLSL qAcc0, dTempB0, dCoeff5
+
+ ;// res1: row Q into qAcc1, interleaved with the operand setup for row R (dTmp holds row R's b operand so dTempB0 stays valid until the VMLSL)
+ VEXT dTempB0, dTempQ0, dTempQ1, #1
+ VADDL qAcc1, dTempQ0, dTempF0
+ VADD dTempC0, dTempC0, dTempD0
+ VADD dTempB0, dTempB0, dTempQ1
+ VEXT dTempD0, dTempR0, dTempR1, #3
+ VMLAL qAcc1, dTempC0, dCoeff20
+ VEXT dTempF0, dTempR1, dTempR2, #1
+ VEXT dTempC0, dTempR0, dTempR1, #2
+ VEXT dTmp, dTempR0, dTempR1, #1
+ VADDL qAcc2, dTempR0, dTempF0
+ VMLSL qAcc1, dTempB0, dCoeff5
+; VEXT dTempB0, dTempR0, dTempR1, #1
+ VADD dTempC0, dTempC0, dTempD0
+
+ ;// res2: row R into qAcc2
+ VADD dTempB0, dTmp, dTempR1
+ VEXT dTempD0, dTempS0, dTempS1, #3
+ VMLAL qAcc2, dTempC0, dCoeff20
+; VADD dTempB0, dTempB0, dTempR1
+
+ ;// res3: row S into qAcc3; afterwards every accumulator is rounded and shifted right by 10 with unsigned saturation to 16 bits, then saturated down to 8 bits
+ VEXT dTempC0, dTempS0, dTempS1, #2
+ VEXT dTempF0, dTempS1, dTempS2, #1
+ VADD dTempC0, dTempC0, dTempD0
+ VEXT dTmp, dTempS0, dTempS1, #1
+ VADDL qAcc3, dTempS0, dTempF0
+ VMLSL qAcc2, dTempB0, dCoeff5
+ VMLAL qAcc3, dTempC0, dCoeff20
+ VADD dTmp, dTmp, dTempS1
+ VMLSL qAcc3, dTmp, dCoeff5
+
+ VQRSHRUN dTempAcc0, qAcc0, #10
+ VQRSHRUN dTempAcc1, qAcc1, #10
+ VQRSHRUN dTempAcc2, qAcc2, #10
+ VQRSHRUN dTempAcc3, qAcc3, #10
+
+ VQMOVN dAcc0, qTAcc0
+ VQMOVN dAcc1, qTAcc1
+ VQMOVN dAcc2, qTAcc2
+ VQMOVN dAcc3, qTAcc3
+
+ M_END
+
+ ENDIF
+
+
+
+
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
new file mode 100755
index 0000000..babe8ad
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
@@ -0,0 +1,228 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+
+DEBUG_ON SETL {FALSE}
+
+ IF CortexA8
+
+ M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11
+
+;// Declare input registers (only pSrc and srcStep are used below: four source rows are each filtered horizontally with the 6-tap kernel a - 5b + 20c + 20d - 5e + f and the 8-bit results are left in dAcc0, dAcc2, dAcc4, dAcc6; nothing is stored through pDst here)
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+
+;// Declare Neon registers (NOTE(review): dCoeff5/dCoeff20 are never written in this routine - presumably the caller preloads them with 5 and 20; confirm)
+dCoeff5 DN 30.S16
+dCoeff20 DN 31.S16
+
+qSrcA01 QN 11.U8
+qSrcB01 QN 12.U8
+qSrcC01 QN 13.U8
+qSrcD01 QN 14.U8
+
+dSrcA0 DN 22.U8
+dSrcA1 DN 23.U8
+dSrcB0 DN 24.U8
+dSrcB1 DN 25.U8
+dSrcC0 DN 26.U8
+dSrcC1 DN 27.U8
+dSrcD0 DN 28.U8
+dSrcD1 DN 29.U8
+
+dSrcb DN 12.U8
+dSrce DN 13.U8
+dSrcf DN 10.U8
+
+dSrc0c DN 14.U8
+dSrc1c DN 16.U8
+dSrc2c DN 18.U8
+dSrc3c DN 20.U8
+
+dSrc0d DN 15.U8
+dSrc1d DN 17.U8
+dSrc2d DN 19.U8
+dSrc3d DN 21.U8
+
+qTemp01 QN 4.S16
+qTemp23 QN 6.S16
+dTemp0 DN 8.S16
+dTemp2 DN 12.S16
+
+qRes01 QN 11.S16
+qRes23 QN 12.S16
+qRes45 QN 13.S16
+qRes67 QN 14.S16
+
+dRes0 DN 22.S16
+dRes2 DN 24.S16
+dRes4 DN 26.S16
+dRes6 DN 28.S16
+
+dAcc0 DN 22.U8
+dAcc2 DN 24.U8
+dAcc4 DN 26.U8
+dAcc6 DN 28.U8
+
+dResult0 DN 22.U32
+dResult2 DN 24.U32
+dResult4 DN 26.U32
+dResult6 DN 28.U32
+
+ VLD1 qSrcA01, [pSrc], srcStep ;// Load A register [a0 a1 a2 a3 ..]
+ ;// One cycle stall
+ VEXT dSrcf, dSrcA0, dSrcA1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcA0, dSrcA1, #1 ;// [b0 b1 b2 b3 ..]
+; VLD1 qSrcB01, [pSrc], srcStep ;// Load B register [a0 a1 a2 a3 ..]
+ VEXT dSrc0c, dSrcA0, dSrcA1, #2
+ VEXT dSrc0d, dSrcA0, dSrcA1, #3
+ VEXT dSrce, dSrcA0, dSrcA1, #4
+ VADDL qRes01, dSrcA0, dSrcf ;// Acc=a+f (qRes01 overwrites the row A source registers, which are no longer needed)
+ VADDL qTemp01, dSrc0c, dSrc0d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+
+ VLD1 qSrcB01, [pSrc], srcStep ;// Load B register [a0 a1 a2 a3 ..] (second source row)
+; VLD1 qSrcC01, [pSrc], srcStep ;// Load C register [a0 a1 a2 a3 ..]
+ VMLA dRes0, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes0, dTemp2, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTemp0, dTemp2, dCoeff5 ;// Tmp = 5*(b+e); reuses dTemp0 because c+d was already consumed by the VMLA; subtracted by the VSUB below
+
+ VEXT dSrcf, dSrcB0, dSrcB1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcB0, dSrcB1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrc1c, dSrcB0, dSrcB1, #2
+ VEXT dSrc1d, dSrcB0, dSrcB1, #3
+ VEXT dSrce, dSrcB0, dSrcB1, #4
+ VADDL qRes23, dSrcB0, dSrcf ;// Acc=a+f
+
+ VSUB dRes0, dRes0, dTemp0 ;// first row: Acc -= 5*(b+e); must precede the next VADDL into qTemp01, which overwrites dTemp0
+
+ VADDL qTemp01, dSrc1c, dSrc1d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+
+ VLD1 qSrcC01, [pSrc], srcStep ;// Load C register [a0 a1 a2 a3 ..] (third source row)
+; VLD1 qSrcD01, [pSrc], srcStep ;// Load D register [a0 a1 a2 a3 ..]
+
+ VMLA dRes2, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes2, dTemp2, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTemp0, dTemp2, dCoeff5 ;// Tmp = 5*(b+e), subtracted by the VSUB below
+
+ VEXT dSrcf, dSrcC0, dSrcC1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcC0, dSrcC1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrc2c, dSrcC0, dSrcC1, #2
+ VEXT dSrc2d, dSrcC0, dSrcC1, #3
+ VEXT dSrce, dSrcC0, dSrcC1, #4
+ VADDL qRes45, dSrcC0, dSrcf ;// Acc=a+f
+
+ VSUB dRes2, dRes2, dTemp0 ;// second row: Acc -= 5*(b+e)
+
+ VADDL qTemp01, dSrc2c, dSrc2d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+
+ VLD1 qSrcD01, [pSrc], srcStep ;// Load D register [a0 a1 a2 a3 ..] (fourth source row)
+
+ VMLA dRes4, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+; VMLS dRes4, dTemp2, dCoeff5 ;// Acc -= 5*(b+e)
+ VMUL dTemp0, dTemp2, dCoeff5 ;// Tmp = 5*(b+e), subtracted by the VSUB below
+
+
+ VEXT dSrcf, dSrcD0, dSrcD1, #5 ;// [f0 f1 f2 f3 ..]
+ VEXT dSrcb, dSrcD0, dSrcD1, #1 ;// [b0 b1 b2 b3 ..]
+ VEXT dSrc3c, dSrcD0, dSrcD1, #2
+ VEXT dSrc3d, dSrcD0, dSrcD1, #3
+ VEXT dSrce, dSrcD0, dSrcD1, #4
+ VADDL qRes67, dSrcD0, dSrcf ;// Acc=a+f
+
+ VSUB dRes4, dRes4, dTemp0 ;// third row: Acc -= 5*(b+e)
+
+ VADDL qTemp01, dSrc3c, dSrc3d ;// c+d
+ VADDL qTemp23, dSrcb, dSrce ;// b+e
+ VMLA dRes6, dTemp0, dCoeff20 ;// Acc += 20*(c+d)
+ VMLS dRes6, dTemp2, dCoeff5 ;// Acc -= 5*(b+e) (last row uses VMLS directly)
+
+ VQRSHRUN dAcc0, qRes01, #5 ;// Acc = Sat ((Acc + 16) / 32)
+ VQRSHRUN dAcc2, qRes23, #5 ;// Acc = Sat ((Acc + 16) / 32)
+ VQRSHRUN dAcc4, qRes45, #5 ;// Acc = Sat ((Acc + 16) / 32)
+ VQRSHRUN dAcc6, qRes67, #5 ;// Acc = Sat ((Acc + 16) / 32)
+
+ M_END
+
+ ENDIF
+
+
+ END
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
new file mode 100755
index 0000000..89c90aa
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
@@ -0,0 +1,134 @@
+;//
+;//
+;// File Name: armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+
+ IF CortexA8
+
+ M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11
+
+;// Declare input registers (only pSrc and srcStep are used below: nine source rows dSrc0..dSrc8 are filtered vertically with the 6-tap kernel a - 5b + 20c + 20d - 5e + f and the 8-bit results are left in dAcc0..dAcc3; the multiply steps work on the low four 16-bit lanes only; nothing is stored through pDst here)
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+
+Temp RN 12
+
+;// Declare Neon registers (NOTE(review): dCoeff5/dCoeff20 are never written in this routine - presumably the caller preloads them with 5 and 20; confirm)
+dCoeff5 DN 30.S16
+dCoeff20 DN 31.S16
+
+dSrc0 DN 7.U8
+dSrc1 DN 8.U8
+dSrc2 DN 9.U8
+dSrc3 DN 10.U8
+dSrc4 DN 11.U8
+dSrc5 DN 12.U8
+dSrc6 DN 13.U8
+dSrc7 DN 14.U8
+dSrc8 DN 15.U8
+
+qSumBE01 QN 8.S16
+qSumCD01 QN 9.S16
+dSumBE0 DN 16.S16
+dSumCD0 DN 18.S16
+
+qAcc01 QN 0.S16
+qAcc23 QN 1.S16
+qAcc45 QN 2.S16
+qAcc67 QN 3.S16
+
+dRes0 DN 0.S16
+dRes1 DN 2.S16
+dRes2 DN 4.S16
+dRes3 DN 6.S16
+
+dAcc0 DN 0.U8
+dAcc1 DN 2.U8
+dAcc2 DN 4.U8
+dAcc3 DN 6.U8
+
+
+dTmp0 DN 20.S16
+dTmp1 DN 21.S16
+dTmp2 DN 22.S16
+dTmp3 DN 23.S16
+
+
+ VLD1 dSrc0, [pSrc], srcStep ;// [a0 a1 a2 a3 .. ]; Temp is then set 4 rows ahead of pSrc (row 5) so rows 5..8 load through Temp while rows 1..4 load through pSrc
+ ADD Temp, pSrc, srcStep, LSL #2
+ VLD1 dSrc1, [pSrc], srcStep ;// [b0 b1 b2 b3 .. ]
+ ;// One cycle stall
+ VLD1 dSrc5, [Temp], srcStep
+ ;// One cycle stall
+ VLD1 dSrc2, [pSrc], srcStep ;// [c0 c1 c2 c3 .. ]
+ VADDL qAcc01, dSrc0, dSrc5 ;// Acc = a+f
+ VLD1 dSrc3, [pSrc], srcStep
+ ;// One cycle stall
+ VLD1 dSrc6, [Temp], srcStep ;// row 6, loaded early (see the commented-out load further down)
+
+ VLD1 dSrc4, [pSrc], srcStep
+ VLD1 dSrc7, [Temp], srcStep ;// row 7, loaded early
+ VADDL qSumBE01, dSrc1, dSrc4 ;// b+e
+ VADDL qSumCD01, dSrc2, dSrc3 ;// c+d
+ VLD1 dSrc8, [Temp], srcStep ;// row 8, loaded early
+ VMLS dRes0, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e)
+; VMLA dRes0, dSumCD0, dCoeff20 ;// Acc += 20*(c+d)
+ VMUL dTmp0, dSumCD0, dCoeff20 ;// Tmp0 = 20*(c+d), added to dRes0 by the VADD further down
+
+; VLD1 dSrc6, [Temp], srcStep
+ VADDL qSumBE01, dSrc2, dSrc5 ;// b+e
+ VADDL qSumCD01, dSrc3, dSrc4 ;// c+d
+ VADDL qAcc23, dSrc1, dSrc6 ;// Acc = a+f
+ VMLS dRes1, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e)
+; VMLA dRes1, dSumCD0, dCoeff20 ;// Acc += 20*(c+d)
+ VMUL dTmp1, dSumCD0, dCoeff20 ;// Tmp1 = 20*(c+d), added to dRes1 by the VADD further down
+
+; VLD1 dSrc7, [Temp], srcStep
+ VADDL qSumBE01, dSrc3, dSrc6 ;// b+e
+ VADDL qSumCD01, dSrc4, dSrc5 ;// c+d
+ VADDL qAcc45, dSrc2, dSrc7 ;// Acc = a+f
+ VMLS dRes2, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e)
+; VMLA dRes2, dSumCD0, dCoeff20 ;// Acc += 20*(c+d)
+ VMUL dTmp2, dSumCD0, dCoeff20 ;// Tmp2 = 20*(c+d), added to dRes2 by the VADD further down
+
+; VLD1 dSrc8, [Temp], srcStep ;// [i0 i1 i2 i3 .. ]
+ VADDL qSumBE01, dSrc4, dSrc7 ;// b+e
+ VADDL qAcc67, dSrc3, dSrc8 ;// Acc = a+f
+ VADDL qSumCD01, dSrc5, dSrc6 ;// c+d
+ VMLS dRes3, dSumBE0, dCoeff5 ;// Acc -= 5*(b+e)
+ VADD dRes0, dRes0, dTmp0
+ VADD dRes1, dRes1, dTmp1
+ VADD dRes2, dRes2, dTmp2
+ VMLA dRes3, dSumCD0, dCoeff20 ;// Acc += 20*(c+d) (last row uses VMLA directly)
+; VMUL dTmp3, dSumCD0, dCoeff20 ;// Acc += 20*(c+d)
+; VADD dRes3, dRes3, dTmp3
+
+ VQRSHRUN dAcc0, qAcc01, #5
+ VQRSHRUN dAcc1, qAcc23, #5
+ VQRSHRUN dAcc2, qAcc45, #5
+ VQRSHRUN dAcc3, qAcc67, #5
+
+ M_END
+
+ ENDIF
+
+
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
new file mode 100755
index 0000000..0f0ec78
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Interpolate_Chroma_s.s
@@ -0,0 +1,318 @@
+;//
+;//
+;// File Name: armVCM4P10_Interpolate_Chroma_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 9641
+;// Date: Thursday, February 7, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+
+ IF CortexA8
+
+ M_TABLE armVCM4P10_WidthBranchTableMVIsNotZero
+ ;// Read with byte offset (iWidth << 1): width 2 -> word 1, width 4 -> word 2, width 8 -> word 4
+ DCD WidthIs2MVIsNotZero, WidthIs2MVIsNotZero
+ DCD WidthIs4MVIsNotZero, WidthIs4MVIsNotZero
+ DCD WidthIs8MVIsNotZero
+ ;// Same layout as the table above; selected when dx + dy == 0
+ M_TABLE armVCM4P10_WidthBranchTableMVIsZero
+ ;// Words 0 and 3 are never selected for widths 2, 4 or 8; they only keep the used words in place
+ DCD WidthIs2MVIsZero, WidthIs2MVIsZero
+ DCD WidthIs4MVIsZero, WidthIs4MVIsZero
+ DCD WidthIs8MVIsZero
+
+
+;// input registers
+;// r0-r3 hold the first four arguments; iWidth, iHeight, dx and dy are loaded from the stack
+pSrc RN 0
+iSrcStep RN 1
+pDst RN 2
+iDstStep RN 3
+iWidth RN 4
+iHeight RN 5
+dx RN 6
+dy RN 7
+
+;// local variable registers
+pc RN 15
+return RN 0 ;// shares r0 with pSrc
+EightMinusdx RN 8 ;// 8 - dx
+EightMinusdy RN 9 ;// 8 - dy
+;// The coefficient aliases below deliberately overlap dx and the EightMinus* registers
+ACoeff RN 12 ;// (8 - dx) * (8 - dy)
+BCoeff RN 9 ;// dx * (8 - dy); overwrites EightMinusdy
+CCoeff RN 8 ;// (8 - dx) * dy; overwrites EightMinusdx
+DCoeff RN 6 ;// dx * dy; overwrites dx
+
+pTable RN 11 ;// address of the selected width branch table
+
+Step1 RN 10 ;// constant 1
+SrcStepMinus1 RN 14 ;// iSrcStep - 1
+
+dACoeff DN D12.U8
+dBCoeff DN D13.U8
+dCCoeff DN D14.U8
+dDCoeff DN D15.U8
+
+dRow0a DN D0.U8
+dRow0b DN D1.U8
+dRow1a DN D2.U8
+dRow1b DN D3.U8
+
+qRow0a QN Q2.S16
+qRow0b QN Q3.S16
+
+;//dIndex DN D16.U8
+qRow1a QN Q11.S16
+qRow1b QN Q12.S16
+
+dRow2a DN D16.U8
+dRow2b DN D17.U8
+dRow3a DN D18.U8
+dRow3b DN D19.U8
+
+qOutRow2 QN Q11.U16
+qOutRow3 QN Q12.U16
+dOutRow2 DN D20.U8
+dOutRow3 DN D21.U8
+dOutRow2U64 DN D20.U64
+dOutRow3U64 DN D21.U64
+
+qOutRow0 QN Q2.U16
+qOutRow1 QN Q3.U16
+dOutRow0 DN D8.U8
+dOutRow1 DN D9.U8
+
+dOutRow0U64 DN D8.U64
+dOutRow1U64 DN D9.U64
+
+dOutRow0U32 DN D8.U32
+dOutRow1U32 DN D9.U32
+
+dOutRow0U16 DN D8.U16
+dOutRow1U16 DN D9.U16
+
+
+dOut0U64 DN D0.U64
+dOut1U64 DN D1.U64
+
+dOut00U32 DN D0.U32
+dOut01U32 DN D1.U32
+dOut10U32 DN D2.U32
+dOut11U32 DN D3.U32
+
+dOut0U16 DN D0.U16
+dOut1U16 DN D1.U16
+
+;//-----------------------------------------------------------------------------------------------
+;// armVCM4P10_Interpolate_Chroma_asm starts
+;//-----------------------------------------------------------------------------------------------
+;// Per pixel: out = (A*x00 + B*x01 + C*x10 + D*x11 + 32) >> 6; when dx + dy == 0 the rows are only copied.
+ ;// Write function header
+ M_START armVCM4P10_Interpolate_Chroma, r11, d15
+
+ ;// Define stack arguments
+ M_ARG Width, 4
+ M_ARG Height, 4
+ M_ARG Dx, 4
+ M_ARG Dy, 4
+
+ ;// Load argument from the stack
+ ;// M_STALL ARM1136JS=4
+
+ M_LDRD dx, dy, Dx
+ M_LDRD iWidth, iHeight, Width
+
+ ;// EightMinusdx = 8 - dx
+ ;// EightMinusdy = 8 - dy
+
+ ;// ACoeff = EightMinusdx * EightMinusdy
+ ;// BCoeff = dx * EightMinusdy
+ ;// CCoeff = EightMinusdx * dy
+ ;// DCoeff = dx * dy
+ ;// BCoeff/CCoeff/DCoeff reuse r9/r8/r6 (EightMinusdy/EightMinusdx/dx), so the SMULBB order below matters
+ RSB EightMinusdx, dx, #8
+ RSB EightMinusdy, dy, #8
+ CMN dx,dy ;// Z is set when dx + dy == 0
+ MOV Step1, #1
+ LDREQ pTable, =armVCM4P10_WidthBranchTableMVIsZero
+ SUB SrcStepMinus1, iSrcStep, Step1
+ LDRNE pTable, =armVCM4P10_WidthBranchTableMVIsNotZero
+
+ VLD1 dRow0a, [pSrc], Step1 ;// 0a
+
+ SMULBB ACoeff, EightMinusdx, EightMinusdy
+ SMULBB BCoeff, dx, EightMinusdy
+ VLD1 dRow0b, [pSrc], SrcStepMinus1 ;// 0b
+ SMULBB CCoeff, EightMinusdx, dy
+ SMULBB DCoeff, dx, dy
+
+ VDUP dACoeff, ACoeff
+ VDUP dBCoeff, BCoeff
+ VDUP dCCoeff, CCoeff
+ VDUP dDCoeff, DCoeff
+
+ LDR pc, [pTable, iWidth, LSL #1] ;// Branch to the case based on iWidth (byte offset = iWidth * 2)
+
+;// Pixel layout:
+;//
+;// x00 x01 x02
+;// x10 x11 x12
+;// x20 x21 x22
+
+;// If fractional mv is not (0, 0)
+WidthIs8MVIsNotZero
+;// Width 8: four output rows per iteration; dRow0a/dRow0b enter already loaded and are reloaded for the next pass
+ VLD1 dRow1a, [pSrc], Step1 ;// 1a
+ VMULL qRow0a, dRow0a, dACoeff
+ VLD1 dRow1b, [pSrc], SrcStepMinus1 ;// 1b
+ VMULL qRow0b, dRow1a, dACoeff
+ VLD1 dRow2a, [pSrc], Step1 ;// 2a
+ VMLAL qRow0a, dRow0b, dBCoeff
+ VLD1 dRow2b, [pSrc], SrcStepMinus1 ;// 2b
+ VMULL qRow1a, dRow2a, dACoeff
+ VMLAL qRow0b, dRow1b, dBCoeff
+ VLD1 dRow3a, [pSrc], Step1 ;// 3a
+ VMLAL qRow0a, dRow1a, dCCoeff
+ VMLAL qRow1a, dRow2b, dBCoeff
+ VMULL qRow1b, dRow3a, dACoeff
+ VLD1 dRow3b, [pSrc], SrcStepMinus1 ;// 3b
+ VMLAL qRow0b, dRow2a, dCCoeff
+ VLD1 dRow0a, [pSrc], Step1 ;// 0a
+ VMLAL qRow1b, dRow3b, dBCoeff
+ VMLAL qRow1a, dRow3a, dCCoeff
+ VMLAL qRow0a, dRow1b, dDCoeff
+ VLD1 dRow0b, [pSrc], SrcStepMinus1 ;// 0b
+ VMLAL qRow1b, dRow0a, dCCoeff
+ VMLAL qRow0b, dRow2b, dDCoeff
+ VMLAL qRow1a, dRow3b, dDCoeff
+
+
+ SUBS iHeight, iHeight, #4
+ VMLAL qRow1b, dRow0b, dDCoeff
+ ;// qOutRowN alias the qRowNa/qRowNb accumulators; narrow with rounding, i.e. (acc + 32) >> 6
+ VQRSHRN dOutRow0, qOutRow0, #6
+ VQRSHRN dOutRow1, qOutRow1, #6
+ VQRSHRN dOutRow2, qOutRow2, #6
+ VST1 dOutRow0U64, [pDst], iDstStep
+ VQRSHRN dOutRow3, qOutRow3, #6
+
+ VST1 dOutRow1U64, [pDst], iDstStep
+ VST1 dOutRow2U64, [pDst], iDstStep
+ VST1 dOutRow3U64, [pDst], iDstStep
+
+
+ BGT WidthIs8MVIsNotZero
+ MOV return, #OMX_Sts_NoErr
+ M_EXIT
+
+WidthIs4MVIsNotZero
+;// Width 4: two output rows per iteration; only the first 32-bit lane of each result is stored
+ VLD1 dRow1a, [pSrc], Step1
+ VMULL qRow0a, dRow0a, dACoeff
+ VMULL qRow0b, dRow1a, dACoeff
+ VLD1 dRow1b, [pSrc], SrcStepMinus1
+ VMLAL qRow0a, dRow0b, dBCoeff
+ VMLAL qRow0b, dRow1b, dBCoeff
+ VLD1 dRow0a, [pSrc], Step1
+ VMLAL qRow0a, dRow1a, dCCoeff
+ VMLAL qRow0b, dRow0a, dCCoeff
+ VLD1 dRow0b, [pSrc], SrcStepMinus1
+ SUBS iHeight, iHeight, #2
+ VMLAL qRow0b, dRow0b, dDCoeff
+ VMLAL qRow0a, dRow1b, dDCoeff
+
+ VQRSHRN dOutRow1, qOutRow1, #6
+ VQRSHRN dOutRow0, qOutRow0, #6
+
+ VST1 dOutRow0U32[0], [pDst], iDstStep
+ VST1 dOutRow1U32[0], [pDst], iDstStep
+
+ BGT WidthIs4MVIsNotZero
+ MOV return, #OMX_Sts_NoErr
+ M_EXIT
+
+WidthIs2MVIsNotZero
+;// Width 2: two output rows per iteration; only the first 16-bit lane of each result is stored
+ VLD1 dRow1a, [pSrc], Step1
+ VMULL qRow0a, dRow0a, dACoeff
+ VMULL qRow0b, dRow1a, dACoeff
+ VLD1 dRow1b, [pSrc], SrcStepMinus1
+ VMLAL qRow0a, dRow0b, dBCoeff
+ VMLAL qRow0b, dRow1b, dBCoeff
+ VLD1 dRow0a, [pSrc], Step1
+ VMLAL qRow0a, dRow1a, dCCoeff
+ VMLAL qRow0b, dRow0a, dCCoeff
+ VLD1 dRow0b, [pSrc], SrcStepMinus1
+ SUBS iHeight, iHeight, #2
+ VMLAL qRow0b, dRow0b, dDCoeff
+ VMLAL qRow0a, dRow1b, dDCoeff
+
+ VQRSHRN dOutRow1, qOutRow1, #6
+ VQRSHRN dOutRow0, qOutRow0, #6
+
+ VST1 dOutRow0U16[0], [pDst], iDstStep
+ VST1 dOutRow1U16[0], [pDst], iDstStep
+
+ BGT WidthIs2MVIsNotZero
+ MOV return, #OMX_Sts_NoErr
+ M_EXIT
+
+;// If fractional mv is (0, 0)
+WidthIs8MVIsZero
+ SUB pSrc, pSrc, iSrcStep ;// Undo the advance of the two row-0 preloads (1 + (iSrcStep - 1))
+
+WidthIs8LoopMVIsZero
+ VLD1 dRow0a, [pSrc], iSrcStep
+ SUBS iHeight, iHeight, #2
+ VLD1 dRow0b, [pSrc], iSrcStep
+ VST1 dOut0U64, [pDst], iDstStep
+ VST1 dOut1U64, [pDst], iDstStep
+ BGT WidthIs8LoopMVIsZero
+
+ MOV return, #OMX_Sts_NoErr
+ M_EXIT
+
+WidthIs4MVIsZero
+ VLD1 dRow0b, [pSrc], iSrcStep
+ ;// dOut00U32/dOut01U32 alias dRow0a/dRow0b (D0/D1); D0 still holds the row loaded before this point
+ SUBS iHeight, iHeight, #2
+
+ VST1 dOut00U32[0], [pDst], iDstStep
+ VLD1 dRow0a, [pSrc], iSrcStep
+ VST1 dOut01U32[0], [pDst], iDstStep
+
+ BGT WidthIs4MVIsZero
+ MOV return, #OMX_Sts_NoErr
+ M_EXIT
+
+WidthIs2MVIsZero
+ VLD1 dRow0b, [pSrc], iSrcStep
+ SUBS iHeight, iHeight, #2
+ ;// dOut0U16/dOut1U16 alias dRow0a/dRow0b (D0/D1), same store-one-load-behind pattern as width 4
+ VST1 dOut0U16[0], [pDst], iDstStep
+ VLD1 dRow0a, [pSrc], iSrcStep
+ VST1 dOut1U16[0], [pDst], iDstStep
+
+ BGT WidthIs2MVIsZero
+ MOV return, #OMX_Sts_NoErr
+ M_END
+
+ ENDIF ;// CortexA8
+
+ END
+
+;//-----------------------------------------------------------------------------------------------
+;// armVCM4P10_Interpolate_Chroma_asm ends
+;//-----------------------------------------------------------------------------------------------
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s
new file mode 100755
index 0000000..7e2642b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_QuantTables_s.s
@@ -0,0 +1,74 @@
+;//
+;//
+;// File Name: armVCM4P10_QuantTables_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;// Description:
+;// This file contains quantization tables
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+
+ EXPORT armVCM4P10_MFMatrixQPModTable
+ EXPORT armVCM4P10_QPDivIntraTable
+ EXPORT armVCM4P10_QPDivPlusOneTable
+
+;//--------------------------------------------------------------
+;// This table contains armVCM4P10_MFMatrix [iQP % 6][0] entries,
+;// for values of iQP from 0 to 51 (inclusive).
+;//--------------------------------------------------------------
+
+ M_TABLE armVCM4P10_MFMatrixQPModTable
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+ DCW 13107, 11916, 10082, 9362, 8192, 7282
+
+;//---------------------------------------------------------------
+;// This table contains ARM_M4P10_Q_OFFSET + 1 + (iQP / 6) values,
+;// for values of iQP from 0 to 51 (inclusive).
+;//---------------------------------------------------------------
+
+ M_TABLE armVCM4P10_QPDivPlusOneTable
+ DCB 16, 16, 16, 16, 16, 16
+ DCB 17, 17, 17, 17, 17, 17
+ DCB 18, 18, 18, 18, 18, 18
+ DCB 19, 19, 19, 19, 19, 19
+ DCB 20, 20, 20, 20, 20, 20
+ DCB 21, 21, 21, 21, 21, 21
+ DCB 22, 22, 22, 22, 22, 22
+ DCB 23, 23, 23, 23, 23, 23
+ DCB 24, 24, 24, 24, 24, 24
+
+;//------------------------------------------------------------------
+;// This table contains (1 << QbitsPlusOne) / 3 Values (Intra case) ,
+;// for values of iQP from 0 to 51 (inclusive).
+;//------------------------------------------------------------------
+
+ M_TABLE armVCM4P10_QPDivIntraTable, 2
+ DCD 21845, 21845, 21845, 21845, 21845, 21845
+ DCD 43690, 43690, 43690, 43690, 43690, 43690
+ DCD 87381, 87381, 87381, 87381, 87381, 87381
+ DCD 174762, 174762, 174762, 174762, 174762, 174762
+ DCD 349525, 349525, 349525, 349525, 349525, 349525
+ DCD 699050, 699050, 699050, 699050, 699050, 699050
+ DCD 1398101, 1398101, 1398101, 1398101, 1398101, 1398101
+ DCD 2796202, 2796202, 2796202, 2796202, 2796202, 2796202
+ DCD 5592405, 5592405, 5592405, 5592405, 5592405, 5592405
+
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
new file mode 100755
index 0000000..ee9c339
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
@@ -0,0 +1,186 @@
+;//
+;//
+;// File Name: armVCM4P10_TransformResidual4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Transform Residual 4x4 Coefficients
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+
+
+
+
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;// ARM Registers
+
+;//Input Registers
+pDst RN 0
+pSrc RN 1
+
+
+;// Neon Registers
+
+;// Packed Input pixels
+dIn0 DN D0.S16
+dIn1 DN D1.S16
+dIn2 DN D2.S16
+dIn3 DN D3.S16
+
+;// Intermediate calculations
+dZero DN D4.S16
+de0 DN D5.S16
+de1 DN D6.S16
+de2 DN D7.S16
+de3 DN D8.S16
+dIn1RS DN D7.S16
+dIn3RS DN D8.S16
+df0 DN D0.S16
+df1 DN D1.S16
+df2 DN D2.S16
+df3 DN D3.S16
+qf01 QN Q0.32
+qf23 QN Q1.32
+dg0 DN D5.S16
+dg1 DN D6.S16
+dg2 DN D7.S16
+dg3 DN D8.S16
+df1RS DN D7.S16
+df3RS DN D8.S16
+
+;// Output pixels
+dh0 DN D0.S16
+dh1 DN D1.S16
+dh2 DN D2.S16
+dh3 DN D3.S16
+
+
+ ;// Allocate stack memory required by the function
+
+
+ ;// Write function header
+ M_START armVCM4P10_TransformResidual4x4, ,d8
+
+ ;******************************************************************
+ ;// The strategy used in implementing the transform is as follows:*
+ ;// Load the 4x4 block into 4 D registers (VLD4 de-interleaves) *
+ ;// Transpose the 4x4 matrix *
+ ;// Perform the row operations (on columns) using SIMD *
+ ;// Transpose the 4x4 result matrix *
+ ;// Perform the column operations *
+ ;// Store the 4x4 block at one go *
+ ;******************************************************************
+
+ ;// Load all the 4x4 pixels in transposed form
+
+ VLD4 {dIn0,dIn1,dIn2,dIn3},[pSrc]
+
+ VMOV dZero,#0 ;// Used to right shift by 1 (VHADD x, 0 halves x)
+
+
+ ;****************************************
+ ;// Row Operations (Performed on columns)
+ ;****************************************
+
+ ;// de2/de3 share D7/D8 with dIn1RS/dIn3RS, so the halved inputs are consumed in place
+ VADD de0,dIn0,dIn2 ;// e0 = d0 + d2
+ VSUB de1,dIn0,dIn2 ;// e1 = d0 - d2
+ VHADD dIn1RS,dIn1,dZero ;// (d1>>1); dZero is a register holding 0
+ VHADD dIn3RS,dIn3,dZero ;// (d3>>1)
+ VSUB de2,dIn1RS,dIn3 ;// e2 = (d1>>1) - d3
+ VADD de3,dIn1,dIn3RS ;// e3 = d1 + (d3>>1)
+ VADD df0,de0,de3 ;// f0 = e0 + e3
+ VADD df1,de1,de2 ;// f1 = e1 + e2
+ VSUB df2,de1,de2 ;// f2 = e1 - e2
+ VSUB df3,de0,de3 ;// f3 = e0 - e3
+
+
+
+ ;*****************************************************************
+ ;// Transpose the resultant matrix
+ ;*****************************************************************
+
+ VTRN df0,df1
+ VTRN df2,df3
+ VTRN qf01,qf23
+
+
+ ;*******************************
+ ;// Column Operations
+ ;*******************************
+
+ ;// dg2/dg3 share D7/D8 with df1RS/df3RS, mirroring the row pass
+ VADD dg0,df0,df2 ;// g0 = f0 + f2
+ VSUB dg1,df0,df2 ;// g1 = f0 - f2
+ VHADD df1RS,df1,dZero ;// (f1>>1); dZero is a register holding 0
+ VHADD df3RS,df3,dZero ;// (f3>>1)
+ VSUB dg2,df1RS,df3 ;// g2 = (f1>>1) - f3
+ VADD dg3,df1,df3RS ;// g3 = f1 + (f3>>1)
+ VADD dh0,dg0,dg3 ;// h0 = g0 + g3
+ VADD dh1,dg1,dg2 ;// h1 = g1 + g2
+ VSUB dh2,dg1,dg2 ;// h2 = g1 - g2
+ VSUB dh3,dg0,dg3 ;// h3 = g0 - g3
+
+
+ ;************************************************
+ ;// Calculate final value (colOp[i][j] + 32)>>6
+ ;************************************************
+
+ VRSHR dh0,#6
+ VRSHR dh1,#6
+ VRSHR dh2,#6
+ VRSHR dh3,#6
+
+
+ ;***************************
+ ;// Store all the 4x4 pixels
+ ;***************************
+
+ VST1 {dh0,dh1,dh2,dh3},[pDst]
+
+
+ ;// Set return value (none: no register is written before the function tail)
+
+End
+
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//CortexA8
+
+ END \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
new file mode 100755
index 0000000..4c52e22
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_UnpackBlock4x4_s.s
@@ -0,0 +1,92 @@
+;//
+;//
+;// File Name: armVCM4P10_UnpackBlock4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Define the processor variants supported by this file
+
+ M_VARIANTS ARM1136JS
+
+
+ IF ARM1136JS
+
+;//--------------------------------------
+;// Input Arguments and their scope/usage
+;//--------------------------------------
+ppSrc RN 0 ;// Persistent variable
+pDst RN 1 ;// Persistent variable
+
+;//--------------------------------
+;// Variables and their scope/usage
+;//--------------------------------
+pSrc RN 2 ;// Persistent variables
+Flag RN 3
+Value RN 4
+Value2 RN 5
+strOffset RN 6
+cstOffset RN 7
+
+
+ M_START armVCM4P10_UnpackBlock4x4, r7
+
+ LDR pSrc, [ppSrc] ;// Load pSrc
+ MOV cstOffset, #31 ;// To be used in the loop, to compute offset
+
+ ;//-----------------------------------------------------------------------
+ ;// First, fill all the coefficient values in the <pDst> buffer with zero
+ ;//-----------------------------------------------------------------------
+ ;// Each STRD below writes the Value/Value2 register pair (r4/r5), i.e. 8 zero bytes
+ MOV Value, #0 ;// Initialize the zero value
+ MOV Value2, #0 ;// Initialize the zero value
+ LDRB Flag, [pSrc], #1 ;// Preload <Flag> before <unpackLoop>
+
+ STRD Value, [pDst, #0] ;// pDst[0] = pDst[1] = pDst[2] = pDst[3] = 0
+ STRD Value, [pDst, #8] ;// pDst[4] = pDst[5] = pDst[6] = pDst[7] = 0
+ STRD Value, [pDst, #16] ;// pDst[8] = pDst[9] = pDst[10] = pDst[11] = 0
+ STRD Value, [pDst, #24] ;// pDst[12] = pDst[13] = pDst[14] = pDst[15] = 0
+
+ ;//----------------------------------------------------------------------------
+ ;// The loop below parses and unpacks the input stream. The C-model has
+ ;// a somewhat complicated logic for sign extension. But in the v6 version,
+ ;// that can be easily taken care by loading the data from <pSrc> stream as
+ ;// SIGNED byte/halfword. So, based on the first TST instruction, 8-bits or
+ ;// 16-bits are read.
+ ;//
+ ;// Next, to compute the offset, where the unpacked value needs to be stored,
+ ;// we modify the computation to perform [(Flag & 15) << 1] as [(Flag << 1) & 31]
+ ;// This results in a saving of one cycle.
+ ;//----------------------------------------------------------------------------
+
+unpackLoop
+ TST Flag, #0x10 ;// Computing (Flag & 0x10)
+ LDRSBNE Value2,[pSrc,#1] ;// Load byte wise to avoid unaligned access
+ LDRBNE Value, [pSrc], #2 ;// Low byte of the 16-bit value; pSrc += 2
+ AND strOffset, cstOffset, Flag, LSL #1 ;// strOffset = (Flag & 15) << 1;
+ LDRSBEQ Value, [pSrc], #1 ;// Value = (OMX_S8) *pSrc++ (sign-extended 8-bit value)
+ ORRNE Value,Value,Value2, LSL #8 ;// Value = low byte | (sign-extended high byte << 8)
+ ;// Flags from the TST below stay live through LDRBEQ and STRH until the BEQ
+ TST Flag, #0x20 ;// Computing (Flag & 0x20) to check, if we're done
+ LDRBEQ Flag, [pSrc], #1 ;// Flag = (OMX_U8) *pSrc++, for next iteration
+ STRH Value, [pDst, strOffset] ;// Store <Value> at offset <strOffset>
+ BEQ unpackLoop ;// Branch to the loop beginning
+
+ STR pSrc, [ppSrc] ;// Update the bitstream pointer
+ M_END
+
+ ENDIF
+
+
+
+ END
+ \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
new file mode 100755
index 0000000..40d4d5e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockChroma_I.c
@@ -0,0 +1,88 @@
+/* ----------------------------------------------------------------
+ *
+ *
+ * File Name: omxVCM4P10_DeblockChroma_I.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * H.264 intra chroma deblock
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/**
+ * Function: omxVCM4P10_DeblockChroma_I
+ *
+ * Description:
+ * Performs deblocking filtering on all edges of the chroma macroblock (16x16).
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcDst pointer to the input macroblock. Must be 8-byte aligned.
+ * [in] srcdstStep Step of the arrays
+ * [in] pAlpha pointer to a 2x2 array of alpha thresholds, organized as follows: { external
+ * vertical edge, internal vertical edge, external
+ * horizontal edge, internal horizontal edge }
+ * [in] pBeta pointer to a 2x2 array of beta thresholds, organized as follows: { external
+ * vertical edge, internal vertical edge, external horizontal edge,
+ * internal horizontal edge }
+ * [in] pThresholds Array of size 8x2 of Thresholds (TC0) (values for the left or
+ * above edge of each 4x2 or 2x4 block, arranged in vertical block order
+ * and then in horizontal block order)
+ * [in] pBS array of size 16x2 of BS parameters (arranged in scan block order for vertical edges and then horizontal edges);
+ * valid in the range [0,4] with the following restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^1]== 4. Must be 4-byte aligned.
+ * [out] pSrcDst pointer to filtered output macroblock
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - Either of the pointers in pSrcDst, pAlpha, pBeta, pThresholds, or pBS is NULL.
+ * - pSrcDst is not 8-byte aligned.
+ * - either pThresholds or pBS is not 4-byte aligned.
+ * - pBS is out of range, i.e., one of the following conditions is true: pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && pBS[i^1]!=4) for 0<=i<=3.
+ * - srcdstStep is not a multiple of 8.
+ *
+ */
+OMXResult omxVCM4P10_DeblockChroma_I(
+ OMX_U8* pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8* pAlpha,
+ const OMX_U8* pBeta,
+ const OMX_U8* pThresholds,
+ const OMX_U8 *pBS
+)
+{
+ OMXResult errorCode;
+
+ armRetArgErrIf(pSrcDst == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(armNot8ByteAligned(pSrcDst), OMX_Sts_BadArgErr);
+ armRetArgErrIf(srcdstStep & 7, OMX_Sts_BadArgErr);
+ armRetArgErrIf(pAlpha == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(pBeta == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(pThresholds == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(armNot4ByteAligned(pThresholds), OMX_Sts_BadArgErr);
+ armRetArgErrIf(pBS == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(armNot4ByteAligned(pBS), OMX_Sts_BadArgErr);
+ /* Vertical edges: the tables are passed from their start (first half of each). */
+ errorCode = omxVCM4P10_FilterDeblockingChroma_VerEdge_I(
+ pSrcDst, srcdstStep, pAlpha, pBeta, pThresholds, pBS);
+ /* Stop before the horizontal pass if the vertical pass reported an error. */
+ armRetArgErrIf(errorCode != OMX_Sts_NoErr, errorCode)
+ /* Horizontal edges: second half of each table (offsets 2, 2, 8 and 16). */
+ errorCode = omxVCM4P10_FilterDeblockingChroma_HorEdge_I(
+ pSrcDst, srcdstStep, pAlpha+2, pBeta+2, pThresholds+8, pBS+16);
+
+ return errorCode;
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
new file mode 100755
index 0000000..619365f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DeblockLuma_I.c
@@ -0,0 +1,91 @@
+/* ----------------------------------------------------------------
+ *
+ *
+ * File Name: omxVCM4P10_DeblockLuma_I.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * H.264 luma deblock
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+
+/**
+ * Function: omxVCM4P10_DeblockLuma_I
+ *
+ * Description:
+ * This function performs deblock filtering the horizontal and vertical edges of a luma macroblock
+ *(16x16).
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] pSrcDst pointer to the input macroblock. Must be 8-byte aligned.
+ * [in] srcdstStep image width
+ * [in] pAlpha pointer to a 2x2 table of alpha thresholds, organized as follows: { external
+ * vertical edge, internal vertical edge, external horizontal
+ * edge, internal horizontal edge }
+ * [in] pBeta pointer to a 2x2 table of beta thresholds, organized as follows: { external
+ * vertical edge, internal vertical edge, external horizontal edge,
+ * internal horizontal edge }
+ * [in] pThresholds pointer to a 16x2 table of threshold (TC0), organized as follows: { values for
+ * the left or above edge of each 4x4 block, arranged in vertical block order
+ * and then in horizontal block order }
+ * [in] pBS pointer to a 16x2 table of BS parameters arranged in scan block order for vertical edges and then horizontal edges;
+ * valid in the range [0,4] with the following restrictions: i) pBS[i]== 4 may occur only for 0<=i<=3, ii) pBS[i]== 4 if and only if pBS[i^1]== 4. Must be 4-byte aligned.
+ * [out] pSrcDst pointer to filtered output macroblock.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - Either of the pointers in pSrcDst, pAlpha, pBeta, pThresholds or pBS is NULL.
+ * - pSrcDst is not 8-byte aligned.
+ * - srcdstStep is not a multiple of 8
+ * - pBS is out of range, i.e., one of the following conditions is true: pBS[i]<0, pBS[i]>4, pBS[i]==4 for i>=4, or (pBS[i]==4 && pBS[i^1]!=4) for 0<=i<=3.
+ * - either pThresholds or pBS is not 4-byte aligned.
+ *
+ */
+
+OMXResult omxVCM4P10_DeblockLuma_I(
+ OMX_U8* pSrcDst,
+ OMX_S32 srcdstStep,
+ const OMX_U8* pAlpha,
+ const OMX_U8* pBeta,
+ const OMX_U8* pThresholds,
+ const OMX_U8 *pBS
+)
+{
+ OMXResult errorCode;
+
+ armRetArgErrIf(pSrcDst == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(armNot8ByteAligned(pSrcDst), OMX_Sts_BadArgErr);
+ armRetArgErrIf(srcdstStep & 7, OMX_Sts_BadArgErr);
+ armRetArgErrIf(pAlpha == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(pBeta == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(pThresholds == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(armNot4ByteAligned(pThresholds), OMX_Sts_BadArgErr);
+ armRetArgErrIf(pBS == NULL, OMX_Sts_BadArgErr);
+ armRetArgErrIf(armNot4ByteAligned(pBS), OMX_Sts_BadArgErr);
+ /* Vertical edges: the tables are passed from their start (first half of each). */
+ errorCode = omxVCM4P10_FilterDeblockingLuma_VerEdge_I(
+ pSrcDst, srcdstStep, pAlpha, pBeta, pThresholds, pBS);
+ /* Stop before the horizontal pass if the vertical pass reported an error. */
+ armRetArgErrIf(errorCode != OMX_Sts_NoErr, errorCode)
+ /* Horizontal edges: second half of each table (offsets 2, 2, 16 and 16). */
+ errorCode = omxVCM4P10_FilterDeblockingLuma_HorEdge_I(
+ pSrcDst, srcdstStep, pAlpha+2, pBeta+2, pThresholds+16, pBS+16);
+
+ return errorCode;
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
new file mode 100755
index 0000000..4e871bf
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------
+ *
+ *
+ * File Name: omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * H.264 decode coefficients module
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/**
+ * Function: omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC
+ *
+ * Description:
+ * CAVLC-decodes one 2x2 ChromaDCLevel block and applies the inverse raster
+ * scan. The decoded coefficients are written to the packed
+ * position-coefficient buffer in increasing raster scan (position) order.
+ *
+ * Remarks:
+ * Forwards to armVCM4P10_DecodeCoeffsToPair with a fixed VLC selector (17) and coefficient limit (4).
+ * Parameters:
+ * [in] ppBitStream Double pointer to the current byte in the bit
+ * stream buffer
+ * [in] pOffset Pointer to the current bit position in the byte
+ * pointed to by *ppBitStream
+ * [out] ppBitStream *ppBitStream is updated after each decoded block
+ * [out] pOffset *pOffset is updated after each decoded block
+ * [out] pNumCoeff Pointer to the number of nonzero coefficients
+ * in this block
+ * [out] ppPosCoefbuf Double pointer to the destination residual
+ * coefficient-position pair buffer
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC (
+ const OMX_U8** ppBitStream,
+ OMX_S32* pOffset,
+ OMX_U8* pNumCoeff,
+ OMX_U8** ppPosCoefbuf
+ )
+{
+ OMXResult decodeStatus;
+ /* 17 and 4 fill the VLC-selector and max-coefficient arguments of the shared decoder. */
+ decodeStatus = armVCM4P10_DecodeCoeffsToPair(ppBitStream, pOffset, pNumCoeff, ppPosCoefbuf, 17, 4);
+ return decodeStatus;
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
new file mode 100755
index 0000000..b29e576
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DecodeCoeffsToPairCAVLC.c
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------
+ *
+ *
+ * File Name: omxVCM4P10_DecodeCoeffsToPairCAVLC.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * H.264 decode coefficients module
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/**
+ * Function: omxVCM4P10_DecodeCoeffsToPairCAVLC
+ *
+ * Description:
+ * CAVLC-decodes one 4x4 block of Intra16x16DCLevel, Intra16x16ACLevel,
+ * LumaLevel or ChromaACLevel and applies the inverse zigzag scan.
+ * Inverse field scan is not supported. The decoded coefficients are
+ * written to the packed position-coefficient buffer in increasing zigzag
+ * order rather than position order.
+ *
+ * Remarks:
+ * Forwards all arguments unchanged to armVCM4P10_DecodeCoeffsToPair.
+ * Parameters:
+ * [in] ppBitStream Double pointer to the current byte in the bit stream buffer
+ * [in] pOffset Pointer to the current bit position in the byte pointed
+ * to by *ppBitStream
+ * [in] sMaxNumCoeff Maximum number of non-zero coefficients in the current
+ * block
+ * [in] sVLCSelect VLC table selector, obtained from the number of non-zero
+ * AC coefficients of the above and left 4x4 blocks. It is
+ * equivalent to the variable nC described in H.264 standard
+ * table 9-5, except its value can't be less than zero.
+ * [out] ppBitStream *ppBitStream is updated after each decoded block
+ * [out] pOffset *pOffset is updated after each decoded block
+ * [out] pNumCoeff Pointer to the number of nonzero coefficients in
+ * this block
+ * [out] ppPosCoefbuf Double pointer to the destination residual
+ * coefficient-position pair buffer
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxVCM4P10_DecodeCoeffsToPairCAVLC(
+ const OMX_U8** ppBitStream,
+ OMX_S32* pOffset,
+ OMX_U8* pNumCoeff,
+ OMX_U8**ppPosCoefbuf,
+ OMX_INT sVLCSelect,
+ OMX_INT sMaxNumCoeff
+ )
+{
+ OMXResult decodeStatus = armVCM4P10_DecodeCoeffsToPair(ppBitStream, pOffset, pNumCoeff, ppPosCoefbuf, sVLCSelect, sMaxNumCoeff);
+ return decodeStatus;
+}
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
new file mode 100755
index 0000000..485a488
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
@@ -0,0 +1,396 @@
+;//
+;//
+;// File Name: omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;//
+;//
+
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Import symbols required from other files
+;// (For example tables)
+
+ IMPORT armVCM4P10_UnpackBlock4x4
+ IMPORT armVCM4P10_TransformResidual4x4
+ IMPORT armVCM4P10_QPDivTable
+ IMPORT armVCM4P10_VMatrixU16
+ IMPORT armVCM4P10_QPModuloTable
+
+ M_VARIANTS CortexA8
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_DequantLumaAC4x4
+
+;// Guarding implementation by the processor name
+
+
+
+;// Guarding implementation by the processor name
+
+
+
+
+
+
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
+
+;// Guarding implementation by the processor name
+
+
+
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+
+;// ARM Registers
+;// NOTE: several aliases below share one physical register; this is safe only because their live ranges do not overlap.
+;//Input Registers
+ppSrc RN 0 ;// r0: first argument
+pPred RN 1 ;// r1: second argument (copied to pPredTemp before r1 is reused)
+pDC RN 2 ;// r2: third argument (copied to pDCTemp before r2 is reused)
+pDst RN 3 ;// r3: fourth argument (copied to pDstTemp before r3 is reused)
+
+
+;//Output Registers
+result RN 0 ;// r0: return code
+
+;//Local Scratch Registers
+
+;//Registers used in armVCM4P10_DequantLumaAC4x4
+pQPdiv RN 10 ;// address of armVCM4P10_QPDivTable
+pQPmod RN 11 ;// address of armVCM4P10_QPModuloTable
+pVRow RN 2 ;// address into armVCM4P10_VMatrixU16 (reuses r2 after pDC was saved)
+QPmod RN 12 ;// byte read from the modulo table, used as an offset into the V matrix
+shift RN 14 ;// byte read from the div table, used as the left-shift count
+index0 RN 1 ;// VTBL byte indexes for rows 0/2 (shares r1 with QP)
+index1 RN 10 ;// VTBL byte indexes for rows 1/3 (shares r10 with pQPdiv)
+
+;//Registers used in DequantTransformResidualFromPairAndAdd
+pDelta RN 4 ;// address of the 32-byte stack scratch block
+pDeltaTmp RN 6 ;// not referenced by the code in this file
+AC RN 5 ;//Load from stack
+pPredTemp RN 7 ;// saved copy of pPred
+pDCTemp RN 8 ;// saved copy of pDC
+pDstTemp RN 9 ;// saved copy of pDst
+pDeltaArg1 RN 1 ;// r1 as passed to armVCM4P10_UnpackBlock4x4
+pDeltaArg0 RN 0 ;// not referenced by the code in this file
+QP RN 1 ;//Load from stack
+DCval RN 10 ;// DC coefficient loaded from *pDCTemp (shares r10 with pQPdiv/index1/dstStep)
+predstep RN 1 ;// prediction row step, loaded from the stack
+dstStep RN 10 ;// destination row step, loaded from the stack
+PredVal1 RN 3 ;// 4 prediction bytes of one row (reuses r3 after pDst was saved)
+PredVal2 RN 5 ;// 4 prediction bytes of the next row (reuses r5 once AC has been tested)
+
+
+
+
+;// Neon Registers
+
+;// Registers used in armVCM4P10_DequantLumaAC4x4
+
+dVmatrix DN D6.8 ;// 8 bytes loaded from pVRow, used as the VTBL table
+dindexRow0 DN D7.32 ;// index0 replicated into both 32-bit lanes
+dindexRow1 DN D9.32 ;// index1 replicated into both 32-bit lanes
+dByteIndexRow0 DN D7.8 ;// byte view of dindexRow0 for VTBL
+dByteIndexRow1 DN D9.8 ;// byte view of dindexRow1 for VTBL
+dVRow0 DN D8.8 ;// VTBL result: scale factors for rows 0 and 2
+dVRow1 DN D4.8 ;// VTBL result: scale factors for rows 1 and 3
+dVRow0U16 DN D8.U16 ;// 16-bit view of dVRow0
+dVRow1U16 DN D4.U16 ;// 16-bit view of dVRow1
+dVRow2U16 DN D8.U16 ;// same register as dVRow0U16 (row 2 reuses row 0 factors)
+dVRow3U16 DN D4.U16 ;// same register as dVRow1U16 (row 3 reuses row 1 factors)
+
+dShift DN D5.U16 ;// shift count replicated to every 16-bit lane
+dSrcRow0 DN D0.I16 ;// coefficient rows as loaded from pDelta
+dSrcRow1 DN D1.I16
+dSrcRow2 DN D2.I16
+dSrcRow3 DN D3.I16
+dDqntRow0 DN D0.I16 ;// dequantised rows, written in place over dSrcRow0..3
+dDqntRow1 DN D1.I16
+dDqntRow2 DN D2.I16
+dDqntRow3 DN D3.I16
+
+;// Registers used in TransformResidual4x4
+
+;// Packed Input pixels
+dIn0 DN D0.S16 ;// same registers as dDqntRow0..3, so no load/store between stages
+dIn1 DN D1.S16
+dIn2 DN D2.S16
+dIn3 DN D3.S16
+qIn01 QN Q0.32 ;// D0:D1 viewed as 32-bit lanes for the transpose
+qIn23 QN Q1.32 ;// D2:D3 viewed as 32-bit lanes for the transpose
+
+;// Intermediate calculations
+dZero DN D4.S16 ;// all-zero vector used with VHADD to halve a value
+de0 DN D5.S16 ;// first-pass butterfly temporaries
+de1 DN D6.S16
+de2 DN D7.S16
+de3 DN D8.S16
+dIn1RS DN D7.S16 ;// dIn1 >> 1 (shares D7 with de2)
+dIn3RS DN D8.S16 ;// dIn3 >> 1 (shares D8 with de3)
+df0 DN D0.S16 ;// first-pass outputs, written back over D0-D3
+df1 DN D1.S16
+df2 DN D2.S16
+df3 DN D3.S16
+qf01 QN Q0.32
+qf23 QN Q1.32
+dg0 DN D5.S16 ;// second-pass butterfly temporaries (same registers as de0..de3)
+dg1 DN D6.S16
+dg2 DN D7.S16
+dg3 DN D8.S16
+df1RS DN D7.S16 ;// df1 >> 1 (shares D7 with dg2)
+df3RS DN D8.S16 ;// df3 >> 1 (shares D8 with dg3)
+
+;// Output pixels
+dh0 DN D0.S16 ;// second-pass outputs, written back over D0-D3
+dh1 DN D1.S16
+dh2 DN D2.S16
+dh3 DN D3.S16
+
+;// Registers used in DequantTransformResidualFromPairAndAdd
+
+dDeltaRow0 DN D0.S16 ;// residual rows (same registers as dh0..dh3)
+dDeltaRow1 DN D1.S16
+dDeltaRow2 DN D2.S16
+dDeltaRow3 DN D3.S16
+qDeltaRow01 QN Q0.S16 ;// residual rows 0 and 1 as one Q register
+qDeltaRow23 QN Q1.S16 ;// residual rows 2 and 3 as one Q register
+
+dPredValRow01 DN D4.U8 ;// 8 prediction bytes: rows 0 and 1
+dPredValRow23 DN D5.U8 ;// 8 prediction bytes: rows 2 and 3
+
+qSumRow01 QN Q3.S16 ;// residual + prediction, rows 0 and 1
+qSumRow23 QN Q4.S16 ;// residual + prediction, rows 2 and 3
+dDstRow01 DN D0.U8 ;// saturated 8-bit results, rows 0 and 1
+dDstRow23 DN D1.U8 ;// saturated 8-bit results, rows 2 and 3
+dDstRow0 DN D0.32[0] ;// single 32-bit lanes: one 4-byte output row each
+dDstRow1 DN D0.32[1]
+dDstRow2 DN D1.32[0]
+dDstRow3 DN D1.32[1]
+
+
+ ;// Allocate stack memory required by the function
+ M_ALLOC8 pBuffer, 32 ;// 32-byte scratch block: the 4x4 block of 16-bit coefficients is read back from it at pDelta
+ ;// Flow: AC != 0 -> unpack, dequantise and inverse-transform the 4x4 block; AC == 0 -> DC-only shortcut.
+ ;// Both paths then add the residual to the prediction and store four saturated 4-byte rows at pDst.
+ ;// Write function header
+ M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9 ;// r11,d9: presumably the highest registers the prologue preserves (macro from armCOMM_s.h) - confirm
+
+ ;// Define stack arguments
+ M_ARG predStepOnStack, 4
+ M_ARG dstStepOnStack,4
+ M_ARG QPOnStack, 4
+ M_ARG ACOnStack,4
+
+
+ M_ADR pDelta,pBuffer ;// pDelta = address of the stack scratch block
+ M_LDR AC,ACOnStack ;// AC flag from the stack arguments
+
+
+ ;// Save registers r1,r2,r3 before function call
+ MOV pPredTemp,pPred
+ MOV pDCTemp,pDC
+ MOV pDstTemp,pDst
+
+ CMP AC,#0
+ BEQ DCcase ;// AC == 0: skip unpack/dequant/transform entirely
+ MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4
+ ;// r0 still holds ppSrc from the caller
+ BL armVCM4P10_UnpackBlock4x4 ;// presumably expands the packed pairs into the 4x4 block at pDelta - confirm
+
+ ;//--------------------------------------------------------
+ ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
+ ;//--------------------------------------------------------
+
+ ;//BL armVCM4P10_DequantLumaAC4x4
+ M_LDR QP,QPOnStack ;// Set up r1 for armVCM4P10_DequantLumaAC4x4
+
+ LDR pQPmod,=armVCM4P10_QPModuloTable
+ LDR pQPdiv,=armVCM4P10_QPDivTable
+ LDR pVRow,=armVCM4P10_VMatrixU16
+
+
+ LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6
+ LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
+
+ LDR index1,=0x03020504 ;// byte indexes 4,5,2,3 -> 16-bit entries 2 and 1 of dVmatrix
+ LDR index0,=0x05040100 ;// Indexes into dVmatrix
+ ADD pVRow,pVRow,QPmod ;// QPmod is used as a byte offset into the V matrix table
+ VDUP dindexRow0,index0
+ VDUP dindexRow1,index1
+ VDUP dShift,shift
+
+ ;// Load all 4x4 pVRow[] values
+ VLD1 dVmatrix,[pVRow] ;// dVmatrix = [0d|0c|0b|0a]
+
+
+ VTBL dVRow0,dVmatrix,dByteIndexRow0 ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
+ VTBL dVRow1,dVmatrix,dByteIndexRow1 ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
+ CMP pDCTemp,#0 ;// flags stay live for LDRSHNE and VMOVNE below: no flag-setting instruction sits in between
+ ;// Load all the 4x4 'src' values
+ VLD1 { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]
+
+ VSHL dVRow0U16,dVRow0U16,dShift ;// scale factors << shift
+ VSHL dVRow1U16,dVRow1U16,dShift
+ LDRSHNE DCval,[pDCTemp] ;// only when pDC is non-NULL: fetch the externally supplied DC value
+
+
+ ;// Multiply src[] with pVRow[]
+ VMUL dDqntRow0,dSrcRow0,dVRow0U16
+ VMUL dDqntRow1,dSrcRow1,dVRow1U16
+ VMUL dDqntRow2,dSrcRow2,dVRow2U16
+ VMUL dDqntRow3,dSrcRow3,dVRow3U16
+
+
+
+ ;//-------------------------------------------------------------
+ ;// TransformResidual4x4 : Inlined to avoid Load/Stores
+ ;//-------------------------------------------------------------
+
+
+ ;//BL armVCM4P10_TransformResidual4x4
+ ;//STRHNE DCval,[pDelta]
+ VMOVNE dIn0[0],DCval ;// pDC != NULL: overwrite coefficient [0] with the supplied DC value
+
+
+
+ ;//*****************************************************************
+ ;// Transpose the input pixels : perform Row ops as Col ops
+ ;//*****************************************************************
+
+ VTRN dIn0,dIn1
+ VTRN dIn2,dIn3
+ VTRN qIn01,qIn23
+
+
+ VMOV dZero,#0 ;// Used to right shift by 1
+
+
+ ;//****************************************
+ ;// Row Operations (Performed on columns)
+ ;//****************************************
+
+
+ VADD de0,dIn0,dIn2 ;// e0 = d0 + d2
+ VSUB de1,dIn0,dIn2 ;// e1 = d0 - d2
+ VHADD dIn1RS,dIn1,dZero ;// d1>>1 : halving add with the zero register
+ VHADD dIn3RS,dIn3,dZero ;// d3>>1
+ VSUB de2,dIn1RS,dIn3 ;// e2 = (d1>>1) - d3
+ VADD de3,dIn1,dIn3RS ;// e3 = d1 + (d3>>1)
+ VADD df0,de0,de3 ;// f0 = e0 + e3
+ VADD df1,de1,de2 ;// f1 = e1 + e2
+ VSUB df2,de1,de2 ;// f2 = e1 - e2
+ VSUB df3,de0,de3 ;// f3 = e0 - e3
+
+
+
+ ;//*****************************************************************
+ ;// Transpose the resultant matrix
+ ;//*****************************************************************
+
+ VTRN df0,df1
+ VTRN df2,df3
+ VTRN qf01,qf23
+
+
+ ;//*******************************
+ ;// Column Operations
+ ;//*******************************
+
+
+ VADD dg0,df0,df2 ;// g0 = f0 + f2
+ VSUB dg1,df0,df2 ;// g1 = f0 - f2
+ VHADD df1RS,df1,dZero ;// f1>>1 : halving add with the zero register
+ VHADD df3RS,df3,dZero ;// f3>>1
+ VSUB dg2,df1RS,df3 ;// g2 = (f1>>1) - f3
+ VADD dg3,df1,df3RS ;// g3 = f1 + (f3>>1)
+ VADD dh0,dg0,dg3 ;// h0 = g0 + g3
+ VADD dh1,dg1,dg2 ;// h1 = g1 + g2
+ VSUB dh2,dg1,dg2 ;// h2 = g1 - g2
+ VSUB dh3,dg0,dg3 ;// h3 = g0 - g3
+
+
+ ;//************************************************
+ ;// Calculate final value (colOp[i][j] + 32)>>6
+ ;//************************************************
+
+ VRSHR dh0,#6
+ VRSHR dh1,#6
+ VRSHR dh2,#6
+ VRSHR dh3,#6
+
+
+ B OutDCcase ;// D0-D3 now hold the residual rows; skip the DC-only path
+
+
+DCcase
+ ;// Calculate the Transformed DCvalue : (DCval+32)>>6
+ LDRSH DCval,[pDCTemp] ;// NOTE(review): pDC is dereferenced here without a NULL check - confirm callers never pass AC == 0 with pDC == NULL
+ ADD DCval,DCval,#32
+ ASR DCval,DCval,#6
+
+ VDUP dDeltaRow0, DCval ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval
+ VDUP dDeltaRow1, DCval ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval
+ VDUP dDeltaRow2, DCval ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval
+ VDUP dDeltaRow3, DCval ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
+
+
+OutDCcase
+ M_LDR predstep,predStepOnStack
+ M_LDR dstStep,dstStepOnStack
+ ;// Fetch 4 prediction bytes per row, two rows per D register
+ LDR PredVal1,[pPredTemp],predstep
+ LDR PredVal2,[pPredTemp],predstep
+ VMOV dPredValRow01,PredVal1,PredVal2
+
+ LDR PredVal1,[pPredTemp],predstep
+ LDR PredVal2,[pPredTemp]
+ VMOV dPredValRow23,PredVal1,PredVal2
+
+ ;// Widen the prediction bytes, add the 16-bit residual, then narrow with unsigned saturation
+ VADDW qSumRow01,qDeltaRow01,dPredValRow01
+ VADDW qSumRow23,qDeltaRow23,dPredValRow23
+ VQMOVUN dDstRow01,qSumRow01
+ VQMOVUN dDstRow23,qSumRow23
+
+ ;// Store one 32-bit lane (4 pixels) per destination row
+ VST1 dDstRow0,[pDstTemp],dstStep
+ VST1 dDstRow1,[pDstTemp],dstStep
+ VST1 dDstRow2,[pDstTemp],dstStep
+ VST1 dDstRow3,[pDstTemp]
+
+ ;// Set return value
+ MOV result,#OMX_Sts_NoErr
+
+End
+
+
+ ;// Write function tail
+
+ M_END
+
+ ENDIF ;//CORTEXA8
+
+
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
new file mode 100644
index 0000000..4606197
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
@@ -0,0 +1,202 @@
+;//
+;//
+;// File Name: omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+
+ IMPORT armVCM4P10_DeblockingChromabSGE4_unsafe
+ IMPORT armVCM4P10_DeblockingChromabSLT4_unsafe
+
+LOOP_COUNT EQU 0x40000000 ;// doubled with ADDS each pass: 0x80000000, then 0 -> exactly two passes of LoopY
+MASK_3 EQU 0x03030303 ;// low two bits of each of the four bS bytes
+MASK_4 EQU 0x04040404 ;// bit 2 of each of the four bS bytes
+
+;// Function arguments
+
+pSrcDst RN 0 ;// r0
+srcdstStep RN 1 ;// r1
+pAlpha RN 2 ;// r2
+pBeta RN 3 ;// r3
+
+pThresholds RN 5 ;// loaded from the stack argument ppThresholds
+pBS RN 4 ;// loaded from the stack argument ppBS
+bS3210 RN 6 ;// four bS bytes loaded as one word
+
+;// Loop
+
+XY RN 7 ;// loop counter, see LOOP_COUNT
+
+;// Pixels
+dP_0 DN D4.U8 ;// one 8-byte row each
+dP_1 DN D5.U8
+dP_2 DN D6.U8
+dQ_0 DN D8.U8
+dQ_1 DN D9.U8
+dQ_2 DN D10.U8
+
+;// Filtering Decision
+dAlpha DN D0.U8 ;// alpha replicated to all lanes
+dBeta DN D2.U8 ;// beta replicated to all lanes
+
+dFilt DN D16.U8 ;// per-pixel "filter this sample" mask
+dAqflg DN D12.U8 ;// shares D12 with dAp1p0
+dApflg DN D17.U8 ;// shares D17 with dAq2q0
+
+dAp0q0 DN D13.U8 ;// |p0 - q0|
+dAp1p0 DN D12.U8 ;// |p1 - p0|, later max(|q1-q0|,|p1-p0|) and its beta comparison
+dAq1q0 DN D18.U8 ;// |q1 - q0|
+dAp2p0 DN D19.U8 ;// |p2 - p0|
+dAq2q0 DN D17.U8 ;// |q2 - q0|
+
+qBS3210 QN Q13.U16 ;// bS bytes widened to 16 bits (Q13 = D26:D27)
+dBS3210 DN D26 ;// low half of Q13
+dMask_bs DN D27 ;// high half of Q13, overwritten with the bS > 0 mask
+dFilt_bs DN D26.U16 ;// D26 reused as the "bS has bit 2 set" mask
+
+;// bSLT4
+dMask_0 DN D14.U8 ;// constant 0 in every lane
+dMask_1 DN D15.U8 ;// constant 1 in every lane
+dMask_4 DN D1.U16 ;// constant 4 in every 16-bit lane
+
+Mask_4 RN 8 ;// holds MASK_4
+Mask_3 RN 9 ;// holds MASK_3
+
+dTemp DN D19.U8 ;// not referenced in this file; presumably used by the *_unsafe helpers
+
+;// Result
+dP_0t DN D13.U8 ;// not written in this file; presumably produced by the bSGE4 helper - confirm
+dQ_0t DN D31.U8
+
+dP_0n DN D29.U8 ;// final P_0 row that is stored
+dQ_0n DN D24.U8 ;// final Q_0 row that is stored
+
+
+ ;// Function header
+ M_START omxVCM4P10_FilterDeblockingChroma_HorEdge_I, r9, d15
+ ;// Filters two horizontal edges, 8 pixels wide each; only the P_0 and Q_0 rows are written back.
+ ;//Arguments on the stack
+ M_ARG ppThresholds, 4
+ M_ARG ppBS, 4
+
+ ;// d0-dAlpha_0
+ ;// d2-dBeta_0
+
+ ;load alpha1,beta1 somewhere to avoid more loads
+ VLD1 {dAlpha[]}, [pAlpha]! ;// first alpha replicated to all lanes; pAlpha now points at the second alpha
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #1 ;// together with the next SUB: pSrcDst -= 3*srcdstStep (row of P_2)
+ SUB pSrcDst, pSrcDst, srcdstStep
+ VLD1 {dBeta[]}, [pBeta]! ;// first beta replicated; pBeta now points at the second beta
+
+ M_LDR pBS, ppBS
+ M_LDR pThresholds, ppThresholds
+
+ LDR Mask_3, =MASK_3
+ LDR Mask_4, =MASK_4
+
+ VMOV dMask_0, #0
+ VMOV dMask_1, #1
+ VMOV dMask_4, #4
+
+ LDR XY, =LOOP_COUNT
+
+ ;// p0-p3 - d4-d7
+ ;// q0-q3 - d8-d11
+LoopY
+ LDR bS3210, [pBS], #8 ;// four bS bytes for this edge; pBS advances by 8
+ ;// Load six 8-byte rows P_2..Q_2; pSrcDst ends 6 rows below its starting row
+ VLD1 dP_2, [pSrcDst], srcdstStep
+ ;1
+ VLD1 dP_1, [pSrcDst], srcdstStep
+ CMP bS3210, #0 ;// flags consumed by BEQ NoFilterBS0 below
+ VLD1 dP_0, [pSrcDst], srcdstStep
+ ;1
+ VLD1 dQ_0, [pSrcDst], srcdstStep
+ VABD dAp2p0, dP_2, dP_0
+ VLD1 dQ_1, [pSrcDst], srcdstStep
+ VABD dAp0q0, dP_0, dQ_0
+ VLD1 dQ_2, [pSrcDst], srcdstStep
+ BEQ NoFilterBS0 ;// all four bS are zero: nothing to filter on this edge
+
+ VABD dAp1p0, dP_1, dP_0
+ VABD dAq1q0, dQ_1, dQ_0
+
+ VCGT dFilt, dAlpha, dAp0q0 ;// |p0-q0| < alpha
+ VMOV.U32 dBS3210[0], bS3210
+ VMAX dAp1p0, dAq1q0, dAp1p0
+ VMOVL qBS3210, dBS3210.U8 ;// widen each bS byte to 16 bits: one lane covers two pixels
+ VABD dAq2q0, dQ_2, dQ_0
+ VCGT dMask_bs.S16, dBS3210.S16, #0 ;// lanes whose bS is greater than 0
+
+ VCGT dAp1p0, dBeta, dAp1p0 ;// max(|q1-q0|,|p1-p0|) < beta
+ VCGT dAp2p0, dBeta, dAp2p0
+
+ VAND dFilt, dMask_bs.U8
+
+ TST bS3210, Mask_3 ;// flags for BLNE below: any bS with one of its low two bits set
+
+ VCGT dAq2q0, dBeta, dAq2q0
+ VAND dFilt, dFilt, dAp1p0
+
+ VAND dAqflg, dFilt, dAq2q0
+ VAND dApflg, dFilt, dAp2p0
+
+ ;// bS < 4 Filtering
+ BLNE armVCM4P10_DeblockingChromabSLT4_unsafe
+
+ TST bS3210, Mask_4 ;// flags for the second BLNE: any bS with bit 2 set
+
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 ;// back 4 rows: pSrcDst now addresses the P_0 row
+ VTST dFilt_bs, dFilt_bs, dMask_4 ;// all-ones in lanes whose bS has bit 2 set
+
+ ;// bS == 4 Filtering
+ BLNE armVCM4P10_DeblockingChromabSGE4_unsafe
+
+ VBIT dP_0n, dP_0t, dFilt_bs ;// take the dP_0t/dQ_0t values in lanes selected by dFilt_bs
+ VBIT dQ_0n, dQ_0t, dFilt_bs
+
+ VBIF dP_0n, dP_0, dFilt ;// keep the original pixels wherever dFilt is false
+ VBIF dQ_0n, dQ_0, dFilt
+
+ ;// Result Storage
+ VST1 dP_0n, [pSrcDst], srcdstStep
+ ADDS XY, XY, XY ;// advance the loop counter; Z is set after the second pass
+ VST1 dQ_0n, [pSrcDst], srcdstStep
+ ;// NOTE(review): alpha/beta are not reloaded on this path; presumably the helpers reload them via pAlpha/pBeta - confirm
+ BNE LoopY
+
+ MOV r0, #OMX_Sts_NoErr
+
+ M_EXIT
+
+NoFilterBS0
+ ;// Unfiltered edge: switch to the second alpha/beta and step the pointers to the next edge
+ VLD1 {dAlpha[]}, [pAlpha]
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #1 ;// same final position as the filtered path (6 rows down, 2 back)
+ ADDS XY, XY, XY
+ VLD1 {dBeta[]}, [pBeta]
+ ADD pThresholds, pThresholds, #4 ;// skip the four threshold bytes of this edge
+ BNE LoopY
+
+ MOV r0, #OMX_Sts_NoErr
+ M_END
+
+ ENDIF
+
+
+ END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
new file mode 100644
index 0000000..18e6c1d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
@@ -0,0 +1,282 @@
+;//
+;//
+;// File Name: omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+
+ IMPORT armVCM4P10_DeblockingChromabSGE4_unsafe
+ IMPORT armVCM4P10_DeblockingChromabSLT4_unsafe
+
+LOOP_COUNT EQU 0x40000000 ;// doubled with ADDS each pass: 0x80000000, then 0 -> exactly two passes of LoopY
+MASK_3 EQU 0x03030303 ;// low two bits of each of the four bS bytes
+MASK_4 EQU 0x04040404 ;// bit 2 of each of the four bS bytes
+
+;// Function arguments
+
+pSrcDst RN 0 ;// r0
+srcdstStep RN 1 ;// r1
+pAlpha RN 2 ;// r2
+pBeta RN 3 ;// r3
+
+pThresholds RN 5 ;// loaded from the stack argument ppThresholds
+pBS RN 4 ;// loaded from the stack argument ppBS
+bS3210 RN 6 ;// four bS bytes loaded as one word
+pSrcDst_P RN 10 ;// store pointer for the P_0 column (shares r10 with pTmp)
+pSrcDst_Q RN 12 ;// store pointer for the Q_0 column (shares r12 with pTmp2)
+
+pTmp RN 10 ;// second row pointer, one row below the first
+pTmp2 RN 12
+step RN 14 ;// 2 * srcdstStep
+
+;// Loop
+
+XY RN 7 ;// loop counter, see LOOP_COUNT
+
+;// Rows input
+dRow0 DN D7.U8 ;// rows as loaded; the register numbers are chosen so that after the
+dRow1 DN D8.U8 ;// in-place transpose each register matches a dP_x / dQ_x alias below
+dRow2 DN D5.U8
+dRow3 DN D10.U8
+dRow4 DN D6.U8
+dRow5 DN D9.U8
+dRow6 DN D4.U8
+dRow7 DN D11.U8
+
+
+;// Pixels
+dP_0 DN D4.U8 ;// same register as dRow6
+dP_1 DN D5.U8 ;// same register as dRow2
+dP_2 DN D6.U8 ;// same register as dRow4
+dQ_0 DN D8.U8 ;// same register as dRow1
+dQ_1 DN D9.U8 ;// same register as dRow5
+dQ_2 DN D10.U8 ;// same register as dRow3
+
+;// Filtering Decision
+dAlpha DN D0.U8 ;// alpha replicated to all lanes
+dBeta DN D2.U8 ;// beta replicated to all lanes
+
+dFilt DN D16.U8 ;// per-pixel "filter this sample" mask
+dAqflg DN D12.U8 ;// shares D12 with dAp1p0
+dApflg DN D17.U8 ;// shares D17 with dAq2q0
+
+dAp0q0 DN D13.U8 ;// |p0 - q0|
+dAp1p0 DN D12.U8 ;// |p1 - p0|, later max(|q1-q0|,|p1-p0|) and its beta comparison
+dAq1q0 DN D18.U8 ;// |q1 - q0|
+dAp2p0 DN D19.U8 ;// |p2 - p0|
+dAq2q0 DN D17.U8 ;// |q2 - q0|
+
+qBS3210 QN Q13.U16 ;// bS bytes widened to 16 bits (Q13 = D26:D27)
+dBS3210 DN D26 ;// low half of Q13
+dMask_bs DN D27 ;// high half of Q13, overwritten with the bS > 0 mask
+dFilt_bs DN D26.U16 ;// D26 reused as the "bS has bit 2 set" mask
+
+;// bSLT4
+dMask_0 DN D14.U8 ;// constant 0 in every lane
+dMask_1 DN D15.U8 ;// constant 1 in every lane
+dMask_4 DN D1.U16 ;// constant 4 in every 16-bit lane
+
+Mask_4 RN 8 ;// holds MASK_4
+Mask_3 RN 9 ;// holds MASK_3
+
+dTemp DN D19.U8 ;// not referenced in this file; presumably used by the *_unsafe helpers
+
+;// Result
+dP_0t DN D13.U8 ;// not written in this file; presumably produced by the bSGE4 helper - confirm
+dQ_0t DN D31.U8
+
+dP_0n DN D29.U8 ;// final P_0 column that is stored
+dQ_0n DN D24.U8 ;// final Q_0 column that is stored
+
+
+ ;// Function header
+ M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15
+ ;// Filters two vertical edges, 8 rows high each: rows are loaded, transposed, filtered, and the P_0/Q_0 columns stored byte by byte.
+ ;//Arguments on the stack
+ M_ARG ppThresholds, 4
+ M_ARG ppBS, 4
+
+ ;// d0-dAlpha_0
+ ;// d2-dBeta_0
+
+ ;load alpha1,beta1 somewhere to avoid more loads
+ VLD1 {dAlpha[]}, [pAlpha]! ;// first alpha replicated to all lanes; pAlpha now points at the second alpha
+ SUB pSrcDst, pSrcDst, #4 ;// start 4 pixels left of the edge so that each 8-byte row holds p3..p0 q0..q3
+ VLD1 {dBeta[]}, [pBeta]! ;// first beta replicated; pBeta now points at the second beta
+
+ M_LDR pBS, ppBS
+ M_LDR pThresholds, ppThresholds
+
+ LDR Mask_4, =MASK_4
+ LDR Mask_3, =MASK_3
+
+ ;dMask_0-14
+ ;dMask_1-15
+ ;dMask_4-1 (the alias is D1.U16)
+
+ VMOV dMask_0, #0
+ VMOV dMask_1, #1
+ VMOV dMask_4, #4
+
+ LDR XY, =LOOP_COUNT
+
+ ;// p0-p3 - d4-d7
+ ;// q0-q3 - d8-d11
+
+
+LoopY
+ LDR bS3210, [pBS], #8 ;// four bS bytes for this edge; pBS advances by 8
+ ADD pTmp, pSrcDst, srcdstStep ;// pTmp walks the odd rows, pSrcDst the even rows
+ ADD step, srcdstStep, srcdstStep
+
+ ;1
+ VLD1 dRow0, [pSrcDst], step
+ ;1
+ VLD1 dRow1, [pTmp], step
+ VLD1 dRow2, [pSrcDst], step
+ VLD1 dRow3, [pTmp], step
+ VLD1 dRow4, [pSrcDst], step
+ VLD1 dRow5, [pTmp], step
+ VLD1 dRow6, [pSrcDst], step
+ VLD1 dRow7, [pTmp], step
+ ;// pSrcDst is now 8 rows below its starting row
+
+ ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
+ ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
+ ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
+ ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
+ ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
+ ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
+ ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
+ ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
+
+ ;// 8x8 Transpose
+ VZIP.8 dRow0, dRow1
+ VZIP.8 dRow2, dRow3
+ VZIP.8 dRow4, dRow5
+ VZIP.8 dRow6, dRow7
+
+ VZIP.16 dRow0, dRow2
+ VZIP.16 dRow1, dRow3
+ VZIP.16 dRow4, dRow6
+ VZIP.16 dRow5, dRow7
+
+ VZIP.32 dRow0, dRow4
+ VZIP.32 dRow2, dRow6
+ VZIP.32 dRow3, dRow7
+ VZIP.32 dRow1, dRow5
+ ;// From here on the same registers are addressed through the dP_x / dQ_x aliases
+
+ ;Realign the pointers
+
+ CMP bS3210, #0 ;// flags consumed by BEQ NoFilterBS0 below
+ VABD dAp2p0, dP_2, dP_0
+ VABD dAp0q0, dP_0, dQ_0
+ BEQ NoFilterBS0 ;// all four bS are zero: nothing to filter on this edge
+
+ VABD dAp1p0, dP_1, dP_0
+ VABD dAq1q0, dQ_1, dQ_0
+
+ VMOV.U32 dBS3210[0], bS3210
+ VCGT dFilt, dAlpha, dAp0q0 ;// |p0-q0| < alpha
+ VMAX dAp1p0, dAq1q0, dAp1p0
+ VMOVL qBS3210, dBS3210.U8 ;// widen each bS byte to 16 bits: one lane covers two pixels
+ VABD dAq2q0, dQ_2, dQ_0
+ VCGT dMask_bs.S16, dBS3210.S16, #0 ;// lanes whose bS is greater than 0
+
+ VCGT dAp1p0, dBeta, dAp1p0 ;// max(|q1-q0|,|p1-p0|) < beta
+ VCGT dAp2p0, dBeta, dAp2p0
+ VAND dFilt, dMask_bs.U8
+
+ TST bS3210, Mask_3 ;// flags for BLNE below: any bS with one of its low two bits set
+
+ VCGT dAq2q0, dBeta, dAq2q0
+ VAND dFilt, dFilt, dAp1p0
+
+ VAND dAqflg, dFilt, dAq2q0
+ VAND dApflg, dFilt, dAp2p0
+
+ ;// bS < 4 Filtering
+ BLNE armVCM4P10_DeblockingChromabSLT4_unsafe
+
+ TST bS3210, Mask_4 ;// flags for the second BLNE: any bS with bit 2 set
+
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// back 8 rows: pSrcDst addresses row 0 again
+ VTST dFilt_bs, dFilt_bs, dMask_4 ;// all-ones in lanes whose bS has bit 2 set
+
+ ;// bS == 4 Filtering
+ BLNE armVCM4P10_DeblockingChromabSGE4_unsafe
+
+ VBIT dP_0n, dP_0t, dFilt_bs ;// take the dP_0t/dQ_0t values in lanes selected by dFilt_bs
+ VBIT dQ_0n, dQ_0t, dFilt_bs
+
+ ;// Result Storage
+ ADD pSrcDst_P, pSrcDst, #3 ;// byte offset 3 in each row = P_0 column
+ VBIF dP_0n, dP_0, dFilt ;// keep the original pixels wherever dFilt is false
+
+ ADD pTmp2, pSrcDst_P, srcdstStep
+ ADD step, srcdstStep, srcdstStep
+ VBIF dQ_0n, dQ_0, dFilt
+
+ ADDS XY, XY, XY ;// flags consumed by BNE LoopY below; nothing in between sets flags
+
+ VST1 {dP_0n[0]}, [pSrcDst_P], step
+ VST1 {dP_0n[1]}, [pTmp2], step
+ VST1 {dP_0n[2]}, [pSrcDst_P], step
+ VST1 {dP_0n[3]}, [pTmp2], step
+ VST1 {dP_0n[4]}, [pSrcDst_P], step
+ VST1 {dP_0n[5]}, [pTmp2], step
+ VST1 {dP_0n[6]}, [pSrcDst_P], step
+ VST1 {dP_0n[7]}, [pTmp2], step
+
+ ADD pSrcDst_Q, pSrcDst, #4 ;// byte offset 4 in each row = Q_0 column
+ ADD pTmp, pSrcDst_Q, srcdstStep
+
+ VST1 {dQ_0n[0]}, [pSrcDst_Q], step
+ VST1 {dQ_0n[1]}, [pTmp], step
+ VST1 {dQ_0n[2]}, [pSrcDst_Q], step
+ VST1 {dQ_0n[3]}, [pTmp], step
+ VST1 {dQ_0n[4]}, [pSrcDst_Q], step
+ VST1 {dQ_0n[5]}, [pTmp], step
+ VST1 {dQ_0n[6]}, [pSrcDst_Q], step
+ VST1 {dQ_0n[7]}, [pTmp], step
+
+ ADD pSrcDst, pSrcDst, #4 ;// next edge lies 4 pixels to the right
+
+ BNE LoopY
+
+ MOV r0, #OMX_Sts_NoErr
+
+ M_EXIT
+
+NoFilterBS0
+ VLD1 {dAlpha[]}, [pAlpha] ;// unfiltered edge: switch to the second alpha/beta
+ ADD pSrcDst, pSrcDst, #4 ;// 4 pixels right and 8 rows back up: start of the next edge
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #3
+ ADDS XY, XY, XY
+ VLD1 {dBeta[]}, [pBeta]
+ ADD pThresholds, pThresholds, #4 ;// skip the four threshold bytes of this edge
+ BNE LoopY
+
+ MOV r0, #OMX_Sts_NoErr
+
+ M_END
+
+ ENDIF
+
+
+ END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
new file mode 100755
index 0000000..0c3f4f2
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
@@ -0,0 +1,288 @@
+;//
+;//
+;// File Name: omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ IMPORT armVCM4P10_DeblockingLumabSLT4_unsafe
+ IMPORT armVCM4P10_DeblockingLumabSGE4_unsafe
+
+ IF CortexA8
+
+LOOP_COUNT EQU 0x55000000 ;// doubled with ADDS: carry-out ends LoopX after every 2nd pass, zero ends LoopY after 4 outer passes
+
+
+;// Function arguments
+
+pSrcDst RN 0 ;// r0
+srcdstStep RN 1 ;// r1
+pAlpha RN 2 ;// r2
+pBeta RN 3 ;// r3
+
+pThresholds RN 5 ;// loaded from the stack argument ppThresholds
+pBS RN 4 ;// loaded from the stack argument ppBS
+bS10 RN 12 ;// two bS bytes loaded as one halfword
+
+pAlpha_0 RN 2 ;// same register as pAlpha
+pBeta_0 RN 3 ;// same register as pBeta
+
+pAlpha_1 RN 7 ;// pAlpha_0 + 1
+pBeta_1 RN 8 ;// pBeta_0 + 1
+
+
+
+;// Loop
+
+XY RN 9 ;// loop counter, see LOOP_COUNT
+
+pTmp RN 6 ;// second row pointer / saved row address
+step RN 10 ;// 2 * srcdstStep
+
+;// Pixels
+dP_0 DN D4.U8 ;// one 8-byte row each
+dP_1 DN D5.U8
+dP_2 DN D6.U8
+dP_3 DN D7.U8
+dQ_0 DN D8.U8
+dQ_1 DN D9.U8
+dQ_2 DN D10.U8
+dQ_3 DN D11.U8
+
+
+;// Filtering Decision
+dAlpha DN D0.U8 ;// alpha replicated to all lanes
+dBeta DN D2.U8 ;// beta replicated to all lanes
+
+dFilt DN D16.U8 ;// per-pixel "filter this sample" mask
+dAqflg DN D12.U8 ;// shares D12 with dAp1p0
+dApflg DN D17.U8 ;// shares D17 with dAq2q0
+
+dAp0q0 DN D13.U8 ;// |p0 - q0|
+dAp1p0 DN D12.U8 ;// |p1 - p0|, later max(|q1-q0|,|p1-p0|) and its beta comparison
+dAq1q0 DN D18.U8 ;// |q1 - q0|
+dAp2p0 DN D19.U8 ;// |p2 - p0|
+dAq2q0 DN D17.U8 ;// |q2 - q0|
+
+;// bSLT4
+dTC0 DN D18.U8 ;// the aliases from here to dHSq0p1/qTemp2 are not referenced by the code in
+dTC1 DN D19.U8 ;// this file; presumably they mirror the register usage of the *_unsafe helpers - confirm
+dTC01 DN D18.U8
+
+dTCs DN D31.S8
+dTC DN D31.U8
+
+dMask_0 DN D14.U8 ;// constant 0 in every lane (set up in this file)
+dMask_1 DN D15.U8 ;// constant 1 in every lane (set up in this file)
+
+Mask_0 RN 11 ;// ARM register holding 0, used to clear halves of dFilt
+
+dTemp DN D19.U8
+
+;// Computing P0,Q0
+qDq0p0 QN Q10.S16
+qDp1q1 QN Q11.S16
+qDelta QN Q10.S16 ; reuse qDq0p0
+dDelta DN D20.S8
+
+
+;// Computing P1,Q1
+dRp0q0 DN D24.U8
+
+dMaxP DN D23.U8
+dMinP DN D22.U8
+
+dMaxQ DN D19.U8
+dMinQ DN D21.U8
+
+dDeltaP DN D26.U8
+dDeltaQ DN D27.U8
+
+qP_0n QN Q14.S16
+qQ_0n QN Q12.S16
+
+dQ_0n DN D24.U8 ;// output rows stored by this file after the helper call
+dQ_1n DN D25.U8
+dP_0n DN D29.U8
+dP_1n DN D30.U8
+
+;// bSGE4
+
+qSp0q0 QN Q10.U16
+
+qSp2q1 QN Q11.U16
+qSp0q0p1 QN Q12.U16
+qSp3p2 QN Q13.U16
+dHSp0q1 DN D28.U8
+
+qSq2p1 QN Q11.U16
+qSp0q0q1 QN Q12.U16
+qSq3q2 QN Q13.U16 ;!!
+dHSq0p1 DN D28.U8 ;!!
+
+qTemp1 QN Q11.U16 ;!!;qSp2q1
+qTemp2 QN Q12.U16 ;!!;qSp0q0p1
+
+dP_0t DN D28.U8 ;!!;dHSp0q1
+dQ_0t DN D22.U8 ;!!;Temp1
+;// The following repeat the bSLT4 output aliases with identical registers and add dP_2n / dQ_2n
+dP_0n DN D29.U8
+dP_1n DN D30.U8
+dP_2n DN D31.U8
+
+dQ_0n DN D24.U8 ;!!;Temp2
+dQ_1n DN D25.U8 ;!!;Temp2
+dQ_2n DN D28.U8 ;!!;dQ_0t
+
+
+ ;// Function header
+ M_START omxVCM4P10_FilterDeblockingLuma_HorEdge_I, r11, d15
+ ;// Filters four horizontal edges (LoopY), each in two 8-pixel-wide halves (LoopX).
+ ;//Arguments on the stack
+ M_ARG ppThresholds, 4
+ M_ARG ppBS, 4
+
+ ;// d0-dAlpha_0
+ ;// d2-dBeta_0
+
+ ADD pAlpha_1, pAlpha_0, #1 ;// address of the second alpha
+ ADD pBeta_1, pBeta_0, #1 ;// address of the second beta
+
+ VLD1 {dAlpha[]}, [pAlpha_0] ;// first alpha replicated to all lanes
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 ;// 4 rows above the edge: row of P_3
+ VLD1 {dBeta[]}, [pBeta_0] ;// first beta replicated to all lanes
+
+ M_LDR pBS, ppBS
+ M_LDR pThresholds, ppThresholds
+
+ MOV Mask_0,#0
+
+ ;dMask_0-14
+ ;dMask_1-15
+
+ VMOV dMask_0, #0
+ VMOV dMask_1, #1
+
+ ADD step, srcdstStep, srcdstStep ;// step = 2 rows
+
+ LDR XY,=LOOP_COUNT
+
+ ;// p0-p3 - d4-d7
+ ;// q0-q3 - d8-d11
+LoopY
+LoopX
+ LDRH bS10, [pBS], #2 ;// two bS bytes: one per group of 4 pixels in this 8-pixel half
+ ADD pTmp, pSrcDst, srcdstStep ;// pTmp walks the odd rows, pSrcDst the even rows
+ CMP bS10, #0
+ BEQ NoFilterBS0 ;// both bS are zero: skip this half without loading pixels
+
+ VLD1 dP_3, [pSrcDst], step
+ VLD1 dP_2, [pTmp], step
+ VLD1 dP_1, [pSrcDst], step
+ VLD1 dP_0, [pTmp], step
+ VLD1 dQ_0, [pSrcDst], step
+ VABD dAp1p0, dP_0, dP_1
+ VLD1 dQ_1, [pTmp]
+ VABD dAp0q0, dQ_0, dP_0
+ VLD1 dQ_2, [pSrcDst], srcdstStep ;// pSrcDst is now 7 rows below P_3 (row of Q_3)
+
+ VABD dAq1q0, dQ_1, dQ_0
+ VABD dAp2p0, dP_2, dP_0
+ VCGT dFilt, dAlpha, dAp0q0 ;// |p0-q0| < alpha
+
+ TST bS10, #0xff ;// first bS byte zero?
+ VMAX dAp1p0, dAq1q0, dAp1p0
+ VABD dAq2q0, dQ_2, dQ_0
+
+ VMOVEQ.U32 dFilt[0], Mask_0 ;// yes: clear the filter mask for pixels 0-3
+ TST bS10, #0xff00 ;// second bS byte zero?
+
+ VCGT dAp2p0, dBeta, dAp2p0
+ VCGT dAp1p0, dBeta, dAp1p0 ;// max(|q1-q0|,|p1-p0|) < beta
+
+ VMOVEQ.U32 dFilt[1], Mask_0 ;// yes: clear the filter mask for pixels 4-7
+
+ VCGT dAq2q0, dBeta, dAq2q0
+ VLD1 dQ_3, [pSrcDst]
+ VAND dFilt, dFilt, dAp1p0
+ TST bS10, #4 ;// only bit 2 of the first bS byte selects the strong filter
+
+ VAND dAqflg, dFilt, dAq2q0
+ VAND dApflg, dFilt, dAp2p0
+
+ BNE bSGE4
+bSLT4
+ ;// bS < 4 Filtering
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 ;// back 5 rows in total: row of P_1
+ SUB pSrcDst, pSrcDst, srcdstStep
+ ;// NOTE(review): pThresholds is not advanced on this path; presumably the helper consumes and advances it - confirm
+ BL armVCM4P10_DeblockingLumabSLT4_unsafe
+
+ ;// Result Storage
+ VST1 dP_1n, [pSrcDst], srcdstStep
+ VST1 dP_0n, [pSrcDst], srcdstStep
+ SUB pTmp, pSrcDst, srcdstStep, LSL #2 ;// pTmp = row of P_3 for this half
+ VST1 dQ_0n, [pSrcDst], srcdstStep
+ ADDS XY, XY, XY ;// C and Z are consumed by BCC below and by BNE at ExitLoopY
+ VST1 dQ_1n, [pSrcDst]
+ ADD pSrcDst, pTmp, #8 ;// next half: same rows, 8 pixels to the right
+
+ BCC LoopX
+ B ExitLoopY
+
+NoFilterBS0
+ ADD pSrcDst, pSrcDst, #8 ;// next half: 8 pixels to the right
+ ADDS XY, XY, XY
+ ADD pThresholds, pThresholds, #2 ;// skip the two threshold bytes of this half
+ BCC LoopX
+ B ExitLoopY
+bSGE4
+ ;// bS >= 4 Filtering
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 ;// back 6 rows in total: row of P_2
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #1
+ BL armVCM4P10_DeblockingLumabSGE4_unsafe
+
+ ;// Result Storage
+ VST1 dP_2n, [pSrcDst], srcdstStep
+ VST1 dP_1n, [pSrcDst], srcdstStep
+ VST1 dP_0n, [pSrcDst], srcdstStep
+ SUB pTmp, pSrcDst, srcdstStep, LSL #2 ;// pTmp = row of P_3 for this half
+ VST1 dQ_0n, [pSrcDst], srcdstStep
+ ADDS XY,XY,XY ;// C and Z are consumed by BCC below and by BNE at ExitLoopY
+ VST1 dQ_1n, [pSrcDst], srcdstStep
+ ADD pThresholds, pThresholds, #2 ;// skip the two threshold bytes of this half
+ VST1 dQ_2n, [pSrcDst]
+
+ ADD pSrcDst, pTmp, #8 ;// next half: same rows, 8 pixels to the right
+ BCC LoopX
+
+ExitLoopY
+ ;// Both halves done: rewind 16 pixels, move down 4 rows, and use the second alpha/beta from now on
+ SUB pSrcDst, pSrcDst, #16
+ VLD1 {dAlpha[]}, [pAlpha_1]
+ ADD pSrcDst, pSrcDst, srcdstStep, LSL #2
+ VLD1 {dBeta[]}, [pBeta_1]
+ BNE LoopY ;// Z comes from the last ADDS XY; nothing since then sets flags
+
+ MOV r0, #OMX_Sts_NoErr
+
+ M_END
+
+ ENDIF
+
+
+
+
+ END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
new file mode 100755
index 0000000..e6fbb34
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
@@ -0,0 +1,436 @@
+;//
+;//
+;// File Name: omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ IMPORT armVCM4P10_DeblockingLumabSLT4_unsafe
+ IMPORT armVCM4P10_DeblockingLumabSGE4_unsafe
+
+ IF CortexA8
+
+LOOP_COUNT EQU 0x11000000 ;// doubled with ADDS: carry-out ends LoopX after every 4th pass, zero ends LoopY after 2 outer passes
+
+
+;// Function arguments
+
+pSrcDst RN 0 ;// r0
+srcdstStep RN 1 ;// r1
+pAlpha RN 2 ;// r2
+pBeta RN 3 ;// r3
+
+pThresholds RN 5 ;// loaded from the stack argument ppThresholds
+pBS RN 4 ;// loaded from the stack argument ppBS
+bS10 RN 12 ;// two bS bytes loaded as one halfword
+
+pAlpha_0 RN 2 ;// same register as pAlpha
+pBeta_0 RN 3 ;// same register as pBeta
+
+pAlpha_1 RN 7 ;// pAlpha_0 + 1
+pBeta_1 RN 8 ;// pBeta_0 + 1
+
+pTmp RN 10 ;// second row pointer, one row below pSrcDst
+pTmpStep RN 11 ;// 2 * srcdstStep
+
+;// Loop
+
+XY RN 9 ;// loop counter, see LOOP_COUNT
+
+;// Rows input
+dRow0 DN D7.U8 ;// rows as loaded; register numbers are chosen so that after the
+dRow1 DN D8.U8 ;// in-place transpose each register matches a dP_x / dQ_x alias
+dRow2 DN D5.U8
+dRow3 DN D10.U8
+dRow4 DN D6.U8
+dRow5 DN D9.U8
+dRow6 DN D4.U8
+dRow7 DN D11.U8
+
+;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
+;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
+
+;// Rows output
+dRown0 DN D7.U8 ;// dRown*: rows stored after the bS < 4 path
+dRown1 DN D24.U8
+dRown2 DN D30.U8
+dRown3 DN D10.U8
+dRown4 DN D6.U8
+dRown5 DN D25.U8
+dRown6 DN D29.U8
+dRown7 DN D11.U8
+
+;// dP_0n DN D29.U8
+;// dP_1n DN D30.U8
+;// dP_2n DN D31.U8
+;//
+;// dQ_0n DN D24.U8 ;!!;Temp2
+;// dQ_1n DN D25.U8 ;!!;Temp2
+;// dQ_2n DN D28.U8 ;!!;dQ_0t
+;//
+;// dRown0 - dP_3, dRown1 - dQ_0n
+;// dRown2 - dP_1n, dRown3 - dQ_2
+;// dRown4 - dP_2, dRown5 - dQ_1n
+;// dRown6 - dP_0n, dRown7 - dQ_3
+
+dRow0n DN D7.U8 ;// dRow*n: rows stored after the bS >= 4 path
+dRow1n DN D24.U8
+dRow2n DN D30.U8
+dRow3n DN D28.U8
+dRow4n DN D31.U8
+dRow5n DN D25.U8
+dRow6n DN D29.U8
+dRow7n DN D11.U8
+
+;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
+;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
+
+;// Pixels
+dP_0 DN D4.U8 ;// one column of 8 pixels each, valid after the transpose
+dP_1 DN D5.U8
+dP_2 DN D6.U8
+dP_3 DN D7.U8
+dQ_0 DN D8.U8
+dQ_1 DN D9.U8
+dQ_2 DN D10.U8
+dQ_3 DN D11.U8
+
+
+;// Filtering Decision
+dAlpha DN D0.U8 ;// alpha replicated to all lanes
+dBeta DN D2.U8 ;// beta replicated to all lanes
+
+dFilt DN D16.U8 ;// per-pixel "filter this sample" mask
+dAqflg DN D12.U8 ;// shares D12 with dAp1p0
+dApflg DN D17.U8 ;// shares D17 with dAq2q0
+
+dAp0q0 DN D13.U8 ;// |p0 - q0|
+dAp1p0 DN D12.U8 ;// |p1 - p0|, later max(|q1-q0|,|p1-p0|) and its beta comparison
+dAq1q0 DN D18.U8 ;// |q1 - q0|
+dAp2p0 DN D19.U8 ;// |p2 - p0|
+dAq2q0 DN D17.U8 ;// |q2 - q0|
+
+;// bSLT4
+dTC0 DN D18.U8 ;// the aliases from here to dHSq0p1/qTemp2 are not referenced by the code in
+dTC1 DN D19.U8 ;// this file; presumably they mirror the register usage of the *_unsafe helpers - confirm
+dTC01 DN D18.U8
+
+dTCs DN D31.S8
+dTC DN D31.U8
+
+dMask_0 DN D14.U8 ;// constant 0 in every lane (set up in this file)
+dMask_1 DN D15.U8 ;// constant 1 in every lane (set up in this file)
+
+Mask_0 RN 6 ;// ARM register holding 0, used to clear halves of dFilt
+
+dTemp DN D19.U8
+
+;// Computing P0,Q0
+qDq0p0 QN Q10.S16
+qDp1q1 QN Q11.S16
+qDelta QN Q10.S16 ; reuse qDq0p0
+dDelta DN D20.S8
+
+
+;// Computing P1,Q1
+dRp0q0 DN D24.U8
+
+dMaxP DN D23.U8
+dMinP DN D22.U8
+
+dMaxQ DN D19.U8
+dMinQ DN D21.U8
+
+dDeltaP DN D26.U8
+dDeltaQ DN D27.U8
+
+qP_0n QN Q14.S16
+qQ_0n QN Q12.S16
+
+dQ_0n DN D24.U8 ;// output columns transposed and stored by this file after the helper call
+dQ_1n DN D25.U8
+dP_0n DN D29.U8
+dP_1n DN D30.U8
+
+;// bSGE4
+
+qSp0q0 QN Q10.U16
+
+qSp2q1 QN Q11.U16
+qSp0q0p1 QN Q12.U16
+qSp3p2 QN Q13.U16
+dHSp0q1 DN D28.U8
+
+qSq2p1 QN Q11.U16
+qSp0q0q1 QN Q12.U16
+qSq3q2 QN Q13.U16 ;!!
+dHSq0p1 DN D28.U8 ;!!
+
+qTemp1 QN Q11.U16 ;!!;qSp2q1
+qTemp2 QN Q12.U16 ;!!;qSp0q0p1
+
+dP_0t DN D28.U8 ;!!;dHSp0q1
+dQ_0t DN D22.U8 ;!!;Temp1
+;// The following repeat the bSLT4 output aliases with identical registers and add dP_2n / dQ_2n
+dP_0n DN D29.U8
+dP_1n DN D30.U8
+dP_2n DN D31.U8
+
+dQ_0n DN D24.U8 ;!!;Temp2
+dQ_1n DN D25.U8 ;!!;Temp2
+dQ_2n DN D28.U8 ;!!;dQ_0t
+
+
+ ;// Function header
+ M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15
+ ;// Filters four vertical edges (LoopX) in each of two 8-row bands (LoopY): load 8 rows, transpose, filter, transpose back, store.
+ ;//Arguments on the stack
+ M_ARG ppThresholds, 4
+ M_ARG ppBS, 4
+
+ ;// d0-dAlpha_0
+ ;// d2-dBeta_0
+
+ ADD pAlpha_1, pAlpha_0, #1 ;// address of the second alpha
+ ADD pBeta_1, pBeta_0, #1 ;// address of the second beta
+
+ VLD1 {dAlpha[]}, [pAlpha_0] ;// first alpha replicated to all lanes
+ SUB pSrcDst, pSrcDst, #4 ;// start 4 pixels left of the edge so that each 8-byte row holds p3..p0 q0..q3
+ VLD1 {dBeta[]}, [pBeta_0] ;// first beta replicated to all lanes
+
+ M_LDR pBS, ppBS
+ M_LDR pThresholds, ppThresholds
+
+ MOV Mask_0,#0
+
+ ;dMask_0-14
+ ;dMask_1-15
+
+ VMOV dMask_0, #0
+ VMOV dMask_1, #1
+
+ LDR XY,=LOOP_COUNT
+
+ ADD pTmpStep, srcdstStep, srcdstStep ;// pTmpStep = 2 rows
+
+ ;// p0-p3 - d4-d7
+ ;// q0-q3 - d8-d11
+LoopY
+LoopX
+ LDRH bS10, [pBS], #4 ;// two bS bytes for these 8 rows; pBS advances by 4 to the next edge
+
+ CMP bS10, #0
+ BEQ NoFilterBS0 ;// both bS are zero: skip this edge without loading pixels
+
+ ;// Load 8 rows of data
+ ADD pTmp, pSrcDst, srcdstStep
+ VLD1 dRow0, [pSrcDst], pTmpStep
+ VLD1 dRow1, [pTmp], pTmpStep
+ VLD1 dRow2, [pSrcDst], pTmpStep
+ VZIP.8 dRow0, dRow1
+ VLD1 dRow3, [pTmp], pTmpStep
+ VLD1 dRow4, [pSrcDst], pTmpStep
+ VZIP.8 dRow2, dRow3
+ VLD1 dRow5, [pTmp], pTmpStep
+ VLD1 dRow6, [pSrcDst], pTmpStep
+ VLD1 dRow7, [pTmp], pTmpStep
+ VZIP.8 dRow4, dRow5
+ VZIP.16 dRow1, dRow3
+
+ ;// (transpose steps are interleaved with the loads above; the layout below is the data as loaded)
+ ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
+ ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
+ ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
+ ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
+ ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
+ ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
+ ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
+ ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]
+
+ ;// 8x8 Transpose
+
+ VZIP.8 dRow6, dRow7
+
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// back 8 rows: pSrcDst addresses row 0 again for the stores
+ VZIP.16 dRow0, dRow2
+ VZIP.16 dRow5, dRow7
+
+
+ VZIP.16 dRow4, dRow6
+ VZIP.32 dRow1, dRow5
+ VZIP.32 dRow2, dRow6
+ VZIP.32 dRow3, dRow7
+ VZIP.32 dRow0, dRow4
+
+
+ ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
+ ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3
+
+ ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
+ ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
+ ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
+ ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]
+
+ ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
+ ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
+ ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
+ ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]
+
+ VABD dAp0q0, dP_0, dQ_0
+ VABD dAp1p0, dP_1, dP_0
+
+ VABD dAq1q0, dQ_1, dQ_0
+ VABD dAp2p0, dP_2, dP_0
+
+ TST bS10, #0xff ;// first bS byte zero?
+ VCGT dFilt, dAlpha, dAp0q0 ;// |p0-q0| < alpha
+
+ VMAX dAp1p0, dAq1q0, dAp1p0
+ VABD dAq2q0, dQ_2, dQ_0
+
+ VMOVEQ.U32 dFilt[0], Mask_0 ;// yes: clear the filter mask for rows 0-3
+ TST bS10, #0xff00 ;// second bS byte zero?
+
+ VCGT dAp2p0, dBeta, dAp2p0
+ VCGT dAp1p0, dBeta, dAp1p0 ;// max(|q1-q0|,|p1-p0|) < beta
+
+ VMOVEQ.U32 dFilt[1], Mask_0 ;// yes: clear the filter mask for rows 4-7
+
+ VCGT dAq2q0, dBeta, dAq2q0
+ VAND dFilt, dFilt, dAp1p0
+ TST bS10, #4 ;// only bit 2 of the first bS byte selects the strong filter
+
+ VAND dAqflg, dFilt, dAq2q0
+ VAND dApflg, dFilt, dAp2p0
+
+ BNE bSGE4
+bSLT4
+ ;// bS < 4 Filtering
+
+ BL armVCM4P10_DeblockingLumabSLT4_unsafe
+
+ ;// Transpose
+
+ VZIP.8 dP_3, dP_2
+ VZIP.8 dP_1n, dP_0n
+ VZIP.8 dQ_0n, dQ_1n
+ VZIP.8 dQ_2, dQ_3
+
+
+ VZIP.16 dP_3, dP_1n
+ ADD pTmp, pSrcDst, srcdstStep
+ VZIP.16 dQ_0n, dQ_2
+ VZIP.16 dQ_1n, dQ_3
+ VZIP.16 dP_2, dP_0n
+
+ VZIP.32 dP_3, dQ_0n
+ VZIP.32 dP_1n, dQ_2
+ VZIP.32 dP_2, dQ_1n
+ VZIP.32 dP_0n, dQ_3
+
+ ;// dRown0 - dP_3, dRown1 - dQ_0n
+ ;// dRown2 - dP_1n, dRown3 - dQ_2
+ ;// dRown4 - dP_2, dRown5 - dQ_1n
+ ;// dRown6 - dP_0n, dRown7 - dQ_3
+
+ VST1 dRown0, [pSrcDst], pTmpStep
+ VST1 dRown1, [pTmp], pTmpStep
+ VST1 dRown2, [pSrcDst], pTmpStep
+ VST1 dRown3, [pTmp], pTmpStep
+ ;1
+ VST1 dRown4, [pSrcDst], pTmpStep
+ VST1 dRown5, [pTmp], pTmpStep
+ ADDS XY, XY, XY ;// C and Z are consumed by BCC below and by BNE at ExitLoopY
+ VST1 dRown6, [pSrcDst], pTmpStep
+ ADD pThresholds, pThresholds, #2 ;// NOTE(review): +2 here versus +4 on the other paths; presumably the bSLT4 helper already advanced it by 2 - confirm
+ VST1 dRown7, [pTmp], srcdstStep ;// post-increment differs from the bSGE4 path, but pTmp is recomputed before its next use
+
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// back 8 rows
+ VLD1 {dAlpha[]}, [pAlpha_1] ;// the following edges of this band use the second alpha/beta
+ ADD pSrcDst, pSrcDst, #4 ;// next edge lies 4 pixels to the right
+ VLD1 {dBeta[]}, [pBeta_1]
+
+ BCC LoopX
+ B ExitLoopY
+
+NoFilterBS0
+ ADD pSrcDst, pSrcDst, #4 ;// next edge lies 4 pixels to the right
+ ADDS XY, XY, XY
+ VLD1 {dAlpha[]}, [pAlpha_1]
+ ADD pThresholds, pThresholds, #4 ;// same stride as pBS: on to the next edge's thresholds
+ VLD1 {dBeta[]}, [pBeta_1]
+ BCC LoopX
+ B ExitLoopY
+bSGE4
+ ;// bS >= 4 Filtering
+
+ BL armVCM4P10_DeblockingLumabSGE4_unsafe
+
+ ;// Transpose
+
+ VZIP.8 dP_3, dP_2n
+ VZIP.8 dP_1n, dP_0n
+ VZIP.8 dQ_0n, dQ_1n
+ VZIP.8 dQ_2n, dQ_3
+
+ VZIP.16 dP_3, dP_1n
+ ADD pTmp, pSrcDst, srcdstStep
+ VZIP.16 dQ_0n, dQ_2n
+ VZIP.16 dQ_1n, dQ_3
+ VZIP.16 dP_2n, dP_0n
+
+ VZIP.32 dP_3, dQ_0n
+ VZIP.32 dP_1n, dQ_2n
+ VZIP.32 dP_2n, dQ_1n
+ VZIP.32 dP_0n, dQ_3
+
+ ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
+ ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
+
+ VST1 dRow0n, [pSrcDst], pTmpStep
+ VST1 dRow1n, [pTmp], pTmpStep
+ VST1 dRow2n, [pSrcDst], pTmpStep
+ VST1 dRow3n, [pTmp], pTmpStep
+ VST1 dRow4n, [pSrcDst], pTmpStep
+ VST1 dRow5n, [pTmp], pTmpStep
+ ADDS XY,XY,XY ;// C and Z are consumed by BCC below and by BNE at ExitLoopY
+ VST1 dRow6n, [pSrcDst], pTmpStep
+ ADD pThresholds, pThresholds, #4 ;// same stride as pBS: on to the next edge's thresholds
+ VST1 dRow7n, [pTmp], pTmpStep
+
+ SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// back 8 rows
+ VLD1 {dAlpha[]}, [pAlpha_1] ;// the following edges of this band use the second alpha/beta
+ ADD pSrcDst, pSrcDst, #4 ;// next edge lies 4 pixels to the right
+ VLD1 {dBeta[]}, [pBeta_1]
+
+ BCC LoopX
+
+ExitLoopY
+ SUB pBS, pBS, #14 ;// four edges advanced pBS by 16; net +2 selects the bS pair of the lower 8 rows
+ SUB pThresholds, pThresholds, #14 ;// same net +2 for the thresholds
+ SUB pSrcDst, pSrcDst, #16 ;// back to the first edge ...
+ VLD1 {dAlpha[]}, [pAlpha_0] ;// ... which uses the first alpha/beta again
+ ADD pSrcDst, pSrcDst, srcdstStep, LSL #3 ;// ... 8 rows further down
+ VLD1 {dBeta[]}, [pBeta_0]
+ BNE LoopY ;// Z comes from the last ADDS XY; nothing since then sets flags
+
+ MOV r0, #OMX_Sts_NoErr
+
+ M_END
+
+ ENDIF
+
+
+ END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
new file mode 100755
index 0000000..3ce41be
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateChroma.c
@@ -0,0 +1,79 @@
+/**
+ *
+ * File Name: omxVCM4P10_InterpolateChroma.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ * Description:
+ * This function will calculate 1/8 Pixel interpolation for Chroma Block
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armVC.h"
+#include "armCOMM.h"
+
+
+/**
+ * Function: omxVCM4P10_InterpolateChroma
+ *
+ * Description:
+ * OpenMAX DL entry point for 1/8-pixel interpolation of an inter chroma block.
+ * Unpacks roi and hands every argument to armVCM4P10_Interpolate_Chroma.
+ *
+ * Parameters:
+ * [in] pSrc Pointer to the source reference frame buffer
+ * [in] srcStep Reference frame step in bytes
+ * [in] dstStep Destination frame step in bytes; must be a multiple of roi.width
+ * [in] dx Fractional part of the horizontal motion vector component,
+ * in 1/8-pixel units; valid range [0,7]
+ * [in] dy Fractional part of the vertical motion vector component,
+ * in 1/8-pixel units; valid range [0,7]
+ * [in] roi Dimensions of the interpolation region; roi.width and roi.height
+ * must each be 2, 4 or 8
+ * [out] pDst Pointer to the destination frame buffer;
+ * roi.width==2 requires 2-byte alignment,
+ * roi.width==4 requires 4-byte alignment,
+ * roi.width==8 requires 8-byte alignment
+ *
+ * Return Value:
+ * The status produced by armVCM4P10_Interpolate_Chroma, passed through as is.
+ * Per the API contract this is OMX_Sts_NoErr on success, and
+ * OMX_Sts_BadArgErr when any of the following holds:
+ * pSrc or pDst is NULL.
+ * srcStep or dstStep < 8, or not a multiple of 8.
+ * dx or dy is out of range [0-7].
+ * roi.width or roi.height is out of range {2,4,8}.
+ * pDst is not aligned to roi.width bytes (2, 4 or 8).
+ * NOTE(review): nothing is validated in this wrapper itself; whether the
+ * BadArgErr cases are detected depends on the callee - confirm.
+ *
+ */
+
+OMXResult omxVCM4P10_InterpolateChroma (
+ const OMX_U8* pSrc,
+ OMX_S32 srcStep,
+ OMX_U8* pDst,
+ OMX_S32 dstStep,
+ OMX_S32 dx,
+ OMX_S32 dy,
+ OMXSize roi
+ )
+{
+    OMX_U8 *pRef = (OMX_U8 *)pSrc; /* const dropped for the call; presumably the callee prototype is non-const */
+
+    return armVCM4P10_Interpolate_Chroma(pRef, srcStep, pDst, dstStep, roi.width, roi.height, dx, dy);
+}
+
+
+/*****************************************************************************
+ * END OF FILE
+ *****************************************************************************/
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
new file mode 100755
index 0000000..942ebc6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_InterpolateLuma_s.s
@@ -0,0 +1,553 @@
+;//
+;//
+;// File Name: omxVCM4P10_InterpolateLuma_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+;// Function:
+;// omxVCM4P10_InterpolateLuma
+;//
+;// This function implements omxVCM4P10_InterpolateLuma in ARM NEON assembly (CortexA8 variant).
+;// Performs quarter pel interpolation of inter luma MB.
+;// It's assumed that the frame is already padded when calling this function.
+;// Parameters:
+;// [in] pSrc Pointer to the source reference frame buffer
+;// [in] srcStep Reference frame step in byte
+;// [in] dstStep Destination frame step in byte. Must be multiple of roi.width
+;// [in] dx Fractional part of horizontal motion vector
+;// component in 1/4 pixel unit; valid in the range [0,3]
+;// [in] dy Fractional part of vertical motion vector
+;// component in 1/4 pixel unit; valid in the range [0,3]
+;// [in] roi Dimension of the interpolation region;the parameters roi.width and roi.height must
+;// be equal to either 4, 8, or 16.
+;// [out] pDst Pointer to the destination frame buffer.
+;// if roi.width==4, 4-byte alignment required
+;// if roi.width==8, 8-byte alignment required
+;// if roi.width==16, 16-byte alignment required
+;//
+;// Return Value:
+;// If the function runs without error, it returns OMX_Sts_NoErr.
+;// It is assumed that the following conditions are satisfied before calling this function:
+;// pSrc and pDst are not NULL.
+;// srcStep and dstStep are >= roi.width.
+;// dx and dy are in the range [0-3].
+;// roi.width and roi.height are each one of {4, 8, 16}.
+;// If roi.width is equal to 4, pDst is 4 byte aligned.
+;// If roi.width is equal to 8, pDst is 8 byte aligned.
+;// If roi.width is equal to 16, pDst is 16 byte aligned.
+;// srcStep and dstStep are multiples of 8.
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ EXPORT omxVCM4P10_InterpolateLuma
+
+
+ IF CortexA8
+ IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+ IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ ENDIF
+
+
+
+;// Declare input registers (r0-r3 are used as received; r4/r5 are loaded from the stack)
+pSrc RN 0
+srcStep RN 1
+pDst RN 2
+dstStep RN 3
+iHeight RN 4
+iWidth RN 5
+
+;// Declare other intermediate registers
+idx RN 6 ;// dx, loaded from the stack
+idy RN 7 ;// dy, loaded from the stack
+index RN 6 ;// Same register as idx: computing idx + 4*idy overwrites idx
+Temp RN 12 ;// Scratch row pointer (base + 2*step) for the interleaved stores
+pArgs RN 11 ;// Address of ppArgs, where pSrc/srcStep/pDst/dstStep are saved
+
+
+ IF CortexA8
+
+ ;// omxVCM4P10_InterpolateLuma, CortexA8 NEON variant.
+ ;// Interpolation of luma is implemented by processing a block of pixels, size 4x4, at a time.
+ ;// ppArgs is stack scratch that holds pSrc/srcStep/pDst/dstStep of the current 4x4 tile.
+ M_ALLOC4 ppArgs, 16
+
+ ;// Function header
+ M_START omxVCM4P10_InterpolateLuma, r11, d15
+
+pSrcBK RN 8 ;// Backup of pSrc while pSrc is re-aimed for a helper call
+
+;// Declare Neon registers
+dCoeff5 DN 30.S16
+dCoeff20 DN 31.S16
+
+;// Registers used for implementing Horizontal interpolation (dResultHn = dAccHn viewed as .U32)
+dSrc0c DN 14.U8
+dSrc1c DN 16.U8
+dSrc2c DN 18.U8
+dSrc3c DN 20.U8
+dSrc0d DN 15.U8
+dSrc1d DN 17.U8
+dSrc2d DN 19.U8
+dSrc3d DN 21.U8
+dAccH0 DN 22.U8
+dAccH1 DN 24.U8
+dAccH2 DN 26.U8
+dAccH3 DN 28.U8
+dResultH0 DN 22.U32
+dResultH1 DN 24.U32
+dResultH2 DN 26.U32
+dResultH3 DN 28.U32
+
+;// Registers used for implementing Vertical interpolation (dResultVn = dAccVn viewed as .U32)
+dSrc0 DN 9.U8
+dSrc1 DN 10.U8
+dSrc2 DN 11.U8
+dSrc3 DN 12.U8
+dSrc4 DN 13.U8
+dAccV0 DN 0.U8
+dAccV1 DN 2.U8
+dAccV2 DN 4.U8
+dAccV3 DN 6.U8
+dResultV0 DN 0.U32
+dResultV1 DN 2.U32
+dResultV2 DN 4.U32
+dResultV3 DN 6.U32
+
+;// Registers used for implementing Diagonal interpolation (dTResn = dTAccn viewed as .32)
+dTAcc0 DN 0.U8
+dTAcc1 DN 2.U8
+dTAcc2 DN 4.U8
+dTAcc3 DN 6.U8
+dTRes0 DN 0.32
+dTRes1 DN 2.32
+dTRes2 DN 4.32
+dTRes3 DN 6.32
+dTResult0 DN 14.U8
+dTResult1 DN 16.U8
+dTResult2 DN 18.U8
+dTResult3 DN 20.U8
+dTempP0 DN 18.S16
+dTempP1 DN 19.S16
+dTempQ0 DN 20.S16
+dTempQ1 DN 21.S16
+dTempR0 DN 22.S16
+dTempR1 DN 23.S16
+dTempS0 DN 24.S16
+dTempS1 DN 25.S16
+qTempP01 QN 9.S16 ;// = {dTempP0, dTempP1}
+qTempQ01 QN 10.S16 ;// = {dTempQ0, dTempQ1}
+qTempR01 QN 11.S16 ;// = {dTempR0, dTempR1}
+qTempS01 QN 12.S16 ;// = {dTempS0, dTempS1}
+
+;// Intermediate values for averaging
+qRes2 QN 7.S16
+qRes3 QN 8.S16
+qRes4 QN 9.S16
+qRes5 QN 10.S16
+qRes6 QN 11.S16
+
+;// For implementing copy (dDstn = dSrcn viewed as .32)
+dDst0 DN 9.32
+dDst1 DN 10.32
+dDst2 DN 11.32
+dDst3 DN 12.32
+
+ ;// Define stack arguments
+ M_ARG ptridx, 4
+ M_ARG ptridy, 4
+ M_ARG ptrWidth, 4
+ M_ARG ptrHeight, 4
+
+ ;// Load dx, dy and the roi width/height from the stack
+ M_LDR idx, ptridx
+ M_LDR idy, ptridy
+ M_LDR iWidth, ptrWidth
+ M_LDR iHeight, ptrHeight
+
+ ADD index, idx, idy, LSL #2 ;// [index] = [idy][idx]
+ M_ADR pArgs, ppArgs
+
+ ;// Move coefficients 20 and 5 to Neon registers (not read in this file; presumably for the helpers)
+ VMOV dCoeff20, #20
+ VMOV dCoeff5, #5
+
+Block4x4WidthLoop
+Block4x4HeightLoop
+
+ STM pArgs, {pSrc,srcStep,pDst,dstStep} ;// Save the tile arguments; reloaded at Block4x4LoopEnd
+
+ ;// switch table using motion vector as index (pc reads as this instruction + 8 in ARM state)
+ ADD pc, pc, index, LSL #2
+ B Case_f ;// Filler slot: not reached for index in [0,15] because of the pc + 8 offset
+ B Case_0
+ B Case_1
+ B Case_2
+ B Case_3
+ B Case_4
+ B Case_5
+ B Case_6
+ B Case_7
+ B Case_8
+ B Case_9
+ B Case_a
+ B Case_b
+ B Case_c
+ B Case_d
+ B Case_e
+ B Case_f
+
+Case_0
+ ;// Case G: dx=0, dy=0 - integer position, plain 4x4 copy
+ M_PRINTF "Case 0 \n"
+
+ ;// Loads a 4x4 block of .8 and stores as .32
+ ADD Temp, pSrc, srcStep, LSL #1 ;// Temp -> source row 2
+ VLD1 dSrc0, [pSrc], srcStep
+ VLD1 dSrc2, [Temp], srcStep
+ VLD1 dSrc1, [pSrc]
+ VLD1 dSrc3, [Temp]
+
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dDst0[0], [pDst], dstStep ;// Lane 0 of the .32 view = first 4 bytes of the row
+ VST1 dDst2[0], [Temp], dstStep
+ VST1 dDst1[0], [pDst]
+ VST1 dDst3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_1
+ ;// Case a: dx=1, dy=0 - rounding average of the HalfHor result with dSrcNc
+ M_PRINTF "Case 1 \n"
+
+ SUB pSrc, pSrc, #2 ;// Helper is handed pSrc - 2 (presumably the left margin of its filter taps)
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD dAccH0, dAccH0, dSrc0c ;// dSrcNc is left by the helper; presumably the integer-pel samples - TODO confirm
+ VRHADD dAccH2, dAccH2, dSrc2c
+ VRHADD dAccH1, dAccH1, dSrc1c
+ VRHADD dAccH3, dAccH3, dSrc3c
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultH0[0], [pDst], dstStep
+ VST1 dResultH2[0], [Temp], dstStep
+ VST1 dResultH1[0], [pDst]
+ VST1 dResultH3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_2
+ ;// Case b: dx=2, dy=0 - HalfHor result stored unchanged
+ M_PRINTF "Case 2 \n"
+
+ SUB pSrc, pSrc, #2 ;// Helper is handed pSrc - 2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultH0[0], [pDst], dstStep
+ VST1 dResultH2[0], [Temp], dstStep
+ VST1 dResultH1[0], [pDst]
+ VST1 dResultH3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_3
+ ;// Case c: dx=3, dy=0 - rounding average of the HalfHor result with dSrcNd
+ M_PRINTF "Case 3 \n"
+
+ SUB pSrc, pSrc, #2 ;// Helper is handed pSrc - 2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD dAccH0, dAccH0, dSrc0d ;// dSrcNd is left by the helper; presumably the samples one pixel right of Case a's - TODO confirm
+ VRHADD dAccH2, dAccH2, dSrc2d
+ VRHADD dAccH1, dAccH1, dSrc1d
+ VRHADD dAccH3, dAccH3, dSrc3d
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultH0[0], [pDst], dstStep
+ VST1 dResultH2[0], [Temp], dstStep
+ VST1 dResultH1[0], [pDst]
+ VST1 dResultH3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_4
+ ;// Case d: dx=0, dy=1 - rounding average of the HalfVer result with dSrc0..dSrc3
+ M_PRINTF "Case 4 \n"
+
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Helper is handed the row two lines above the block
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ VRHADD dAccV0, dAccV0, dSrc0 ;// dSrcN is left by the helper; presumably the integer-pel rows - TODO confirm
+ VRHADD dAccV2, dAccV2, dSrc2
+ VRHADD dAccV1, dAccV1, dSrc1
+ VRHADD dAccV3, dAccV3, dSrc3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultV0[0], [pDst], dstStep
+ VST1 dResultV2[0], [Temp], dstStep
+ VST1 dResultV1[0], [pDst]
+ VST1 dResultV3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_5
+ ;// Case e: dx=1, dy=1 - rounding average of the HalfVer and HalfHor results
+ M_PRINTF "Case 5 \n"
+
+ MOV pSrcBK, pSrc ;// Keep the block origin for the second helper call
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Vertical helper: two rows above the block
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ SUB pSrc, pSrcBK, #2 ;// Horizontal helper: block origin - 2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD dAccH0, dAccH0, dAccV0
+ VRHADD dAccH2, dAccH2, dAccV2
+ VRHADD dAccH1, dAccH1, dAccV1
+ VRHADD dAccH3, dAccH3, dAccV3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultH0[0], [pDst], dstStep
+ VST1 dResultH2[0], [Temp], dstStep
+ VST1 dResultH1[0], [pDst]
+ VST1 dResultH3[0], [Temp]
+
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_6
+ ;// Case f: dx=2, dy=1 - rounding average of the HalfDiagHorVer result with narrowed qRes2..qRes5
+ M_PRINTF "Case 6 \n"
+
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Two rows up ...
+ SUB pSrc, pSrc, #2 ;// ... and two pixels left of the block origin
+ BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ VQRSHRUN dTResult0, qRes2, #5 ;// Saturating (x + 16) >> 5 to unsigned bytes; qResN is left by the helper
+ VQRSHRUN dTResult1, qRes3, #5
+ VQRSHRUN dTResult2, qRes4, #5
+ VQRSHRUN dTResult3, qRes5, #5
+ VRHADD dTAcc0, dTAcc0, dTResult0
+ VRHADD dTAcc2, dTAcc2, dTResult2
+ VRHADD dTAcc1, dTAcc1, dTResult1
+ VRHADD dTAcc3, dTAcc3, dTResult3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dTRes0[0], [pDst], dstStep
+ VST1 dTRes2[0], [Temp], dstStep
+ VST1 dTRes1[0], [pDst]
+ VST1 dTRes3[0], [Temp]
+
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_7
+ ;// Case g: dx=3, dy=1 - rounding average of HalfVer (taken one column right) and HalfHor
+ M_PRINTF "Case 7 \n"
+ MOV pSrcBK, pSrc ;// Keep the block origin for the second helper call
+ ADD pSrc, pSrc, #1 ;// Vertical helper works one column to the right ...
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// ... starting two rows above the block
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ SUB pSrc, pSrcBK, #2 ;// Horizontal helper: block origin - 2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD dAccH0, dAccH0, dAccV0
+ VRHADD dAccH2, dAccH2, dAccV2
+ VRHADD dAccH1, dAccH1, dAccV1
+ VRHADD dAccH3, dAccH3, dAccV3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultH0[0], [pDst], dstStep
+ VST1 dResultH2[0], [Temp], dstStep
+ VST1 dResultH1[0], [pDst]
+ VST1 dResultH3[0], [Temp]
+
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_8
+ ;// Case h: dx=0, dy=2 - HalfVer result stored unchanged
+ M_PRINTF "Case 8 \n"
+
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Helper is handed the row two lines above the block
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultV0[0], [pDst], dstStep
+ VST1 dResultV2[0], [Temp], dstStep
+ VST1 dResultV1[0], [pDst]
+ VST1 dResultV3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_9
+ ;// Case i: dx=1, dy=2 - rounding average of the HalfDiagVerHor result with narrowed dTemp rows
+ M_PRINTF "Case 9 \n"
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Two rows up ...
+ SUB pSrc, pSrc, #2 ;// ... and two pixels left of the block origin
+ BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+ VEXT dTempP0, dTempP0, dTempP1, #2 ;// Drop the first 2 16-bit elements so elements 2..5 fill the low D register
+ VEXT dTempQ0, dTempQ0, dTempQ1, #2
+ VEXT dTempR0, dTempR0, dTempR1, #2
+ VEXT dTempS0, dTempS0, dTempS1, #2
+
+ VQRSHRUN dTResult0, qTempP01, #5 ;// Saturating (x + 16) >> 5 to unsigned bytes
+ VQRSHRUN dTResult1, qTempQ01, #5
+ VQRSHRUN dTResult2, qTempR01, #5
+ VQRSHRUN dTResult3, qTempS01, #5
+
+ VRHADD dTAcc0, dTAcc0, dTResult0
+ VRHADD dTAcc2, dTAcc2, dTResult2
+ VRHADD dTAcc1, dTAcc1, dTResult1
+ VRHADD dTAcc3, dTAcc3, dTResult3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dTRes0[0], [pDst], dstStep
+ VST1 dTRes2[0], [Temp], dstStep
+ VST1 dTRes1[0], [pDst]
+ VST1 dTRes3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_a
+ ;// Case j: dx=2, dy=2 - HalfDiagHorVer result stored unchanged
+ M_PRINTF "Case a \n"
+
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Two rows up ...
+ SUB pSrc, pSrc, #2 ;// ... and two pixels left of the block origin
+ BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dTRes0[0], [pDst], dstStep
+ VST1 dTRes2[0], [Temp], dstStep
+ VST1 dTRes1[0], [pDst]
+ VST1 dTRes3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_b
+ ;// Case k: dx=3, dy=2 - as Case i, but the dTemp rows are taken one element further right
+ M_PRINTF "Case b \n"
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Two rows up ...
+ SUB pSrc, pSrc, #2 ;// ... and two pixels left of the block origin
+ BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+ VEXT dTempP0, dTempP0, dTempP1, #3 ;// Drop the first 3 16-bit elements so elements 3..6 fill the low D register
+ VEXT dTempQ0, dTempQ0, dTempQ1, #3
+ VEXT dTempR0, dTempR0, dTempR1, #3
+ VEXT dTempS0, dTempS0, dTempS1, #3
+
+ VQRSHRUN dTResult0, qTempP01, #5 ;// Saturating (x + 16) >> 5 to unsigned bytes
+ VQRSHRUN dTResult1, qTempQ01, #5
+ VQRSHRUN dTResult2, qTempR01, #5
+ VQRSHRUN dTResult3, qTempS01, #5
+
+ VRHADD dTAcc0, dTAcc0, dTResult0
+ VRHADD dTAcc2, dTAcc2, dTResult2
+ VRHADD dTAcc1, dTAcc1, dTResult1
+ VRHADD dTAcc3, dTAcc3, dTResult3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dTRes0[0], [pDst], dstStep
+ VST1 dTRes2[0], [Temp], dstStep
+ VST1 dTRes1[0], [pDst]
+ VST1 dTRes3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_c
+ ;// Case n: dx=0, dy=3 - rounding average of the HalfVer result with dSrc1..dSrc4
+ M_PRINTF "Case c \n"
+
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Helper is handed the row two lines above the block
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ VRHADD dAccV0, dAccV0, dSrc1 ;// One source register later than Case d; presumably the rows one line lower - TODO confirm
+ VRHADD dAccV2, dAccV2, dSrc3
+ VRHADD dAccV1, dAccV1, dSrc2
+ VRHADD dAccV3, dAccV3, dSrc4
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultV0[0], [pDst], dstStep
+ VST1 dResultV2[0], [Temp], dstStep
+ VST1 dResultV1[0], [pDst]
+ VST1 dResultV3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_d
+ ;// Case p: dx=1, dy=3 - rounding average of HalfVer and HalfHor (taken one row lower)
+ M_PRINTF "Case d \n"
+
+ MOV pSrcBK, pSrc ;// Keep the block origin for the second helper call
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Vertical helper: two rows above the block
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ ADD pSrc, pSrcBK, srcStep ;// Horizontal helper works one row below the origin ...
+ SUB pSrc, pSrc, #2 ;// ... starting two pixels to the left
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD dAccH0, dAccH0, dAccV0
+ VRHADD dAccH2, dAccH2, dAccV2
+ VRHADD dAccH1, dAccH1, dAccV1
+ VRHADD dAccH3, dAccH3, dAccV3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultH0[0], [pDst], dstStep
+ VST1 dResultH2[0], [Temp], dstStep
+ VST1 dResultH1[0], [pDst]
+ VST1 dResultH3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_e
+ ;// Case q: dx=2, dy=3 - as Case f, but narrowing qRes3..qRes6 (one register later)
+ M_PRINTF "Case e \n"
+
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// Two rows up ...
+ SUB pSrc, pSrc, #2 ;// ... and two pixels left of the block origin
+ BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ VQRSHRUN dTResult0, qRes3, #5 ;// Saturating (x + 16) >> 5 to unsigned bytes; qResN is left by the helper
+ VQRSHRUN dTResult1, qRes4, #5
+ VQRSHRUN dTResult2, qRes5, #5
+ VQRSHRUN dTResult3, qRes6, #5
+
+ VRHADD dTAcc0, dTAcc0, dTResult0
+ VRHADD dTAcc2, dTAcc2, dTResult2
+ VRHADD dTAcc1, dTAcc1, dTResult1
+ VRHADD dTAcc3, dTAcc3, dTResult3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dTRes0[0], [pDst], dstStep
+ VST1 dTRes2[0], [Temp], dstStep
+ VST1 dTRes1[0], [pDst]
+ VST1 dTRes3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Point pArgs at the saved arguments for the loop tail
+ B Block4x4LoopEnd
+Case_f
+ ;// Case r: dx=3, dy=3 - HalfVer one column right averaged with HalfHor one row lower
+ M_PRINTF "Case f \n"
+ MOV pSrcBK, pSrc ;// Keep the block origin for the second helper call
+ ADD pSrc, pSrc, #1 ;// Vertical helper works one column to the right ...
+ SUB pSrc, pSrc, srcStep, LSL #1 ;// ... starting two rows above the block
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ ADD pSrc, pSrcBK, srcStep ;// Horizontal helper works one row below the origin ...
+ SUB pSrc, pSrc, #2 ;// ... starting two pixels to the left
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD dAccH0, dAccH0, dAccV0
+ VRHADD dAccH2, dAccH2, dAccV2
+ VRHADD dAccH1, dAccH1, dAccV1
+ VRHADD dAccH3, dAccH3, dAccV3
+ ADD Temp, pDst, dstStep, LSL #1 ;// Temp -> destination row 2
+ VST1 dResultH0[0], [pDst], dstStep
+ VST1 dResultH2[0], [Temp], dstStep
+ VST1 dResultH1[0], [pDst]
+ VST1 dResultH3[0], [Temp]
+ M_ADR pArgs, ppArgs ;// Last case: falls through into Block4x4LoopEnd
+
+
+Block4x4LoopEnd
+
+ ;// Width Loop: restore the tile arguments and step 4 pixels to the right
+ ;// (pArgs was already re-derived with M_ADR at the end of every Case_x)
+ LDM pArgs, {pSrc,srcStep,pDst,dstStep} ;// Load arguments
+ SUBS iWidth, iWidth, #4
+ ADD pSrc, pSrc, #4
+ ADD pDst, pDst, #4
+ BGT Block4x4WidthLoop
+
+ ;// Height Loop: flags from SUBS are consumed by the BGT below (assumes M_LDR/M_ADR leave them alone)
+ SUBS iHeight, iHeight, #4
+ M_LDR iWidth, ptrWidth ;// Reload the full width for the next band of tiles
+ M_ADR pArgs, ppArgs
+ ADD pSrc, pSrc, srcStep, LSL #2 ;// Down 4 rows
+ ADD pDst, pDst, dstStep, LSL #2
+ SUB pSrc, pSrc, iWidth ;// Back to the left edge of the region
+ SUB pDst, pDst, iWidth
+ BGT Block4x4HeightLoop
+
+EndOfInterpolation
+ MOV r0, #0 ;// Return 0 (presumably OMX_Sts_NoErr, as the header comment states)
+ M_END
+
+ ENDIF
+ ;// End of CortexA8
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
new file mode 100755
index 0000000..3a60705
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntraChroma_8x8_s.s
@@ -0,0 +1,436 @@
+;//
+;//
+;// File Name: omxVCM4P10_PredictIntraChroma_8x8_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ EXPORT armVCM4P10_pIndexTable8x8
+
+;// Define the processor variants supported by this file
+
+ M_VARIANTS CortexA8
+
+ AREA table, DATA
+;//-------------------------------------------------------
+;// This table implements a C-style switch in asm by the method of
+;// two levels of indexing: predMode selects an entry, which is loaded into pc.
+;//-------------------------------------------------------
+
+ M_TABLE armVCM4P10_pIndexTable8x8
+ DCD OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR
+ DCD OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE
+
+ M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
+ DCW 3, 2, 1,4 ;// Weights for the edge differences in the plane case, in the lane order built by VEXT
+ DCW -3,-2,-1,0 ;// Position offsets for positions 0..3 (low half of qMultiplier)
+ DCW 1, 2, 3,4 ;// Position offsets for positions 4..7 (high half of qMultiplier)
+
+
+
+ IF CortexA8
+
+;//--------------------------------------------
+;// Scratch variable
+;//--------------------------------------------
+
+pc RN 15
+return RN 0 ;// Shares r0 with pSrcLeft
+pTable RN 8
+
+;//--------------------------------------------
+;// Input Arguments
+;//--------------------------------------------
+pSrcLeft RN 0 ;// input pointer
+pSrcAbove RN 1 ;// input pointer
+pSrcAboveLeft RN 2 ;// input pointer
+pDst RN 3 ;// output pointer
+leftStep RN 4 ;// input variable
+dstStep RN 5 ;// input variable
+predMode RN 6 ;// input variable
+availability RN 7 ;// input variable
+pMultiplierTable RN 2 ;// Shares r2 with pSrcAboveLeft
+
+pTmp RN 9 ;// Second row pointer (base + step) for interleaved row access
+step RN 10 ;// Twice the row step, used by both row pointers
+
+;//---------------------
+;// Neon Registers
+;//---------------------
+
+;// OMX_VC_CHROMA_HOR
+
+dLeftVal0 DN D0.8
+dLeftVal1 DN D1.8
+dLeftVal2 DN D2.8
+dLeftVal3 DN D3.8
+dLeftVal4 DN D4.8
+dLeftVal5 DN D5.8
+dLeftVal6 DN D6.8
+dLeftVal7 DN D7.8
+
+;// OMX_VC_CHROMA_VERT
+
+dAboveVal DN D0.U8
+
+;// OMX_VC_CHROMA_DC
+
+dLeftVal DN D1.U8
+dSumAboveValU16 DN D2.U16
+dSumAboveValU32 DN D3.U32
+dSumAboveValU8 DN D3.U8
+dSumLeftValU16 DN D2.U16
+dSumLeftValU32 DN D1.U32
+dSumLeftValU8 DN D1.U8
+dSumAboveLeft DN D2.U32
+dSumAboveLeftU8 DN D2.U8
+dIndexRow0U8 DN D5.U8
+dIndexRow0 DN D5.U64
+dIndexRow4U8 DN D6.U8
+dIndexRow4 DN D6.U64
+dDstRow0 DN D0.U8
+dDstRow4 DN D4.U8
+dConst128U8 DN D0.U8 ;// Same register as dDstRow0
+
+;// OMX_VC_CHROMA_PLANE
+
+dRevAboveVal DN D3.U8
+dRevAboveValU64 DN D3.U64
+dAboveLeftVal DN D2.U8
+qAbove7minus0 QN Q3.S16
+qAboveDiff QN Q2.S16
+dIndex DN D8.U8
+dDiffAboveU8 DN D9.U8
+dDiffAboveS16 DN D9.S16
+dAboveDiff0U8 DN D4.U8
+dAboveDiff0U64 DN D4.U64
+dAbove7minus0U8 DN D6.U8
+dMultiplier DN D10.S16
+dHorPred DN D11.S16
+dRevLeftVal DN D3.U8
+dRevLeftValU64 DN D3.U64
+qLeft7minus0 QN Q7.S16
+qLeftDiff QN Q6.S16
+dDiffLeftU8 DN D16.U8
+dDiffLeftS16 DN D16.S16
+dLeftDiff0U8 DN D12.U8
+dLeftDiff0U64 DN D12.U64
+dLeft7minus0U8 DN D14.U8
+dVerPred DN D3.S16
+dHVValS16 DN D3.S16
+dHVValS32 DN D3.S32
+dHVTempS32 DN D2.S32
+qA QN Q0.S16
+qB QN Q2.S16
+qC QN Q3.S16
+qMultiplier QN Q5.S16 ;// = {dMultiplier0, dMultiplier1}
+dMultiplier0 DN D10.S16
+dMultiplier1 DN D11.S16
+qC0 QN Q0.S16
+qC1 QN Q1.S16
+qC2 QN Q4.S16
+qC3 QN Q5.S16
+qC4 QN Q6.S16
+qC5 QN Q7.S16
+qC6 QN Q8.S16
+qC7 QN Q9.S16
+qSum0 QN Q0.S16
+qSum1 QN Q1.S16
+qSum2 QN Q4.S16
+qSum3 QN Q5.S16
+qSum4 QN Q6.S16
+qSum5 QN Q7.S16
+qSum6 QN Q8.S16
+qSum7 QN Q9.S16
+dSum0 DN D0.U8
+dSum1 DN D1.U8
+dSum2 DN D2.U8
+dSum3 DN D3.U8
+dSum4 DN D4.U8
+dSum5 DN D5.U8
+dSum6 DN D6.U8
+dSum7 DN D7.U8
+
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntraChroma_8x8 starts: 8x8 intra chroma prediction (DC / HOR / VERT / PLANE)
+;//-----------------------------------------------------------------------------------------------
+
+ ;// Write function header
+ M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15
+
+ ;// Define stack arguments
+ M_ARG LeftStep, 4
+ M_ARG DstStep, 4
+ M_ARG PredMode, 4
+ M_ARG Availability, 4
+
+ LDR pTable,=armVCM4P10_pIndexTable8x8 ;// Load index table for switch case
+
+ ;// Load argument from the stack
+ M_LDR predMode, PredMode ;// Arg predMode loaded from stack to reg
+ M_LDR leftStep, LeftStep ;// Arg leftStep loaded from stack to reg
+ M_LDR dstStep, DstStep ;// Arg dstStep loaded from stack to reg
+ M_LDR availability, Availability ;// Arg availability loaded from stack to reg
+
+
+ LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case based on predMode. NOTE(review): predMode is not range-checked here
+
+OMX_VC_CHROMA_DC ;// DC prediction: the variant is chosen from the availability flags
+
+ TST availability, #OMX_VC_LEFT
+ BEQ DCChroma8x8LeftNotAvailable
+
+ ADD pTmp, pSrcLeft, leftStep ;// pTmp walks the odd rows, pSrcLeft the even rows
+ ADD step, leftStep, leftStep
+
+ ;// Load Left Edge
+ VLD1 {dLeftVal[0]},[pSrcLeft],step ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeftVal[1]},[pTmp],step ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeftVal[2]},[pSrcLeft],step ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeftVal[3]},[pTmp],step ;// pSrcLeft[3*leftStep]
+ VLD1 {dLeftVal[4]},[pSrcLeft],step ;// pSrcLeft[4*leftStep]
+ VLD1 {dLeftVal[5]},[pTmp],step ;// pSrcLeft[5*leftStep]
+ VLD1 {dLeftVal[6]},[pSrcLeft],step ;// pSrcLeft[6*leftStep]
+ VLD1 {dLeftVal[7]},[pTmp] ;// pSrcLeft[7*leftStep]
+
+ TST availability, #OMX_VC_UPPER
+ BEQ DCChroma8x8LeftOnlyAvailable
+
+ ;// Load Upper Edge also
+ VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[0 to 7]
+
+ MOV return, #OMX_Sts_NoErr ;// returnNoError
+
+ VPADDL dSumAboveValU16, dAboveVal ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
+ VPADDL dSumAboveValU32, dSumAboveValU16 ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ]
+
+ VPADDL dSumLeftValU16, dLeftVal ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
+ VPADDL dSumLeftValU32, dSumLeftValU16 ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ]
+
+ VADD dSumAboveLeft,dSumAboveValU32,dSumLeftValU32 ;// [ above4-7 + left4-7 | above0-3 + left0-3 ]
+ VRSHR dSumAboveLeft,dSumAboveLeft,#3 ;// Sum = (Sum + 4) >> 3
+ VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
+ VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2
+
+ VMOV dIndexRow0U8,#0x0c
+ VMOV dIndexRow4U8,#0x04
+ VSHL dIndexRow0,dIndexRow0,#32 ;// index0 = 0x0c0c0c0c00000000
+ VSHR dIndexRow4,dIndexRow4,#32 ;// index4 = 0x0000000004040404
+ VADD dIndexRow4U8,dIndexRow4U8,dIndexRow0U8 ;// index4 = 0x0c0c0c0c04040404
+ VTBL dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8 ;// Rows 0-3: x 0-3 = above+left DC (first half), x 4-7 = above-only DC (second half)
+ VTBL dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8 ;// Rows 4-7: x 0-3 = left-only DC (second half), x 4-7 = above+left DC (second half)
+
+DCChroma8x8LeftStore
+ ADD pTmp, pDst, dstStep ;// pTmp writes the odd rows, pDst the even rows
+ ADD step, dstStep, dstStep
+
+ VST1 dDstRow0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7
+ VST1 dDstRow4,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7
+ VST1 dDstRow4,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7
+ VST1 dDstRow4,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7
+ VST1 dDstRow4,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7
+
+ M_EXIT
+
+
+DCChroma8x8LeftOnlyAvailable
+
+ MOV return, #OMX_Sts_NoErr
+
+ VPADDL dSumLeftValU16, dLeftVal ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
+ VPADDL dSumLeftValU32, dSumLeftValU16 ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ]
+ VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2
+
+ VDUP dDstRow0,dSumLeftValU8[0] ;// Rows 0-3: DC of left rows 0-3 in every column
+ VDUP dDstRow4,dSumLeftValU8[4] ;// Rows 4-7: DC of left rows 4-7 in every column
+
+ B DCChroma8x8LeftStore
+
+
+DCChroma8x8LeftNotAvailable
+
+ TST availability, #OMX_VC_UPPER
+ BEQ DCChroma8x8NoneAvailable
+
+ ;// Load Upper Edge
+ VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[0 to 7]
+ MOV return, #OMX_Sts_NoErr ;// returnNoError
+
+ VPADDL dSumAboveValU16, dAboveVal ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
+ VPADDL dSumAboveValU32, dSumAboveValU16 ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ]
+ VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
+ VMOV dIndexRow0U8,#0x04
+ VSHL dIndexRow0,dIndexRow0,#32 ;// index = 0x0404040400000000
+ VTBL dDstRow0,{dSumAboveValU8},dIndexRow0U8 ;// x 0-3 = DC of above 0-3, x 4-7 = DC of above 4-7
+
+ B DCChroma8x8UpperStore
+
+
+DCChroma8x8NoneAvailable
+
+ VMOV dConst128U8,#0x80 ;// Neither edge available: fill with 0x80 (dConst128U8 is dDstRow0)
+ MOV return, #OMX_Sts_NoErr ;// returnNoError
+
+DCChroma8x8UpperStore
+
+ ADD pTmp, pDst, dstStep ;// pTmp writes the odd rows, pDst the even rows
+ ADD step, dstStep, dstStep
+
+ VST1 dDstRow0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7
+ VST1 dDstRow0,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7
+
+ M_EXIT
+
+
+OMX_VC_CHROMA_VERT ;// Vertical prediction: every row is a copy of the row above the block
+
+ VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[x] :0<= x <= 7
+ MOV return, #OMX_Sts_NoErr
+
+ B DCChroma8x8UpperStore ;// dAboveVal is D0, i.e. dDstRow0, which that store writes to all 8 rows
+
+
+OMX_VC_CHROMA_HOR ;// Horizontal prediction: each row is filled with its left neighbour
+
+ ADD pTmp, pSrcLeft, leftStep ;// pTmp walks the odd rows, pSrcLeft the even rows
+ ADD step, leftStep, leftStep
+
+ VLD1 {dLeftVal0[]},[pSrcLeft],step ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeftVal1[]},[pTmp],step ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeftVal2[]},[pSrcLeft],step ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeftVal3[]},[pTmp],step ;// pSrcLeft[3*leftStep]
+ VLD1 {dLeftVal4[]},[pSrcLeft],step ;// pSrcLeft[4*leftStep]
+ VLD1 {dLeftVal5[]},[pTmp],step ;// pSrcLeft[5*leftStep]
+ VLD1 {dLeftVal6[]},[pSrcLeft],step ;// pSrcLeft[6*leftStep]
+ VLD1 {dLeftVal7[]},[pTmp] ;// pSrcLeft[7*leftStep]
+
+ B DCChroma8x8PlaneStore ;// dLeftVal0-7 are D0-D7, the registers that store writes out as dSum0-7
+
+
+OMX_VC_CHROMA_PLANE ;// Plane prediction: pixel(x,y) = clip((a + b*(x-3) + c*(y-3) + 16) >> 5)
+ ADD pTmp, pSrcLeft, leftStep ;// pTmp walks the odd rows, pSrcLeft the even rows
+ ADD step, leftStep, leftStep
+
+ VLD1 dAboveVal,[pSrcAbove] ;// pSrcAbove[x] :0<= x <= 7
+ VLD1 dAboveLeftVal[0],[pSrcAboveLeft] ;// pSrcAboveLeft[0] into lane 0
+
+ VLD1 {dLeftVal[0]},[pSrcLeft],step ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeftVal[1]},[pTmp],step ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeftVal[2]},[pSrcLeft],step ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeftVal[3]},[pTmp],step ;// pSrcLeft[3*leftStep]
+ VLD1 {dLeftVal[4]},[pSrcLeft],step ;// pSrcLeft[4*leftStep]
+ VLD1 {dLeftVal[5]},[pTmp],step ;// pSrcLeft[5*leftStep]
+ VLD1 {dLeftVal[6]},[pSrcLeft],step ;// pSrcLeft[6*leftStep]
+ VLD1 {dLeftVal[7]},[pTmp] ;// pSrcLeft[7*leftStep]
+
+
+ VREV64 dRevAboveVal,dAboveVal ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
+ VSUBL qAbove7minus0,dRevAboveVal,dAboveLeftVal ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
+ VSHR dRevAboveValU64,dRevAboveValU64,#8 ;// pSrcAbove[X:0:1:2:3:4:5:6]
+ VSUBL qAboveDiff,dRevAboveVal,dAboveVal ;// pSrcAbove[6] - pSrcAbove[0]
+ ;// pSrcAbove[5] - pSrcAbove[1]
+ ;// pSrcAbove[4] - pSrcAbove[2]
+
+ VREV64 dRevLeftVal,dLeftVal ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
+ VSUBL qLeft7minus0,dRevLeftVal,dAboveLeftVal ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
+ VSHR dRevLeftValU64,dRevLeftValU64,#8 ;// pSrcLeft[X:0:1:2:3:4:5:6]
+ VSUBL qLeftDiff,dRevLeftVal,dLeftVal ;// pSrcLeft[6] - pSrcLeft[0]
+ ;// pSrcLeft[5] - pSrcLeft[1]
+ ;// pSrcLeft[4] - pSrcLeft[2]
+
+ LDR pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8 ;// Used to calculate Hval & Vval
+ VSHL dAboveDiff0U64,dAboveDiff0U64,#16 ;// Move the three differences up one 16-bit lane
+ VEXT dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2 ;// pSrcAbove[ 7-AL | 4-2 | 5-1 | 6-0 ], AL = pSrcAboveLeft[0]
+ VLD1 dMultiplier,[pMultiplierTable]! ;// First table row; the pointer advances to the second row
+ VSHL dLeftDiff0U64,dLeftDiff0U64,#16 ;// Move the three differences up one 16-bit lane
+ VEXT dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2 ;// pSrcLeft[ 7-AL | 4-2 | 5-1 | 6-0 ], AL = pSrcAboveLeft[0]
+
+
+ VMUL dHorPred,dDiffAboveS16,dMultiplier ;// pSrcAbove[ 4*(7-0) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
+ VMUL dVerPred,dDiffLeftS16,dMultiplier ;// Same weighting applied to the left-edge differences
+ VPADD dHVValS16,dHorPred,dVerPred ;// Pairwise sums: two partial H in the low half, two partial V in the high half
+
+
+ VPADDL dHVValS32,dHVValS16 ;// [V|H] in 32 bits each
+ VSHL dHVTempS32,dHVValS32,#4 ;// 17*H = 16*H + H = (H<<4)+H
+ VADD dHVValS32,dHVValS32,dHVTempS32 ;// [ 17*V | 17*H ]in 32 bits each
+ VLD1 {dMultiplier0,dMultiplier1},[pMultiplierTable] ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
+ VRSHR dHVValS32,dHVValS32,#5 ;// [c|b] in 32-bit lanes; read below as S16 lanes 2 and 0
+ VADDL qA,dAboveVal,dLeftVal ;// Lane 7 = pSrcAbove[7] + pSrcLeft[7*leftStep]
+ VDUP qA,qA[7]
+ VSHL qA,qA,#4 ;// [a|a|a|a|a|a|a|a]
+ VDUP qB,dHVValS16[0] ;// [b|b|b|b|b|b|b|b]
+ VDUP qC,dHVValS16[2] ;// [c|c|c|c|c|c|c|c]
+
+
+ VMUL qB,qB,qMultiplier ;// b*(x-3) for x = 0..7
+ VMUL qC,qC,qMultiplier ;// c*(y-3) for y = 0..7
+ VADD qB,qB,qA ;// a + b*(x-3): the part shared by every row
+
+ VDUP qC0,qC[0] ;// Broadcast c*(y-3) of row y across the row (qC0 reuses Q0; qA is no longer needed)
+ VDUP qC1,qC[1]
+ VDUP qC2,qC[2]
+ VDUP qC3,qC[3]
+ VDUP qC4,qC[4]
+ VDUP qC5,qC[5]
+ VDUP qC6,qC[6]
+ VDUP qC7,qC[7]
+
+ VADD qSum0,qB,qC0 ;// Row y = a + b*(x-3) + c*(y-3)
+ VADD qSum1,qB,qC1
+ VADD qSum2,qB,qC2
+ VADD qSum3,qB,qC3
+ VADD qSum4,qB,qC4
+ VADD qSum5,qB,qC5
+ VADD qSum6,qB,qC6
+ VADD qSum7,qB,qC7
+
+ VQRSHRUN dSum0,qSum0,#5 ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
+ VQRSHRUN dSum1,qSum1,#5
+ VQRSHRUN dSum2,qSum2,#5
+ VQRSHRUN dSum3,qSum3,#5
+ VQRSHRUN dSum4,qSum4,#5
+ VQRSHRUN dSum5,qSum5,#5
+ VQRSHRUN dSum6,qSum6,#5
+ VQRSHRUN dSum7,qSum7,#5
+
+DCChroma8x8PlaneStore
+ ADD pTmp, pDst, dstStep ;// pTmp writes the odd rows, pDst the even rows
+ ADD step, dstStep, dstStep
+
+ VST1 dSum0,[pDst],step ;// pDst[0*dstStep+x] :0<= x <= 7
+ VST1 dSum1,[pTmp],step ;// pDst[1*dstStep+x] :0<= x <= 7
+ VST1 dSum2,[pDst],step ;// pDst[2*dstStep+x] :0<= x <= 7
+ VST1 dSum3,[pTmp],step ;// pDst[3*dstStep+x] :0<= x <= 7
+ VST1 dSum4,[pDst],step ;// pDst[4*dstStep+x] :0<= x <= 7
+ VST1 dSum5,[pTmp],step ;// pDst[5*dstStep+x] :0<= x <= 7
+ VST1 dSum6,[pDst],step ;// pDst[6*dstStep+x] :0<= x <= 7
+ VST1 dSum7,[pTmp] ;// pDst[7*dstStep+x] :0<= x <= 7
+
+ MOV return, #OMX_Sts_NoErr
+ M_END
+
+ ENDIF ;// CortexA8
+
+ END
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntraChroma_8x8 ends
+;//-----------------------------------------------------------------------------------------------
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
new file mode 100755
index 0000000..e9c0eee
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
@@ -0,0 +1,424 @@
+;//
+;//
+;// File Name: omxVCM4P10_PredictIntra_16x16_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+
+;//-------------------------------------------------------
+;// Jump table implementing a C-style switch on predMode.
+;// Each word is the address of the label that handles one
+;// prediction mode; the entry point indexes this table with
+;// predMode and loads the selected address straight into pc.
+;//-------------------------------------------------------
+
+ M_TABLE armVCM4P10_pIndexTable16x16
+ DCD OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
+ DCD OMX_VC_16X16_DC, OMX_VC_16X16_PLANE
+
+
+ IF CortexA8
+
+ ;// Three rows of eight 16-bit multipliers, read sequentially by
+ ;// the PLANE case:
+ ;// row 0: weights applied to the edge differences (H and V sums)
+ ;// row 1: column indices x = 0..7 (multiplied by b)
+ ;// row 2: column indices x = 8..15 (multiplied by b)
+ M_TABLE armVCM4P10_MultiplierTable16x16,1
+ DCW 7, 6, 5, 4, 3, 2, 1, 8
+ DCW 0, 1, 2, 3, 4, 5, 6, 7
+ DCW 8, 9, 10, 11, 12, 13, 14, 15
+
+;//--------------------------------------------
+;// Constants
+;//--------------------------------------------
+BLK_SIZE EQU 0x10
+MUL_CONST0 EQU 0x01010101
+MUL_CONST1 EQU 0x00060004
+MUL_CONST2 EQU 0x00070005
+MUL_CONST3 EQU 0x00030001
+MASK_CONST EQU 0x00FF00FF
+
+;//--------------------------------------------
+;// Scratch variable
+;//--------------------------------------------
+y RN 12
+pc RN 15
+
+return RN 0
+pTable RN 9
+count RN 11
+pMultTable RN 9
+; ----------------------------------------------
+; Neon registers
+; ----------------------------------------------
+qAbove QN Q0.U8
+qLeft QN Q1.U8
+qSum8 QN Q0.U16
+dSum80 DN D0.U16
+dSum81 DN D1.U16
+dSum4 DN D0.U16
+dSum2 DN D0.U32
+dSum1 DN D0.U64
+qOut QN Q3.U8
+dSumLeft DN D6.U64
+dSumAbove DN D7.U64
+dSum DN D8.U64
+dSum0 DN D8.U8[0]
+
+qH QN Q11.S32
+qV QN Q12.S32
+qA QN Q11.S16
+qB QN Q6.S16
+qC QN Q7.S16
+
+qB0 QN Q5.S16
+qB1 QN Q6.S16
+dA1 DN D23.S16
+
+dH0 DN D22.S32
+dH1 DN D23.S32
+dV0 DN D24.S32
+dV1 DN D25.S32
+
+qHV QN Q11.S64
+qHV0 QN Q11.S32
+qHV1 QN Q12.S64
+
+dHV00 DN D22.S32
+dHV01 DN D23.S32
+
+dHV0 DN D22.S16[0]
+dHV1 DN D23.S16[0]
+dHV10 DN D24.S64
+dHV11 DN D25.S64
+
+qSum0 QN Q0.S16
+qSum1 QN Q1.S16
+
+dOut0 DN D6.U8
+dOut1 DN D7.U8
+
+dLeft0 DN D2.U8
+dLeft1 DN D3.U8
+qConst QN Q13.S16
+
+dAbove0 DN D0.U8
+dAbove1 DN D1.U8
+
+dRevLeft64 DN D12.U64
+dRevLeft DN D12.U8
+dRevAbove64 DN D5.U64
+dRevAbove DN D5.U8
+qLeftDiff QN Q8.S16
+dLeftDiff1 DN D17.S16
+dLeftDiff64 DN D17.S64
+qDiffLeft QN Q8.S16
+qDiffAbove QN Q4.S16
+dAboveDiff1 DN D9.S16
+dAboveDiff64 DN D9.S64
+qAboveDiff QN Q4.S16
+
+dAboveLeft DN D4.U8
+
+dDiffLeft0 DN D16.S16
+dDiffLeft1 DN D17.S16
+dDiffAbove0 DN D8.S16
+dDiffAbove1 DN D9.S16
+
+qLeft15minus0 QN Q7.S16
+dLeft15minus0 DN D14.S16
+qAbove15minus0 QN Q3.S16
+dAbove15minus0 DN D6.S16
+
+qMultiplier QN Q10.S16
+qMultiplier0 QN Q10.S16
+qMultiplier1 QN Q12.S16
+dMultiplier0 DN D20.S16
+dMultiplier1 DN D21.S16
+
+dBPlusCMult7 DN D1.S64
+dBPlusCMult7S16 DN D1.S16
+
+qTmp QN Q0.U8
+
+;//--------------------------------------------
+;// Declare input registers
+;//--------------------------------------------
+pSrcLeft RN 0 ;// input pointer
+pSrcAbove RN 1 ;// input pointer
+pSrcAboveLeft RN 2 ;// input pointer
+pDst RN 3 ;// output pointer
+leftStep RN 4 ;// input variable
+dstStep RN 5 ;// input variable
+predMode RN 6 ;// input variable
+availability RN 7 ;// input variable
+
+pTmp RN 8
+step RN 10
+pTmp2 RN 11
+
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_16x16 starts
+;//
+;// Register arguments: r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst.
+;// Stack arguments : leftStep, dstStep, predMode, availability (4 bytes each).
+;// Every case below writes a 16x16 block of bytes to pDst (row pitch dstStep)
+;// and returns OMX_Sts_NoErr in r0.
+;// predMode is used directly as an index into the jump table; no range check
+;// is performed in this function.
+;//-----------------------------------------------------------------------------------------------
+
+ ;// Write function header
+ ;// (r11 / d15 are presumably the highest ARM / NEON registers the macro must
+ ;// preserve - see M_START in armCOMM_s.h)
+ M_START omxVCM4P10_PredictIntra_16x16, r11, d15
+
+ ;// Define stack arguments
+ M_ARG LeftStep, 4
+ M_ARG DstStep, 4
+ M_ARG PredMode, 4
+ M_ARG Availability, 4
+
+ ;// M_STALL ARM1136JS=4
+
+ LDR pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
+
+ ;// Load argument from the stack
+ M_LDR predMode, PredMode ;// Arg predMode loaded from stack to reg
+ M_LDR leftStep, LeftStep ;// Arg leftStep loaded from stack to reg
+ M_LDR dstStep, DstStep ;// Arg dstStep loaded from stack to reg
+ M_LDR availability, Availability ;// Arg availability loaded from stack to reg
+
+ MOV y, #BLK_SIZE ;// Row counter (16) used by the HOR and PLANE loops
+ LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case selected by predMode
+
+OMX_VC_16X16_VERT
+ ;// Vertical prediction: the 16 bytes at pSrcAbove are copied into each
+ ;// of the 16 destination rows. pDst walks the even rows and pTmp the
+ ;// odd rows, both advancing by 2*dstStep.
+ VLD1 qAbove, [pSrcAbove]
+ ADD pTmp, pDst, dstStep ;// pTmp = address of row 1
+ ADD step, dstStep, dstStep ;// step = 2*dstStep
+ VST1 qAbove, [pDst], step
+ VST1 qAbove, [pTmp], step
+ VST1 qAbove, [pDst], step
+ VST1 qAbove, [pTmp], step
+ VST1 qAbove, [pDst], step
+ VST1 qAbove, [pTmp], step
+ VST1 qAbove, [pDst], step
+ VST1 qAbove, [pTmp], step
+ VST1 qAbove, [pDst], step
+ VST1 qAbove, [pTmp], step
+ VST1 qAbove, [pDst], step
+ VST1 qAbove, [pTmp], step
+ VST1 qAbove, [pDst], step
+ VST1 qAbove, [pTmp], step
+ VST1 qAbove, [pDst] ;// row 14
+ VST1 qAbove, [pTmp] ;// row 15
+ MOV return, #OMX_Sts_NoErr ;// returnNoError
+ M_EXIT
+
+OMX_VC_16X16_HOR
+ ;// Horizontal prediction: each destination row is filled with the
+ ;// single left-edge byte of that row. Even rows use pSrcLeft/pDst,
+ ;// odd rows use pTmp/pTmp2; leftStep and dstStep are doubled in place
+ ;// so every pointer skips two rows per access.
+ ADD pTmp, pSrcLeft, leftStep
+ ADD leftStep, leftStep, leftStep
+ ADD pTmp2, pDst, dstStep
+ ADD dstStep, dstStep, dstStep
+LoopHor
+ VLD1 {qLeft[]}, [pSrcLeft], leftStep ;// replicate one left byte into all 16 lanes
+ VLD1 {qTmp[]}, [pTmp], leftStep
+ SUBS y, y, #8 ;// 8 rows are produced per pass
+ VST1 qLeft, [pDst], dstStep
+ VST1 qTmp, [pTmp2], dstStep
+ VLD1 {qLeft[]}, [pSrcLeft], leftStep
+ VLD1 {qTmp[]}, [pTmp], leftStep
+ VST1 qLeft, [pDst], dstStep
+ VST1 qTmp, [pTmp2], dstStep
+ VLD1 {qLeft[]}, [pSrcLeft], leftStep
+ VLD1 {qTmp[]}, [pTmp], leftStep
+ VST1 qLeft, [pDst], dstStep
+ VST1 qTmp, [pTmp2], dstStep
+ VLD1 {qLeft[]}, [pSrcLeft], leftStep
+ VLD1 {qTmp[]}, [pTmp], leftStep
+ VST1 qLeft, [pDst], dstStep
+ VST1 qTmp, [pTmp2], dstStep
+
+ BNE LoopHor ;// Two passes of 8 rows (y starts at BLK_SIZE = 16)
+ MOV return, #OMX_Sts_NoErr
+ M_EXIT
+
+OMX_VC_16X16_DC
+ ;// DC prediction: the whole block is filled with one value.
+ ;// left only : (sum(left) + 8) >> 4
+ ;// upper only : (sum(above) + 8) >> 4
+ ;// both : (sum(left) + sum(above) + 16) >> 5
+ ;// neither : 128
+ ;// count records how many of the two edges were available.
+ MOV count, #0 ;// count = 0
+ TST availability, #OMX_VC_LEFT
+ BEQ UpperOrNoneAvailable ;// Jump to Upper if not left
+
+ ADD pTmp, pSrcLeft, leftStep
+ ADD step, leftStep, leftStep
+
+ ;// Gather the 16 left-edge bytes, one per lane
+ VLD1 {qLeft[0]}, [pSrcLeft],step
+ VLD1 {qLeft[1]}, [pTmp],step
+ VLD1 {qLeft[2]}, [pSrcLeft],step
+ VLD1 {qLeft[3]}, [pTmp],step
+ VLD1 {qLeft[4]}, [pSrcLeft],step
+ VLD1 {qLeft[5]}, [pTmp],step
+ VLD1 {qLeft[6]}, [pSrcLeft],step
+ VLD1 {qLeft[7]}, [pTmp],step
+ VLD1 {qLeft[8]}, [pSrcLeft],step
+ VLD1 {qLeft[9]}, [pTmp],step
+ VLD1 {qLeft[10]},[pSrcLeft],step
+ VLD1 {qLeft[11]},[pTmp],step
+ VLD1 {qLeft[12]},[pSrcLeft],step
+ VLD1 {qLeft[13]},[pTmp],step
+ VLD1 {qLeft[14]},[pSrcLeft],step
+ VLD1 {qLeft[15]},[pTmp]
+
+ ;// Pairwise reduction 16 x U8 -> 8 x U16 -> 4 x U16 -> 2 x U32 -> 1 x U64
+ VPADDL qSum8, qLeft
+ ADD count, count, #1 ;// left edge counted
+ VPADD dSum4, dSum80, dSum81
+ VPADDL dSum2, dSum4
+ VPADDL dSumLeft, dSum2 ;// dSumLeft = sum of the 16 left bytes
+ VRSHR dSum, dSumLeft, #4 ;// (sum + 8) >> 4, used if only left is available
+
+UpperOrNoneAvailable
+ TST availability, #OMX_VC_UPPER ;// if(availability & #OMX_VC_UPPER)
+ BEQ BothOrNoneAvailable ;// Skip the upper-edge sum if upper is not available
+ VLD1 qAbove, [pSrcAbove]
+ ADD count, count, #1 ;// if upper inc count by 1
+ VPADDL qSum8, qAbove
+ VPADD dSum4, dSum80, dSum81
+ VPADDL dSum2, dSum4
+ VPADDL dSumAbove, dSum2 ;// dSumAbove = sum of the 16 above bytes
+ VRSHR dSum, dSumAbove, #4 ;// (sum + 8) >> 4, used if only upper is available
+
+BothOrNoneAvailable
+ CMP count, #2 ;// check if both available
+ BNE NoneAvailable
+ VADD dSum, dSumAbove, dSumLeft
+ VRSHR dSum, dSum, #5 ;// (sumLeft + sumAbove + 16) >> 5
+
+
+NoneAvailable
+ VDUP qOut, dSum0 ;// broadcast low byte of dSum (overwritten below when count == 0)
+ CMP count, #0 ;// check if none available
+ ADD pTmp, pDst, dstStep
+ ADD step, dstStep, dstStep
+ BNE LoopDC
+ VMOV qOut, #128 ;// neither edge available: fill with 128
+LoopDC
+ ;// Not a loop: 16 unrolled row stores, even rows via pDst, odd rows via pTmp
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ VST1 qOut, [pDst], step
+ VST1 qOut, [pTmp], step
+ MOV return, #OMX_Sts_NoErr
+ M_EXIT
+
+OMX_VC_16X16_PLANE
+ ;// Plane prediction. With H and V the weighted edge gradients,
+ ;// b = (5*H+32)>>6, c = (5*V+32)>>6, a = 16*(pSrcAbove[15]+pSrcLeft[15])
+ ;// and each output byte is clip(0,255,(a + b*(x-7) + c*(y-7) + 16)>>5).
+ ;// Many Q/D aliases share physical registers here, so the instruction
+ ;// order matters.
+ LDR pMultTable, =armVCM4P10_MultiplierTable16x16
+ VLD1 qAbove, [pSrcAbove] ;// pSrcAbove[x] :0<= x <= 15
+ VLD1 dAboveLeft[0],[pSrcAboveLeft] ;// only lane 0 is loaded
+ ADD pTmp, pSrcLeft, leftStep
+ ADD step, leftStep, leftStep
+ ;// Gather the 16 left-edge bytes, one per lane
+ VLD1 {qLeft[0]}, [pSrcLeft],step
+ VLD1 {qLeft[1]}, [pTmp],step
+ VLD1 {qLeft[2]}, [pSrcLeft],step
+ VLD1 {qLeft[3]}, [pTmp],step
+ VLD1 {qLeft[4]}, [pSrcLeft],step
+ VLD1 {qLeft[5]}, [pTmp],step
+ VLD1 {qLeft[6]}, [pSrcLeft],step
+ VLD1 {qLeft[7]}, [pTmp],step
+ VLD1 {qLeft[8]}, [pSrcLeft],step
+ VLD1 {qLeft[9]}, [pTmp],step
+ VLD1 {qLeft[10]}, [pSrcLeft],step
+ VLD1 {qLeft[11]}, [pTmp],step
+ VLD1 {qLeft[12]}, [pSrcLeft],step
+ VLD1 {qLeft[13]}, [pTmp],step
+ VLD1 {qLeft[14]}, [pSrcLeft],step
+ VLD1 {qLeft[15]}, [pTmp]
+
+ ;// Lane lists in the next comments are written lane 0 first
+ VREV64 dRevAbove, dAbove1 ;// pSrcAbove[15:14:13:12:11:10:9:8]
+ VSUBL qAbove15minus0, dRevAbove, dAboveLeft ;// lane 0 = pSrcAbove[15] - pSrcAboveLeft[0] (only lane 0 is used)
+ VSHR dRevAbove64, dRevAbove64, #8 ;// pSrcAbove[14:13:12:11:10:9:8:X]
+ VSUBL qAboveDiff, dRevAbove, dAbove0 ;// lane i = pSrcAbove[14-i] - pSrcAbove[i], i = 0..6
+
+ ;// Drop lane 7 of the differences and append (pSrcAbove[15] - pSrcAboveLeft[0])
+ VSHL dAboveDiff64, dAboveDiff64, #16
+ VEXT dDiffAbove1, dAboveDiff1, dAbove15minus0, #1
+
+ VREV64 dRevLeft,dLeft1 ;// pSrcLeft[15:14:13:12:11:10:9:8]
+ VSUBL qLeft15minus0,dRevLeft, dAboveLeft ;// lane 0 = pSrcLeft[15] - pSrcAboveLeft[0] (only lane 0 is used)
+ VSHR dRevLeft64, dRevLeft64, #8 ;// pSrcLeft[14:13:12:11:10:9:8:X]
+ VSUBL qLeftDiff,dRevLeft, dLeft0 ;// lane i = pSrcLeft[14-i] - pSrcLeft[i], i = 0..6
+
+ ;// Multiplier = 7,6,5,4,3,2,1,8 (lane 0 first; first row of the table)
+ VLD1 qMultiplier, [pMultTable]!
+
+ ;// Same lane fix-up for the left differences
+ VSHL dLeftDiff64, dLeftDiff64, #16
+ VEXT dDiffLeft1, dLeftDiff1, dLeft15minus0, #1
+
+ ;// Weighted products; their lane sums are H (above) and V (left)
+ VMULL qH,dDiffAbove0, dMultiplier0
+ VMULL qV,dDiffLeft0, dMultiplier0
+ VMLAL qH,dDiffAbove1, dMultiplier1
+ VMLAL qV,dDiffLeft1, dMultiplier1
+
+ ;// Reduce to one 64-bit H and one 64-bit V, then form 5*H and 5*V
+ VPADD dHV00,dH1,dH0
+ VPADD dHV01,dV1,dV0
+ VPADDL qHV, qHV0
+ VSHL qHV1,qHV,#2
+ VADD qHV,qHV,qHV1
+
+ ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
+ VRSHR qHV,qHV,#6
+
+ ;// HV1 = [c*7|b*7]
+ VSHL qHV1,qHV,#3
+ VSUB qHV1,qHV1,qHV
+
+ ;// Multiplier0 = 0,1,2,...,7 (lane 0 first; second row of the table)
+ VLD1 qMultiplier0, [pMultTable]!
+ VDUP qB, dHV0 ;// b in every lane
+ VDUP qC, dHV1 ;// c in every lane
+
+ ;// a = 16*(pSrcAbove[15] + pSrcLeft[15]), broadcast from the top lane
+ VADDL qA,dAbove1,dLeft1
+ VSHL qA,qA, #4
+ VDUP qA,dA1[3]
+ VADD dBPlusCMult7, dHV10, dHV11 ;// 7*b + 7*c
+
+ ;// Multiplier1 = 8,9,10,...,15 (lane 0 first; third row of the table)
+ VLD1 qMultiplier1, [pMultTable]
+ ;// Const = a - 7*(b+c)
+ VDUP qConst, dBPlusCMult7S16[0]
+ VSUB qConst, qA, qConst
+
+ ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
+ VMUL qB0,qB,qMultiplier0
+
+ ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]
+ VMUL qB1,qB,qMultiplier1
+
+ ;// Row 0 sums: b*x + Const for x = 0..7 and x = 8..15
+ VADD qSum0, qB0, qConst
+ VADD qSum1, qB1, qConst
+
+ ;// Loops for 16 times
+LoopPlane
+ ;// (b*x + c*y + C)>>5 with rounding, saturated to 0..255
+ VQRSHRUN dOut0, qSum0,#5
+ VQRSHRUN dOut1, qSum1,#5
+ SUBS y, y, #1
+ VST1 qOut,[pDst],dstStep
+ VADD qSum0,qSum0,qC ;// next row: add c to every column
+ VADD qSum1,qSum1,qC
+ BNE LoopPlane
+
+ MOV return, #OMX_Sts_NoErr
+
+ M_END
+
+ ENDIF ;// CortexA8
+
+ END
+;-----------------------------------------------------------------------------------------------
+; omxVCM4P10_PredictIntra_16x16 ends
+;-----------------------------------------------------------------------------------------------
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
new file mode 100755
index 0000000..39eb8a4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
@@ -0,0 +1,531 @@
+;//
+;//
+;// File Name: omxVCM4P10_PredictIntra_4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Define the processor variants supported by this file
+
+ M_VARIANTS CortexA8
+
+;//-------------------------------------------------------
+;// Jump table implementing a C-style switch on predMode.
+;// Each word is the address of the label that handles one
+;// prediction mode; the entry point indexes this table with
+;// predMode and loads the selected address straight into pc.
+;//-------------------------------------------------------
+
+ M_TABLE armVCM4P10_pSwitchTable4x4
+ DCD OMX_VC_4x4_VERT, OMX_VC_4x4_HOR
+ DCD OMX_VC_4x4_DC, OMX_VC_4x4_DIAG_DL
+ DCD OMX_VC_4x4_DIAG_DR, OMX_VC_4x4_VR
+ DCD OMX_VC_4x4_HD, OMX_VC_4x4_VL
+ DCD OMX_VC_4x4_HU
+
+
+ IF CortexA8
+
+;//--------------------------------------------
+;// Scratch variable
+;//--------------------------------------------
+return RN 0
+pTable RN 8
+pc RN 15
+
+;//--------------------------------------------
+;// Declare input registers
+;//--------------------------------------------
+pSrcLeft RN 0 ;// input pointer
+pSrcAbove RN 1 ;// input pointer
+pSrcAboveLeft RN 2 ;// input pointer
+pDst RN 3 ;// output pointer
+leftStep RN 4 ;// input variable
+dstStep RN 5 ;// input variable
+predMode RN 6 ;// input variable
+availability RN 7 ;// input variable
+pDst1 RN 1
+pDst2 RN 4
+pDst3 RN 6
+
+pSrcTmp RN 9
+srcStep RN 10
+pDstTmp RN 11
+dstep RN 12
+
+;//-------------------
+;// Neon registers
+;//-------------------
+
+;// OMX_VC_4x4_VERT
+dAboveU32 DN D0.U32
+
+;// OMX_VC_4x4_HOR
+dLeftVal0 DN D0.8
+dLeftVal1 DN D1.8
+dLeftVal2 DN D2.8
+dLeftVal3 DN D3.8
+dLeftVal0U32 DN D0.U32
+dLeftVal1U32 DN D1.U32
+dLeftVal2U32 DN D2.U32
+dLeftVal3U32 DN D3.U32
+
+;// OMX_VC_4x4_DC
+dLeftVal DN D0.U8
+dLeftValU32 DN D0.U32
+dSumAboveLeftU16 DN D1.U16
+dSumAboveLeftU32 DN D1.U32
+dSumAboveLeftU64 DN D1.U64
+dSumAboveLeftU8 DN D1.U8
+dSum DN D0.U8
+
+dSumLeftValU16 DN D1.U16
+dSumLeftValU32 DN D1.U32
+dSumLeftValU64 DN D1.U64
+dSumLeftValU8 DN D1.U8
+
+dAboveVal DN D0.U8
+dSumAboveValU16 DN D1.U16
+dSumAboveValU32 DN D1.U32
+dSumAboveValU64 DN D1.U64
+dSumAboveValU8 DN D1.U8
+dConst128U8 DN D0.U8
+
+
+;//OMX_VC_4x4_DIAG_DL
+
+dAbove DN D0.U8
+dU7 DN D2.U8
+dU3 DN D2.U8
+dAbove0 DN D3.U8
+dAbove1 DN D4.U8
+dAbove2 DN D5.U8
+dTmp DN D6.U8
+dTmp0 DN D7.U8
+dTmp1 DN D8.U8
+dTmp2 DN D9.U8
+dTmp3 DN D10.U8
+dTmpU32 DN D6.U32
+
+
+;//OMX_VC_4x4_DIAG_DR
+dLeft DN D1.U8
+dUL DN D2.U8
+
+;//OMX_VC_4x4_VR
+dLeft0 DN D1.U8
+dLeft1 DN D2.U8
+dEven0 DN D3.U8
+dEven1 DN D4.U8
+dEven2 DN D5.U8
+dOdd0 DN D6.U8
+dOdd1 DN D11.U8
+dOdd2 DN D12.U8
+dTmp3U32 DN D10.U32
+dTmp2U32 DN D9.U32
+
+
+;//OMX_VC_4x4_HD
+dTmp1U64 DN D8.U64
+dTmp0U64 DN D7.U64
+dTmpU64 DN D6.U64
+dTmpU32 DN D6.U32
+dTmp1U32 DN D8.U32
+
+;//OMX_VC_4x4_HU
+dL3 DN D2.U8
+dLeftHU0 DN D3.U8
+dLeftHU1 DN D4.U8
+dLeftHU2 DN D5.U8
+dTmp0U32 DN D7.U32
+
+
+
+
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_4x4 starts
+;//
+;// Register arguments: r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst.
+;// Stack arguments : leftStep, dstStep, predMode, availability (4 bytes each).
+;// Every case below writes a 4x4 block of bytes to pDst (row pitch dstStep)
+;// and finishes at ExitPredict4x4, which returns OMX_Sts_NoErr in r0.
+;// predMode is used directly as an index into the jump table; no range check
+;// is performed in this function.
+;//-----------------------------------------------------------------------------------------------
+
+ ;// Write function header
+ M_START omxVCM4P10_PredictIntra_4x4, r12,d12
+
+ ;// Define stack arguments
+ M_ARG LeftStep, 4
+ M_ARG DstStep, 4
+ M_ARG PredMode, 4
+ M_ARG Availability, 4
+
+
+ LDR pTable,=armVCM4P10_pSwitchTable4x4 ;// Load index table for switch case
+
+ ;// Load argument from the stack
+ M_LDRD predMode,availability,PredMode ;// Arg predMode & availability loaded from stack to reg
+ M_LDRD leftStep,dstStep,LeftStep ;// Arg leftStep & dstStep loaded from stack to reg
+
+
+ LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case selected by predMode
+
+
+OMX_VC_4x4_HOR
+ ;// Horizontal prediction: each of the 4 rows is filled with the
+ ;// left-edge byte of that row.
+
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+ ;// Load Left Edge (each byte replicated across the whole D register)
+ VLD1 {dLeftVal0[]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeftVal1[]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeftVal2[]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeftVal3[]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+
+ ;// Each store writes one 32-bit lane, i.e. 4 bytes per row
+ VST1 dLeftVal0U32[0],[pDst],dstep ;// pDst[0*dstStep+x] :0<= x <= 3
+ VST1 dLeftVal1U32[0],[pDstTmp],dstep ;// pDst[1*dstStep+x] :0<= x <= 3
+ VST1 dLeftVal2U32[0],[pDst] ;// pDst[2*dstStep+x] :0<= x <= 3
+ VST1 dLeftVal3U32[0],[pDstTmp] ;// pDst[3*dstStep+x] :0<= x <= 3
+
+ B ExitPredict4x4 ;// Branch to exit code
+
+OMX_VC_4x4_VERT
+ ;// Vertical prediction: the 4 bytes at pSrcAbove are copied into
+ ;// each of the 4 destination rows.
+
+ ;// Load Upper Edge
+ VLD1 dAboveU32[0],[pSrcAbove]
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+
+ ;// Shared store tail: the DC paths also branch here with the fill value
+ ;// in the low 32 bits of D0 and pDstTmp/dstep already set up.
+DCPredict4x4VertStore
+
+ VST1 dAboveU32[0],[pDst],dstep ;// row 0
+ VST1 dAboveU32[0],[pDstTmp],dstep ;// row 1
+ VST1 dAboveU32[0],[pDst] ;// row 2
+ VST1 dAboveU32[0],[pDstTmp] ;// row 3
+
+ B ExitPredict4x4 ;// Branch to exit code
+
+OMX_VC_4x4_DC
+ ;// DC prediction: the block is filled with one value.
+ ;// left and upper : (sum(left) + sum(above) + 4) >> 3
+ ;// left only : (sum(left) + 2) >> 2
+ ;// upper only : (sum(above) + 2) >> 2
+ ;// neither : 128
+ ;// All paths end by branching to DCPredict4x4VertStore with the fill
+ ;// value replicated in D0.
+
+
+ TST availability, #OMX_VC_LEFT
+ BEQ DCPredict4x4LeftNotAvailable
+
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+ ;// Load Left Edge
+ VLD1 {dLeftVal[0]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeftVal[1]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeftVal[2]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeftVal[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ TST availability, #OMX_VC_UPPER
+ BEQ DCPredict4x4LeftOnlyAvailable
+
+ ;// Load Upper Edge also (into the upper 32 bits of the same register)
+ VLD1 dLeftValU32[1],[pSrcAbove] ;// pSrcAbove[0 to 3]
+ MOV return, #OMX_Sts_NoErr ;// r0 (pSrcLeft) is no longer needed
+
+ VPADDL dSumAboveLeftU16, dLeftVal ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
+ VPADDL dSumAboveLeftU32, dSumAboveLeftU16 ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
+ VPADDL dSumAboveLeftU64, dSumAboveLeftU32 ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
+ VRSHR dSumAboveLeftU64,dSumAboveLeftU64,#3 ;// Sum = (Sum + 4) >> 3
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VDUP dSum,dSumAboveLeftU8[0] ;// replicate the result byte
+
+ B DCPredict4x4VertStore
+
+DCPredict4x4LeftOnlyAvailable
+
+ MOV return, #OMX_Sts_NoErr ;// returnNoError
+
+ ;// Only the low 32 bits of dLeftVal were loaded; XX marks don't-care lanes
+ VPADDL dSumLeftValU16, dLeftVal ;// [ XX | pSrcLeft[2+3 | 0+1]]
+ VPADDL dSumLeftValU32, dSumLeftValU16 ;// [ XXXX | pSrcLeft[2+3+0+1]]
+
+ VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VDUP dSum,dSumLeftValU8[0] ;// replicate the result byte
+
+ B DCPredict4x4VertStore
+
+DCPredict4x4LeftNotAvailable
+
+ TST availability, #OMX_VC_UPPER
+ BEQ DCPredict4x4NoneAvailable
+
+ ;// Load Upper Edge
+ VLD1 dAboveU32[0],[pSrcAbove] ;// pSrcAbove[0 to 3]
+ MOV return, #OMX_Sts_NoErr
+
+ VPADDL dSumAboveValU16, dAboveVal ;// [ XX | pSrcAbove[2+3 | 0+1]]
+ VPADDL dSumAboveValU32, dSumAboveValU16 ;// [ XXXX | pSrcAbove[2+3+0+1]]
+
+ VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VDUP dSum,dSumAboveValU8[0] ;// replicate the result byte
+
+ B DCPredict4x4VertStore
+
+DCPredict4x4NoneAvailable
+
+ VMOV dConst128U8,#0x80 ;// 0x8080808080808080: neither edge is available
+ MOV return, #OMX_Sts_NoErr
+
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ B DCPredict4x4VertStore
+
+
+
+OMX_VC_4x4_DIAG_DL
+ ;// Diagonal down-left prediction. Builds three copies of the above
+ ;// row shifted by 0, 1 and 2 samples, applies the (a+2*b+c+2)>>2
+ ;// filter, and stores the result shifted by one more sample per row.
+ ;// When upper-right is not available U3 is repeated in place of U4..U7.
+
+ TST availability, #OMX_VC_UPPER_RIGHT
+ BEQ DiagDLUpperRightNotAvailable
+
+ VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0]
+ VDUP dU7, dAbove0[7] ;// [U7|U7|U7|U7|U7|U7|U7|U7]
+ VEXT dAbove1, dAbove0, dU7, #1 ;// [U7|U7|U6|U5|U4|U3|U2|U1]
+ VEXT dAbove2, dAbove0, dU7, #2 ;// [U7|U7|U7|U6|U5|U4|U3|U2]
+ B DiagDLPredict4x4Store
+
+DiagDLUpperRightNotAvailable
+ ;// Only 4 bytes are read from pSrcAbove on this path
+ VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-]
+ VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3]
+
+ VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
+ VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
+ VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
+
+DiagDLPredict4x4Store
+
+ ;// Halving add followed by rounding halving add gives the 3-tap filter
+ VHADD dTmp, dAbove0, dAbove2
+ VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2
+
+
+ ;// Store 4 bytes per row, rotating the filtered vector by one byte each time
+ VST1 dTmpU32[0],[pDst],dstStep
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst],dstStep
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst],dstStep
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst]
+
+ B ExitPredict4x4 ;// Branch to exit code
+
+
+OMX_VC_4x4_DIAG_DR
+ ;// Diagonal down-right prediction. The edge samples are arranged as one
+ ;// vector L3..L0,UL,U0..U3, filtered with (a+2*b+c+2)>>2, and stored
+ ;// bottom row first, shifting by one sample per row.
+ ;// pDst1/pDst2/pDst3 reuse the registers of pSrcAbove, leftStep and
+ ;// predMode, so each is written only after its previous value is dead.
+
+
+ ;// Load U0,U1,U2,U3
+
+ VLD1 dAboveU32[0],[pSrcAbove] ;// [X|X|X|X|U3|U2|U1|U0]
+
+ ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
+ VLD1 {dLeft[7]},[pSrcAboveLeft]
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+ ADD pDst1,pDst,dstStep ;// address of row 1
+
+ VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+
+ VEXT dAbove0,dLeft,dAbove,#3 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
+ ADD pDst2,pDst1,dstStep ;// address of row 2
+ VEXT dAbove1,dLeft,dAbove,#4 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
+ ADD pDst3,pDst2,dstStep ;// address of row 3
+ VEXT dAbove2,dLeft,dAbove,#5 ;// [ X|U3|U2|U1|U0|UL|L0|L1]
+
+ VHADD dTmp, dAbove0, dAbove2
+ VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2
+
+
+ VST1 dTmpU32[0],[pDst3] ;// Store pTmp[0],[1],[2],[3] @ pDst3
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst2] ;// Store pTmp[1],[2],[3],[4] @ pDst2
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst1] ;// Store pTmp[2],[3],[4],[5] @ pDst1
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst] ;// Store pTmp[3],[4],[5],[6] @ pDst
+
+ B ExitPredict4x4 ;// Branch to exit code
+
+OMX_VC_4x4_VR
+ ;// Vertical-right prediction. Two filtered vectors are built with the
+ ;// same halving-add pair (dTmp0 from the "even" inputs, dTmp1 from the
+ ;// "odd" inputs); the four rows are these two vectors and their
+ ;// one-byte-shifted copies.
+
+
+ ;// Load UL,U0,U1,U2,U3
+ VLD1 dAboveU32[0],[pSrcAbove]
+ VLD1 dAbove[7],[pSrcAboveLeft] ;// [UL|X|X|X|U3|U2|U1|U0]
+
+ ;// Load L0,L1,L2 ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
+ ;// dLeft1 = [L1| X|X|X|X|X|X|X]
+ VLD1 {dLeft0[7]},[pSrcLeft],leftStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft1[7]},[pSrcLeft],leftStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft0[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+
+
+ VEXT dOdd2,dAbove,dAbove,#7 ;// [ x x x U3 U2 U1 U0 UL ]
+ VEXT dEven0,dLeft0,dOdd2,#6 ;// [ x x x U1 U0 UL L0 L2 ]
+ VEXT dEven1,dLeft1,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L1 ]
+ VEXT dEven2,dLeft0,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L0 ]
+ VEXT dOdd0,dLeft1,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L1 ]
+ VEXT dOdd1,dLeft0,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L0 ]
+
+ VHADD dTmp1, dOdd0, dOdd2
+ VRHADD dTmp1, dTmp1, dOdd1 ;// Tmp[ x x x 9 7 5 3 1 ]
+
+ VHADD dTmp0, dEven0, dEven2
+ VRHADD dTmp0, dTmp0, dEven1 ;// Tmp[ x x x 8 6 4 2 0 ]
+
+
+ VEXT dTmp3,dTmp1,dTmp1,#1 ;// Tmp[ x x x x 9 7 5 3 ]
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VEXT dTmp2,dTmp0,dTmp0,#1 ;// Tmp[ x x x x 8 6 4 2 ]
+
+
+ ;// Rows 0 and 2 go through pDst, rows 1 and 3 through pDstTmp
+ VST1 dTmp3U32[0],[pDst],dstep ;// Tmp[9],[7],[5],[3]
+ VST1 dTmp2U32[0],[pDstTmp],dstep ;// Tmp[8],[6],[4],[2]
+ VST1 dTmp1U32[0],[pDst],dstep ;// Tmp[7],[5],[3],[1]
+ VST1 dTmp0U32[0],[pDstTmp] ;// Tmp[6],[4],[2],[0]
+
+ B ExitPredict4x4 ;// Branch to exit code
+
+OMX_VC_4x4_HD
+ ;// Horizontal-down prediction. A 3-tap filtered vector (dTmp0) and a
+ ;// 2-tap averaged vector (dTmp1) are interleaved, and the four rows are
+ ;// taken as 32-bit lanes of the interleaved result.
+
+
+ ;// Load the above row. This is a full 8-byte load, although only lanes
+ ;// 0..2 (U0..U2) are picked up by the VEXTs below.
+ ;// NOTE(review): confirm callers always provide 8 readable bytes at
+ ;// pSrcAbove for this mode; DIAG_DR loads only 4 bytes for the same data.
+ VLD1 dAbove,[pSrcAbove] ;//dAboveLeftVal = [U7|U6|U5|U4|U3|U2|U1|U0]
+
+ ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
+ VLD1 {dLeft[7]},[pSrcAboveLeft]
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+
+ VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ VEXT dAbove0,dLeft,dAbove,#3 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
+ VEXT dAbove1,dLeft,dAbove,#2 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]
+ VEXT dAbove2,dLeft,dAbove,#1 ;// [ U0|UL|L0|L1|L2|L3|X|X ]
+
+ VHADD dTmp0, dAbove0, dAbove2
+ VRHADD dTmp0, dTmp0, dAbove1 ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]
+
+
+ VRHADD dTmp1, dAbove1, dAbove0 ;// (a+b+1)>>1
+ VSHL dTmp1U64,dTmp1U64,#24 ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]
+
+
+ VSHL dTmpU64,dTmp0U64,#16 ;// Tmp[ 2|4|6|8| X | X | X | X ]
+ VZIP dTmp1,dTmp ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
+ VEXT dTmp0,dTmp0,dTmp0,#6 ;// Tmp[ X| X| X| X| X| X| 0 | 1 ]
+ VEXT dTmp1,dTmp,dTmp0,#2 ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]
+
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+
+ ;// Rows 0 and 2 go through pDst, rows 1 and 3 through pDstTmp
+ VST1 dTmp1U32[1],[pDst],dstep ;// Store pTmp[0|1|2|3]
+ VST1 dTmpU32[1],[pDstTmp],dstep ;// Store pTmp[2|3|4|5]
+ VST1 dTmp1U32[0],[pDst] ;// Store pTmp[4|5|6|7]
+ VST1 dTmpU32[0],[pDstTmp] ;// Store pTmp[6|7|8|9]
+
+ B ExitPredict4x4 ;// Branch to exit code
+
+OMX_VC_4x4_VL
+ ;// Vertical-left prediction. Rows 0 and 2 come from the 2-tap average
+ ;// of the above row, rows 1 and 3 from the 3-tap filter; rows 2 and 3
+ ;// are the same vectors shifted by one sample.
+ ;// When upper-right is not available U3 is repeated in place of U4..U7.
+
+
+ TST availability, #OMX_VC_UPPER_RIGHT
+ BEQ DiagVLUpperRightNotAvailable
+
+ VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0]
+ VEXT dAbove1,dAbove0,dAbove0,#1 ;// [ X|U7|U6|U5|U4|U3|U2|U1]
+ VEXT dAbove2,dAbove1,dAbove1,#1 ;// [ X| X|U7|U6|U5|U4|U3|U2]
+
+ B DiagVLPredict4x4Store
+
+DiagVLUpperRightNotAvailable
+ ;// Only 4 bytes are read from pSrcAbove on this path
+ VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-]
+ VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3]
+
+ VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
+ VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
+ VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
+
+DiagVLPredict4x4Store
+
+ VRHADD dTmp0, dAbove1, dAbove0 ;// (a+b+1)>>1
+ ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]
+
+ VHADD dTmp3, dAbove0, dAbove2
+ VRHADD dTmp3, dTmp3, dAbove1 ;// (a+2*b+c+2)>>2
+ ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
+
+ VEXT dTmp1,dTmp0,dTmp0,#1 ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VEXT dTmp2,dTmp3,dTmp1,#1 ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]
+
+ ;// Rows 0 and 2 go through pDst, rows 1 and 3 through pDstTmp
+ VST1 dTmp0U32[0],[pDst],dstep ;// Tmp[6],[4],[2],[0]
+ VST1 dTmp3U32[0],[pDstTmp],dstep ;// Tmp[7],[5],[3],[1]
+ VST1 dTmp1U32[0],[pDst] ;// Tmp[8],[6],[4],[2]
+ VST1 dTmp2U32[0],[pDstTmp] ;// Tmp[9],[7],[5],[3]
+
+ B ExitPredict4x4 ;// Branch to exit code
+
+OMX_VC_4x4_HU
+ ;// Horizontal-up prediction. Uses only the left edge: the 2-tap average
+ ;// and the 3-tap filter of L0..L3 (padded with L3) are interleaved, and
+ ;// successive rows step through the interleaved vector two bytes at a
+ ;// time. This is the last case and falls through into ExitPredict4x4.
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+
+ ;// Load Left Edge ;// [L3|L2|L1|L0|X|X|X|X]
+ VLD1 {dLeft[4]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeft[7]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ VDUP dL3,dLeft[7] ;// [L3|L3|L3|L3|L3|L3|L3|L3]
+
+ VEXT dLeftHU0,dLeft,dL3,#4 ;// [L3|L3|L3|L3|L3|L2|L1|L0]
+ VEXT dLeftHU1,dLeft,dL3,#5 ;// [L3|L3|L3|L3|L3|L3|L2|L1]
+ VEXT dLeftHU2,dLeft,dL3,#6 ;// [L3|L3|L3|L3|L3|L3|L3|L2]
+
+ VHADD dTmp0, dLeftHU0, dLeftHU2
+ VRHADD dTmp0, dTmp0, dLeftHU1 ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]
+
+ VRHADD dTmp1, dLeftHU1, dLeftHU0 ;// (a+b+1)>>1
+ ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
+
+ VZIP dTmp1,dTmp0 ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
+ ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]
+
+
+ VST1 dTmp1U32[0],[pDst],dstStep ;// [3|2|1|0]
+ VEXT dTmp1,dTmp1,dTmp1,#2
+ VST1 dTmp1U32[0],[pDst],dstStep ;// [5|4|3|2]
+ VEXT dTmp1,dTmp1,dTmp1,#2
+ VST1 dTmp1U32[0],[pDst],dstStep ;// [7|6|5|4]
+ VST1 dTmp0U32[0],[pDst] ;// [9|8|7|6] (all L3)
+
+
+ ;// Common exit for every 4x4 prediction case
+ExitPredict4x4
+
+ MOV return, #OMX_Sts_NoErr
+ M_END
+
+ ENDIF ;// CortexA8
+
+ END
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_4x4 ends
+;//-----------------------------------------------------------------------------------------------
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
new file mode 100755
index 0000000..e394339
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantChromaDCFromPair_s.s
@@ -0,0 +1,140 @@
+;//
+;//
+;// File Name: omxVCM4P10_TransformDequantChromaDCFromPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ IMPORT armVCM4P10_QPDivTable
+ IMPORT armVCM4P10_VMatrixQPModTable
+
+ M_VARIANTS CortexA8
+
+
+
+
+ IF CortexA8
+
+;// ARM Registers
+;//--------------------------------------
+;// Declare input registers
+;//--------------------------------------
+ppSrc RN 0
+pDst RN 1
+QP RN 2
+
+;//--------------------------------
+;// Scratch variable for Unpack2x2
+;//--------------------------------
+pSrc RN 9
+Value RN 4
+Value2 RN 5
+Flag RN 6
+strOffset RN 7
+cstOffset RN 8
+
+;//--------------------------------
+;// Scratch variable
+;//--------------------------------
+r0w0 RN 3
+r0w1 RN 4
+
+c0w0 RN 5
+c1w0 RN 6
+
+return RN 0
+pQPDivTable RN 5
+pQPModTable RN 6
+Shift RN 9
+Scale RN 2
+
+
+
+;// Neon Registers
+
+dZero DN D0.U16
+dInvTrCoeff DN D0.S16
+dScale DN D1.S16
+qDqntCoeff QN Q1.S32
+dDqntCoeff DN D2.S16
+
+
+ ;// omxVCM4P10_TransformDequantChromaDCFromPair
+ ;// Inputs : r0 = ppSrc (address of the stream pointer), r1 = pDst, r2 = QP.
+ ;// Steps : 1) zero pDst[0..3] and unpack (flag, value) pairs from *ppSrc,
+ ;// 2) 2x2 inverse transform on the four 16-bit coefficients,
+ ;// 3) scale by (pQPModTable[QP] << pQPDivTable[QP]) and halve.
+ ;// Outputs: *ppSrc is advanced past the consumed bytes, pDst[0..3] hold
+ ;// the results, r0 = OMX_Sts_NoErr.
+ ;// Write function header
+ M_START omxVCM4P10_TransformDequantChromaDCFromPair, r9
+
+ LDR pSrc, [ppSrc] ;// Load pSrc
+ VMOV dZero, #0
+ MOV cstOffset, #31 ;// To be used in the loop, to compute offset
+
+ ;//-----------------------------------------------------------------------
+ ;// Firstly, fill all the coefficient values on the <pDst> buffer by zero
+ ;//-----------------------------------------------------------------------
+
+ VST1 dZero,[pDst] ;// pDst[0] = pDst[1] = pDst[2] = pDst[3] = 0
+ LDRB Flag, [pSrc], #1 ;// Preload <Flag> before <unpackLoop>
+
+
+ ;// Flag byte layout as used below: bits 0-3 = coefficient index,
+ ;// bit 4 (0x10) = value is 16 bits (low byte first) instead of one signed
+ ;// byte, bit 5 (0x20) = this is the last pair.
+ ;// NOTE(review): the byte offset (Flag & 15) << 1 can be as large as 30,
+ ;// while only 8 bytes of pDst are cleared above - confirm that valid
+ ;// streams never carry an index above 3 here, or that pDst is large enough.
+unpackLoop
+ TST Flag, #0x10 ;// Computing (Flag & 0x10)
+ LDRSBNE Value2,[pSrc,#1] ;// High byte, sign-extended
+ LDRBNE Value, [pSrc], #2 ;// Load byte wise to avoid unaligned access
+ AND strOffset, cstOffset, Flag, LSL #1 ;// strOffset = (Flag & 15) << 1;
+ LDRSBEQ Value, [pSrc], #1 ;// Value = (OMX_S8) *pSrc++ (sign-extended byte)
+ ORRNE Value,Value,Value2, LSL #8 ;// Value = low byte | (high byte << 8)
+
+ TST Flag, #0x20 ;// Computing (Flag & 0x20) to check, if we're done
+ LDRBEQ Flag, [pSrc], #1 ;// Flag = (OMX_U8) *pSrc++, for next iteration
+ STRH Value, [pDst, strOffset] ;// Store <Value> at offset <strOffset>
+ BEQ unpackLoop ;// Branch to the loop beginning
+
+ ;//--------------------------------------------------
+ ;//InvTransformDC2x2: Inlined (Implemented in ARM V6)
+ ;//--------------------------------------------------
+
+ LDMIA pDst, {r0w0, r0w1} ;// r0w0 = |c1|c0| & r0w1 = |c3|c2|
+
+ STR pSrc, [ppSrc] ;// Update the bitstream pointer
+
+ LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
+ LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
+
+ SADDSUBX r0w0, r0w0, r0w0 ;// [ c00+c01, c00-c01 ]
+ SADDSUBX r0w1, r0w1, r0w1 ;// [ c10+c11, c10-c11 ]
+
+ LDRSB Shift, [pQPDivTable, QP] ;// Shift = pQPDivTable[QP]
+ LDRSB Scale, [pQPModTable, QP] ;// Scale = pQPModTable[QP]
+
+ SADD16 c0w0, r0w0, r0w1 ;// [ d00+d10, d01+d11 ]
+ SSUB16 c1w0, r0w0, r0w1 ;// [ d00-d10, d01-d11 ]
+
+ ;//-------------------------------------------------
+ ;//DequantChromaDC2x2: Inlined (Neon Implementation)
+ ;//-------------------------------------------------
+
+ LSL Scale, Scale, Shift ;// Scale = Scale << Shift
+ VMOV dInvTrCoeff, c0w0, c1w0 ;// Pack both transformed rows into one D register
+ VREV32 dInvTrCoeff,dInvTrCoeff ;// Swap the two halfwords inside each word
+ VDUP dScale,Scale
+
+ VMULL qDqntCoeff,dInvTrCoeff,dScale ;// 32-bit products coeff * Scale
+ VSHRN dDqntCoeff,qDqntCoeff,#1 ;// (coeff * Scale) >> 1, narrowed to 16 bits
+
+
+ VST1 dDqntCoeff,[pDst] ;// Storing all the coefficients at once
+
+ MOV return, #OMX_Sts_NoErr
+ M_END
+
+ ENDIF ;// CortexA8
+
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
new file mode 100755
index 0000000..2529959
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
@@ -0,0 +1,264 @@
+;//
+;//
+;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;//
+;//
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Import/Export symbols required from/to other files
+;// (For example tables)
+
+ IMPORT armVCM4P10_UnpackBlock4x4
+ IMPORT armVCM4P10_QPDivTable
+ IMPORT armVCM4P10_VMatrixQPModTable
+
+ M_VARIANTS CortexA8
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
+
+
+;// Guarding implementation by the processor name
+
+
+
+;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;// armVCM4P10_InvTransformDequantLumaDC4x4
+;//
+;// Works in place on the sixteen 16-bit values at pData:
+;//   1. add/subtract butterflies on the block as loaded by VLD4,
+;//   2. transpose of the 4x4 result,
+;//   3. the same butterflies again,
+;//   4. out = (value * (Scale << Shift) + 2) >> 2, narrowed to 16 bits,
+;//      with Shift = armVCM4P10_QPDivTable[QP] and
+;//      Scale = armVCM4P10_VMatrixQPModTable[QP].
+;//
+;// Inputs: r0 = pData (read, then overwritten with the result)
+;//         r1 = QP (used as a byte index into both tables)
+;// No return value is set by the code below.
+
+;//Input Registers
+pData RN 0
+QP RN 1
+
+
+;//Local Scratch Registers
+
+;// ARM Registers
+
+pQPDivTable RN 2
+pQPModTable RN 3
+Shift RN 4
+Scale RN 5
+
+;// NEON Registers
+;// D0-D3 hold the block throughout (dIn*, dRowOp*, dColOp* and dOut* are
+;// names for the same registers); D4-D7 are the butterfly temporaries for
+;// both passes.
+
+;// Packed Input pixels
+dIn0 DN D0.S16
+dIn1 DN D1.S16
+dIn2 DN D2.S16
+dIn3 DN D3.S16
+
+;// Intermediate calculations
+dRowSum1 DN D4.S16
+dRowSum2 DN D5.S16
+dRowDiff1 DN D6.S16
+dRowDiff2 DN D7.S16
+
+;// Row operated pixels
+dRowOp0 DN D0.S16
+dRowOp1 DN D1.S16
+dRowOp2 DN D2.S16
+dRowOp3 DN D3.S16
+qRowOp01 QN Q0.32
+qRowOp23 QN Q1.32
+
+;// Intermediate calculations
+dColSum1 DN D4.S16
+dColSum2 DN D5.S16
+dColDiff1 DN D6.S16
+dColDiff2 DN D7.S16
+
+;// Column operated pixels
+dColOp0 DN D0.S16
+dColOp1 DN D1.S16
+dColOp2 DN D2.S16
+dColOp3 DN D3.S16
+
+;// Temporary scratch variables
+;// dScale shares D5 with dColSum2, and qRound0 (Q3 = D6:D7) shares storage
+;// with dColDiff1/dColDiff2; both are written only after the last read of
+;// the column temporaries.
+
+dScale DN D5.S16
+qRound0 QN Q3.S32
+qRound1 QN Q4.S32
+qRound2 QN Q5.S32
+qRound3 QN Q6.S32
+
+;// InvTransformed and Dequantized pixels
+dOut0 DN D0.S16
+dOut1 DN D1.S16
+dOut2 DN D2.S16
+dOut3 DN D3.S16
+
+
+ ;// Allocate stack memory required by the function
+
+
+ ;// Write function header
+ ;// NOTE(review): "r5,d13" presumably asks M_START to preserve r4-r5 and
+ ;// d8-d13 (Q4-Q6 are used as qRound1-qRound3) - confirm in armCOMM_s.h
+ M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
+
+ ;******************************************************************
+ ;// The strategy used in implementing the transform is as follows:*
+ ;// Load the 4x4 block into 4 D-registers *
+ ;// Transpose the 4x4 matrix *
+ ;// Perform the row operations (on columns) using SIMD *
+ ;// Transpose the 4x4 result matrix *
+ ;// Perform the column operations *
+ ;******************************************************************
+
+ ;// Load all the 4x4 pixels in Transposed form
+ ;// (VLD4 de-interleaves: dIn0 receives elements 0,4,8,12, dIn1 1,5,9,13, ...)
+
+ VLD4 {dIn0,dIn1,dIn2,dIn3},[pData]
+ LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
+ LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
+
+ ;****************************************
+ ;// Row Operations (Performed on columns)
+ ;****************************************
+ ;// Scale factor calculation is done using ARM instructions
+ ;// interleaved with NEON instructions in order to dual-issue
+
+ VADD dRowSum1,dIn0,dIn1
+ VADD dRowSum2,dIn2,dIn3
+ VSUB dRowDiff1,dIn0,dIn1
+ LDRSB Shift, [pQPDivTable, QP] ;// ARM CODE: Shift = pQPDivTable[QP]
+ VSUB dRowDiff2,dIn2,dIn3
+ LDRSB Scale, [pQPModTable, QP] ;// ARM CODE: Scale = pQPModTable[QP]
+ VADD dRowOp0,dRowSum1,dRowSum2
+ VSUB dRowOp1,dRowSum1,dRowSum2
+ VSUB dRowOp2,dRowDiff1,dRowDiff2
+ LSL Scale, Scale, Shift ;// ARM CODE: Scale = Scale << Shift
+ VADD dRowOp3,dRowDiff1,dRowDiff2
+
+ ;****************************************
+ ;// Transpose the resultant matrix
+ ;****************************************
+ ;// 16-bit VTRN on the D pairs followed by a 32-bit VTRN on the Q pair
+
+ VTRN dRowOp0,dRowOp1
+ VTRN dRowOp2,dRowOp3
+ VTRN qRowOp01,qRowOp23
+
+ ;****************************************
+ ;// Column Operations
+ ;****************************************
+
+ VADD dColSum1,dRowOp0,dRowOp1
+ VADD dColSum2,dRowOp2,dRowOp3
+ VSUB dColDiff1,dRowOp0,dRowOp1
+ VSUB dColDiff2,dRowOp2,dRowOp3
+ VADD dColOp0,dColSum1,dColSum2
+ VSUB dColOp1,dColSum1,dColSum2
+ VSUB dColOp2,dColDiff1,dColDiff2
+ VADD dColOp3,dColDiff1,dColDiff2
+
+ ;//----------------------------------------------------------------------
+ ;//
+ ;// <Dequantize> improves on the c-reference code
+ ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together
+ ;// We do not subtract 2 from Shift as in C reference, instead perform a
+ ;// Scale << Shift once in the beginning and do a right shift by a
+ ;// constant 2 after the Multiplication. The value of Round would be 2
+ ;//
+ ;// By doing this we avoid the branches required and also
+ ;// reduce the code size substantially
+ ;//
+ ;//----------------------------------------------------------------------
+
+
+ VDUP dScale, Scale ;// ARM -> NEON copy 'scale' to vector (all four S16 lanes)
+
+
+ VMOV qRound0,#2 ;// Set the Round Value
+ VMOV qRound1,#2
+ VMOV qRound2,#2
+ VMOV qRound3,#2
+
+ VMLAL qRound0,dColOp0,dScale ;// pDst[i] * Scale + Round
+ VMLAL qRound1,dColOp1,dScale
+ VMLAL qRound2,dColOp2,dScale
+ VMLAL qRound3,dColOp3,dScale
+
+ VSHRN dOut0,qRound0,#2 ;// Right shift by 2 & (OMX_S16)Value
+ VSHRN dOut1,qRound1,#2
+ VSHRN dOut2,qRound2,#2
+ VSHRN dOut3,qRound3,#2
+
+ ;***************************
+ ;// Store all the 4x4 pixels
+ ;***************************
+ ;// Written back over the input block at pData
+
+ VST1 {dOut0,dOut1,dOut2,dOut3}, [pData]
+
+
+ ;// Set return value
+ ;// (nothing is set here)
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//CORTEXA8
+
+
+
+;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
+;//
+;// Calls armVCM4P10_UnpackBlock4x4 with the incoming r0 (ppSrc) and r1 (pDst)
+;// untouched, then calls armVCM4P10_InvTransformDequantLumaDC4x4 with
+;// r0 = pDst and r1 = QP. Always returns OMX_Sts_NoErr.
+
+;//Input Registers
+ppSrc RN 0
+pDst RN 1
+QPR2 RN 2
+
+;//Output Registers
+result RN 0
+
+;//Local Scratch Registers
+;// r4/r5 carry pDst and QP across the first call; r0/r1 are the argument
+;// registers for the second call.
+pDstR4 RN 4
+pDstR0 RN 0
+QPR1 RN 1
+QPR5 RN 5
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+ ;// Allocate stack memory required by the function
+
+
+ ;// Write function header
+ M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
+
+ MOV pDstR4,pDst ;// Saving register r1
+ MOV QPR5,QPR2 ;// Saving register r2
+ BL armVCM4P10_UnpackBlock4x4
+
+ MOV pDstR0,pDstR4 ;// Setting up register r0
+ MOV QPR1,QPR5 ;// Setting up register r1
+ BL armVCM4P10_InvTransformDequantLumaDC4x4
+
+
+ ;// Set return value
+ MOV result,#OMX_Sts_NoErr
+
+ ;// Write function tail
+ M_END
+
+
+ ENDIF ;//CortexA8
+
+
+ END \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S
new file mode 100644
index 0000000..aca2df4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Average_4x_Align_unsafe_s.S
@@ -0,0 +1,134 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_Average_4x4_Align0_unsafe
+ .func armVCM4P10_Average_4x4_Align0_unsafe
+/*
+ * Averages four 4-byte rows read from [r0] (row stride r1) with the four
+ * rows already stored at [r2] (row stride r3) and writes the result back
+ * to the r2 buffer, two rows per pass. r0 and r2 are post-incremented by
+ * four strides each.
+ *
+ * Per byte, UHSUB8(a, ~b) EOR 0x80 equals the rounded average
+ * (a + b + 1) >> 1.
+ *
+ * NOTE(review): r7, r10 and r11 are written but are not in the PUSH list;
+ * presumably callers of this "unsafe" routine treat them as scratch - confirm.
+ */
+armVCM4P10_Average_4x4_Align0_unsafe:
+ PUSH {r4-r6,lr}
+ LDR r7, =0x80808080 /* per-byte 0x80 used by the EORs below */
+ LDR r12,[r2,#0] /* rows 0 and 1 of the r2 buffer */
+ LDR r10,[r0],r1
+ LDR lr,[r2,r3]
+ LDR r11,[r0],r1
+ MVN r12,r12
+ MVN lr,lr
+ UHSUB8 r5,r10,r12
+ UHSUB8 r4,r11,lr
+ EOR r5,r5,r7
+ STR r5,[r2],r3
+ EOR r4,r4,r7
+ STR r4,[r2],r3
+ LDR r10,[r0],r1 /* rows 2 and 3 */
+ LDR r12,[r2,#0]
+ LDR r11,[r0],r1
+ LDR lr,[r2,r3]
+ MVN r12,r12
+ UHSUB8 r5,r10,r12
+ MVN lr,lr
+ UHSUB8 r4,r11,lr
+ EOR r5,r5,r7
+ STR r5,[r2],r3
+ EOR r4,r4,r7
+ STR r4,[r2],r3
+ POP {r4-r6,pc}
+ .endfunc
+
+ .global armVCM4P10_Average_4x4_Align2_unsafe
+ .func armVCM4P10_Average_4x4_Align2_unsafe
+/*
+ * Same averaging as the Align0 variant, but each source row is assembled
+ * from the two words at [r0] and [r0,#4] as (lo >> 16) | (hi << 16), i.e.
+ * bytes 2..5 of the eight bytes at r0 (assuming little-endian loads).
+ * The rounded average (a + b + 1) >> 1 per byte is written to the r2
+ * buffer (row stride r3); r0 advances by r1 per row.
+ *
+ * NOTE(review): presumably r0 has already been rounded down to a word
+ * boundary by the caller; r7, r10 and r11 are written but not pushed -
+ * confirm both against the caller.
+ */
+armVCM4P10_Average_4x4_Align2_unsafe:
+ PUSH {r4-r6,lr}
+ LDR r7, =0x80808080
+ LDR r4,[r0,#4] /* rows 0 and 1 */
+ LDR r10,[r0],r1
+ LDR r12,[r2,#0]
+ LDR lr,[r2,r3]
+ LDR r5,[r0,#4]
+ LDR r11,[r0],r1
+ MVN r12,r12
+ MVN lr,lr
+ LSR r10,r10,#16
+ ORR r10,r10,r4,LSL #16
+ LSR r11,r11,#16
+ ORR r11,r11,r5,LSL #16
+ UHSUB8 r5,r10,r12
+ UHSUB8 r4,r11,lr
+ EOR r5,r5,r7
+ STR r5,[r2],r3
+ EOR r4,r4,r7
+ STR r4,[r2],r3
+ LDR r4,[r0,#4] /* rows 2 and 3 */
+ LDR r10,[r0],r1
+ LDR r12,[r2,#0]
+ LDR lr,[r2,r3]
+ LDR r5,[r0,#4]
+ LDR r11,[r0],r1
+ MVN r12,r12
+ MVN lr,lr
+ LSR r10,r10,#16
+ ORR r10,r10,r4,LSL #16
+ LSR r11,r11,#16
+ ORR r11,r11,r5,LSL #16
+ UHSUB8 r5,r10,r12
+ UHSUB8 r4,r11,lr
+ EOR r5,r5,r7
+ STR r5,[r2],r3
+ EOR r4,r4,r7
+ STR r4,[r2],r3
+ POP {r4-r6,pc}
+ .endfunc
+
+ .global armVCM4P10_Average_4x4_Align3_unsafe
+ .func armVCM4P10_Average_4x4_Align3_unsafe
+/*
+ * Same averaging as the Align0 variant, but each source row is assembled
+ * from the two words at [r0] and [r0,#4] as (lo >> 24) | (hi << 8), i.e.
+ * bytes 3..6 of the eight bytes at r0 (assuming little-endian loads).
+ * The rounded average (a + b + 1) >> 1 per byte is written to the r2
+ * buffer (row stride r3); r0 advances by r1 per row.
+ *
+ * NOTE(review): presumably r0 has already been rounded down to a word
+ * boundary by the caller; r7, r10 and r11 are written but not pushed -
+ * confirm both against the caller.
+ */
+armVCM4P10_Average_4x4_Align3_unsafe:
+ PUSH {r4-r6,lr}
+ LDR r7, =0x80808080
+ LDR r4,[r0,#4] /* rows 0 and 1 */
+ LDR r10,[r0],r1
+ LDR r12,[r2,#0]
+ LDR lr,[r2,r3]
+ LDR r5,[r0,#4]
+ LDR r11,[r0],r1
+ MVN r12,r12
+ MVN lr,lr
+ LSR r10,r10,#24
+ ORR r10,r10,r4,LSL #8
+ LSR r11,r11,#24
+ ORR r11,r11,r5,LSL #8
+ UHSUB8 r5,r10,r12
+ UHSUB8 r4,r11,lr
+ EOR r5,r5,r7
+ STR r5,[r2],r3
+ EOR r4,r4,r7
+ STR r4,[r2],r3
+ LDR r4,[r0,#4] /* rows 2 and 3 */
+ LDR r10,[r0],r1
+ LDR r12,[r2,#0]
+ LDR lr,[r2,r3]
+ LDR r5,[r0,#4]
+ LDR r11,[r0],r1
+ MVN r12,r12
+ MVN lr,lr
+ LSR r10,r10,#24
+ ORR r10,r10,r4,LSL #8
+ LSR r11,r11,#24
+ ORR r11,r11,r5,LSL #8
+ UHSUB8 r5,r10,r12
+ UHSUB8 r4,r11,lr
+ EOR r5,r5,r7
+ STR r5,[r2],r3
+ EOR r4,r4,r7
+ STR r4,[r2],r3
+ POP {r4-r6,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S
new file mode 100644
index 0000000..b9ee221
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingChroma_unsafe_s.S
@@ -0,0 +1,54 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_DeblockingChromabSLT4_unsafe
+ .func armVCM4P10_DeblockingChromabSLT4_unsafe
+/*
+ * Register-to-register filter step; all NEON inputs are set up by the caller.
+ *   delta = ((d8 - d4) + ((d5 - d9) >> 2) + 1) >> 1, saturated to s8 and
+ *           clamped to [-limit, +limit]
+ *   limit = d15 + (the 4 bytes loaded from [r5], each repeated twice, with
+ *           d14 substituted wherever mask d16 is clear)
+ *   d29   = saturate_u8(d4 + delta)
+ *   d24   = saturate_u8(d8 - delta)
+ * Also advances r5 by 4 and reloads d0 / d2 with the bytes at [r2] / [r3]
+ * replicated to all lanes.
+ *
+ * NOTE(review): presumably d4/d5 = p0/p1, d8/d9 = q0/q1 and the bytes at
+ * [r5] are the tC0 thresholds - confirm against the caller.
+ */
+armVCM4P10_DeblockingChromabSLT4_unsafe:
+ VLD1.32 {d18[0]},[r5]!
+ VSUBL.U8 q11,d5,d9
+ VMOV d28,d18
+ VSUBL.U8 q10,d8,d4
+ VSHR.S16 q11,q11,#2
+ VZIP.8 d18,d28 /* d18 = each loaded byte twice */
+ VBIF d18,d14,d16
+ VRHADD.S16 q10,q11,q10
+ VADD.I8 d31,d18,d15 /* d31 = clamp limit */
+ VQMOVN.S16 d20,q10
+ VLD1.8 {d0[]},[r2]
+ VMIN.S8 d20,d20,d31
+ VNEG.S8 d31,d31
+ VLD1.8 {d2[]},[r3]
+ VMAX.S8 d20,d20,d31
+ VMOVL.U8 q14,d4
+ VMOVL.U8 q12,d8
+ VADDW.S8 q14,q14,d20
+ VSUBW.S8 q12,q12,d20
+ VQMOVUN.S16 d29,q14
+ VQMOVUN.S16 d24,q12
+ BX lr
+ .endfunc
+
+ .global armVCM4P10_DeblockingChromabSGE4_unsafe
+ .func armVCM4P10_DeblockingChromabSGE4_unsafe
+/*
+ * Register-to-register filter step; all NEON inputs are set up by the caller.
+ *   d13 = (((d4 + d9) >> 1) + d5 + 1) >> 1
+ *   d31 = (((d8 + d5) >> 1) + d9 + 1) >> 1
+ * Also advances r5 by 4 and reloads d0 / d2 with the bytes at [r2] / [r3]
+ * replicated to all lanes.
+ *
+ * NOTE(review): presumably d4/d5 = p0/p1 and d8/d9 = q0/q1 - confirm
+ * against the caller.
+ */
+armVCM4P10_DeblockingChromabSGE4_unsafe:
+ VHADD.U8 d13,d4,d9
+ VHADD.U8 d31,d8,d5
+ VLD1.8 {d0[]},[r2]
+ ADD r5,r5,#4
+ VLD1.8 {d2[]},[r3]
+ VRHADD.U8 d13,d13,d5
+ VRHADD.U8 d31,d31,d9
+ BX lr
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S
new file mode 100644
index 0000000..47f3d44
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DeblockingLuma_unsafe_s.S
@@ -0,0 +1,102 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_DeblockingLumabSLT4_unsafe
+ .func armVCM4P10_DeblockingLumabSLT4_unsafe
+/*
+ * Register-to-register filter step; all NEON inputs are set up by the caller.
+ * Two bytes are read from [r5] (r5 += 2); the first fills lanes 0-3 and the
+ * second lanes 4-7 of d18, with d14 substituted wherever mask d16 is clear.
+ *
+ *   limit = d18 + (d17 & d15) + (d12 & d15)
+ *   delta = ((d8 - d4) + ((d5 - d9) >> 2) + 1) >> 1, saturated to s8 and
+ *           clamped to [-limit, +limit]
+ *   d29 = saturate_u8(d4 + delta)              (kept as d4 where d16 is clear)
+ *   d24 = saturate_u8(d8 - delta)              (kept as d8 where d16 is clear)
+ *   d30 = (((d4 + d8 + 1) >> 1) + d6) >> 1, limited to d5 -/+ d18 (saturating)
+ *                                              (kept as d5 where d17 is clear)
+ *   d25 = (((d4 + d8 + 1) >> 1) + d10) >> 1, limited to d9 -/+ d18 (saturating)
+ *                                              (kept as d9 where d12 is clear)
+ *
+ * NOTE(review): presumably d4/d5/d6 = p0/p1/p2, d8/d9/d10 = q0/q1/q2 and
+ * d16/d17/d12 are the filter-enable masks - confirm against the caller.
+ */
+armVCM4P10_DeblockingLumabSLT4_unsafe:
+ VSUBL.U8 q11,d5,d9
+ VLD1.8 {d18[]},[r5]!
+ VSUBL.U8 q10,d8,d4
+ VLD1.8 {d19[]},[r5]!
+ VSHR.S16 q11,q11,#2
+ VEXT.8 d18,d18,d19,#4 /* first byte x4, second byte x4 */
+ VAND d19,d17,d15
+ VBIF d18,d14,d16
+ VRHADD.S16 q10,q11,q10
+ VRHADD.U8 d24,d4,d8
+ VADD.I8 d31,d18,d19
+ VAND d19,d12,d15
+ VQADD.U8 d23,d5,d18
+ VQMOVN.S16 d20,q10
+ VADD.I8 d31,d31,d19 /* d31 = clamp limit for delta */
+ VQSUB.U8 d22,d5,d18
+ VQADD.U8 d19,d9,d18
+ VHADD.U8 d26,d24,d6
+ VMIN.S8 d20,d20,d31
+ VNEG.S8 d31,d31
+ VQSUB.U8 d21,d9,d18
+ VHADD.U8 d27,d24,d10
+ VMAX.U8 d30,d26,d22
+ VMAX.S8 d20,d20,d31
+ VMOVL.U8 q14,d4
+ VMOVL.U8 q12,d8
+ VADDW.S8 q14,q14,d20
+ VSUBW.S8 q12,q12,d20
+ VQMOVUN.S16 d29,q14
+ VQMOVUN.S16 d24,q12
+ VMAX.U8 d25,d27,d21
+ VMIN.U8 d30,d30,d23
+ VMIN.U8 d25,d25,d19
+ VBIF d29,d4,d16 /* keep the inputs where the masks are clear */
+ VBIF d30,d5,d17
+ VBIF d24,d8,d16
+ VBIF d25,d9,d12
+ BX lr
+ .endfunc
+
+ .global armVCM4P10_DeblockingLumabSGE4_unsafe
+ .func armVCM4P10_DeblockingLumabSGE4_unsafe
+/*
+ * Register-to-register filter step; all NEON inputs are set up by the caller.
+ * First narrows masks d17 and d12 by AND-ing them with
+ *   ((d0 >> 2) + 2 * d15) > d13     (unsigned byte compare),
+ * then produces six filtered rows in d29, d30, d31 and d24, d25, d28.
+ * The trailing VBIFs select per lane between the filtered values and the
+ * unfiltered inputs according to masks d16, d17 and d12:
+ *   d29 <- d4 where d16 is clear     d24 <- d8  where d16 is clear
+ *   d30 <- d5 where d17 is clear     d25 <- d9  where d12 is clear
+ *   d31 <- d6 where d17 is clear     d28 <- d10 where d12 is clear
+ *
+ * NOTE(review): presumably d4..d7 = p0..p3, d8..d11 = q0..q3, d0 = alpha,
+ * d13 = |p0 - q0| and d15 = all-ones bytes - confirm against the caller.
+ */
+armVCM4P10_DeblockingLumabSGE4_unsafe:
+ VSHR.U8 d19,d0,#2
+ VADD.I8 d19,d19,d15
+ VADDL.U8 q10,d8,d4
+ VADD.I8 d19,d19,d15
+ VADDL.U8 q11,d6,d9
+ VADDW.U8 q12,q10,d5
+ VCGT.U8 d19,d19,d13 /* d19 = ((d0 >> 2) + 2*d15) > d13 */
+ VSHR.U16 q11,q11,#1
+ VHADD.U16 q11,q12,q11
+ VADDW.U8 q12,q12,d6
+ VADDL.U8 q13,d7,d6
+ VAND d17,d17,d19
+ VHADD.U8 d28,d4,d9
+ VSRA.U16 q13,q12,#1
+ VAND d12,d12,d19
+ VQRSHRN.U16 d29,q11,#1
+ VRHADD.U8 d28,d28,d5
+ VQRSHRN.U16 d30,q12,#2
+ VADDL.U8 q11,d10,d5
+ VADDW.U8 q12,q10,d9
+ VBIF d29,d28,d17
+ VQRSHRN.U16 d31,q13,#2
+ VADDL.U8 q13,d11,d10
+ VSHR.U16 q11,q11,#1
+ VHADD.U16 q11,q12,q11
+ VADDW.U8 q12,q12,d10
+ VHADD.U8 d28,d8,d5
+ VBIF d29,d4,d16
+ VBIF d30,d5,d17
+ VSRA.U16 q13,q12,#1
+ VQRSHRN.U16 d25,q12,#2
+ VQRSHRN.U16 d24,q11,#1
+ VRHADD.U8 d22,d28,d9
+ VBIF d25,d9,d12
+ VBIF d31,d6,d17
+ VBIF d24,d22,d12
+ VQRSHRN.U16 d28,q13,#2
+ VBIF d24,d8,d16
+ VBIF d28,d10,d12
+ BX lr
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S
new file mode 100644
index 0000000..e68bd8e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DecodeCoeffsToPair_s.S
@@ -0,0 +1,272 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_DecodeCoeffsToPair
+ .func armVCM4P10_DecodeCoeffsToPair
+/*
+ * Table-driven variable-length decode of one coefficient block into a
+ * compact byte stream of (flags/position, value) records.
+ *
+ * Arguments as used by the code:
+ *   r0         -> byte pointer into the input stream (updated on success)
+ *   r1         -> bit offset word (updated on success with offset & 7)
+ *   r2         -> byte that receives the decoded count (r7 >> 2 of the first
+ *                 table result)
+ *   r3         -> output byte pointer (advanced past the records written)
+ *   [sp,#0x68] =  fifth argument: word index into
+ *                 armVCM4P10_CAVLCCoeffTokenTables
+ *   [sp,#0x6c] =  sixth argument: compared with 4 (selects the 2x2 tables)
+ *                 and with 0xf
+ *   (0x68 = 40 bytes of pushed registers + 0x40 bytes of locals.)
+ * NOTE(review): presumably these are ppBitStream, pOffset, pNumCoeff,
+ * ppPosCoefbuf, nTable and sMaxNumCoeff - confirm against the C prototype.
+ *
+ * Returns r0 = 0 on success, or ~1 (0xFFFFFFFE) when a decoded value is out
+ * of range; on that error path the stream pointer and bit offset are not
+ * written back.
+ *
+ * Working registers: r10 = input byte pointer, r11 = bit accumulator,
+ * r12 = bit position, r5 = coefficient count.
+ * Stack locals: [sp,#0] saved r0, [sp,#4] saved r1, [sp,#8] saved r3,
+ * sp+0x0c = 16-bit level values, sp+0x2c = run bytes.
+ */
+armVCM4P10_DecodeCoeffsToPair:
+ PUSH {r4-r12,lr}
+ SUB sp,sp,#0x40
+ LDR r10,[r0,#0]
+ LDR r12,[r1,#0]
+ LDR r6, =armVCM4P10_CAVLCCoeffTokenTables
+ LDR r4,[sp,#0x68]
+ LDRB r9,[r10,#2]
+ LDRB r8,[r10,#1]
+ LDRB r11,[r10],#3
+ ADD r12,r12,#8
+ LDR r6,[r6,r4,LSL #2]
+ ORR r9,r9,r8,LSL #8
+ ORR r11,r9,r11,LSL #16 /* r11 = first three stream bytes, first byte highest */
+ LSLS r8,r11,r12
+ MOVS r7,#0x1e
+ AND r7,r7,r8,LSR #27
+ SUBS r12,r12,#8
+/* First table walk (table selected by the fifth argument) */
+L0x44:
+ BCC L1
+ LDRB r8,[r10],#1
+L1:
+ LDRH r7,[r6,r7]
+ ADDCC r12,r12,#8
+ ADD r12,r12,#4
+ ORRCS r11,r8,r11,LSL #8
+ LSRS r8,r7,#1
+ BCS L0x74
+ LSLS r8,r11,r12
+ SUBS r12,r12,#0xa
+ ADD r7,r7,r8,LSR #29
+ BIC r7,r7,#1
+ B L0x44
+L0x74:
+ SUB r12,r12,r7,LSR #13
+ BIC r7,r8,#0xf000
+ LSRS r5,r7,#2
+ STRB r5,[r2,#0] /* store the count through r2 */
+ BEQ L0x344 /* count == 0: nothing more to decode */
+ CMP r7,#0x44
+ BGE L0x33c /* out of range: error */
+ STR r0,[sp,#0]
+ STR r1,[sp,#4]
+ STR r3,[sp,#8]
+ ANDS r1,r7,#3 /* r1 = low two bits of the table result */
+ ADD r2,sp,#0xc /* r2 now walks the level buffer on the stack */
+ BEQ L0xd8
+ MOV r0,r1
+/* r1 values of +1 / -1, one stream bit each (1 - 2 * bit) */
+L0xac:
+ LSLS r7,r11,r12
+ SUBS r12,r12,#7
+ BCC L2
+ LDRB r8,[r10],#1
+L2:
+ ADDCC r12,r12,#8
+ LSR r7,r7,#31
+ ORRCS r11,r8,r11,LSL #8
+ SUBS r0,r0,#1
+ MOV r8,#1
+ SUB r8,r8,r7,LSL #1
+ STRH r8,[r2],#2
+ BGT L0xac
+L0xd8:
+ SUBS r0,r5,r1 /* r0 = remaining levels to decode */
+ BEQ L0x1b8
+ MOV r4,#1
+ CMP r5,#0xa
+ MOVLE r4,#0
+ CMP r1,#3
+ MOVLT r1,#4
+ MOVGE r1,#2
+ MOVGE r4,#0
+/* Remaining levels: leading-zero prefix (CLZ) plus optional suffix bits */
+L0xfc:
+ LSLS r7,r11,r12
+ CLZ r7,r7
+ ADD r12,r12,r7
+ SUBS r12,r12,#7
+ BCC L3
+ LDRB r8,[r10],#1
+ ORR r11,r8,r11,LSL #8
+ SUBS r12,r12,#8
+ BCC L3
+ LDRB r8,[r10],#1
+L3:
+ ADDCC r12,r12,#8
+ ORRCS r11,r8,r11,LSL #8
+ CMP r7,#0x10
+ BGE L0x33c /* prefix of 16 or more zeros: error */
+ MOVS lr,r4
+ TEQEQ r7,#0xe
+ MOVEQ lr,#4
+ TEQ r7,#0xf
+ MOVEQ lr,#0xc
+ TEQEQ r4,#0
+ ADDEQ r7,r7,#0xf
+ TEQ lr,#0
+ BEQ L0x184
+ LSL r3,r11,r12
+ ADD r12,r12,lr
+ SUBS r12,r12,#8
+ RSB r9,lr,#0x20
+ BCC L4
+ LDRB r8,[r10],#1
+ ORR r11,r8,r11,LSL #8
+ SUBS r12,r12,#8
+ BCC L4
+ LDRB r8,[r10],#1
+L4:
+ ADDCC r12,r12,#8
+ LSR r3,r3,r9
+ ORRCS r11,r8,r11,LSL #8
+ LSL r7,r7,r4
+ ADD r7,r3,r7
+L0x184:
+ ADD r7,r7,r1
+ MOV r1,#2
+ LSRS r8,r7,#1 /* bit 0 of r7 selects the sign */
+ RSBCS r8,r8,#0
+ STRH r8,[r2],#2
+ LDR r9, =armVCM4P10_SuffixToLevel
+ LDRSB r8,[r9,r4]
+ TEQ r4,#0
+ MOVEQ r4,#1
+ CMP r7,r8
+ ADDCS r4,r4,#1
+ SUBS r0,r0,#1
+ BGT L0xfc
+/* Total-zeros decode, skipped when the count equals the sixth argument */
+L0x1b8:
+ LDR r8,[sp,#0x6c]
+ SUB r0,r5,#1
+ SUBS r1,r8,r5
+ ADD r4,sp,#0x2c /* r4 walks the run buffer on the stack */
+ MOV lr,r5
+ SUB lr,lr,#1
+ BEQ L0x2b0
+ TEQ r8,#4
+ LDREQ r6, =(armVCM4P10_CAVLCTotalZeros2x2Tables - 4)
+ LDRNE r6, =(armVCM4P10_CAVLCTotalZeroTables - 4)
+ LDR r6,[r6,r5,LSL #2]
+ LSLS r8,r11,r12
+ MOVS r7,#0x1e
+ AND r7,r7,r8,LSR #27
+ SUBS r12,r12,#8
+L0x1f4:
+ BCC L5
+ LDRB r8,[r10],#1
+L5:
+ LDRH r7,[r6,r7]
+ ADDCC r12,r12,#8
+ ADD r12,r12,#4
+ ORRCS r11,r8,r11,LSL #8
+ LSRS r8,r7,#1
+ BCS L0x224
+ LSLS r8,r11,r12
+ SUBS r12,r12,#0xa
+ ADD r7,r7,r8,LSR #29
+ BIC r7,r7,#1
+ B L0x1f4
+L0x224:
+ SUB r12,r12,r7,LSR #13
+ BIC r7,r8,#0xf000
+ CMP r7,#0x10
+ BGE L0x33c /* out of range: error */
+ LDR r3, =(armVCM4P10_CAVLCRunBeforeTables - 4)
+ ADD r4,sp,#0x2c
+ MOVS r1,r7 /* r1 = zeros still to be distributed */
+ ADD lr,lr,r1
+ BEQ L0x2b0
+/* Run decode: one table walk per coefficient while zeros remain */
+L0x248:
+ SUBS r0,r0,#1
+ LDR r6,[r3,r1,LSL #2]
+ BLT L0x2bc
+ LSLS r8,r11,r12
+ MOVS r7,#0xe
+ AND r7,r7,r8,LSR #28
+ SUBS r12,r12,#8
+L0x264:
+ BCC L6
+ LDRB r8,[r10],#1
+L6:
+ LDRH r7,[r6,r7]
+ ADDCC r12,r12,#8
+ ADD r12,r12,#3
+ ORRCS r11,r8,r11,LSL #8
+ LSRS r8,r7,#1
+ BCS L0x294
+ LSLS r8,r11,r12
+ SUBS r12,r12,#9
+ ADD r7,r7,r8,LSR #29
+ BIC r7,r7,#1
+ B L0x264
+L0x294:
+ SUB r12,r12,r7,LSR #13
+ BIC r7,r8,#0xf000
+ CMP r7,#0xf
+ BGE L0x33c /* out of range: error */
+ SUBS r1,r1,r7
+ STRB r7,[r4],#1
+ BGT L0x248
+/* Fill the remaining run entries with the current value of r1 */
+L0x2b0:
+ SUBS r0,r0,#1
+ BLT L7
+ STRB r1,[r4],#1
+L7:
+ BGT L0x2b0
+L0x2bc:
+ STRB r1,[r4],#1
+ LDR r8,[sp,#0x6c]
+ TEQ r8,#0xf
+ ADDEQ lr,lr,#1
+ SUB r4,r4,r5 /* rewind the run pointer by the count */
+ SUB r2,r2,r5 /* rewind the level pointer by 2 * count */
+ SUB r2,r2,r5
+ LDR r3,[sp,#8]
+ LDR r0,[r3,#0] /* r0 = output byte pointer */
+ TEQ r8,#4
+ LDREQ r6, =armVCM4P10_ZigZag_2x2
+ LDRNE r6, =armVCM4P10_ZigZag_4x4
+/*
+ * Emit one record per coefficient:
+ *   byte 0 = zigzag table entry, | 0x20 on the last record,
+ *            | 0x10 when the value does not fit in a signed byte
+ *   byte 1 = low byte of the value
+ *   byte 2 = high byte of the value (only when 0x10 is set)
+ */
+L0x2ec:
+ LDRB r9,[r4],#1
+ LDRB r8,[r6,lr]
+ SUB lr,lr,#1
+ SUB lr,lr,r9
+ LDRSH r9,[r2],#2
+ SUBS r5,r5,#1
+ ORREQ r8,r8,#0x20
+ ADD r1,r9,#0x80
+ CMP r1,#0x100 /* carry set when value is outside -128..127 */
+ ORRCS r8,r8,#0x10
+ TEQ r5,#0
+ STRB r8,[r0],#1
+ STRB r9,[r0],#1
+ LSR r9,r9,#8
+ BCC L8
+ STRB r9,[r0],#1
+L8:
+ BNE L0x2ec
+ STR r0,[r3,#0] /* write back the advanced output pointer */
+ LDR r0,[sp,#0]
+ LDR r1,[sp,#4]
+ B L0x344
+/* Error exit */
+L0x33c:
+ MVN r0,#1
+ B L0x35c
+/* Success exit: write back the stream byte pointer and the bit offset */
+L0x344:
+ ADD r10,r10,r12,LSR #3
+ AND r12,r12,#7
+ SUB r10,r10,#4
+ STR r12,[r1,#0]
+ STR r10,[r0,#0]
+ MOV r0,#0
+L0x35c:
+ ADD sp,sp,#0x40
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S
new file mode 100644
index 0000000..44eb428
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_DequantTables_s.S
@@ -0,0 +1,103 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .section .rodata
+ .align 4
+
+
+ .global armVCM4P10_QPDivTable
+ .global armVCM4P10_VMatrixQPModTable
+ .global armVCM4P10_PosToVCol4x4
+ .global armVCM4P10_PosToVCol2x2
+ .global armVCM4P10_VMatrix
+ .global armVCM4P10_QPModuloTable
+ .global armVCM4P10_VMatrixU16
+
+;//-------------------------------------------------------
+;// 16 byte entries laid out 4x4, each with a value 0..2.
+;// NOTE(review): presumably maps a coefficient position to
+;// a column of armVCM4P10_VMatrix - confirm against users.
+;//-------------------------------------------------------
+
+armVCM4P10_PosToVCol4x4:
+ .byte 0, 2, 0, 2
+ .byte 2, 1, 2, 1
+ .byte 0, 2, 0, 2
+ .byte 2, 1, 2, 1
+
+;// 2x2 counterpart of the table above (4 byte entries).
+
+armVCM4P10_PosToVCol2x2:
+ .byte 0, 2
+ .byte 2, 1
+
+;// 6 rows of 3 byte entries. Column 0 of each row holds the
+;// values repeated in armVCM4P10_VMatrixQPModTable below.
+
+armVCM4P10_VMatrix:
+ .byte 10, 16, 13
+ .byte 11, 18, 14
+ .byte 13, 20, 16
+ .byte 14, 23, 18
+ .byte 16, 25, 20
+ .byte 18, 29, 23
+
+;//-------------------------------------------------------
+;// This table evaluates the expression [(INT)(QP/6)],
+;// for values of QP from 0 to 51 (inclusive).
+;// The table holds 54 entries (indices 0 to 53).
+;//-------------------------------------------------------
+
+armVCM4P10_QPDivTable:
+ .byte 0, 0, 0, 0, 0, 0
+ .byte 1, 1, 1, 1, 1, 1
+ .byte 2, 2, 2, 2, 2, 2
+ .byte 3, 3, 3, 3, 3, 3
+ .byte 4, 4, 4, 4, 4, 4
+ .byte 5, 5, 5, 5, 5, 5
+ .byte 6, 6, 6, 6, 6, 6
+ .byte 7, 7, 7, 7, 7, 7
+ .byte 8, 8, 8, 8, 8, 8
+
+;//----------------------------------------------------
+;// This table contains armVCM4P10_VMatrix[QP%6][0] entries,
+;// for values of QP from 0 to 51 (inclusive).
+;// The table holds 54 entries (indices 0 to 53).
+;//----------------------------------------------------
+
+armVCM4P10_VMatrixQPModTable:
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+ .byte 10, 11, 13, 14, 16, 18
+
+;//-------------------------------------------------------
+;// This table evaluates the modulus expression [QP%6]*6,
+;// for values of QP from 0 to 51 (inclusive).
+;// The table holds 54 entries (indices 0 to 53).
+;//-------------------------------------------------------
+
+armVCM4P10_QPModuloTable:
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+ .byte 0, 6, 12, 18, 24, 30
+
+;//-------------------------------------------------------
+;// This table contains the individual byte values of
+;// armVCM4P10_VMatrix stored as halfwords. This avoids
+;// unpacking inside the function.
+;//-------------------------------------------------------
+
+armVCM4P10_VMatrixU16:
+ .hword 10, 16, 13
+ .hword 11, 18, 14
+ .hword 13, 20, 16
+ .hword 14, 23, 18
+ .hword 16, 25, 20
+ .hword 18, 29, 23
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S
new file mode 100644
index 0000000..37bc69b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Align_unsafe_s.S
@@ -0,0 +1,123 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+ .func armVCM4P10_InterpolateLuma_HorAlign9x_unsafe
+/*
+ * Copies r9 rows (at least one) from an arbitrarily aligned source into a
+ * buffer written with word stores, 12 bytes per row.
+ *
+ *   In:  r0 = source address, r1 = source row stride,
+ *        r8 = destination buffer, r9 = row count
+ *   Out: r0 = start of the destination buffer, r1 = 12 (its row stride)
+ *
+ * r0 is rounded down to a word boundary and three words are loaded per row;
+ * the low two bits of the original r0 select a copy loop that shifts the
+ * words right by 0, 8, 16 or 24 bits. In the shifted variants the top bytes
+ * of the third output word are zero-filled.
+ *
+ * Clobbers r7, r10, r11, r12; r8 and r9 are consumed.
+ */
+armVCM4P10_InterpolateLuma_HorAlign9x_unsafe:
+ MOV r12,r8 /* remember the buffer start for the return value */
+ AND r7,r0,#3
+ BIC r0,r0,#3
+ ADD pc,pc,r7,LSL #2 /* pc reads as this instruction + 8: index into the branch table */
+ NOP
+ B Copy0toAligned
+ B Copy1toAligned
+ B Copy2toAligned
+ B Copy3toAligned
+Copy0toAligned:
+ LDM r0,{r7,r10,r11}
+ SUBS r9,r9,#1
+ ADD r0,r0,r1
+ STM r8!,{r7,r10,r11}
+ BGT Copy0toAligned
+ B CopyEnd
+Copy1toAligned:
+ LDM r0,{r7,r10,r11}
+ SUBS r9,r9,#1
+ ADD r0,r0,r1
+ LSR r7,r7,#8
+ ORR r7,r7,r10,LSL #24
+ LSR r10,r10,#8
+ ORR r10,r10,r11,LSL #24
+ LSR r11,r11,#8
+ STM r8!,{r7,r10,r11}
+ BGT Copy1toAligned
+ B CopyEnd
+Copy2toAligned:
+ LDM r0,{r7,r10,r11}
+ SUBS r9,r9,#1
+ ADD r0,r0,r1
+ LSR r7,r7,#16
+ ORR r7,r7,r10,LSL #16
+ LSR r10,r10,#16
+ ORR r10,r10,r11,LSL #16
+ LSR r11,r11,#16
+ STM r8!,{r7,r10,r11}
+ BGT Copy2toAligned
+ B CopyEnd
+Copy3toAligned:
+ LDM r0,{r7,r10,r11}
+ SUBS r9,r9,#1
+ ADD r0,r0,r1
+ LSR r7,r7,#24
+ ORR r7,r7,r10,LSL #8
+ LSR r10,r10,#24
+ ORR r10,r10,r11,LSL #8
+ LSR r11,r11,#24
+ STM r8!,{r7,r10,r11}
+ BGT Copy3toAligned
+CopyEnd:
+ MOV r0,r12
+ MOV r1,#0xc
+ BX lr
+ .endfunc
+
+ .global armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+ .func armVCM4P10_InterpolateLuma_VerAlign4x_unsafe
+/*
+ * Copies r9 rows (at least one) of 4 bytes from an arbitrarily aligned
+ * source into a packed buffer, one word per row.
+ *
+ *   In:  r0 = source address, r1 = source row stride,
+ *        r8 = destination buffer, r9 = row count
+ *   Out: r0 = r8 (after the copy) - 0x1c, r1 = 4 (buffer row stride)
+ *
+ * r0 is rounded down to a word boundary; the low two bits of the original
+ * r0 select a loop that merges the words at [r0] and [r0,#4] with the
+ * matching shift.
+ *
+ * NOTE(review): the fixed 0x1c rewind presumably assumes r9 == 9 on entry
+ * (36 bytes written), leaving r0 at the third copied row - confirm against
+ * the caller.
+ *
+ * Clobbers r7, r10; r8 and r9 are consumed.
+ */
+armVCM4P10_InterpolateLuma_VerAlign4x_unsafe:
+ AND r7,r0,#3
+ BIC r0,r0,#3
+ ADD pc,pc,r7,LSL #2 /* pc reads as this instruction + 8: index into the branch table */
+ NOP
+ B Copy0toVAligned
+ B Copy1toVAligned
+ B Copy2toVAligned
+ B Copy3toVAligned
+Copy0toVAligned:
+ LDR r7,[r0],r1
+ SUBS r9,r9,#1
+ STR r7,[r8],#4
+ BGT Copy0toVAligned
+ B CopyVEnd
+Copy1toVAligned:
+ LDR r10,[r0,#4]
+ LDR r7,[r0],r1
+ SUBS r9,r9,#1
+ LSL r10,r10,#24
+ ORR r7,r10,r7,LSR #8
+ STR r7,[r8],#4
+ BGT Copy1toVAligned
+ B CopyVEnd
+Copy2toVAligned:
+ LDR r10,[r0,#4]
+ LDR r7,[r0],r1
+ SUBS r9,r9,#1
+ LSL r10,r10,#16
+ ORR r7,r10,r7,LSR #16
+ STR r7,[r8],#4
+ BGT Copy2toVAligned
+ B CopyVEnd
+Copy3toVAligned:
+ LDR r10,[r0,#4]
+ LDR r7,[r0],r1
+ SUBS r9,r9,#1
+ LSL r10,r10,#8
+ ORR r7,r10,r7,LSR #24
+ STR r7,[r8],#4
+ BGT Copy3toVAligned
+CopyVEnd:
+ SUB r0,r8,#0x1c
+ MOV r1,#4
+ BX lr
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S
new file mode 100644
index 0000000..fe92201
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_Copy_unsafe_s.S
@@ -0,0 +1,105 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InterpolateLuma_Copy4x4_unsafe
+ .func armVCM4P10_InterpolateLuma_Copy4x4_unsafe
+/*
+ * Copies a 4x4 byte block: four rows of 4 bytes from r0 (any byte
+ * alignment, row stride r1) to r2 (row stride r3) using word stores.
+ *
+ * r0 is rounded down to a word boundary; the low two bits of the original
+ * r0 select one of four unrolled copies. The unaligned variants merge the
+ * words at [r0] and [r0,#4] with the matching shift.
+ * r0 and r2 are post-incremented by four strides each.
+ *
+ * NOTE(review): r8 and r9 are written but are not in the PUSH list;
+ * presumably callers of this "unsafe" routine treat them as scratch - confirm.
+ */
+armVCM4P10_InterpolateLuma_Copy4x4_unsafe:
+ PUSH {r4-r6,lr}
+ AND r12,r0,#3
+ BIC r0,r0,#3
+ ADD pc,pc,r12,LSL #2 /* pc reads as this instruction + 8: index into the branch table */
+ NOP
+ B Copy4x4Align0
+ B Copy4x4Align1
+ B Copy4x4Align2
+ B Copy4x4Align3
+Copy4x4Align0:
+ LDR r4,[r0],r1
+ LDR r5,[r0],r1
+ STR r4,[r2],r3
+ LDR r8,[r0],r1
+ STR r5,[r2],r3
+ LDR r9,[r0],r1
+ STR r8,[r2],r3
+ STR r9,[r2],r3
+ B Copy4x4End
+Copy4x4Align1:
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ LDR r9,[r0,#4]
+ LDR r8,[r0],r1
+ LSR r4,r4,#8
+ ORR r4,r4,r5,LSL #24
+ STR r4,[r2],r3
+ LSR r8,r8,#8
+ ORR r8,r8,r9,LSL #24
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ STR r8,[r2],r3
+ LDR r9,[r0,#4]
+ LDR r8,[r0],r1
+ LSR r4,r4,#8
+ ORR r4,r4,r5,LSL #24
+ STR r4,[r2],r3
+ LSR r8,r8,#8
+ ORR r8,r8,r9,LSL #24
+ STR r8,[r2],r3
+ B Copy4x4End
+Copy4x4Align2:
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ LDR r9,[r0,#4]
+ LDR r8,[r0],r1
+ LSR r4,r4,#16
+ ORR r4,r4,r5,LSL #16
+ STR r4,[r2],r3
+ LSR r8,r8,#16
+ ORR r8,r8,r9,LSL #16
+ STR r8,[r2],r3
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ LDR r9,[r0,#4]
+ LDR r8,[r0],r1
+ LSR r4,r4,#16
+ ORR r4,r4,r5,LSL #16
+ STR r4,[r2],r3
+ LSR r8,r8,#16
+ ORR r8,r8,r9,LSL #16
+ STR r8,[r2],r3
+ B Copy4x4End
+Copy4x4Align3:
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ LDR r9,[r0,#4]
+ LDR r8,[r0],r1
+ LSR r4,r4,#24
+ ORR r4,r4,r5,LSL #8
+ STR r4,[r2],r3
+ LSR r8,r8,#24
+ ORR r8,r8,r9,LSL #8
+ STR r8,[r2],r3
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ LDR r9,[r0,#4]
+ LDR r8,[r0],r1
+ LSR r4,r4,#24
+ ORR r4,r4,r5,LSL #8
+ STR r4,[r2],r3
+ LSR r8,r8,#24
+ ORR r8,r8,r9,LSL #8
+ STR r8,[r2],r3
+Copy4x4End:
+ POP {r4-r6,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S
new file mode 100644
index 0000000..544abe8
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_DiagCopy_unsafe_s.S
@@ -0,0 +1,107 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe
+ .func armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe
+/*
+ * Converts four rows of eight 16-bit values at [r0] (row stride r1) to
+ * bytes and stores 8 bytes per row at [r7].
+ *
+ * Each halfword h is processed as:
+ *   UQSUB16 by 0x0fe0, USAT16 to 13 bits (0..8191), LSR #5, low byte kept.
+ * Within each pair of input words the results are stored in the order
+ * halfword 0, 2, 1, 3 (assuming little-endian stores).
+ *
+ *   In:  r0 = source, r1 = source row stride, r7 = destination
+ *   Out: r0 = start of the 32 bytes written (r7 - 0x20), r1 = 8
+ *
+ * NOTE(review): r7, r10 and r11 are written but are not in the PUSH list;
+ * presumably callers of this "unsafe" routine expect that - confirm.
+ */
+armVCM4P10_InterpolateLuma_HorDiagCopy_unsafe:
+ PUSH {r4-r6,lr}
+ MOV lr,#4 /* row counter */
+ LDR r6, =0xfe00fe0
+ LDR r12, =0xff00ff
+LoopStart1:
+ LDR r11,[r0,#0xc]
+ LDR r10,[r0,#8]
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ UQSUB16 r11,r11,r6
+ UQSUB16 r10,r10,r6
+ UQSUB16 r5,r5,r6
+ UQSUB16 r4,r4,r6
+ USAT16 r11,#13,r11
+ USAT16 r10,#13,r10
+ USAT16 r5,#13,r5
+ USAT16 r4,#13,r4
+ AND r11,r12,r11,LSR #5
+ AND r10,r12,r10,LSR #5
+ AND r5,r12,r5,LSR #5
+ AND r4,r12,r4,LSR #5
+ ORR r11,r10,r11,LSL #8 /* pack words 2 and 3 into 4 bytes */
+ ORR r10,r4,r5,LSL #8 /* pack words 0 and 1 into 4 bytes */
+ SUBS lr,lr,#1
+ STRD r10,r11,[r7],#8
+ BGT LoopStart1
+ SUB r0,r7,#0x20
+ MOV r1,#8
+ POP {r4-r6,pc}
+ .endfunc
+
+ .global armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe
+ .func armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe
+/*
+ * Converts four rows of eight 16-bit values at [r0] (row stride r1) to
+ * bytes, two rows per loop pass, and stores them at [r7].
+ *
+ * Each halfword h is processed as:
+ *   UQSUB16 by 0x0fe0, USAT16 to 13 bits (0..8191), LSR #5, low byte kept.
+ * The 8 bytes of a row are then split with PKHBT/PKHTB into two words that
+ * are stored 8 bytes apart; within one pass the first row goes to offsets
+ * 0 and 8, the second row to offsets 4 and 12, and r7 advances by 16.
+ *
+ *   In:  r0 = source, r1 = source row stride, r7 = destination
+ *   Out: r0 = r7 (after the copy) - 0x18, r1 = 4
+ *
+ * NOTE(review): r7, r10 and r11 are written but are not in the PUSH list;
+ * presumably callers of this "unsafe" routine expect that - confirm.
+ */
+armVCM4P10_InterpolateLuma_VerDiagCopy_unsafe:
+ PUSH {r4-r6,lr}
+ LDR r6, =0xfe00fe0
+ LDR r12, =0xff00ff
+ MOV lr,#2 /* two passes of two rows each */
+LoopStart:
+ LDR r11,[r0,#0xc]
+ LDR r10,[r0,#8]
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ UQSUB16 r11,r11,r6
+ UQSUB16 r10,r10,r6
+ UQSUB16 r5,r5,r6
+ UQSUB16 r4,r4,r6
+ USAT16 r11,#13,r11
+ USAT16 r10,#13,r10
+ USAT16 r5,#13,r5
+ USAT16 r4,#13,r4
+ AND r11,r12,r11,LSR #5
+ AND r10,r12,r10,LSR #5
+ AND r5,r12,r5,LSR #5
+ AND r4,r12,r4,LSR #5
+ ORR r11,r10,r11,LSL #8
+ ORR r10,r4,r5,LSL #8
+ PKHBT r4,r10,r11,LSL #16 /* low halves of r10 and r11 */
+ STR r4,[r7],#8
+ PKHTB r5,r11,r10,ASR #16 /* high halves of r10 and r11 */
+ STR r5,[r7],#-4 /* step back so the next row lands at offset 4 */
+ LDR r11,[r0,#0xc]
+ LDR r10,[r0,#8]
+ LDR r5,[r0,#4]
+ LDR r4,[r0],r1
+ UQSUB16 r11,r11,r6
+ UQSUB16 r10,r10,r6
+ UQSUB16 r5,r5,r6
+ UQSUB16 r4,r4,r6
+ USAT16 r11,#13,r11
+ USAT16 r10,#13,r10
+ USAT16 r5,#13,r5
+ USAT16 r4,#13,r4
+ AND r11,r12,r11,LSR #5
+ AND r10,r12,r10,LSR #5
+ AND r5,r12,r5,LSR #5
+ AND r4,r12,r4,LSR #5
+ ORR r11,r10,r11,LSL #8
+ ORR r10,r4,r5,LSL #8
+ PKHBT r4,r10,r11,LSL #16
+ SUBS lr,lr,#1
+ STR r4,[r7],#8
+ PKHTB r5,r11,r10,ASR #16
+ STR r5,[r7],#4
+ BGT LoopStart
+ SUB r0,r7,#0x18
+ MOV r1,#4
+ POP {r4-r6,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S
new file mode 100644
index 0000000..a330972
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.S
@@ -0,0 +1,164 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ .func armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+/*
+ * Two-pass 6-tap filter (taps 1, -5, 20, 20, -5, 1) producing a 4x4 block.
+ *
+ *   In:  r0 = source, r1 = source row stride (r0 advances by 9 rows)
+ *   Out: the low 4 bytes of d0, d2, d4 and d6 hold output rows 0..3
+ *
+ * Pass 1 (horizontal), repeated for 9 source rows of 16 loaded bytes:
+ *   t[i] = (x[i] + x[i+5]) + 20 * (x[i+2] + x[i+3]) - 5 * (x[i+1] + x[i+4])
+ *   for i = 0..3, kept as 16-bit values in d10, d12, ..., d26 (rows 0..8).
+ * Pass 2 (vertical), in 32-bit precision, for output rows r = 0..3:
+ *   (t[r] + t[r+5]) + 20 * (t[r+2] + t[r+3]) - 5 * (t[r+1] + t[r+4])
+ *   then a rounding right shift by 10 with unsigned saturation to 16 bits
+ *   and a saturating narrow to 8 bits.
+ *
+ * NOTE(review): d8-d15 are overwritten here without being saved; presumably
+ * the caller of this "unsafe" routine preserves them - confirm.
+ */
+armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe:
+ PUSH {r4-r12,lr}
+ VLD1.8 {d0,d1},[r0],r1 /* source row 0 */
+ VMOV.I16 d31,#0x14 /* tap 20 */
+ VMOV.I16 d30,#0x5 /* tap 5 */
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q5,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 1 */
+ VMLA.I16 d10,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q6,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 2 */
+ VSUB.I16 d10,d10,d8
+ VMLA.I16 d12,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q7,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 3 */
+ VSUB.I16 d12,d12,d8
+ VMLA.I16 d14,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q8,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 4 */
+ VSUB.I16 d14,d14,d8
+ VMLA.I16 d16,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q9,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 5 */
+ VSUB.I16 d16,d16,d8
+ VMLA.I16 d18,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q10,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 6 */
+ VSUB.I16 d18,d18,d8
+ VMLA.I16 d20,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q11,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 7 */
+ VSUB.I16 d20,d20,d8
+ VMLA.I16 d22,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q12,d0,d1
+ VLD1.8 {d0,d1},[r0],r1 /* source row 8 */
+ VSUB.I16 d22,d22,d8
+ VMLA.I16 d24,d2,d31
+ VMUL.I16 d8,d4,d30
+ VEXT.8 d4,d0,d1,#1
+ VEXT.8 d2,d0,d1,#2
+ VEXT.8 d3,d0,d1,#3
+ VEXT.8 d5,d0,d1,#4
+ VEXT.8 d1,d0,d1,#5
+ VADDL.U8 q1,d2,d3
+ VADDL.U8 q2,d4,d5
+ VADDL.U8 q13,d0,d1
+ VSUB.I16 d24,d24,d8
+ VMLA.I16 d26,d2,d31
+ VMUL.I16 d8,d4,d30
+ VMOV.I32 q15,#0x14 /* taps reloaded as 32-bit lanes for pass 2 */
+ VMOV.I32 q14,#0x5
+ VADDL.S16 q5,d10,d20 /* output row 0 */
+ VADDL.S16 q1,d14,d16
+ VADDL.S16 q0,d12,d18
+ VSUB.I16 d26,d26,d8 /* completes pass 1 for source row 8 */
+ VMLA.I32 q5,q1,q15
+ VMUL.I32 q4,q0,q14
+ VADDL.S16 q6,d12,d22 /* output row 1 */
+ VADDL.S16 q1,d16,d18
+ VADDL.S16 q0,d14,d20
+ VMLA.I32 q6,q1,q15
+ VSUB.I32 q5,q5,q4
+ VMUL.I32 q4,q0,q14
+ VADDL.S16 q2,d14,d24 /* output row 2 */
+ VADDL.S16 q1,d18,d20
+ VADDL.S16 q0,d16,d22
+ VMLA.I32 q2,q1,q15
+ VSUB.I32 q6,q6,q4
+ VMUL.I32 q4,q0,q14
+ VADDL.S16 q3,d16,d26 /* output row 3 */
+ VADDL.S16 q1,d20,d22
+ VADDL.S16 q0,d18,d24
+ VMLA.I32 q3,q1,q15
+ VSUB.I32 q2,q2,q4
+ VMLS.I32 q3,q0,q14
+ VQRSHRUN.S32 d0,q5,#10
+ VQRSHRUN.S32 d2,q6,#10
+ VQRSHRUN.S32 d4,q2,#10
+ VQRSHRUN.S32 d6,q3,#10
+ VQMOVN.U16 d0,q0 /* results are the low 4 bytes of each D register */
+ VQMOVN.U16 d2,q1
+ VQMOVN.U16 d4,q2
+ VQMOVN.U16 d6,q3
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S
new file mode 100644
index 0000000..991c33f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.S
@@ -0,0 +1,119 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+ .func armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe: /* 4x4 diagonal half-sample filter: vertical 6-tap (1,-5,20,20,-5,1) first, then horizontal 6-tap on the 16-bit sums. In: r0 = top-left of a 9-row x 16-byte window, r1 = row step. Out: 8-bit rows in d0,d2,d4,d6 (first 4 bytes of each are filtered results). Advances r0 by 5 rows, clobbers r12 and d0-d31. */
+ PUSH {r4-r12,lr}
+ VLD1.8 {d0,d1},[r0],r1 /* row 0 (16 bytes) */
+ ADD r12,r0,r1,LSL #2 /* r12 -> row 5 (r0 is already at row 1) */
+ VMOV.I8 d30,#0x5 /* 8-bit tap weights for the vertical pass */
+ VMOV.I8 d31,#0x14
+ VLD1.8 {d10,d11},[r12],r1 /* row 5 */
+ VLD1.8 {d2,d3},[r0],r1 /* row 1 */
+ VLD1.8 {d12,d13},[r12],r1 /* row 6 */
+ VADDL.U8 q9,d0,d10 /* q9 = out row 0, columns 0-7: row0 + row5 */
+ VLD1.8 {d4,d5},[r0],r1 /* row 2 */
+ VADDL.U8 q0,d1,d11 /* q0 = out row 0, columns 8-15 (reuses d0/d1) */
+ VLD1.8 {d6,d7},[r0],r1 /* row 3 */
+ VADDL.U8 q10,d2,d12 /* q10 = out row 1, columns 0-7: row1 + row6 */
+ VLD1.8 {d8,d9},[r0],r1 /* row 4 */
+ VMLAL.U8 q9,d4,d31
+ VLD1.8 {d14,d15},[r12],r1 /* row 7 */
+ VMLAL.U8 q0,d5,d31
+ VLD1.8 {d16,d17},[r12],r1 /* row 8 */
+ VMLAL.U8 q9,d6,d31
+ VMLAL.U8 q10,d6,d31
+ VMLSL.U8 q0,d3,d30
+ VADDL.U8 q11,d4,d14 /* q11 = out row 2, columns 0-7: row2 + row7 */
+ VMLSL.U8 q9,d2,d30
+ VADDL.U8 q1,d3,d13 /* q1 = out row 1, columns 8-15 (reuses d2/d3) */
+ VMLAL.U8 q0,d7,d31
+ VMLAL.U8 q10,d8,d31
+ VMLSL.U8 q9,d8,d30
+ VMLAL.U8 q1,d7,d31
+ VMLSL.U8 q0,d9,d30
+ VMLAL.U8 q11,d8,d31
+ VMLSL.U8 q10,d4,d30
+ VMLSL.U8 q1,d5,d30
+ VADDL.U8 q2,d5,d15 /* q2 = out row 2, columns 8-15 (reuses d4/d5) */
+ VMLAL.U8 q11,d10,d31
+ VMLSL.U8 q10,d10,d30
+ VMLAL.U8 q1,d9,d31
+ VMLAL.U8 q2,d9,d31
+ VADDL.U8 q12,d6,d16 /* q12 = out row 3, columns 0-7: row3 + row8 */
+ VMLSL.U8 q11,d6,d30
+ VMLSL.U8 q1,d11,d30
+ VMLSL.U8 q2,d7,d30
+ VADDL.U8 q3,d7,d17 /* q3 = out row 3, columns 8-15 (reuses d6/d7) */
+ VMLAL.U8 q12,d10,d31
+ VMLSL.U8 q11,d12,d30
+ VMLSL.U8 q2,d13,d30
+ VMLAL.U8 q3,d11,d31
+ VMLAL.U8 q12,d12,d31
+ VEXT.8 d26,d18,d19,#2 /* horizontal pass, row 0: 16-bit elements 1..4 */
+ VMLAL.U8 q2,d11,d31
+ VMLAL.U8 q3,d13,d31
+ VMLSL.U8 q12,d8,d30
+ VEXT.8 d27,d18,d19,#4 /* elements 2..5 */
+ VMOV.I16 d31,#0x14 /* last 8-bit use of d31 is above; reload as 16-bit weight 20 */
+ VMLSL.U8 q3,d9,d30
+ VMLSL.U8 q12,d14,d30
+ VEXT.8 d29,d19,d0,#2 /* elements 5..8 (crosses into the columns 8-15 half) */
+ VEXT.8 d28,d18,d19,#6 /* elements 3..6 */
+ VMLSL.U8 q3,d15,d30
+ VADDL.S16 q0,d18,d29 /* 32-bit accumulators: outer taps (elements 0..3 + 5..8) */
+ VADD.I16 d27,d27,d28 /* centre pair, weighted by 20 below */
+ VMOV.I16 d30,#0x5 /* last 8-bit use of d30 is above; reload as 16-bit weight 5 */
+ VADD.I16 d26,d26,d19 /* inner pair (elements 1..4 + 4..7), weighted by 5 below */
+ VMLAL.S16 q0,d27,d31
+ VEXT.8 d27,d20,d21,#4 /* row 1: same tap extraction from q10/q1 */
+ VEXT.8 d28,d20,d21,#6
+ VEXT.8 d29,d21,d2,#2
+ VMLSL.S16 q0,d26,d30
+ VEXT.8 d26,d20,d21,#2
+ VADDL.S16 q1,d20,d29
+ VADD.I16 d27,d27,d28
+ VADD.I16 d26,d26,d21
+ VEXT.8 d28,d22,d23,#6 /* row 2: from q11/q2 */
+ VMLAL.S16 q1,d27,d31
+ VEXT.8 d29,d23,d4,#2
+ VEXT.8 d27,d22,d23,#4
+ VEXT.8 d8,d22,d23,#2
+ VADDL.S16 q2,d22,d29
+ VMLSL.S16 q1,d26,d30
+ VADD.I16 d27,d27,d28
+ VADD.I16 d26,d8,d23
+ VEXT.8 d28,d24,d25,#6 /* row 3: from q12/q3 */
+ VMLAL.S16 q2,d27,d31
+ VEXT.8 d27,d24,d25,#4
+ VEXT.8 d29,d25,d6,#2
+ VADD.I16 d27,d27,d28
+ VEXT.8 d8,d24,d25,#2
+ VADDL.S16 q3,d24,d29
+ VMLSL.S16 q2,d26,d30
+ VMLAL.S16 q3,d27,d31
+ VADD.I16 d8,d8,d25
+ VMLSL.S16 q3,d8,d30
+ VQRSHRUN.S32 d0,q0,#10 /* round, shift right by 10, saturate to unsigned 16-bit */
+ VQRSHRUN.S32 d2,q1,#10
+ VQRSHRUN.S32 d4,q2,#10
+ VQRSHRUN.S32 d6,q3,#10
+ VQMOVN.U16 d0,q0 /* narrow to 8-bit; only the first 4 bytes come from the filtered d0 */
+ VQMOVN.U16 d2,q1
+ VQMOVN.U16 d4,q2
+ VQMOVN.U16 d6,q3
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S
new file mode 100644
index 0000000..40e141b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.S
@@ -0,0 +1,72 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ .func armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe: /* 4x4 horizontal half-sample 6-tap filter. In: r0 = source (caller passes src - 2), r1 = row step, d31/d30 = 16-bit weights preset by the caller (omxVCM4P10_InterpolateLuma sets them to 20 and 5). Out: rows in d22,d24,d26,d28 (first 4 bytes valid); source bytes at offsets 2 and 3 stay in d14/d15, d16/d17, d18/d19, d20/d21 for the caller's VRHADD averaging. Advances r0 by 4 rows. */
+ PUSH {r4-r12,lr}
+ VLD1.8 {d22,d23},[r0],r1 /* row 0: 16 source bytes */
+ VEXT.8 d10,d22,d23,#5 /* tap at offset 5 */
+ VEXT.8 d12,d22,d23,#1 /* tap at offset 1 */
+ VEXT.8 d14,d22,d23,#2 /* tap at offset 2 (kept for the caller) */
+ VEXT.8 d15,d22,d23,#3 /* tap at offset 3 (kept for the caller) */
+ VEXT.8 d13,d22,d23,#4 /* tap at offset 4 */
+ VADDL.U8 q11,d22,d10 /* outer pair: offsets 0 + 5 (overwrites the raw row in d22/d23) */
+ VADDL.U8 q4,d14,d15 /* centre pair: offsets 2 + 3 */
+ VADDL.U8 q6,d12,d13 /* inner pair: offsets 1 + 4 */
+ VLD1.8 {d24,d25},[r0],r1 /* row 1 */
+ VMLA.I16 d22,d8,d31 /* += centre * d31; only the low 4 lanes are filtered */
+ VMUL.I16 d8,d12,d30 /* inner * d30, subtracted a few instructions later */
+ VEXT.8 d10,d24,d25,#5
+ VEXT.8 d12,d24,d25,#1
+ VEXT.8 d16,d24,d25,#2
+ VEXT.8 d17,d24,d25,#3
+ VEXT.8 d13,d24,d25,#4
+ VADDL.U8 q12,d24,d10
+ VSUB.I16 d22,d22,d8 /* row 0 sum complete */
+ VADDL.U8 q4,d16,d17
+ VADDL.U8 q6,d12,d13
+ VLD1.8 {d26,d27},[r0],r1 /* row 2 */
+ VMLA.I16 d24,d8,d31
+ VMUL.I16 d8,d12,d30
+ VEXT.8 d10,d26,d27,#5
+ VEXT.8 d12,d26,d27,#1
+ VEXT.8 d18,d26,d27,#2
+ VEXT.8 d19,d26,d27,#3
+ VEXT.8 d13,d26,d27,#4
+ VADDL.U8 q13,d26,d10
+ VSUB.I16 d24,d24,d8 /* row 1 sum complete */
+ VADDL.U8 q4,d18,d19
+ VADDL.U8 q6,d12,d13
+ VLD1.8 {d28,d29},[r0],r1 /* row 3 */
+ VMLA.I16 d26,d8,d31
+ VMUL.I16 d8,d12,d30
+ VEXT.8 d10,d28,d29,#5
+ VEXT.8 d12,d28,d29,#1
+ VEXT.8 d20,d28,d29,#2
+ VEXT.8 d21,d28,d29,#3
+ VEXT.8 d13,d28,d29,#4
+ VADDL.U8 q14,d28,d10
+ VSUB.I16 d26,d26,d8 /* row 2 sum complete */
+ VADDL.U8 q4,d20,d21
+ VADDL.U8 q6,d12,d13
+ VMLA.I16 d28,d8,d31
+ VMLS.I16 d28,d12,d30 /* row 3 sum complete (multiply-subtract in one step) */
+ VQRSHRUN.S16 d22,q11,#5 /* round, shift right by 5, saturate to unsigned 8-bit */
+ VQRSHRUN.S16 d24,q12,#5
+ VQRSHRUN.S16 d26,q13,#5
+ VQRSHRUN.S16 d28,q14,#5
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S
new file mode 100644
index 0000000..955846f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.S
@@ -0,0 +1,58 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ .func armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe: /* 4x4 vertical half-sample 6-tap filter over 9 source rows. In: r0 = first row (caller passes src - 2*step), r1 = row step, d31/d30 = 16-bit weights preset by the caller (omxVCM4P10_InterpolateLuma sets them to 20 and 5). Out: rows in d0,d2,d4,d6 (first 4 bytes valid); source rows 2..5 stay in d9..d12 for the caller's averaging. Advances r0 by 5 rows, clobbers r12. */
+ PUSH {r4-r12,lr}
+ VLD1.8 {d7},[r0],r1 /* row 0 */
+ ADD r12,r0,r1,LSL #2 /* r12 -> row 5 (r0 is already at row 1) */
+ VLD1.8 {d8},[r0],r1 /* row 1 */
+ VLD1.8 {d12},[r12],r1 /* row 5 */
+ VLD1.8 {d9},[r0],r1 /* row 2 */
+ VADDL.U8 q0,d7,d12 /* out row 0: row0 + row5 */
+ VLD1.8 {d10},[r0],r1 /* row 3 */
+ VLD1.8 {d13},[r12],r1 /* row 6 */
+ VLD1.8 {d11},[r0],r1 /* row 4 */
+ VLD1.8 {d14},[r12],r1 /* row 7 */
+ VADDL.U8 q8,d8,d11 /* row1 + row4 */
+ VADDL.U8 q9,d9,d10 /* row2 + row3 */
+ VLD1.8 {d15},[r12],r1 /* row 8 */
+ VMLS.I16 d0,d16,d30 /* -= (row1+row4) * d30 */
+ VMUL.I16 d20,d18,d31 /* (row2+row3) * d31, added to d0 near the end */
+ VADDL.U8 q8,d9,d12 /* out row 1: row2 + row5 */
+ VADDL.U8 q9,d10,d11 /* row3 + row4 */
+ VADDL.U8 q1,d8,d13 /* row1 + row6 */
+ VMLS.I16 d2,d16,d30
+ VMUL.I16 d21,d18,d31
+ VADDL.U8 q8,d10,d13 /* out row 2: row3 + row6 */
+ VADDL.U8 q9,d11,d12 /* row4 + row5 */
+ VADDL.U8 q2,d9,d14 /* row2 + row7 */
+ VMLS.I16 d4,d16,d30
+ VMUL.I16 d22,d18,d31
+ VADDL.U8 q8,d11,d14 /* out row 3: row4 + row7 */
+ VADDL.U8 q3,d10,d15 /* row3 + row8 */
+ VADDL.U8 q9,d12,d13 /* row5 + row6 */
+ VMLS.I16 d6,d16,d30
+ VADD.I16 d0,d0,d20 /* fold in the deferred centre-pair products */
+ VADD.I16 d2,d2,d21
+ VADD.I16 d4,d4,d22
+ VMLA.I16 d6,d18,d31
+ VQRSHRUN.S16 d0,q0,#5 /* round, shift right by 5, saturate to unsigned 8-bit */
+ VQRSHRUN.S16 d2,q1,#5
+ VQRSHRUN.S16 d4,q2,#5
+ VQRSHRUN.S16 d6,q3,#5
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S
new file mode 100644
index 0000000..66520da
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_Interpolate_Chroma_s.S
@@ -0,0 +1,175 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+
+ .section .rodata
+ .align 4
+
+armVCM4P10_WidthBranchTableMVIsNotZero: /* jump table used when the two fractional offsets do not sum to zero; armVCM4P10_Interpolate_Chroma indexes it with byte offset r4*2, so r4 = 2, 4, 8 select the words at offsets 4, 8, 16 */
+ .word WidthIs2MVIsNotZero, WidthIs2MVIsNotZero /* offsets 0, 4 */
+ .word WidthIs4MVIsNotZero, WidthIs4MVIsNotZero /* offsets 8, 12 */
+ .word WidthIs8MVIsNotZero /* offset 16 */
+
+armVCM4P10_WidthBranchTableMVIsZero: /* same layout, used when the two fractional offsets sum to zero (plain copy paths) */
+ .word WidthIs2MVIsZero, WidthIs2MVIsZero
+ .word WidthIs4MVIsZero, WidthIs4MVIsZero
+ .word WidthIs8MVIsZero
+
+ .text
+
+ .global armVCM4P10_Interpolate_Chroma
+ .func armVCM4P10_Interpolate_Chroma
+armVCM4P10_Interpolate_Chroma: /* Bilinear (2x2 weighted) interpolation with eighth-sample weights. In: r0 = source, r1 = source step, r2 = destination, r3 = destination step; four stack args: table index / width selector (2, 4 or 8), row count, then the two fractional offsets (presumably width, height, dx, dy - confirm against the C prototype). Returns 0 in r0. */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d15}
+ LDRD r6,r7,[sp,#0x70] /* 3rd/4th stack args: fractional offsets (40-byte PUSH + 64-byte VPUSH = 0x68 to the first stack arg) */
+ LDRD r4,r5,[sp,#0x68] /* 1st/2nd stack args: width selector and row counter */
+ RSB r8,r6,#8 /* r8 = 8 - r6 */
+ RSB r9,r7,#8 /* r9 = 8 - r7 */
+ CMN r6,r7 /* EQ when r6 + r7 == 0 */
+ MOV r10,#1
+ LDREQ r11, =armVCM4P10_WidthBranchTableMVIsZero
+ SUB lr,r1,r10 /* lr = step - 1: after a +1 load, moves to the start of the next row */
+ LDRNE r11, =armVCM4P10_WidthBranchTableMVIsNotZero
+ VLD1.8 {d0},[r0],r10 /* d0 = row 0 at x */
+ SMULBB r12,r8,r9 /* weight (8-r6)*(8-r7) */
+ SMULBB r9,r6,r9 /* weight r6*(8-r7) */
+ VLD1.8 {d1},[r0],lr /* d1 = row 0 at x+1; r0 now points to row 1 */
+ SMULBB r8,r8,r7 /* weight (8-r6)*r7 */
+ SMULBB r6,r6,r7 /* weight r6*r7 */
+ VDUP.8 d12,r12 /* weight for (x, y) */
+ VDUP.8 d13,r9 /* weight for (x+1, y) */
+ VDUP.8 d14,r8 /* weight for (x, y+1) */
+ VDUP.8 d15,r6 /* weight for (x+1, y+1) */
+ LDR pc,[r11,r4,LSL #1] /* dispatch through the selected table at byte offset r4*2 */
+
+WidthIs8MVIsNotZero: /* 8 bytes wide, 4 output rows per iteration */
+ VLD1.8 {d2},[r0],r10
+ VMULL.U8 q2,d0,d12
+ VLD1.8 {d3},[r0],lr
+ VMULL.U8 q3,d2,d12
+ VLD1.8 {d16},[r0],r10
+ VMLAL.U8 q2,d1,d13
+ VLD1.8 {d17},[r0],lr
+ VMULL.U8 q11,d16,d12
+ VMLAL.U8 q3,d3,d13
+ VLD1.8 {d18},[r0],r10
+ VMLAL.U8 q2,d2,d14
+ VMLAL.U8 q11,d17,d13
+ VMULL.U8 q12,d18,d12
+ VLD1.8 {d19},[r0],lr
+ VMLAL.U8 q3,d16,d14
+ VLD1.8 {d0},[r0],r10 /* row below this group; doubles as the top row of the next iteration */
+ VMLAL.U8 q12,d19,d13
+ VMLAL.U8 q11,d18,d14
+ VMLAL.U8 q2,d3,d15
+ VLD1.8 {d1},[r0],lr
+ VMLAL.U8 q12,d0,d14
+ VMLAL.U8 q3,d17,d15
+ VMLAL.U8 q11,d19,d15
+ SUBS r5,r5,#4 /* four rows done */
+ VMLAL.U8 q12,d1,d15
+ VQRSHRN.U16 d8,q2,#6 /* round, shift right by 6, saturate to 8-bit */
+ VQRSHRN.U16 d9,q3,#6
+ VQRSHRN.U16 d20,q11,#6
+ VST1.64 {d8},[r2],r3
+ VQRSHRN.U16 d21,q12,#6
+ VST1.64 {d9},[r2],r3
+ VST1.64 {d20},[r2],r3
+ VST1.64 {d21},[r2],r3
+ BGT WidthIs8MVIsNotZero
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+
+WidthIs4MVIsNotZero: /* 4 bytes stored per row, 2 output rows per iteration */
+ VLD1.8 {d2},[r0],r10
+ VMULL.U8 q2,d0,d12
+ VMULL.U8 q3,d2,d12
+ VLD1.8 {d3},[r0],lr
+ VMLAL.U8 q2,d1,d13
+ VMLAL.U8 q3,d3,d13
+ VLD1.8 {d0},[r0],r10 /* next row; reused as the top row of the next iteration */
+ VMLAL.U8 q2,d2,d14
+ VMLAL.U8 q3,d0,d14
+ VLD1.8 {d1},[r0],lr
+ SUBS r5,r5,#2
+ VMLAL.U8 q3,d1,d15
+ VMLAL.U8 q2,d3,d15
+ VQRSHRN.U16 d9,q3,#6
+ VQRSHRN.U16 d8,q2,#6
+ VST1.32 {d8[0]},[r2],r3 /* only the first 32-bit lane is written */
+ VST1.32 {d9[0]},[r2],r3
+ BGT WidthIs4MVIsNotZero
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+
+WidthIs2MVIsNotZero: /* same arithmetic as the width-4 path, but stores 2 bytes per row */
+ VLD1.8 {d2},[r0],r10
+ VMULL.U8 q2,d0,d12
+ VMULL.U8 q3,d2,d12
+ VLD1.8 {d3},[r0],lr
+ VMLAL.U8 q2,d1,d13
+ VMLAL.U8 q3,d3,d13
+ VLD1.8 {d0},[r0],r10
+ VMLAL.U8 q2,d2,d14
+ VMLAL.U8 q3,d0,d14
+ VLD1.8 {d1},[r0],lr
+ SUBS r5,r5,#2
+ VMLAL.U8 q3,d1,d15
+ VMLAL.U8 q2,d3,d15
+ VQRSHRN.U16 d9,q3,#6
+ VQRSHRN.U16 d8,q2,#6
+ VST1.16 {d8[0]},[r2],r3 /* only the first 16-bit lane is written */
+ VST1.16 {d9[0]},[r2],r3
+ BGT WidthIs2MVIsNotZero
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+
+WidthIs8MVIsZero: /* straight copy, 8 bytes wide */
+ SUB r0,r0,r1 /* undo the one-row advance made by the prologue loads */
+WidthIs8LoopMVIsZero:
+ VLD1.8 {d0},[r0],r1
+ SUBS r5,r5,#2
+ VLD1.8 {d1},[r0],r1
+ VST1.64 {d0},[r2],r3
+ VST1.64 {d1},[r2],r3
+ BGT WidthIs8LoopMVIsZero
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+
+WidthIs4MVIsZero: /* straight copy, 4 bytes per row; d0 already holds row 0 from the prologue */
+ VLD1.8 {d1},[r0],r1
+ SUBS r5,r5,#2
+ VST1.32 {d0[0]},[r2],r3
+ VLD1.8 {d0},[r0],r1 /* preload for the next iteration; the final preload is never stored */
+ VST1.32 {d1[0]},[r2],r3
+ BGT WidthIs4MVIsZero
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+
+WidthIs2MVIsZero: /* straight copy, 2 bytes per row; d0 already holds row 0 from the prologue */
+ VLD1.8 {d1},[r0],r1
+ SUBS r5,r5,#2
+ VST1.16 {d0[0]},[r2],r3
+ VLD1.8 {d0},[r0],r1 /* preload for the next iteration; the final preload is never stored */
+ VST1.16 {d1[0]},[r2],r3
+ BGT WidthIs2MVIsZero
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S
new file mode 100644
index 0000000..f5d6d1f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_QuantTables_s.S
@@ -0,0 +1,68 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .section .rodata
+ .align 4
+
+ .global armVCM4P10_MFMatrixQPModTable
+ .global armVCM4P10_QPDivIntraTable
+ .global armVCM4P10_QPDivPlusOneTable
+
+;//------------------------------------------------------------------
+;// This table contains (1 << QbitsPlusOne) / 3 values (intra case),
+;// for values of iQP from 0 to 51 (inclusive).
+;//------------------------------------------------------------------
+
+
+armVCM4P10_QPDivIntraTable: /* NOTE(review): only 8 rows x 6 = 48 words are defined (iQP 0..47), while the header says 0..51 and the sibling tables have 9 rows - confirm no caller indexes 48..51 */
+ .word 21845, 21845, 21845, 21845, 21845, 21845 /* (1 << 16) / 3, truncated */
+ .word 43690, 43690, 43690, 43690, 43690, 43690 /* (1 << 17) / 3 */
+ .word 87381, 87381, 87381, 87381, 87381, 87381 /* (1 << 18) / 3 */
+ .word 174762, 174762, 174762, 174762, 174762, 174762 /* (1 << 19) / 3 */
+ .word 349525, 349525, 349525, 349525, 349525, 349525 /* (1 << 20) / 3 */
+ .word 699050, 699050, 699050, 699050, 699050, 699050 /* (1 << 21) / 3 */
+ .word 1398101, 1398101, 1398101, 1398101, 1398101, 1398101 /* (1 << 22) / 3 */
+ .word 2796202, 2796202, 2796202, 2796202, 2796202, 2796202 /* (1 << 23) / 3 */
+
+
+;//--------------------------------------------------------------
+;// This table contains armVCM4P10_MFMatrix [iQP % 6][0] entries,
+;// for values of iQP from 0 to 51 (inclusive).
+;//--------------------------------------------------------------
+
+armVCM4P10_MFMatrixQPModTable: /* 9 rows x 6 halfwords: the same six values repeated, one row per group of six iQP values */
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+ .hword 13107, 11916, 10082, 9362, 8192, 7282
+
+;//---------------------------------------------------------------
+;// This table contains ARM_M4P10_Q_OFFSET + 1 + (iQP / 6) values,
+;// for values of iQP from 0 to 51 (inclusive).
+;//---------------------------------------------------------------
+
+armVCM4P10_QPDivPlusOneTable: /* 9 rows x 6 bytes: the value starts at 16 and rises by one per group of six iQP values */
+ .byte 16, 16, 16, 16, 16, 16
+ .byte 17, 17, 17, 17, 17, 17
+ .byte 18, 18, 18, 18, 18, 18
+ .byte 19, 19, 19, 19, 19, 19
+ .byte 20, 20, 20, 20, 20, 20
+ .byte 21, 21, 21, 21, 21, 21
+ .byte 22, 22, 22, 22, 22, 22
+ .byte 23, 23, 23, 23, 23, 23
+ .byte 24, 24, 24, 24, 24, 24
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S
new file mode 100644
index 0000000..c24d717
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_TransformResidual4x4_s.S
@@ -0,0 +1,52 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_TransformResidual4x4
+ .func armVCM4P10_TransformResidual4x4
+armVCM4P10_TransformResidual4x4: /* 4x4 inverse residual transform on 16-bit coefficients: two identical butterfly passes with a transpose between them, then a rounding shift right by 6. In: r1 = 16 source halfwords, r0 = 16 destination halfwords. Preserves d8. */
+ VPUSH {d8}
+ VLD4.16 {d0,d1,d2,d3},[r1] /* de-interleaving load: dN receives every 4th halfword starting at element N */
+ VMOV.I16 d4,#0 /* zero operand so VHADD below acts as an arithmetic halving */
+ VADD.I16 d5,d0,d2 /* pass 1: e0 = x0 + x2 */
+ VSUB.I16 d6,d0,d2 /* e1 = x0 - x2 */
+ VHADD.S16 d7,d1,d4 /* x1 >> 1 */
+ VHADD.S16 d8,d3,d4 /* x3 >> 1 */
+ VSUB.I16 d7,d7,d3 /* e2 = (x1 >> 1) - x3 */
+ VADD.I16 d8,d1,d8 /* e3 = x1 + (x3 >> 1) */
+ VADD.I16 d0,d5,d8 /* e0 + e3 */
+ VADD.I16 d1,d6,d7 /* e1 + e2 */
+ VSUB.I16 d2,d6,d7 /* e1 - e2 */
+ VSUB.I16 d3,d5,d8 /* e0 - e3 */
+ VTRN.16 d0,d1 /* 4x4 transpose of 16-bit elements */
+ VTRN.16 d2,d3
+ VTRN.32 q0,q1
+ VADD.I16 d5,d0,d2 /* pass 2: same butterfly along the other dimension */
+ VSUB.I16 d6,d0,d2
+ VHADD.S16 d7,d1,d4
+ VHADD.S16 d8,d3,d4
+ VSUB.I16 d7,d7,d3
+ VADD.I16 d8,d1,d8
+ VADD.I16 d0,d5,d8
+ VADD.I16 d1,d6,d7
+ VSUB.I16 d2,d6,d7
+ VSUB.I16 d3,d5,d8
+ VRSHR.S16 d0,d0,#6 /* (x + 32) >> 6 */
+ VRSHR.S16 d1,d1,#6
+ VRSHR.S16 d2,d2,#6
+ VRSHR.S16 d3,d3,#6
+ VST1.16 {d0,d1,d2,d3},[r0]
+ VPOP {d8}
+ BX lr
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S
new file mode 100644
index 0000000..c552f8d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/armVCM4P10_UnpackBlock4x4_s.S
@@ -0,0 +1,40 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_UnpackBlock4x4
+ .func armVCM4P10_UnpackBlock4x4
+armVCM4P10_UnpackBlock4x4: /* Expands a packed (flag byte, value) stream into a zero-filled block of 16 halfwords. In: r0 = address of the stream pointer (updated on return), r1 = destination (32 bytes). Flag byte: bits 0-3 = destination index, bit 4 = value is 16-bit, bit 5 = last entry. */
+ PUSH {r4-r8,lr}
+ LDR r2,[r0,#0] /* r2 = current stream position */
+ MOV r7,#0x1f /* mask for (index * 2) */
+ MOV r4,#0
+ MOV r5,#0
+ LDRB r3,[r2],#1 /* first flag byte */
+ STRD r4,r5,[r1,#0] /* clear all 32 destination bytes */
+ STRD r4,r5,[r1,#8]
+ STRD r4,r5,[r1,#0x10]
+ STRD r4,r5,[r1,#0x18]
+unpackLoop:
+ TST r3,#0x10 /* NE -> 16-bit value follows, EQ -> 8-bit value */
+ LDRNESB r5,[r2,#1] /* 16-bit case: sign-extended high byte */
+ LDRNEB r4,[r2],#2 /* 16-bit case: low byte, consume two bytes */
+ AND r6,r7,r3,LSL #1 /* r6 = (flag & 0xf) * 2 = byte offset into the destination */
+ LDREQSB r4,[r2],#1 /* 8-bit case: sign-extended value, consume one byte */
+ ORRNE r4,r4,r5,LSL #8 /* 16-bit case: combine high and low bytes */
+ TST r3,#0x20 /* EQ -> more entries follow */
+ LDREQB r3,[r2],#1 /* fetch the next flag byte only if this was not the last entry */
+ STRH r4,[r1,r6]
+ BEQ unpackLoop
+ STR r2,[r0,#0] /* write back the advanced stream pointer */
+ POP {r4-r8,pc}
+ .endfunc
+ .end
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S
new file mode 100644
index 0000000..ba61059
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DeblockLuma_I.S
@@ -0,0 +1,67 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_DeblockLuma_I
+ .func omxVCM4P10_DeblockLuma_I
+omxVCM4P10_DeblockLuma_I: /* Validates its six arguments, then runs the vertical-edge luma filter followed by the horizontal-edge one. In: r0 = pixel pointer, r1 = step, r2/r3 = two per-edge parameter tables (presumably alpha and beta), stack args 1 and 2 = two more tables (presumably thresholds and bS). Returns -5 on a failed check, otherwise the status of the last filter called. */
+ PUSH {r4-r9,lr}
+ MOVS r6,r0 /* keep arg 0 and set Z if it is NULL */
+ SUB sp,sp,#0xc /* room for the callee's two stack args (plus 4 bytes; total frame is 40 bytes) */
+ MOV r9,r1
+ MOV r7,r2
+ MOV r8,r3
+ LDR r4,[sp,#0x28] /* 1st stack arg (28 bytes pushed + 12 reserved = 0x28) */
+ LDR r5,[sp,#0x2c] /* 2nd stack arg */
+ BEQ L0x58 /* arg 0 is NULL */
+ TST r6,#7
+ TSTEQ r9,#7
+ BNE L0x58 /* pointer or step is not a multiple of 8 */
+ CMP r7,#0
+ CMPNE r8,#0
+ CMPNE r4,#0
+ BEQ L0x58 /* r2, r3 or the 1st stack arg is NULL */
+ TST r4,#3
+ BNE L0x58 /* 1st stack arg is not 4-byte aligned */
+ CMP r5,#0
+ BEQ L0x58 /* 2nd stack arg is NULL */
+ TST r5,#3
+ BEQ L0x64 /* all checks passed */
+L0x58:
+ MVN r0,#4 /* r0 = -5 (presumably OMX_Sts_BadArgErr - confirm against omxtypes.h) */
+L0x5c:
+ ADD sp,sp,#0xc
+ POP {r4-r9,pc}
+L0x64:
+ STR r4,[sp,#0] /* forward both stack args unchanged */
+ MOV r3,r8
+ STR r5,[sp,#4]
+ MOV r2,r7
+ MOV r1,r9
+ MOV r0,r6
+ BL omxVCM4P10_FilterDeblockingLuma_VerEdge_I
+ CMP r0,#0
+ BNE L0x5c /* propagate a non-zero status */
+ ADD r3,r5,#0x10 /* second call uses the upper halves of the tables: +16 bytes for the stack tables */
+ ADD r2,r4,#0x10
+ STR r3,[sp,#4]
+ STR r2,[sp,#0]
+ ADD r3,r8,#2 /* and +2 bytes for the r2/r3 tables */
+ ADD r2,r7,#2
+ MOV r1,r9
+ MOV r0,r6
+ BL omxVCM4P10_FilterDeblockingLuma_HorEdge_I
+ ADD sp,sp,#0xc
+ POP {r4-r9,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S
new file mode 100644
index 0000000..be21ee7
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.S
@@ -0,0 +1,119 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_DequantTransformResidualFromPairAndAdd
+ .func omxVCM4P10_DequantTransformResidualFromPairAndAdd
+omxVCM4P10_DequantTransformResidualFromPairAndAdd: /* Unpacks a 4x4 coefficient block, dequantises it, applies the 4x4 inverse transform, adds the prediction and stores saturated 8-bit pixels. In: r0 = address of packed-stream pointer, r1 = prediction, r2 = optional DC value pointer, r3 = destination; stack args: prediction step, destination step, QP, AC flag. Returns 0. */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d9}
+ SUB sp,sp,#0x20 /* 32-byte scratch block for the unpacked coefficients */
+ ADD r4,sp,#0
+ LDR r5,[sp,#0x64] /* 4th stack arg (40 + 16 + 32 = 0x58 to the first stack arg) */
+ MOV r7,r1
+ MOV r8,r2
+ MOV r9,r3
+ CMP r5,#0
+ BEQ L0x114 /* flag is zero: DC-only path, nothing is unpacked */
+ MOV r1,r4
+ BL armVCM4P10_UnpackBlock4x4 ;// r0 = stream pointer address (passed through), r1 = scratch block
+ LDR r1,[sp,#0x60] /* 3rd stack arg: QP, used as table index */
+ LDR r11, =armVCM4P10_QPModuloTable
+ LDR r10, =armVCM4P10_QPDivTable
+ LDR r2, =armVCM4P10_VMatrixU16
+ LDRSB r12,[r11,r1] /* byte offset into VMatrixU16 for this QP */
+ LDRSB lr,[r10,r1] /* left-shift amount for this QP */
+ LDR r10, =0x3020504 /* VTBL byte indices selecting halfwords 2 and 1 */
+ LDR r1, =0x5040100 /* VTBL byte indices selecting halfwords 0 and 2 */
+ ADD r2,r2,r12
+ VDUP.32 d7,r1
+ VDUP.32 d9,r10
+ VDUP.16 d5,lr
+ VLD1.8 {d6},[r2] /* 8 bytes of scale factors for this QP */
+ VTBL.8 d8,{d6},d7 /* scale pattern for rows 0 and 2 */
+ VTBL.8 d4,{d6},d9 /* scale pattern for rows 1 and 3 */
+ CMP r8,#0 /* flags stay live until VMOVNE below (NEON data ops do not alter them) */
+ VLD1.16 {d0,d1,d2,d3},[r4] /* unpacked coefficients, one row per d register */
+ VSHL.U16 d8,d8,d5
+ VSHL.U16 d4,d4,d5
+ BEQ L1
+ LDRSH r10,[r8,#0] /* externally supplied DC value */
+L1:
+ VMUL.I16 d0,d0,d8 /* dequantise */
+ VMUL.I16 d1,d1,d4
+ VMUL.I16 d2,d2,d8
+ VMUL.I16 d3,d3,d4
+ VMOVNE.16 d0[0],r10 /* if a DC pointer was given, it replaces coefficient 0 after scaling */
+ VTRN.16 d0,d1 /* 4x4 transpose */
+ VTRN.16 d2,d3
+ VTRN.32 q0,q1
+ VMOV.I16 d4,#0 /* zero operand so VHADD acts as an arithmetic halving */
+ VADD.I16 d5,d0,d2 /* first butterfly pass */
+ VSUB.I16 d6,d0,d2
+ VHADD.S16 d7,d1,d4
+ VHADD.S16 d8,d3,d4
+ VSUB.I16 d7,d7,d3
+ VADD.I16 d8,d1,d8
+ VADD.I16 d0,d5,d8
+ VADD.I16 d1,d6,d7
+ VSUB.I16 d2,d6,d7
+ VSUB.I16 d3,d5,d8
+ VTRN.16 d0,d1 /* transpose back */
+ VTRN.16 d2,d3
+ VTRN.32 q0,q1
+ VADD.I16 d5,d0,d2 /* second butterfly pass */
+ VSUB.I16 d6,d0,d2
+ VHADD.S16 d7,d1,d4
+ VHADD.S16 d8,d3,d4
+ VSUB.I16 d7,d7,d3
+ VADD.I16 d8,d1,d8
+ VADD.I16 d0,d5,d8
+ VADD.I16 d1,d6,d7
+ VSUB.I16 d2,d6,d7
+ VSUB.I16 d3,d5,d8
+ VRSHR.S16 d0,d0,#6 /* (x + 32) >> 6 */
+ VRSHR.S16 d1,d1,#6
+ VRSHR.S16 d2,d2,#6
+ VRSHR.S16 d3,d3,#6
+ B L0x130
+L0x114: /* DC-only path: r8 is dereferenced without a NULL check here */
+ LDRSH r10,[r8,#0]
+ ADD r10,r10,#0x20
+ ASR r10,r10,#6 /* (dc + 32) >> 6 */
+ VDUP.16 d0,r10 /* the same residual for all 16 positions */
+ VDUP.16 d1,r10
+ VDUP.16 d2,r10
+ VDUP.16 d3,r10
+L0x130:
+ LDR r1,[sp,#0x58] /* 1st stack arg: prediction step */
+ LDR r10,[sp,#0x5c] /* 2nd stack arg: destination step */
+ LDR r3,[r7],r1 /* four prediction rows, 4 bytes each */
+ LDR r5,[r7],r1
+ VMOV d4,r3,r5
+ LDR r3,[r7],r1
+ LDR r5,[r7,#0]
+ VMOV d5,r3,r5
+ VADDW.U8 q3,q0,d4 /* residual + prediction, rows 0-1 */
+ VADDW.U8 q4,q1,d5 /* rows 2-3 */
+ VQMOVUN.S16 d0,q3 /* saturate to unsigned 8-bit */
+ VQMOVUN.S16 d1,q4
+ VST1.32 {d0[0]},[r9],r10
+ VST1.32 {d0[1]},[r9],r10
+ VST1.32 {d1[0]},[r9],r10
+ VST1.32 {d1[1]},[r9]
+ MOV r0,#0
+ ADD sp,sp,#0x20
+ VPOP {d8-d9}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S
new file mode 100644
index 0000000..79ba538
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_HorEdge_I_s.S
@@ -0,0 +1,87 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_FilterDeblockingChroma_HorEdge_I
+ .func omxVCM4P10_FilterDeblockingChroma_HorEdge_I
+omxVCM4P10_FilterDeblockingChroma_HorEdge_I: /* Filters two horizontal edges, 8 pixels wide each and 4 rows apart. In: r0 = pixel pointer at the first edge row, r1 = row step, r2/r3 = per-edge byte tables compared against pixel differences (presumably alpha and beta), stack arg 1 -> r5 (presumably thresholds, consumed by the helpers), stack arg 2 -> r4 = bS bytes. Returns 0. */
+ PUSH {r4-r10,lr}
+ VPUSH {d8-d15}
+ VLD1.8 {d0[]},[r2]! /* d0 = first r2 table byte in every lane; r2 now points to the second */
+ SUB r0,r0,r1,LSL #1
+ SUB r0,r0,r1 /* r0 = three rows above the edge */
+ VLD1.8 {d2[]},[r3]! /* d2 = first r3 table byte in every lane */
+ LDR r4,[sp,#0x64] /* 2nd stack arg (32-byte PUSH + 64-byte VPUSH = 0x60 to the first) */
+ LDR r5,[sp,#0x60] /* 1st stack arg */
+ LDR r9, =0x3030303 /* mask: any of the four bS bytes has bit 0 or 1 set */
+ LDR r8, =0x4040404 /* mask: any of the four bS bytes has bit 2 set */
+ VMOV.I8 d14,#0
+ VMOV.I8 d15,#0x1
+ VMOV.I16 d1,#0x4
+ MOV r7,#0x40000000 /* loop counter: two doublings reach zero, so two edges */
+L0x38:
+ LDR r6,[r4],#8 /* four bS bytes for this edge; the next four are skipped */
+ VLD1.8 {d6},[r0],r1 /* p2 (3 rows above the edge) */
+ VLD1.8 {d5},[r0],r1 /* p1 */
+ CMP r6,#0
+ VLD1.8 {d4},[r0],r1 /* p0 */
+ VLD1.8 {d8},[r0],r1 /* q0 (edge row) */
+ VABD.U8 d19,d6,d4 /* |p2 - p0| */
+ VLD1.8 {d9},[r0],r1 /* q1 */
+ VABD.U8 d13,d4,d8 /* |p0 - q0| */
+ VLD1.8 {d10},[r0],r1 /* q2 */
+ BEQ L0xe4 /* all four bS are zero: nothing to filter on this edge */
+ VABD.U8 d12,d5,d4 /* |p1 - p0| */
+ VABD.U8 d18,d9,d8 /* |q1 - q0| */
+ VCGT.U8 d16,d0,d13 /* d0 > |p0 - q0| */
+ VMOV.32 d26[0],r6
+ VMAX.U8 d12,d18,d12
+ VMOVL.U8 q13,d26 /* widen the four bS bytes to halfwords (one per pixel pair) */
+ VABD.U8 d17,d10,d8 /* |q2 - q0| */
+ VCGT.S16 d27,d26,#0 /* lanes with bS > 0 */
+ VCGT.U8 d12,d2,d12 /* d2 > max(|p1 - p0|, |q1 - q0|) */
+ VCGT.U8 d19,d2,d19
+ VAND d16,d16,d27
+ TST r6,r9
+ VCGT.U8 d17,d2,d17
+ VAND d16,d16,d12 /* d16 = per-pixel filter-enable mask */
+ VAND d12,d16,d17
+ VAND d17,d16,d19
+ BLNE armVCM4P10_DeblockingChromabSLT4_unsafe /* called when some bS has bit 0 or 1 set; presumably produces d29/d24 */
+ TST r6,r8
+ SUB r0,r0,r1,LSL #2 /* back to the p0 row */
+ VTST.16 d26,d26,d1 /* lanes whose bS has bit 2 set */
+ BLNE armVCM4P10_DeblockingChromabSGE4_unsafe /* called when some bS has bit 2 set; presumably produces d13/d31 */
+ VBIT d29,d13,d26 /* take d13/d31 in the lanes flagged by d26 */
+ VBIT d24,d31,d26
+ VBIF d29,d4,d16 /* keep the original p0/q0 where the enable mask is clear */
+ VBIF d24,d8,d16
+ VST1.8 {d29},[r0],r1 /* new p0 row */
+ ADDS r7,r7,r7
+ VST1.8 {d24},[r0],r1 /* new q0 row; r0 is now 3 rows above the next edge */
+ BNE L0x38 /* NOTE(review): d0/d2 are not reloaded on this path in this function; presumably the helpers refresh them - confirm */
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r10,pc}
+L0xe4: /* skip path for an all-zero bS word */
+ VLD1.8 {d0[]},[r2] /* switch to the second table entries for the next edge */
+ SUB r0,r0,r1,LSL #1 /* 3 rows above the next edge */
+ ADDS r7,r7,r7
+ VLD1.8 {d2[]},[r3]
+ ADD r5,r5,#4 /* skip this edge's four r5 table bytes */
+ BNE L0x38
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r10,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S
new file mode 100644
index 0000000..dcdddbe
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.S
@@ -0,0 +1,123 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_FilterDeblockingChroma_VerEdge_I
+ .func omxVCM4P10_FilterDeblockingChroma_VerEdge_I
+omxVCM4P10_FilterDeblockingChroma_VerEdge_I: /* Filters two vertical edges, 8 rows tall each and 4 columns apart, by transposing an 8x8 pixel tile so the shared filter helpers can work on rows. In: r0 = pixel pointer at the first edge column, r1 = row step, r2/r3 = per-edge byte tables (presumably alpha and beta), stack arg 1 -> r5 (presumably thresholds), stack arg 2 -> r4 = bS bytes. Returns 0. */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d15}
+ VLD1.8 {d0[]},[r2]! /* d0 = first r2 table byte in every lane */
+ SUB r0,r0,#4 /* start 4 columns left of the edge */
+ VLD1.8 {d2[]},[r3]! /* d2 = first r3 table byte in every lane */
+ LDR r4,[sp,#0x6c] /* 2nd stack arg (40-byte PUSH + 64-byte VPUSH = 0x68 to the first) */
+ LDR r5,[sp,#0x68] /* 1st stack arg */
+ LDR r8, =0x4040404 /* mask: any of the four bS bytes has bit 2 set */
+ LDR r9, =0x3030303 /* mask: any of the four bS bytes has bit 0 or 1 set */
+ VMOV.I8 d14,#0
+ VMOV.I8 d15,#0x1
+ VMOV.I16 d1,#0x4
+ MOV r7,#0x40000000 /* loop counter: two doublings reach zero, so two edges */
+L0x34:
+ LDR r6,[r4],#8 /* four bS bytes for this edge; the next four are skipped */
+ ADD r10,r0,r1 /* r0 walks even rows, r10 walks odd rows */
+ ADD lr,r1,r1
+ VLD1.8 {d7},[r0],lr
+ VLD1.8 {d8},[r10],lr
+ VLD1.8 {d5},[r0],lr
+ VLD1.8 {d10},[r10],lr
+ VLD1.8 {d6},[r0],lr
+ VLD1.8 {d9},[r10],lr
+ VLD1.8 {d4},[r0],lr
+ VLD1.8 {d11},[r10],lr
+ VZIP.8 d7,d8 /* 8x8 byte transpose: afterwards each d register holds one pixel column */
+ VZIP.8 d5,d10
+ VZIP.8 d6,d9
+ VZIP.8 d4,d11
+ VZIP.16 d7,d5
+ VZIP.16 d8,d10
+ VZIP.16 d6,d4
+ VZIP.16 d9,d11
+ VTRN.32 d7,d6
+ VTRN.32 d5,d4
+ VTRN.32 d10,d11
+ VTRN.32 d8,d9
+ CMP r6,#0
+ VABD.U8 d19,d6,d4 /* |p2 - p0| (d4 = p0, d8 = q0, as stored back below) */
+ VABD.U8 d13,d4,d8 /* |p0 - q0| */
+ BEQ L0x170 /* all four bS are zero: nothing to filter on this edge */
+ VABD.U8 d12,d5,d4 /* |p1 - p0| */
+ VABD.U8 d18,d9,d8 /* |q1 - q0| */
+ VMOV.32 d26[0],r6
+ VCGT.U8 d16,d0,d13 /* d0 > |p0 - q0| */
+ VMAX.U8 d12,d18,d12
+ VMOVL.U8 q13,d26 /* widen the four bS bytes to halfwords (one per pixel pair) */
+ VABD.U8 d17,d10,d8 /* |q2 - q0| */
+ VCGT.S16 d27,d26,#0 /* lanes with bS > 0 */
+ VCGT.U8 d12,d2,d12
+ VCGT.U8 d19,d2,d19
+ VAND d16,d16,d27
+ TST r6,r9
+ VCGT.U8 d17,d2,d17
+ VAND d16,d16,d12 /* d16 = per-pixel filter-enable mask */
+ VAND d12,d16,d17
+ VAND d17,d16,d19
+ BLNE armVCM4P10_DeblockingChromabSLT4_unsafe /* called when some bS has bit 0 or 1 set; presumably produces d29/d24 */
+ TST r6,r8
+ SUB r0,r0,r1,LSL #3 /* back to the first of the 8 rows */
+ VTST.16 d26,d26,d1 /* lanes whose bS has bit 2 set */
+ BLNE armVCM4P10_DeblockingChromabSGE4_unsafe /* called when some bS has bit 2 set; presumably produces d13/d31 */
+ VBIT d29,d13,d26 /* take d13/d31 in the lanes flagged by d26 */
+ VBIT d24,d31,d26
+ ADD r10,r0,#3 /* column of p0 (byte offset 3 in the tile) */
+ VBIF d29,d4,d16 /* keep the original p0/q0 where the enable mask is clear */
+ ADD r12,r10,r1
+ ADD lr,r1,r1
+ VBIF d24,d8,d16
+ ADDS r7,r7,r7
+ VST1.8 {d29[0]},[r10],lr /* scatter the new p0 column one byte per row */
+ VST1.8 {d29[1]},[r12],lr
+ VST1.8 {d29[2]},[r10],lr
+ VST1.8 {d29[3]},[r12],lr
+ VST1.8 {d29[4]},[r10],lr
+ VST1.8 {d29[5]},[r12],lr
+ VST1.8 {d29[6]},[r10],lr
+ VST1.8 {d29[7]},[r12],lr
+ ADD r12,r0,#4 /* column of q0 (byte offset 4 in the tile) */
+ ADD r10,r12,r1
+ VST1.8 {d24[0]},[r12],lr /* scatter the new q0 column */
+ VST1.8 {d24[1]},[r10],lr
+ VST1.8 {d24[2]},[r12],lr
+ VST1.8 {d24[3]},[r10],lr
+ VST1.8 {d24[4]},[r12],lr
+ VST1.8 {d24[5]},[r10],lr
+ VST1.8 {d24[6]},[r12],lr
+ VST1.8 {d24[7]},[r10],lr
+ ADD r0,r0,#4 /* next edge is 4 columns to the right */
+ BNE L0x34 /* uses Z from the ADDS above (vector stores and ADD leave flags alone) */
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+L0x170: /* skip path for an all-zero bS word */
+ VLD1.8 {d0[]},[r2] /* switch to the second table entries for the next edge */
+ ADD r0,r0,#4
+ SUB r0,r0,r1,LSL #3 /* undo the 8-row advance made by the loads */
+ ADDS r7,r7,r7
+ VLD1.8 {d2[]},[r3]
+ ADD r5,r5,#4 /* skip this edge's four r5 table bytes */
+ BNE L0x34
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S
new file mode 100644
index 0000000..9755899
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.S
@@ -0,0 +1,107 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_FilterDeblockingLuma_HorEdge_I
+ .func omxVCM4P10_FilterDeblockingLuma_HorEdge_I
+omxVCM4P10_FilterDeblockingLuma_HorEdge_I: /* Filters four horizontal edges of a 16-pixel-wide block, 4 rows apart, processing each edge as two 8-pixel halves. In: r0 = pixel pointer at the first edge row, r1 = row step, r2/r3 = two-entry byte tables (presumably alpha and beta; entry 0 for the first edge, entry 1 for the rest), stack arg 1 -> r5 (presumably thresholds), stack arg 2 -> r4 = bS bytes. Returns 0. */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d15}
+ ADD r7,r2,#1 /* address of the second r2 table entry */
+ ADD r8,r3,#1 /* address of the second r3 table entry */
+ VLD1.8 {d0[]},[r2] /* first-edge values in every lane */
+ SUB r0,r0,r1,LSL #2 /* r0 = four rows above the edge */
+ VLD1.8 {d2[]},[r3]
+ LDR r4,[sp,#0x6c] /* 2nd stack arg (40-byte PUSH + 64-byte VPUSH = 0x68 to the first) */
+ LDR r5,[sp,#0x68] /* 1st stack arg */
+ MOV r11,#0
+ VMOV.I8 d14,#0
+ VMOV.I8 d15,#0x1
+ ADD r10,r1,r1 /* two-row stride */
+ MOV r9,#0x55000000 /* loop counter: carry out on every 2nd doubling (end of an edge), zero after the 8th */
+L0x38:
+ LDRH r12,[r4],#2 /* two bS bytes, one per group of 4 pixels */
+ ADD r6,r0,r1 /* r0 walks rows -4,-2,0,2; r6 walks rows -3,-1,1 */
+ CMP r12,#0
+ BEQ L0xe4 /* both bS are zero: skip this 8-pixel half */
+ VLD1.8 {d7},[r0],r10 /* p3 */
+ VLD1.8 {d6},[r6],r10 /* p2 */
+ VLD1.8 {d5},[r0],r10 /* p1 */
+ VLD1.8 {d4},[r6],r10 /* p0 */
+ VLD1.8 {d8},[r0],r10 /* q0 */
+ VABD.U8 d12,d4,d5 /* |p0 - p1| */
+ VLD1.8 {d9},[r6] /* q1 */
+ VABD.U8 d13,d8,d4 /* |q0 - p0| */
+ VLD1.8 {d10},[r0],r1 /* q2 */
+ VABD.U8 d18,d9,d8 /* |q1 - q0| */
+ VABD.U8 d19,d6,d4 /* |p2 - p0| */
+ VCGT.U8 d16,d0,d13 /* d0 > |p0 - q0| */
+ TST r12,#0xff
+ VMAX.U8 d12,d18,d12
+ VABD.U8 d17,d10,d8 /* |q2 - q0| */
+ VMOVEQ.32 d16[0],r11 /* first bS is zero: disable the first 4 pixels */
+ TST r12,#0xff00
+ VCGT.U8 d19,d2,d19
+ VCGT.U8 d12,d2,d12
+ VMOVEQ.32 d16[1],r11 /* second bS is zero: disable the last 4 pixels */
+ VCGT.U8 d17,d2,d17
+ VLD1.8 {d11},[r0] /* q3; r0 stays at row +3 */
+ VAND d16,d16,d12 /* d16 = per-pixel filter-enable mask */
+ TST r12,#4 /* only bit 2 of the first bS byte picks the strong-filter path */
+ VAND d12,d16,d17
+ VAND d17,d16,d19
+ BNE L0xf8
+ SUB r0,r0,r1,LSL #2
+ SUB r0,r0,r1 /* r0 = row -2 (the p1 row) */
+ BL armVCM4P10_DeblockingLumabSLT4_unsafe
+ VST1.8 {d30},[r0],r1 /* rows -2..+1 are rewritten from the helper's d30,d29,d24,d25 */
+ VST1.8 {d29},[r0],r1
+ SUB r6,r0,r1,LSL #2 /* r6 = row -4 of this half */
+ VST1.8 {d24},[r0],r1
+ ADDS r9,r9,r9
+ VST1.8 {d25},[r0]
+ ADD r0,r6,#8 /* move to the next 8 columns */
+ BCC L0x38 /* no ADD to r5 on this path; presumably the helper advances it - confirm */
+ B L0x130
+L0xe4: /* skip path for an all-zero bS pair */
+ ADD r0,r0,#8
+ ADDS r9,r9,r9
+ ADD r5,r5,#2 /* skip this half's two r5 table bytes */
+ BCC L0x38
+ B L0x130
+L0xf8: /* strong-filter path */
+ SUB r0,r0,r1,LSL #2
+ SUB r0,r0,r1,LSL #1 /* r0 = row -3 (the p2 row) */
+ BL armVCM4P10_DeblockingLumabSGE4_unsafe
+ VST1.8 {d31},[r0],r1 /* rows -3..+2 are rewritten from the helper's d31,d30,d29,d24,d25,d28 */
+ VST1.8 {d30},[r0],r1
+ VST1.8 {d29},[r0],r1
+ SUB r6,r0,r1,LSL #2 /* r6 = row -4 of this half */
+ VST1.8 {d24},[r0],r1
+ ADDS r9,r9,r9
+ VST1.8 {d25},[r0],r1
+ ADD r5,r5,#2
+ VST1.8 {d28},[r0]
+ ADD r0,r6,#8
+ BCC L0x38
+L0x130: /* end of an edge: both halves done */
+ SUB r0,r0,#0x10 /* back to column 0 */
+ VLD1.8 {d0[]},[r7] /* second table entries for all later edges */
+ ADD r0,r0,r1,LSL #2 /* down 4 rows to the next edge */
+ VLD1.8 {d2[]},[r8]
+ BNE L0x38 /* Z comes from the last ADDS on r9: zero after the fourth edge */
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S
new file mode 100644
index 0000000..66cc32e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.S
@@ -0,0 +1,157 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_FilterDeblockingLuma_VerEdge_I
+ .func omxVCM4P10_FilterDeblockingLuma_VerEdge_I
+omxVCM4P10_FilterDeblockingLuma_VerEdge_I: /* Filters four vertical edges of a 16-row block, 4 columns apart, in two bands of 8 rows; each 8x8 tile is transposed so the shared helpers can work on rows. In: r0 = pixel pointer at the first edge column, r1 = row step, r2/r3 = two-entry byte tables (presumably alpha and beta; entry 0 for the first edge, entry 1 for the rest), stack arg 1 -> r5 (presumably thresholds), stack arg 2 -> r4 = bS bytes. Returns 0. */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d15}
+ ADD r7,r2,#1 /* address of the second r2 table entry */
+ ADD r8,r3,#1 /* address of the second r3 table entry */
+ VLD1.8 {d0[]},[r2] /* first-edge values in every lane */
+ SUB r0,r0,#4 /* start 4 columns left of the edge */
+ VLD1.8 {d2[]},[r3]
+ LDR r4,[sp,#0x6c] /* 2nd stack arg (40-byte PUSH + 64-byte VPUSH = 0x68 to the first) */
+ LDR r5,[sp,#0x68] /* 1st stack arg */
+ MOV r6,#0
+ VMOV.I8 d14,#0
+ VMOV.I8 d15,#0x1
+ MOV r9,#0x11000000 /* loop counter: carry out on every 4th doubling (end of a band), zero after the 8th */
+ ADD r11,r1,r1 /* two-row stride */
+L0x38:
+ LDRH r12,[r4],#4 /* two bS bytes for this band of the edge; the other band's two are skipped */
+ CMP r12,#0
+ BEQ L0x160 /* both bS are zero: skip this tile */
+ ADD r10,r0,r1 /* r0 walks even rows, r10 walks odd rows */
+ VLD1.8 {d7},[r0],r11
+ VLD1.8 {d8},[r10],r11
+ VLD1.8 {d5},[r0],r11
+ VZIP.8 d7,d8 /* 8x8 byte transpose, interleaved with the loads */
+ VLD1.8 {d10},[r10],r11
+ VLD1.8 {d6},[r0],r11
+ VZIP.8 d5,d10
+ VLD1.8 {d9},[r10],r11
+ VLD1.8 {d4},[r0],r11
+ VLD1.8 {d11},[r10],r11
+ VZIP.8 d6,d9
+ VZIP.16 d8,d10
+ VZIP.8 d4,d11
+ SUB r0,r0,r1,LSL #3 /* back to the first of the 8 rows */
+ VZIP.16 d7,d5
+ VZIP.16 d9,d11
+ VZIP.16 d6,d4
+ VTRN.32 d8,d9
+ VTRN.32 d5,d4
+ VTRN.32 d10,d11
+ VTRN.32 d7,d6 /* columns are now d7,d6,d5,d4 | d8,d9,d10,d11 = p3..p0 | q0..q3 */
+ VABD.U8 d13,d4,d8 /* |p0 - q0| */
+ VABD.U8 d12,d5,d4 /* |p1 - p0| */
+ VABD.U8 d18,d9,d8 /* |q1 - q0| */
+ VABD.U8 d19,d6,d4 /* |p2 - p0| */
+ TST r12,#0xff
+ VCGT.U8 d16,d0,d13 /* d0 > |p0 - q0| */
+ VMAX.U8 d12,d18,d12
+ VABD.U8 d17,d10,d8 /* |q2 - q0| */
+ VMOVEQ.32 d16[0],r6 /* first bS is zero: disable rows 0-3 */
+ TST r12,#0xff00
+ VCGT.U8 d19,d2,d19
+ VCGT.U8 d12,d2,d12
+ VMOVEQ.32 d16[1],r6 /* second bS is zero: disable rows 4-7 */
+ VCGT.U8 d17,d2,d17
+ VAND d16,d16,d12 /* d16 = per-pixel filter-enable mask */
+ TST r12,#4 /* only bit 2 of the first bS byte picks the strong-filter path */
+ VAND d12,d16,d17
+ VAND d17,d16,d19
+ BNE L0x17c
+ BL armVCM4P10_DeblockingLumabSLT4_unsafe
+ VZIP.8 d7,d6 /* transpose back: columns d7,d6,d30,d29,d24,d25,d10,d11 */
+ VZIP.8 d30,d29
+ VZIP.8 d24,d25
+ VZIP.8 d10,d11
+ VZIP.16 d7,d30
+ ADD r10,r0,r1
+ VZIP.16 d24,d10
+ VZIP.16 d25,d11
+ VZIP.16 d6,d29
+ VTRN.32 d7,d24
+ VTRN.32 d30,d10
+ VTRN.32 d6,d25
+ VTRN.32 d29,d11
+ VST1.8 {d7},[r0],r11 /* write the 8 rows back, even rows via r0 and odd rows via r10 */
+ VST1.8 {d24},[r10],r11
+ VST1.8 {d30},[r0],r11
+ VST1.8 {d10},[r10],r11
+ VST1.8 {d6},[r0],r11
+ VST1.8 {d25},[r10],r11
+ ADDS r9,r9,r9
+ VST1.8 {d29},[r0],r11
+ ADD r5,r5,#2 /* +2 here versus +4 on the other two paths; presumably the helper adds the rest - confirm */
+ VST1.8 {d11},[r10],r1 /* last use of r10, so the differing post-increment has no effect */
+ SUB r0,r0,r1,LSL #3
+ VLD1.8 {d0[]},[r7] /* second table entries for the following edges of this band */
+ ADD r0,r0,#4 /* next edge is 4 columns to the right */
+ VLD1.8 {d2[]},[r8]
+ BCC L0x38
+ B L0x1f0
+L0x160: /* skip path for an all-zero bS pair */
+ ADD r0,r0,#4
+ ADDS r9,r9,r9
+ VLD1.8 {d0[]},[r7]
+ ADD r5,r5,#4 /* skip this edge's r5 table bytes */
+ VLD1.8 {d2[]},[r8]
+ BCC L0x38
+ B L0x1f0
+L0x17c: /* strong-filter path */
+ BL armVCM4P10_DeblockingLumabSGE4_unsafe
+ VZIP.8 d7,d31 /* transpose back: columns d7,d31,d30,d29,d24,d25,d28,d11 */
+ VZIP.8 d30,d29
+ VZIP.8 d24,d25
+ VZIP.8 d28,d11
+ VZIP.16 d7,d30
+ ADD r10,r0,r1
+ VZIP.16 d24,d28
+ VZIP.16 d25,d11
+ VZIP.16 d31,d29
+ VTRN.32 d7,d24
+ VTRN.32 d30,d28
+ VTRN.32 d31,d25
+ VTRN.32 d29,d11
+ VST1.8 {d7},[r0],r11
+ VST1.8 {d24},[r10],r11
+ VST1.8 {d30},[r0],r11
+ VST1.8 {d28},[r10],r11
+ VST1.8 {d31},[r0],r11
+ VST1.8 {d25},[r10],r11
+ ADDS r9,r9,r9
+ VST1.8 {d29},[r0],r11
+ ADD r5,r5,#4
+ VST1.8 {d11},[r10],r11
+ SUB r0,r0,r1,LSL #3
+ VLD1.8 {d0[]},[r7]
+ ADD r0,r0,#4
+ VLD1.8 {d2[]},[r8]
+ BCC L0x38
+L0x1f0: /* end of a band: all four edges of these 8 rows are done */
+ SUB r4,r4,#0xe /* bS pointer advanced 16 in the band; net +2 selects the second pair of each edge */
+ SUB r5,r5,#0xe /* same net +2 adjustment for the r5 table */
+ SUB r0,r0,#0x10 /* back to the first edge column */
+ VLD1.8 {d0[]},[r2] /* first-edge table entries again */
+ ADD r0,r0,r1,LSL #3 /* down 8 rows to the second band */
+ VLD1.8 {d2[]},[r3]
+ BNE L0x38 /* Z comes from the last ADDS on r9: zero after the second band */
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S
new file mode 100644
index 0000000..76c3d7d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_InterpolateLuma_s.S
@@ -0,0 +1,323 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_InterpolateLuma
+ .func omxVCM4P10_InterpolateLuma
+omxVCM4P10_InterpolateLuma: /* Luma interpolation driver: walks the block in 4x4 tiles, dispatches each tile through a 16-entry jump table, returns 0 in r0 */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d15}
+ SUB sp,sp,#0x10 /* 16-byte scratch area that holds r0-r3 while a tile is processed */
+ LDR r6,[sp,#0x78] /* 0x78 = 40 (PUSH) + 64 (VPUSH) + 16 (scratch): first stack argument (presumably dx - confirm against the OpenMAX DL prototype) */
+ LDR r7,[sp,#0x7c] /* second stack argument (presumably dy) */
+ LDR r5,[sp,#0x80] /* third stack argument, used below as the horizontal count (presumably roi width) */
+ LDR r4,[sp,#0x84] /* fourth stack argument, used below as the vertical count (presumably roi height) */
+ ADD r6,r6,r7,LSL #2 /* jump-table index = r6 + 4*r7 */
+ ADD r11,sp,#0 /* r11 -> scratch area */
+ VMOV.I16 d31,#0x14 /* 20 in every 16-bit lane; presumably a filter tap read by the *_unsafe helpers - confirm */
+ VMOV.I16 d30,#0x5 /* 5 in every 16-bit lane; presumably a filter tap read by the *_unsafe helpers - confirm */
+L0x2c: /* per-tile entry: r0/r1 = source pointer/step, r2/r3 = destination pointer/step */
+ STM r11,{r0-r3} /* saved here, reloaded at L0x434 */
+ ADD pc,pc,r6,LSL #2 /* pc reads as this instruction + 8, so index 0 lands on the second branch below */
+ B L0x3f0 /* slot at +4: not reachable through the computed jump for a non-negative index */
+ B L0x78 /* index 0 */
+ B L0xa8 /* index 1 */
+ B L0xdc /* index 2 */
+ B L0x100 /* index 3 */
+ B L0x134 /* index 4 */
+ B L0x168 /* index 5 */
+ B L0x1a8 /* index 6 */
+ B L0x1f0 /* index 7 */
+ B L0x234 /* index 8 */
+ B L0x258 /* index 9 */
+ B L0x2b0 /* index 10 */
+ B L0x2d8 /* index 11 */
+ B L0x330 /* index 12 */
+ B L0x364 /* index 13 */
+ B L0x3a8 /* index 14 */
+ B L0x3f0 /* index 15 */
+L0x78: /* index 0: plain copy of the 4x4 tile */
+ ADD r12,r0,r1,LSL #1 /* r12 -> source row 2 */
+ VLD1.8 {d9},[r0],r1 /* row 0 */
+ VLD1.8 {d11},[r12],r1 /* row 2 */
+ VLD1.8 {d10},[r0] /* row 1 */
+ VLD1.8 {d12},[r12] /* row 3 */
+ ADD r12,r2,r3,LSL #1 /* r12 -> destination row 2 */
+ VST1.32 {d9[0]},[r2],r3 /* only the low four bytes of each row are written */
+ VST1.32 {d11[0]},[r12],r3
+ VST1.32 {d10[0]},[r2]
+ VST1.32 {d12[0]},[r12]
+ ADD r11,sp,#0 /* re-point r11 at the scratch area before the shared tail */
+ B L0x434
+L0xa8: /* index 1: HalfHor helper result averaged with d14/d16/d18/d20 */
+ SUB r0,r0,#2 /* helper is handed a pointer two bytes left of the tile */
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD.U8 d22,d22,d14 /* rounding average; d14/d16/d18/d20 are presumably samples left behind by the helper - confirm */
+ VRHADD.U8 d26,d26,d18
+ VRHADD.U8 d24,d24,d16
+ VRHADD.U8 d28,d28,d20
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d22[0]},[r2],r3 /* rows 0,1,2,3 come from d22,d24,d26,d28 */
+ VST1.32 {d26[0]},[r12],r3
+ VST1.32 {d24[0]},[r2]
+ VST1.32 {d28[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0xdc: /* index 2: HalfHor helper result stored unmodified */
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d22[0]},[r2],r3
+ VST1.32 {d26[0]},[r12],r3
+ VST1.32 {d24[0]},[r2]
+ VST1.32 {d28[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x100: /* index 3: as index 1 but averaged with d15/d17/d19/d21 */
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD.U8 d22,d22,d15
+ VRHADD.U8 d26,d26,d19
+ VRHADD.U8 d24,d24,d17
+ VRHADD.U8 d28,d28,d21
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d22[0]},[r2],r3
+ VST1.32 {d26[0]},[r12],r3
+ VST1.32 {d24[0]},[r2]
+ VST1.32 {d28[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x134: /* index 4: HalfVer helper result averaged with d9/d10/d11/d12 */
+ SUB r0,r0,r1,LSL #1 /* helper is handed a pointer two rows above the tile */
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ VRHADD.U8 d0,d0,d9 /* d9..d12 are presumably source rows kept by the helper - confirm */
+ VRHADD.U8 d4,d4,d11
+ VRHADD.U8 d2,d2,d10
+ VRHADD.U8 d6,d6,d12
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3 /* rows 0,1,2,3 come from d0,d2,d4,d6 */
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x168: /* index 5: average of the HalfVer and HalfHor helper results */
+ MOV r8,r0 /* keep the tile pointer across the first helper call */
+ SUB r0,r0,r1,LSL #1
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ SUB r0,r8,#2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD.U8 d22,d22,d0
+ VRHADD.U8 d26,d26,d4
+ VRHADD.U8 d24,d24,d2
+ VRHADD.U8 d28,d28,d6
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d22[0]},[r2],r3
+ VST1.32 {d26[0]},[r12],r3
+ VST1.32 {d24[0]},[r2]
+ VST1.32 {d28[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x1a8: /* index 6: HalfDiagHorVer helper result averaged with narrowed q7..q10 */
+ SUB r0,r0,r1,LSL #1 /* helper is handed a pointer two rows up and two bytes left */
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ VQRSHRUN.S16 d14,q7,#5 /* rounding shift right by 5 with unsigned saturation to bytes */
+ VQRSHRUN.S16 d16,q8,#5
+ VQRSHRUN.S16 d18,q9,#5
+ VQRSHRUN.S16 d20,q10,#5
+ VRHADD.U8 d0,d0,d14
+ VRHADD.U8 d4,d4,d18
+ VRHADD.U8 d2,d2,d16
+ VRHADD.U8 d6,d6,d20
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x1f0: /* index 7: as index 5, with the HalfVer helper started one byte to the right */
+ MOV r8,r0
+ ADD r0,r0,#1
+ SUB r0,r0,r1,LSL #1
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ SUB r0,r8,#2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD.U8 d22,d22,d0
+ VRHADD.U8 d26,d26,d4
+ VRHADD.U8 d24,d24,d2
+ VRHADD.U8 d28,d28,d6
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d22[0]},[r2],r3
+ VST1.32 {d26[0]},[r12],r3
+ VST1.32 {d24[0]},[r2]
+ VST1.32 {d28[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x234: /* index 8: HalfVer helper result stored unmodified */
+ SUB r0,r0,r1,LSL #1
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x258: /* index 9: HalfDiagVerHor helper result averaged with q9..q12 shifted by two 16-bit lanes and narrowed */
+ SUB r0,r0,r1,LSL #1
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+ VEXT.8 d18,d18,d19,#4 /* byte offset 4 = two 16-bit lanes */
+ VEXT.8 d20,d20,d21,#4
+ VEXT.8 d22,d22,d23,#4
+ VEXT.8 d24,d24,d25,#4
+ VQRSHRUN.S16 d14,q9,#5
+ VQRSHRUN.S16 d16,q10,#5
+ VQRSHRUN.S16 d18,q11,#5
+ VQRSHRUN.S16 d20,q12,#5
+ VRHADD.U8 d0,d0,d14
+ VRHADD.U8 d4,d4,d18
+ VRHADD.U8 d2,d2,d16
+ VRHADD.U8 d6,d6,d20
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x2b0: /* index 10: HalfDiagHorVer helper result stored unmodified */
+ SUB r0,r0,r1,LSL #1
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x2d8: /* index 11: as index 9 but shifted by three 16-bit lanes */
+ SUB r0,r0,r1,LSL #1
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
+ VEXT.8 d18,d18,d19,#6 /* byte offset 6 = three 16-bit lanes */
+ VEXT.8 d20,d20,d21,#6
+ VEXT.8 d22,d22,d23,#6
+ VEXT.8 d24,d24,d25,#6
+ VQRSHRUN.S16 d14,q9,#5
+ VQRSHRUN.S16 d16,q10,#5
+ VQRSHRUN.S16 d18,q11,#5
+ VQRSHRUN.S16 d20,q12,#5
+ VRHADD.U8 d0,d0,d14
+ VRHADD.U8 d4,d4,d18
+ VRHADD.U8 d2,d2,d16
+ VRHADD.U8 d6,d6,d20
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x330: /* index 12: as index 4 but averaged with d10/d11/d12/d13 (one register further on) */
+ SUB r0,r0,r1,LSL #1
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ VRHADD.U8 d0,d0,d10
+ VRHADD.U8 d4,d4,d12
+ VRHADD.U8 d2,d2,d11
+ VRHADD.U8 d6,d6,d13
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x364: /* index 13: as index 5, with the HalfHor helper started one row lower */
+ MOV r8,r0
+ SUB r0,r0,r1,LSL #1
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ ADD r0,r8,r1
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD.U8 d22,d22,d0
+ VRHADD.U8 d26,d26,d4
+ VRHADD.U8 d24,d24,d2
+ VRHADD.U8 d28,d28,d6
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d22[0]},[r2],r3
+ VST1.32 {d26[0]},[r12],r3
+ VST1.32 {d24[0]},[r2]
+ VST1.32 {d28[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x3a8: /* index 14: as index 6 but narrowing q8..q11 (one register pair further on) */
+ SUB r0,r0,r1,LSL #1
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
+ VQRSHRUN.S16 d14,q8,#5 /* order matters: each destination overlaps a source that has already been consumed */
+ VQRSHRUN.S16 d16,q9,#5
+ VQRSHRUN.S16 d18,q10,#5
+ VQRSHRUN.S16 d20,q11,#5
+ VRHADD.U8 d0,d0,d14
+ VRHADD.U8 d4,d4,d18
+ VRHADD.U8 d2,d2,d16
+ VRHADD.U8 d6,d6,d20
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d0[0]},[r2],r3
+ VST1.32 {d4[0]},[r12],r3
+ VST1.32 {d2[0]},[r2]
+ VST1.32 {d6[0]},[r12]
+ ADD r11,sp,#0
+ B L0x434
+L0x3f0: /* index 15: HalfVer started one byte right, HalfHor started one row lower, results averaged */
+ MOV r8,r0
+ ADD r0,r0,#1
+ SUB r0,r0,r1,LSL #1
+ BL armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
+ ADD r0,r8,r1
+ SUB r0,r0,#2
+ BL armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
+ VRHADD.U8 d22,d22,d0
+ VRHADD.U8 d26,d26,d4
+ VRHADD.U8 d24,d24,d2
+ VRHADD.U8 d28,d28,d6
+ ADD r12,r2,r3,LSL #1
+ VST1.32 {d22[0]},[r2],r3
+ VST1.32 {d26[0]},[r12],r3
+ VST1.32 {d24[0]},[r2]
+ VST1.32 {d28[0]},[r12]
+ ADD r11,sp,#0
+L0x434: /* shared tail: advance to the next 4x4 tile */
+ LDM r11,{r0-r3} /* restore the pointers and steps saved at L0x2c */
+ SUBS r5,r5,#4 /* four columns consumed */
+ ADD r0,r0,#4
+ ADD r2,r2,#4
+ BGT L0x2c /* more tiles to the right (flags from the SUBS above) */
+ SUBS r4,r4,#4 /* four rows consumed */
+ LDR r5,[sp,#0x80] /* reload the horizontal count */
+ ADD r11,sp,#0
+ ADD r0,r0,r1,LSL #2 /* both pointers move down four rows ... */
+ ADD r2,r2,r3,LSL #2
+ SUB r0,r0,r5 /* ... and back by the horizontal count */
+ SUB r2,r2,r5
+ BGT L0x2c /* flags are still those of SUBS r4: nothing in between sets them */
+ MOV r0,#0 /* return value 0 */
+ ADD sp,sp,#0x10
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S
new file mode 100644
index 0000000..0d49e4b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntraChroma_8x8_s.S
@@ -0,0 +1,217 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .section .rodata
+ .align 4
+
+armVCM4P10_pIndexTable8x8: /* jump table, one word per prediction mode, indexed by the third stack argument */
+ .word OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR /* slots 0 and 1 */
+ .word OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE /* slots 2 and 3 */
+
+armVCM4P10_MultiplierTableChroma8x8: /* 16-bit constants read by the PLANE handler */
+ .hword 3, 2, 1,4 /* weights applied to the four neighbour differences */
+ .hword -3,-2,-1,0 /* position offsets, lanes 0-3 */
+ .hword 1, 2, 3,4 /* position offsets, lanes 4-7 */
+
+
+ .text
+ .global omxVCM4P10_PredictIntraChroma_8x8
+ .func omxVCM4P10_PredictIntraChroma_8x8
+omxVCM4P10_PredictIntraChroma_8x8: /* 8x8 intra prediction: r0 = left column, r1 = row above, r2 = above-left sample, r3 = destination; returns 0 in r0 */
+ PUSH {r4-r10,lr}
+ VPUSH {d8-d15}
+ LDR r8, =armVCM4P10_pIndexTable8x8
+ LDR r6,[sp,#0x68] /* 0x60 = 32 (PUSH) + 64 (VPUSH), so this is the third stack argument: table index */
+ LDR r4,[sp,#0x60] /* first stack argument: byte step between left-column samples */
+ LDR r5,[sp,#0x64] /* second stack argument: byte step between destination rows */
+ LDR r7,[sp,#0x6c] /* fourth stack argument: availability bit mask */
+ LDR pc,[r8,r6,LSL #2] /* jump to the handler; r6 is not range-checked here */
+OMX_VC_CHROMA_DC:
+ TST r7,#2 /* bit 1 gates the reads through r0 (left column) */
+ BEQ L0xe8
+ ADD r9,r0,r4 /* r0 walks even rows, r9 odd rows */
+ ADD r10,r4,r4
+ VLD1.8 {d1[0]},[r0],r10 /* gather the eight left samples into d1 */
+ VLD1.8 {d1[1]},[r9],r10
+ VLD1.8 {d1[2]},[r0],r10
+ VLD1.8 {d1[3]},[r9],r10
+ VLD1.8 {d1[4]},[r0],r10
+ VLD1.8 {d1[5]},[r9],r10
+ VLD1.8 {d1[6]},[r0],r10
+ VLD1.8 {d1[7]},[r9]
+ TST r7,#1 /* bit 0 gates the read through r1 (row above) */
+ BEQ L0xcc
+ VLD1.8 {d0},[r1] /* both neighbours present: d0 = eight samples above */
+ MOV r0,#0 /* return value, set early */
+ VPADDL.U8 d2,d0
+ VPADDL.U16 d3,d2 /* d3 = { sum above[0..3], sum above[4..7] } */
+ VPADDL.U8 d2,d1
+ VPADDL.U16 d1,d2 /* d1 = { sum left[0..3], sum left[4..7] } */
+ VADD.I32 d2,d3,d1 /* d2 = above + left, per half */
+ VRSHR.U32 d2,d2,#3 /* (sum of 8 + 4) >> 3 */
+ VRSHR.U32 d3,d3,#2 /* (sum of 4 + 2) >> 2 */
+ VRSHR.U32 d1,d1,#2
+ VMOV.I8 d5,#0xc
+ VMOV.I8 d6,#0x4
+ VSHL.I64 d5,d5,#32 /* d5 = table indices 0,0,0,0,12,12,12,12 */
+ VSHR.U64 d6,d6,#32
+ VADD.I8 d6,d6,d5 /* d6 = table indices 4,4,4,4,12,12,12,12 */
+ VTBL.8 d0,{d2-d3},d5 /* rows 0-3: left half = d2[0] (above+left), right half = d3[1] (above only) */
+ VTBL.8 d4,{d1-d2},d6 /* rows 4-7: left half = d1[1] (left only), right half = d2[1] (above+left) */
+L0x9c: /* store d0 to rows 0-3 and d4 to rows 4-7 */
+ ADD r9,r3,r5
+ ADD r10,r5,r5
+ VST1.8 {d0},[r3],r10
+ VST1.8 {d0},[r9],r10
+ VST1.8 {d0},[r3],r10
+ VST1.8 {d0},[r9],r10
+ VST1.8 {d4},[r3],r10
+ VST1.8 {d4},[r9],r10
+ VST1.8 {d4},[r3],r10
+ VST1.8 {d4},[r9]
+ VPOP {d8-d15}
+ POP {r4-r10,pc}
+L0xcc: /* left column only */
+ MOV r0,#0
+ VPADDL.U8 d2,d1
+ VPADDL.U16 d1,d2
+ VRSHR.U32 d1,d1,#2 /* rounded mean of left[0..3] and of left[4..7] */
+ VDUP.8 d0,d1[0] /* rows 0-3 filled with the upper-half mean */
+ VDUP.8 d4,d1[4] /* rows 4-7 filled with the lower-half mean */
+ B L0x9c
+L0xe8: /* no left column */
+ TST r7,#1
+ BEQ L0x114
+ VLD1.8 {d0},[r1] /* row above only */
+ MOV r0,#0
+ VPADDL.U8 d2,d0
+ VPADDL.U16 d3,d2
+ VRSHR.U32 d3,d3,#2 /* rounded mean of above[0..3] and of above[4..7] */
+ VMOV.I8 d5,#0x4
+ VSHL.I64 d5,d5,#32 /* table indices 0,0,0,0,4,4,4,4 */
+ VTBL.8 d0,{d3},d5 /* left four bytes = first mean, right four bytes = second mean */
+ B L0x11c
+L0x114: /* neither neighbour: fill with 0x80 */
+ VMOV.I8 d0,#0x80
+ MOV r0,#0
+L0x11c: /* store d0 to all eight rows */
+ ADD r9,r3,r5
+ ADD r10,r5,r5
+ VST1.8 {d0},[r3],r10
+ VST1.8 {d0},[r9],r10
+ VST1.8 {d0},[r3],r10
+ VST1.8 {d0},[r9],r10
+ VST1.8 {d0},[r3],r10
+ VST1.8 {d0},[r9],r10
+ VST1.8 {d0},[r3],r10
+ VST1.8 {d0},[r9]
+ VPOP {d8-d15}
+ POP {r4-r10,pc}
+OMX_VC_CHROMA_VERT: /* every row is a copy of the eight samples above */
+ VLD1.8 {d0},[r1]
+ MOV r0,#0
+ B L0x11c
+OMX_VC_CHROMA_HOR: /* row i is its left sample replicated eight times */
+ ADD r9,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d0[]},[r0],r10 /* load-and-replicate: d0..d7 hold rows 0..7 */
+ VLD1.8 {d1[]},[r9],r10
+ VLD1.8 {d2[]},[r0],r10
+ VLD1.8 {d3[]},[r9],r10
+ VLD1.8 {d4[]},[r0],r10
+ VLD1.8 {d5[]},[r9],r10
+ VLD1.8 {d6[]},[r0],r10
+ VLD1.8 {d7[]},[r9]
+ B L0x28c
+OMX_VC_CHROMA_PLANE: /* NOTE(review): looks like the H.264 chroma plane predictor - confirm against the standard */
+ ADD r9,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d0},[r1] /* d0 = eight samples above */
+ VLD1.8 {d2[0]},[r2] /* d2[0] = above-left sample; the other lanes of d2 are not initialised here */
+ VLD1.8 {d1[0]},[r0],r10 /* d1 = eight left samples */
+ VLD1.8 {d1[1]},[r9],r10
+ VLD1.8 {d1[2]},[r0],r10
+ VLD1.8 {d1[3]},[r9],r10
+ VLD1.8 {d1[4]},[r0],r10
+ VLD1.8 {d1[5]},[r9],r10
+ VLD1.8 {d1[6]},[r0],r10
+ VLD1.8 {d1[7]},[r9]
+ VREV64.8 d3,d0
+ VSUBL.U8 q3,d3,d2 /* lane 0 = above[7] - aboveLeft; only lane 0 is consumed below */
+ VSHR.U64 d3,d3,#8
+ VSUBL.U8 q2,d3,d0 /* lane i = above[6-i] - above[i] */
+ VREV64.8 d3,d1
+ VSUBL.U8 q7,d3,d2 /* lane 0 = left[7] - aboveLeft */
+ VSHR.U64 d3,d3,#8
+ VSUBL.U8 q6,d3,d1 /* lane i = left[6-i] - left[i] */
+ LDR r2, =armVCM4P10_MultiplierTableChroma8x8 /* r2 is reused as the table pointer from here on */
+ VSHL.I64 d4,d4,#16
+ VEXT.8 d9,d4,d6,#2 /* d9 = three symmetric differences followed by (above[7] - aboveLeft) */
+ VLD1.16 {d10},[r2]! /* d10 = weights 3,2,1,4 */
+ VSHL.I64 d12,d12,#16
+ VEXT.8 d16,d12,d14,#2 /* same arrangement for the left column */
+ VMUL.I16 d11,d9,d10
+ VMUL.I16 d3,d16,d10
+ VPADD.I16 d3,d11,d3
+ VPADDL.S16 d3,d3 /* d3 = { weighted sum over the row above, weighted sum over the left column } */
+ VSHL.I32 d2,d3,#4
+ VADD.I32 d3,d3,d2 /* times 17 */
+ VLD1.16 {d10,d11},[r2] /* q5 = position offsets -3..4 */
+ VRSHR.S32 d3,d3,#5 /* (17*sum + 16) >> 5: the two gradients */
+ VADDL.U8 q0,d0,d1
+ VDUP.16 q0,d1[3] /* broadcast above[7] + left[7] */
+ VSHL.I16 q0,q0,#4 /* 16 * (above[7] + left[7]) */
+ VDUP.16 q2,d3[0] /* horizontal gradient in every lane */
+ VDUP.16 q3,d3[2] /* vertical gradient in every lane */
+ VMUL.I16 q2,q2,q5 /* horizontal gradient * column offset */
+ VMUL.I16 q3,q3,q5 /* vertical gradient * row offset, one lane per row */
+ VADD.I16 q2,q2,q0 /* q2 = row-independent part of the prediction */
+ VDUP.16 q0,d6[0] /* broadcast the per-row term for rows 0..7 */
+ VDUP.16 q1,d6[1]
+ VDUP.16 q4,d6[2]
+ VDUP.16 q5,d6[3]
+ VDUP.16 q6,d7[0]
+ VDUP.16 q7,d7[1]
+ VDUP.16 q8,d7[2]
+ VDUP.16 q9,d7[3]
+ VADD.I16 q0,q2,q0
+ VADD.I16 q1,q2,q1
+ VADD.I16 q4,q2,q4
+ VADD.I16 q5,q2,q5
+ VADD.I16 q6,q2,q6
+ VADD.I16 q7,q2,q7
+ VADD.I16 q8,q2,q8
+ VADD.I16 q9,q2,q9
+ VQRSHRUN.S16 d0,q0,#5 /* rounding shift right by 5, saturated to unsigned bytes: d0..d7 = rows 0..7 */
+ VQRSHRUN.S16 d1,q1,#5
+ VQRSHRUN.S16 d2,q4,#5
+ VQRSHRUN.S16 d3,q5,#5
+ VQRSHRUN.S16 d4,q6,#5
+ VQRSHRUN.S16 d5,q7,#5
+ VQRSHRUN.S16 d6,q8,#5
+ VQRSHRUN.S16 d7,q9,#5
+L0x28c: /* store d0..d7 as rows 0..7 (shared by HOR and PLANE) */
+ ADD r9,r3,r5
+ ADD r10,r5,r5
+ VST1.8 {d0},[r3],r10
+ VST1.8 {d1},[r9],r10
+ VST1.8 {d2},[r3],r10
+ VST1.8 {d3},[r9],r10
+ VST1.8 {d4},[r3],r10
+ VST1.8 {d5},[r9],r10
+ VST1.8 {d6},[r3],r10
+ VST1.8 {d7},[r9]
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r10,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
new file mode 100644
index 0000000..53268f6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
@@ -0,0 +1,239 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+
+ .section .rodata
+ .align 4
+;//-------------------------------------------------------
+;// This table for implementing switch case of C in asm by
+;// the method of two levels of indexing.
+;//-------------------------------------------------------
+
+armVCM4P10_pIndexTable16x16: /* jump table, one word per prediction mode */
+ .word OMX_VC_16X16_VERT, OMX_VC_16X16_HOR /* slots 0 and 1 */
+ .word OMX_VC_16X16_DC, OMX_VC_16X16_PLANE /* slots 2 and 3 */
+
+
+
+armVCM4P10_MultiplierTable16x16: /* 16-bit constants read by the PLANE handler */
+ .hword 7, 6, 5, 4, 3, 2, 1, 8 /* weights applied to the eight neighbour differences */
+ .hword 0, 1, 2, 3, 4, 5, 6, 7 /* column numbers 0-7 */
+ .hword 8, 9, 10, 11, 12, 13, 14, 15 /* column numbers 8-15 */
+
+ .text
+
+ .global omxVCM4P10_PredictIntra_16x16
+ .func omxVCM4P10_PredictIntra_16x16
+omxVCM4P10_PredictIntra_16x16: /* 16x16 intra prediction: r0 = left column, r1 = row above, r2 = above-left sample, r3 = destination; returns 0 in r0 */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d15}
+ LDR r9, =armVCM4P10_pIndexTable16x16
+ LDR r6,[sp,#0x70] /* 0x68 = 40 (PUSH) + 64 (VPUSH), so this is the third stack argument: table index */
+ LDR r4,[sp,#0x68] /* first stack argument: byte step between left-column samples */
+ LDR r5,[sp,#0x6c] /* second stack argument: byte step between destination rows */
+ LDR r7,[sp,#0x74] /* fourth stack argument: availability bit mask */
+ MOV r12,#0x10 /* row counter used by the HOR and PLANE loops */
+ LDR pc,[r9,r6,LSL #2] /* jump to the handler; r6 is not range-checked here */
+OMX_VC_16X16_VERT: /* every row is a copy of the sixteen samples above */
+ VLD1.8 {d0,d1},[r1]
+ ADD r8,r3,r5 /* r3 walks even rows, r8 odd rows */
+ ADD r10,r5,r5
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ VST1.8 {d0,d1},[r3]
+ VST1.8 {d0,d1},[r8]
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+OMX_VC_16X16_HOR: /* row i is its left sample replicated sixteen times */
+ ADD r8,r0,r4 /* r0/r3 walk even rows, r8/r11 odd rows */
+ ADD r4,r4,r4
+ ADD r11,r3,r5
+ ADD r5,r5,r5
+L0x8c: /* eight rows per pass, two passes */
+ VLD1.8 {d2[],d3[]},[r0],r4 /* load one left sample and replicate it across q1 */
+ VLD1.8 {d0[],d1[]},[r8],r4
+ SUBS r12,r12,#8 /* sets the flags tested by the BNE below; nothing in between changes them */
+ VST1.8 {d2,d3},[r3],r5
+ VST1.8 {d0,d1},[r11],r5
+ VLD1.8 {d2[],d3[]},[r0],r4
+ VLD1.8 {d0[],d1[]},[r8],r4
+ VST1.8 {d2,d3},[r3],r5
+ VST1.8 {d0,d1},[r11],r5
+ VLD1.8 {d2[],d3[]},[r0],r4
+ VLD1.8 {d0[],d1[]},[r8],r4
+ VST1.8 {d2,d3},[r3],r5
+ VST1.8 {d0,d1},[r11],r5
+ VLD1.8 {d2[],d3[]},[r0],r4
+ VLD1.8 {d0[],d1[]},[r8],r4
+ VST1.8 {d2,d3},[r3],r5
+ VST1.8 {d0,d1},[r11],r5
+ BNE L0x8c
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+OMX_VC_16X16_DC:
+ MOV r11,#0 /* r11 counts how many neighbour sets were summed */
+ TST r7,#2 /* bit 1 gates the reads through r0 (left column) */
+ BEQ L0x14c
+ ADD r8,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d2[0]},[r0],r10 /* gather the sixteen left samples into q1 */
+ VLD1.8 {d2[1]},[r8],r10
+ VLD1.8 {d2[2]},[r0],r10
+ VLD1.8 {d2[3]},[r8],r10
+ VLD1.8 {d2[4]},[r0],r10
+ VLD1.8 {d2[5]},[r8],r10
+ VLD1.8 {d2[6]},[r0],r10
+ VLD1.8 {d2[7]},[r8],r10
+ VLD1.8 {d3[0]},[r0],r10
+ VLD1.8 {d3[1]},[r8],r10
+ VLD1.8 {d3[2]},[r0],r10
+ VLD1.8 {d3[3]},[r8],r10
+ VLD1.8 {d3[4]},[r0],r10
+ VLD1.8 {d3[5]},[r8],r10
+ VLD1.8 {d3[6]},[r0],r10
+ VLD1.8 {d3[7]},[r8]
+ VPADDL.U8 q0,q1
+ ADD r11,r11,#1
+ VPADD.I16 d0,d0,d1
+ VPADDL.U16 d0,d0
+ VPADDL.U32 d6,d0 /* d6 = sum of the sixteen left samples */
+ VRSHR.U64 d8,d6,#4 /* (sum + 8) >> 4, used as is when only this side is present */
+L0x14c:
+ TST r7,#1 /* bit 0 gates the read through r1 (row above) */
+ BEQ L0x170
+ VLD1.8 {d0,d1},[r1]
+ ADD r11,r11,#1
+ VPADDL.U8 q0,q0
+ VPADD.I16 d0,d0,d1
+ VPADDL.U16 d0,d0
+ VPADDL.U32 d7,d0 /* d7 = sum of the sixteen samples above */
+ VRSHR.U64 d8,d7,#4 /* (sum + 8) >> 4 */
+L0x170:
+ CMP r11,#2
+ BNE L0x180
+ VADD.I64 d8,d7,d6 /* both sides present: (left + above + 16) >> 5 */
+ VRSHR.U64 d8,d8,#5
+L0x180:
+ VDUP.8 q3,d8[0] /* fill value in all sixteen bytes (d8 is not meaningful when r11 == 0) */
+ CMP r11,#0
+ ADD r8,r3,r5 /* these ADDs leave the CMP flags intact */
+ ADD r10,r5,r5
+ BNE L0x198
+ VMOV.I8 q3,#0x80 /* neither side present: fill with 0x80 */
+L0x198: /* store q3 to all sixteen rows */
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+OMX_VC_16X16_PLANE: /* NOTE(review): looks like the H.264 16x16 plane predictor - confirm against the standard */
+ LDR r9, =armVCM4P10_MultiplierTable16x16
+ VLD1.8 {d0,d1},[r1] /* q0 = sixteen samples above */
+ VLD1.8 {d4[0]},[r2] /* d4[0] = above-left sample; the other lanes of d4 are not initialised here */
+ ADD r8,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d2[0]},[r0],r10 /* q1 = sixteen left samples */
+ VLD1.8 {d2[1]},[r8],r10
+ VLD1.8 {d2[2]},[r0],r10
+ VLD1.8 {d2[3]},[r8],r10
+ VLD1.8 {d2[4]},[r0],r10
+ VLD1.8 {d2[5]},[r8],r10
+ VLD1.8 {d2[6]},[r0],r10
+ VLD1.8 {d2[7]},[r8],r10
+ VLD1.8 {d3[0]},[r0],r10
+ VLD1.8 {d3[1]},[r8],r10
+ VLD1.8 {d3[2]},[r0],r10
+ VLD1.8 {d3[3]},[r8],r10
+ VLD1.8 {d3[4]},[r0],r10
+ VLD1.8 {d3[5]},[r8],r10
+ VLD1.8 {d3[6]},[r0],r10
+ VLD1.8 {d3[7]},[r8]
+ VREV64.8 d5,d1
+ VSUBL.U8 q3,d5,d4 /* lane 0 = above[15] - aboveLeft; only lane 0 is consumed below */
+ VSHR.U64 d5,d5,#8
+ VSUBL.U8 q4,d5,d0 /* lane i = above[14-i] - above[i] for i = 0..6 */
+ VSHL.I64 d9,d9,#16
+ VEXT.8 d9,d9,d6,#2 /* replace the unused last lane with (above[15] - aboveLeft) */
+ VREV64.8 d12,d3
+ VSUBL.U8 q7,d12,d4 /* same construction for the left column */
+ VSHR.U64 d12,d12,#8
+ VSUBL.U8 q8,d12,d2
+ VLD1.16 {d20,d21},[r9]! /* q10 = weights 7,6,5,4,3,2,1,8 */
+ VSHL.I64 d17,d17,#16
+ VEXT.8 d17,d17,d14,#2
+ VMULL.S16 q11,d8,d20
+ VMULL.S16 q12,d16,d20
+ VMLAL.S16 q11,d9,d21
+ VMLAL.S16 q12,d17,d21
+ VPADD.I32 d22,d23,d22
+ VPADD.I32 d23,d25,d24
+ VPADDL.S32 q11,q11 /* d22 = weighted sum over the row above, d23 = weighted sum over the left column */
+ VSHL.I64 q12,q11,#2
+ VADD.I64 q11,q11,q12 /* times 5 */
+ VRSHR.S64 q11,q11,#6 /* (5*sum + 32) >> 6: the two gradients */
+ VSHL.I64 q12,q11,#3
+ VSUB.I64 q12,q12,q11 /* 7 * gradient */
+ VLD1.16 {d20,d21},[r9]! /* q10 = column numbers 0..7 */
+ VDUP.16 q6,d22[0] /* horizontal gradient in every lane */
+ VDUP.16 q7,d23[0] /* vertical gradient in every lane: the per-row increment */
+ VADDL.U8 q11,d1,d3
+ VSHL.I16 q11,q11,#4
+ VDUP.16 q11,d23[3] /* 16 * (above[15] + left[15]) in every lane */
+ VADD.I64 d1,d24,d25 /* 7 * (horizontal + vertical gradient) */
+ VLD1.16 {d24,d25},[r9] /* q12 = column numbers 8..15 */
+ VDUP.16 q13,d1[0]
+ VSUB.I16 q13,q11,q13 /* base value for row 0, column 0 */
+ VMUL.I16 q5,q6,q10
+ VMUL.I16 q6,q6,q12
+ VADD.I16 q0,q5,q13 /* row 0, columns 0..7 */
+ VADD.I16 q1,q6,q13 /* row 0, columns 8..15 */
+L0x2d4: /* one row per pass, sixteen passes */
+ VQRSHRUN.S16 d6,q0,#5 /* rounding shift right by 5, saturated to unsigned bytes */
+ VQRSHRUN.S16 d7,q1,#5
+ SUBS r12,r12,#1 /* sets the flags tested by the BNE below */
+ VST1.8 {d6,d7},[r3],r5
+ VADD.I16 q0,q0,q7 /* step to the next row */
+ VADD.I16 q1,q1,q7
+ BNE L0x2d4
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S
new file mode 100644
index 0000000..aa6d7ef
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_4x4_s.S
@@ -0,0 +1,261 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+
+ .section .rodata
+ .align 4
+
+armVCM4P10_pSwitchTable4x4: /* jump table, one word per prediction mode (slots 0..8) */
+ .word OMX_VC_4x4_VERT, OMX_VC_4x4_HOR /* slots 0 and 1 */
+ .word OMX_VC_4x4_DC, OMX_VC_4x4_DIAG_DL /* slots 2 and 3 */
+ .word OMX_VC_4x4_DIAG_DR, OMX_VC_4x4_VR /* slots 4 and 5 */
+ .word OMX_VC_4x4_HD, OMX_VC_4x4_VL /* slots 6 and 7 */
+ .word OMX_VC_4x4_HU /* slot 8 */
+
+ .text
+
+ .global omxVCM4P10_PredictIntra_4x4
+ .func omxVCM4P10_PredictIntra_4x4
+omxVCM4P10_PredictIntra_4x4: /* 4x4 intra prediction: r0 = left column, r1 = row above, r2 = above-left sample, r3 = destination; returns 0 in r0 */
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d12}
+ LDR r8, =armVCM4P10_pSwitchTable4x4
+ LDRD r6,r7,[sp,#0x58] /* 0x50 = 40 (PUSH) + 40 (VPUSH): r6 = third stack argument (table index), r7 = fourth (availability mask) */
+ LDRD r4,r5,[sp,#0x50] /* r4 = first stack argument (left-column step), r5 = second (destination step) */
+ LDR pc,[r8,r6,LSL #2] /* jump to the handler; r6 is not range-checked here */
+OMX_VC_4x4_HOR: /* row i is its left sample replicated */
+ ADD r9,r0,r4 /* r0/r3 walk even rows, r9/r11 odd rows */
+ ADD r10,r4,r4
+ VLD1.8 {d0[]},[r0],r10
+ VLD1.8 {d1[]},[r9],r10
+ VLD1.8 {d2[]},[r0]
+ VLD1.8 {d3[]},[r9]
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ VST1.32 {d0[0]},[r3],r12
+ VST1.32 {d1[0]},[r11],r12
+ VST1.32 {d2[0]},[r3]
+ VST1.32 {d3[0]},[r11]
+ B L0x348
+OMX_VC_4x4_VERT: /* every row is a copy of the four samples above */
+ VLD1.32 {d0[0]},[r1]
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+L0x58: /* store the low word of d0 to all four rows (shared with the DC paths) */
+ VST1.32 {d0[0]},[r3],r12
+ VST1.32 {d0[0]},[r11],r12
+ VST1.32 {d0[0]},[r3]
+ VST1.32 {d0[0]},[r11]
+ B L0x348
+OMX_VC_4x4_DC:
+ TST r7,#2 /* bit 1 gates the reads through r0 (left column) */
+ BEQ L0xdc
+ ADD r9,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d0[0]},[r0],r10 /* four left samples into bytes 0-3 of d0 */
+ VLD1.8 {d0[1]},[r9],r10
+ VLD1.8 {d0[2]},[r0]
+ VLD1.8 {d0[3]},[r9]
+ TST r7,#1 /* bit 0 gates the read through r1 (row above) */
+ BEQ L0xbc
+ VLD1.32 {d0[1]},[r1] /* four samples above into bytes 4-7 of d0 */
+ MOV r0,#0
+ VPADDL.U8 d1,d0
+ VPADDL.U16 d1,d1
+ VPADDL.U32 d1,d1 /* sum of all eight samples */
+ VRSHR.U64 d1,d1,#3 /* (sum + 4) >> 3 */
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ VDUP.8 d0,d1[0]
+ B L0x58
+L0xbc: /* left column only */
+ MOV r0,#0
+ VPADDL.U8 d1,d0
+ VPADDL.U16 d1,d1 /* lane 0 = sum of bytes 0-3; the upper lane is not used */
+ VRSHR.U32 d1,d1,#2 /* (sum + 2) >> 2 */
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ VDUP.8 d0,d1[0]
+ B L0x58
+L0xdc: /* no left column */
+ TST r7,#1
+ BEQ L0x108
+ VLD1.32 {d0[0]},[r1] /* row above only */
+ MOV r0,#0
+ VPADDL.U8 d1,d0
+ VPADDL.U16 d1,d1
+ VRSHR.U32 d1,d1,#2 /* (sum + 2) >> 2 */
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ VDUP.8 d0,d1[0]
+ B L0x58
+L0x108: /* neither neighbour: fill with 0x80 */
+ VMOV.I8 d0,#0x80
+ MOV r0,#0
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ B L0x58
+OMX_VC_4x4_DIAG_DL:
+ TST r7,#0x40 /* bit 6 selects between reading eight or four bytes through r1 (presumably above-right availability - confirm) */
+ BEQ L0x138
+ VLD1.8 {d3},[r1] /* eight samples above */
+ VDUP.8 d2,d3[7] /* last sample, used as padding */
+ VEXT.8 d4,d3,d2,#1 /* d3 shifted by one sample */
+ VEXT.8 d5,d3,d2,#2 /* d3 shifted by two samples */
+ B L0x14c
+L0x138: /* only four samples above: pad with the fourth one */
+ VLD1.32 {d0[1]},[r1]
+ VDUP.8 d2,d0[7]
+ VEXT.8 d3,d0,d2,#4
+ VEXT.8 d4,d0,d2,#5
+ VEXT.8 d5,d0,d2,#6
+L0x14c:
+ VHADD.U8 d6,d3,d5 /* (p[i] + p[i+2]) >> 1 */
+ VRHADD.U8 d6,d6,d4 /* rounding average with p[i+1]: three-tap smoothing */
+ VST1.32 {d6[0]},[r3],r5 /* row 0 = bytes 0-3 */
+ VEXT.8 d6,d6,d6,#1 /* rotate by one byte for the next row */
+ VST1.32 {d6[0]},[r3],r5
+ VEXT.8 d6,d6,d6,#1
+ VST1.32 {d6[0]},[r3],r5
+ VEXT.8 d6,d6,d6,#1
+ VST1.32 {d6[0]},[r3]
+ B L0x348
+OMX_VC_4x4_DIAG_DR:
+ VLD1.32 {d0[0]},[r1] /* four samples above into bytes 0-3 of d0 */
+ VLD1.8 {d1[7]},[r2] /* above-left sample */
+ ADD r9,r0,r4
+ ADD r10,r4,r4
+ ADD r1,r3,r5 /* r1, r4, r6 are reused as the addresses of rows 1, 2, 3 */
+ VLD1.8 {d1[6]},[r0],r10 /* left samples stored in reverse order below the above-left sample */
+ VLD1.8 {d1[5]},[r9],r10
+ VLD1.8 {d1[4]},[r0]
+ VLD1.8 {d1[3]},[r9]
+ VEXT.8 d3,d1,d0,#3 /* left[3],left[2],left[1],left[0],aboveLeft,above[0..2] */
+ ADD r4,r1,r5
+ VEXT.8 d4,d1,d0,#4 /* same sequence advanced by one */
+ ADD r6,r4,r5
+ VEXT.8 d5,d1,d0,#5 /* same sequence advanced by two */
+ VHADD.U8 d6,d3,d5
+ VRHADD.U8 d6,d6,d4 /* three-tap smoothing along the edge */
+ VST1.32 {d6[0]},[r6] /* rows are written bottom-up: row 3 first */
+ VEXT.8 d6,d6,d6,#1
+ VST1.32 {d6[0]},[r4]
+ VEXT.8 d6,d6,d6,#1
+ VST1.32 {d6[0]},[r1]
+ VEXT.8 d6,d6,d6,#1
+ VST1.32 {d6[0]},[r3]
+ B L0x348
+OMX_VC_4x4_VR: /* NOTE(review): lane shuffles below were not traced sample by sample */
+ VLD1.32 {d0[0]},[r1] /* four samples above */
+ VLD1.8 {d0[7]},[r2] /* above-left sample */
+ VLD1.8 {d1[7]},[r0],r4 /* first three left samples, one per row */
+ VLD1.8 {d2[7]},[r0],r4
+ VLD1.8 {d1[6]},[r0]
+ VEXT.8 d12,d0,d0,#7
+ VEXT.8 d3,d1,d12,#6
+ VEXT.8 d4,d2,d12,#7
+ VEXT.8 d5,d1,d0,#7
+ VEXT.8 d6,d2,d0,#7
+ VEXT.8 d11,d1,d12,#7
+ VHADD.U8 d8,d6,d12
+ VRHADD.U8 d8,d8,d11 /* three-tap smoothing */
+ VHADD.U8 d7,d3,d5
+ VRHADD.U8 d7,d7,d4 /* three-tap smoothing */
+ VEXT.8 d10,d8,d8,#1
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ VEXT.8 d9,d7,d7,#1
+ VST1.32 {d10[0]},[r3],r12 /* row 0 */
+ VST1.32 {d9[0]},[r11],r12 /* row 1 */
+ VST1.32 {d8[0]},[r3],r12 /* row 2 (the post-increment is unused) */
+ VST1.32 {d7[0]},[r11] /* row 3 */
+ B L0x348
+OMX_VC_4x4_HD: /* NOTE(review): lane shuffles below were not traced sample by sample */
+ VLD1.8 {d0},[r1] /* eight bytes are read through r1 here */
+ VLD1.8 {d1[7]},[r2] /* above-left sample */
+ ADD r9,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d1[6]},[r0],r10 /* left samples stored in reverse order below the above-left sample */
+ VLD1.8 {d1[5]},[r9],r10
+ VLD1.8 {d1[4]},[r0]
+ VLD1.8 {d1[3]},[r9]
+ VEXT.8 d3,d1,d0,#3
+ VEXT.8 d4,d1,d0,#2
+ VEXT.8 d5,d1,d0,#1
+ VHADD.U8 d7,d3,d5
+ VRHADD.U8 d7,d7,d4 /* three-tap smoothing */
+ VRHADD.U8 d8,d4,d3 /* two-tap rounding average */
+ VSHL.I64 d8,d8,#24
+ VSHL.I64 d6,d7,#16
+ VZIP.8 d8,d6 /* interleave the two-tap and three-tap results */
+ VEXT.8 d7,d7,d7,#6
+ VEXT.8 d8,d6,d7,#2
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ VST1.32 {d8[1]},[r3],r12 /* row 0 */
+ VST1.32 {d6[1]},[r11],r12 /* row 1 */
+ VST1.32 {d8[0]},[r3] /* row 2 */
+ VST1.32 {d6[0]},[r11] /* row 3 */
+ B L0x348
+OMX_VC_4x4_VL:
+ TST r7,#0x40 /* bit 6 selects between reading eight or four bytes through r1 (presumably above-right availability - confirm) */
+ BEQ L0x2b4
+ VLD1.8 {d3},[r1] /* eight samples above */
+ VEXT.8 d4,d3,d3,#1 /* rotated by one sample */
+ VEXT.8 d5,d4,d4,#1 /* rotated by two samples */
+ B L0x2c8
+L0x2b4: /* only four samples above: pad with the fourth one */
+ VLD1.32 {d0[1]},[r1]
+ VDUP.8 d2,d0[7]
+ VEXT.8 d3,d0,d2,#4
+ VEXT.8 d4,d0,d2,#5
+ VEXT.8 d5,d0,d2,#6
+L0x2c8:
+ VRHADD.U8 d7,d4,d3 /* two-tap rounding average */
+ VHADD.U8 d10,d3,d5
+ VRHADD.U8 d10,d10,d4 /* three-tap smoothing */
+ VEXT.8 d8,d7,d7,#1 /* two-tap result advanced by one */
+ ADD r11,r3,r5
+ ADD r12,r5,r5
+ VEXT.8 d9,d10,d8,#1 /* three-tap result advanced by one */
+ VST1.32 {d7[0]},[r3],r12 /* row 0 */
+ VST1.32 {d10[0]},[r11],r12 /* row 1 */
+ VST1.32 {d8[0]},[r3] /* row 2 */
+ VST1.32 {d9[0]},[r11] /* row 3 */
+ B L0x348
+OMX_VC_4x4_HU: /* uses the left column only */
+ ADD r9,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d1[4]},[r0],r10 /* four left samples into bytes 4-7 of d1 */
+ VLD1.8 {d1[5]},[r9],r10
+ VLD1.8 {d1[6]},[r0]
+ VLD1.8 {d1[7]},[r9]
+ VDUP.8 d2,d1[7] /* last left sample, used as padding */
+ VEXT.8 d3,d1,d2,#4 /* left[0..3] followed by padding */
+ VEXT.8 d4,d1,d2,#5 /* advanced by one */
+ VEXT.8 d5,d1,d2,#6 /* advanced by two */
+ VHADD.U8 d7,d3,d5
+ VRHADD.U8 d7,d7,d4 /* three-tap smoothing */
+ VRHADD.U8 d8,d4,d3 /* two-tap rounding average */
+ VZIP.8 d8,d7 /* interleave: two-tap, three-tap, two-tap, ... */
+ VST1.32 {d8[0]},[r3],r5 /* row 0 = interleaved bytes 0-3 */
+ VEXT.8 d8,d8,d8,#2 /* each following row starts two bytes later */
+ VST1.32 {d8[0]},[r3],r5
+ VEXT.8 d8,d8,d8,#2
+ VST1.32 {d8[0]},[r3],r5
+ VST1.32 {d7[0]},[r3] /* row 3 = upper half of the interleave */
+L0x348: /* common exit */
+ MOV r0,#0
+ VPOP {d8-d12}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S
new file mode 100644
index 0000000..28a89cb
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantChromaDCFromPair_s.S
@@ -0,0 +1,54 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global omxVCM4P10_TransformDequantChromaDCFromPair
+ .func omxVCM4P10_TransformDequantChromaDCFromPair
+omxVCM4P10_TransformDequantChromaDCFromPair: /* r0 = address of the stream pointer, r1 = four 16-bit coefficients (output), r2 = QP; unpacks, applies a 2x2 butterfly and scales; returns 0 */
+ push {r4-r10, lr}
+ ldr r9, [r0,#0] /* r9 = current read position in the packed stream */
+ vmov.i16 d0, #0
+ mov r8, #0x1f /* mask used to form the store offset */
+ vst1.16 {d0}, [r1] /* clear the four output coefficients */
+ ldrb r6, [r9], #1 /* first control byte */
+unpackLoop: /* one coefficient per pass; every conditional up to the second tst uses the flags of the first tst */
+ tst r6, #0x10 /* bit 4 set: value is two bytes, otherwise one signed byte */
+ ldrnesb r5, [r9, #1] /* high byte, sign-extended */
+ ldrneb r4, [r9], #2 /* low byte */
+ and r7, r8, r6, lsl #1 /* byte offset = 2 * (low four bits of the control byte); does not touch the flags */
+ ldreqsb r4, [r9], #1 /* one-byte value, sign-extended */
+ orrne r4, r4, r5, lsl #8
+ tst r6, #0x20 /* bit 5 set: this was the last coefficient */
+ ldreqb r6, [r9], #1 /* otherwise fetch the next control byte */
+ strh r4, [r1, r7] /* NOTE(review): the offset can reach 30 while only 8 bytes were cleared above - relies on a well-formed stream */
+ beq unpackLoop
+ ldmia r1, {r3, r4} /* r3 = c0 | c1 << 16, r4 = c2 | c3 << 16 */
+ str r9, [r0, #0] /* write the advanced stream pointer back */
+ ldr r5, =armVCM4P10_QPDivTable
+ ldr r6, =armVCM4P10_VMatrixQPModTable
+ saddsubx r3, r3, r3 /* top = c0 + c1, bottom = c0 - c1 */
+ saddsubx r4, r4, r4 /* top = c2 + c3, bottom = c2 - c3 */
+ ldrsb r9, [r5, r2] /* shift amount looked up by QP */
+ ldrsb r2, [r6, r2] /* multiplier looked up by QP */
+ sadd16 r5, r3, r4 /* halfword-wise sum of the two pairs */
+ ssub16 r6, r3, r4 /* halfword-wise difference of the two pairs */
+ lsl r2, r2, r9 /* scale = multiplier << shift */
+ vmov d0, r5, r6
+ vrev32.16 d0, d0 /* swap halfwords in each word so the sum terms come first */
+ vdup.16 d1, r2
+ vmull.s16 q1, d0, d1 /* widen to 32 bits while scaling */
+ vshrn.i32 d2, q1, #1 /* (value * scale) >> 1, narrowed back to 16 bits */
+ vst1.16 {d2}, [r1]
+ mov r0, #0
+ pop {r4-r10, pc}
+ .endfunc
+
+ .end
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S
new file mode 100644
index 0000000..a3a0715
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_TransformDequantLumaDCFromPair_s.S
@@ -0,0 +1,76 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+ .text
+
+ .global armVCM4P10_InvTransformDequantLumaDC4x4
+ .func armVCM4P10_InvTransformDequantLumaDC4x4
+armVCM4P10_InvTransformDequantLumaDC4x4: /* r0 = sixteen 16-bit coefficients (transformed in place), r1 = QP */
+ PUSH {r4-r6,lr}
+ VPUSH {d8-d13} /* q4-q6 are used as accumulators below */
+ VLD4.16 {d0,d1,d2,d3},[r0] /* de-interleaving load: d0 gets elements 0,4,8,12, d1 gets 1,5,9,13, and so on */
+ LDR r2, =armVCM4P10_QPDivTable
+ LDR r3, =armVCM4P10_VMatrixQPModTable
+ VADD.I16 d4,d0,d1 /* first butterfly pass */
+ VADD.I16 d5,d2,d3
+ VSUB.I16 d6,d0,d1
+ LDRSB r4,[r2,r1] /* shift amount looked up by QP */
+ VSUB.I16 d7,d2,d3
+ LDRSB r5,[r3,r1] /* multiplier looked up by QP */
+ VADD.I16 d0,d4,d5
+ VSUB.I16 d1,d4,d5
+ VSUB.I16 d2,d6,d7
+ LSL r5,r5,r4 /* scale = multiplier << shift */
+ VADD.I16 d3,d6,d7
+ VTRN.16 d0,d1 /* 4x4 transpose of the 16-bit elements */
+ VTRN.16 d2,d3
+ VTRN.32 q0,q1
+ VADD.I16 d4,d0,d1 /* second butterfly pass on the transposed data */
+ VADD.I16 d5,d2,d3
+ VSUB.I16 d6,d0,d1
+ VSUB.I16 d7,d2,d3
+ VADD.I16 d0,d4,d5
+ VSUB.I16 d1,d4,d5
+ VSUB.I16 d2,d6,d7
+ VADD.I16 d3,d6,d7
+ VDUP.16 d5,r5
+ VMOV.I32 q3,#0x2 /* rounding constant preloaded into each accumulator */
+ VMOV.I32 q4,#0x2
+ VMOV.I32 q5,#0x2
+ VMOV.I32 q6,#0x2
+ VMLAL.S16 q3,d0,d5 /* value * scale + 2, in 32 bits */
+ VMLAL.S16 q4,d1,d5
+ VMLAL.S16 q5,d2,d5
+ VMLAL.S16 q6,d3,d5
+ VSHRN.I32 d0,q3,#2 /* >> 2 and narrow back to 16 bits */
+ VSHRN.I32 d1,q4,#2
+ VSHRN.I32 d2,q5,#2
+ VSHRN.I32 d3,q6,#2
+ VST1.16 {d0,d1,d2,d3},[r0]
+ VPOP {d8-d13}
+ POP {r4-r6,pc}
+ .endfunc
+
+.global omxVCM4P10_TransformDequantLumaDCFromPair
+.func omxVCM4P10_TransformDequantLumaDCFromPair
+omxVCM4P10_TransformDequantLumaDCFromPair: /* unpack a coefficient block, then transform and dequantise it in place; returns 0 in r0 */
+ PUSH {r4-r6,lr}
+ MOV r4,r1 /* keep the second argument across the call */
+ MOV r5,r2 /* keep the third argument across the call */
+ BL armVCM4P10_UnpackBlock4x4 /* called with r0 and r1 as received */
+ MOV r0,r4 /* saved second argument becomes the data pointer */
+ MOV r1,r5 /* saved third argument becomes the QP */
+ BL armVCM4P10_InvTransformDequantLumaDC4x4
+ MOV r0,#0
+ POP {r4-r6,pc}
+ .endfunc
+
+ .end
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
new file mode 100755
index 0000000..74b5505
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_Huff_Tables_VLC.h
@@ -0,0 +1,37 @@
+/**
+ *
+ * File Name: armVCM4P2_Huff_Tables_VLC.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ *
+ * File: armVCM4P2_Huff_Tables_VLC.h
+ * Description: Declares tables used for Huffman coding and decoding
+ * in MP4P2 codec.
+ *
+ */
+
+#ifndef _OMXHUFFTAB_H_
+#define _OMXHUFFTAB_H_
+
+/* VLC code table for intra blocks (defined in another translation unit). */
+extern const OMX_U16 armVCM4P2_IntraVlcL0L1[200];
+
+/* VLC code table for inter blocks. */
+extern const OMX_U16 armVCM4P2_InterVlcL0L1[200];
+/* Index table shared by the intra DC luma and chroma decoders, and the motion-vector-difference VLC table. */
+extern const OMX_U16 armVCM4P2_aIntraDCLumaChromaIndex[64];
+//extern const OMX_U16 armVCM4P2_aIntraDCChromaIndex[32];
+extern const OMX_U16 armVCM4P2_aVlcMVD[124];
+/* LMAX / RMAX lookup tables for inter and intra blocks. */
+extern const OMX_U8 armVCM4P2_InterL0L1LMAX[73];
+extern const OMX_U8 armVCM4P2_InterL0L1RMAX[35];
+extern const OMX_U8 armVCM4P2_IntraL0L1LMAX[53];
+extern const OMX_U8 armVCM4P2_IntraL0L1RMAX[40];
+
+#endif /* _OMXHUFFTAB_H_ */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
new file mode 100755
index 0000000..e95203a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/api/armVCM4P2_ZigZag_Tables.h
@@ -0,0 +1,25 @@
+/**
+ *
+ * File Name: armVCM4P2_ZigZag_Tables.h
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ *
+ * File: armVCM4P2_ZigZag_Tables.h
+ * Description: Declares Tables used for Zigzag scan in MP4P2 codec.
+ *
+ */
+
+#ifndef _OMXZIGZAGTAB_H
+#define _OMXZIGZAGTAB_H
+/* Single 192-entry table; the two 64-entry declarations below are commented out (presumably folded into it - confirm). */
+extern const OMX_U8 armVCM4P2_aClassicalZigzagScan [192];
+//extern const OMX_U8 armVCM4P2_aHorizontalZigzagScan [64];
+//extern const OMX_U8 armVCM4P2_aVerticalZigzagScan [64];
+
+#endif /* _OMXZIGZAGTAB_H */
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s
new file mode 100755
index 0000000..95fe6d2
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Clip8_s.s
@@ -0,0 +1,82 @@
+; /**
+; *
+; * File Name: armVCM4P2_Clip8_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains module for Clipping 16 bit value to [0,255] Range
+; */
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+;//Input Arguments
+
+pSrc RN 0 ;// source pointer, advanced by the loads
+pDst RN 1 ;// destination pointer, advanced by step after each store
+step RN 2 ;// byte distance between destination rows
+
+;// Neon Registers
+
+qx0 QN Q0.S16 ;// qxN and its two halves dxN0/dxN1 name the same registers
+dx00 DN D0.S16
+dx01 DN D1.S16
+qx1 QN Q1.S16
+dx10 DN D2.S16
+dx11 DN D3.S16
+
+qx2 QN Q2.S16
+dx20 DN D4.S16
+dx21 DN D5.S16
+qx3 QN Q3.S16
+dx30 DN D6.S16
+dx31 DN D7.S16
+
+
+dclip0 DN D0.U8 ;// each clipped row overwrites the low half of its own source register
+dclip1 DN D2.U8
+dclip2 DN D4.U8
+dclip3 DN D6.U8
+
+ M_START armVCM4P2_Clip8 ;// reads 64 signed 16-bit values from pSrc and writes 8 rows of 8 clipped bytes to pDst
+
+ VLD1 {dx00,dx01,dx10,dx11},[pSrc]! ;// Load 16 entries from pSrc
+ VLD1 {dx20,dx21,dx30,dx31},[pSrc]! ;// Load next 16 entries from pSrc
+ VQSHRUN dclip0,qx0,#0 ;// dclip0[i]=clip qx0[i] to [0,255]
+ VQSHRUN dclip1,qx1,#0 ;// dclip1[i]=clip qx1[i] to [0,255]
+ VST1 {dclip0},[pDst],step ;// store 8 bytes and pDst=pDst+step
+ VST1 {dclip1},[pDst],step ;// store 8 bytes and pDst=pDst+step
+ VQSHRUN dclip2,qx2,#0 ;// dclip2[i]=clip qx2[i] to [0,255]
+ VQSHRUN dclip3,qx3,#0 ;// dclip3[i]=clip qx3[i] to [0,255]
+ VST1 {dclip2},[pDst],step ;// store 8 bytes and pDst=pDst+step
+ VST1 {dclip3},[pDst],step ;// store 8 bytes and pDst=pDst+step
+
+ VLD1 {dx00,dx01,dx10,dx11},[pSrc]! ;// Load 16 entries from pSrc
+ VLD1 {dx20,dx21,dx30,dx31},[pSrc]! ;// Load next 16 entries from pSrc
+ VQSHRUN dclip0,qx0,#0 ;// dclip0[i]=clip qx0[i] to [0,255]
+ VQSHRUN dclip1,qx1,#0 ;// dclip1[i]=clip qx1[i] to [0,255]
+ VST1 {dclip0},[pDst],step ;// store 8 bytes and pDst=pDst+step
+ VST1 {dclip1},[pDst],step ;// store 8 bytes and pDst=pDst+step
+ VQSHRUN dclip2,qx2,#0 ;// dclip2[i]=clip qx2[i] to [0,255]
+ VQSHRUN dclip3,qx3,#0 ;// dclip3[i]=clip qx3[i] to [0,255]
+ VST1 {dclip2},[pDst],step ;// store 8 bytes and pDst=pDst+step
+ VST1 {dclip3},[pDst],step ;// store 8 bytes and pDst=pDst+step
+
+
+
+ M_END
+ ENDIF
+
+
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
new file mode 100755
index 0000000..e4a7f33
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
@@ -0,0 +1,398 @@
+;/**
+; *
+; * File Name: armVCM4P2_DecodeVLCZigzag_AC_unsafe_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains modules for zigzag scanning and VLC decoding
+; * for inter, intra block.
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_AC_unsafe
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan
+; *
+; *
+; *
+; *
+; */
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armCOMM_BitDec_s.h
+
+
+ M_VARIANTS ARM1136JS
+
+
+
+
+
+ IF ARM1136JS
+;// armVCM4P2_DecodeVLCZigzag_AC_unsafe: loops over (Last, Run, Level) symbols read
+;// from the bitstream and stores each level at pDst + zigzagTable[Count], where
+;// Count advances by Run + 1 per symbol. The VLC, LMAX, RMAX and zigzag table
+;// addresses are taken from the stack (see the M_ARG lines below).
+
+;//Input Arguments
+
+ppBitStream RN 0
+pBitOffset RN 1
+pDst RN 2
+shortVideoHeader RN 3
+
+
+;//Local Variables
+;// Several names below share one register, so their live ranges must not overlap
+Return RN 0
+
+pVlcTableL0L1 RN 4
+pLMAXTableL0L1 RN 4
+pRMAXTableL0L1 RN 4
+pZigzagTable RN 4
+
+ftype RN 0
+temp3 RN 4
+temp RN 5
+Count RN 6
+Escape RN 5
+
+;// armVCM4P2_FillVLDBuffer
+zigzag RN 0
+storeLevel RN 1
+temp2 RN 4
+temp1 RN 5
+sign RN 5
+Last RN 7
+storeRun RN 14
+
+
+packRetIndex RN 5
+
+
+markerbit RN 5
+
+;// Scratch Registers
+
+RBitStream RN 8
+RBitBuffer RN 9
+RBitCount RN 10
+
+T1 RN 11
+T2 RN 12
+LR RN 14
+
+;// Stack slots for the two input pointers (r0/r1 are reused as ftype/zigzag/Return and
+;// storeLevel) and for the link register (r14 doubles as storeRun)
+ M_ALLOC4 pppBitStream,4
+ M_ALLOC4 ppOffset,4
+ M_ALLOC4 pLinkRegister,4
+
+ M_START armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+ ;// get the table addresses from stack
+ M_ARG ppVlcTableL0L1,4
+ M_ARG ppLMAXTableL0L1,4
+ M_ARG ppRMAXTableL0L1,4
+ M_ARG ppZigzagTable,4
+
+ ;// Store ALL zeros at pDst
+ ;// NOTE(review): Count (r6) is read below but never written before its first use in this routine; presumably the caller sets the start index - confirm
+ MOV temp1,#0 ;// temp1 (r5) = 0, one of the four zero registers stored by the STMs below (Count is r6 and is not touched here)
+ MOV Last,#0
+ M_STR LR,pLinkRegister ;// Store Link Register on Stack
+ MOV temp2,#0
+ MOV LR,#0
+
+ ;// Initialize the bit-decoder macros, interleaved with 8 STMs of four zero registers each (8 x 16 = 128 bytes cleared at pDst)
+
+ STM pDst!,{temp2,temp1,Last,LR}
+ M_BD_INIT0 ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount
+ STM pDst!,{temp2,temp1,Last,LR}
+ M_BD_INIT1 T1, T2, T2
+ STM pDst!,{temp2,temp1,Last,LR}
+ M_BD_INIT2 T1, T2, T2
+ STM pDst!,{temp2,temp1,Last,LR}
+ M_STR ppBitStream,pppBitStream ;// Store ppBitstream on stack
+ STM pDst!,{temp2,temp1,Last,LR}
+ M_STR pBitOffset,ppOffset ;// Store pBitOffset on stack
+ STM pDst!,{temp2,temp1,Last,LR}
+
+ STM pDst!,{temp2,temp1,Last,LR}
+ STM pDst!,{temp2,temp1,Last,LR}
+
+
+ SUB pDst,pDst,#128 ;// Restore pDst (undo the 8 x 16-byte write-back above)
+
+ ;// The armVCM4P2_GetVLCBits begins
+
+getVLCbits
+
+ M_BD_LOOK8 Escape,7 ;// Load Escape Value
+ LSR Escape,Escape,#25 ;// keep only the top 7 bits of the 32-bit value
+ CMP Escape,#3 ;// check for escape mode
+ MOVNE ftype,#0
+ BNE notEscapemode ;// No escape code: decode a plain VLC symbol with ftype = 0
+
+ M_BD_VSKIP8 #7,T1
+ CMP shortVideoHeader,#0 ;// Check shortVideoHeader flag to know the type of Escape mode
+ BEQ endFillVLD
+
+ ;// Escape Mode 4 (taken when shortVideoHeader is non-zero): fixed-length Last(1), Run(6), Level(8)
+
+ M_BD_READ8 Last,1,T1
+ M_BD_READ8 storeRun,6,T1
+ M_BD_READ8 storeLevel,8,T1
+
+
+ ;// Check whether the Reserved values for Level are used and Exit with an Error Message if it is so
+
+ TEQ storeLevel,#0
+ TEQNE storeLevel,#128
+ BEQ ExitError
+
+ ADD temp2,storeRun,Count
+ CMP temp2,#64
+ BGE ExitError ;// error if Count+storeRun >= 64
+
+
+ ;// Load address of zigzagTable
+
+ M_LDR pZigzagTable,ppZigzagTable ;// Loading the Address of Zigzag table
+
+
+ ;// armVCM4P2_FillVLDBuffer
+
+ SXTB storeLevel,storeLevel ;// Sign Extend storeLevel to 32 bits
+
+
+ ;// To Reflect Runlength
+
+ ADD Count,Count,storeRun
+ LDRB zigzag,[pZigzagTable,Count]
+ ADD Count,Count,#1
+ STRH storeLevel,[pDst,zigzag] ;// store Level
+ ;// NOTE(review): this path leaves the loop after one symbol whatever Last is; ExitOk then returns OMX_Sts_Err when Last != 1 - confirm this is intended
+ B ExitOk
+
+
+
+endFillVLD
+
+
+ ;// Load Ftype( Escape Mode) value based on the two successive bits in the bitstream
+ ;// first bit 0 -> ftype 1; bits 1,0 -> ftype 2; bits 1,1 -> ftype 3
+ M_BD_READ8 temp1,1,T1
+ CMP temp1,#0
+ MOVEQ ftype,#1
+ BEQ notEscapemode
+ M_BD_READ8 temp1,1,T1
+ CMP temp1,#1
+ MOVEQ ftype,#3
+ MOVNE ftype,#2
+
+
+notEscapemode
+
+ ;// Load optimized packed VLC table with last=0 and Last=1
+
+ M_LDR pVlcTableL0L1,ppVlcTableL0L1 ;// Load Combined VLC Table
+
+
+ CMP ftype,#3 ;// If ftype >= 3, perform Fixed Length Decoding (Escape Mode 3)
+ BGE EscapeMode3 ;// Else continue normal VLC Decoding
+
+ ;// Variable length decoding, "armUnPackVLC32"
+
+
+ M_BD_VLD packRetIndex,T1,T2,pVlcTableL0L1,4,2
+
+
+ LDR temp3,=0xFFF
+
+ CMP packRetIndex,temp3 ;// Check for invalid symbol
+ BEQ ExitError ;// if invalid symbol occurs exit with an error message
+
+ AND Last,packRetIndex,#2 ;// Get Last from packed Index (bit 1, so Last is 0 or 2 from here until storeLevelL1)
+
+
+
+
+ LSR storeRun,packRetIndex,#7 ;// Get Run Value from Packed index
+ AND storeLevel,packRetIndex,#0x7c ;// storeLevel=packRetIndex[2-6],storeLevel[0-1]=0
+
+
+ M_LDR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Load LMAX table
+
+
+ LSR storeLevel,storeLevel,#2 ;// Level value
+
+ CMP ftype,#1
+ BNE ftype2
+
+ ;// ftype==1; Escape mode =1 : Level = Level + LMAX[Last][Run]
+
+
+ ADD temp1, pLMAXTableL0L1, Last, LSL#4 ;// Last bit set (Last = 2 here): 2 << 4 adds 32 to the table address
+ LDRB temp1,[temp1,storeRun]
+
+
+ ADD storeLevel,temp1,storeLevel
+
+ftype2
+
+ ;// ftype =2; Escape mode =2 : Run = Run + RMAX[Last][Level - 1] + 1
+
+ M_LDR pRMAXTableL0L1,ppRMAXTableL0L1 ;// Load RMAX Table
+
+ CMP ftype,#2
+ BNE FillVLDL1
+
+ ADD temp1, pRMAXTableL0L1, Last, LSL#4 ;// Last bit set (Last = 2 here): 2 << 4 adds 32 to the table address
+ SUB temp2,storeLevel,#1
+ LDRB temp1,[temp1,temp2]
+
+
+ ADD storeRun,storeRun,#1
+ ADD storeRun,temp1
+
+FillVLDL1
+
+
+ ;// armVCM4P2_FillVLDBuffer
+
+ M_LDR pZigzagTable,ppZigzagTable ;// Load address of zigzagTable
+
+ M_BD_READ8 sign,1,T1
+
+ CMP sign,#1
+ RSBEQ storeLevel,storeLevel,#0 ;// sign bit set: negate the level
+
+ ADD temp1,storeRun,Count ;// Exit with an error message if Run + Count exceeds 63
+ CMP temp1,#64
+ BGE ExitError
+
+
+
+
+
+
+ ;// To Reflect Runlength
+
+ ADD Count,Count,storeRun
+
+storeLevelL1
+
+ LDRB zigzag,[pZigzagTable,Count]
+ CMP Last,#2 ;// Check if the Level val is Last non zero val (flags are consumed by the BNE below)
+ ADD Count,Count,#1
+ LSR Last,Last,#1 ;// bring Last down to 0/1, the form ExitOk tests; LSR without S keeps the CMP flags
+ STRH storeLevel,[pDst,zigzag]
+
+ BNE end
+
+ B ExitOk
+
+
+
+ ;// Fixed Length Decoding Escape Mode 3: Last(1), Run(6), marker(1), Level(12), marker(1)
+
+EscapeMode3
+
+ M_BD_READ8 Last,1,T1
+ M_BD_READ8 storeRun,6,T1
+
+ ADD temp2,storeRun,Count ;// Exit with an error message if Run + Count exceeds 63
+ CMP temp2,#64
+ BGE ExitError
+
+ M_BD_READ8 markerbit,1,T1
+ TEQ markerbit,#0 ;// Exit with an error message if marker bit is zero
+ BEQ ExitError
+
+ M_BD_READ16 storeLevel,12,T1
+
+ TST storeLevel,#0x800 ;// test if the level is negative
+ SUBNE storeLevel,storeLevel,#4096 ;// sign-extend the 12-bit value
+ CMP storeLevel,#0
+ CMPNE storeLevel,#-2048
+ BEQ ExitError ;// Exit with an error message if Level==0 or -2048
+
+ M_LDR pZigzagTable,ppZigzagTable ;// Load address of zigzagTable
+
+ M_BD_READ8 markerbit,1,T1 ;// second marker bit is read but its value is not checked
+
+
+ ;// armVCM4P2_FillVLDBuffer ( Sign not used as storeLevel is preprocessed)
+
+
+
+ ;// To Reflect Run Length
+
+ ADD Count,Count,storeRun
+
+
+
+storeLevelLast
+
+ LDRB zigzag,[pZigzagTable,Count]
+ CMP Last,#1
+ ADD Count,Count,#1
+ STRH storeLevel,[pDst,zigzag]
+
+ BNE end
+
+ B ExitOk
+
+end
+
+ CMP Count,#64 ;// Run the loop until Count reaches 64
+
+ BLT getVLCbits
+ ;// Falling through with Count >= 64 and Last != 1 makes ExitOk return OMX_Sts_Err
+
+ExitOk
+ ;// Common exit for the decode paths: returns OMX_Sts_NoErr only when Last == 1, otherwise OMX_Sts_Err
+
+ ;// Loading ppBitStream and pBitOffset from stack
+ ;// NOTE(review): the MOVEQ/MOVNE below rely on the flags of this CMP; presumably M_LDR and M_BD_FINI leave the flags untouched - confirm
+ CMP Last,#1
+ M_LDR ppBitStream,pppBitStream
+ M_LDR pBitOffset,ppOffset
+
+ ;//Ending the macro
+
+ M_BD_FINI ppBitStream,pBitOffset
+
+ MOVEQ Return,#OMX_Sts_NoErr
+ MOVNE Return,#OMX_Sts_Err
+ M_LDR LR,pLinkRegister ;// Load the Link Register Back
+ B exit2
+
+ExitError
+ ;// Exit When an Error occurs
+
+ M_LDR ppBitStream,pppBitStream
+ M_LDR pBitOffset,ppOffset
+ ;//Ending the macro
+
+ M_BD_FINI ppBitStream,pBitOffset
+ M_LDR LR,pLinkRegister
+ MOV Return,#OMX_Sts_Err
+
+exit2
+
+
+ M_END
+ ENDIF
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
new file mode 100755
index 0000000..38af975
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Huff_Tables_VLC.c
@@ -0,0 +1,211 @@
+ /**
+ *
+ * File Name: armVCM4P2_Huff_Tables_VLC.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armVCM4P2_Huff_Tables_VLC.c
+ * Description: Contains all the Huffman tables used in MPEG4 codec
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+
+#include "armCOMM_Bitstream.h"
+
+
+
+
+// Contains optimized and Packed VLC tables with Last=0 and Last=1
+
+// optimized Packed VLC table Entry Format
+// ---------------------------------------
+//
+// 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00
+// +------------------------------------------------+
+// | Len | Run | Level |L | 1 |
+// +------------------------------------------------+
+// | Offset | 0 |
+// +------------------------------------------------+
+// If the table entry is a leaf entry then bit 0 set:
+// Len = Number of bits overread (0 to 7) 3 bits
+// Run = RunLength of the Symbol (0 to 63) 6 bits
+// Level = Level of the Symbol (0 to 31) 5 bits
+// L = Last Value of the Symbol (0 or 1) 1 bit
+//
+// If the table entry is an internal node then bit 0 is clear:
+// Offset = Number of (16-bit) half words from the table
+// start to the next table node
+//
+// The table is accessed by successive lookups on the
+// next Step bits of the input bitstream until a leaf node
+// is obtained. The Step sizes are supplied to the VLD macro.
+
+// The VLC tables used for intra and non-intra coefficients in non-escape mode
+// contain symbols with both Last=0 and Last=1.
+// If a symbol is not found in the table it will be coded as 0xFFF
+
+
+const OMX_U16 armVCM4P2_InterVlcL0L1[200] = { /* packed VLC lookup table for inter coefficients; Last=0 and Last=1 symbols share the one table */
+ 0x0020, 0x0108, 0x0148, 0x0170, 0x0178, 0x0180, 0x0188, 0x1b09,
+ 0x4009, 0x4009, 0x4009, 0x4009, 0x2109, 0x2109, 0x0209, 0x0011,
+ 0x0028, 0x0060, 0x00b8, 0x00e0, 0x0030, 0x0048, 0x0050, 0x0058,
+ 0x3fff, 0x3fff, 0x0038, 0x0040, 0x2115, 0x2115, 0x201d, 0x201d,
+ 0x2059, 0x2059, 0x2051, 0x2051, 0x1c0d, 0x1b0d, 0x1a0d, 0x190d,
+ 0x0911, 0x0811, 0x0711, 0x0611, 0x0511, 0x0319, 0x0219, 0x0121,
+ 0x0068, 0x0090, 0x3fff, 0x3fff, 0x0070, 0x0078, 0x0080, 0x0088,
+ 0x2061, 0x2061, 0x2129, 0x2129, 0x3709, 0x3709, 0x3809, 0x3809,
+ 0x3d0d, 0x3d0d, 0x3e0d, 0x3e0d, 0x3f0d, 0x3f0d, 0x200d, 0x200d,
+ 0x0098, 0x00a0, 0x00a8, 0x00b0, 0x0131, 0x0221, 0x0419, 0x0519,
+ 0x0619, 0x0a11, 0x1909, 0x1a09, 0x210d, 0x220d, 0x230d, 0x240d,
+ 0x250d, 0x260d, 0x270d, 0x280d, 0x00c0, 0x00c8, 0x00d0, 0x00d8,
+ 0x0049, 0x0041, 0x380d, 0x380d, 0x370d, 0x370d, 0x360d, 0x360d,
+ 0x350d, 0x350d, 0x340d, 0x340d, 0x330d, 0x330d, 0x320d, 0x320d,
+ 0x00e8, 0x00f0, 0x00f8, 0x0100, 0x310d, 0x310d, 0x2015, 0x2015,
+ 0x3609, 0x3609, 0x3509, 0x3509, 0x3409, 0x3409, 0x3309, 0x3309,
+ 0x3209, 0x3209, 0x3109, 0x3109, 0x0110, 0x0130, 0x0138, 0x0140,
+ 0x0118, 0x0120, 0x0128, 0x100d, 0x3009, 0x3009, 0x2f09, 0x2f09,
+ 0x2411, 0x2411, 0x2311, 0x2311, 0x2039, 0x2039, 0x2031, 0x2031,
+ 0x0f0d, 0x0e0d, 0x0d0d, 0x0c0d, 0x0b0d, 0x0a0d, 0x090d, 0x0e09,
+ 0x0d09, 0x0211, 0x0119, 0x0029, 0x0150, 0x0158, 0x0160, 0x0168,
+ 0x280d, 0x280d, 0x270d, 0x270d, 0x260d, 0x260d, 0x250d, 0x250d,
+ 0x2c09, 0x2c09, 0xb759, 0xb759, 0x2a09, 0x2a09, 0x2021, 0x2021,
+ 0x040d, 0x030d, 0x0b35, 0x010d, 0x0909, 0x0809, 0x0709, 0x0609,
+ 0x0111, 0x0019, 0x2509, 0x2509, 0x2409, 0x2409, 0x2309, 0x2309
+};
+
+
+const OMX_U16 armVCM4P2_IntraVlcL0L1[200] = { /* packed VLC lookup table for intra coefficients; Last=0 and Last=1 symbols share the one table */
+ 0x0020, 0x0108, 0x0148, 0x0170, 0x0178, 0x0180, 0x0188, 0x0f09,
+ 0x4009, 0x4009, 0x4009, 0x4009, 0x2011, 0x2011, 0x0109, 0x0019,
+ 0x0028, 0x0060, 0x00b8, 0x00e0, 0x0030, 0x0048, 0x0050, 0x0058,
+ 0x3fff, 0x3fff, 0x0038, 0x0040, 0x203d, 0x203d, 0x2035, 0x2035,
+ 0x20b1, 0x20b1, 0x20a9, 0x20a9, 0x0215, 0x011d, 0x002d, 0x0d09,
+ 0x0519, 0x0811, 0x0419, 0x0321, 0x0221, 0x0139, 0x00a1, 0x0099,
+ 0x0068, 0x0090, 0x3fff, 0x3fff, 0x0070, 0x0078, 0x0080, 0x0088,
+ 0x20b9, 0x20b9, 0x20c1, 0x20c1, 0x2141, 0x2141, 0x2911, 0x2911,
+ 0x2315, 0x2315, 0x2415, 0x2415, 0x2f0d, 0x2f0d, 0x300d, 0x300d,
+ 0x0098, 0x00a0, 0x00a8, 0x00b0, 0x00c9, 0x00d1, 0x00d9, 0x0149,
+ 0x0619, 0x0151, 0x0229, 0x0719, 0x0e09, 0x0045, 0x0515, 0x0615,
+ 0x110d, 0x120d, 0x130d, 0x140d, 0x00c0, 0x00c8, 0x00d0, 0x00d8,
+ 0x0091, 0x0089, 0x2e0d, 0x2e0d, 0x2d0d, 0x2d0d, 0x2c0d, 0x2c0d,
+ 0x2b0d, 0x2b0d, 0x2a0d, 0x2a0d, 0x2115, 0x2115, 0x2025, 0x2025,
+ 0x00e8, 0x00f0, 0x00f8, 0x0100, 0x2c09, 0x2c09, 0x2b09, 0x2b09,
+ 0x2711, 0x2711, 0x2611, 0x2611, 0x2511, 0x2511, 0x2319, 0x2319,
+ 0x2219, 0x2219, 0x2131, 0x2131, 0x0110, 0x0130, 0x0138, 0x0140,
+ 0x0118, 0x0120, 0x0128, 0x080d, 0x2129, 0x2129, 0x2081, 0x2081,
+ 0x2411, 0x2411, 0x2079, 0x2079, 0x2071, 0x2071, 0x2069, 0x2069,
+ 0x1bb5, 0x060d, 0x001d, 0xd3f9, 0x0909, 0x0809, 0x090d, 0x0311,
+ 0x0121, 0x0061, 0x0059, 0x0051, 0x0150, 0x0158, 0x0160, 0x0168,
+ 0x240d, 0x240d, 0x230d, 0x230d, 0x2609, 0x2609, 0x250d, 0x250d,
+ 0x2709, 0x2709, 0x2211, 0x2211, 0x2119, 0x2119, 0x2049, 0x2049,
+ 0x0015, 0x0509, 0x020d, 0x010d, 0x0409, 0x0309, 0x0041, 0x0039,
+ 0x0111, 0x0031, 0x2209, 0x2209, 0x2029, 0x2029, 0x2021, 0x2021
+};
+
+const OMX_U16 armVCM4P2_aIntraDCLumaChromaIndex[64] = { /* two 32-entry VLC lookup tables back to back; presumably luma first, per the name - confirm against the decoder */
+ 0x0020, 0x000b, 0x2009, 0x2009, 0x2007, 0x2007, 0x2001, 0x2001,
+ 0x4005, 0x4005, 0x4005, 0x4005, 0x4003, 0x4003, 0x4003, 0x4003,
+ 0x0028, 0x000f, 0x200d, 0x200d, 0x0030, 0x0013, 0x2011, 0x2011,
+ 0x0038, 0x0017, 0x2015, 0x2015, 0x3fff, 0x3fff, 0x2019, 0x2019,
+ /* second 32-entry table starts at index 32 (presumably chroma - confirm) */
+ 0x0020, 0x0009, 0x2007, 0x2007, 0x4005, 0x4005, 0x4005, 0x4005,
+ 0x4003, 0x4003, 0x4003, 0x4003, 0x4001, 0x4001, 0x4001, 0x4001,
+ 0x0028, 0x000d, 0x200b, 0x200b, 0x0030, 0x0011, 0x200f, 0x200f,
+ 0x0038, 0x0015, 0x2013, 0x2013, 0x1fff, 0x0019, 0x2017, 0x2017
+};
+
+
+const OMX_U16 armVCM4P2_aVlcMVD[124] = { /* VLC lookup table, presumably for motion vector differences (per the name); entry layout is not described in this file - confirm against the decoder */
+ 0x0010, 0x00f0, 0x0043, 0x003f, 0x4041, 0x4041, 0x4041, 0x4041,
+ 0x0018, 0x00d8, 0x0047, 0x003b, 0x0020, 0x0080, 0x00a8, 0x00d0,
+ 0x0028, 0x0048, 0x0070, 0x0078, 0x1fff, 0x0030, 0x0038, 0x0040,
+ 0x0081, 0x0001, 0x007f, 0x0003, 0x207d, 0x207d, 0x2005, 0x2005,
+ 0x207b, 0x207b, 0x2007, 0x2007, 0x0050, 0x0058, 0x0060, 0x0068,
+ 0x2079, 0x2079, 0x2009, 0x2009, 0x2077, 0x2077, 0x200b, 0x200b,
+ 0x2075, 0x2075, 0x200d, 0x200d, 0x2073, 0x2073, 0x200f, 0x200f,
+ 0x0071, 0x0011, 0x006f, 0x0013, 0x006d, 0x0015, 0x006b, 0x0017,
+ 0x0088, 0x0090, 0x0098, 0x00a0, 0x0069, 0x0019, 0x0067, 0x001b,
+ 0x0065, 0x001d, 0x0063, 0x001f, 0x0061, 0x0021, 0x005f, 0x0023,
+ 0x005d, 0x0025, 0x005b, 0x0027, 0x00b0, 0x00b8, 0x00c0, 0x00c8,
+ 0x0059, 0x0029, 0x0057, 0x002b, 0x2055, 0x2055, 0x202d, 0x202d,
+ 0x2053, 0x2053, 0x202f, 0x202f, 0x2051, 0x2051, 0x2031, 0x2031,
+ 0x204f, 0x204f, 0x2033, 0x2033, 0x00e0, 0x00e8, 0x0049, 0x0039,
+ 0x204d, 0x204d, 0x2035, 0x2035, 0x204b, 0x204b, 0x2037, 0x2037,
+ 0x2045, 0x2045, 0x203d, 0x203d
+};
+
+/* LMAX table for Inter blocks (Last == 0 entries followed by Last == 1 entries).
+ Indexed by run: the AC decoder reads its LMAX table at [base + run].
+ Entries [27-31] are zero padding so that the Last=1 entries start at index 32,
+ which is the offset the decoder adds when the Last bit is set.
+*/
+const OMX_U8 armVCM4P2_InterL0L1LMAX[73] =
+{
+ 12, 6, 4, 3, 3, 3, 3, 2,
+ 2, 2, 2, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 0, 0, 0, 0, 0,
+ 3, 2, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1
+};
+
+/* RMAX table for Inter blocks (Last == 0 entries followed by Last == 1 entries).
+ Indexed by Level - 1.
+ Entries [12-31] are zero padding so that the Last=1 entries start at index 32. */
+
+
+const OMX_U8 armVCM4P2_InterL0L1RMAX[35] =
+{
+ 26, 10, 6, 2, 1, 1,
+ 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0, 40, 1, 0
+};
+
+/* LMAX table for Intra blocks (Last == 0 entries followed by Last == 1 entries).
+ Indexed by run: the AC decoder reads its LMAX table at [base + run].
+ Entries [15-31] are zero padding so that the Last=1 entries start at index 32,
+ which is the offset the decoder adds when the Last bit is set.
+*/
+const OMX_U8 armVCM4P2_IntraL0L1LMAX[53] =
+{
+ 27, 10, 5, 4, 3, 3, 3,
+ 3, 2, 2, 1, 1, 1, 1, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+ 8, 3, 2, 2, 2, 2, 2, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+};
+
+
+/* RMAX table for Intra blocks (Last == 0 entries followed by Last == 1 entries).
+ Indexed by Level - 1.
+ Entries [27-31] are zero padding so that the Last=1 entries start at index 32. */
+
+
+const OMX_U8 armVCM4P2_IntraL0L1RMAX[40] =
+{
+ 14, 9, 7, 3, 2, 1, 1,
+ 1, 1, 1, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0,
+
+ 20, 6, 1, 0, 0, 0, 0, 0
+
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c
new file mode 100755
index 0000000..6948f80
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Lookup_Tables.c
@@ -0,0 +1,75 @@
+ /**
+ *
+ * File Name: armVCM4P2_Lookup_Tables.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armVCM4P2_Lookup_Tables.c
+ * Description: Contains all the Lookup tables used in MPEG4 codec
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+
+ /* * Table Entries contain Dc Scaler values; the first 32 entries follow one rule set, the last 32 another
+ * armVCM4P2_DCScaler[i]= 8 for i=1 to 4 and i=33 to 36
+ * = 2*i for i=5 to 8
+ * = i+8 for i=9 to 24
+ * = 2*i-16 for i=25 to 31
+ * = (i-32+13)/2 for i=37 to 56
+ * = i-6-32 for i=57 to 63
+ * = 255 for i=0 and i=32
+ */
+
+const OMX_U8 armVCM4P2_DCScaler[64]={
+ 0xff, 0x8, 0x8, 0x8, 0x8, 0xa, 0xc, 0xe,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e,
+ 0xff, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa,
+ 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe,
+ 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12,
+ 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+
+};
+
+
+ /* Table entries contain the reciprocals of 1 to 63 scaled by 32767
+ * armVCM4P2_Reciprocal_QP_S16[i]=round(32767/i) for i=1 to 63
+ * armVCM4P2_Reciprocal_QP_S16[0]= 0 (placeholder: 0 has no reciprocal)
+ */
+
+const OMX_S16 armVCM4P2_Reciprocal_QP_S16[64]={
+ 0x0000,0x7fff,0x4000,0x2aaa,0x2000,0x1999,0x1555,0x1249,
+ 0x1000,0x0e39,0x0ccd,0x0ba3,0x0aab,0x09d9,0x0925,0x0888,
+ 0x0800,0x0787,0x071c,0x06bd,0x0666,0x0618,0x05d1,0x0591,
+ 0x0555,0x051f,0x04ec,0x04be,0x0492,0x046a,0x0444,0x0421,
+ 0x0400,0x03e1,0x03c4,0x03a8,0x038e,0x0376,0x035e,0x0348,
+ 0x0333,0x031f,0x030c,0x02fa,0x02e9,0x02d8,0x02c8,0x02b9,
+ 0x02ab,0x029d,0x028f,0x0282,0x0276,0x026a,0x025f,0x0254,
+ 0x0249,0x023f,0x0235,0x022b,0x0222,0x0219,0x0211,0x0208
+
+};
+
+ /* Table entries contain the reciprocals of 1 to 63 scaled by 131071
+ * armVCM4P2_Reciprocal_QP_S32[i]=round(131071/i) for i=1 to 63
+ * armVCM4P2_Reciprocal_QP_S32[0]= 0 (placeholder: 0 has no reciprocal)
+ */
+
+const OMX_S32 armVCM4P2_Reciprocal_QP_S32[64]={
+ 0x00000000,0x0001ffff,0x00010000,0x0000aaaa, 0x00008000, 0x00006666, 0x00005555, 0x00004924,
+ 0x00004000,0x000038e3,0x00003333,0x00002e8c, 0x00002aab, 0x00002762, 0x00002492, 0x00002222,
+ 0x00002000,0x00001e1e,0x00001c72,0x00001af2, 0x0000199a, 0x00001861, 0x00001746, 0x00001643,
+ 0x00001555,0x0000147b,0x000013b1,0x000012f6, 0x00001249, 0x000011a8, 0x00001111, 0x00001084,
+ 0x00001000,0x00000f84,0x00000f0f,0x00000ea1, 0x00000e39, 0x00000dd6, 0x00000d79, 0x00000d21,
+ 0x00000ccd,0x00000c7d,0x00000c31,0x00000be8, 0x00000ba3, 0x00000b61, 0x00000b21, 0x00000ae5,
+ 0x00000aab,0x00000a73,0x00000a3d,0x00000a0a, 0x000009d9, 0x000009a9, 0x0000097b, 0x0000094f,
+ 0x00000925,0x000008fb,0x000008d4,0x000008ae, 0x00000889, 0x00000865, 0x00000842, 0x00000820
+
+};
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s
new file mode 100755
index 0000000..44f2460
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_SetPredDir_s.s
@@ -0,0 +1,104 @@
+;//
+;//
+;// File Name: armVCM4P2_SetPredDir_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+; **
+; * Function: armVCM4P2_SetPredDir
+; *
+; * Description:
+; * Performs detecting the prediction direction
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] blockIndex block index indicating the component type and
+; * position as defined in subclause 6.1.3.8, of ISO/IEC
+; * 14496-2. Furthermore, indexes 6 to 9 indicate the
+; * alpha blocks spatially corresponding to luminance
+; * blocks 0 to 3 in the same macroblock.
+; * [in] pCoefBufRow pointer to the coefficient row buffer
+; * [in] pQpBuf pointer to the quantization parameter buffer
+; * [out]predQP quantization parameter of the predictor block
+; * [out]predDir indicates the prediction direction which takes one
+; * of the following values:
+; * OMX_VC_HORIZONTAL predict horizontally
+; * OMX_VC_VERTICAL predict vertically
+; *
+; * Return Value:
+; * Standard OMXResult result. See enumeration for possible result codes.
+; *
+; */
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE omxVC_s.h
+
+
+ M_VARIANTS ARM1136JS
+
+
+ IF ARM1136JS
+
+;// Input Arguments
+BlockIndex RN 0
+pCoefBufRow RN 1
+pCoefBufCol RN 2
+predDir RN 3 ;// address the chosen direction is stored to
+predQP RN 4 ;// address the predictor QP is stored to (loaded from the stack below)
+pQpBuf RN 5 ;// loaded from the stack below
+
+;// Local Variables
+
+Return RN 0
+blockDCLeft RN 6
+blockDCTop RN 7
+blockDCTopLeft RN 8
+temp1 RN 9
+temp2 RN 14
+
+ M_START armVCM4P2_SetPredDir,r9
+ ;// predQP and pQpBuf are declared as stack arguments
+ M_ARG ppredQP,4
+ M_ARG ppQpBuf,4
+ ;// Fetch the neighbouring DC values as halfwords (LDRH zero-extends; NOTE(review): confirm the stored DC values are never negative)
+ LDRH blockDCTopLeft,[pCoefBufRow,#-16]
+ LDRH blockDCLeft,[pCoefBufCol]
+ ;// Block 3 takes its top value from the column buffer, every other block from the row buffer
+ TEQ BlockIndex,#3
+ LDREQH blockDCTop,[pCoefBufCol,#-16]
+ LDRNEH blockDCTop,[pCoefBufRow]
+ ;// temp1 = |left - topLeft|, temp2 = |topLeft - top|
+ SUBS temp1,blockDCLeft,blockDCTopLeft
+ RSBLT temp1,temp1,#0
+ SUBS temp2,blockDCTopLeft,blockDCTop
+ RSBLT temp2,temp2,#0
+ ;// temp1 < temp2: vertical with QP from pQpBuf[1]; otherwise horizontal with QP from pQpBuf[0]
+ M_LDR pQpBuf,ppQpBuf
+ M_LDR predQP,ppredQP
+ CMP temp1,temp2 ;// flags from this compare drive every LT/GE instruction below
+ MOV temp2,#OMX_VC_VERTICAL
+ LDRLTB temp1,[pQpBuf,#1]
+ STRLT temp2,[predDir]
+ STRLT temp1,[predQP]
+ MOV temp2,#OMX_VC_HORIZONTAL
+ LDRGEB temp1,[pQpBuf]
+ STRGE temp2,[predDir]
+ MOV Return,#OMX_Sts_NoErr ;// the routine always reports OMX_Sts_NoErr
+ STRGE temp1,[predQP]
+
+
+
+ M_END
+
+ ENDIF
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
new file mode 100755
index 0000000..21fa715
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/armVCM4P2_Zigzag_Tables.c
@@ -0,0 +1,61 @@
+/**
+ *
+ * File Name: armVCM4P2_Zigzag_Tables.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * File: armVCM4P2_ZigZag_Tables.c
+ * Description: Contains the zigzag tables
+ *
+ */
+
+#include "omxtypes.h"
+
+/* Three 64-entry scan tables stored back to back: classical zigzag [0-63],
+ * alternate-vertical [64-127] and alternate-horizontal [128-191]. Each entry is
+ * twice the reference scan position, i.e. the byte offset of that coefficient in
+ * an OMX_S16 block, so every table holds each even value 0..126 exactly once. */
+const OMX_U8 armVCM4P2_aClassicalZigzagScan [192] =
+{
+     0,   2,  16,  32,  18,   4,   6,  20,
+    34,  48,  64,  50,  36,  22,   8,  10,
+    24,  38,  52,  66,  80,  96,  82,  68,
+    54,  40,  26,  12,  14,  28,  42,  56,
+    70,  84,  98, 112, 114, 100,  86,  72,
+    58,  44,  30,  46,  60,  74,  88, 102,
+   116, 118, 104,  90,  76,  62,  78,  92,
+   106, 120, 122, 108,  94, 110, 124, 126,
+    /* alternate-vertical scan */
+     0,  16,  32,  48,   2,  18,   4,  20,
+    34,  50,  64,  80,  96, 112, 114,  98,
+    82,  66,  52,  36,   6,  22,   8,  24,
+    38,  54,  68,  84, 100, 116,  70,  86,
+   102, 118,  40,  56,  10,  26,  12,  28,
+    42,  58,  72,  88, 104, 120,  74,  90,
+   106, 122,  44,  60,  14,  30,  46,  62,
+    76,  92, 108, 124,  78,  94, 110, 126,
+    /* alternate-horizontal scan */
+     0,   2,   4,   6,  16,  18,  32,  34,
+    20,  22,   8,  10,  12,  14,  30,  28,
+    26,  24,  38,  36,  48,  50,  64,  66,
+    52,  54,  40,  42,  44,  46,  56,  58,
+    60,  62,  68,  70,  80,  82,  96,  98,
+    84,  86,  72,  74,  76,  78,  88,  90,
+    92,  94, 100, 102, 112, 114, 116, 118,
+   104, 106, 108, 110, 120, 122, 124, 126
+
+
+};
+
+
+
+
+
+/* End of file */
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
new file mode 100755
index 0000000..796ad6e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Inter.c
@@ -0,0 +1,102 @@
+/**
+ *
+ * File Name: omxVCM4P2_DecodeBlockCoef_Inter.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Description:
+ * Contains modules for inter reconstruction
+ *
+ */
+
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+
+
+/**
+ * Function: omxVCM4P2_DecodeBlockCoef_Inter
+ *
+ * Description:
+ * Decodes the INTER block coefficients. Inverse quantization, inversely zigzag
+ * positioning and IDCT, with appropriate clipping on each step, are performed
+ * on the coefficients. The results (residuals) are placed in a contiguous array
+ * of 64 elements. For INTER block, the output buffer holds the residuals for
+ * further reconstruction.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte in
+ * the bit stream buffer. There is no boundary
+ * check for the bit stream buffer.
+ * [in] pBitOffset pointer to the bit position in the byte pointed
+ * to by *ppBitStream. *pBitOffset is valid within
+ * [0-7]
+ * [in] QP quantization parameter
+ * [in] shortVideoHeader a flag indicating presence of short_video_header;
+ * shortVideoHeader==1 indicates using quantization method defined in short
+ * video header mode, and shortVideoHeader==0 indicates normal quantization method.
+ * [out] ppBitStream *ppBitStream is updated after the block is decoded, so that it points to the
+ * current byte in the bit stream buffer.
+ * [out] pBitOffset *pBitOffset is updated so that it points to the current bit position in the
+ * byte pointed by *ppBitStream
+ * [out] pDst pointer to the decoded residual buffer (a contiguous array of 64 elements of
+ * OMX_S16 data type). Must be 16-byte aligned.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - At least one of the following pointers is Null: ppBitStream, *ppBitStream, pBitOffset , pDst
+ * - At least one of the below case:
+ * - *pBitOffset exceeds [0,7], QP <= 0;
+ * - pDst not 16-byte aligned
+ * OMX_Sts_Err - status error
+ *
+ */
+OMXResult omxVCM4P2_DecodeBlockCoef_Inter(
+    const OMX_U8 ** ppBitStream,
+    OMX_INT * pBitOffset,
+    OMX_S16 * pDst,
+    OMX_INT QP,
+    OMX_INT shortVideoHeader
+)
+{
+    /* Raw storage for one 8x8 block of coefficients: 64 entries plus
+       15 spare entries so a 16-byte aligned window always fits. */
+    OMX_S16 coefStorage[79];
+    /* 16-byte aligned view of coefStorage used by every stage below */
+    OMX_S16 *pCoef = armAlignTo16Bytes(coefStorage);
+    OMXResult status;
+
+    /* Stage 1: variable-length decode and inverse zigzag into pCoef */
+    status = omxVCM4P2_DecodeVLCZigzag_Inter(
+                 ppBitStream,
+                 pBitOffset,
+                 pCoef,
+                 shortVideoHeader);
+    armRetDataErrIf((status != OMX_Sts_NoErr), status);
+
+    /* Stage 2: inverse quantisation of pCoef with the given QP */
+    status = omxVCM4P2_QuantInvInter_I(pCoef, QP);
+    armRetDataErrIf((status != OMX_Sts_NoErr), status);
+
+    /* Stage 3: inverse DCT, writing the residuals to pDst */
+    status = omxVCM4P2_IDCT8x8blk(pCoef, pDst);
+    armRetDataErrIf((status != OMX_Sts_NoErr), status);
+
+    /* No stage reported an error */
+    return OMX_Sts_NoErr;
+}
+
+/* End of file */
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
new file mode 100755
index 0000000..b28657c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeBlockCoef_Intra.c
@@ -0,0 +1,214 @@
+/**
+ *
+ * File Name: omxVCM4P2_DecodeBlockCoef_Intra.c
+ * OpenMAX DL: v1.0.2
+ * Revision: 12290
+ * Date: Wednesday, April 9, 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ *
+ * Description:
+ * Contains modules for intra reconstruction
+ *
+ */
+
+#include "omxtypes.h"
+#include "armOMX.h"
+#include "omxVC.h"
+
+#include "armCOMM.h"
+#include "armVC.h"
+
+/* Function for saturating 16 bit values to the [0,255] range and */
+/* writing out as 8 bit values. Does 64 entries */
+void armVCM4P2_Clip8(OMX_S16 *pSrc, OMX_U8 *pDst, OMX_INT dstStep );
+
+
+
+/**
+ * Function: omxVCM4P2_DecodeBlockCoef_Intra
+ *
+ * Description:
+ * Decodes the INTRA block coefficients. Inverse quantization, inversely zigzag
+ * positioning, and IDCT, with appropriate clipping on each step, are performed
+ * on the coefficients. The results are then placed in the output frame/plane on
+ * a pixel basis. For INTRA block, the output values are clipped to [0, 255] and
+ * written to corresponding block buffer within the destination plane.
+ *
+ * Remarks:
+ *
+ * Parameters:
+ * [in] ppBitStream pointer to the pointer to the current byte in
+ * the bit stream buffer. There is no boundary
+ * check for the bit stream buffer.
+ * [in] pBitOffset pointer to the bit position in the byte pointed
+ * to by *ppBitStream. *pBitOffset is valid within
+ * [0-7].
+ * [in] step width of the destination plane
+ * [in/out] pCoefBufRow [in] pointer to the coefficient row buffer
+ * [out] updated coefficient row buffer
+ * [in/out] pCoefBufCol [in] pointer to the coefficient column buffer
+ * [out] updated coefficient column buffer
+ * [in] curQP quantization parameter of the macroblock which
+ * the current block belongs to
+ * [in] pQpBuf Pointer to a 2-element QP array. pQpBuf[0] holds the QP of the 8x8 block left to
+ * the current block(QPa). pQpBuf[1] holds the QP of the 8x8 block just above the
+ * current block(QPc).
+ * Note, in case the corresponding block is out of VOP bound, the QP value will have
+ * no effect to the intra-prediction process. Refer to subclause "7.4.3.3 Adaptive
+ * ac coefficient prediction" of ISO/IEC 14496-2(MPEG4 Part2) for accurate description.
+ * [in] blockIndex block index indicating the component type and
+ * position as defined in subclause 6.1.3.8,
+ * Figure 6-5 of ISO/IEC 14496-2.
+ * [in] intraDCVLC a code determined by intra_dc_vlc_thr and QP.
+ * This allows a mechanism to switch between two VLC
+ * for coding of Intra DC coefficients as per Table
+ * 6-21 of ISO/IEC 14496-2.
+ * [in] ACPredFlag a flag equal to ac_pred_flag (of luminance) indicating
+ * if the ac coefficients of the first row or first
+ * column are differentially coded for intra coded
+ * macroblock.
+ * [in] shortVideoHeader a flag indicating presence of short_video_header;
+ * shortVideoHeader==1 selects linear intra DC mode,
+ * and shortVideoHeader==0 selects nonlinear intra DC mode.
+ * [out] ppBitStream *ppBitStream is updated after the block is
+ * decoded, so that it points to the current byte
+ * in the bit stream buffer
+ * [out] pBitOffset *pBitOffset is updated so that it points to the
+ * current bit position in the byte pointed by
+ * *ppBitStream
+ * [out] pDst pointer to the block in the destination plane.
+ * pDst should be 16-byte aligned.
+ * [out] pCoefBufRow pointer to the updated coefficient row buffer.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr - no error
+ * OMX_Sts_BadArgErr - bad arguments
+ * - At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset,
+ * pCoefBufRow, pCoefBufCol, pQPBuf, pDst.
+ * or
+ * - At least one of the below case: *pBitOffset exceeds [0,7], curQP exceeds (1, 31),
+ * blockIndex exceeds [0,9], step is not the multiple of 8, intraDCVLC is zero while
+ * blockIndex greater than 5.
+ * or
+ * - pDst is not 16-byte aligned
+ * OMX_Sts_Err - status error
+ *
+ */
+
+OMXResult omxVCM4P2_DecodeBlockCoef_Intra(
+    const OMX_U8 ** ppBitStream,
+    OMX_INT *pBitOffset,
+    OMX_U8 *pDst,
+    OMX_INT step,
+    OMX_S16 *pCoefBufRow,
+    OMX_S16 *pCoefBufCol,
+    OMX_U8 curQP,
+    const OMX_U8 *pQPBuf,
+    OMX_INT blockIndex,
+    OMX_INT intraDCVLC,
+    OMX_INT ACPredFlag,
+    OMX_INT shortVideoHeader
+    )
+{
+    /* Pipeline: pick prediction direction -> VLC decode + inverse scan ->
+       DC/AC prediction -> inverse quantisation -> IDCT -> clip to 8 bit.
+       After each stage its result is passed to armRetDataErrIf together
+       with the condition "result is not OMX_Sts_NoErr". */
+
+    /* Backing storage for two 8x8 blocks: 64 entries each plus spare
+       entries so a 16-byte aligned window can be taken from either. */
+    OMX_S16 coefStorage[79];
+    OMX_S16 pixelStorage[79];
+    /* Aligned working block: decoded, predicted and dequantised here */
+    OMX_S16 *pCoef = armAlignTo16Bytes(coefStorage);
+    /* Aligned block receiving the IDCT output */
+    OMX_S16 *pPixels = armAlignTo16Bytes(pixelStorage);
+    /* Luminance/chrominance selector derived from blockIndex */
+    OMXVCM4P2VideoComponent videoComp;
+    /* Direction and predictor QP written by armVCM4P2_SetPredDir */
+    OMX_INT predDir;
+    OMX_INT predQP;
+    /* Direction handed to the VLC decoder (OMX_VC_NONE when ACPredFlag is 0) */
+    OMX_INT predACDir;
+    OMXResult status;
+
+    /* Blocks 0..3 are luminance; every other index is chrominance */
+    videoComp = (blockIndex <= 3)
+                    ? OMX_VC_LUMINANCE
+                    : OMX_VC_CHROMINANCE;
+
+    /* Prediction direction and predictor QP from the neighbouring blocks */
+    armVCM4P2_SetPredDir(
+        blockIndex,
+        pCoefBufRow,
+        pCoefBufCol,
+        &predDir,
+        &predQP,
+        pQPBuf);
+
+    /* AC prediction uses that direction only when ACPredFlag is non-zero */
+    predACDir = (ACPredFlag == 0)
+                    ? OMX_VC_NONE
+                    : predDir;
+
+    /* Stage 1: VLC decode and inverse scan into pCoef. intraDCVLC == 1
+       selects the IntraDCVLC decoder, any other value the IntraACVLC one. */
+    if (intraDCVLC == 1)
+    {
+        status = omxVCM4P2_DecodeVLCZigzag_IntraDCVLC(
+            ppBitStream,
+            pBitOffset,
+            pCoef,
+            predACDir,
+            shortVideoHeader,
+            videoComp);
+    }
+    else
+    {
+        status = omxVCM4P2_DecodeVLCZigzag_IntraACVLC(
+            ppBitStream,
+            pBitOffset,
+            pCoef,
+            predACDir,
+            shortVideoHeader);
+    }
+    armRetDataErrIf((status != OMX_Sts_NoErr), status);
+
+    /* Stage 2: DC/AC prediction against the row and column buffers */
+    status = omxVCM4P2_PredictReconCoefIntra(
+        pCoef,
+        pCoefBufRow,
+        pCoefBufCol,
+        curQP,
+        predQP,
+        predDir,
+        ACPredFlag,
+        videoComp);
+    armRetDataErrIf((status != OMX_Sts_NoErr), status);
+
+    /* Stage 3: inverse quantisation of pCoef */
+    status = omxVCM4P2_QuantInvIntra_I(
+        pCoef,
+        curQP,
+        videoComp,
+        shortVideoHeader);
+    armRetDataErrIf((status != OMX_Sts_NoErr), status);
+
+    /* Stage 4: inverse DCT from pCoef into pPixels */
+    status = omxVCM4P2_IDCT8x8blk(pCoef, pPixels);
+    armRetDataErrIf((status != OMX_Sts_NoErr), status);
+
+    /* Stage 5: clip the IDCT output to [0,255] and write it to the
+       destination plane at pDst using the caller's step */
+    armVCM4P2_Clip8(pPixels, pDst, step);
+
+    /* No stage reported an error */
+    return OMX_Sts_NoErr;
+}
+
+/* End of file */
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
new file mode 100755
index 0000000..cc16f5a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodePadMV_PVOP_s.s
@@ -0,0 +1,364 @@
+; **********
+; *
+; * File Name: omxVCM4P2_DecodePadMV_PVOP_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; **
+; * Function: omxVCM4P2_DecodePadMV_PVOP
+; *
+; * Description:
+; * Decodes and pads four motion vectors of the non-intra macroblock in P-VOP.
+; * The motion vector padding process is specified in subclause 7.6.1.6 of
+; * ISO/IEC 14496-2.
+; *
+; * Remarks:
+; *
+; *
+; * Parameters:
+; * [in] ppBitStream pointer to the pointer to the current byte in
+; * the bit stream buffer
+; * [in] pBitOffset pointer to the bit position in the byte pointed
+; * to by *ppBitStream. *pBitOffset is valid within
+; * [0-7].
+; * [in] pSrcMVLeftMB pointer to the motion vector buffer of the
+; * macroblock located spatially to the left of the current
+; * macroblock.
+; * [in] pSrcMVUpperMB pointer to the motion vector buffer of the
+; * macroblock located spatially above the current
+; * macroblock.
+; * [in] pSrcMVUpperRightMB pointer to the motion vector buffer of the
+; * macroblock located spatially above and to the right of the
+; * current macroblock.
+; * [in] fcodeForward a code equal to vop_fcode_forward in MPEG-4
+; * bit stream syntax
+; * [in] MBType the type of the current macroblock. If MBType
+; * is not equal to OMX_VC_INTER4V, the destination
+; * motion vector buffer is still filled with the
+; * same decoded vector.
+; * [out] ppBitStream *ppBitStream is updated after the block is decoded,
+; * so that it points to the current byte in the bit
+; * stream buffer
+; * [out] pBitOffset *pBitOffset is updated so that it points to the
+; * current bit position in the byte pointed by
+; * *ppBitStream
+; * [out] pDstMVCurMB pointer to the motion vector buffer of the current
+; * macroblock which contains four decoded motion vectors
+; *
+; * Return Value:
+; * OMX_Sts_NoErr -no error
+; *
+; *
+; * OMX_Sts_Err - status error
+; *
+; *
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armCOMM_BitDec_s.h
+ INCLUDE omxVC_s.h
+
+ M_VARIANTS ARM1136JS
+
+
+
+
+ IF ARM1136JS
+
+;//Input Arguments
+;// r0-r3 are used as received; the routine loads r4-r7 from its stack arguments
+ppBitStream RN 0
+pBitOffset RN 1
+pSrcMVLeftMB RN 2
+pSrcMVUpperMB RN 3
+pSrcMVUpperRightMB RN 4
+pDstMVCurMB RN 5
+fcodeForward RN 6
+MBType RN 7
+
+;//Local Variables
+;// Several names below alias one register and must not be live at the same time
+zero RN 4
+one RN 4
+scaleFactor RN 1
+
+
+Return RN 0
+
+VlcMVD RN 0
+index RN 4
+Count RN 7 ;// same register as MBType
+
+mvHorData RN 4
+mvHorResidual RN 0
+
+mvVerData RN 4
+mvVerResidual RN 0
+
+temp RN 1
+
+temp1 RN 3
+High RN 4
+Low RN 2
+Range RN 1
+
+BlkCount RN 14 ;// same register as LR; saved in pBlkCount around the BL
+
+diffMVdx RN 0
+diffMVdy RN 1
+
+;// Scratch Registers
+;// r8-r10 hold the bitstream reader state handed to the M_BD_* macros
+RBitStream RN 8
+RBitCount RN 9
+RBitBuffer RN 10
+
+T1 RN 11
+T2 RN 12
+LR RN 14
+
+ IMPORT armVCM4P2_aVlcMVD
+ IMPORT omxVCM4P2_FindMVpred
+
+ ;// Allocate stack memory
+ ;// First three slots are written just before BL omxVCM4P2_FindMVpred; presumably they are its stack arguments - confirm M_ALLOC4 ordering
+ M_ALLOC4 ppDstMVCurMB,4
+ M_ALLOC4 pDstMVPredME,4
+ M_ALLOC4 pBlkCount,4
+ ;// Spill slots for input arguments whose registers are reused inside the loop
+ M_ALLOC4 pppBitStream,4
+ M_ALLOC4 ppBitOffset,4
+ M_ALLOC4 ppSrcMVLeftMB,4
+ M_ALLOC4 ppSrcMVUpperMB,4
+ ;// Spill slots for the decoded differences and for 32*scaleFactor
+ M_ALLOC4 pdiffMVdx,4
+ M_ALLOC4 pdiffMVdy,4
+ M_ALLOC4 pHigh,4
+
+
+
+
+ M_START omxVCM4P2_DecodePadMV_PVOP,r11
+
+ M_ARG pSrcMVUpperRightMBonStack,4 ;// stack argument: pSrcMVUpperRightMB
+ M_ARG pDstMVCurMBonStack,4 ;// stack argument: pDstMVCurMB
+ M_ARG fcodeForwardonStack,4 ;// stack argument: fcodeForward
+ M_ARG MBTypeonStack,4 ;// stack argument: MBType
+ ;// Decodes the motion vector differences of one macroblock, adds the
+ ;// omxVCM4P2_FindMVpred prediction and wraps each sum into [Low,High].
+ ;// INTRA/INTRA_Q: all four MVs are zeroed and OMX_Sts_NoErr is returned.
+ ;// Returns OMX_Sts_Err when the MVD VLC decodes to the invalid symbol 0xFFF.
+
+ ;// Initializing the BitStream Macro
+
+ M_BD_INIT0 ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount
+ M_LDR MBType,MBTypeonStack ;// Load MBType from stack
+ M_LDR pDstMVCurMB,pDstMVCurMBonStack ;// Load pDstMVCurMB from stack
+ MOV zero,#0
+ ;// EQ below means INTRA or INTRA_Q; assumes M_BD_INIT1/M_BD_INIT2 keep the flags - confirm
+ TEQ MBType,#OMX_VC_INTRA ;// Check if MBType=OMX_VC_INTRA
+ TEQNE MBType,#OMX_VC_INTRA_Q ;// check if MBType=OMX_VC_INTRA_Q
+ STREQ zero,[pDstMVCurMB] ;// pDstMVCurMB[0]=0
+ M_BD_INIT1 T1, T2, T2
+ STREQ zero,[pDstMVCurMB,#4] ;// pDstMVCurMB[1]=0
+ M_BD_INIT2 T1, T2, T2
+ STREQ zero,[pDstMVCurMB,#8] ;// pDstMVCurMB[2]=0 (offset was #4, leaving MV 2 unwritten)
+ MOVEQ Return,#OMX_Sts_NoErr
+ MOV BlkCount,#0
+ STREQ zero,[pDstMVCurMB,#12] ;// pDstMVCurMB[3]=0 (offset was #4, leaving MV 3 unwritten)
+
+ BEQ ExitOK
+ ;// Count = number of MVs to decode; Count shares r7 with MBType
+ TEQ MBType,#OMX_VC_INTER4V ;// Check if MBType=OMX_VC_INTER4V
+ TEQNE MBType,#OMX_VC_INTER4V_Q ;// Check if MBType=OMX_VC_INTER4V_Q
+ MOVEQ Count,#4
+
+ TEQ MBType,#OMX_VC_INTER ;// Check if MBType=OMX_VC_INTER
+ TEQNE MBType,#OMX_VC_INTER_Q ;// Check if MBType=OMX_VC_INTER_Q
+ MOVEQ Count,#1
+
+ M_LDR fcodeForward,fcodeForwardonStack ;// Load fcodeForward from stack
+
+ ;// Storing the values temporarily on stack
+
+ M_STR ppBitStream,pppBitStream
+ M_STR pBitOffset,ppBitOffset
+
+
+ SUB temp,fcodeForward,#1 ;// temp=fcodeForward-1
+ MOV one,#1
+ M_STR pSrcMVLeftMB,ppSrcMVLeftMB
+ LSL scaleFactor,one,temp ;// scaleFactor=1<<(fcodeForward-1)
+ M_STR pSrcMVUpperMB,ppSrcMVUpperMB
+ LSL scaleFactor,scaleFactor,#5
+ M_STR scaleFactor,pHigh ;// [pHigh]=32*scaleFactor
+
+ ;// VLD Decoding
+
+ ;// One iteration per motion vector: BlkCount runs from 0 to Count-1
+Loop
+
+ LDR VlcMVD, =armVCM4P2_aVlcMVD ;// Load the optimized MVD VLC table
+
+ ;// Horizontal Data and Residual calculation
+
+ LDR temp,=0xFFF
+ M_BD_VLD index,T1,T2,VlcMVD,3,2 ;// variable length decoding using the macro
+
+ TEQ index,temp
+ BEQ ExitError ;// Exit with an error if the decoded symbol is invalid
+
+ SUB mvHorData,index,#32 ;// mvHorData=index-32
+ MOV mvHorResidual,#1 ;// mvHorResidual=1
+ CMP fcodeForward,#1
+ TEQNE mvHorData,#0
+ MOVEQ diffMVdx,mvHorData ;// if scaleFactor=1(fcodeForward=1) or mvHorData=0 diffMVdx=mvHorData
+ BEQ VerticalData
+
+ SUB temp,fcodeForward,#1
+ M_BD_VREAD8 mvHorResidual,temp,T1,T2 ;// get mvHorResidual from bitstream if fcodeForward>1 and mvHorData!=0
+ ;// NOTE(review): multiplier is fcodeForward, not 1<<(fcodeForward-1); they differ for fcodeForward>=3 - check against ISO/IEC 14496-2
+ CMP mvHorData,#0
+ RSBLT mvHorData,mvHorData,#0 ;// mvHorData=abs(mvHorData)
+ SUB mvHorResidual,mvHorResidual,fcodeForward
+ SMLABB diffMVdx,mvHorData,fcodeForward,mvHorResidual ;// diffMVdx=abs(mvHorData)*fcodeForward+mvHorResidual-fcodeForward
+ ADD diffMVdx,diffMVdx,#1
+ RSBLT diffMVdx,diffMVdx,#0 ;// restore the sign; LT still comes from the CMP above
+
+ ;// Vertical Data and Residual calculation
+
+VerticalData
+
+ M_STR diffMVdx,pdiffMVdx ;// Store the diffMVdx on stack
+ LDR VlcMVD, =armVCM4P2_aVlcMVD ;// Loading the address of optimized VLC tables
+
+ LDR temp,=0xFFF
+ M_BD_VLD index,T1,T2,VlcMVD,3,2 ;// VLC decoding using the macro
+
+ TEQ index,temp
+ BEQ ExitError ;// Exit with an error if an invalid symbol occurs
+
+ SUB mvVerData,index,#32 ;// mvVerData=index-32
+ MOV mvVerResidual,#1
+ CMP fcodeForward,#1
+ TEQNE mvVerData,#0
+ MOVEQ diffMVdy,mvVerData ;// diffMVdy = mvVerData if scaleFactor=1(fcodeForward=1) or mvVerData=0
+ BEQ FindMVPred
+
+ SUB temp,fcodeForward,#1
+ M_BD_VREAD8 mvVerResidual,temp,T1,T2 ;// Get mvVerResidual from bit stream if fcodeForward>1 and mvVerData!=0
+
+ ;// Same computation as for the horizontal component (same review note applies)
+ CMP mvVerData,#0
+ RSBLT mvVerData,mvVerData,#0
+ SUB mvVerResidual,mvVerResidual,fcodeForward
+ SMLABB diffMVdy,mvVerData,fcodeForward,mvVerResidual ;// diffMVdy=abs(mvVerData)*fcodeForward+mvVerResidual-fcodeForward
+ ADD diffMVdy,diffMVdy,#1
+ RSBLT diffMVdy,diffMVdy,#0
+
+ ;//Calling the Function omxVCM4P2_FindMVpred
+
+FindMVPred
+
+ M_STR diffMVdy,pdiffMVdy
+ ADD temp,pDstMVCurMB,BlkCount,LSL #2 ;// temp=&pDstMVCurMB[BlkCount]
+ M_STR temp,ppDstMVCurMB ;// store temp on stack for passing as an argument to FindMVPred
+
+ MOV temp,#0
+ M_STR temp,pDstMVPredME ;// Pass pDstMVPredME=NULL as an argument
+ M_STR BlkCount,pBlkCount ;// Pass BlkCount as argument through stack (also saves it: BlkCount is r14)
+
+ MOV temp,pSrcMVLeftMB ;// temp (RN 1)=pSrcMVLeftMB
+ M_LDR pSrcMVUpperRightMB,pSrcMVUpperRightMBonStack
+ MOV pSrcMVLeftMB,pSrcMVUpperMB ;// pSrcMVLeftMB ( RN 2) = pSrcMVUpperMB
+ MOV ppBitStream,pDstMVCurMB ;// ppBitStream ( RN 0) = pDstMVCurMB
+ MOV pSrcMVUpperMB,pSrcMVUpperRightMB ;// pSrcMVUpperMB( RN 3) = pSrcMVUpperRightMB
+ BL omxVCM4P2_FindMVpred ;// Branch to subroutine omxVCM4P2_FindMVpred
+
+ ;// Store Horizontal Motion Vector
+
+ M_LDR BlkCount,pBlkCount ;// Load BlkCount from stack
+ M_LDR High,pHigh ;// High=32*scaleFactor
+ LSL temp1,BlkCount,#2 ;// temp1=BlkCount*4
+ M_LDR diffMVdx,pdiffMVdx ;// Load diffMVdx
+
+ LDRSH temp,[pDstMVCurMB,temp1] ;// temp=pDstMVCurMB[BlkCount].dx (halfword at byte offset 4*BlkCount)
+
+
+ RSB Low,High,#0 ;// Low = -32*scaleFactor
+ ADD diffMVdx,temp,diffMVdx ;// diffMVdx=pDstMVCurMB[BlkCount]+diffMVdx
+ ADD Range,High,High ;// Range=64*ScaleFactor
+ SUB High,High,#1 ;// High= 32*scaleFactor-1
+
+ CMP diffMVdx,Low ;// If diffMVdx<Low
+ ADDLT diffMVdx,diffMVdx,Range ;// diffMVdx+=Range
+
+ CMP diffMVdx,High
+ SUBGT diffMVdx,diffMVdx,Range ;// If diffMVdx > High diffMVdx-=Range
+ STRH diffMVdx,[pDstMVCurMB,temp1]
+
+ ;// Store Vertical
+
+ ADD temp1,temp1,#2 ;// temp1=4*BlkCount+2
+ M_LDR diffMVdx,pdiffMVdy ;// Load diffMVdy (the diffMVdx register is reused for it)
+ LDRSH temp,[pDstMVCurMB,temp1] ;// temp=pDstMVCurMB[BlkCount].dy (halfword at byte offset 4*BlkCount+2)
+ ADD BlkCount,BlkCount,#1 ;// BlkCount=BlkCount+1
+ ADD diffMVdx,temp,diffMVdx
+ CMP diffMVdx,Low
+ ADDLT diffMVdx,diffMVdx,Range ;// If diffMVdy<Low diffMVdy+=Range
+ CMP diffMVdx,High
+ SUBGT diffMVdx,diffMVdx,Range ;// If diffMVdy > High diffMVdy-=Range
+ STRH diffMVdx,[pDstMVCurMB,temp1]
+
+ CMP BlkCount,Count
+ M_LDR pSrcMVLeftMB,ppSrcMVLeftMB
+ M_LDR pSrcMVUpperMB,ppSrcMVUpperMB
+
+ BLT Loop ;// If BlkCount<Count Continue the Loop
+
+
+ ;// If MBType=OMX_VC_INTER or MBtype=OMX_VC_INTER_Q copy pDstMVCurMB[0] to
+ ;// pDstMVCurMB[1], pDstMVCurMB[2], pDstMVCurMB[3]
+
+ M_LDR MBType,MBTypeonStack
+
+ TEQ MBType,#OMX_VC_INTER
+ TEQNE MBType,#OMX_VC_INTER_Q
+ LDREQ temp,[pDstMVCurMB]
+ M_LDR ppBitStream,pppBitStream
+ STREQ temp,[pDstMVCurMB,#4]
+
+ STREQ temp,[pDstMVCurMB,#8]
+ STREQ temp,[pDstMVCurMB,#12]
+
+
+ M_LDR pBitOffset,ppBitOffset
+ ;//Ending the macro
+ M_BD_FINI ppBitStream,pBitOffset ;// Finishing the Macro
+
+
+ MOV Return,#OMX_Sts_NoErr
+ B ExitOK
+
+ExitError
+
+ M_LDR ppBitStream,pppBitStream
+ M_LDR pBitOffset,ppBitOffset
+ ;//Ending the macro
+ M_BD_FINI ppBitStream,pBitOffset
+
+ MOV Return,#OMX_Sts_Err
+
+ExitOK
+
+ M_END
+ ENDIF
+ END
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
new file mode 100755
index 0000000..7208c21
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_Inter_s.s
@@ -0,0 +1,132 @@
+;/**
+; *
+; * File Name: omxVCM4P2_DecodeVLCZigzag_Inter_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains modules for zigzag scanning and VLC decoding
+; * for inter block.
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_Inter
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan for one inter coded block.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] ppBitStream pointer to the pointer to the current byte in
+; * the bitstream buffer
+; * [in] pBitOffset pointer to the bit position in the byte pointed
+; * to by *ppBitStream. *pBitOffset is valid within [0-7].
+; * [in] shortVideoHeader binary flag indicating presence of short_video_header;
+; * escape modes 0-3 are used if shortVideoHeader==0,
+; * and escape mode 4 is used when shortVideoHeader==1.
+; * [out] ppBitStream *ppBitStream is updated after the block is
+; * decoded, so that it points to the current byte
+; * in the bit stream buffer
+; * [out] pBitOffset *pBitOffset is updated so that it points to the
+; * current bit position in the byte pointed by
+; * *ppBitStream
+; * [out] pDst pointer to the coefficient buffer of current
+; * block. Must be 16-byte aligned
+; *
+; * Return Value:
+; * OMX_Sts_BadArgErr - bad arguments
+; * -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or
+; * -pDst is not 16-byte aligned, or
+; * -*pBitOffset exceeds [0,7].
+; * OMX_Sts_Err - status error
+; * -At least one mark bit is equal to zero
+; * -Encountered an illegal stream code that cannot be found in the VLC table
+; * -Encountered and illegal code in the VLC FLC table
+; * -The number of coefficients is greater than 64
+; *
+; */
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armCOMM_BitDec_s.h
+
+
+ M_VARIANTS ARM1136JS
+
+
+
+
+
+ IF ARM1136JS
+
+ ;// Import various tables needed for the function
+
+
+ IMPORT armVCM4P2_InterVlcL0L1 ;// Contains optimized and packed VLC Tables for both Last =1 and last=0
+ ;// Packed in Run:Level:Last format
+ IMPORT armVCM4P2_InterL0L1LMAX ;// Contains LMAX table entries with both Last=0 and Last=1
+ IMPORT armVCM4P2_InterL0L1RMAX ;// Contains RMAX table entries with both Last=0 and Last=1
+ IMPORT armVCM4P2_aClassicalZigzagScan ;// contains classical Zigzag table entries with double the original values
+ IMPORT armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+
+
+;//Input Arguments
+;// r0-r3 are not modified here; they reach the unsafe routine as received
+ppBitStream RN 0
+pBitOffset RN 1
+pDst RN 2
+shortVideoHeader RN 3
+
+;//Local Variables
+
+Return RN 0
+
+pVlcTableL0L1 RN 4
+pLMAXTableL0L1 RN 4
+pRMAXTableL0L1 RN 4
+pZigzagTable RN 4
+Count RN 6
+
+ ;// The four table names share r4: each address is loaded and stored to
+ ;// its stack slot before the next one is loaded.
+ ;// Allocate stack memory to store the addresses of the VLC, Zigzag, LMAX and RMAX tables
+
+
+ M_ALLOC4 ppVlcTableL0L1,4
+ M_ALLOC4 ppLMAXTableL0L1,4
+ M_ALLOC4 ppRMAXTableL0L1,4
+ M_ALLOC4 ppZigzagTable,4
+
+
+ M_START omxVCM4P2_DecodeVLCZigzag_Inter,r12
+
+ ;// Wrapper: stores the inter table addresses in the stack slots, sets
+ ;// Count=0 and calls armVCM4P2_DecodeVLCZigzag_AC_unsafe. r0 is not written
+ ;// after the call, so the callee's r0 is what this routine leaves in Return.
+
+ LDR pZigzagTable, =armVCM4P2_aClassicalZigzagScan ;// Load zigzag table
+ M_STR pZigzagTable,ppZigzagTable ;// Store zigzag table on stack to pass as argument to unsafe function
+ LDR pVlcTableL0L1, =armVCM4P2_InterVlcL0L1 ;// Load optimized VLC table with both L=0 and L=1 entries
+ M_STR pVlcTableL0L1,ppVlcTableL0L1 ;// Store optimized VLC table address on stack
+ LDR pLMAXTableL0L1, =armVCM4P2_InterL0L1LMAX ;// Load Interleaved L=0 and L=1 LMAX Tables
+ M_STR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Store LMAX table address on stack
+ LDR pRMAXTableL0L1, =armVCM4P2_InterL0L1RMAX ;// Load Interleaved L=0 and L=1 RMAX Tables
+ MOV Count,#0 ;// set start=0
+ M_STR pRMAXTableL0L1,ppRMAXTableL0L1 ;// store RMAX table address on stack
+
+
+ BL armVCM4P2_DecodeVLCZigzag_AC_unsafe ;// call Unsafe Function for VLC Zigzag Decoding
+
+
+
+ M_END
+ ENDIF
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
new file mode 100755
index 0000000..9a37ec9
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
@@ -0,0 +1,136 @@
+;/**
+; *
+; * File Name: omxVCM4P2_DecodeVLCZigzag_IntraACVLC_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains modules for zigzag scanning and VLC decoding
+; * for intra block (AC coefficients only).
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_IntraACVLC
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan for one intra coded block.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] ppBitStream pointer to the pointer to the current byte in
+; * the bitstream buffer
+; * [in] pBitOffset pointer to the bit position in the byte pointed
+; * to by *ppBitStream. *pBitOffset is valid within [0-7].
+; * [in] shortVideoHeader binary flag indicating presence of short_video_header;
+; * escape modes 0-3 are used if shortVideoHeader==0,
+; * and escape mode 4 is used when shortVideoHeader==1.
+; * [out] ppBitStream *ppBitStream is updated after the block is
+; * decoded, so that it points to the current byte
+; * in the bit stream buffer
+; * [out] pBitOffset *pBitOffset is updated so that it points to the
+; * current bit position in the byte pointed by
+; * *ppBitStream
+; * [out] pDst pointer to the coefficient buffer of current
+; * block. Must be 16-byte aligned
+; *
+; * Return Value:
+; * OMX_Sts_BadArgErr - bad arguments
+; * -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or
+; * -pDst is not 16-byte aligned, or
+; * -*pBitOffset exceeds [0,7].
+; * OMX_Sts_Err - status error
+; * -At least one mark bit is equal to zero
+; * -Encountered an illegal stream code that cannot be found in the VLC table
+; * -Encountered and illegal code in the VLC FLC table
+; * -The number of coefficients is greater than 64
+; *
+; */
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armCOMM_BitDec_s.h
+
+
+ M_VARIANTS ARM1136JS
+
+
+
+
+
+ IF ARM1136JS
+
+ ;// Import various tables needed for the function
+
+
+ IMPORT armVCM4P2_IntraVlcL0L1 ;// Contains optimized and packed VLC Tables for both Last =1 and last=0
+ ;// Packed in Run:Level:Last format
+ IMPORT armVCM4P2_IntraL0L1LMAX ;// Contains LMAX table entries with both Last=0 and Last=1
+ IMPORT armVCM4P2_IntraL0L1RMAX ;// Contains RMAX table entries with both Last=0 and Last=1
+ IMPORT armVCM4P2_aClassicalZigzagScan ;// contains classical Zigzag table entries with double the original values
+ IMPORT armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+;//Input Arguments
+;// PredDir arrives in r3; r3 is later reloaded with shortVideoHeader from the stack
+ppBitStream RN 0
+pBitOffset RN 1
+pDst RN 2
+PredDir RN 3
+shortVideoHeader RN 3
+
+;//Local Variables
+
+Return RN 0
+
+pVlcTableL0L1 RN 4
+pLMAXTableL0L1 RN 4
+pRMAXTableL0L1 RN 4
+pZigzagTable RN 4
+Count RN 6
+
+ ;// The four table names share r4: each address is loaded and stored to
+ ;// its stack slot before the next one is loaded.
+ ;// Allocate stack memory to store optimized VLC,Zigzag, RMAX, LMAX Table Addresses
+
+ M_ALLOC4 ppVlcTableL0L1,4
+ M_ALLOC4 ppLMAXTableL0L1,4
+ M_ALLOC4 ppRMAXTableL0L1,4
+ M_ALLOC4 ppZigzagTable,4
+
+
+ M_START omxVCM4P2_DecodeVLCZigzag_IntraACVLC,r12
+
+ M_ARG shortVideoHeaderonStack,4 ;// stack argument: shortVideoHeader
+ ;// Wrapper: selects a zigzag table by PredDir, stores the intra table addresses, sets Count=0 and calls the unsafe decoder
+ LDR pZigzagTable, =armVCM4P2_aClassicalZigzagScan ;// Load Address of the Zigzag table
+ ADD pZigzagTable, pZigzagTable, PredDir, LSL #6 ;// table address += PredDir*64 bytes (selects the scan by PredDir)
+
+ M_STR pZigzagTable,ppZigzagTable ;// Store Zigzag table address on stack
+ LDR pVlcTableL0L1, =armVCM4P2_IntraVlcL0L1 ;// Load optimized packed VLC Table with both L=0 and L=1 entries
+ M_STR pVlcTableL0L1,ppVlcTableL0L1 ;// Store VLC Table address on stack
+ LDR pLMAXTableL0L1, =armVCM4P2_IntraL0L1LMAX ;// Load LMAX Table
+ M_STR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Store LMAX Table address on Stack
+ LDR pRMAXTableL0L1, =armVCM4P2_IntraL0L1RMAX ;// Load RMAX Table
+ MOV Count,#0 ;// Set Start=0
+
+ M_STR pRMAXTableL0L1,ppRMAXTableL0L1 ;// Store RMAX Table address on stack
+
+
+ ;// PredDir is no longer needed; r3 now becomes shortVideoHeader
+ M_LDR shortVideoHeader,shortVideoHeaderonStack ;// get the Input Argument from stack
+
+ BL armVCM4P2_DecodeVLCZigzag_AC_unsafe ;// Call Unsafe Function
+
+
+ ;// r0 is not written after the call: the callee's r0 is left in Return
+
+ M_END
+ ENDIF
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
new file mode 100755
index 0000000..778aaf2
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
@@ -0,0 +1,224 @@
+;/**
+; *
+; * File Name: omxVCM4P2_DecodeVLCZigzag_IntraDCVLC_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains modules for zigzag scanning and VLC decoding
+; * for intra block (DC coefficient followed by AC coefficients).
+; *
+; *
+; *
+; * Function: omxVCM4P2_DecodeVLCZigzag_IntraDCVLC
+; *
+; * Description:
+; * Performs VLC decoding and inverse zigzag scan for one intra coded block.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] ppBitStream pointer to the pointer to the current byte in
+; * the bitstream buffer
+; * [in] pBitOffset pointer to the bit position in the byte pointed
+; * to by *ppBitStream. *pBitOffset is valid within [0-7].
+; * [in] shortVideoHeader binary flag indicating presence of short_video_header;
+; * escape modes 0-3 are used if shortVideoHeader==0,
+; * and escape mode 4 is used when shortVideoHeader==1.
+; * [out] ppBitStream *ppBitStream is updated after the block is
+; * decoded, so that it points to the current byte
+; * in the bit stream buffer
+; * [out] pBitOffset *pBitOffset is updated so that it points to the
+; * current bit position in the byte pointed by
+; * *ppBitStream
+; * [out] pDst pointer to the coefficient buffer of current
+; * block. Must be 16-byte aligned
+; *
+; * Return Value:
+; * OMX_Sts_BadArgErr - bad arguments
+; * -At least one of the following pointers is NULL: ppBitStream, *ppBitStream, pBitOffset, pDst, or
+; * -pDst is not 16-byte aligned, or
+; * -*pBitOffset exceeds [0,7].
+; * OMX_Sts_Err - status error
+; * -At least one mark bit is equal to zero
+; * -Encountered an illegal stream code that cannot be found in the VLC table
+; * -Encountered and illegal code in the VLC FLC table
+; * -The number of coefficients is greater than 64
+; *
+; */
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armCOMM_BitDec_s.h
+
+
+ M_VARIANTS CortexA8
+
+
+
+
+
+ IF CortexA8
+
+
+ ;// Import various tables needed for the function
+
+
+ IMPORT armVCM4P2_IntraVlcL0L1 ;// Contains optimized and packed VLC Tables for both Last =1 and last=0
+ ;// Packed in Run:Level:Last format
+ IMPORT armVCM4P2_IntraL0L1LMAX ;// Contains LMAX table entries with both Last=0 and Last=1
+ IMPORT armVCM4P2_IntraL0L1RMAX ;// Contains RMAX table entries with both Last=0 and Last=1
+ IMPORT armVCM4P2_aClassicalZigzagScan ;// contains CLassical, Horizontal, Vertical Zigzag table entries with double the original values
+ IMPORT armVCM4P2_aIntraDCLumaChromaIndex ;// Contains Optimized DCLuma and DCChroma Index table Entries
+
+
+ IMPORT armVCM4P2_DecodeVLCZigzag_AC_unsafe
+
+;//Input Arguments
+;// PredDir arrives in r3; shortVideoHeader and videoComp are loaded from the stack by the routine
+ppBitStream RN 0
+pBitOffset RN 1
+pDst RN 2
+PredDir RN 3
+shortVideoHeader RN 3
+videoComp RN 5
+;//Local Variables
+;// Names sharing a register (r4, r5, r6, r7) must not be live at the same time
+Return RN 0
+
+pDCLumaChromaIndex RN 4
+pDCChromaIndex RN 7
+pVlcTableL0L1 RN 4
+pLMAXTableL0L1 RN 4
+pRMAXTableL0L1 RN 4
+pZigzagTable RN 4
+Count RN 6
+DCValueSize RN 6
+powOfSize RN 7
+temp1 RN 5
+
+
+;// Scratch Registers
+;// r8-r10 hold the bitstream reader state handed to the M_BD_* macros
+RBitStream RN 8
+RBitBuffer RN 9
+RBitCount RN 10
+
+T1 RN 11
+T2 RN 12
+DCVal RN 14
+
+ ;// DCVal is r14, which BL overwrites; the routine keeps the value in pDCCoeff across the call
+ ;// Allocate stack memory to store optimized VLC,Zigzag, RMAX, LMAX Table Addresses
+
+ M_ALLOC4 ppVlcTableL0L1,4
+ M_ALLOC4 ppLMAXTableL0L1,4
+ M_ALLOC4 ppRMAXTableL0L1,4
+ M_ALLOC4 ppZigzagTable,4
+ M_ALLOC4 pDCCoeff,4
+
+
+
+ M_START omxVCM4P2_DecodeVLCZigzag_IntraDCVLC,r12
+
+ M_ARG shortVideoHeaderonStack,4 ;// stack argument: shortVideoHeader
+ M_ARG videoComponstack,4 ;// stack argument: videoComp
+ ;// Decodes the intra DC coefficient here, then calls the unsafe decoder
+ ;// with Count=1 for the remaining coefficients and stores DC at pDst[0].
+ ;// Decode DC Coefficient
+
+
+ LDR pDCLumaChromaIndex, =armVCM4P2_aIntraDCLumaChromaIndex ;// Load Optimized VLC Table for Luminance and Chrominance
+
+ ;// Initializing the Bitstream Macro
+
+ M_BD_INIT0 ppBitStream, pBitOffset, RBitStream, RBitBuffer, RBitCount
+ M_LDR videoComp,videoComponstack
+ M_BD_INIT1 T1, T2, T2
+ ADD pDCLumaChromaIndex,pDCLumaChromaIndex,videoComp, LSL #6 ;// table address += videoComp*64 bytes
+ M_BD_INIT2 T1, T2, T2
+
+
+ M_BD_VLD DCValueSize,T1,T2,pDCLumaChromaIndex,4,2 ;// VLC Decode using optimized Luminance and Chrominance VLC Table
+
+
+
+ ;// A decoded size above 12 is rejected as an error
+DecodeDC
+
+ CMP DCValueSize,#12
+ BGT ExitError
+
+ CMP DCValueSize,#0
+ MOVEQ DCVal,#0 ;// If DCValueSize is zero then DC coeff =0
+ BEQ ACDecode ;// Branch to perform AC Coeff Decoding
+
+ M_BD_VREAD16 DCVal,DCValueSize,T1,T2 ;// Get DC Value From Bit stream
+
+ ;// Values below 2^(DCValueSize-1) encode negatives: DCVal = DCVal + 1 - 2^DCValueSize
+ MOV powOfSize,#1
+ LSL powOfSize,DCValueSize ;// powOfSize=pow(2,DCValueSize)
+ CMP DCVal,powOfSize,LSR #1 ;// Compare DCVal with powOfSize/2
+ ADDLT DCVal,DCVal,#1
+ SUBLT DCVal,DCVal,powOfSize ;// If less than powOfSize/2 DCVal=DCVal-powOfSize+1
+ ;// Else DCVal= fetchbits from bit stream
+
+CheckDCValueSize
+
+ CMP DCValueSize,#8 ;// If DCValueSize greater than 8 check marker bit
+
+ BLE ACDecode
+
+ M_BD_READ8 temp1,1,T1
+ TEQ temp1,#0 ;// If Marker bit is zero Exit with an Error Message
+ BEQ ExitError
+
+
+
+ ;// Decode AC Coefficient
+
+ACDecode
+
+ M_STR DCVal,pDCCoeff ;// Store Decoded DC Coeff on Stack
+ M_BD_FINI ppBitStream,pBitOffset ;// Terminating the Bit stream Macro
+
+ LDR pZigzagTable, =armVCM4P2_aClassicalZigzagScan ;// Load Zigzag table address
+ ADD pZigzagTable, pZigzagTable, PredDir, LSL #6 ;// Modify the Zigzag table address based on PredDir (+= PredDir*64 bytes)
+
+ M_STR pZigzagTable,ppZigzagTable ;// Store zigzag table on stack
+ LDR pVlcTableL0L1, =armVCM4P2_IntraVlcL0L1 ;// Load Optimized VLC Table With both Last=0 and Last=1 Entries
+ M_STR pVlcTableL0L1,ppVlcTableL0L1 ;// Store Optimized VLC Table on stack
+ LDR pLMAXTableL0L1, =armVCM4P2_IntraL0L1LMAX ;// Load LMAX Table
+ M_STR pLMAXTableL0L1,ppLMAXTableL0L1 ;// Store LMAX table on stack
+ LDR pRMAXTableL0L1, =armVCM4P2_IntraL0L1RMAX ;// Load RMAX Table
+ MOV Count,#1 ;// Set Start =1
+
+ M_STR pRMAXTableL0L1,ppRMAXTableL0L1 ;// Store RMAX Table on Stack
+
+ ;// PredDir is no longer needed; r3 now becomes shortVideoHeader
+ M_LDR shortVideoHeader,shortVideoHeaderonStack ;// Load the Input Argument From Stack
+
+ BL armVCM4P2_DecodeVLCZigzag_AC_unsafe ;// Call the Unsafe Function
+ ;// NOTE(review): DC is stored whatever r0 the callee returned, and pDst (r2) is assumed intact after the call - confirm
+ M_LDR DCVal,pDCCoeff ;// Get the Decoded DC Value From Stack
+ STRH DCVal,[pDst] ;// Store the DC Value
+ B ExitOK
+
+
+
+ExitError
+
+ M_BD_FINI ppBitStream,pBitOffset ;// Terminating the Bit Stream Macro in case of an Error
+ MOV Return,#OMX_Sts_Err ;// Exit with an Error Message
+ExitOK
+
+ M_END
+ ENDIF
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
new file mode 100755
index 0000000..caf7121
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_FindMVpred_s.s
@@ -0,0 +1,194 @@
+;//
+;//
+;// File Name: omxVCM4P2_FindMVpred_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+;// Function:
+;// omxVCM4P2_FindMVpred
+;//
+ ;// Include headers
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+ INCLUDE armVCCOMM_s.h
+
+ ;// Define cpu variants
+ M_VARIANTS CortexA8
+
+
+ IF CortexA8
+
+ M_TABLE armVCM4P2_pBlkIndexTable
+ DCD OMXVCBlk0, OMXVCBlk1
+ DCD OMXVCBlk2, OMXVCBlk3
+ ;// Word table of the four per-block case labels (not referenced by name in the visible code)
+;//--------------------------------------------
+;// Declare input registers
+;//--------------------------------------------
+;// r0-r3 are used as received; pDstMVPred, pDstMVPredME and iBlk are loaded from stack arguments
+pSrcMVCurMB RN 0
+pSrcCandMV1 RN 1
+pSrcCandMV2 RN 2
+pSrcCandMV3 RN 3
+pDstMVPred RN 4
+pDstMVPredME RN 5
+iBlk RN 6
+
+pTable RN 4
+CandMV RN 12
+ ;// Pointers to the three candidate motion vectors chosen for the current block
+pCandMV1 RN 7
+pCandMV2 RN 8
+pCandMV3 RN 9
+ ;// Loaded candidate components; these reuse r0-r3, so the input pointers are dead once they are loaded
+CandMV1dx RN 0
+CandMV1dy RN 1
+CandMV2dx RN 2
+CandMV2dy RN 3
+CandMV3dx RN 10
+CandMV3dy RN 11
+
+temp RN 14
+
+zero RN 14
+return RN 0
+
+; ----------------------------------------------
+; Main routine
+; ----------------------------------------------
+
+ M_ALLOC4 MV, 4
+ ;// MV is a 4-byte stack slot holding a zero vector, used in place of a NULL candidate
+ ;// Function header
+ M_START omxVCM4P2_FindMVpred, r11
+
+ ;// Define stack arguments
+ M_ARG ppDstMVPred, 4
+ M_ARG ppDstMVPredME, 4
+ M_ARG Blk, 4
+
+ M_ADR CandMV, MV
+ MOV zero, #0
+ M_LDR iBlk, Blk
+
+ ;// Set the default value for these
+ ;// to be used if pSrcCandMV[1|2|3] == NULL
+ MOV pCandMV1, CandMV
+ MOV pCandMV2, CandMV
+ MOV pCandMV3, CandMV
+
+ STR zero, [CandMV]
+
+ ;// Branch to the case based on blk number
+ M_SWITCH iBlk
+ M_CASE OMXVCBlk0 ;// iBlk=0
+ M_CASE OMXVCBlk1 ;// iBlk=1
+ M_CASE OMXVCBlk2 ;// iBlk=2
+ M_CASE OMXVCBlk3 ;// iBlk=3
+ M_ENDSWITCH
+ ;// Block 0: all three candidates come from neighbouring macroblocks
+OMXVCBlk0
+ CMP pSrcCandMV1, #0
+ ADDNE pCandMV1, pSrcCandMV1, #4
+
+ CMP pSrcCandMV2, #0
+ ADDNE pCandMV2, pSrcCandMV2, #8
+
+ CMP pSrcCandMV3, #0
+ ADDNE pCandMV3, pSrcCandMV3, #8
+ CMPEQ pSrcCandMV1, #0
+ ;// EQ: candidates 1 and 3 are both NULL -> use candidate 2 for all three
+ MOVEQ pCandMV3, pCandMV2
+ MOVEQ pCandMV1, pCandMV2
+
+ CMP pSrcCandMV1, #0
+ CMPEQ pSrcCandMV2, #0
+ ;// EQ: candidates 1 and 2 are both NULL -> use candidate 3 for all three
+ MOVEQ pCandMV1, pCandMV3
+ MOVEQ pCandMV2, pCandMV3
+
+ CMP pSrcCandMV2, #0
+ CMPEQ pSrcCandMV3, #0
+ ;// EQ: candidates 2 and 3 are both NULL -> use candidate 1 for all three
+ MOVEQ pCandMV2, pCandMV1
+ MOVEQ pCandMV3, pCandMV1
+
+ B BlkEnd
+ ;// Block 1: candidate 1 is the first vector of the current macroblock
+OMXVCBlk1
+ MOV pCandMV1, pSrcMVCurMB
+ CMP pSrcCandMV3, #0
+ ADDNE pCandMV3, pSrcCandMV3, #8
+
+ CMP pSrcCandMV2, #0
+ ADDNE pCandMV2, pSrcCandMV2, #12
+
+ CMPEQ pSrcCandMV3, #0
+ ;// EQ: candidates 2 and 3 are both NULL -> use candidate 1 for all three
+ MOVEQ pCandMV2, pCandMV1
+ MOVEQ pCandMV3, pCandMV1
+
+ B BlkEnd
+ ;// Block 2: candidates 2 and 3 are vectors 0 and 1 of the current macroblock
+OMXVCBlk2
+ CMP pSrcCandMV1, #0
+ MOV pCandMV2, pSrcMVCurMB
+ ADD pCandMV3, pSrcMVCurMB, #4
+ ADDNE pCandMV1, pSrcCandMV1, #12
+ B BlkEnd
+ ;// Block 3: all three candidates are vectors of the current macroblock
+OMXVCBlk3
+ ADD pCandMV1, pSrcMVCurMB, #8
+ MOV pCandMV2, pSrcMVCurMB
+ ADD pCandMV3, pSrcMVCurMB, #4
+
+BlkEnd
+
+ ;// Load the first halfword (dx) of each candidate and
+ ;// advance each pointer by 2 to its second halfword (dy)
+ LDRSH CandMV1dx, [pCandMV1], #2
+ LDRSH CandMV2dx, [pCandMV2], #2
+ LDRSH CandMV3dx, [pCandMV3], #2
+
+ ;// Load argument from the stack
+ M_LDR pDstMVPredME, ppDstMVPredME
+
+ LDRSH CandMV1dy, [pCandMV1]
+ LDRSH CandMV2dy, [pCandMV2]
+ LDRSH CandMV3dy, [pCandMV3]
+
+ CMP pDstMVPredME, #0
+
+ ;// Store the candidate MV's into the pDstMVPredME,
+ ;// these can be used in the fast algorithm if implemented
+ ;// (skipped when pDstMVPredME is NULL)
+ STRHNE CandMV1dx, [pDstMVPredME], #2
+ STRHNE CandMV1dy, [pDstMVPredME], #2
+ STRHNE CandMV2dx, [pDstMVPredME], #2
+ STRHNE CandMV2dy, [pDstMVPredME], #2
+ STRHNE CandMV3dx, [pDstMVPredME], #2
+ STRHNE CandMV3dy, [pDstMVPredME]
+
+ ;// Find the median of the 3 candidate MV's
+ M_MEDIAN3 CandMV1dx, CandMV2dx, CandMV3dx, temp
+
+ ;// Load argument from the stack
+ M_LDR pDstMVPred, ppDstMVPred
+
+ M_MEDIAN3 CandMV1dy, CandMV2dy, CandMV3dy, temp
+ ;// presumably M_MEDIAN3 leaves the median in its third operand, which is what gets stored - confirm in armVCCOMM_s.h
+ STRH CandMV3dx, [pDstMVPred], #2
+ STRH CandMV3dy, [pDstMVPred]
+
+ MOV return, #OMX_Sts_NoErr
+
+ M_END
+ ENDIF ;// ARM1136JS :LOR: CortexA8
+
+ END \ No newline at end of file
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
new file mode 100755
index 0000000..b5e3d0d
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_IDCT8x8blk_s.s
@@ -0,0 +1,73 @@
+;//
+;//
+;// File Name: omxVCM4P2_IDCT8x8blk_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+;// Function:
+;// omxVCM4P2_IDCT8x8blk
+;//
+ ;// Include headers
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ ;// Define cpu variants
+ M_VARIANTS CortexA8
+
+ INCLUDE armCOMM_IDCT_s.h
+
+ IMPORT armCOMM_IDCTPreScale
+ ;//
+ ;// Function prototype
+ ;//
+ ;// OMXResult
+ ;// omxVCM4P2_IDCT8x8blk(const OMX_S16* pSrc,
+ ;// OMX_S16* pDst)
+ ;//
+
+ IF CortexA8
+ M_ALLOC4 ppDest, 4
+ M_ALLOC4 pStride, 4
+ M_ALLOC8 pBlk, 2*8*8
+ ENDIF
+ ;// ppDest, pStride and pBlk are not referenced in the visible code;
+ ;// presumably M_IDCT uses them - confirm in armCOMM_IDCT_s.h
+ IF CortexA8
+ M_START omxVCM4P2_IDCT8x8blk, r11, d15
+ ENDIF
+
+ IF CortexA8
+
+;// Declare input registers
+pSrc RN 0
+pDst RN 1
+
+;// Declare other intermediate registers
+Result RN 0
+
+;// Prototype for macro M_IDCT
+;// pSrc RN 0 ;// source data buffer
+;// Stride RN 1 ;// destination stride in bytes
+;// pDest RN 2 ;// destination data buffer
+;// pScale RN 3 ;// pointer to scaling table
+ ;// Register names as listed in the M_IDCT prototype above; Stride shares r1 with pDst
+pSrc RN 0
+Stride RN 1
+pDest RN 2
+pScale RN 3
+ ;// NOTE(review): Stride (r1) is never assigned here; presumably the trailing 16 passed to M_IDCT is the stride - confirm
+ MOV pDest, pDst ;// move the destination pointer from r1 to r2
+ LDR pScale, =armCOMM_IDCTPreScale ;// address of the imported pre-scale table
+ M_IDCT s9, s16, 16
+ MOV Result, #OMX_Sts_NoErr ;// always reports success
+ M_END
+ ENDIF
+ ;// CortexA8
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
new file mode 100755
index 0000000..dd00df5
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
@@ -0,0 +1,444 @@
+;//
+;//
+;// File Name: omxVCM4P2_MCReconBlock_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;//
+;//
+
+;// Include standard headers
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Import symbols required from other files
+
+ M_VARIANTS CortexA8
+
+;// ***************************************************************************
+;// ARM1136JS implementation
+;// ***************************************************************************
+
+;// ***************************************************************************
+;// CortexA8 implementation
+;// ***************************************************************************
+ IF CortexA8
+;// ***************************************************************************
+;// MACRO DEFINITIONS
+;// ***************************************************************************
+ ;// Description:
+ ;// Does interpolation for the case of "IntegerPixel" predictType. Both
+ ;// rounding cases are handled. Just copies a block from pSrc to pDst
+ ;//
+ ;// Syntax:
+ ;// M_MCRECONBLOCK_IntegerPixel
+ ;//
+ ;// Inputs: None
+ ;// Outputs: None
+
+ MACRO
+ M_MCRECONBLOCK_IntegerPixel
+CaseIntegerPixel_Rnd0
+CaseIntegerPixel_Rnd1
+ ;// Both rounding labels land here. Load eight 8-byte rows, stepping pSrc by srcStep after each load
+ VLD1 dRow0, [pSrc], srcStep
+ VLD1 dRow1, [pSrc], srcStep
+ VLD1 dRow2, [pSrc], srcStep
+ VLD1 dRow3, [pSrc], srcStep
+ VLD1 dRow4, [pSrc], srcStep
+ VLD1 dRow5, [pSrc], srcStep
+ VLD1 dRow6, [pSrc], srcStep
+ VLD1 dRow7, [pSrc], srcStep
+ ;// Store the rows unchanged, stepping pDst by dstStep (@64 asserts 64-bit alignment of pDst)
+ VST1 dRow0, [pDst@64], dstStep
+ VST1 dRow1, [pDst@64], dstStep
+ VST1 dRow2, [pDst@64], dstStep
+ VST1 dRow3, [pDst@64], dstStep
+ VST1 dRow4, [pDst@64], dstStep
+ VST1 dRow5, [pDst@64], dstStep
+ VST1 dRow6, [pDst@64], dstStep
+ VST1 dRow7, [pDst@64], dstStep
+ ;// Skip the bodies of the remaining switch cases
+ B SwitchPredictTypeEnd
+ MEND
+;// ***************************************************************************
+ ;// Description:
+ ;// Does interpolation for the case of "HalfPixelX" predictType. The two
+ ;// rounding cases are handled by the parameter "$rndVal". Averages between
+ ;// a pixel and pixel right to it, rounding it based on $rndVal. The
+ ;// rounding is implemented by using opCode switching between "VRHADD" and
+ ;// "VHADD" instructions.
+ ;//
+ ;// Syntax:
+ ;// M_MCRECONBLOCK_HalfPixelX $rndVal
+ ;//
+ ;// Inputs:
+ ;// $rndVal: 0 for rounding and 1 for no rounding
+ ;// Outputs: None
+
+ MACRO
+ M_MCRECONBLOCK_HalfPixelX $rndVal
+ ;// Pick the averaging opcode at assembly time: VRHADD when the parameter is 0, VHADD otherwise
+ LCLS M_VHADDR
+ IF $rndVal = 0
+M_VHADDR SETS "VRHADD"
+ ELSE
+M_VHADDR SETS "VHADD"
+ ENDIF
+ ;// Entry label of this variant (CaseHalfPixelX_Rnd0 or CaseHalfPixelX_Rnd1)
+CaseHalfPixelX_Rnd$rndVal
+ ;// Per row: load 16 bytes, then VEXT #1 builds the 8-byte row that starts one pixel to the right
+ VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep
+ VEXT dRow0Shft, dRow0, dRow0Shft, #1
+ VLD1 {dRow1, dRow1Shft}, [pSrc], srcStep
+ VEXT dRow1Shft, dRow1, dRow1Shft, #1
+ VLD1 {dRow2, dRow2Shft}, [pSrc], srcStep
+ VEXT dRow2Shft, dRow2, dRow2Shft, #1
+ VLD1 {dRow3, dRow3Shft}, [pSrc], srcStep
+ VEXT dRow3Shft, dRow3, dRow3Shft, #1
+ VLD1 {dRow4, dRow4Shft}, [pSrc], srcStep
+ VEXT dRow4Shft, dRow4, dRow4Shft, #1
+ VLD1 {dRow5, dRow5Shft}, [pSrc], srcStep
+ VEXT dRow5Shft, dRow5, dRow5Shft, #1
+ VLD1 {dRow6, dRow6Shft}, [pSrc], srcStep
+ VEXT dRow6Shft, dRow6, dRow6Shft, #1
+ VLD1 {dRow7, dRow7Shft}, [pSrc], srcStep
+ VEXT dRow7Shft, dRow7, dRow7Shft, #1
+ $M_VHADDR dRow0, dRow0, dRow0Shft ;// average each pixel with its right-hand neighbour
+ $M_VHADDR dRow1, dRow1, dRow1Shft
+ VST1 dRow0, [pDst@64], dstStep ;// stores are interleaved with the averages of later rows
+ $M_VHADDR dRow2, dRow2, dRow2Shft
+ VST1 dRow1, [pDst@64], dstStep
+ $M_VHADDR dRow3, dRow3, dRow3Shft
+ VST1 dRow2, [pDst@64], dstStep
+ $M_VHADDR dRow4, dRow4, dRow4Shft
+ VST1 dRow3, [pDst@64], dstStep
+ $M_VHADDR dRow5, dRow5, dRow5Shft
+ VST1 dRow4, [pDst@64], dstStep
+ $M_VHADDR dRow6, dRow6, dRow6Shft
+ VST1 dRow5, [pDst@64], dstStep
+ $M_VHADDR dRow7, dRow7, dRow7Shft
+ VST1 dRow6, [pDst@64], dstStep
+ VST1 dRow7, [pDst@64], dstStep
+ ;// Skip the bodies of the remaining switch cases
+ B SwitchPredictTypeEnd
+ MEND
+;// ***************************************************************************
+ ;// Description:
+ ;// Does interpolation for the case of "HalfPixelY" predictType. The two
+ ;// rounding cases are handled by the parameter "$rndVal". Averages between
+ ;// a pixel and pixel below it, rounding it based on $rndVal. The
+ ;// rounding is implemented by using opCode switching between "VRHADD" and
+ ;// "VHADD" instructions.
+ ;//
+ ;// Syntax:
+ ;// M_MCRECONBLOCK_HalfPixelY $rndVal
+ ;//
+ ;// Inputs:
+ ;// $rndVal: 0 for rounding and 1 for no rounding
+ ;// Outputs: None
+
+ MACRO
+ M_MCRECONBLOCK_HalfPixelY $rndVal
+ ;// Pick the averaging opcode at assembly time: VRHADD when the parameter is 0, VHADD otherwise
+ LCLS M_VHADDR
+ IF $rndVal = 0
+M_VHADDR SETS "VRHADD"
+ ELSE
+M_VHADDR SETS "VHADD"
+ ENDIF
+ ;// Nine source rows feed eight output rows; the ninth (dRow8) is loaded after the first average
+CaseHalfPixelY_Rnd$rndVal
+ VLD1 dRow0, [pSrc], srcStep
+ VLD1 dRow1, [pSrc], srcStep
+ VLD1 dRow2, [pSrc], srcStep
+ VLD1 dRow3, [pSrc], srcStep
+ VLD1 dRow4, [pSrc], srcStep
+ VLD1 dRow5, [pSrc], srcStep
+ VLD1 dRow6, [pSrc], srcStep
+ VLD1 dRow7, [pSrc], srcStep
+ $M_VHADDR dRow0, dRow0, dRow1 ;// average each pixel with the pixel directly below it
+ VLD1 dRow8, [pSrc], srcStep ;// extra row below the block, used for output row 7
+ $M_VHADDR dRow1, dRow1, dRow2
+ VST1 dRow0, [pDst@64], dstStep
+ $M_VHADDR dRow2, dRow2, dRow3
+ VST1 dRow1, [pDst@64], dstStep
+ $M_VHADDR dRow3, dRow3, dRow4
+ VST1 dRow2, [pDst@64], dstStep
+ $M_VHADDR dRow4, dRow4, dRow5
+ VST1 dRow3, [pDst@64], dstStep
+ $M_VHADDR dRow5, dRow5, dRow6
+ VST1 dRow4, [pDst@64], dstStep
+ $M_VHADDR dRow6, dRow6, dRow7
+ VST1 dRow5, [pDst@64], dstStep
+ $M_VHADDR dRow7, dRow7, dRow8
+ VST1 dRow6, [pDst@64], dstStep
+ VST1 dRow7, [pDst@64], dstStep
+ ;// Skip the bodies of the remaining switch cases
+ B SwitchPredictTypeEnd
+ MEND
+;// ***************************************************************************
+ ;// Description:
+ ;// Does interpolation for the case of "HalfPixelXY" predictType. Both
+ ;// rounding cases are handled.
+ ;// Typical computation for a row goes like this
+ ;// 1. VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
+ ;// 2. VEXT dRow0Shft, dRow0, dRow0Shft, #1 ;// Generate the shifted row
+ ;// 3. VADDL qSum0, dRow0, dRow0Shft ;// Generate the sum of row and shifted row
+ ;// 5. VADD qSum0, qSum0, qSum1 ;// Add to the sum of next row (odd row sum has rounding value added to it)
+ ;// 6. VSHRN dRow0, qSum0, #2 ;// Divide by 4
+ ;// 7. VST1 dRow0, [pDst@64], dstStep ;// Store
+ ;// Odd rows undergo following computation after step 3
+ ;// 4. VADD qSum1, qSum1, qRound
+ ;// This saves for adding rounding value to each final sum (overall saves 4
+ ;// instructions).
+ ;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes
+ ;// care of this and also minimizes stalls. Rounding value was modified in
+ ;// ARM register rndVal (originally used for rounding flag) before the switch.
+ ;// It is then populated into all lanes in this macro. No branching out to
+ ;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
+ ;// are the last of switch cases.
+ ;//
+ ;// Syntax:
+ ;// M_MCRECONBLOCK_HalfPixelXY
+ ;//
+ ;// Inputs: None
+ ;// Outputs: None
+
+ MACRO
+ M_MCRECONBLOCK_HalfPixelXY
+ ;// Both rounding labels share this body; rndVal already holds the rounding constant (see the RSB before M_SWITCH)
+CaseHalfPixelXY_Rnd0
+CaseHalfPixelXY_Rnd1
+ VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep
+ VDUP qRound, rndVal ;// rounding constant in every 16-bit lane
+ VLD1 {dRow1, dRow1Shft}, [pSrc], srcStep
+ VEXT dRow0Shft, dRow0, dRow0Shft, #1 ;// row shifted by one pixel (right-hand neighbours)
+ VLD1 {dRow2, dRow2Shft}, [pSrc], srcStep
+ VEXT dRow1Shft, dRow1, dRow1Shft, #1
+ VLD1 {dRow3, dRow3Shft}, [pSrc], srcStep
+ VEXT dRow2Shft, dRow2, dRow2Shft, #1
+ VLD1 {dRow4, dRow4Shft}, [pSrc], srcStep
+ VADDL qSum0, dRow0, dRow0Shft ;// 16-bit sum of each pixel and its right-hand neighbour
+ VLD1 {dRow5, dRow5Shft}, [pSrc], srcStep
+ VADDL qSum1, dRow1, dRow1Shft
+ VLD1 {dRow6, dRow6Shft}, [pSrc], srcStep
+ VEXT dRow3Shft, dRow3, dRow3Shft, #1
+ VLD1 {dRow7, dRow7Shft}, [pSrc], srcStep
+ VEXT dRow4Shft, dRow4, dRow4Shft, #1
+ VLD1 {dRow8, dRow8Shft}, [pSrc], srcStep ;// ninth row, needed for output row 7
+ VADD qSum1, qSum1, qRound ;// odd row sums carry the rounding constant
+ VADDL qSum2, dRow2, dRow2Shft
+ VEXT dRow5Shft, dRow5, dRow5Shft, #1
+ VADD qSum0, qSum0, qSum1 ;// row 0 + row 1 (+ round): four-pixel sum for output row 0
+ VADDL qSum3, dRow3, dRow3Shft
+ VEXT dRow6Shft, dRow6, dRow6Shft, #1
+ VADD qSum1, qSum1, qSum2 ;// row 1 (+ round) + row 2: four-pixel sum for output row 1
+ VSHRN dRow0, qSum0, #2 ;// divide by 4 and narrow to 8 bits
+ VADDL qSum4, dRow4, dRow4Shft
+ VSHRN dRow1, qSum1, #2
+ VADD qSum3, qSum3, qRound
+ VADDL qSum5, dRow5, dRow5Shft
+ VST1 dRow0, [pDst@64], dstStep
+ VEXT dRow7Shft, dRow7, dRow7Shft, #1
+ VST1 dRow1, [pDst@64], dstStep
+ VEXT dRow8Shft, dRow8, dRow8Shft, #1
+ VADD qSum5, qSum5, qRound
+ VADD qSum2, qSum2, qSum3
+ VADD qSum3, qSum3, qSum4
+ VADD qSum4, qSum4, qSum5
+ VSHRN dRow2, qSum2, #2
+ VSHRN dRow3, qSum3, #2
+ VSHRN dRow4, qSum4, #2
+ VADDL qSum6, dRow6, dRow6Shft ;// qSum6 reuses Q0; dRow0 was stored above
+ VADDL qSum7, dRow7, dRow7Shft ;// qSum7 reuses Q1; dRow1 was stored above
+ VST1 dRow2, [pDst@64], dstStep
+ VADDL qSum8, dRow8, dRow8Shft ;// qSum8 reuses Q2; dRow2 was stored on the previous line
+ VADD qSum7, qSum7, qRound
+ VST1 dRow3, [pDst@64], dstStep
+ VST1 dRow4, [pDst@64], dstStep
+ VADD qSum5, qSum5, qSum6
+ VADD qSum6, qSum6, qSum7
+ VADD qSum7, qSum7, qSum8
+ VSHRN dRow5, qSum5, #2
+ VSHRN dRow6, qSum6, #2
+ VSHRN dRow7, qSum7, #2
+ VST1 dRow5, [pDst@64], dstStep
+ VST1 dRow6, [pDst@64], dstStep
+ VST1 dRow7, [pDst@64], dstStep
+ ;// Last case body: execution falls through to SwitchPredictTypeEnd, so no branch is needed
+ MEND
+;// ***************************************************************************
+
+;// Input/Output Registers
+pSrc RN 0
+srcStep RN 1
+pSrcResidue RN 2
+pDst RN 3
+dstStep RN 4 ;// r4-r6 are filled from stack arguments in the function body
+predictType RN 5
+rndVal RN 6
+
+;// Local Scratch Registers
+pDstCopy RN 0 ;// aliases pSrc (r0); written only in the residue stage, after the interpolation loads
+return RN 0
+
+;// Neon Registers
+dRow0 DN D0.U8 ;// dRowN holds one 8-byte row; dRowNShft receives the following 8 bytes of that row
+dRow0Shft DN D1.U8
+dRow1 DN D2.U8
+dRow1Shft DN D3.U8
+dRow2 DN D4.U8
+dRow2Shft DN D5.U8
+dRow3 DN D6.U8
+dRow3Shft DN D7.U8
+dRow4 DN D8.U8
+dRow4Shft DN D9.U8
+dRow5 DN D10.U8
+dRow5Shft DN D11.U8
+dRow6 DN D12.U8
+dRow6Shft DN D13.U8
+dRow7 DN D14.U8
+dRow7Shft DN D15.U8
+dRow8 DN D16.U8
+dRow8Shft DN D17.U8
+
+
+qSum0 QN Q9.U16
+qSum1 QN Q10.U16
+qSum2 QN Q11.U16
+qSum3 QN Q12.U16
+qSum4 QN Q13.U16
+qSum5 QN Q14.U16
+qSum6 QN Q0.U16 ;// Q0-Q2 overlap D0-D5 (dRow0..dRow2Shft)
+qSum7 QN Q1.U16
+qSum8 QN Q2.U16
+
+qRound QN Q15.U16
+
+dDst0 DN D0.U8 ;// residue stage: D0-D7 hold the eight predicted rows read back from pDst
+dDst1 DN D1.U8
+dDst2 DN D2.U8
+dDst3 DN D3.U8
+dDst4 DN D4.U8
+dDst5 DN D5.U8
+dDst6 DN D6.U8
+dDst7 DN D7.U8
+
+qRes0 QN Q4.S16 ;// residue stage: one row of eight signed 16-bit residues per Q register
+qRes1 QN Q5.S16
+qRes2 QN Q6.S16
+qRes3 QN Q7.S16
+qRes4 QN Q8.S16
+qRes5 QN Q9.S16 ;// Q9-Q11 are the same registers as qSum0-qSum2 above
+qRes6 QN Q10.S16
+qRes7 QN Q11.S16
+
+ ;// Function header
+ M_START omxVCM4P2_MCReconBlock, r6, d15
+ ;// Define stack arguments
+ M_ARG Arg_dstStep, 4
+ M_ARG Arg_predictType, 4
+ M_ARG Arg_rndVal, 4
+ ;// Load argument from the stack
+ M_LDR dstStep, Arg_dstStep
+ M_LDR predictType, Arg_predictType
+ M_LDR rndVal, Arg_rndVal
+ ADD predictType, rndVal, predictType, LSL #1 ;// switch index = 2*predictType + rndVal
+ RSB rndVal, rndVal, #2 ;// rndVal = 2 - rndVal: the constant HalfPixelXY adds before its >>2
+
+ ;// The following is implementation of switching to different code segments
+ ;// based on different predictType and rndVal flags. The corresponding
+ ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following
+ ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). While "M_MCRECONBLOCK_IntegerPixel"
+ ;// and "M_MCRECONBLOCK_HalfPixelXY" handle for both rounding cases;
+ ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" macros handle
+ ;// the two rounding cases in separate code bases.
+ ;// All these together implement the interpolation functionality
+
+ M_SWITCH predictType
+ M_CASE CaseIntegerPixel_Rnd0
+ M_CASE CaseIntegerPixel_Rnd1
+ M_CASE CaseHalfPixelX_Rnd0
+ M_CASE CaseHalfPixelX_Rnd1
+ M_CASE CaseHalfPixelY_Rnd0
+ M_CASE CaseHalfPixelY_Rnd1
+ M_CASE CaseHalfPixelXY_Rnd0
+ M_CASE CaseHalfPixelXY_Rnd1
+ M_ENDSWITCH
+ ;// The case order presumably maps to index 0..7 as computed above - TODO confirm against M_SWITCH
+ M_MCRECONBLOCK_IntegerPixel
+ M_MCRECONBLOCK_HalfPixelX 0
+ M_MCRECONBLOCK_HalfPixelX 1
+ M_MCRECONBLOCK_HalfPixelY 0
+ M_MCRECONBLOCK_HalfPixelY 1
+ M_MCRECONBLOCK_HalfPixelXY
+SwitchPredictTypeEnd
+
+ ;// After interpolation is done, residue needs to be added. This is done
+ ;// only in case "pSrcResidue" parameter to the function is not NULL.
+ ;// Following is a completely unrolled code to do so. Each row and
+ ;// corresponding residue is loaded and residue is added and value
+ ;// stored
+
+ CMP pSrcResidue, #0
+ SUBNE pDst, pDst, dstStep, LSL #3 ;// Rewind pDst by 8 rows to the start of the block just written
+ MOVNE pDstCopy, pDst ;// separate write pointer, so reads (pDst) can run ahead of the stores
+ BEQ pSrcResidueConditionEnd ;// NULL residue: the interpolated block is already the final output
+pSrcResidueNotNull
+ VLD1 dDst0, [pDst@64], dstStep
+ VLD1 qRes0, [pSrcResidue@128]! ;// residue rows are contiguous, 16 bytes each, 128-bit aligned
+ VLD1 dDst1, [pDst@64], dstStep
+ VLD1 qRes1, [pSrcResidue@128]!
+ VLD1 dDst2, [pDst@64], dstStep
+ VLD1 qRes2, [pSrcResidue@128]!
+ VADDW qRes0, qRes0, dDst0 ;// 16-bit residue + predicted pixel widened from 8 bits
+ VLD1 dDst3, [pDst@64], dstStep
+ VADDW qRes1, qRes1, dDst1
+ VLD1 qRes3, [pSrcResidue@128]!
+ VADDW qRes2, qRes2, dDst2
+ VLD1 dDst4, [pDst@64], dstStep
+ VQMOVUN dDst0, qRes0 ;// saturate the signed sum to unsigned 8 bits
+ VLD1 qRes4, [pSrcResidue@128]!
+ VADDW qRes3, qRes3, dDst3
+ VLD1 dDst5, [pDst@64], dstStep
+ VQMOVUN dDst1, qRes1
+ VLD1 qRes5, [pSrcResidue@128]!
+ VADDW qRes4, qRes4, dDst4
+ VLD1 dDst6, [pDst@64], dstStep
+ VQMOVUN dDst2, qRes2
+ VLD1 qRes6, [pSrcResidue@128]!
+ VADDW qRes5, qRes5, dDst5
+ VLD1 dDst7, [pDst@64], dstStep
+ VQMOVUN dDst3, qRes3
+ VLD1 qRes7, [pSrcResidue@128]!
+ VADDW qRes6, qRes6, dDst6
+ VST1 dDst0, [pDstCopy@64], dstStep ;// stores begin only after all eight rows have been read back
+ VQMOVUN dDst4, qRes4
+ VST1 dDst1, [pDstCopy@64], dstStep
+ VADDW qRes7, qRes7, dDst7
+ VST1 dDst2, [pDstCopy@64], dstStep
+ VQMOVUN dDst5, qRes5
+ VST1 dDst3, [pDstCopy@64], dstStep
+ VQMOVUN dDst6, qRes6
+ VST1 dDst4, [pDstCopy@64], dstStep
+ VQMOVUN dDst7, qRes7
+ VST1 dDst5, [pDstCopy@64], dstStep
+ VST1 dDst6, [pDstCopy@64], dstStep
+ VST1 dDst7, [pDstCopy@64], dstStep
+
+pSrcResidueConditionEnd
+ MOV return, #OMX_Sts_NoErr ;// always returns success; no argument checks are made here
+
+ M_END
+ ENDIF ;// CortexA8
+ END
+;// ***************************************************************************
+;// omxVCM4P2_MCReconBlock ends
+;// ***************************************************************************
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
new file mode 100755
index 0000000..a73f64a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
@@ -0,0 +1,320 @@
+; **********
+; *
+; * File Name: omxVCM4P2_PredictReconCoefIntra_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains module for DC/AC coefficient prediction
+; *
+; *
+; * Function: omxVCM4P2_PredictReconCoefIntra
+; *
+; * Description:
+; * Performs adaptive DC/AC coefficient prediction for an intra block. Prior
+; * to the function call, prediction direction (predDir) should be selected
+; * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] pSrcDst pointer to the coefficient buffer which contains the
+; * quantized coefficient residuals (PQF) of the current
+; * block; must be aligned on a 4-byte boundary. The
+; * output coefficients are saturated to the range
+; * [-2048, 2047].
+; * [in] pPredBufRow pointer to the coefficient row buffer; must be aligned
+; * on a 4-byte boundary.
+; * [in] pPredBufCol pointer to the coefficient column buffer; must be
+; * aligned on a 4-byte boundary.
+; * [in] curQP quantization parameter of the current block. curQP may
+; * equal predQP, especially when the current block and
+; * the predictor block are in the same macroblock.
+; * [in] predQP quantization parameter of the predictor block
+; * [in] predDir indicates the prediction direction which takes one
+; * of the following values:
+; * OMX_VIDEO_HORIZONTAL predict horizontally
+; * OMX_VIDEO_VERTICAL predict vertically
+; * [in] ACPredFlag a flag indicating if AC prediction should be
+; * performed. It is equal to ac_pred_flag in the bit
+; * stream syntax of MPEG-4
+; * [in] videoComp video component type (luminance, chrominance or
+; * alpha) of the current block
+; * [out] pSrcDst pointer to the coefficient buffer which contains
+; * the quantized coefficients (QF) of the current
+; * block
+; * [out] pPredBufRow pointer to the updated coefficient row buffer
+; * [out] pPredBufCol pointer to the updated coefficient column buffer
+; * Return Value:
+; * OMX_Sts_NoErr - no error
+; * OMX_Sts_BadArgErr - Bad arguments
+; * - At least one of the pointers is NULL: pSrcDst, pPredBufRow, or pPredBufCol.
+; * - At least one of the following cases: curQP <= 0, predQP <= 0, curQP > 31,
+; * predQP > 31, predDir is outside [1,2].
+; * - At least one of the pointers pSrcDst, pPredBufRow, or pPredBufCol is not
+; * 4-byte aligned.
+; *
+; *********
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+
+
+ IMPORT armVCM4P2_Reciprocal_QP_S32
+ IMPORT armVCM4P2_Reciprocal_QP_S16
+ IMPORT armVCM4P2_DCScaler
+
+ IF CortexA8
+;// Input Arguments (r4-r7 are filled from stack arguments in the function body)
+
+pSrcDst RN 0
+pPredBufRow RN 1
+pPredBufCol RN 2
+curQP RN 3
+QP RN 3 ;// second name for curQP (same register)
+predQP RN 4
+predDir RN 5
+ACPredFlag RN 6
+videoComp RN 7
+
+;// Local Variables (several names share a register; each is live in a different phase)
+
+shortVideoHeader RN 4 ;// declared but not referenced by the code of this function
+dcScaler RN 4 ;// r4 holds dcScaler during DC prediction, predQP afterwards
+index RN 6
+predCoeffTable RN 7 ;// r7 is reused once videoComp has been consumed
+temp1 RN 6
+temp2 RN 9
+temp RN 14 ;// r14 (lr) used as scratch; presumably saved/restored by M_START/M_END - TODO confirm
+Const RN 8
+temppPredColBuf RN 8
+tempPred RN 9
+
+absCoeffDC RN 8
+negdcScaler RN 10
+Rem RN 11
+temp3 RN 12
+
+dcRowbufCoeff RN 10 ;// shares r10 with negdcScaler
+dcColBuffCoeff RN 11 ;// shares r11 with Rem
+Return RN 0
+
+;//NEON Registers
+
+qPredRowBuf QN Q0.S16
+dPredRowBuf0 DN D0.S16
+dPredRowBuf1 DN D1.S16
+
+
+
+
+qCoeffTab QN Q1.S32
+
+qPredQP QN Q2.S16
+dPredQP0 DN D4.S16 ;// predQP replicated in every lane
+dPredQP1 DN D5.S16 ;// used as a temporary for the first four narrowed predictions
+
+
+qtemp1 QN Q3.S32 ;// qtemp1/qtemp are 32-bit and 16-bit views of the same register Q3
+qtemp QN Q3.S16
+
+dtemp0 DN D6.S16 ;// dtemp0/dtemp1 are the two halves of Q3
+dtemp1 DN D7.S16
+
+dtemp2 DN D8.S16
+dtemp3 DN D9.S16
+
+dtemp4 DN D2.S16 ;// dtemp4-dtemp7 overlay Q1/Q2 (qCoeffTab, qPredQP); used in the horizontal path after those are consumed
+dtemp5 DN D3.S16
+dtemp6 DN D4.S16
+dtemp7 DN D5.S16
+
+qtempPred1 QN Q5.S32 ;// qtempPred1/qtempPred are 32-bit and 16-bit views of the same register Q5
+qtempPred QN Q5.S16
+
+dtempPred0 DN D10.S16 ;// dtempPred0/dtempPred1 are the two halves of Q5
+dtempPred1 DN D11.S16
+
+
+
+ M_START omxVCM4P2_PredictReconCoefIntra,r11,d11
+
+ ;// Assigning pointers to Input arguments on Stack
+
+ M_ARG predQPonStack,4
+ M_ARG predDironStack,4
+ M_ARG ACPredFlagonStack,4
+ M_ARG videoComponStack,4
+
+ ;// DC Prediction
+
+ M_LDR videoComp,videoComponStack ;// Load videoComp From Stack
+
+ M_LDR predDir,predDironStack ;// Load Prediction direction
+ ;// DC Scaler calculation
+ LDR index, =armVCM4P2_DCScaler
+ ADD index,index,videoComp,LSL #5 ;// 32 table bytes per videoComp value
+ LDRB dcScaler,[index,QP] ;// dcScaler = byte at table[32*videoComp + QP]
+
+
+ LDR predCoeffTable, =armVCM4P2_Reciprocal_QP_S16 ;// Loading the table with entries 32767/(1 to 63)
+ CMP predDir,#2 ;// Check if the Prediction direction is vertical
+
+ ;// Calculate tempPred
+
+ LDREQSH absCoeffDC,[pPredBufRow] ;// If vertical, load the coeff from Row Prediction Buffer
+ LDRNESH absCoeffDC,[pPredBufCol] ;// If horizontal, load the coeff from column Prediction Buffer
+
+ RSB negdcScaler,dcScaler,#0 ;// negdcScaler=-dcScaler
+ MOV temp1,absCoeffDC ;// Keep the signed prediction coeff in temp1 for the sign tests
+ CMP temp1,#0
+ RSBLT absCoeffDC,temp1,#0 ;// calculate absolute val of prediction coeff
+
+ ADD temp,dcScaler,dcScaler ;// byte offset of the 16-bit table entry for dcScaler
+ LDRH temp,[predCoeffTable,temp] ;// Load value from coeff table for performing division using multiplication
+ SMULBB tempPred,temp,absCoeffDC ;// tempPred=abs(pPredBufRow(Col)[0])*32767/dcScaler
+ ADD temp3,dcScaler,#1
+ LSR tempPred,tempPred,#15 ;// tempPred=abs(pPredBufRow(Col)[0])/dcScaler
+ LSR temp3,temp3,#1 ;// temp3=round(dcScaler/2)
+ MLA Rem,negdcScaler,tempPred,absCoeffDC ;// Remainder Rem=abs(pPredBufRow(Col)[0])-tempPred*dcScaler
+
+ LDRH dcRowbufCoeff,[pPredBufCol] ;// read element 0 of the column buffer before it is replaced below
+
+ CMP Rem,temp3 ;// compare Rem with (dcScaler/2)
+ ADDGE tempPred,#1 ;// tempPred=tempPred+1 if Rem>=(dcScaler/2)
+ CMP temp1,#0
+ RSBLT tempPred,tempPred,#0 ;// tempPred=-tempPred if the prediction coeff was negative
+
+ STRH dcRowbufCoeff,[pPredBufRow,#-16] ;// copy it 16 bytes below pPredBufRow (presumably the previous block's slot - TODO confirm)
+
+
+ LDRH temp,[pSrcDst] ;// temp=pSrcDst[0]
+ ADD temp,temp,tempPred ;// temp=pSrcDst[0]+tempPred
+ SSAT16 temp,#12,temp ;// clip temp to [-2048,2047]
+ SMULBB dcColBuffCoeff,temp,dcScaler ;// dcColBuffCoeff=clipped(pSrcDst[0])*dcScaler
+ M_LDR ACPredFlag,ACPredFlagonStack
+ STRH dcColBuffCoeff,[pPredBufCol] ;// becomes element 0 of the column buffer
+
+
+ ;// AC Prediction
+
+ M_LDR predQP,predQPonStack ;// r4 is reloaded as predQP; dcScaler is no longer needed
+
+ CMP ACPredFlag,#1 ;// Check if the AC prediction flag is set or not
+ BNE Exit ;// If not set Exit
+ CMP predDir,#2 ;// Check the Prediction direction
+ LDR predCoeffTable, =armVCM4P2_Reciprocal_QP_S32 ;// Loading the table with entries 0x1ffff/(1 to 63)
+ MOV Const,#4
+ MUL curQP,curQP,Const ;// curQP=4*curQP
+ VDUP dPredQP0,predQP ;// predQP in every 16-bit lane
+ LDR temp2,[predCoeffTable,curQP] ;// temp2=0x1ffff/curQP
+ VDUP qCoeffTab,temp2 ;// reciprocal in every 32-bit lane
+ BNE Horizontal ;// If the Prediction direction is horizontal branch to Horizontal
+
+
+
+ ;// Vertical
+ ;//Calculating tempPred
+
+ VLD1 {dPredRowBuf0,dPredRowBuf1},[pPredBufRow] ;// Loading pPredBufRow[i]:i=0 to 7
+
+ VMULL qtemp1,dPredRowBuf0,dPredQP0 ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i]: i=0 to 3
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=0 to 3
+
+ VMULL qtemp1,dPredRowBuf1,dPredQP0 ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i] : i=4 to 7
+
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP) : i=0 to 3
+ VSHRN dPredQP1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits (parked in dPredQP1)
+
+
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=4 to 7
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP) : i=4 to 7
+ VLD1 {dtemp0,dtemp1},[pSrcDst] ;//Loading pSrcDst[i] : i=0 to 7
+ VSHRN dtempPred1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits
+ VMOV dtempPred0,dPredQP1 ;// qtempPred now holds all eight 16-bit predictions
+
+ ;//updating source and row prediction buffer contents
+ VADD qtemp,qtemp,qtempPred ;//pSrcDst[i]=pSrcDst[i]+qtempPred[i]: i=0 to 7
+ VQSHL qtemp,qtemp,#4 ;//Clip to [-2048,2047]
+ LDRH dcRowbufCoeff,[pPredBufRow] ;//Loading Dc Value of Row Prediction buffer
+ VSHR qtemp,qtemp,#4
+
+ VST1 {dtemp0,dtemp1},[pSrcDst] ;//storing back the updated values
+ VST1 {dtemp0,dtemp1},[pPredBufRow] ;//storing back the updated row prediction values
+ STRH dcRowbufCoeff,[pPredBufRow] ;// restore element 0 of the row buffer, which the store above overwrote
+
+ B Exit
+
+Horizontal
+
+ ;// Calculating tempPred
+
+
+
+ VLD1 {dPredRowBuf0,dPredRowBuf1},[pPredBufCol] ;// Loading pPredBufCol[i]:i=0 to 7
+ VMULL qtemp1,dPredRowBuf0,dPredQP0 ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i]: i=0 to 3
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=0 to 3
+
+ VMULL qtemp1,dPredRowBuf1,dPredQP0 ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i] : i=4 to 7
+
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP) : i=0 to 3
+ VSHRN dPredQP1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits (parked in dPredQP1)
+
+
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=4 to 7
+
+ MOV temppPredColBuf,pPredBufCol ;// keep the buffer start; pPredBufCol is advanced by the stores below
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP) : i=4 to 7
+ VLD4 {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst] ;// Loading coefficients Interleaving by 4 (dtemp0 = elements 0,4,8,12)
+ VSHRN dtempPred1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits
+ VMOV dtempPred0,dPredQP1
+
+ ;// Updating source and column prediction buffer contents
+ ADD temp2,pSrcDst,#32
+ VLD4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2] ;// Loading next 16 coefficients Interleaving by 4
+ VUZP dtemp0,dtemp4 ;// Interleaving by 8: dtemp0 = elements 0,8,16,24 (first column, rows 0 to 3)
+ VADD dtemp0,dtemp0,dtempPred0 ;// Adding tempPred to coeffs
+ VQSHL dtemp0,dtemp0,#4 ;// Clip to [-2048,2047]
+ VSHR dtemp0,dtemp0,#4
+ VST1 {dtemp0},[pPredBufCol]! ;// Updating Prediction column buffer
+ VZIP dtemp0,dtemp4 ;// undo the VUZP so the VST4 pair writes elements back in place
+ VST4 {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst] ;// Updating source coeffs
+ VST4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]! ;// writeback leaves temp2 = pSrcDst+64
+
+ MOV temp1,temp2 ;// temp1 = pSrcDst+64, store address for the second half
+ VLD4 {dtemp0,dtemp1,dtemp2,dtemp3},[temp2]! ;// Loading coefficients Interleaving by 4
+
+ VLD4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]
+ VUZP dtemp0,dtemp4 ;// Interleaving by 8: dtemp0 = elements 32,40,48,56 (first column, rows 4 to 7)
+ VADD dtemp0,dtemp0,dtempPred1
+ VQSHL dtemp0,dtemp0,#4 ;// Clip to [-2048,2047]
+ VSHR dtemp0,dtemp0,#4
+ VST1 {dtemp0},[pPredBufCol]!
+ VZIP dtemp0,dtemp4
+ VST4 {dtemp0,dtemp1,dtemp2,dtemp3},[temp1]
+ STRH dcColBuffCoeff,[temppPredColBuf] ;// restore element 0 of the column buffer, overwritten by the first VST1 above
+ VST4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]
+
+Exit
+
+ STRH temp,[pSrcDst] ;// write the clipped DC coefficient computed in the DC prediction stage
+
+
+ MOV Return,#OMX_Sts_NoErr ;// always returns success; no argument checks are made here
+
+ M_END
+ ENDIF
+
+
+ END
+
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
new file mode 100755
index 0000000..bd0ad1f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvInter_I_s.s
@@ -0,0 +1,162 @@
+;/**
+; *
+; * File Name: omxVCM4P2_QuantInvInter_I_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains modules for inter reconstruction
+; *
+; *
+; *
+; *
+; *
+; * Function: omxVCM4P2_QuantInvInter_I
+; *
+; * Description:
+; * Performs inverse quantization on intra/inter coded block.
+; * This function supports bits_per_pixel = 8. Mismatch control
+; * is performed for the first MPEG-4 mode inverse quantization method.
+; * The output coefficients are clipped to the range: [-2048, 2047].
+; * Mismatch control is performed for the first inverse quantization method.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] pSrcDst pointer to the input (quantized) intra/inter block. Must be 16-byte aligned.
+; * [in] QP quantization parameter (quantiser_scale)
+; * [in] videoComp (Intra version only.) Video component type of the
+; * current block. Takes one of the following flags:
+; * OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE,
+; * OMX_VC_ALPHA.
+; * [in] shortVideoHeader a flag indicating presence of short_video_header;
+; * shortVideoHeader==1 selects linear intra DC mode,
+; * and shortVideoHeader==0 selects nonlinear intra DC mode.
+; * [out] pSrcDst pointer to the output (dequantized) intra/inter block. Must be 16-byte aligned.
+; *
+; * Return Value:
+; * OMX_Sts_NoErr - no error
+; * OMX_Sts_BadArgErr - bad arguments
+; * - If pSrcDst is NULL or is not 16-byte aligned.
+; * or
+; * - If QP <= 0.
+; * or
+; * - videoComp is none of OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE and OMX_VC_ALPHA.
+; *
+; */
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+ IF CortexA8
+
+
+;//Input Arguments
+pSrcDst RN 0
+QP RN 1
+
+
+;//Local Variables
+Count RN 3
+doubleQP RN 4
+Return RN 0 ;// shares r0 with pSrcDst
+;// Neon registers
+
+
+dQP10 DN D0.S32[0] ;// lane 0 of D0, used to move QP1 from the ARM register file
+qQP1 QN Q0.S32
+
+dQP1 DN D0.S16 ;// QP1 in every 16-bit lane
+dMinusQP1 DN D1.S16 ;// -QP1 in every 16-bit lane
+
+dCoeff0 DN D2.S16 ;// coefficients 0-3 of the current group of eight
+dCoeff1 DN D3.S16 ;// coefficients 4-7 of the current group of eight
+
+qResult0 QN Q3.S32 ;// qResult0 and qSign0 name the same register (Q3 = D6:D7)
+dResult0 DN D7.S16
+qSign0 QN Q3.S32
+dSign0 DN D6.S16
+
+qResult1 QN Q4.S32 ;// qResult1 and qSign1 name the same register (Q4 = D8:D9)
+dResult1 DN D8.S16
+qSign1 QN Q4.S32
+dSign1 DN D8.S16
+
+d2QP0 DN D10.S32[0] ;// lane 0 of D10, used to move 2*QP from the ARM register file
+q2QP0 QN Q5.S32
+d2QP DN D10.S16 ;// 2*QP in every 16-bit lane
+
+dZero0 DN D11.S16 ;// per-lane mask: coefficient == 0
+dZero1 DN D12.S16
+dConst0 DN D13.S16 ;// all-zero constant
+
+
+ M_START omxVCM4P2_QuantInvInter_I,r4,d13
+ ;// In-place inverse quantisation of the 64 16-bit coefficients at pSrcDst (r0) with QP (r1).
+ ;// Each non-zero coefficient becomes clip(2*QP*coeff + sign(coeff)*QP1) in [-2048,2047], QP1 = QP if QP is odd, else QP-1.
+ ;// Zero coefficients stay zero. Returns OMX_Sts_NoErr; no argument checks are made here.
+ ADD doubleQP,QP,QP ;// doubleQP= 2*QP
+ VMOV d2QP0,doubleQP
+ VDUP q2QP0,d2QP0 ;// Replicate doubleQP into every 32-bit lane
+ TST QP,#1
+ ;// (coefficients are loaded at the top of Loop, so nothing is read beyond the 64th coefficient)
+ SUBEQ QP,QP,#1
+ VMOV dQP10,QP ;// If QP is even then QP1=QP-1 else QP1=QP
+ MOV Count,#64
+ VDUP qQP1,dQP10 ;// Replicate QP1 into every 32-bit lane
+ VSHRN d2QP,q2QP0,#0 ;// doubleQP narrowed to 16 bits
+ VEOR dConst0,dConst0,dConst0 ;// dConst0=0
+ VSHRN dQP1,qQP1,#0 ;// QP1 truncated to 16 bits
+ VSUB dMinusQP1,dConst0,dQP1 ;// dMinusQP1=-QP1
+
+Loop
+ VLD1 {dCoeff0,dCoeff1},[pSrcDst] ;// Load the next 8 coefficients to Coeff0,Coeff1
+ ;//Performing Inverse Quantization
+
+ VCLT dSign0,dCoeff0, #0 ;// Compare Coefficient 0 against 0
+ VCLT dSign1,dCoeff1, #0 ;// Compare Coefficient 1 against 0
+ VCEQ dZero0,dCoeff0,#0 ;// Compare Coefficient 0 against zero
+ VBSL dSign0,dMinusQP1,dQP1 ;// dSign0 = -QP1 if Coeff0< 0 else QP1
+ VCEQ dZero1,dCoeff1,#0 ;// Compare Coefficient 1 against zero
+ VBSL dSign1,dMinusQP1,dQP1 ;// dSign1 = -QP1 if Coeff1< 0 else QP1
+ VMOVL qSign0,dSign0 ;// Sign extend qSign0 to 32 bits
+ VMOVL qSign1,dSign1
+ VMLAL qResult0,dCoeff0,d2QP ;// qResult0[i]= -QP1 + 2*QP*Coeff0[i] if Coeff0[i] <0
+ ;// qResult0[i]=  QP1 + 2*QP*Coeff0[i] if Coeff0[i] >=0
+ VMLAL qResult1,dCoeff1,d2QP ;// qResult1[i]= -QP1 + 2*QP*Coeff1[i] if Coeff1[i] <0
+ ;// qResult1[i]=  QP1 + 2*QP*Coeff1[i] if Coeff1[i] >=0
+ ;// Clip Result to [-2048,2047]
+
+ VQSHL qResult0,qResult0,#20 ;// clip to [-2048,2047]
+ VQSHL qResult1,qResult1,#20
+
+ VSHR qResult0,qResult0,#4
+ VSHR qResult1,qResult1,#4
+ VSHRN dResult0,qResult0,#16 ;// Narrow the clipped Value to Halfword
+ VSHRN dResult1,qResult1,#16
+ VBIT dResult0,dConst0,dZero0 ;// Force the result to 0 where the input coefficient was 0
+ VBIT dResult1,dConst0,dZero1
+
+ VST1 {dResult0,dResult1},[pSrcDst]! ;// Store the result
+ SUBS Count,Count,#8
+ ;// pSrcDst was advanced by the store and now addresses the next 8 coefficients
+
+
+ BGT Loop
+
+ MOV Return,#OMX_Sts_NoErr
+
+
+ M_END
+ ENDIF
+
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
new file mode 100755
index 0000000..e00591f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_QuantInvIntra_I_s.s
@@ -0,0 +1,210 @@
+;/**
+; *
+; * File Name: omxVCM4P2_QuantInvIntra_I_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains modules for intra reconstruction
+; *
+; *
+; *
+; *
+; *
+; *
+; * Function: omxVCM4P2_QuantInvIntra_I
+; *
+; * Description:
+; * Performs inverse quantization on intra/inter coded block.
+; * This function supports bits_per_pixel = 8. Mismatch control
+; * is performed for the first MPEG-4 mode inverse quantization method.
+; * The output coefficients are clipped to the range: [-2048, 2047].
+; * Mismatch control is performed for the first inverse quantization method.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] pSrcDst pointer to the input (quantized) intra/inter block. Must be 16-byte aligned.
+; * [in] QP quantization parameter (quantiser_scale)
+; * [in] videoComp (Intra version only.) Video component type of the
+; * current block. Takes one of the following flags:
+; * OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE,
+; * OMX_VC_ALPHA.
+; * [in] shortVideoHeader a flag indicating presence of short_video_header;
+; * shortVideoHeader==1 selects linear intra DC mode,
+; * and shortVideoHeader==0 selects nonlinear intra DC mode.
+; * [out] pSrcDst pointer to the output (dequantized) intra/inter block. Must be 16-byte aligned.
+; *
+; * Return Value:
+; * OMX_Sts_NoErr - no error
+; * OMX_Sts_BadArgErr - bad arguments
+; * - If pSrcDst is NULL or is not 16-byte aligned.
+; * or
+; * - If QP <= 0.
+; * or
+; * - videoComp is none of OMX_VC_LUMINANCE, OMX_VC_CHROMINANCE and OMX_VC_ALPHA.
+; *
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+
+ IMPORT armVCM4P2_DCScaler
+
+ IF CortexA8
+
+
+;//Input Arguments
+pSrcDst RN 0
+QP RN 1
+videoComp RN 2
+shortVideoHeader RN 3
+
+
+;//Local Variables
+
+dcScaler RN 4 ;// r4 holds dcScaler for the DC coefficient, then doubleQP for the AC loop
+temp RN 14 ;// r14 (lr) used as scratch; presumably saved/restored by M_START/M_END - TODO confirm
+index RN 5 ;// r5 holds the table address, then the loop counter
+
+
+Count RN 5
+doubleQP RN 4
+Return RN 0 ;// shares r0 with pSrcDst
+
+
+;// Neon registers
+
+
+dQP10 DN D0.S32[0] ;// lane 0 of D0, used to move QP1 from the ARM register file
+qQP1 QN Q0.S32
+
+dQP1 DN D0.S16 ;// QP1 in every 16-bit lane
+dMinusQP1 DN D1.S16 ;// -QP1 in every 16-bit lane
+
+dCoeff0 DN D2.S16 ;// coefficients 0-3 of the current group of eight
+dCoeff1 DN D3.S16 ;// coefficients 4-7 of the current group of eight
+
+qResult0 QN Q3.S32 ;// qResult0 and qSign0 name the same register (Q3 = D6:D7)
+dResult0 DN D7.S16
+qSign0 QN Q3.S32
+dSign0 DN D6.S16
+
+qResult1 QN Q4.S32 ;// qResult1 and qSign1 name the same register (Q4 = D8:D9)
+dResult1 DN D8.S16
+qSign1 QN Q4.S32
+dSign1 DN D8.S16
+
+d2QP0 DN D10.S32[0] ;// lane 0 of D10, used to move 2*QP from the ARM register file
+q2QP0 QN Q5.S32
+d2QP DN D10.S16 ;// 2*QP in every 16-bit lane
+
+dZero0 DN D11.S16 ;// per-lane mask: coefficient == 0
+dZero1 DN D4.S16
+dConst0 DN D5.S16 ;// all-zero constant
+
+
+
+
+
+
+ M_START omxVCM4P2_QuantInvIntra_I,r5,d11
+ ;// In-place inverse quantisation of the 64 16-bit coefficients at pSrcDst (r0): element 0 is scaled by dcScaler,
+ ;// the others by the AC rule in Loop. Returns OMX_Sts_NoErr; no argument checks are made here.
+ ;// Perform Inverse Quantization for DC coefficient
+
+ TEQ shortVideoHeader,#0 ;// Test if short Video Header flag =0
+ MOVNE dcScaler,#8 ;// if shortVideoHeader is non zero dcScaler=8
+ BNE calDCVal
+
+ LDR index, =armVCM4P2_DCScaler
+ ADD index,index,videoComp,LSL #5 ;// 32 table bytes per videoComp value
+ LDRB dcScaler,[index,QP] ;// dcScaler = byte at table[32*videoComp + QP]
+
+ ;//M_CalDCScalar shortVideoHeader,videoComp, QP
+
+calDCVal
+
+ LDRH temp,[pSrcDst]
+ SMULBB temp,temp,dcScaler ;// dcCoeff = dcScaler * Quantized DC coefficient(from memory)
+ SSAT temp,#12,temp ;// Saturating to 12 bits
+
+
+
+ ;// Perform Inverse Quantization for Ac Coefficients
+ ;// Element 0 also passes through the vector loop; it is overwritten with the DC result after the loop
+
+
+ ADD doubleQP,QP,QP ;// doubleQP= 2*QP
+ VMOV d2QP0,doubleQP
+ VDUP q2QP0,d2QP0 ;// Replicate doubleQP into every 32-bit lane
+ TST QP,#1
+ ;// (coefficients are loaded at the top of Loop, so nothing is read beyond the 64th coefficient)
+ SUBEQ QP,QP,#1
+ VMOV dQP10,QP ;// If QP is even then QP1=QP-1 else QP1=QP
+ MOV Count,#64
+ VDUP qQP1,dQP10 ;// Replicate QP1 into every 32-bit lane
+ VSHRN d2QP,q2QP0,#0 ;// doubleQP narrowed to 16 bits
+ VEOR dConst0,dConst0,dConst0 ;// dConst0=0
+ VSHRN dQP1,qQP1,#0 ;// QP1 truncated to 16 bits
+ VSUB dMinusQP1,dConst0,dQP1 ;// dMinusQP1=-QP1
+
+Loop
+ VLD1 {dCoeff0,dCoeff1},[pSrcDst] ;// Load the next 8 coefficients to Coeff0,Coeff1
+ ;//Performing Inverse Quantization
+
+ VCLT dSign0,dCoeff0, #0 ;// Compare Coefficient 0 against 0
+ VCLT dSign1,dCoeff1, #0 ;// Compare Coefficient 1 against 0
+ VCEQ dZero0,dCoeff0,#0 ;// Compare Coefficient 0 against zero
+ VBSL dSign0,dMinusQP1,dQP1 ;// dSign0 = -QP1 if Coeff0< 0 else QP1
+ VCEQ dZero1,dCoeff1,#0 ;// Compare Coefficient 1 against zero
+ VBSL dSign1,dMinusQP1,dQP1 ;// dSign1 = -QP1 if Coeff1< 0 else QP1
+ VMOVL qSign0,dSign0 ;// Sign extend qSign0 to 32 bits
+ VMOVL qSign1,dSign1
+ VMLAL qResult0,dCoeff0,d2QP ;// qResult0[i]= -QP1 + 2*QP*Coeff0[i] if Coeff0[i] <0
+ ;// qResult0[i]=  QP1 + 2*QP*Coeff0[i] if Coeff0[i] >=0
+ VMLAL qResult1,dCoeff1,d2QP ;// qResult1[i]= -QP1 + 2*QP*Coeff1[i] if Coeff1[i] <0
+ ;// qResult1[i]=  QP1 + 2*QP*Coeff1[i] if Coeff1[i] >=0
+ ;// Clip Result to [-2048,2047]
+
+ VQSHL qResult0,qResult0,#20 ;// clip to [-2048,2047]
+ VQSHL qResult1,qResult1,#20
+
+ VSHR qResult0,qResult0,#4
+ VSHR qResult1,qResult1,#4
+ VSHRN dResult0,qResult0,#16 ;// Narrow the clipped Value to Halfword
+ VSHRN dResult1,qResult1,#16
+ VBIT dResult0,dConst0,dZero0 ;// Force the result to 0 where the input coefficient was 0
+ VBIT dResult1,dConst0,dZero1
+
+ VST1 {dResult0,dResult1},[pSrcDst]! ;// Store the result
+ SUBS Count,Count,#8
+ ;// pSrcDst was advanced by the store and now addresses the next 8 coefficients
+
+
+ BGT Loop
+
+ SUB pSrcDst,pSrcDst,#128 ;// Rewind to the start of the block (8 stores of 16 bytes)
+
+ ;// Store the Inverse quantized Dc coefficient
+
+ STRH temp,[pSrcDst],#2
+
+ MOV Return,#OMX_Sts_NoErr
+
+
+
+ M_END
+ ENDIF
+
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c
new file mode 100755
index 0000000..5d93681
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/src/armVC_Version.c
@@ -0,0 +1,6 @@
+#include "omxtypes.h"
+#include "armCOMM_Version.h"
+
+#ifdef ARM_INCLUDE_VERSION_DESCRIPTIONS /* the version string is compiled in only when the build defines this macro */
+const char * const omxVC_VersionDescription = "ARM OpenMAX DL v" ARM_VERSION_STRING " Rel=" OMX_ARM_RELEASE_TAG " Arch=" OMX_ARM_BUILD_ARCHITECTURE " Tools=" OMX_ARM_BUILD_TOOLCHAIN ; /* adjacent literals are concatenated; the four macros must expand to string literals (presumably from armCOMM_Version.h - TODO confirm) */
+#endif /* ARM_INCLUDE_VERSION_DESCRIPTIONS */