 media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s | 531 +++
 1 file changed, 531 insertions(+), 0 deletions(-)
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
new file mode 100755
index 0000000..39eb8a4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
@@ -0,0 +1,531 @@
+;//
+;//
+;// File Name: omxVCM4P10_PredictIntra_4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Define the processor variants supported by this file
+
+ M_VARIANTS CortexA8
+
+;//-------------------------------------------------------
+;// This table implements the C switch-case in asm by
+;// the method of two levels of indexing.
+;//-------------------------------------------------------
+
+ M_TABLE armVCM4P10_pSwitchTable4x4
+ DCD OMX_VC_4x4_VERT, OMX_VC_4x4_HOR
+ DCD OMX_VC_4x4_DC, OMX_VC_4x4_DIAG_DL
+ DCD OMX_VC_4x4_DIAG_DR, OMX_VC_4x4_VR
+ DCD OMX_VC_4x4_HD, OMX_VC_4x4_VL
+ DCD OMX_VC_4x4_HU
+
+
+ IF CortexA8
+
+;//--------------------------------------------
+;// Scratch variable
+;//--------------------------------------------
+return RN 0
+pTable RN 8
+pc RN 15
+
+;//--------------------------------------------
+;// Declare input registers
+;//--------------------------------------------
+pSrcLeft RN 0 ;// input pointer
+pSrcAbove RN 1 ;// input pointer
+pSrcAboveLeft RN 2 ;// input pointer
+pDst RN 3 ;// output pointer
+leftStep RN 4 ;// input variable
+dstStep RN 5 ;// input variable
+predMode RN 6 ;// input variable
+availability RN 7 ;// input variable
+pDst1 RN 1
+pDst2 RN 4
+pDst3 RN 6
+
+pSrcTmp RN 9
+srcStep RN 10
+pDstTmp RN 11
+dstep RN 12
+
+;//-------------------
+;// Neon registers
+;//-------------------
+
+;// OMX_VC_4x4_VERT
+dAboveU32 DN D0.U32
+
+;// OMX_VC_4x4_HOR
+dLeftVal0 DN D0.8
+dLeftVal1 DN D1.8
+dLeftVal2 DN D2.8
+dLeftVal3 DN D3.8
+dLeftVal0U32 DN D0.U32
+dLeftVal1U32 DN D1.U32
+dLeftVal2U32 DN D2.U32
+dLeftVal3U32 DN D3.U32
+
+;// OMX_VC_4x4_DC
+dLeftVal DN D0.U8
+dLeftValU32 DN D0.U32
+dSumAboveLeftU16 DN D1.U16
+dSumAboveLeftU32 DN D1.U32
+dSumAboveLeftU64 DN D1.U64
+dSumAboveLeftU8 DN D1.U8
+dSum DN D0.U8
+
+dSumLeftValU16 DN D1.U16
+dSumLeftValU32 DN D1.U32
+dSumLeftValU64 DN D1.U64
+dSumLeftValU8 DN D1.U8
+
+dAboveVal DN D0.U8
+dSumAboveValU16 DN D1.U16
+dSumAboveValU32 DN D1.U32
+dSumAboveValU64 DN D1.U64
+dSumAboveValU8 DN D1.U8
+dConst128U8 DN D0.U8
+
+
+;//OMX_VC_4x4_DIAG_DL
+
+dAbove DN D0.U8
+dU7 DN D2.U8
+dU3 DN D2.U8
+dAbove0 DN D3.U8
+dAbove1 DN D4.U8
+dAbove2 DN D5.U8
+dTmp DN D6.U8
+dTmp0 DN D7.U8
+dTmp1 DN D8.U8
+dTmp2 DN D9.U8
+dTmp3 DN D10.U8
+dTmpU32 DN D6.U32
+
+
+;//OMX_VC_4x4_DIAG_DR
+dLeft DN D1.U8
+dUL DN D2.U8
+
+;//OMX_VC_4x4_VR
+dLeft0 DN D1.U8
+dLeft1 DN D2.U8
+dEven0 DN D3.U8
+dEven1 DN D4.U8
+dEven2 DN D5.U8
+dOdd0 DN D6.U8
+dOdd1 DN D11.U8
+dOdd2 DN D12.U8
+dTmp3U32 DN D10.U32
+dTmp2U32 DN D9.U32
+
+
+;//OMX_VC_4x4_HD
+dTmp1U64 DN D8.U64
+dTmp0U64 DN D7.U64
+dTmpU64 DN D6.U64
+dTmpU32 DN D6.U32
+dTmp1U32 DN D8.U32
+
+;//OMX_VC_4x4_HU
+dL3 DN D2.U8
+dLeftHU0 DN D3.U8
+dLeftHU1 DN D4.U8
+dLeftHU2 DN D5.U8
+dTmp0U32 DN D7.U32
+
+
+
+
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_4x4 starts
+;//-----------------------------------------------------------------------------------------------
+
+ ;// Write function header
+ M_START omxVCM4P10_PredictIntra_4x4, r12,d12
+
+ ;// Define stack arguments
+ M_ARG LeftStep, 4
+ M_ARG DstStep, 4
+ M_ARG PredMode, 4
+ M_ARG Availability, 4
+
+
+ LDR pTable,=armVCM4P10_pSwitchTable4x4 ;// Load index table for switch case
+
+ ;// Load argument from the stack
+ M_LDRD predMode,availability,PredMode ;// Arg predMode & availability loaded from stack to reg
+ M_LDRD leftStep,dstStep,LeftStep ;// Arg leftStep & dstStep loaded from stack to reg
+
+
+ LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case based on predMode
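+
+;// For reference, this dispatch is the asm analogue of a C jump table
+;// (a sketch; 'handler' is an illustrative name, the entries being the
+;// mode labels below in armVCM4P10_pSwitchTable4x4 order):
+;//
+;//   static void (*const handler[9])(void) = {
+;//       Vert, Hor, DC, DiagDL, DiagDR, VR, HD, VL, HU };
+;//   handler[predMode]();    // == LDR pc, [pTable, predMode, LSL #2]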
+
+
+OMX_VC_4x4_HOR
+
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+ ;// Load Left Edge
+ VLD1 {dLeftVal0[]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeftVal1[]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeftVal2[]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeftVal3[]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+
+ VST1 dLeftVal0U32[0],[pDst],dstep ;// pDst[0*dstStep+x] :0<= x <= 3
+ VST1 dLeftVal1U32[0],[pDstTmp],dstep ;// pDst[1*dstStep+x] :0<= x <= 3
+ VST1 dLeftVal2U32[0],[pDst] ;// pDst[2*dstStep+x] :0<= x <= 3
+ VST1 dLeftVal3U32[0],[pDstTmp] ;// pDst[3*dstStep+x] :0<= x <= 3
+
+ B ExitPredict4x4 ;// Branch to exit code
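+
+;// Reference: horizontal prediction replicates each left-edge sample across
+;// its row; the VLD1 {d[]} loads above perform the replication. A C sketch:
+;//
+;//   for (y = 0; y < 4; y++)
+;//       for (x = 0; x < 4; x++)
+;//           pDst[y*dstStep + x] = pSrcLeft[y*leftStep];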
+
+OMX_VC_4x4_VERT
+
+ ;// Load Upper Edge
+ VLD1 dAboveU32[0],[pSrcAbove]
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+
+DCPredict4x4VertStore
+
+ VST1 dAboveU32[0],[pDst],dstep
+ VST1 dAboveU32[0],[pDstTmp],dstep
+ VST1 dAboveU32[0],[pDst]
+ VST1 dAboveU32[0],[pDstTmp]
+
+ B ExitPredict4x4 ;// Branch to exit code
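+
+;// Reference: vertical prediction copies the four above-edge samples into
+;// every row; DC prediction reuses this store loop because dSum and
+;// dConst128U8 alias the same D0 register as dAboveU32. A C sketch:
+;//
+;//   for (y = 0; y < 4; y++)
+;//       for (x = 0; x < 4; x++)
+;//           pDst[y*dstStep + x] = pSrcAbove[x];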
+
+OMX_VC_4x4_DC
+
+
+ TST availability, #OMX_VC_LEFT
+ BEQ DCPredict4x4LeftNotAvailable
+
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+ ;// Load Left Edge
+ VLD1 {dLeftVal[0]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeftVal[1]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeftVal[2]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeftVal[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ TST availability, #OMX_VC_UPPER
+ BEQ DCPredict4x4LeftOnlyAvailable
+
+ ;// Load Upper Edge also
+ VLD1 dLeftValU32[1],[pSrcAbove] ;// pSrcAbove[0 to 3]
+ MOV return, #OMX_Sts_NoErr
+
+ VPADDL dSumAboveLeftU16, dLeftVal ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
+ VPADDL dSumAboveLeftU32, dSumAboveLeftU16 ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
+ VPADDL dSumAboveLeftU64, dSumAboveLeftU32 ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
+ VRSHR dSumAboveLeftU64,dSumAboveLeftU64,#3 ;// Sum = (Sum + 4) >> 3
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VDUP dSum,dSumAboveLeftU8[0]
+
+ B DCPredict4x4VertStore
+
+DCPredict4x4LeftOnlyAvailable
+
+ MOV return, #OMX_Sts_NoErr ;// returnNoError
+
+ VPADDL dSumLeftValU16, dLeftVal ;// [ XX | pSrcLeft[2+3 | 0+1]]
+ VPADDL dSumLeftValU32, dSumLeftValU16 ;// [ XXXX | pSrcLeft[2+3+0+1]]
+
+ VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VDUP dSum,dSumLeftValU8[0]
+
+ B DCPredict4x4VertStore
+
+DCPredict4x4LeftNotAvailable
+
+ TST availability, #OMX_VC_UPPER
+ BEQ DCPredict4x4NoneAvailable
+
+ ;// Load Upper Edge
+ VLD1 dAboveU32[0],[pSrcAbove] ;// pSrcAbove[0 to 3]
+ MOV return, #OMX_Sts_NoErr
+
+ VPADDL dSumAboveValU16, dAboveVal ;// [ XX | pSrcAbove[2+3 | 0+1]]
+ VPADDL dSumAboveValU32, dSumAboveValU16 ;// [ XXXX | pSrcAbove[2+3+0+1]]
+
+ VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VDUP dSum,dSumAboveValU8[0]
+
+ B DCPredict4x4VertStore
+
+DCPredict4x4NoneAvailable
+
+ VMOV dConst128U8,#0x80 ;// Predict with constant 128 (0x80 in every lane) when neither edge is available
+ MOV return, #OMX_Sts_NoErr
+
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ B DCPredict4x4VertStore
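+
+;// Reference: the four availability cases above reduce the usable edges to a
+;// single DC value that DCPredict4x4VertStore then replicates across the
+;// block. A C sketch ('sum' and 'dc' are illustrative names):
+;//
+;//   sum = 0;
+;//   if (availability & OMX_VC_LEFT)  for (y = 0; y < 4; y++) sum += pSrcLeft[y*leftStep];
+;//   if (availability & OMX_VC_UPPER) for (x = 0; x < 4; x++) sum += pSrcAbove[x];
+;//   if (left && upper)      dc = (sum + 4) >> 3;   // VRSHR #3
+;//   else if (left || upper) dc = (sum + 2) >> 2;   // VRSHR #2
+;//   else                    dc = 128;              // VMOV #0x80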
+
+
+
+OMX_VC_4x4_DIAG_DL
+
+ TST availability, #OMX_VC_UPPER_RIGHT
+ BEQ DiagDLUpperRightNotAvailable
+
+ VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0]
+ VDUP dU7, dAbove0[7] ;// [U7|U7|U7|U7|U7|U7|U7|U7]
+ VEXT dAbove1, dAbove0, dU7, #1 ;// [U7|U7|U6|U5|U4|U3|U2|U1]
+ VEXT dAbove2, dAbove0, dU7, #2 ;// [U7|U7|U7|U6|U5|U4|U3|U2]
+ B DiagDLPredict4x4Store
+
+DiagDLUpperRightNotAvailable
+ VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-]
+ VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3]
+
+ VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
+ VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
+ VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
+
+DiagDLPredict4x4Store
+
+ VHADD dTmp, dAbove0, dAbove2
+ VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2
+
+
+ VST1 dTmpU32[0],[pDst],dstStep
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst],dstStep
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst],dstStep
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst]
+
+ B ExitPredict4x4 ;// Branch to exit code
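+
+;// Reference: diagonal-down-left is one (a + 2*b + c + 2) >> 2 pass over the
+;// above row, realised as VHADD ((a+c)>>1) followed by rounding VRHADD with
+;// b, which is exactly equivalent to the 3-tap filter. A C sketch, with
+;// pU[] the above row padded by replicating its last sample (the dU7/dU3
+;// VDUPs above) so pU[8] is defined:
+;//
+;//   for (y = 0; y < 4; y++)
+;//       for (x = 0; x < 4; x++)
+;//           pDst[y*dstStep + x] = (pU[x+y] + 2*pU[x+y+1] + pU[x+y+2] + 2) >> 2;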
+
+
+OMX_VC_4x4_DIAG_DR
+
+
+ ;// Load U0,U1,U2,U3
+
+ VLD1 dAboveU32[0],[pSrcAbove] ;// [X|X|X|X|U3|U2|U1|U0]
+
+ ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
+ VLD1 {dLeft[7]},[pSrcAboveLeft]
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+ ADD pDst1,pDst,dstStep
+
+ VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+
+ VEXT dAbove0,dLeft,dAbove,#3 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
+ ADD pDst2,pDst1,dstStep
+ VEXT dAbove1,dLeft,dAbove,#4 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
+ ADD pDst3,pDst2,dstStep
+ VEXT dAbove2,dLeft,dAbove,#5 ;// [ X|U3|U2|U1|U0|UL|L0|L1]
+
+ VHADD dTmp, dAbove0, dAbove2
+ VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2
+
+
+ VST1 dTmpU32[0],[pDst3] ;// Store pTmp[0],[1],[2],[3] @ pDst3
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst2] ;// Store pTmp[1],[2],[3],[4] @ pDst2
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst1] ;// Store pTmp[2],[3],[4],[5] @ pDst1
+ VEXT dTmp,dTmp,dTmp,#1
+ VST1 dTmpU32[0],[pDst] ;// Store pTmp[3],[4],[5],[6] @ pDst
+
+ B ExitPredict4x4 ;// Branch to exit code
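+
+;// Reference: the VEXTs above pack [U2|U1|U0|UL|L0|L1|L2|L3] so one
+;// (a + 2*b + c + 2) >> 2 pass produces all seven distinct diagonal samples.
+;// A C sketch with pU[] = pSrcAbove, pL[y] = pSrcLeft[y*leftStep], and
+;// pU[-1] = pL[-1] = UL (the above-left sample):
+;//
+;//   pred(x,y) = (x > y) ? (pU[x-y-2] + 2*pU[x-y-1] + pU[x-y] + 2) >> 2
+;//             : (x < y) ? (pL[y-x-2] + 2*pL[y-x-1] + pL[y-x] + 2) >> 2
+;//             :           (pU[0] + 2*UL + pL[0] + 2) >> 2;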
+
+OMX_VC_4x4_VR
+
+
+ ;// Load UL,U0,U1,U2,U3
+ VLD1 dAboveU32[0],[pSrcAbove]
+ VLD1 dAbove[7],[pSrcAboveLeft] ;// [UL|X|X|X|U3|U2|U1|U0]
+
+ ;// Load L0,L1,L2 ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
+ ;// dLeft1 = [L1| X|X|X|X|X|X|X]
+ VLD1 {dLeft0[7]},[pSrcLeft],leftStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft1[7]},[pSrcLeft],leftStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft0[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+
+
+ VEXT dOdd2,dAbove,dAbove,#7 ;// [ x x x U3 U2 U1 U0 UL ]
+ VEXT dEven0,dLeft0,dOdd2,#6 ;// [ x x x U1 U0 UL L0 L2 ]
+ VEXT dEven1,dLeft1,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L1 ]
+ VEXT dEven2,dLeft0,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L0 ]
+ VEXT dOdd0,dLeft1,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L1 ]
+ VEXT dOdd1,dLeft0,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L0 ]
+
+ VHADD dTmp1, dOdd0, dOdd2
+ VRHADD dTmp1, dTmp1, dOdd1 ;// Tmp[ x x x 9 7 5 3 1 ]
+
+ VHADD dTmp0, dEven0, dEven2
+ VRHADD dTmp0, dTmp0, dEven1 ;// Tmp[ x x x 8 6 4 2 0 ]
+
+
+ VEXT dTmp3,dTmp1,dTmp1,#1 ;// Tmp[ x x x x 9 7 5 3 ]
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VEXT dTmp2,dTmp0,dTmp0,#1 ;// Tmp[ x x x x 8 6 4 2 ]
+
+
+ VST1 dTmp3U32[0],[pDst],dstep ;// Tmp[9],[7],[5],[3]
+ VST1 dTmp2U32[0],[pDstTmp],dstep ;// Tmp[8],[6],[4],[2]
+ VST1 dTmp1U32[0],[pDst],dstep ;// Tmp[7],[5],[3],[1]
+ VST1 dTmp0U32[0],[pDstTmp] ;// Tmp[6],[4],[2],[0]
+
+ B ExitPredict4x4 ;// Branch to exit code
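+
+;// Reference: vertical-right fills even-numbered positions with the 2-tap
+;// average and odd-numbered ones with the 3-tap filter, hence the separate
+;// dEven*/dOdd* vectors above. A C sketch with zVR = 2*x - y and the same
+;// pU/pL/UL naming as the DIAG_DR note (pU[-1] = pL[-1] = UL):
+;//
+;//   if (zVR >= 0 && !(zVR & 1)) p = (pU[x-(y>>1)-1] + pU[x-(y>>1)] + 1) >> 1;
+;//   else if (zVR >= 0)          p = (pU[x-(y>>1)-2] + 2*pU[x-(y>>1)-1]
+;//                                    + pU[x-(y>>1)] + 2) >> 2;
+;//   else if (zVR == -1)         p = (pL[0] + 2*UL + pU[0] + 2) >> 2;
+;//   else                        p = (pL[y-1] + 2*pL[y-2] + pL[y-3] + 2) >> 2;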
+
+OMX_VC_4x4_HD
+
+
+ ;// Load U0,U1,U2,U3
+ VLD1 dAbove,[pSrcAbove] ;//dAboveLeftVal = [U7|U6|U5|U4|U3|U2|U1|U0]
+
+ ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
+ VLD1 {dLeft[7]},[pSrcAboveLeft]
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+
+ VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ VEXT dAbove0,dLeft,dAbove,#3 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
+ VEXT dAbove1,dLeft,dAbove,#2 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]
+ VEXT dAbove2,dLeft,dAbove,#1 ;// [ U0|UL|L0|L1|L2|L3|X|X ]
+
+ VHADD dTmp0, dAbove0, dAbove2
+ VRHADD dTmp0, dTmp0, dAbove1 ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]
+
+
+ VRHADD dTmp1, dAbove1, dAbove0 ;// (a+b+1)>>1
+ VSHL dTmp1U64,dTmp1U64,#24 ;// Tmp[ 3 | 5 | 7 | 9 | X | X | X | X ]
+
+
+ VSHL dTmpU64,dTmp0U64,#16 ;// Tmp[ 2 | 4 | 6 | 8 | X | X | X | X ]
+ VZIP dTmp1,dTmp ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
+ VEXT dTmp0,dTmp0,dTmp0,#6 ;// Tmp[ X | X | X | X | X | X | 0 | 1 ]
+ VEXT dTmp1,dTmp,dTmp0,#2 ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 ]
+
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+
+ VST1 dTmp1U32[1],[pDst],dstep ;// Store pTmp[0|1|2|3]
+ VST1 dTmpU32[1],[pDstTmp],dstep ;// Store pTmp[2|3|4|5]
+ VST1 dTmp1U32[0],[pDst] ;// Store pTmp[4|5|6|7]
+ VST1 dTmpU32[0],[pDstTmp] ;// Store pTmp[6|7|8|9]
+
+ B ExitPredict4x4 ;// Branch to exit code
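+
+;// Reference: horizontal-down mirrors vertical-right about the main diagonal;
+;// the VZIP above interleaves the 2-tap and 3-tap results into row order.
+;// A C sketch with zHD = 2*y - x (same pU/pL/UL naming, pU[-1] = pL[-1] = UL):
+;//
+;//   if (zHD >= 0 && !(zHD & 1)) p = (pL[y-(x>>1)-1] + pL[y-(x>>1)] + 1) >> 1;
+;//   else if (zHD >= 0)          p = (pL[y-(x>>1)-2] + 2*pL[y-(x>>1)-1]
+;//                                    + pL[y-(x>>1)] + 2) >> 2;
+;//   else if (zHD == -1)         p = (pL[0] + 2*UL + pU[0] + 2) >> 2;
+;//   else                        p = (pU[x-1] + 2*pU[x-2] + pU[x-3] + 2) >> 2;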
+
+OMX_VC_4x4_VL
+
+
+ TST availability, #OMX_VC_UPPER_RIGHT
+ BEQ DiagVLUpperRightNotAvailable
+
+ VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0]
+ VEXT dAbove1,dAbove0,dAbove0,#1 ;// [ X|U7|U6|U5|U4|U3|U2|U1]
+ VEXT dAbove2,dAbove1,dAbove1,#1 ;// [ X| X|U7|U6|U5|U4|U3|U2]
+
+ B DiagVLPredict4x4Store
+
+DiagVLUpperRightNotAvailable
+ VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-]
+ VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3]
+
+ VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
+ VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
+ VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
+
+DiagVLPredict4x4Store
+
+ VRHADD dTmp0, dAbove1, dAbove0 ;// (a+b+1)>>1
+ ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]
+
+ VHADD dTmp3, dAbove0, dAbove2
+ VRHADD dTmp3, dTmp3, dAbove1 ;// (a+2*b+c+2)>>2
+ ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
+
+ VEXT dTmp1,dTmp0,dTmp0,#1 ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
+ ADD pDstTmp, pDst, dstStep
+ ADD dstep, dstStep, dstStep
+ VEXT dTmp2,dTmp3,dTmp1,#1 ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]
+
+ VST1 dTmp0U32[0],[pDst],dstep ;// Tmp[6],[4],[2],[0]
+ VST1 dTmp3U32[0],[pDstTmp],dstep ;// Tmp[7],[5],[3],[1]
+ VST1 dTmp1U32[0],[pDst] ;// Tmp[8],[6],[4],[2]
+ VST1 dTmp2U32[0],[pDstTmp] ;// Tmp[9],[7],[5],[3]
+
+ B ExitPredict4x4 ;// Branch to exit code
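+
+;// Reference: vertical-left reads only the above row, padded by replicating
+;// U3 when the upper-right block is unavailable (the fall-through above).
+;// Even rows take the 2-tap average, odd rows the 3-tap filter. A C sketch
+;// with pU[] as that (possibly padded) row:
+;//
+;//   if ((y & 1) == 0) p = (pU[x+(y>>1)] + pU[x+(y>>1)+1] + 1) >> 1;
+;//   else              p = (pU[x+(y>>1)] + 2*pU[x+(y>>1)+1]
+;//                          + pU[x+(y>>1)+2] + 2) >> 2;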
+
+OMX_VC_4x4_HU
+ ADD pSrcTmp, pSrcLeft, leftStep
+ ADD srcStep, leftStep, leftStep
+
+ ;// Load Left Edge ;// [L3|L2|L1|L0|X|X|X|X]
+ VLD1 {dLeft[4]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep]
+ VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep]
+ VLD1 {dLeft[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep]
+ VLD1 {dLeft[7]},[pSrcTmp] ;// pSrcLeft[3*leftStep]
+
+ VDUP dL3,dLeft[7] ;// [L3|L3|L3|L3|L3|L3|L3|L3]
+
+ VEXT dLeftHU0,dLeft,dL3,#4 ;// [L3|L3|L3|L3|L3|L2|L1|L0]
+ VEXT dLeftHU1,dLeft,dL3,#5 ;// [L3|L3|L3|L3|L3|L3|L2|L1]
+ VEXT dLeftHU2,dLeft,dL3,#6 ;// [L3|L3|L3|L3|L3|L3|L3|L2]
+
+ VHADD dTmp0, dLeftHU0, dLeftHU2
+ VRHADD dTmp0, dTmp0, dLeftHU1 ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]
+
+ VRHADD dTmp1, dLeftHU1, dLeftHU0 ;// (a+b+1)>>1
+ ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
+
+ VZIP dTmp1,dTmp0 ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
+ ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]
+
+
+ VST1 dTmp1U32[0],[pDst],dstStep ;// [3|2|1|0]
+ VEXT dTmp1,dTmp1,dTmp1,#2
+ VST1 dTmp1U32[0],[pDst],dstStep ;// [5|4|3|2]
+ VEXT dTmp1,dTmp1,dTmp1,#2
+ VST1 dTmp1U32[0],[pDst],dstStep ;// [7|6|5|4]
+ VST1 dTmp0U32[0],[pDst] ;// [9|8|7|6]
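+
+;// Reference: horizontal-up reads only the left column and clamps to L3 once
+;// the filter runs off its end, which the dL3 replication above provides.
+;// A C sketch with zHU = x + 2*y and pL[y] = pSrcLeft[y*leftStep]:
+;//
+;//   if (zHU > 5)       p = pL[3];
+;//   else if (zHU == 5) p = (pL[2] + 3*pL[3] + 2) >> 2;
+;//   else if (zHU & 1)  p = (pL[y+(x>>1)] + 2*pL[y+(x>>1)+1]
+;//                           + pL[y+(x>>1)+2] + 2) >> 2;
+;//   else               p = (pL[y+(x>>1)] + pL[y+(x>>1)+1] + 1) >> 1;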
+
+
+ExitPredict4x4
+
+ MOV return, #OMX_Sts_NoErr
+ M_END
+
+ ENDIF ;// CortexA8
+
+ END
+;//-----------------------------------------------------------------------------------------------
+;// omxVCM4P10_PredictIntra_4x4 ends
+;//-----------------------------------------------------------------------------------------------