diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s')
-rwxr-xr-x | media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s | 531 |
1 files changed, 531 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s new file mode 100755 index 0000000..39eb8a4 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s @@ -0,0 +1,531 @@ +;// +;// +;// File Name: omxVCM4P10_PredictIntra_4x4_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + +;// Define the processor variants supported by this file + + M_VARIANTS CortexA8 + +;//------------------------------------------------------- +;// This table for implementing switch case of C in asm by +;// the mehtod of two levels of indexing. +;//------------------------------------------------------- + + M_TABLE armVCM4P10_pSwitchTable4x4 + DCD OMX_VC_4x4_VERT, OMX_VC_4x4_HOR + DCD OMX_VC_4x4_DC, OMX_VC_4x4_DIAG_DL + DCD OMX_VC_4x4_DIAG_DR, OMX_VC_4x4_VR + DCD OMX_VC_4x4_HD, OMX_VC_4x4_VL + DCD OMX_VC_4x4_HU + + + IF CortexA8 + +;//-------------------------------------------- +;// Scratch variable +;//-------------------------------------------- +return RN 0 +pTable RN 8 +pc RN 15 + +;//-------------------------------------------- +;// Declare input registers +;//-------------------------------------------- +pSrcLeft RN 0 ;// input pointer +pSrcAbove RN 1 ;// input pointer +pSrcAboveLeft RN 2 ;// input pointer +pDst RN 3 ;// output pointer +leftStep RN 4 ;// input variable +dstStep RN 5 ;// input variable +predMode RN 6 ;// input variable +availability RN 7 ;// input variable +pDst1 RN 1 +pDst2 RN 4 +pDst3 RN 6 + +pSrcTmp RN 9 +srcStep RN 10 +pDstTmp RN 11 +dstep RN 12 + +;//------------------- +;// Neon registers +;//------------------- + +;// OMX_VC_CHROMA_VERT +dAboveU32 DN D0.U32 + +;// OMX_VC_CHROMA_HOR +dLeftVal0 DN D0.8 +dLeftVal1 DN D1.8 +dLeftVal2 DN D2.8 +dLeftVal3 DN D3.8 +dLeftVal0U32 DN D0.U32 +dLeftVal1U32 DN D1.U32 +dLeftVal2U32 DN D2.U32 +dLeftVal3U32 DN D3.U32 + +;// OMX_VC_4x4_DC +dLeftVal DN D0.U8 +dLeftValU32 DN D0.U32 +dSumAboveLeftU16 DN D1.U16 +dSumAboveLeftU32 DN D1.U32 +dSumAboveLeftU64 DN D1.U64 +dSumAboveLeftU8 DN D1.U8 +dSum DN D0.U8 + +dSumLeftValU16 DN D1.U16 +dSumLeftValU32 DN D1.U32 +dSumLeftValU64 DN D1.U64 +dSumLeftValU8 DN D1.U8 + +dAboveVal DN D0.U8 +dSumAboveValU16 DN D1.U16 +dSumAboveValU32 DN D1.U32 +dSumAboveValU64 DN D1.U64 +dSumAboveValU8 DN D1.U8 +dConst128U8 DN D0.U8 + + +;//OMX_VC_4x4_DIAG_DL + +dAbove DN D0.U8 +dU7 DN D2.U8 +dU3 DN D2.U8 +dAbove0 DN D3.U8 +dAbove1 DN D4.U8 +dAbove2 DN D5.U8 +dTmp DN D6.U8 +dTmp0 DN D7.U8 +dTmp1 DN D8.U8 +dTmp2 DN D9.U8 +dTmp3 DN D10.U8 +dTmpU32 DN D6.U32 + + +;//OMX_VC_4x4_DIAG_DR +dLeft DN D1.U8 +dUL DN D2.U8 + +;//OMX_VC_4x4_VR +dLeft0 DN D1.U8 +dLeft1 DN D2.U8 +dEven0 DN D3.U8 +dEven1 DN D4.U8 +dEven2 DN D5.U8 +dOdd0 DN D6.U8 +dOdd1 DN D11.U8 +dOdd2 DN D12.U8 +dTmp3U32 DN D10.U32 +dTmp2U32 DN D9.U32 + + +;//OMX_VC_4x4_HD +dTmp1U64 DN D8.U64 +dTmp0U64 DN D7.U64 +dTmpU64 DN D6.U64 +dTmpU32 DN D6.U32 +dTmp1U32 DN D8.U32 + +;//OMX_VC_4x4_HU +dL3 DN D2.U8 +dLeftHU0 DN D3.U8 +dLeftHU1 DN D4.U8 +dLeftHU2 DN D5.U8 +dTmp0U32 DN D7.U32 + + + + +;//----------------------------------------------------------------------------------------------- +;// omxVCM4P10_PredictIntra_4x4 starts +;//----------------------------------------------------------------------------------------------- + + ;// Write function header + M_START omxVCM4P10_PredictIntra_4x4, r12,d12 + + ;// Define stack arguments + M_ARG LeftStep, 4 + M_ARG DstStep, 4 + M_ARG PredMode, 4 + M_ARG Availability, 4 + + + LDR pTable,=armVCM4P10_pSwitchTable4x4 ;// Load index table for switch case + + ;// Load argument from the stack + M_LDRD predMode,availability,PredMode ;// Arg predMode & availability loaded from stack to reg + M_LDRD leftStep,dstStep,LeftStep ;// Arg leftStep & dstStep loaded from stack to reg + + + LDR pc, [pTable, predMode, LSL #2] ;// Branch to the case based on preMode + + +OMX_VC_4x4_HOR + + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + ;// Load Left Edge + VLD1 {dLeftVal0[]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeftVal1[]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeftVal2[]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeftVal3[]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + + VST1 dLeftVal0U32[0],[pDst],dstep ;// pDst[0*dstStep+x] :0<= x <= 7 + VST1 dLeftVal1U32[0],[pDstTmp],dstep ;// pDst[1*dstStep+x] :0<= x <= 7 + VST1 dLeftVal2U32[0],[pDst] ;// pDst[2*dstStep+x] :0<= x <= 7 + VST1 dLeftVal3U32[0],[pDstTmp] ;// pDst[3*dstStep+x] :0<= x <= 7 + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_VERT + + ;// Load Upper Edge + VLD1 dAboveU32[0],[pSrcAbove] + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + +DCPredict4x4VertStore + + VST1 dAboveU32[0],[pDst],dstep + VST1 dAboveU32[0],[pDstTmp],dstep + VST1 dAboveU32[0],[pDst] + VST1 dAboveU32[0],[pDstTmp] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_DC + + + TST availability, #OMX_VC_LEFT + BEQ DCPredict4x4LeftNotAvailable + + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + ;// Load Left Edge + VLD1 {dLeftVal[0]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeftVal[1]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeftVal[2]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeftVal[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + TST availability, #OMX_VC_UPPER + BEQ DCPredict4x4LeftOnlyAvailable + + ;// Load Upper Edge also + VLD1 dLeftValU32[1],[pSrcAbove] ;// pSrcAbove[0 to 3] + MOV return, #OMX_Sts_NoErr + + VPADDL dSumAboveLeftU16, dLeftVal ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]] + VPADDL dSumAboveLeftU32, dSumAboveLeftU16 ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]] + VPADDL dSumAboveLeftU64, dSumAboveLeftU32 ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]] + VRSHR dSumAboveLeftU64,dSumAboveLeftU64,#3 ;// Sum = (Sum + 4) >> 3 + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VDUP dSum,dSumAboveLeftU8[0] + + B DCPredict4x4VertStore + +DCPredict4x4LeftOnlyAvailable + + MOV return, #OMX_Sts_NoErr ;// returnNoError + + VPADDL dSumLeftValU16, dLeftVal ;// [ XX | pSrcLeft[2+3 | 0+1]] + VPADDL dSumLeftValU32, dSumLeftValU16 ;// [ XXXX | pSrcLeft[2+3+0+1]] + + VRSHR dSumLeftValU32,dSumLeftValU32,#2 ;// Sum = (Sum + 2) >> 2 + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VDUP dSum,dSumLeftValU8[0] + + B DCPredict4x4VertStore + +DCPredict4x4LeftNotAvailable + + TST availability, #OMX_VC_UPPER + BEQ DCPredict4x4NoneAvailable + + ;// Load Upper Edge + VLD1 dAboveU32[0],[pSrcAbove] ;// pSrcAbove[0 to 3] + MOV return, #OMX_Sts_NoErr + + VPADDL dSumAboveValU16, dAboveVal ;// [ XX | pSrcAbove[2+3 | 0+1]] + VPADDL dSumAboveValU32, dSumAboveValU16 ;// [ XXXX | pSrcAbove[2+3+0+1]] + + VRSHR dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2 + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VDUP dSum,dSumAboveValU8[0] + + B DCPredict4x4VertStore + +DCPredict4x4NoneAvailable + + VMOV dConst128U8,#0x80 ;// 0x8080808080808080 if(count == 0) + MOV return, #OMX_Sts_NoErr + + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + B DCPredict4x4VertStore + + + +OMX_VC_4x4_DIAG_DL + + TST availability, #OMX_VC_UPPER_RIGHT + BEQ DiagDLUpperRightNotAvailable + + VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0] + VDUP dU7, dAbove0[7] ;// [U7|U7|U7|U7|U7|U7|U7|U7] + VEXT dAbove1, dAbove0, dU7, #1 ;// [U7|U7|U6|U5|U4|U3|U2|U1] + VEXT dAbove2, dAbove0, dU7, #2 ;// [U7|U7|U7|U6|U5|U4|U3|U2] + B DiagDLPredict4x4Store + +DiagDLUpperRightNotAvailable + VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-] + VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3] + + VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0] + VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1] + VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2] + +DiagDLPredict4x4Store + + VHADD dTmp, dAbove0, dAbove2 + VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2 + + + VST1 dTmpU32[0],[pDst],dstStep + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst],dstStep + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst],dstStep + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst] + + B ExitPredict4x4 ;// Branch to exit code + + +OMX_VC_4x4_DIAG_DR + + + ;// Load U0,U1,U2,U3 + + VLD1 dAboveU32[0],[pSrcAbove] ;// [X|X|X|X|U3|U2|U1|U0] + + ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X] + VLD1 {dLeft[7]},[pSrcAboveLeft] + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + ADD pDst1,pDst,dstStep + + VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + + VEXT dAbove0,dLeft,dAbove,#3 ;// [U2|U1|U0|UL|L0|L1|L2|L3] + ADD pDst2,pDst1,dstStep + VEXT dAbove1,dLeft,dAbove,#4 ;// [U3|U2|U1|U0|UL|L0|L1|L2] + ADD pDst3,pDst2,dstStep + VEXT dAbove2,dLeft,dAbove,#5 ;// [ X|U3|U2|U1|U0|UL|L0|L1] + + VHADD dTmp, dAbove0, dAbove2 + VRHADD dTmp, dTmp, dAbove1 ;// (a+2*b+c+2)>>2 + + + VST1 dTmpU32[0],[pDst3] ;// Store pTmp[0],[1],[2],[3] @ pDst3 + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst2] ;// Store pTmp[1],[2],[3],[4] @ pDst2 + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst1] ;// Store pTmp[2],[3],[4],[5] @ pDst1 + VEXT dTmp,dTmp,dTmp,#1 + VST1 dTmpU32[0],[pDst] ;// Store pTmp[3],[4],[5],[6] @ pDst + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_VR + + + ;// Load UL,U0,U1,U2,U3 + VLD1 dAboveU32[0],[pSrcAbove] + VLD1 dAbove[7],[pSrcAboveLeft] ;// [UL|X|X|X|U3|U2|U1|U0] + + ;// Load L0,L1,L2 ;// dLeft0 = [L0|L2|X|X|X|X|X|X] + ;// dLeft1 = [L1| X|X|X|X|X|X|X] + VLD1 {dLeft0[7]},[pSrcLeft],leftStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeft1[7]},[pSrcLeft],leftStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft0[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + + + VEXT dOdd2,dAbove,dAbove,#7 ;// [ x x x U3 U2 U1 U0 UL ] + VEXT dEven0,dLeft0,dOdd2,#6 ;// [ x x x U1 U0 UL L0 L2 ] + VEXT dEven1,dLeft1,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L1 ] + VEXT dEven2,dLeft0,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L0 ] + VEXT dOdd0,dLeft1,dAbove,#7 ;// [ x x x U3 U2 U1 U0 L1 ] + VEXT dOdd1,dLeft0,dOdd2,#7 ;// [ x x x U2 U1 U0 UL L0 ] + + VHADD dTmp1, dOdd0, dOdd2 + VRHADD dTmp1, dTmp1, dOdd1 ;// Tmp[ x x x 9 7 5 3 1 ] + + VHADD dTmp0, dEven0, dEven2 + VRHADD dTmp0, dTmp0, dEven1 ;// Tmp[ x x x 8 6 4 2 0 ] + + + VEXT dTmp3,dTmp1,dTmp1,#1 ;// Tmp[ x x x x 9 7 5 3 ] + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VEXT dTmp2,dTmp0,dTmp0,#1 ;// Tmp[ x x x x 8 6 4 2 ] + + + VST1 dTmp3U32[0],[pDst],dstep ;// Tmp[9],[7],[5],[3] + VST1 dTmp2U32[0],[pDstTmp],dstep ;// Tmp[8],[6],[4],[2] + VST1 dTmp1U32[0],[pDst],dstep ;// Tmp[7],[5],[3],[1] + VST1 dTmp0U32[0],[pDstTmp] ;// Tmp[6],[4],[2],[0] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_HD + + + ;// Load U0,U1,U2,U3 + VLD1 dAbove,[pSrcAbove] ;//dAboveLeftVal = [U7|U6|U5|U4|U3|U2|U1|U0] + + ;// Load UL,L0,L1,L2,L3 ;// dLeft = [UL|L0|L1|L2|L3|X|X|X] + VLD1 {dLeft[7]},[pSrcAboveLeft] + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + + VLD1 {dLeft[6]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft[4]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeft[3]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + VEXT dAbove0,dLeft,dAbove,#3 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ] + VEXT dAbove1,dLeft,dAbove,#2 ;// [ U1|U0|UL|L0|L1|L2|L3|X ] + VEXT dAbove2,dLeft,dAbove,#1 ;// [ U0|UL|L0|L1|L2|L3|X|X ] + + VHADD dTmp0, dAbove0, dAbove2 + VRHADD dTmp0, dTmp0, dAbove1 ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ] + + + VRHADD dTmp1, dAbove1, dAbove0 ;// (a+b+1)>>1 + VSHL dTmp1U64,dTmp1U64,#24 ;// Tmp[ 3|5| 7 |9 | X | X | X | X ] + + + VSHL dTmpU64,dTmp0U64,#16 ;// Tmp[ 2|4|6|8| X | X | X | X ] + VZIP dTmp1,dTmp ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ] + VEXT dTmp0,dTmp0,dTmp0,#6 ;// Tmp[ X| X| X| X| X| X| 0 | 1 ] + VEXT dTmp1,dTmp,dTmp0,#2 ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ] + + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + + VST1 dTmp1U32[1],[pDst],dstep ;// Store pTmp[0|1|2|3] + VST1 dTmpU32[1],[pDstTmp],dstep ;// Store pTmp[2|3|4|5] + VST1 dTmp1U32[0],[pDst] ;// Store pTmp[4|5|6|7] + VST1 dTmpU32[0],[pDstTmp] ;// Store pTmp[6|7|8|9] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_VL + + + TST availability, #OMX_VC_UPPER_RIGHT + BEQ DiagVLUpperRightNotAvailable + + VLD1 dAbove0,[pSrcAbove] ;// [U7|U6|U5|U4|U3|U2|U1|U0] + VEXT dAbove1,dAbove0,dAbove0,#1 ;// [ X|U7|U6|U5|U4|U3|U2|U1] + VEXT dAbove2,dAbove1,dAbove1,#1 ;// [ X| X|U7|U6|U5|U4|U3|U2] + + B DiagVLPredict4x4Store + +DiagVLUpperRightNotAvailable + VLD1 dAboveU32[1],[pSrcAbove] ;// [U3|U2|U1|U0|-|-|-|-] + VDUP dU3, dAbove[7] ;// [U3 U3 U3 U3 U3 U3 U3 U3] + + VEXT dAbove0, dAbove, dU3, #4 ;// [U3 U3 U3 U3 U3 U2 U1 U0] + VEXT dAbove1, dAbove, dU3, #5 ;// [U3 U3 U3 U3 U3 U3 U2 U1] + VEXT dAbove2, dAbove, dU3, #6 ;// [U3 U3 U3 U3 U3 U3 U3 U2] + +DiagVLPredict4x4Store + + VRHADD dTmp0, dAbove1, dAbove0 ;// (a+b+1)>>1 + ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ] + + VHADD dTmp3, dAbove0, dAbove2 + VRHADD dTmp3, dTmp3, dAbove1 ;// (a+2*b+c+2)>>2 + ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ] + + VEXT dTmp1,dTmp0,dTmp0,#1 ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ] + ADD pDstTmp, pDst, dstStep + ADD dstep, dstStep, dstStep + VEXT dTmp2,dTmp3,dTmp1,#1 ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ] + + VST1 dTmp0U32[0],[pDst],dstep ;// Tmp[6],[4],[2],[0] + VST1 dTmp3U32[0],[pDstTmp],dstep ;// Tmp[7],[5],[3],[1] + VST1 dTmp1U32[0],[pDst] ;// Tmp[8],[6],[4],[2] + VST1 dTmp2U32[0],[pDstTmp] ;// Tmp[9],[7],[5],[3] + + B ExitPredict4x4 ;// Branch to exit code + +OMX_VC_4x4_HU + ADD pSrcTmp, pSrcLeft, leftStep + ADD srcStep, leftStep, leftStep + + ;// Load Left Edge ;// [L3|L2|L1|L0|X|X|X|X] + VLD1 {dLeft[4]},[pSrcLeft],srcStep ;// pSrcLeft[0*leftStep] + VLD1 {dLeft[5]},[pSrcTmp],srcStep ;// pSrcLeft[1*leftStep] + VLD1 {dLeft[6]},[pSrcLeft] ;// pSrcLeft[2*leftStep] + VLD1 {dLeft[7]},[pSrcTmp] ;// pSrcLeft[3*leftStep] + + VDUP dL3,dLeft[7] ;// [L3|L3|L3|L3|L3|L3|L3|L3] + + VEXT dLeftHU0,dLeft,dL3,#4 ;// [L3|L3|L3|L3|L3|L2|L1|L0] + VEXT dLeftHU1,dLeft,dL3,#5 ;// [L3|L3|L3|L3|L3|L3|L2|L1] + VEXT dLeftHU2,dLeft,dL3,#6 ;// [L3|L3|L3|L3|L3|L3|L3|L2] + + VHADD dTmp0, dLeftHU0, dLeftHU2 + VRHADD dTmp0, dTmp0, dLeftHU1 ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ] + + VRHADD dTmp1, dLeftHU1, dLeftHU0 ;// (a+b+1)>>1 + ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ] + + VZIP dTmp1,dTmp0 ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0] + ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3] + + + VST1 dTmp1U32[0],[pDst],dstStep ;// [3|2|1|0] + VEXT dTmp1,dTmp1,dTmp1,#2 + VST1 dTmp1U32[0],[pDst],dstStep ;// [5|4|3|2] + VEXT dTmp1,dTmp1,dTmp1,#2 + VST1 dTmp1U32[0],[pDst],dstStep ;// [7|6|5|4] + VST1 dTmp0U32[0],[pDst] ;// [9|8|7|6] + + +ExitPredict4x4 + + MOV return, #OMX_Sts_NoErr + M_END + + ENDIF ;// CortexA8 + + END +;//----------------------------------------------------------------------------------------------- +;// omxVCM4P10_PredictIntra_4x4 ends +;//----------------------------------------------------------------------------------------------- |