diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s')
-rwxr-xr-x | media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s new file mode 100755 index 0000000..0c3f4f2 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s @@ -0,0 +1,288 @@ +;// +;// +;// File Name: omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 12290 +;// Date: Wednesday, April 9, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + + M_VARIANTS CortexA8 + + IMPORT armVCM4P10_DeblockingLumabSLT4_unsafe + IMPORT armVCM4P10_DeblockingLumabSGE4_unsafe + + IF CortexA8 + +LOOP_COUNT EQU 0x55000000 + + +;// Function arguments + +pSrcDst RN 0 +srcdstStep RN 1 +pAlpha RN 2 +pBeta RN 3 + +pThresholds RN 5 +pBS RN 4 +bS10 RN 12 + +pAlpha_0 RN 2 +pBeta_0 RN 3 + +pAlpha_1 RN 7 +pBeta_1 RN 8 + + + +;// Loop + +XY RN 9 + +pTmp RN 6 +step RN 10 + +;// Pixels +dP_0 DN D4.U8 +dP_1 DN D5.U8 +dP_2 DN D6.U8 +dP_3 DN D7.U8 +dQ_0 DN D8.U8 +dQ_1 DN D9.U8 +dQ_2 DN D10.U8 +dQ_3 DN D11.U8 + + +;// Filtering Decision +dAlpha DN D0.U8 +dBeta DN D2.U8 + +dFilt DN D16.U8 +dAqflg DN D12.U8 +dApflg DN D17.U8 + +dAp0q0 DN D13.U8 +dAp1p0 DN D12.U8 +dAq1q0 DN D18.U8 +dAp2p0 DN D19.U8 +dAq2q0 DN D17.U8 + +;// bSLT4 +dTC0 DN D18.U8 +dTC1 DN D19.U8 +dTC01 DN D18.U8 + +dTCs DN D31.S8 +dTC DN D31.U8 + +dMask_0 DN D14.U8 +dMask_1 DN D15.U8 + +Mask_0 RN 11 + +dTemp DN D19.U8 + +;// Computing P0,Q0 +qDq0p0 QN Q10.S16 +qDp1q1 QN Q11.S16 +qDelta QN Q10.S16 ; reuse qDq0p0 +dDelta DN D20.S8 + + +;// Computing P1,Q1 +dRp0q0 DN D24.U8 + +dMaxP DN D23.U8 +dMinP DN D22.U8 + +dMaxQ DN D19.U8 +dMinQ DN D21.U8 + +dDeltaP DN D26.U8 +dDeltaQ DN D27.U8 + +qP_0n QN Q14.S16 +qQ_0n QN Q12.S16 + +dQ_0n DN D24.U8 +dQ_1n DN D25.U8 +dP_0n DN D29.U8 +dP_1n DN D30.U8 + +;// bSGE4 + +qSp0q0 QN Q10.U16 + +qSp2q1 QN Q11.U16 +qSp0q0p1 QN Q12.U16 +qSp3p2 QN Q13.U16 +dHSp0q1 DN D28.U8 + +qSq2p1 QN Q11.U16 +qSp0q0q1 QN Q12.U16 +qSq3q2 QN Q13.U16 ;!! +dHSq0p1 DN D28.U8 ;!! + +qTemp1 QN Q11.U16 ;!!;qSp2q1 +qTemp2 QN Q12.U16 ;!!;qSp0q0p1 + +dP_0t DN D28.U8 ;!!;dHSp0q1 +dQ_0t DN D22.U8 ;!!;Temp1 + +dP_0n DN D29.U8 +dP_1n DN D30.U8 +dP_2n DN D31.U8 + +dQ_0n DN D24.U8 ;!!;Temp2 +dQ_1n DN D25.U8 ;!!;Temp2 +dQ_2n DN D28.U8 ;!!;dQ_0t + + + ;// Function header + M_START omxVCM4P10_FilterDeblockingLuma_HorEdge_I, r11, d15 + + ;//Arguments on the stack + M_ARG ppThresholds, 4 + M_ARG ppBS, 4 + + ;// d0-dAlpha_0 + ;// d2-dBeta_0 + + ADD pAlpha_1, pAlpha_0, #1 + ADD pBeta_1, pBeta_0, #1 + + VLD1 {dAlpha[]}, [pAlpha_0] + SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 + VLD1 {dBeta[]}, [pBeta_0] + + M_LDR pBS, ppBS + M_LDR pThresholds, ppThresholds + + MOV Mask_0,#0 + + ;dMask_0-14 + ;dMask_1-15 + + VMOV dMask_0, #0 + VMOV dMask_1, #1 + + ADD step, srcdstStep, srcdstStep + + LDR XY,=LOOP_COUNT + + ;// p0-p3 - d4-d7 + ;// q0-q3 - d8-d11 +LoopY +LoopX + LDRH bS10, [pBS], #2 + ADD pTmp, pSrcDst, srcdstStep + CMP bS10, #0 + BEQ NoFilterBS0 + + VLD1 dP_3, [pSrcDst], step + VLD1 dP_2, [pTmp], step + VLD1 dP_1, [pSrcDst], step + VLD1 dP_0, [pTmp], step + VLD1 dQ_0, [pSrcDst], step + VABD dAp1p0, dP_0, dP_1 + VLD1 dQ_1, [pTmp] + VABD dAp0q0, dQ_0, dP_0 + VLD1 dQ_2, [pSrcDst], srcdstStep + + VABD dAq1q0, dQ_1, dQ_0 + VABD dAp2p0, dP_2, dP_0 + VCGT dFilt, dAlpha, dAp0q0 + + TST bS10, #0xff + VMAX dAp1p0, dAq1q0, dAp1p0 + VABD dAq2q0, dQ_2, dQ_0 + + VMOVEQ.U32 dFilt[0], Mask_0 + TST bS10, #0xff00 + + VCGT dAp2p0, dBeta, dAp2p0 + VCGT dAp1p0, dBeta, dAp1p0 + + VMOVEQ.U32 dFilt[1], Mask_0 + + VCGT dAq2q0, dBeta, dAq2q0 + VLD1 dQ_3, [pSrcDst] + VAND dFilt, dFilt, dAp1p0 + TST bS10, #4 + + VAND dAqflg, dFilt, dAq2q0 + VAND dApflg, dFilt, dAp2p0 + + BNE bSGE4 +bSLT4 + ;// bS < 4 Filtering + SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 + SUB pSrcDst, pSrcDst, srcdstStep + + BL armVCM4P10_DeblockingLumabSLT4_unsafe + + ;// Result Storage + VST1 dP_1n, [pSrcDst], srcdstStep + VST1 dP_0n, [pSrcDst], srcdstStep + SUB pTmp, pSrcDst, srcdstStep, LSL #2 + VST1 dQ_0n, [pSrcDst], srcdstStep + ADDS XY, XY, XY + VST1 dQ_1n, [pSrcDst] + ADD pSrcDst, pTmp, #8 + + BCC LoopX + B ExitLoopY + +NoFilterBS0 + ADD pSrcDst, pSrcDst, #8 + ADDS XY, XY, XY + ADD pThresholds, pThresholds, #2 + BCC LoopX + B ExitLoopY +bSGE4 + ;// bS >= 4 Filtering + SUB pSrcDst, pSrcDst, srcdstStep, LSL #2 + SUB pSrcDst, pSrcDst, srcdstStep, LSL #1 + BL armVCM4P10_DeblockingLumabSGE4_unsafe + + ;// Result Storage + VST1 dP_2n, [pSrcDst], srcdstStep + VST1 dP_1n, [pSrcDst], srcdstStep + VST1 dP_0n, [pSrcDst], srcdstStep + SUB pTmp, pSrcDst, srcdstStep, LSL #2 + VST1 dQ_0n, [pSrcDst], srcdstStep + ADDS XY,XY,XY + VST1 dQ_1n, [pSrcDst], srcdstStep + ADD pThresholds, pThresholds, #2 + VST1 dQ_2n, [pSrcDst] + + ADD pSrcDst, pTmp, #8 + BCC LoopX + +ExitLoopY + + SUB pSrcDst, pSrcDst, #16 + VLD1 {dAlpha[]}, [pAlpha_1] + ADD pSrcDst, pSrcDst, srcdstStep, LSL #2 + VLD1 {dBeta[]}, [pBeta_1] + BNE LoopY + + MOV r0, #OMX_Sts_NoErr + + M_END + + ENDIF + + + + + END + + |