;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        IF CortexA8

        IMPORT  armVCM4P10_DeblockingChromabSGE4_unsafe
        IMPORT  armVCM4P10_DeblockingChromabSLT4_unsafe

LOOP_COUNT  EQU 0x40000000
MASK_3      EQU 0x03030303
MASK_4      EQU 0x04040404

;// Function arguments

pSrcDst     RN 0
srcdstStep  RN 1
pAlpha      RN 2
pBeta       RN 3

pThresholds RN 5
pBS         RN 4
bS3210      RN 6

pSrcDst_P   RN 10
pSrcDst_Q   RN 12

pTmp        RN 10
pTmp2       RN 12
step        RN 14

;// Loop
XY          RN 7

;// Rows input
dRow0       DN D7.U8
dRow1       DN D8.U8
dRow2       DN D5.U8
dRow3       DN D10.U8
dRow4       DN D6.U8
dRow5       DN D9.U8
dRow6       DN D4.U8
dRow7       DN D11.U8

;// Pixels
dP_0        DN D4.U8
dP_1        DN D5.U8
dP_2        DN D6.U8
dQ_0        DN D8.U8
dQ_1        DN D9.U8
dQ_2        DN D10.U8

;// Filtering Decision
dAlpha      DN D0.U8
dBeta       DN D2.U8

dFilt       DN D16.U8
dAqflg      DN D12.U8
dApflg      DN D17.U8

dAp0q0      DN D13.U8
dAp1p0      DN D12.U8
dAq1q0      DN D18.U8
dAp2p0      DN D19.U8
dAq2q0      DN D17.U8

qBS3210     QN Q13.U16
dBS3210     DN D26
dMask_bs    DN D27
dFilt_bs    DN D26.U16

;// bSLT4
dMask_0     DN D14.U8
dMask_1     DN D15.U8
dMask_4     DN D1.U16

Mask_4      RN 8
Mask_3      RN 9

dTemp       DN D19.U8

;// Result
dP_0t       DN D13.U8
dQ_0t       DN D31.U8

dP_0n       DN D29.U8
dQ_0n       DN D24.U8

;// Function header
        M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15

        ;// Arguments on the stack
        M_ARG   ppThresholds, 4
        M_ARG   ppBS, 4

        ;// d0 - dAlpha_0
        ;// d2 - dBeta_0

        ;// Load alpha1, beta1 somewhere to avoid more loads
        VLD1    {dAlpha[]}, [pAlpha]!
        SUB     pSrcDst, pSrcDst, #4
        VLD1    {dBeta[]}, [pBeta]!
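        ;// Descriptive note (added summary; the semantics below follow the
        ;// H.264/AVC chroma deblocking rules this routine implements):
        ;// Each LoopY iteration loads eight rows around one vertical chroma
        ;// edge and transposes them so that p0..p2 and q0..q2 for the eight
        ;// rows land in dP_0..dP_2 and dQ_0..dQ_2. A column is filtered only
        ;// when its boundary strength bS > 0 and
        ;//     |p0 - q0| < alpha, |p1 - p0| < beta, |q1 - q0| < beta,
        ;// which is what the VABD/VCGT/VAND sequence builds into dFilt.
        ;// Columns with bS == 4 take armVCM4P10_DeblockingChromabSGE4_unsafe,
        ;// columns with 0 < bS < 4 take
        ;// armVCM4P10_DeblockingChromabSLT4_unsafe, and the two results are
        ;// merged per column via VTST/VBIT/VBIF before being stored back.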
        M_LDR   pBS, ppBS
        M_LDR   pThresholds, ppThresholds

        LDR     Mask_4, =MASK_4
        LDR     Mask_3, =MASK_3

        ;// dMask_0 - 14
        ;// dMask_1 - 15
        ;// dMask_4 - 19

        VMOV    dMask_0, #0
        VMOV    dMask_1, #1
        VMOV    dMask_4, #4

        LDR     XY, =LOOP_COUNT

        ;// p0-p3 - d4-d7
        ;// q0-q3 - d8-d11
LoopY
        LDR     bS3210, [pBS], #8
        ADD     pTmp, pSrcDst, srcdstStep
        ADD     step, srcdstStep, srcdstStep

        ;// 1
        VLD1    dRow0, [pSrcDst], step
        ;// 1
        VLD1    dRow1, [pTmp], step
        VLD1    dRow2, [pSrcDst], step
        VLD1    dRow3, [pTmp], step
        VLD1    dRow4, [pSrcDst], step
        VLD1    dRow5, [pTmp], step
        VLD1    dRow6, [pSrcDst], step
        VLD1    dRow7, [pTmp], step

        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]

        ;// 8x8 Transpose
        VZIP.8  dRow0, dRow1
        VZIP.8  dRow2, dRow3
        VZIP.8  dRow4, dRow5
        VZIP.8  dRow6, dRow7

        VZIP.16 dRow0, dRow2
        VZIP.16 dRow1, dRow3
        VZIP.16 dRow4, dRow6
        VZIP.16 dRow5, dRow7

        VZIP.32 dRow0, dRow4
        VZIP.32 dRow2, dRow6
        VZIP.32 dRow3, dRow7
        VZIP.32 dRow1, dRow5

        ;// Realign the pointers

        CMP     bS3210, #0
        VABD    dAp2p0, dP_2, dP_0
        VABD    dAp0q0, dP_0, dQ_0
        BEQ     NoFilterBS0
        VABD    dAp1p0, dP_1, dP_0
        VABD    dAq1q0, dQ_1, dQ_0

        VMOV.U32 dBS3210[0], bS3210
        VCGT    dFilt, dAlpha, dAp0q0
        VMAX    dAp1p0, dAq1q0, dAp1p0
        VMOVL   qBS3210, dBS3210.U8
        VABD    dAq2q0, dQ_2, dQ_0
        VCGT    dMask_bs.S16, dBS3210.S16, #0

        VCGT    dAp1p0, dBeta, dAp1p0
        VCGT    dAp2p0, dBeta, dAp2p0
        VAND    dFilt, dMask_bs.U8

        TST     bS3210, Mask_3

        VCGT    dAq2q0, dBeta, dAq2q0
        VAND    dFilt, dFilt, dAp1p0

        VAND    dAqflg, dFilt, dAq2q0
        VAND    dApflg, dFilt, dAp2p0

        ;// bS < 4 Filtering
        BLNE    armVCM4P10_DeblockingChromabSLT4_unsafe

        TST     bS3210, Mask_4

        SUB     pSrcDst, pSrcDst, srcdstStep, LSL #3
        VTST    dFilt_bs, dFilt_bs, dMask_4

        ;// bS == 4 Filtering
        BLNE    armVCM4P10_DeblockingChromabSGE4_unsafe

        VBIT    dP_0n, dP_0t, dFilt_bs
        VBIT    dQ_0n, dQ_0t, dFilt_bs

        ;// Result Storage
        ADD     pSrcDst_P, pSrcDst, #3
        VBIF    dP_0n, dP_0, dFilt

        ADD     pTmp2, pSrcDst_P, srcdstStep
        ADD     step, srcdstStep, srcdstStep
        VBIF    dQ_0n, dQ_0, dFilt

        ADDS    XY, XY, XY

        VST1    {dP_0n[0]}, [pSrcDst_P], step
        VST1    {dP_0n[1]}, [pTmp2], step
        VST1    {dP_0n[2]}, [pSrcDst_P], step
        VST1    {dP_0n[3]}, [pTmp2], step
        VST1    {dP_0n[4]}, [pSrcDst_P], step
        VST1    {dP_0n[5]}, [pTmp2], step
        VST1    {dP_0n[6]}, [pSrcDst_P], step
        VST1    {dP_0n[7]}, [pTmp2], step

        ADD     pSrcDst_Q, pSrcDst, #4
        ADD     pTmp, pSrcDst_Q, srcdstStep

        VST1    {dQ_0n[0]}, [pSrcDst_Q], step
        VST1    {dQ_0n[1]}, [pTmp], step
        VST1    {dQ_0n[2]}, [pSrcDst_Q], step
        VST1    {dQ_0n[3]}, [pTmp], step
        VST1    {dQ_0n[4]}, [pSrcDst_Q], step
        VST1    {dQ_0n[5]}, [pTmp], step
        VST1    {dQ_0n[6]}, [pSrcDst_Q], step
        VST1    {dQ_0n[7]}, [pTmp], step

        ADD     pSrcDst, pSrcDst, #4
        BNE     LoopY

        MOV     r0, #OMX_Sts_NoErr
        M_EXIT

NoFilterBS0
        VLD1    {dAlpha[]}, [pAlpha]
        ADD     pSrcDst, pSrcDst, #4
        SUB     pSrcDst, pSrcDst, srcdstStep, LSL #3
        ADDS    XY, XY, XY
        VLD1    {dBeta[]}, [pBeta]
        ADD     pThresholds, pThresholds, #4
        BNE     LoopY

        MOV     r0, #OMX_Sts_NoErr
        M_END

        ENDIF

        END