diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s')
-rwxr-xr-x | media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s | 273 |
1 files changed, 273 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s new file mode 100755 index 0000000..de243d4 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s @@ -0,0 +1,273 @@ +; Copyright (C) 2009 The Android Open Source Project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +;------------------------------------------------------------------------------- +;-- +;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function +;-- +;------------------------------------------------------------------------------- + + + IF :DEF: H264DEC_WINASM + ;// We dont use REQUIRE8 and PRESERVE8 for winasm + ELSE + REQUIRE8 + PRESERVE8 + ENDIF + + AREA |.text|, CODE + +;// h264bsdInterpolateHorQuarter register allocation + +ref RN 0 + +mb RN 1 +buff RN 1 + +count RN 2 +x0 RN 2 + +y0 RN 3 +x_2_0 RN 3 + +width RN 4 +x_3_1 RN 4 + +height RN 5 +x_6_4 RN 5 + +partW RN 6 +x_7_5 RN 6 + +partH RN 7 +tmp1 RN 7 + +tmp2 RN 8 + +tmp3 RN 9 + +tmp4 RN 10 + +mult_20_01 RN 11 + +mult_20_m5 RN 12 + +plus16 RN 14 + + +;// function exports and imports + + IMPORT h264bsdFillBlock + + EXPORT h264bsdInterpolateHorQuarter + + +;// Horizontal filter approach +;// +;// Basic idea in horizontal filtering is to adjust coefficients +;// like below. Calculation is done with 16-bit maths. +;// +;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0 +;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ... +;// y_0 = 20 1 20 -5 -5 1 +;// y_1 = -5 20 1 1 20 -5 +;// y_2 = 1 -5 -5 20 1 20 +;// y_3 = 1 20 -5 -5 20 1 + + +h264bsdInterpolateHorQuarter + STMFD sp!, {r0-r11, lr} + SUB sp, sp, #0x1e4 + + CMP x0, #0 + BLT do_fill ;// (x0 < 0) + LDR partW, [sp,#0x220] ;// partWidth + ADD tmp4, x0, partW ;// (x0+partWidth) + ADD tmp4, tmp4, #5 ;// (y0+partW+5) + LDR width, [sp,#0x218] ;// width + CMP tmp4, width + BHI do_fill ;// (x0+partW)>width + + CMP y0, #0 + BLT do_fill ;// (y0 < 0) + LDR partH, [sp,#0x224] ;// partHeight + ADD tmp2, y0, partH ;// (y0+partHeight) + LDR height, [sp,#0x21c] ;// height + CMP tmp2, height + BLS skip_fill ;// no overfill needed + + +do_fill + LDR partH, [sp,#0x224] ;// partHeight + LDR height, [sp,#0x21c] ;// height + LDR partW, [sp,#0x220] ;// partWidth + ADD tmp4, partW, #5 ;// tmp4 = partW + 5; + STMIB sp, {height, tmp4} ;// sp+4 = height, sp+8 = partWidth+5 + STR partH, [sp,#0xc] ;// sp+c = partHeight + STR tmp4, [sp,#0x10] ;// sp+10 = partWidth+5 + LDR width, [sp,#0x218] ;// width + STR width, [sp,#0] ;// sp+0 = width + ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1] + BL h264bsdFillBlock + + MOV x0, #0 + STR x0,[sp,#0x1ec] ;// x0 = 0 + STR x0,[sp,#0x1f0] ;// y0 = 0 + ADD ref,sp,#0x28 ;// ref = p1 + STR tmp4, [sp,#0x218] ;// width = partWidth+5 + + +skip_fill + LDR x0 ,[sp,#0x1ec] ;// x0 + LDR y0 ,[sp,#0x1f0] ;// y0 + LDR width, [sp,#0x218] ;// width + MLA tmp2, width, y0, x0 ;// y0*width+x0 + ADD ref, ref, tmp2 ;// ref += y0*width+x0 + ADD ref, ref, #8 ;// ref = ref+8 + LDR mb, [sp, #0x1e8] ;// mb + + ;// pack values to count register + ;// [31:28] loop_x (partWidth-1) + ;// [27:24] loop_y (partHeight-1) + ;// [23:20] partWidth-1 + ;// [19:16] partHeight-1 + ;// [15:00] width + MOV count, width + SUB partW, partW, #1; + SUB partH, partH, #1; + ADD tmp2, partH, partW, LSL #4 + ADD count, count, tmp2, LSL #16 + + + LDR mult_20_01, = 0x00140001 + LDR mult_20_m5, = 0x0014FFFB + MOV plus16, #16 + AND tmp1, count, #0x000F0000 ;// partHeight-1 + AND tmp3, count, #0x00F00000 ;// partWidth-1 + ADD count, count, tmp1, LSL #8 +loop_y + LDR x_3_1, [ref, #-8] + ADD count, count, tmp3, LSL #8 + LDR x_7_5, [ref, #-4] + UXTB16 x_2_0, x_3_1 + UXTB16 x_3_1, x_3_1, ROR #8 + UXTB16 x_6_4, x_7_5 + +loop_x + UXTB16 x_7_5, x_7_5, ROR #8 + + SMLAD tmp1, x_2_0, mult_20_01, plus16 + SMLATB tmp3, x_2_0, mult_20_01, plus16 + SMLATB tmp2, x_2_0, mult_20_m5, plus16 + SMLATB tmp4, x_3_1, mult_20_01, plus16 + + SMLAD tmp1, x_3_1, mult_20_m5, tmp1 + SMLATB tmp3, x_3_1, mult_20_m5, tmp3 + SMLAD tmp2, x_3_1, mult_20_01, tmp2 + LDR x_3_1, [ref], #4 + SMLAD tmp4, x_6_4, mult_20_m5, tmp4 + + SMLABB tmp1, x_6_4, mult_20_m5, tmp1 + SMLADX tmp3, x_6_4, mult_20_m5, tmp3 + SMLADX tmp2, x_6_4, mult_20_01, tmp2 + SMLADX tmp4, x_7_5, mult_20_m5, tmp4 + + SMLABB tmp1, x_7_5, mult_20_01, tmp1 + UXTB16 x_2_0, x_3_1 + SMLABB tmp2, x_7_5, mult_20_m5, tmp2 + SMLADX tmp3, x_7_5, mult_20_01, tmp3 + SMLABB tmp4, x_2_0, mult_20_01, tmp4 + + MOV tmp2, tmp2, ASR #5 + MOV tmp1, tmp1, ASR #5 + PKHBT tmp2, tmp2, tmp4, LSL #(16-5) + PKHBT tmp1, tmp1, tmp3, LSL #(16-5) + LDR tmp4, [sp, #0x228] + USAT16 tmp2, #8, tmp2 + USAT16 tmp1, #8, tmp1 + SUB tmp4, tmp4, #10 + + SUBS count, count, #4<<28 + LDR tmp3, [ref, tmp4] + ORR tmp1, tmp1, tmp2, LSL #8 + +;// quarter pel position + LDR tmp2, = 0x80808080 + MVN tmp3, tmp3 + UHSUB8 tmp1, tmp1, tmp3 + EOR tmp1, tmp1, tmp2 + STR tmp1, [mb], #4 + + BCC next_y + + UXTB16 x_3_1, x_3_1, ROR #8 + + SMLAD tmp1, x_6_4, mult_20_01, plus16 + SMLATB tmp3, x_6_4, mult_20_01, plus16 + SMLATB tmp2, x_6_4, mult_20_m5, plus16 + SMLATB tmp4, x_7_5, mult_20_01, plus16 + + SMLAD tmp1, x_7_5, mult_20_m5, tmp1 + SMLATB tmp3, x_7_5, mult_20_m5, tmp3 + SMLAD tmp2, x_7_5, mult_20_01, tmp2 + LDR x_7_5, [ref], #4 + SMLAD tmp4, x_2_0, mult_20_m5, tmp4 + + SMLABB tmp1, x_2_0, mult_20_m5, tmp1 + SMLADX tmp3, x_2_0, mult_20_m5, tmp3 + SMLADX tmp2, x_2_0, mult_20_01, tmp2 + SMLADX tmp4, x_3_1, mult_20_m5, tmp4 + + SMLABB tmp1, x_3_1, mult_20_01, tmp1 + UXTB16 x_6_4, x_7_5 + SMLABB tmp2, x_3_1, mult_20_m5, tmp2 + SMLADX tmp3, x_3_1, mult_20_01, tmp3 + SMLABB tmp4, x_6_4, mult_20_01, tmp4 + + MOV tmp2, tmp2, ASR #5 + MOV tmp1, tmp1, ASR #5 + PKHBT tmp2, tmp2, tmp4, LSL #(16-5) + PKHBT tmp1, tmp1, tmp3, LSL #(16-5) + LDR tmp4, [sp, #0x228] + USAT16 tmp2, #8, tmp2 + USAT16 tmp1, #8, tmp1 + SUB tmp4, tmp4, #10 + + SUBS count, count, #4<<28 + LDR tmp3, [ref, tmp4] + ORR tmp1, tmp1, tmp2, LSL #8 + +;// quarter pel + LDR tmp2, = 0x80808080 + MVN tmp3, tmp3 + UHSUB8 tmp1, tmp1, tmp3 + EOR tmp1, tmp1, tmp2 + + STR tmp1, [mb], #4 + BCS loop_x + +next_y + AND tmp3, count, #0x00F00000 ;// partWidth-1 + SMLABB ref, count, mult_20_01, ref ;// +width + ADDS mb, mb, #16 ;// +16, Carry=0 + SBC mb, mb, tmp3, LSR #20 ;// -(partWidth-1)-1 + SBC ref, ref, tmp3, LSR #20 ;// -(partWidth-1)-1 + ADDS count, count, #(1<<28)-(1<<24) + BGE loop_y + + ADD sp,sp,#0x1f4 + LDMFD sp!, {r4-r11, pc} + + END + |