Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s')
 media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s | 273
 1 file changed, 273 insertions(+), 0 deletions(-)
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s
new file mode 100755
index 0000000..de243d4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s
@@ -0,0 +1,273 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateHorQuarter register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+y0 RN 3
+x_2_0 RN 3
+
+width RN 4
+x_3_1 RN 4
+
+height RN 5
+x_6_4 RN 5
+
+partW RN 6
+x_7_5 RN 6
+
+partH RN 7
+tmp1 RN 7
+
+tmp2 RN 8
+
+tmp3 RN 9
+
+tmp4 RN 10
+
+mult_20_01 RN 11
+
+mult_20_m5 RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateHorQuarter
+
+
+;// Horizontal filter approach
+;//
+;// The basic idea of the horizontal filter is to arrange the packed
+;// coefficients per output pixel as below. Calculation is done with
+;// 16-bit maths.
+;//
+;// Reg      x_2_0      x_3_1      x_6_4      x_7_5      x_2_0
+;//        [  2  0 ]  [  3  1 ]  [  6  4 ]  [  7  5 ]  [ 10  8 ] ...
+;// y_0 =    20  1      20 -5         -5          1
+;// y_1 =    -5         20  1       1 20         -5
+;// y_2 =     1         -5         -5 20       1 20
+;// y_3 =                 1         20 -5      -5 20          1
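+;//
+;// A minimal C sketch of one output pixel (illustrative only; clip255
+;// is a hypothetical helper clamping to [0,255]):
+;//
+;//   int sum  = (x[0] + x[5]) - 5*(x[1] + x[4]) + 20*(x[2] + x[3]);
+;//   u8  half = clip255((sum + 16) >> 5);
+;//
+;// The code below evaluates two such products per instruction: SMLAD
+;// multiplies both 16-bit halves of a register by the packed constants
+;// mult_20_01 (20, 1) and mult_20_m5 (20, -5) and accumulates.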
+
+
+h264bsdInterpolateHorQuarter
+ STMFD sp!, {r0-r11, lr} ;// save r0-r3 too, so the args can be reloaded
+ SUB sp, sp, #0x1e4 ;// frame for p1[21*21/4+1] plus FillBlock stack args
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp4, x0, partW ;// (x0+partWidth)
+ ADD tmp4, tmp4, #5 ;// (x0+partW+5)
+ LDR width, [sp,#0x218] ;// width
+ CMP tmp4, width
+ BHI do_fill ;// (x0+partW+5) > width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp2, y0, partH ;// (y0+partHeight)
+ LDR height, [sp,#0x21c] ;// height
+ CMP tmp2, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ LDR height, [sp,#0x21c] ;// height
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp4, partW, #5 ;// tmp4 = partW + 5;
+ STMIB sp, {height, tmp4} ;// sp+4 = height, sp+8 = partWidth+5
+ STR partH, [sp,#0xc] ;// sp+c = partHeight
+ STR tmp4, [sp,#0x10] ;// sp+10 = partWidth+5
+ LDR width, [sp,#0x218] ;// width
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR tmp4, [sp,#0x218] ;// width = partWidth+5
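+
+;// In C terms the guard and fill above amount to (a sketch following
+;// the register/stack setup here, not a quote of the reference decoder):
+;//
+;//   if (x0 < 0 || x0 + partW + 5 > width || y0 < 0 || y0 + partH > height)
+;//   {
+;//       h264bsdFillBlock(ref, p1, x0, y0, width, height,
+;//                        partW + 5, partH, partW + 5);
+;//       x0 = 0; y0 = 0; ref = p1; width = partW + 5;
+;//   }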
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ MLA tmp2, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp2 ;// ref += y0*width+x0
+ ADD ref, ref, #8 ;// ref = ref+8
+ LDR mb, [sp, #0x1e8] ;// mb
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (partWidth-1)
+ ;// [27:24] loop_y (partHeight-1)
+ ;// [23:20] partWidth-1
+ ;// [19:16] partHeight-1
+ ;// [15:00] width
+ MOV count, width
+ SUB partW, partW, #1;
+ SUB partH, partH, #1;
+ ADD tmp2, partH, partW, LSL #4
+ ADD count, count, tmp2, LSL #16
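+
+;// Equivalent C for the static fields (illustrative):
+;//
+;//   count = width | ((partH - 1) << 16) | ((partW - 1) << 20);
+;//
+;// The loop counters in bits [31:24] are reseeded from these fields
+;// just before and inside loop_y below.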
+
+
+ LDR mult_20_01, = 0x00140001 ;// packed coefficients (20, 1)
+ LDR mult_20_m5, = 0x0014FFFB ;// packed coefficients (20, -5)
+ MOV plus16, #16 ;// rounding term for (sum+16)>>5
+ AND tmp1, count, #0x000F0000 ;// partHeight-1
+ AND tmp3, count, #0x00F00000 ;// partWidth-1
+ ADD count, count, tmp1, LSL #8
+loop_y
+ LDR x_3_1, [ref, #-8]
+ ADD count, count, tmp3, LSL #8
+ LDR x_7_5, [ref, #-4]
+ UXTB16 x_2_0, x_3_1
+ UXTB16 x_3_1, x_3_1, ROR #8
+ UXTB16 x_6_4, x_7_5
+
+loop_x
+ UXTB16 x_7_5, x_7_5, ROR #8
+
+ SMLAD tmp1, x_2_0, mult_20_01, plus16
+ SMLATB tmp3, x_2_0, mult_20_01, plus16
+ SMLATB tmp2, x_2_0, mult_20_m5, plus16
+ SMLATB tmp4, x_3_1, mult_20_01, plus16
+
+ SMLAD tmp1, x_3_1, mult_20_m5, tmp1
+ SMLATB tmp3, x_3_1, mult_20_m5, tmp3
+ SMLAD tmp2, x_3_1, mult_20_01, tmp2
+ LDR x_3_1, [ref], #4
+ SMLAD tmp4, x_6_4, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_6_4, mult_20_m5, tmp1
+ SMLADX tmp3, x_6_4, mult_20_m5, tmp3
+ SMLADX tmp2, x_6_4, mult_20_01, tmp2
+ SMLADX tmp4, x_7_5, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_7_5, mult_20_01, tmp1
+ UXTB16 x_2_0, x_3_1
+ SMLABB tmp2, x_7_5, mult_20_m5, tmp2
+ SMLADX tmp3, x_7_5, mult_20_01, tmp3
+ SMLABB tmp4, x_2_0, mult_20_01, tmp4
+
+ MOV tmp2, tmp2, ASR #5
+ MOV tmp1, tmp1, ASR #5
+ PKHBT tmp2, tmp2, tmp4, LSL #(16-5)
+ PKHBT tmp1, tmp1, tmp3, LSL #(16-5)
+ LDR tmp4, [sp, #0x228] ;// horOffset (0 or 1)
+ USAT16 tmp2, #8, tmp2
+ USAT16 tmp1, #8, tmp1
+ SUB tmp4, tmp4, #10 ;// ref+tmp4 = full-pel pixels under these outputs
+
+ SUBS count, count, #4<<28 ;// loop_x -= 4 (four pixels done)
+ LDR tmp3, [ref, tmp4] ;// four full-pel pixels for the average
+ ORR tmp1, tmp1, tmp2, LSL #8 ;// interleave into four packed bytes
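+;// tmp1 now holds four filtered pixels as bytes: the ASR #5 and the
+;// LSL #(16-5) inside PKHBT both perform the (sum+16)>>5 normalization,
+;// USAT16 clamps each 16-bit half to [0,255], and the final ORR gives
+;// word = out0 | (out1 << 8) | (out2 << 16) | (out3 << 24) in C terms.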
+
+;// quarter pel position
+ LDR tmp2, = 0x80808080
+ MVN tmp3, tmp3
+ UHSUB8 tmp1, tmp1, tmp3
+ EOR tmp1, tmp1, tmp2
+ STR tmp1, [mb], #4
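+
+;// The MVN/UHSUB8/EOR triple is a per-byte rounded average with no
+;// extra rounding constant. Per byte lane, with b the full-pel pixel:
+;//   UHSUB8:   (a - (255 - b)) >> 1  ==  (a + b - 255) >> 1
+;//   EOR 0x80: adds 128 mod 256, giving (a + b + 1) >> 1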
+
+ BCC next_y ;// loop_x underflowed, row done
+
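+;// Second half of the unrolled loop: the same 4-pixel filter with the
+;// roles of x_2_0/x_3_1 and x_6_4/x_7_5 swapped, so each 8-pixel
+;// iteration reloads only two words from ref.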
+ UXTB16 x_3_1, x_3_1, ROR #8
+
+ SMLAD tmp1, x_6_4, mult_20_01, plus16
+ SMLATB tmp3, x_6_4, mult_20_01, plus16
+ SMLATB tmp2, x_6_4, mult_20_m5, plus16
+ SMLATB tmp4, x_7_5, mult_20_01, plus16
+
+ SMLAD tmp1, x_7_5, mult_20_m5, tmp1
+ SMLATB tmp3, x_7_5, mult_20_m5, tmp3
+ SMLAD tmp2, x_7_5, mult_20_01, tmp2
+ LDR x_7_5, [ref], #4
+ SMLAD tmp4, x_2_0, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_2_0, mult_20_m5, tmp1
+ SMLADX tmp3, x_2_0, mult_20_m5, tmp3
+ SMLADX tmp2, x_2_0, mult_20_01, tmp2
+ SMLADX tmp4, x_3_1, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_3_1, mult_20_01, tmp1
+ UXTB16 x_6_4, x_7_5
+ SMLABB tmp2, x_3_1, mult_20_m5, tmp2
+ SMLADX tmp3, x_3_1, mult_20_01, tmp3
+ SMLABB tmp4, x_6_4, mult_20_01, tmp4
+
+ MOV tmp2, tmp2, ASR #5
+ MOV tmp1, tmp1, ASR #5
+ PKHBT tmp2, tmp2, tmp4, LSL #(16-5)
+ PKHBT tmp1, tmp1, tmp3, LSL #(16-5)
+ LDR tmp4, [sp, #0x228]
+ USAT16 tmp2, #8, tmp2
+ USAT16 tmp1, #8, tmp1
+ SUB tmp4, tmp4, #10
+
+ SUBS count, count, #4<<28
+ LDR tmp3, [ref, tmp4]
+ ORR tmp1, tmp1, tmp2, LSL #8
+
+;// quarter pel
+ LDR tmp2, = 0x80808080
+ MVN tmp3, tmp3
+ UHSUB8 tmp1, tmp1, tmp3
+ EOR tmp1, tmp1, tmp2
+
+ STR tmp1, [mb], #4
+ BCS loop_x ;// more pixels left on this row
+
+next_y
+ AND tmp3, count, #0x00F00000 ;// partWidth-1
+ SMLABB ref, count, mult_20_01, ref ;// +width
+ ADDS mb, mb, #16 ;// +16, Carry=0
+ SBC mb, mb, tmp3, LSR #20 ;// -(partWidth-1)-1
+ SBC ref, ref, tmp3, LSR #20 ;// -(partWidth-1)-1
+ ADDS count, count, #(1<<28)-(1<<24)
+ BGE loop_y
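+
+;// Row bookkeeping in C terms (illustrative):
+;//   mb  += 16    - partW;  /* output rows are 16 bytes apart */
+;//   ref += width - partW;  /* next input row, same column    */
+;// ADDS cleared the carry (mb+16 cannot wrap), so each SBC subtracts
+;// an extra 1, turning (partW-1) into partW.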
+
+ ADD sp,sp,#0x1f4 ;// 0x1e4 frame + 0x10 to skip saved r0-r3
+ LDMFD sp!, {r4-r11, pc}
+
+ END
+