1 files changed, 273 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s
new file mode 100755
index 0000000..de243d4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s
@@ -0,0 +1,273 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;      http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
+;--
+;-------------------------------------------------------------------------------
+
+
+    IF :DEF: H264DEC_WINASM
+        ;// We dont use REQUIRE8 and PRESERVE8 for winasm
+    ELSE
+        REQUIRE8
+        PRESERVE8
+    ENDIF
+
+    AREA    |.text|, CODE
+
+;// h264bsdInterpolateHorQuarter register allocation
+
+ref     RN 0
+
+mb      RN 1
+buff    RN 1
+
+count   RN 2
+x0      RN 2
+
+y0      RN 3
+x_2_0   RN 3
+
+width   RN 4
+x_3_1   RN 4
+
+height  RN 5
+x_6_4   RN 5
+
+partW   RN 6
+x_7_5   RN 6
+
+partH   RN 7
+tmp1    RN 7
+
+tmp2    RN 8
+
+tmp3    RN 9
+
+tmp4    RN 10
+
+mult_20_01  RN 11
+
+mult_20_m5  RN 12
+
+plus16  RN 14
+
+
+;// function exports and imports
+
+    IMPORT  h264bsdFillBlock
+
+    EXPORT  h264bsdInterpolateHorQuarter
+
+
+;// Horizontal filter approach
+;//
+;// Basic idea in horizontal filtering is to adjust coefficients
+;// like below. Calculation is done with 16-bit maths.
+;//
+;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
+;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
+;// y_0 =   20  1     20 -5        -5         1
+;// y_1 =   -5        20  1      1 20        -5
+;// y_2 =    1        -5        -5 20      1 20
+;// y_3 =              1        20 -5     -5 20         1
+
+
+h264bsdInterpolateHorQuarter
+    STMFD   sp!, {r0-r11, lr}
+    SUB     sp, sp, #0x1e4
+
+    CMP     x0, #0
+    BLT     do_fill                 ;// (x0 < 0)
+    LDR     partW, [sp,#0x220]      ;// partWidth
+    ADD     tmp4, x0, partW         ;// (x0+partWidth)
+    ADD     tmp4, tmp4, #5          ;// (y0+partW+5)
+    LDR     width, [sp,#0x218]      ;// width
+    CMP     tmp4, width
+    BHI     do_fill                 ;// (x0+partW)>width
+
+    CMP     y0, #0
+    BLT     do_fill                 ;// (y0 < 0)
+    LDR     partH, [sp,#0x224]      ;// partHeight
+    ADD     tmp2, y0, partH         ;// (y0+partHeight)
+    LDR     height, [sp,#0x21c]     ;// height
+    CMP     tmp2, height
+    BLS     skip_fill               ;// no overfill needed
+
+
+do_fill
+    LDR     partH, [sp,#0x224]      ;// partHeight
+    LDR     height, [sp,#0x21c]     ;// height
+    LDR     partW, [sp,#0x220]      ;// partWidth
+    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
+    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
+    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
+    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
+    LDR     width, [sp,#0x218]      ;// width
+    STR     width, [sp,#0]          ;// sp+0 = width
+    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
+    BL      h264bsdFillBlock
+
+    MOV     x0, #0
+    STR     x0,[sp,#0x1ec]          ;// x0 = 0
+    STR     x0,[sp,#0x1f0]          ;// y0 = 0
+    ADD     ref,sp,#0x28            ;// ref = p1
+    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5
+
+
+skip_fill
+    LDR     x0 ,[sp,#0x1ec]         ;// x0
+    LDR     y0 ,[sp,#0x1f0]         ;// y0
+    LDR     width, [sp,#0x218]      ;// width
+    MLA     tmp2, width, y0, x0     ;// y0*width+x0
+    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
+    ADD     ref, ref, #8            ;// ref = ref+8
+    LDR     mb, [sp, #0x1e8]        ;// mb
+
+    ;// pack values to count register
+    ;// [31:28] loop_x (partWidth-1)
+    ;// [27:24] loop_y (partHeight-1)
+    ;// [23:20] partWidth-1
+    ;// [19:16] partHeight-1
+    ;// [15:00] width
+    MOV     count, width
+    SUB     partW, partW, #1;
+    SUB     partH, partH, #1;
+    ADD     tmp2, partH, partW, LSL #4
+    ADD     count, count, tmp2, LSL #16
+
+
+    LDR     mult_20_01, = 0x00140001
+    LDR     mult_20_m5, = 0x0014FFFB
+    MOV     plus16, #16
+    AND     tmp1, count, #0x000F0000    ;// partHeight-1
+    AND     tmp3, count, #0x00F00000    ;// partWidth-1
+    ADD     count, count, tmp1, LSL #8
+loop_y
+    LDR     x_3_1, [ref, #-8]
+    ADD     count, count, tmp3, LSL #8
+    LDR     x_7_5, [ref, #-4]
+    UXTB16  x_2_0, x_3_1
+    UXTB16  x_3_1, x_3_1, ROR #8
+    UXTB16  x_6_4, x_7_5
+
+loop_x
+    UXTB16  x_7_5, x_7_5, ROR #8
+
+    SMLAD   tmp1, x_2_0, mult_20_01, plus16
+    SMLATB  tmp3, x_2_0, mult_20_01, plus16
+    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
+    SMLATB  tmp4, x_3_1, mult_20_01, plus16
+
+    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
+    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
+    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
+    LDR     x_3_1, [ref], #4
+    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4
+
+    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
+    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
+    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
+    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4
+
+    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
+    UXTB16  x_2_0, x_3_1
+    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
+    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
+    SMLABB  tmp4, x_2_0, mult_20_01, tmp4
+
+    MOV     tmp2, tmp2, ASR #5
+    MOV     tmp1, tmp1, ASR #5
+    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
+    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
+    LDR     tmp4, [sp, #0x228]
+    USAT16  tmp2, #8, tmp2
+    USAT16  tmp1, #8, tmp1
+    SUB     tmp4, tmp4, #10
+
+    SUBS    count, count, #4<<28
+    LDR     tmp3, [ref, tmp4]
+    ORR     tmp1, tmp1, tmp2, LSL #8
+
+;// quarter pel position
+    LDR     tmp2, = 0x80808080
+    MVN     tmp3, tmp3
+    UHSUB8  tmp1, tmp1, tmp3
+    EOR     tmp1, tmp1, tmp2
+    STR     tmp1, [mb], #4
+
+    BCC     next_y
+
+    UXTB16  x_3_1, x_3_1, ROR #8
+
+    SMLAD   tmp1, x_6_4, mult_20_01, plus16
+    SMLATB  tmp3, x_6_4, mult_20_01, plus16
+    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
+    SMLATB  tmp4, x_7_5, mult_20_01, plus16
+
+    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
+    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
+    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
+    LDR     x_7_5, [ref], #4
+    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4
+
+    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
+    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
+    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
+    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4
+
+    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
+    UXTB16  x_6_4, x_7_5
+    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
+    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
+    SMLABB  tmp4, x_6_4, mult_20_01, tmp4
+
+    MOV     tmp2, tmp2, ASR #5
+    MOV     tmp1, tmp1, ASR #5
+    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
+    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
+    LDR     tmp4, [sp, #0x228]
+    USAT16  tmp2, #8, tmp2
+    USAT16  tmp1, #8, tmp1
+    SUB     tmp4, tmp4, #10
+
+    SUBS    count, count, #4<<28
+    LDR     tmp3, [ref, tmp4]
+    ORR     tmp1, tmp1, tmp2, LSL #8
+
+;// quarter pel
+    LDR     tmp2, = 0x80808080
+    MVN     tmp3, tmp3
+    UHSUB8  tmp1, tmp1, tmp3
+    EOR     tmp1, tmp1, tmp2
+
+    STR     tmp1, [mb], #4
+    BCS     loop_x
+
+next_y
+    AND     tmp3, count, #0x00F00000    ;// partWidth-1
+    SMLABB  ref, count, mult_20_01, ref ;// +width
+    ADDS    mb, mb, #16                 ;// +16, Carry=0
+    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
+    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
+    ADDS    count, count, #(1<<28)-(1<<24)
+    BGE     loop_y
+
+    ADD     sp,sp,#0x1f4
+    LDMFD   sp!, {r4-r11, pc}
+
+    END
+