diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s')
-rwxr-xr-x | media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s | 298 |
1 file changed, 298 insertions, 0 deletions
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function
;--
;-- Horizontal-only 1/8-pel chroma interpolation: each output pel is the
;-- rounded bilinear blend of two horizontally adjacent reference pels with
;-- weights (8-xFrac, xFrac).  Two rows x two columns are produced per inner
;-- loop iteration; Cb is processed first, then Cr.
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE


;// h264bsdInterpolateChromaHor register allocation
;// (each physical register is reused for two roles that are never live
;//  at the same time)

ref     RN 0
ptrA    RN 0

mb      RN 1
block   RN 1

x0      RN 2
count   RN 2

y0      RN 3
valX    RN 3

width   RN 4

height  RN 5
tmp7    RN 5

chrPW   RN 6
tmp8    RN 6

tmp1    RN 7
chrPH   RN 7

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

tmp5    RN 11

tmp6    RN 12

c32     RN 14
xFrac   RN 14

;// Function exports and imports

    IMPORT h264bsdFillBlock

    EXPORT h264bsdInterpolateChromaHor

;// Function arguments (stack-frame offsets after the prologue's
;// STMFD {r0-r11,lr} + SUB sp,#0xc4; r0-r3 are spilled at 0xc4-0xd0,
;// stack-passed arguments start at 0xf8)
;//
;// u8 *ref,                     : 0xc4
;// u8 *predPartChroma,          : 0xc8
;// i32 x0,                      : 0xcc
;// i32 y0,                      : 0xd0
;// u32 width,                   : 0xf8
;// u32 height,                  : 0xfc
;// u32 xFrac,                   : 0x100
;// u32 chromaPartWidth,         : 0x104
;// u32 chromaPartHeight         : 0x108

h264bsdInterpolateChromaHor
    STMFD   sp!, {r0-r11,lr}        ;// spill args r0-r3 + callee-saved regs
    SUB     sp, sp, #0xc4           ;// local frame (FillBlock args + pel buffer)

    ;// Decide whether the (chromaPartWidth+1) x chromaPartHeight source
    ;// window at (x0, y0) lies fully inside the reference picture.
    LDR     chrPW, [sp, #0x104]     ;// chromaPartWidth
    LDR     width, [sp, #0xf8]      ;// width
    CMP     x0, #0
    BLT     do_fill

    ADD     tmp6, x0, chrPW         ;// tmp6 = x0 + chromaPartWidth
    ADD     tmp6, tmp6, #1          ;// tmp6 = x0 + chromaPartWidth + 1
    CMP     tmp6, width             ;// x0+chromaPartWidth+1 > width
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp6, y0, chrPH         ;// tmp6 = y0 + chromaPartHeight
    CMP     tmp6, height
    BLS     skip_fill

    ;// Window crosses a picture edge: pad it into the on-stack buffer with
    ;// h264bsdFillBlock, once per chroma component (second call offsets
    ;// ref by width*height -- presumably the Cr plane follows Cb in memory).
do_fill
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp8, chrPW, #1         ;// tmp8 = chromaPartWidth+1
    MOV     tmp2, tmp8              ;// tmp2 = chromaPartWidth+1
    STMIA   sp,{width,height,tmp8,chrPH,tmp2}   ;// stack args for FillBlock
    ADD     block, sp, #0x1c        ;// block
    BL      h264bsdFillBlock

    LDR     x0, [sp, #0xcc]
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]        ;// ref
    STMIA   sp,{width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c        ;// block
    MLA     ref, height, width, ref ;// ref += width * height;
    MLA     block, chrPH, tmp8, block;// block + (chromaPH)*(chromaPW+1)
    BL      h264bsdFillBlock

    ;// Redirect the interpolation to read from the padded buffer at (0,0)
    ;// with row stride chromaPartWidth+1.
    MOV     x0, #0                  ;// x0 = 0
    MOV     y0, #0                  ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c          ;// ref = block
    STR     ref, [sp, #0xc4]        ;// ref

    STR     chrPH, [sp, #0xfc]      ;// height
    STR     tmp8, [sp, #0xf8]       ;// width
    MOV     width, tmp8
    SUB     chrPW, chrPW, #1

skip_fill
    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
    LDR     xFrac, [sp, #0x100]     ;// xFrac
    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
    RSB     valX, xFrac, #8         ;// valX = 8-xFrac

    LDR     mb, [sp, #0xc8]         ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing
    ;// NOTE(review): count aliases r2 (x0), so bits [15:0] hold leftover x0
    ;// bits; they are never examined by the loop control below.

    SUB     tmp2, chrPH, #1         ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1         ;// chromaPartWidth-1
    ADD     count, count, tmp2, LSL #16  ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24  ;// loop_y
    ADD     count, count, tmp1, LSL #20  ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000     ;// loop_x
    PKHBT   valX, valX, xFrac, LSL #16   ;// |xFrac|valX |
    MOV     valX, valX, LSL #3      ;// multiply by 8 in advance
                                    ;// (weights in 1/64 units; >>6 after +32 rounding)
    MOV     c32, #32


    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation
    ;// (two rows in parallel: [ptrA] = current row, [ptrA, width] = row below)

loop1_y
    ADD     count, count, tmp2, LSL #8   ;// reload loop_x into bits [31:28]
    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

loop1_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16    ;// pack left|right pels, row 2
    PKHBT   tmp6, tmp2, tmp4, LSL #16    ;// pack left|right pels, row 1

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32        ;// multiply
    SMLAD   tmp6, tmp6, valX, c32        ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32        ;// multiply
    SMLAD   tmp8, tmp8, valX, c32        ;// multiply

    MOV     tmp5, tmp5, LSR #6           ;// scale down
    STRB    tmp5, [mb,#8]                ;// store row 2 col 1 (row stride is 8)
    MOV     tmp6, tmp6, LSR #6           ;// scale down
    STRB    tmp6, [mb],#1                ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6           ;// scale down
    STRB    tmp7, [mb,#8]                ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6           ;// scale down
    STRB    tmp8, [mb],#1                ;// store row 1 col 2

    SUBS    count, count, #2<<28         ;// loop_x -= 2 (2 cols/iteration);
    BCS     loop1_x                      ;// carry clear = borrow -> row done

    AND     tmp2, count, #0x00F00000

    ;// Step mb and ptrA to the start of the next row pair; the ADDS/SBC
    ;// pairs fold the (chromaPartWidth-1)+borrow rewind into the row advance.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1

    ADDS    count, count, #0xE << 24     ;// loop_y -= 2 (2 rows/iteration)
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    LDR     height, [sp,#0xfc]      ;// height
    LDR     ref, [sp, #0xc4]        ;// ref
    LDR     tmp1, [sp, #0xd0]       ;// y0
    LDR     tmp2, [sp, #0xcc]       ;// x0
    LDR     mb, [sp, #0xc8]         ;// predPartChroma

    ADD     tmp1, height, tmp1      ;// skip the Cb rows: (height+y0)
    MLA     tmp3, tmp1, width, tmp2 ;// tmp3 = (height+y0)*width + x0
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64             ;// Cr output follows the 8x8 Cb block

    ;// Re-arm loop_y (bits [27:24]) from the saved chromaPartHeight-1 field.
    AND     count, count, #0x00FFFFFF
    AND     tmp1, count, #0x000F0000
    ADD     count, count, tmp1, LSL #8
    AND     tmp2, count, #0x00F00000

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation
loop2_y
    ADD     count, count, tmp2, LSL #8   ;// reload loop_x into bits [31:28]
    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

loop2_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16
    PKHBT   tmp6, tmp2, tmp4, LSL #16

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32        ;// multiply
    SMLAD   tmp6, tmp6, valX, c32        ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32        ;// multiply
    SMLAD   tmp8, tmp8, valX, c32        ;// multiply

    MOV     tmp5, tmp5, LSR #6           ;// scale down
    STRB    tmp5, [mb,#8]                ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6           ;// scale down
    STRB    tmp6, [mb],#1                ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6           ;// scale down
    STRB    tmp7, [mb,#8]                ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6           ;// scale down
    STRB    tmp8, [mb],#1                ;// store row 1 col 2

    SUBS    count, count, #2<<28         ;// loop_x -= 2; borrow ends the row
    BCS     loop2_x

    AND     tmp2, count, #0x00F00000

    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1

    ADDS    count, count, #0xE << 24     ;// loop_y -= 2
    BGE     loop2_y

    ADD     sp,sp,#0xd4             ;// pop locals (0xc4) + spilled r0-r3 (0x10)
    LDMFD   sp!, {r4-r11,pc}

    END