Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm11_asm')
10 files changed, 2884 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s new file mode 100755 index 0000000..634a484 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s @@ -0,0 +1,298 @@ +; Copyright (C) 2009 The Android Open Source Project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +;------------------------------------------------------------------------------- +;-- +;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function +;-- +;------------------------------------------------------------------------------- + + + IF :DEF: H264DEC_WINASM + ;// We dont use REQUIRE8 and PRESERVE8 for winasm + ELSE + REQUIRE8 + PRESERVE8 + ENDIF + + AREA |.text|, CODE + + +;// h264bsdInterpolateChromaHor register allocation + +ref RN 0 +ptrA RN 0 + +mb RN 1 +block RN 1 + +x0 RN 2 +count RN 2 + +y0 RN 3 +valX RN 3 + +width RN 4 + +height RN 5 +tmp7 RN 5 + +chrPW RN 6 +tmp8 RN 6 + +tmp1 RN 7 +chrPH RN 7 + +tmp2 RN 8 + +tmp3 RN 9 + +tmp4 RN 10 + +tmp5 RN 11 + +tmp6 RN 12 + +c32 RN 14 +xFrac RN 14 + +;// Function exports and imports + + IMPORT h264bsdFillBlock + + EXPORT h264bsdInterpolateChromaHor + +;// Function arguments +;// +;// u8 *ref, : 0xc4 +;// u8 *predPartChroma, : 0xc8 +;// i32 x0, : 0xcc +;// i32 y0, : 0xd0 +;// u32 width, : 0xf8 +;// u32 height, : 0xfc +;// u32 xFrac, : 0x100 +;// u32 chromaPartWidth, : 0x104 +;// u32 chromaPartHeight : 0x108 + +h264bsdInterpolateChromaHor + STMFD sp!, {r0-r11,lr} + SUB sp, sp, #0xc4 + + LDR chrPW, [sp, #0x104] ;// chromaPartWidth + LDR width, [sp, #0xf8] ;// width + CMP x0, #0 + BLT do_fill + + ADD tmp6, x0, chrPW ;// tmp6 = x0+ chromaPartWidth + ADD tmp6, tmp6, #1 ;// tmp6 = x0 + chromaPartWidth + 1 + CMP tmp6, width ;// x0+chromaPartWidth+1 > width + BHI do_fill + + CMP y0, #0 + BLT do_fill + LDR chrPH, [sp, #0x108] ;// chromaPartHeight + LDR height, [sp, #0xfc] ;// height + ADD tmp6, y0, chrPH ;// tmp6 = y0 + chromaPartHeight + CMP tmp6, height + BLS skip_fill + +do_fill + LDR chrPH, [sp, #0x108] ;// chromaPartHeight + LDR height, [sp, #0xfc] ;// height + ADD tmp8, chrPW, #1 ;// tmp8 = chromaPartWidth+1 + MOV tmp2, tmp8 ;// tmp2 = chromaPartWidth+1 + STMIA sp,{width,height,tmp8,chrPH,tmp2} + ADD block, sp, #0x1c ;// block + BL h264bsdFillBlock + + LDR x0, [sp, #0xcc] + LDR y0, [sp, #0xd0] + LDR ref, [sp, #0xc4] ;// ref + STMIA sp,{width,height,tmp8,chrPH,tmp2} + ADD block, sp, #0x1c ;// block + MLA ref, height, width, ref ;// ref += width * height; + MLA block, chrPH, tmp8, block;// block + (chromaPH)*(chromaPW+1) + BL h264bsdFillBlock + + MOV x0, #0 ;// x0 = 0 + MOV y0, #0 ;// y0 = 0 + STR x0, [sp, #0xcc] + STR y0, [sp, #0xd0] + ADD ref, sp, #0x1c ;// ref = block + STR ref, [sp, #0xc4] ;// ref + + STR chrPH, [sp, #0xfc] ;// height + STR tmp8, [sp, #0xf8] ;// width + MOV width, tmp8 + SUB chrPW, chrPW, #1 + +skip_fill + MLA tmp3, y0, width, x0 ;// tmp3 = 
y0*width+x0 + LDR xFrac, [sp, #0x100] ;// xFrac + ADD ptrA, ref, tmp3 ;// ptrA = ref + y0*width+x0 + RSB valX, xFrac, #8 ;// valX = 8-xFrac + + LDR mb, [sp, #0xc8] ;// predPartChroma + + + ;// pack values to count register + ;// [31:28] loop_x (chromaPartWidth-1) + ;// [27:24] loop_y (chromaPartHeight-1) + ;// [23:20] chromaPartWidth-1 + ;// [19:16] chromaPartHeight-1 + ;// [15:00] nothing + + SUB tmp2, chrPH, #1 ;// chromaPartHeight-1 + SUB tmp1, chrPW, #1 ;// chromaPartWidth-1 + ADD count, count, tmp2, LSL #16 ;// chromaPartHeight-1 + ADD count, count, tmp2, LSL #24 ;// loop_y + ADD count, count, tmp1, LSL #20 ;// chromaPartWidth-1 + AND tmp2, count, #0x00F00000 ;// loop_x + PKHBT valX, valX, xFrac, LSL #16 ;// |xFrac|valX | + MOV valX, valX, LSL #3 ;// multiply by 8 in advance + MOV c32, #32 + + + ;/////////////////////////////////////////////////////////////////////////// + ;// Cb + ;/////////////////////////////////////////////////////////////////////////// + + ;// 2x2 pels per iteration + ;// bilinear vertical interpolation + +loop1_y + ADD count, count, tmp2, LSL #8 + LDRB tmp1, [ptrA, width] + LDRB tmp2, [ptrA], #1 + +loop1_x + LDRB tmp3, [ptrA, width] + LDRB tmp4, [ptrA], #1 + + PKHBT tmp5, tmp1, tmp3, LSL #16 + PKHBT tmp6, tmp2, tmp4, LSL #16 + + LDRB tmp1, [ptrA, width] + LDRB tmp2, [ptrA], #1 + + SMLAD tmp5, tmp5, valX, c32 ;// multiply + SMLAD tmp6, tmp6, valX, c32 ;// multiply + + PKHBT tmp7, tmp3, tmp1, LSL #16 + PKHBT tmp8, tmp4, tmp2, LSL #16 + + SMLAD tmp7, tmp7, valX, c32 ;// multiply + SMLAD tmp8, tmp8, valX, c32 ;// multiply + + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb,#8] ;// store row 2 col 1 + + MOV tmp6, tmp6, LSR #6 ;// scale down + STRB tmp6, [mb],#1 ;// store row 1 col 1 + + MOV tmp7, tmp7, LSR #6 ;// scale down + STRB tmp7, [mb,#8] ;// store row 2 col 2 + + MOV tmp8, tmp8, LSR #6 ;// scale down + STRB tmp8, [mb],#1 ;// store row 1 col 2 + + SUBS count, count, #2<<28 + BCS loop1_x + + AND tmp2, count, #0x00F00000 + + ADDS mb, mb, #16 + SBC mb, mb, tmp2, LSR #20 + ADD ptrA, ptrA, width, LSL #1 + SBC ptrA, ptrA, tmp2, LSR #20 + SUB ptrA, ptrA, #1 + + ADDS count, count, #0xE << 24 + BGE loop1_y + + ;/////////////////////////////////////////////////////////////////////////// + ;// Cr + ;/////////////////////////////////////////////////////////////////////////// + LDR height, [sp,#0xfc] ;// height + LDR ref, [sp, #0xc4] ;// ref + LDR tmp1, [sp, #0xd0] ;// y0 + LDR tmp2, [sp, #0xcc] ;// x0 + LDR mb, [sp, #0xc8] ;// predPartChroma + + ADD tmp1, height, tmp1 + MLA tmp3, tmp1, width, tmp2 + ADD ptrA, ref, tmp3 + ADD mb, mb, #64 + + AND count, count, #0x00FFFFFF + AND tmp1, count, #0x000F0000 + ADD count, count, tmp1, LSL #8 + AND tmp2, count, #0x00F00000 + + ;// 2x2 pels per iteration + ;// bilinear vertical interpolation +loop2_y + ADD count, count, tmp2, LSL #8 + LDRB tmp1, [ptrA, width] + LDRB tmp2, [ptrA], #1 + +loop2_x + LDRB tmp3, [ptrA, width] + LDRB tmp4, [ptrA], #1 + + PKHBT tmp5, tmp1, tmp3, LSL #16 + PKHBT tmp6, tmp2, tmp4, LSL #16 + + LDRB tmp1, [ptrA, width] + LDRB tmp2, [ptrA], #1 + + SMLAD tmp5, tmp5, valX, c32 ;// multiply + SMLAD tmp6, tmp6, valX, c32 ;// multiply + + PKHBT tmp7, tmp3, tmp1, LSL #16 + PKHBT tmp8, tmp4, tmp2, LSL #16 + + SMLAD tmp7, tmp7, valX, c32 ;// multiply + SMLAD tmp8, tmp8, valX, c32 ;// multiply + + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb,#8] ;// store row 2 col 1 + + MOV tmp6, tmp6, LSR #6 ;// scale down + STRB tmp6, [mb],#1 ;// store row 1 col 1 + + MOV tmp7, tmp7, LSR #6 ;// scale down + STRB tmp7, 
[mb,#8] ;// store row 2 col 2 + + MOV tmp8, tmp8, LSR #6 ;// scale down + STRB tmp8, [mb],#1 ;// store row 1 col 2 + + SUBS count, count, #2<<28 + BCS loop2_x + + AND tmp2, count, #0x00F00000 + + ADDS mb, mb, #16 + SBC mb, mb, tmp2, LSR #20 + ADD ptrA, ptrA, width, LSL #1 + SBC ptrA, ptrA, tmp2, LSR #20 + SUB ptrA, ptrA, #1 + + ADDS count, count, #0xE << 24 + BGE loop2_y + + ADD sp,sp,#0xd4 + LDMFD sp!, {r4-r11,pc} + + END diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor_ver.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor_ver.s new file mode 100755 index 0000000..7420ad3 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor_ver.s @@ -0,0 +1,339 @@ +; Copyright (C) 2009 The Android Open Source Project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +;------------------------------------------------------------------------------- +;-- +;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHorVer +;-- function +;-- +;------------------------------------------------------------------------------- + + + IF :DEF: H264DEC_WINASM + ;// We dont use REQUIRE8 and PRESERVE8 for winasm + ELSE + REQUIRE8 + PRESERVE8 + ENDIF + + AREA |.text|, CODE + + +;// h264bsdInterpolateChromaHorVer register allocation + +ref RN 0 +ptrA RN 0 + +mb RN 1 +block RN 1 + +x0 RN 2 +count RN 2 + +y0 RN 3 +valY RN 3 + +width RN 4 + +tmp4 RN 5 +height RN 5 + +tmp1 RN 6 + +tmp2 RN 7 + +tmp3 RN 8 + +valX RN 9 + +tmp5 RN 10 +chrPW RN 10 + +tmp6 RN 11 +chrPH RN 11 + +xFrac RN 12 + +c32 RN 14 +yFrac RN 14 + +;// function exports and imports + + IMPORT h264bsdFillBlock + + EXPORT h264bsdInterpolateChromaHorVer + +;// Function arguments +;// +;// u8 *ref, : 0xc4 +;// u8 *predPartChroma, : 0xc8 +;// i32 x0, : 0xcc +;// i32 y0, : 0xd0 +;// u32 width, : 0xf8 +;// u32 height, : 0xfc +;// u32 xFrac, : 0x100 +;// u32 yFrac, : 0x104 +;// u32 chromaPartWidth, : 0x108 +;// u32 chromaPartHeight : 0x10c + +h264bsdInterpolateChromaHorVer + STMFD sp!, {r0-r11,lr} + SUB sp, sp, #0xc4 + + LDR chrPW, [sp, #0x108] ;// chromaPartWidth + LDR xFrac, [sp, #0x100] ;// xFrac + LDR width, [sp, #0xf8] ;// width + CMP x0, #0 + BLT do_fill + + ADD tmp1, x0, chrPW ;// tmp1 = x0+ chromaPartWidth + ADD tmp1, tmp1, #1 ;// tmp1 = x0+ chromaPartWidth+1 + CMP tmp1, width ;// x0+chromaPartWidth+1 > width + BHI do_fill + + CMP y0, #0 + BLT do_fill + LDR chrPH, [sp, #0x10c] ;// chromaPartHeight + LDR height, [sp, #0xfc] ;// height + ADD tmp1, y0, chrPH ;// tmp1 = y0 + chromaPartHeight + ADD tmp1, tmp1, #1 ;// tmp1 = y0 + chromaPartHeight + 1 + CMP tmp1, height + BLS skip_fill + +do_fill + LDR chrPH, [sp, #0x10c] ;// chromaPartHeight + LDR height, [sp, #0xfc] ;// height + ADD tmp3, chrPW, #1 ;// tmp3 = chromaPartWidth+1 + ADD tmp1, chrPW, #1 ;// tmp1 = chromaPartWidth+1 + ADD tmp2, chrPH, #1 ;// tmp2 = chromaPartHeight+1 + STMIA sp,{width,height,tmp1,tmp2,tmp3} + ADD block, sp, #0x1c ;// block + BL 
h264bsdFillBlock + + LDR x0, [sp, #0xcc] + LDR y0, [sp, #0xd0] + LDR ref, [sp, #0xc4] ;// ref + STMIA sp,{width,height,tmp1,tmp2,tmp3} + ADD block, sp, #0x1c ;// block + MLA ref, height, width, ref ;// ref += width * height; + MLA block, tmp2, tmp1, block;// block + (chromaPW+1)*(chromaPH+1) + BL h264bsdFillBlock + + MOV x0, #0 ;// x0 = 0 + MOV y0, #0 ;// y0 = 0 + STR x0, [sp, #0xcc] + STR y0, [sp, #0xd0] + ADD ref, sp, #0x1c ;// ref = block + STR ref, [sp, #0xc4] ;// ref + + STR tmp2, [sp, #0xfc] ;// height + STR tmp1, [sp, #0xf8] ;// width + MOV width, tmp1 + +skip_fill + MLA tmp3, y0, width, x0 ;// tmp3 = y0*width+x0 + LDR yFrac, [sp, #0x104] ;// yFrac + LDR xFrac, [sp, #0x100] + ADD ptrA, ref, tmp3 ;// ptrA = ref + y0*width+x0 + RSB valX, xFrac, #8 ;// valX = 8-xFrac + RSB valY, yFrac, #8 ;// valY = 8-yFrac + + LDR mb, [sp, #0xc8] ;// predPartChroma + + + ;// pack values to count register + ;// [31:28] loop_x (chromaPartWidth-1) + ;// [27:24] loop_y (chromaPartHeight-1) + ;// [23:20] chromaPartWidth-1 + ;// [19:16] chromaPartHeight-1 + ;// [15:00] nothing + + SUB tmp2, chrPH, #1 ;// chromaPartHeight-1 + SUB tmp1, chrPW, #1 ;// chromaPartWidth-1 + ADD count, count, tmp2, LSL #16 ;// chromaPartHeight-1 + ADD count, count, tmp2, LSL #24 ;// loop_y + ADD count, count, tmp1, LSL #20 ;// chromaPartWidth-1 + AND tmp2, count, #0x00F00000 ;// loop_x + PKHBT valY, valY, yFrac, LSL #16 ;// |yFrac|valY | + MOV c32, #32 + + + ;/////////////////////////////////////////////////////////////////////////// + ;// Cb + ;/////////////////////////////////////////////////////////////////////////// + + ;// 2x2 pels per iteration + ;// bilinear vertical and horizontal interpolation + +loop1_y + LDRB tmp1, [ptrA] + LDRB tmp3, [ptrA, width] + LDRB tmp5, [ptrA, width, LSL #1] + + PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1| + PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3| + + SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac) + SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac) + + ADD count, count, tmp2, LSL #8 +loop1_x + ;// first + LDRB tmp2, [ptrA, #1]! + LDRB tmp4, [ptrA, width] + LDRB tmp6, [ptrA, width, LSL #1] + + PKHBT tmp2, tmp2, tmp4, LSL #16 ;// |t4|t2| + PKHBT tmp4, tmp4, tmp6, LSL #16 ;// |t6|t4| + + SMUAD tmp2, tmp2, valY ;// t2=(t2*valY + t4*yFrac) + MLA tmp5, tmp1, valX, c32 ;// t5=t1*valX+32 + MLA tmp5, tmp2, xFrac, tmp5 ;// t5=t2*xFrac+t5 + + SMUAD tmp4, tmp4, valY ;// t4=(t4*valY + t6*yFrac) + MLA tmp6, tmp3, valX, c32 ;// t3=t3*valX+32 + MLA tmp6, tmp4, xFrac, tmp6 ;// t6=t4*xFrac+t6 + + MOV tmp6, tmp6, LSR #6 ;// scale down + STRB tmp6, [mb, #8] ;// store pixel + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb], #1 ;// store pixel + + ;// second + LDRB tmp1, [ptrA, #1]! 
+ LDRB tmp3, [ptrA, width] + LDRB tmp5, [ptrA, width, LSL #1] + + PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1| + PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3| + + SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac) + MLA tmp5, tmp1, xFrac, c32 ;// t1=t1*xFrac+32 + MLA tmp5, tmp2, valX, tmp5 ;// t5=t2*valX+t5 + + SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac) + MLA tmp6, tmp3, xFrac, c32 ;// t3=t3*xFrac+32 + MLA tmp6, tmp4, valX, tmp6 ;// t6=t4*valX+t6 + + MOV tmp6, tmp6, LSR #6 ;// scale down + STRB tmp6, [mb, #8] ;// store pixel + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb], #1 ;// store pixel + + SUBS count, count, #2<<28 + BCS loop1_x + + AND tmp2, count, #0x00F00000 + + ADDS mb, mb, #16 + SBC mb, mb, tmp2, LSR #20 + ADD ptrA, ptrA, width, LSL #1 + SBC ptrA, ptrA, tmp2, LSR #20 + + ADDS count, count, #0xE << 24 + BGE loop1_y + + ;/////////////////////////////////////////////////////////////////////////// + ;// Cr + ;/////////////////////////////////////////////////////////////////////////// + LDR height, [sp,#0xfc] ;// height + LDR ref, [sp, #0xc4] ;// ref + LDR tmp1, [sp, #0xd0] ;// y0 + LDR tmp2, [sp, #0xcc] ;// x0 + LDR mb, [sp, #0xc8] ;// predPartChroma + + ADD tmp1, height, tmp1 + MLA tmp3, tmp1, width, tmp2 + ADD ptrA, ref, tmp3 + ADD mb, mb, #64 + + AND count, count, #0x00FFFFFF + AND tmp1, count, #0x000F0000 + ADD count, count, tmp1, LSL #8 + AND tmp2, count, #0x00F00000 + + ;// 2x2 pels per iteration + ;// bilinear vertical and horizontal interpolation +loop2_y + LDRB tmp1, [ptrA] + LDRB tmp3, [ptrA, width] + LDRB tmp5, [ptrA, width, LSL #1] + + PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1| + PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3| + + SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac) + SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac) + + ADD count, count, tmp2, LSL #8 +loop2_x + ;// first + LDRB tmp2, [ptrA, #1]! + LDRB tmp4, [ptrA, width] + LDRB tmp6, [ptrA, width, LSL #1] + + PKHBT tmp2, tmp2, tmp4, LSL #16 ;// |t4|t2| + PKHBT tmp4, tmp4, tmp6, LSL #16 ;// |t6|t4| + + SMUAD tmp2, tmp2, valY ;// t2=(t2*valY + t4*yFrac) + MLA tmp5, tmp1, valX, c32 ;// t5=t1*valX+32 + MLA tmp5, tmp2, xFrac, tmp5 ;// t5=t2*xFrac+t5 + + SMUAD tmp4, tmp4, valY ;// t4=(t4*valY + t6*yFrac) + MLA tmp6, tmp3, valX, c32 ;// t3=t3*valX+32 + MLA tmp6, tmp4, xFrac, tmp6 ;// t6=t4*xFrac+t6 + + MOV tmp6, tmp6, LSR #6 ;// scale down + STRB tmp6, [mb, #8] ;// store pixel + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb], #1 ;// store pixel + + ;// second + LDRB tmp1, [ptrA, #1]! 
+ LDRB tmp3, [ptrA, width] + LDRB tmp5, [ptrA, width, LSL #1] + + PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1| + PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3| + + SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac) + MLA tmp5, tmp1, xFrac, c32 ;// t1=t1*xFrac+32 + MLA tmp5, tmp2, valX, tmp5 ;// t5=t2*valX+t5 + + SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac) + MLA tmp6, tmp3, xFrac, c32 ;// t3=t3*xFrac+32 + MLA tmp6, tmp4, valX, tmp6 ;// t6=t4*valX+t6 + + MOV tmp6, tmp6, LSR #6 ;// scale down + STRB tmp6, [mb, #8] ;// store pixel + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb], #1 ;// store pixel + + SUBS count, count, #2<<28 + BCS loop2_x + + AND tmp2, count, #0x00F00000 + + ADDS mb, mb, #16 + SBC mb, mb, tmp2, LSR #20 + ADD ptrA, ptrA, width, LSL #1 + SBC ptrA, ptrA, tmp2, LSR #20 + + ADDS count, count, #0xE << 24 + BGE loop2_y + + ADD sp,sp,#0xd4 + LDMFD sp!,{r4-r11,pc} + + END diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_ver.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_ver.s new file mode 100755 index 0000000..af9df1b --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_ver.s @@ -0,0 +1,288 @@ +; Copyright (C) 2009 The Android Open Source Project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. 
+ +;------------------------------------------------------------------------------- +;-- +;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaVer function +;-- +;------------------------------------------------------------------------------- + + + IF :DEF: H264DEC_WINASM + ;// We dont use REQUIRE8 and PRESERVE8 for winasm + ELSE + REQUIRE8 + PRESERVE8 + ENDIF + + AREA |.text|, CODE + +;// h264bsdInterpolateChromaVer register allocation + +ref RN 0 +ptrA RN 0 + +mb RN 1 +block RN 1 + +x0 RN 2 +count RN 2 + +y0 RN 3 +valY RN 3 + +width RN 4 + +height RN 5 +tmp7 RN 5 + +chrPW RN 6 +tmp8 RN 6 + +tmp1 RN 7 + +tmp2 RN 8 + +tmp3 RN 9 + +tmp4 RN 10 + +tmp5 RN 11 +chrPH RN 11 + +tmp6 RN 12 + +c32 RN 14 +yFrac RN 14 + +;// Function exports and imports + + IMPORT h264bsdFillBlock + + EXPORT h264bsdInterpolateChromaVer + +;// Function arguments +;// +;// u8 *ref, : 0xc4 +;// u8 *predPartChroma, : 0xc8 +;// i32 x0, : 0xcc +;// i32 y0, : 0xd0 +;// u32 width, : 0xf8 +;// u32 height, : 0xfc +;// u32 yFrac, : 0x100 +;// u32 chromaPartWidth, : 0x104 +;// u32 chromaPartHeight : 0x108 + +h264bsdInterpolateChromaVer + STMFD sp!, {r0-r11,lr} + SUB sp, sp, #0xc4 + + LDR chrPW, [sp, #0x104] ;// chromaPartWidth + LDR width, [sp, #0xf8] ;// width + CMP x0, #0 + BLT do_fill + + ADD tmp1, x0, chrPW ;// tmp1 = x0+ chromaPartWidth + CMP tmp1, width ;// x0+chromaPartWidth > width + BHI do_fill + + CMP y0, #0 + BLT do_fill + LDR chrPH, [sp, #0x108] ;// chromaPartHeight + LDR height, [sp, #0xfc] ;// height + ADD tmp1, y0, chrPH ;// tmp1 = y0 + chromaPartHeight + ADD tmp1, tmp1, #1 ;// tmp1 = y0 + chromaPartHeight + 1 + CMP tmp1, height + BLS skip_fill + +do_fill + LDR chrPH, [sp, #0x108] ;// chromaPartHeight + LDR height, [sp, #0xfc] ;// height + ADD tmp1, chrPH, #1 ;// tmp1 = chromaPartHeight+1 + MOV tmp2, chrPW ;// tmp2 = chromaPartWidth + STMIA sp,{width,height,chrPW,tmp1,tmp2} + ADD block, sp, #0x1c ;// block + BL h264bsdFillBlock + + LDR x0, [sp, #0xcc] + LDR y0, [sp, #0xd0] + LDR ref, [sp, #0xc4] ;// ref + STMIA sp,{width,height,chrPW,tmp1,tmp2} + ADD block, sp, #0x1c ;// block + MLA ref, height, width, ref ;// ref += width * height; + MLA block, chrPW, tmp1, block;// block + (chromaPW)*(chromaPH+1) + BL h264bsdFillBlock + + MOV x0, #0 ;// x0 = 0 + MOV y0, #0 ;// y0 = 0 + STR x0, [sp, #0xcc] + STR y0, [sp, #0xd0] + ADD ref, sp, #0x1c ;// ref = block + STR ref, [sp, #0xc4] ;// ref + + STR tmp1, [sp, #0xfc] ;// height + STR chrPW, [sp, #0xf8] ;// width + MOV width, chrPW + +skip_fill + MLA tmp3, y0, width, x0 ;// tmp3 = y0*width+x0 + LDR yFrac, [sp, #0x100] ;// yFrac + ADD ptrA, ref, tmp3 ;// ptrA = ref + y0*width+x0 + RSB valY, yFrac, #8 ;// valY = 8-yFrac + + LDR mb, [sp, #0xc8] ;// predPartChroma + + + ;// pack values to count register + ;// [31:28] loop_x (chromaPartWidth-1) + ;// [27:24] loop_y (chromaPartHeight-1) + ;// [23:20] chromaPartWidth-1 + ;// [19:16] chromaPartHeight-1 + ;// [15:00] nothing + + SUB tmp2, chrPH, #1 ;// chromaPartHeight-1 + SUB tmp1, chrPW, #1 ;// chromaPartWidth-1 + ADD count, count, tmp2, LSL #16 ;// chromaPartHeight-1 + ADD count, count, tmp2, LSL #24 ;// loop_y + ADD count, count, tmp1, LSL #20 ;// chromaPartWidth-1 + AND tmp2, count, #0x00F00000 ;// loop_x + PKHBT valY, valY, yFrac, LSL #16 ;// |yFrac|valY | + MOV valY, valY, LSL #3 ;// multiply by 8 in advance + MOV c32, #32 + + + ;/////////////////////////////////////////////////////////////////////////// + ;// Cb + ;/////////////////////////////////////////////////////////////////////////// + + ;// 2x2 pels per 
iteration + ;// bilinear vertical interpolation + +loop1_y + ADD count, count, tmp2, LSL #8 +loop1_x + ;// Process 2x2 block + LDRB tmp2, [ptrA,width] ;// 2 row, 1 col + LDRB tmp3, [ptrA,width, LSL #1] ;// 3 row, 1 col + LDRB tmp1, [ptrA],#1 ;// 1 row, 1 col + + LDRB tmp5, [ptrA,width] ;// 2 row, 2 col + LDRB tmp6, [ptrA,width, LSL #1] ;// 3 row, 2 col + LDRB tmp4, [ptrA],#1 ;// 1 row, 2 col + + PKHBT tmp1, tmp1, tmp2, LSL #16 ;// |B|A| + PKHBT tmp2, tmp2, tmp3, LSL #16 ;// |C|B| + PKHBT tmp4, tmp4, tmp5, LSL #16 ;// |B|A| + + SMLAD tmp7, tmp2, valY, c32 ;// multiply + PKHBT tmp5, tmp5, tmp6, LSL #16 ;// |C|B| + SMLAD tmp2, tmp1, valY, c32 ;// multiply + SMLAD tmp8, tmp5, valY, c32 ;// multiply + SMLAD tmp5, tmp4, valY, c32 ;// multiply + + MOV tmp7, tmp7, LSR #6 ;// scale down + STRB tmp7, [mb,#8] ;// store row 2 col 1 + MOV tmp2, tmp2, LSR #6 ;// scale down + STRB tmp2, [mb],#1 ;// store row 1 col 1 + + MOV tmp8, tmp8, LSR #6 ;// scale down + STRB tmp8, [mb,#8] ;// store row 2 col 2 + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb],#1 ;// store row 1 col 2 + + + SUBS count, count, #2<<28 + BCS loop1_x + + AND tmp2, count, #0x00F00000 + + ADDS mb, mb, #16 + SBC mb, mb, tmp2, LSR #20 + ADD ptrA, ptrA, width, LSL #1 + SBC ptrA, ptrA, tmp2, LSR #20 + + ADDS count, count, #0xE << 24 + BGE loop1_y + + ;/////////////////////////////////////////////////////////////////////////// + ;// Cr + ;/////////////////////////////////////////////////////////////////////////// + LDR height, [sp,#0xfc] ;// height + LDR ref, [sp, #0xc4] ;// ref + LDR tmp1, [sp, #0xd0] ;// y0 + LDR tmp2, [sp, #0xcc] ;// x0 + LDR mb, [sp, #0xc8] ;// predPartChroma + + ADD tmp1, height, tmp1 + MLA tmp3, tmp1, width, tmp2 + ADD ptrA, ref, tmp3 + ADD mb, mb, #64 + + AND count, count, #0x00FFFFFF + AND tmp1, count, #0x000F0000 + ADD count, count, tmp1, LSL #8 + AND tmp2, count, #0x00F00000 + + ;// 2x2 pels per iteration + ;// bilinear vertical interpolation +loop2_y + ADD count, count, tmp2, LSL #8 +loop2_x + ;// Process 2x2 block + LDRB tmp2, [ptrA,width] ;// 2 row, 1 col + LDRB tmp3, [ptrA,width, LSL #1] ;// 3 row, 1 col + LDRB tmp1, [ptrA],#1 ;// 1 row, 1 col + + LDRB tmp5, [ptrA,width] ;// 2 row, 2 col + LDRB tmp6, [ptrA,width, LSL #1] ;// 3 row, 2 col + LDRB tmp4, [ptrA],#1 ;// 1 row, 2 col + + PKHBT tmp1, tmp1, tmp2, LSL #16 ;// |B|A| + PKHBT tmp2, tmp2, tmp3, LSL #16 ;// |C|B| + PKHBT tmp4, tmp4, tmp5, LSL #16 ;// |B|A| + + SMLAD tmp7, tmp2, valY, c32 ;// multiply + PKHBT tmp5, tmp5, tmp6, LSL #16 ;// |C|B| + SMLAD tmp2, tmp1, valY, c32 ;// multiply + SMLAD tmp8, tmp5, valY, c32 ;// multiply + SMLAD tmp5, tmp4, valY, c32 ;// multiply + + MOV tmp7, tmp7, LSR #6 ;// scale down + STRB tmp7, [mb,#8] ;// store row 2 col 1 + MOV tmp2, tmp2, LSR #6 ;// scale down + STRB tmp2, [mb],#1 ;// store row 1 col 1 + + MOV tmp8, tmp8, LSR #6 ;// scale down + STRB tmp8, [mb,#8] ;// store row 2 col 2 + MOV tmp5, tmp5, LSR #6 ;// scale down + STRB tmp5, [mb],#1 ;// store row 1 col 2 + + + SUBS count, count, #2<<28 + BCS loop2_x + + AND tmp2, count, #0x00F00000 + + ADDS mb, mb, #16 + SBC mb, mb, tmp2, LSR #20 + ADD ptrA, ptrA, width, LSL #1 + SBC ptrA, ptrA, tmp2, LSR #20 + + ADDS count, count, #0xE << 24 + BGE loop2_y + + ADD sp,sp,#0xd4 + LDMFD sp!, {r4-r11,pc} + + END diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s new file mode 100755 index 0000000..93968b6 --- /dev/null +++ 
b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s @@ -0,0 +1,251 @@ +; Copyright (C) 2009 The Android Open Source Project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +;------------------------------------------------------------------------------- +;-- +;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function +;-- +;------------------------------------------------------------------------------- + + + IF :DEF: H264DEC_WINASM + ;// We dont use REQUIRE8 and PRESERVE8 for winasm + ELSE + REQUIRE8 + PRESERVE8 + ENDIF + + AREA |.text|, CODE + +;// h264bsdInterpolateHorHalf register allocation + +ref RN 0 + +mb RN 1 +buff RN 1 + +count RN 2 +x0 RN 2 + +y0 RN 3 +x_2_0 RN 3 + +width RN 4 +x_3_1 RN 4 + +height RN 5 +x_6_4 RN 5 + +partW RN 6 +x_7_5 RN 6 + +partH RN 7 +tmp1 RN 7 + +tmp2 RN 8 + +tmp3 RN 9 + +tmp4 RN 10 + +mult_20_01 RN 11 +mult_20_m5 RN 12 + +plus16 RN 14 + + +;// function exports and imports + + IMPORT h264bsdFillBlock + + EXPORT h264bsdInterpolateHorHalf + +;// Horizontal filter approach +;// +;// Basic idea in horizontal filtering is to adjust coefficients +;// like below. Calculation is done with 16-bit maths. +;// +;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0 +;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ... 
+;// y_0 = 20 1 20 -5 -5 1 +;// y_1 = -5 20 1 1 20 -5 +;// y_2 = 1 -5 -5 20 1 20 +;// y_3 = 1 20 -5 -5 20 1 + + +h264bsdInterpolateHorHalf + STMFD sp!, {r0-r11, lr} + SUB sp, sp, #0x1e4 + + CMP x0, #0 + BLT do_fill ;// (x0 < 0) + LDR partW, [sp,#0x220] ;// partWidth + ADD tmp4, x0, partW ;// (x0+partWidth) + ADD tmp4, tmp4, #5 ;// (y0+partW+5) + LDR width, [sp,#0x218] ;// width + CMP tmp4, width + BHI do_fill ;// (x0+partW)>width + + CMP y0, #0 + BLT do_fill ;// (y0 < 0) + LDR partH, [sp,#0x224] ;// partHeight + ADD tmp2, y0, partH ;// (y0+partHeight) + LDR height, [sp,#0x21c] ;// height + CMP tmp2, height + BLS skip_fill ;// no overfill needed + + +do_fill + LDR partH, [sp,#0x224] ;// partHeight + LDR height, [sp,#0x21c] ;// height + LDR partW, [sp,#0x220] ;// partWidth + ADD tmp4, partW, #5 ;// tmp4 = partW + 5; + STMIB sp, {height, tmp4} ;// sp+4 = height, sp+8 = partWidth+5 + STR partH, [sp,#0xc] ;// sp+c = partHeight + STR tmp4, [sp,#0x10] ;// sp+10 = partWidth+5 + LDR width, [sp,#0x218] ;// width + STR width, [sp,#0] ;// sp+0 = width + ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1] + BL h264bsdFillBlock + + MOV x0, #0 + STR x0,[sp,#0x1ec] ;// x0 = 0 + STR x0,[sp,#0x1f0] ;// y0 = 0 + ADD ref,sp,#0x28 ;// ref = p1 + STR tmp4, [sp,#0x218] ;// width = partWidth+5 + + +skip_fill + LDR x0 ,[sp,#0x1ec] ;// x0 + LDR y0 ,[sp,#0x1f0] ;// y0 + LDR width, [sp,#0x218] ;// width + MLA tmp2, width, y0, x0 ;// y0*width+x0 + ADD ref, ref, tmp2 ;// ref += y0*width+x0 + ADD ref, ref, #8 ;// ref = ref+8 + LDR mb, [sp, #0x1e8] ;// mb + + ;// pack values to count register + ;// [31:28] loop_x (partWidth-1) + ;// [27:24] loop_y (partHeight-1) + ;// [23:20] partWidth-1 + ;// [19:16] partHeight-1 + ;// [15:00] width + MOV count, width + SUB partW, partW, #1; + SUB partH, partH, #1; + ADD tmp2, partH, partW, LSL #4 + ADD count, count, tmp2, LSL #16 + + + LDR mult_20_01, = 0x00140001 + LDR mult_20_m5, = 0x0014FFFB + MOV plus16, #16 + AND tmp1, count, #0x000F0000 ;// partHeight-1 + AND tmp3, count, #0x00F00000 ;// partWidth-1 + ADD count, count, tmp1, LSL #8 +loop_y + LDR x_3_1, [ref, #-8] + ADD count, count, tmp3, LSL #8 + LDR x_7_5, [ref, #-4] + UXTB16 x_2_0, x_3_1 + UXTB16 x_3_1, x_3_1, ROR #8 + UXTB16 x_6_4, x_7_5 + +loop_x + UXTB16 x_7_5, x_7_5, ROR #8 + + SMLAD tmp1, x_2_0, mult_20_01, plus16 + SMLATB tmp3, x_2_0, mult_20_01, plus16 + SMLATB tmp2, x_2_0, mult_20_m5, plus16 + SMLATB tmp4, x_3_1, mult_20_01, plus16 + + SMLAD tmp1, x_3_1, mult_20_m5, tmp1 + SMLATB tmp3, x_3_1, mult_20_m5, tmp3 + SMLAD tmp2, x_3_1, mult_20_01, tmp2 + LDR x_3_1, [ref], #4 + SMLAD tmp4, x_6_4, mult_20_m5, tmp4 + + SMLABB tmp1, x_6_4, mult_20_m5, tmp1 + SMLADX tmp3, x_6_4, mult_20_m5, tmp3 + SMLADX tmp2, x_6_4, mult_20_01, tmp2 + SMLADX tmp4, x_7_5, mult_20_m5, tmp4 + + SMLABB tmp1, x_7_5, mult_20_01, tmp1 + UXTB16 x_2_0, x_3_1 + SMLABB tmp2, x_7_5, mult_20_m5, tmp2 + SMLADX tmp3, x_7_5, mult_20_01, tmp3 + SMLABB tmp4, x_2_0, mult_20_01, tmp4 + + MOV tmp2, tmp2, ASR #5 + MOV tmp1, tmp1, ASR #5 + PKHBT tmp2, tmp2, tmp4, LSL #(16-5) + PKHBT tmp1, tmp1, tmp3, LSL #(16-5) + USAT16 tmp2, #8, tmp2 + USAT16 tmp1, #8, tmp1 + + SUBS count, count, #4<<28 + ORR tmp1, tmp1, tmp2, LSL #8 + STR tmp1, [mb], #4 + BCC next_y + + UXTB16 x_3_1, x_3_1, ROR #8 + + SMLAD tmp1, x_6_4, mult_20_01, plus16 + SMLATB tmp3, x_6_4, mult_20_01, plus16 + SMLATB tmp2, x_6_4, mult_20_m5, plus16 + SMLATB tmp4, x_7_5, mult_20_01, plus16 + + SMLAD tmp1, x_7_5, mult_20_m5, tmp1 + SMLATB tmp3, x_7_5, mult_20_m5, tmp3 + SMLAD tmp2, x_7_5, mult_20_01, tmp2 + LDR x_7_5, 
[ref], #4 + SMLAD tmp4, x_2_0, mult_20_m5, tmp4 + + SMLABB tmp1, x_2_0, mult_20_m5, tmp1 + SMLADX tmp3, x_2_0, mult_20_m5, tmp3 + SMLADX tmp2, x_2_0, mult_20_01, tmp2 + SMLADX tmp4, x_3_1, mult_20_m5, tmp4 + + SMLABB tmp1, x_3_1, mult_20_01, tmp1 + UXTB16 x_6_4, x_7_5 + SMLABB tmp2, x_3_1, mult_20_m5, tmp2 + SMLADX tmp3, x_3_1, mult_20_01, tmp3 + SMLABB tmp4, x_6_4, mult_20_01, tmp4 + + MOV tmp2, tmp2, ASR #5 + MOV tmp1, tmp1, ASR #5 + PKHBT tmp2, tmp2, tmp4, LSL #(16-5) + PKHBT tmp1, tmp1, tmp3, LSL #(16-5) + USAT16 tmp2, #8, tmp2 + USAT16 tmp1, #8, tmp1 + + SUBS count, count, #4<<28 + ORR tmp1, tmp1, tmp2, LSL #8 + STR tmp1, [mb], #4 + BCS loop_x + +next_y + AND tmp3, count, #0x00F00000 ;// partWidth-1 + SMLABB ref, count, mult_20_01, ref ;// +width + ADDS mb, mb, #16 ;// +16, Carry=0 + SBC mb, mb, tmp3, LSR #20 ;// -(partWidth-1)-1 + SBC ref, ref, tmp3, LSR #20 ;// -(partWidth-1)-1 + ADDS count, count, #(1<<28)-(1<<24) + BGE loop_y + + ADD sp,sp,#0x1f4 + LDMFD sp!, {r4-r11, pc} + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s new file mode 100755 index 0000000..de243d4 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s @@ -0,0 +1,273 @@ +; Copyright (C) 2009 The Android Open Source Project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +;------------------------------------------------------------------------------- +;-- +;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function +;-- +;------------------------------------------------------------------------------- + + + IF :DEF: H264DEC_WINASM + ;// We dont use REQUIRE8 and PRESERVE8 for winasm + ELSE + REQUIRE8 + PRESERVE8 + ENDIF + + AREA |.text|, CODE + +;// h264bsdInterpolateHorQuarter register allocation + +ref RN 0 + +mb RN 1 +buff RN 1 + +count RN 2 +x0 RN 2 + +y0 RN 3 +x_2_0 RN 3 + +width RN 4 +x_3_1 RN 4 + +height RN 5 +x_6_4 RN 5 + +partW RN 6 +x_7_5 RN 6 + +partH RN 7 +tmp1 RN 7 + +tmp2 RN 8 + +tmp3 RN 9 + +tmp4 RN 10 + +mult_20_01 RN 11 + +mult_20_m5 RN 12 + +plus16 RN 14 + + +;// function exports and imports + + IMPORT h264bsdFillBlock + + EXPORT h264bsdInterpolateHorQuarter + + +;// Horizontal filter approach +;// +;// Basic idea in horizontal filtering is to adjust coefficients +;// like below. Calculation is done with 16-bit maths. +;// +;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0 +;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ... 
+;// y_0 = 20 1 20 -5 -5 1 +;// y_1 = -5 20 1 1 20 -5 +;// y_2 = 1 -5 -5 20 1 20 +;// y_3 = 1 20 -5 -5 20 1 + + +h264bsdInterpolateHorQuarter + STMFD sp!, {r0-r11, lr} + SUB sp, sp, #0x1e4 + + CMP x0, #0 + BLT do_fill ;// (x0 < 0) + LDR partW, [sp,#0x220] ;// partWidth + ADD tmp4, x0, partW ;// (x0+partWidth) + ADD tmp4, tmp4, #5 ;// (y0+partW+5) + LDR width, [sp,#0x218] ;// width + CMP tmp4, width + BHI do_fill ;// (x0+partW)>width + + CMP y0, #0 + BLT do_fill ;// (y0 < 0) + LDR partH, [sp,#0x224] ;// partHeight + ADD tmp2, y0, partH ;// (y0+partHeight) + LDR height, [sp,#0x21c] ;// height + CMP tmp2, height + BLS skip_fill ;// no overfill needed + + +do_fill + LDR partH, [sp,#0x224] ;// partHeight + LDR height, [sp,#0x21c] ;// height + LDR partW, [sp,#0x220] ;// partWidth + ADD tmp4, partW, #5 ;// tmp4 = partW + 5; + STMIB sp, {height, tmp4} ;// sp+4 = height, sp+8 = partWidth+5 + STR partH, [sp,#0xc] ;// sp+c = partHeight + STR tmp4, [sp,#0x10] ;// sp+10 = partWidth+5 + LDR width, [sp,#0x218] ;// width + STR width, [sp,#0] ;// sp+0 = width + ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1] + BL h264bsdFillBlock + + MOV x0, #0 + STR x0,[sp,#0x1ec] ;// x0 = 0 + STR x0,[sp,#0x1f0] ;// y0 = 0 + ADD ref,sp,#0x28 ;// ref = p1 + STR tmp4, [sp,#0x218] ;// width = partWidth+5 + + +skip_fill + LDR x0 ,[sp,#0x1ec] ;// x0 + LDR y0 ,[sp,#0x1f0] ;// y0 + LDR width, [sp,#0x218] ;// width + MLA tmp2, width, y0, x0 ;// y0*width+x0 + ADD ref, ref, tmp2 ;// ref += y0*width+x0 + ADD ref, ref, #8 ;// ref = ref+8 + LDR mb, [sp, #0x1e8] ;// mb + + ;// pack values to count register + ;// [31:28] loop_x (partWidth-1) + ;// [27:24] loop_y (partHeight-1) + ;// [23:20] partWidth-1 + ;// [19:16] partHeight-1 + ;// [15:00] width + MOV count, width + SUB partW, partW, #1; + SUB partH, partH, #1; + ADD tmp2, partH, partW, LSL #4 + ADD count, count, tmp2, LSL #16 + + + LDR mult_20_01, = 0x00140001 + LDR mult_20_m5, = 0x0014FFFB + MOV plus16, #16 + AND tmp1, count, #0x000F0000 ;// partHeight-1 + AND tmp3, count, #0x00F00000 ;// partWidth-1 + ADD count, count, tmp1, LSL #8 +loop_y + LDR x_3_1, [ref, #-8] + ADD count, count, tmp3, LSL #8 + LDR x_7_5, [ref, #-4] + UXTB16 x_2_0, x_3_1 + UXTB16 x_3_1, x_3_1, ROR #8 + UXTB16 x_6_4, x_7_5 + +loop_x + UXTB16 x_7_5, x_7_5, ROR #8 + + SMLAD tmp1, x_2_0, mult_20_01, plus16 + SMLATB tmp3, x_2_0, mult_20_01, plus16 + SMLATB tmp2, x_2_0, mult_20_m5, plus16 + SMLATB tmp4, x_3_1, mult_20_01, plus16 + + SMLAD tmp1, x_3_1, mult_20_m5, tmp1 + SMLATB tmp3, x_3_1, mult_20_m5, tmp3 + SMLAD tmp2, x_3_1, mult_20_01, tmp2 + LDR x_3_1, [ref], #4 + SMLAD tmp4, x_6_4, mult_20_m5, tmp4 + + SMLABB tmp1, x_6_4, mult_20_m5, tmp1 + SMLADX tmp3, x_6_4, mult_20_m5, tmp3 + SMLADX tmp2, x_6_4, mult_20_01, tmp2 + SMLADX tmp4, x_7_5, mult_20_m5, tmp4 + + SMLABB tmp1, x_7_5, mult_20_01, tmp1 + UXTB16 x_2_0, x_3_1 + SMLABB tmp2, x_7_5, mult_20_m5, tmp2 + SMLADX tmp3, x_7_5, mult_20_01, tmp3 + SMLABB tmp4, x_2_0, mult_20_01, tmp4 + + MOV tmp2, tmp2, ASR #5 + MOV tmp1, tmp1, ASR #5 + PKHBT tmp2, tmp2, tmp4, LSL #(16-5) + PKHBT tmp1, tmp1, tmp3, LSL #(16-5) + LDR tmp4, [sp, #0x228] + USAT16 tmp2, #8, tmp2 + USAT16 tmp1, #8, tmp1 + SUB tmp4, tmp4, #10 + + SUBS count, count, #4<<28 + LDR tmp3, [ref, tmp4] + ORR tmp1, tmp1, tmp2, LSL #8 + +;// quarter pel position + LDR tmp2, = 0x80808080 + MVN tmp3, tmp3 + UHSUB8 tmp1, tmp1, tmp3 + EOR tmp1, tmp1, tmp2 + STR tmp1, [mb], #4 + + BCC next_y + + UXTB16 x_3_1, x_3_1, ROR #8 + + SMLAD tmp1, x_6_4, mult_20_01, plus16 + SMLATB tmp3, x_6_4, mult_20_01, plus16 + SMLATB tmp2, 
x_6_4, mult_20_m5, plus16 + SMLATB tmp4, x_7_5, mult_20_01, plus16 + + SMLAD tmp1, x_7_5, mult_20_m5, tmp1 + SMLATB tmp3, x_7_5, mult_20_m5, tmp3 + SMLAD tmp2, x_7_5, mult_20_01, tmp2 + LDR x_7_5, [ref], #4 + SMLAD tmp4, x_2_0, mult_20_m5, tmp4 + + SMLABB tmp1, x_2_0, mult_20_m5, tmp1 + SMLADX tmp3, x_2_0, mult_20_m5, tmp3 + SMLADX tmp2, x_2_0, mult_20_01, tmp2 + SMLADX tmp4, x_3_1, mult_20_m5, tmp4 + + SMLABB tmp1, x_3_1, mult_20_01, tmp1 + UXTB16 x_6_4, x_7_5 + SMLABB tmp2, x_3_1, mult_20_m5, tmp2 + SMLADX tmp3, x_3_1, mult_20_01, tmp3 + SMLABB tmp4, x_6_4, mult_20_01, tmp4 + + MOV tmp2, tmp2, ASR #5 + MOV tmp1, tmp1, ASR #5 + PKHBT tmp2, tmp2, tmp4, LSL #(16-5) + PKHBT tmp1, tmp1, tmp3, LSL #(16-5) + LDR tmp4, [sp, #0x228] + USAT16 tmp2, #8, tmp2 + USAT16 tmp1, #8, tmp1 + SUB tmp4, tmp4, #10 + + SUBS count, count, #4<<28 + LDR tmp3, [ref, tmp4] + ORR tmp1, tmp1, tmp2, LSL #8 + +;// quarter pel + LDR tmp2, = 0x80808080 + MVN tmp3, tmp3 + UHSUB8 tmp1, tmp1, tmp3 + EOR tmp1, tmp1, tmp2 + + STR tmp1, [mb], #4 + BCS loop_x + +next_y + AND tmp3, count, #0x00F00000 ;// partWidth-1 + SMLABB ref, count, mult_20_01, ref ;// +width + ADDS mb, mb, #16 ;// +16, Carry=0 + SBC mb, mb, tmp3, LSR #20 ;// -(partWidth-1)-1 + SBC ref, ref, tmp3, LSR #20 ;// -(partWidth-1)-1 + ADDS count, count, #(1<<28)-(1<<24) + BGE loop_y + + ADD sp,sp,#0x1f4 + LDMFD sp!, {r4-r11, pc} + + END + diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s new file mode 100755 index 0000000..1c79b39 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s @@ -0,0 +1,536 @@ +; Copyright (C) 2009 The Android Open Source Project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +;------------------------------------------------------------------------------- +;-- +;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter +;-- function +;-- +;------------------------------------------------------------------------------- + + + IF :DEF: H264DEC_WINASM + ;// We dont use REQUIRE8 and PRESERVE8 for winasm + ELSE + REQUIRE8 + PRESERVE8 + ENDIF + + AREA |.text|, CODE + +;// h264bsdInterpolateHorVerQuarter register allocation + +ref RN 0 + +mb RN 1 +buff RN 1 + +count RN 2 +x0 RN 2 + +y0 RN 3 +x_2_0 RN 3 +res RN 3 + +x_3_1 RN 4 +tmp1 RN 4 + +height RN 5 +x_6_4 RN 5 +tmp2 RN 5 + +partW RN 6 +x_7_5 RN 6 +tmp3 RN 6 + +partH RN 7 +tmp4 RN 7 + +tmp5 RN 8 + +tmp6 RN 9 + +tmpa RN 10 + +mult_20_01 RN 11 +tmpb RN 11 + +mult_20_m5 RN 12 +width RN 12 + +plus16 RN 14 + + +;// function exports and imports + + IMPORT h264bsdFillBlock + + EXPORT h264bsdInterpolateHorVerQuarter + +;// Horizontal filter approach +;// +;// Basic idea in horizontal filtering is to adjust coefficients +;// like below. Calculation is done with 16-bit maths. +;// +;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0 +;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ... 
+;// y_0 = 20 1 20 -5 -5 1 +;// y_1 = -5 20 1 1 20 -5 +;// y_2 = 1 -5 -5 20 1 20 +;// y_3 = 1 20 -5 -5 20 1 + + +h264bsdInterpolateHorVerQuarter + STMFD sp!, {r0-r11, lr} + SUB sp, sp, #0x1e4 + + CMP x0, #0 + BLT do_fill ;// (x0 < 0) + LDR partW, [sp,#0x220] ;// partWidth + LDR width, [sp,#0x218] ;// width + ADD tmpa, x0, partW ;// (x0+partWidth) + ADD tmpa, tmpa, #5 ;// (x0+partW+5) + CMP tmpa, width + BHI do_fill ;// (x0+partW)>width + + CMP y0, #0 + BLT do_fill ;// (y0 < 0) + LDR partH, [sp,#0x224] ;// partHeight + LDR height, [sp,#0x21c] ;// height + ADD tmp5, y0, partH ;// (y0+partHeight) + ADD tmp5, tmp5, #5 ;// (y0+partH+5) + CMP tmp5, height + BLS skip_fill ;// no overfill needed + + +do_fill + LDR partH, [sp,#0x224] ;// partHeight + LDR partW, [sp,#0x220] ;// partWidth + LDR height, [sp,#0x21c] ;// height + ADD tmp5, partH, #5 ;// tmp5 = partH + 5 + ADD tmpa, partW, #5 ;// tmpa = partW + 5 + STMIB sp, {height, tmpa} ;// sp+4 = height, sp+8 = partWidth+5 + LDR width, [sp,#0x218] ;// width + STR tmp5, [sp,#0xc] ;// sp+c = partHeight+5 + STR tmpa, [sp,#0x10] ;// sp+10 = partWidth+5 + STR width, [sp,#0] ;// sp+0 = width + ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1] + BL h264bsdFillBlock + + MOV x0, #0 + STR x0,[sp,#0x1ec] ;// x0 = 0 + STR x0,[sp,#0x1f0] ;// y0 = 0 + ADD ref,sp,#0x28 ;// ref = p1 + STR tmpa, [sp,#0x218] ;// width = partWidth+5 + + +skip_fill + LDR x0 ,[sp,#0x1ec] ;// x0 + LDR y0 ,[sp,#0x1f0] ;// y0 + LDR width, [sp,#0x218] ;// width + LDR tmp6, [sp,#0x228] ;// horVerOffset + LDR mb, [sp, #0x1e8] ;// mb + MLA tmp5, width, y0, x0 ;// y0*width+x0 + ADD ref, ref, tmp5 ;// ref += y0*width+x0 + STR ref, [sp, #0x1e4] ;// store "ref" for vertical filtering + AND tmp6, tmp6, #2 ;// calculate ref for horizontal filter + MOV tmpa, #2 + ADD tmp6, tmpa, tmp6, LSR #1 + MLA ref, tmp6, width, ref + ADD ref, ref, #8 ;// ref = ref+8 + + ;// pack values to count register + ;// [31:28] loop_x (partWidth-1) + ;// [27:24] loop_y (partHeight-1) + ;// [23:20] partWidth-1 + ;// [19:16] partHeight-1 + ;// [15:00] width + MOV count, width + SUB partW, partW, #1; + SUB partH, partH, #1; + ADD tmp5, partH, partW, LSL #4 + ADD count, count, tmp5, LSL #16 + + + LDR mult_20_01, = 0x00140001 ;// constant multipliers + LDR mult_20_m5, = 0x0014FFFB ;// constant multipliers + MOV plus16, #16 ;// constant for add + AND tmp4, count, #0x000F0000 ;// partHeight-1 + AND tmp6, count, #0x00F00000 ;// partWidth-1 + ADD count, count, tmp4, LSL #8 ;// partH-1 to lower part of top byte + +;// HORIZONTAL PART + +loop_y_hor + LDR x_3_1, [ref, #-8] + ADD count, count, tmp6, LSL #8 ;// partW-1 to upper part of top byte + LDR x_7_5, [ref, #-4] + UXTB16 x_2_0, x_3_1 + UXTB16 x_3_1, x_3_1, ROR #8 + UXTB16 x_6_4, x_7_5 + +loop_x_hor + UXTB16 x_7_5, x_7_5, ROR #8 + + SMLAD tmp4, x_2_0, mult_20_01, plus16 + SMLATB tmp6, x_2_0, mult_20_01, plus16 + SMLATB tmp5, x_2_0, mult_20_m5, plus16 + SMLATB tmpa, x_3_1, mult_20_01, plus16 + + SMLAD tmp4, x_3_1, mult_20_m5, tmp4 + SMLATB tmp6, x_3_1, mult_20_m5, tmp6 + SMLAD tmp5, x_3_1, mult_20_01, tmp5 + LDR x_3_1, [ref], #4 + SMLAD tmpa, x_6_4, mult_20_m5, tmpa + + SMLABB tmp4, x_6_4, mult_20_m5, tmp4 + SMLADX tmp6, x_6_4, mult_20_m5, tmp6 + SMLADX tmp5, x_6_4, mult_20_01, tmp5 + SMLADX tmpa, x_7_5, mult_20_m5, tmpa + + SMLABB tmp4, x_7_5, mult_20_01, tmp4 + UXTB16 x_2_0, x_3_1 + SMLABB tmp5, x_7_5, mult_20_m5, tmp5 + SMLADX tmp6, x_7_5, mult_20_01, tmp6 + SMLABB tmpa, x_2_0, mult_20_01, tmpa + + MOV tmp5, tmp5, ASR #5 + MOV tmp4, tmp4, ASR #5 + PKHBT tmp5, tmp5, tmpa, LSL #(16-5) + 
PKHBT tmp4, tmp4, tmp6, LSL #(16-5) + USAT16 tmp5, #8, tmp5 + USAT16 tmp4, #8, tmp4 + + SUBS count, count, #4<<28 + ORR tmp4, tmp4, tmp5, LSL #8 + STR tmp4, [mb], #4 + BCC next_y_hor + + UXTB16 x_3_1, x_3_1, ROR #8 + + SMLAD tmp4, x_6_4, mult_20_01, plus16 + SMLATB tmp6, x_6_4, mult_20_01, plus16 + SMLATB tmp5, x_6_4, mult_20_m5, plus16 + SMLATB tmpa, x_7_5, mult_20_01, plus16 + + SMLAD tmp4, x_7_5, mult_20_m5, tmp4 + SMLATB tmp6, x_7_5, mult_20_m5, tmp6 + SMLAD tmp5, x_7_5, mult_20_01, tmp5 + LDR x_7_5, [ref], #4 + SMLAD tmpa, x_2_0, mult_20_m5, tmpa + + SMLABB tmp4, x_2_0, mult_20_m5, tmp4 + SMLADX tmp6, x_2_0, mult_20_m5, tmp6 + SMLADX tmp5, x_2_0, mult_20_01, tmp5 + SMLADX tmpa, x_3_1, mult_20_m5, tmpa + + SMLABB tmp4, x_3_1, mult_20_01, tmp4 + UXTB16 x_6_4, x_7_5 + SMLABB tmp5, x_3_1, mult_20_m5, tmp5 + SMLADX tmp6, x_3_1, mult_20_01, tmp6 + SMLABB tmpa, x_6_4, mult_20_01, tmpa + + MOV tmp5, tmp5, ASR #5 + MOV tmp4, tmp4, ASR #5 + PKHBT tmp5, tmp5, tmpa, LSL #(16-5) + PKHBT tmp4, tmp4, tmp6, LSL #(16-5) + USAT16 tmp5, #8, tmp5 + USAT16 tmp4, #8, tmp4 + + SUBS count, count, #4<<28 + ORR tmp4, tmp4, tmp5, LSL #8 + STR tmp4, [mb], #4 + BCS loop_x_hor + +next_y_hor + AND tmp6, count, #0x00F00000 ;// partWidth-1 + SMLABB ref, count, mult_20_01, ref ;// +width + ADDS mb, mb, #16 ;// +16, Carry=0 + SBC mb, mb, tmp6, LSR #20 ;// -(partWidth-1)-1 + SBC ref, ref, tmp6, LSR #20 ;// -(partWidth-1)-1 + ADDS count, count, #(1<<28)-(1<<24) ;// decrement counter (partW) + BGE loop_y_hor + + + +;// VERTICAL PART +;// +;// Approach to vertical interpolation +;// +;// Interpolation is done by using 32-bit loads and stores +;// and by using 16 bit arithmetic. 4x4 block is processed +;// in each round. +;// +;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n| +;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n| +;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n| +;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n| +;// .. +;// .. +;// |a_m1|a_m1|a_m1|a_m1|... +;// |b_m1|b_m1|b_m1|b_m1|... +;// |c_m1|c_m1|c_m1|c_m1|... +;// |d_m1|d_m1|d_m1|d_m1|... + +;// Approach to bilinear interpolation to quarter pel position. +;// 4 bytes are processed parallel +;// +;// algorithm (a+b+1)/2. Rouding upwards +1 can be achieved by +;// negating second operand to get one's complement (instead of 2's) +;// and using subtraction, EOR is used to correct sign. 
+;// +;// MVN b, b +;// UHSUB8 a, a, b +;// EOR a, a, 0x80808080 + + + LDR ref, [sp, #0x1e4] ;// ref + LDR tmpa, [sp, #0x228] ;// horVerOffset + LDR mb, [sp, #0x1e8] ;// mb + LDR width, [sp, #0x218] ;// width + ADD ref, ref, #2 ;// calculate correct position + AND tmpa, tmpa, #1 + ADD ref, ref, tmpa + LDR plus16, = 0x00100010 ;// +16 to lower and upperf halfwords + AND count, count, #0x00FFFFFF ;// partWidth-1 + + AND tmpa, count, #0x000F0000 ;// partHeight-1 + ADD count, count, tmpa, LSL #8 + +loop_y + ADD count, count, tmp6, LSL #8 ;// partWidth-1 + +loop_x + LDR tmp1, [ref], width ;// |a4|a3|a2|a1| + LDR tmp2, [ref], width ;// |c4|c3|c2|c1| + LDR tmp3, [ref], width ;// |g4|g3|g2|g1| + LDR tmp4, [ref], width ;// |m4|m3|m2|m1| + LDR tmp5, [ref], width ;// |r4|r3|r2|r1| + LDR tmp6, [ref], width ;// |t4|t3|t2|t1| + + ;// first four pixels + UXTB16 tmpa, tmp3 ;// |g3|g1| + UXTAB16 tmpa, tmpa, tmp4 ;// |g3+m3|g1+m1| + UXTB16 tmpb, tmp2 ;// |c3|c1| + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + + UXTAB16 tmpb, tmpb, tmp5 ;// |c3+r3|c1+r1| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp6 ;// 16+20(G+M)+A+T + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR res, = 0x00FF00FF + UXTB16 tmpa, tmp3, ROR #8 ;// |g4|g2| + UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// |g4+m4|g2+m2| + AND res, res, tmpb, LSR #5 ;// mask and divide by 32 + + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + UXTB16 tmpb, tmp2, ROR #8 ;// |c4|c2| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpb, tmpb, tmp5, ROR #8 ;// |c4+r4|c2+r2| + UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// 16+20(G+M)+A+T + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR tmp1, [mb] + LDR tmpa, = 0xFF00FF00 + MVN tmp1, tmp1 + AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divede by 32 + ORR res, res, tmpa + + LDR tmpa, = 0x80808080 + UHSUB8 res, res, tmp1 ;// bilinear interpolation + LDR tmp1, [ref], width ;// load next row + EOR res, res, tmpa ;// correct sign + + STR res, [mb], #16 ;// next row (mb) + + + ;// tmp2 = |a4|a3|a2|a1| + ;// tmp3 = |c4|c3|c2|c1| + ;// tmp4 = |g4|g3|g2|g1| + ;// tmp5 = |m4|m3|m2|m1| + ;// tmp6 = |r4|r3|r2|r1| + ;// tmp1 = |t4|t3|t2|t1| + + ;// second four pixels + UXTB16 tmpa, tmp4 ;// |g3|g1| + UXTAB16 tmpa, tmpa, tmp5 ;// |g3+m3|g1+m1| + UXTB16 tmpb, tmp3 ;// |c3|c1| + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + UXTAB16 tmpb, tmpb, tmp6 ;// |c3+r3|c1+r1| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A+T + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR res, = 0x00FF00FF + UXTB16 tmpa, tmp4, ROR #8 ;// |g4|g2| + UXTAB16 tmpa, tmpa, tmp5, ROR #8 ;// |g4+m4|g2+m2| + AND res, res, tmpb, LSR #5 ;// mask and divide by 32 + + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + UXTB16 tmpb, tmp3, ROR #8 ;// |c4|c2| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpb, tmpb, tmp6, ROR #8 ;// |c4+r4|c2+r2| + UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A+T + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR tmp2, [mb] + LDR 
tmpa, = 0xFF00FF00 + MVN tmp2, tmp2 + + AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32 + ORR res, res, tmpa + LDR tmpa, = 0x80808080 + UHSUB8 res, res, tmp2 ;// bilinear interpolation + LDR tmp2, [ref], width ;// load next row + EOR res, res, tmpa ;// correct sign + STR res, [mb], #16 ;// next row + + ;// tmp3 = |a4|a3|a2|a1| + ;// tmp4 = |c4|c3|c2|c1| + ;// tmp5 = |g4|g3|g2|g1| + ;// tmp6 = |m4|m3|m2|m1| + ;// tmp1 = |r4|r3|r2|r1| + ;// tmp2 = |t4|t3|t2|t1| + + ;// third four pixels + UXTB16 tmpa, tmp5 ;// |g3|g1| + UXTAB16 tmpa, tmpa, tmp6 ;// |g3+m3|g1+m1| + UXTB16 tmpb, tmp4 ;// |c3|c1| + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + UXTAB16 tmpb, tmpb, tmp1 ;// |c3+r3|c1+r1| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A+T + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR res, = 0x00FF00FF + UXTB16 tmpa, tmp5, ROR #8 ;// |g4|g2| + UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// |g4+m4|g2+m2| + AND res, res, tmpb, LSR #5 ;// mask and divide by 32 + + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + UXTB16 tmpb, tmp4, ROR #8 ;// |c4|c2| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpb, tmpb, tmp1, ROR #8 ;// |c4+r4|c2+r2| + UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A+T + + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR tmp3, [mb] + LDR tmpa, = 0xFF00FF00 + MVN tmp3, tmp3 + + AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32 + ORR res, res, tmpa + LDR tmpa, = 0x80808080 + UHSUB8 res, res, tmp3 ;// bilinear interpolation + LDR tmp3, [ref] ;// load next row + EOR res, res, tmpa ;// correct sign + STR res, [mb], #16 ;// next row + + ;// tmp4 = |a4|a3|a2|a1| + ;// tmp5 = |c4|c3|c2|c1| + ;// tmp6 = |g4|g3|g2|g1| + ;// tmp1 = |m4|m3|m2|m1| + ;// tmp2 = |r4|r3|r2|r1| + ;// tmp3 = |t4|t3|t2|t1| + + ;// fourth four pixels + UXTB16 tmpa, tmp6 ;// |g3|g1| + UXTAB16 tmpa, tmpa, tmp1 ;// |g3+m3|g1+m1| + UXTB16 tmpb, tmp5 ;// |c3|c1| + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + UXTAB16 tmpb, tmpb, tmp2 ;// |c3+r3|c1+r1| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpa, tmpa, tmp4 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A+T + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR res, = 0x00FF00FF + UXTB16 tmpa, tmp6, ROR #8 ;// |g4|g2| + UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// |g4+m4|g2+m2| + AND res, res, tmpb, LSR #5 ;// mask and divide by 32 + + ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M) + UXTB16 tmpb, tmp5, ROR #8 ;// |c4|c2| + ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M) + UXTAB16 tmpb, tmpb, tmp2, ROR #8 ;// |c4+r4|c2+r2| + UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// 16+20(G+M)+A + UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A+T + + ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R) + SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R) + + USAT16 tmpb, #13, tmpa ;// saturate + LDR tmp5, [mb] + LDR tmp4, = 0xFF00FF00 + MVN tmp5, tmp5 + + AND tmpa, tmp4, tmpb, LSL #3 ;// mask and divide by 32 + ORR res, res, tmpa + LDR tmpa, = 0x80808080 + UHSUB8 res, res, tmp5 ;// bilinear interpolation + + ;// decrement loop_x counter + SUBS count, count, #4<<28 ;// decrement x loop counter + + ;// calculate "ref" address for next round + SUB ref, ref, width, LSL #3 ;// ref -= 
8*width; + ADD ref, ref, #4 ;// next column (4 pixels) + + EOR res, res, tmpa ;// correct sign + STR res, [mb], #-44 + + BCS loop_x + + ADDS mb, mb, #64 ;// set Carry=0 + ADD ref, ref, width, LSL #2 ;// ref += 4*width + AND tmp6, count, #0x00F00000 ;// partWidth-1 + SBC ref, ref, tmp6, LSR #20 ;// -(partWidth-1)-1 + SBC mb, mb, tmp6, LSR #20 ;// -(partWidth-1)-1 + + ADDS count, count, #0xC << 24 ;// decrement y loop counter + BGE loop_y + + ADD sp, sp, #0x1f4 + LDMFD sp!, {r4-r11, pc} + + END diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s new file mode 100755 index 0000000..a81aed7 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s @@ -0,0 +1,163 @@ +; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of the horizontal part of the
+;-- h264bsdInterpolateMid functions
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+
+;// Register allocation
+
+ref RN 0 ;// pointer to current position in reference image
+mb RN 1 ;// pointer to current position in interpolated mb
+count RN 2 ;// bit-packed width and count values
+
+x_2_0 RN 4
+x_3_1 RN 5
+x_6_4 RN 6
+x_7_5 RN 7
+
+tmp1 RN 8
+tmp2 RN 9
+tmp3 RN 10
+tmp4 RN 11
+
+mult_20_01 RN 12 ;// [20, 1]
+mult_20_m5 RN 14 ;// [20, -5]
+
+
+ EXPORT h264bsdInterpolateMidHorPart
+
+;// Horizontal filter approach
+;//
+;// Basic idea in horizontal filtering is to adjust coefficients
+;// like below. Calculation is done with 16-bit maths.
+;//
+;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0
+;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ...
+;// y_0 = 20 1 20 -5 -5 1
+;// y_1 = -5 20 1 1 20 -5
+;// y_2 = 1 -5 -5 20 1 20
+;// y_3 = 1 20 -5 -5 20 1
+
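For reference, the tap weights being distributed across the register pairs in the
table above are the standard H.264 6-tap half-pel coefficients (1, -5, 20, 20, -5, 1).
A minimal C sketch of one output sample, ignoring the SIMD pairing (function name and
pointer convention are illustrative only, not from the source):

    /* One 6-tap filtered sample; "ref" points at the center-left tap,
     * so the filter window spans ref[-2]..ref[3]. Sketch only. */
    static int filter6tap(const unsigned char *ref)
    {
        return ref[-2] - 5 * ref[-1] + 20 * ref[0]
             + 20 * ref[1] - 5 * ref[2] + ref[3];
    }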
+
+h264bsdInterpolateMidHorPart
+ STMFD sp!, {r4-r11, lr}
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (partWidth-1)
+ ;// [27:20] loop_y (partHeight-1)
+ ;// [19:16] partWidth-1
+ ;// [15:00] width
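In C terms the packing above is just a way to keep all loop state in a single
register; a sketch under that layout (variable names assumed, not from the source):

    /* Pack the loop state the way the layout comment describes. */
    unsigned int count = (unsigned int)width                  /* [15:0]  width   */
                       | ((unsigned int)(partWidth  - 1) << 16)  /* [19:16] */
                       | ((unsigned int)(partHeight - 1) << 20); /* [27:20] */
    /* The x counter in [31:28] is reloaded from the [19:16] field once
     * per row, which is what "ADD count, count, tmp3, LSL #12" does below. */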
+
+
+ LDR mult_20_01, = 0x00140001
+ LDR mult_20_m5, = 0x0014FFFB
+ AND tmp3, count, #0x000F0000 ;// partWidth-1
+loop_y
+ LDR x_3_1, [ref, #-8]
+ ADD count, count, tmp3, LSL #12 ;// partWidth-1 to loop_x [31:28]
+ LDR x_7_5, [ref, #-4]
+ UXTB16 x_2_0, x_3_1
+ UXTB16 x_3_1, x_3_1, ROR #8
+ UXTB16 x_6_4, x_7_5
+
+loop_x
+ UXTB16 x_7_5, x_7_5, ROR #8
+
+ SMUAD tmp1, x_2_0, mult_20_01
+ SMULTB tmp2, x_2_0, mult_20_m5
+ SMULTB tmp3, x_2_0, mult_20_01
+ SMULTB tmp4, x_3_1, mult_20_01
+
+ SMLAD tmp1, x_3_1, mult_20_m5, tmp1
+ SMLAD tmp2, x_3_1, mult_20_01, tmp2
+ SMLATB tmp3, x_3_1, mult_20_m5, tmp3
+ LDR x_3_1, [ref], #4
+ SMLAD tmp4, x_6_4, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_6_4, mult_20_m5, tmp1
+ SMLADX tmp2, x_6_4, mult_20_01, tmp2
+ SMLADX tmp3, x_6_4, mult_20_m5, tmp3
+ SMLADX tmp4, x_7_5, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_7_5, mult_20_01, tmp1
+ SMLABB tmp2, x_7_5, mult_20_m5, tmp2
+ UXTB16 x_2_0, x_3_1
+ SMLADX tmp3, x_7_5, mult_20_01, tmp3
+ SMLABB tmp4, x_2_0, mult_20_01, tmp4
+
+ SUBS count, count, #4<<28
+ STR tmp1, [mb], #4
+ STR tmp2, [mb], #4
+ STR tmp3, [mb], #4
+ STR tmp4, [mb], #4
+ BCC next_y
+
+ UXTB16 x_3_1, x_3_1, ROR #8
+
+ SMUAD tmp1, x_6_4, mult_20_01
+ SMULTB tmp2, x_6_4, mult_20_m5
+ SMULTB tmp3, x_6_4, mult_20_01
+ SMULTB tmp4, x_7_5, mult_20_01
+
+ SMLAD tmp1, x_7_5, mult_20_m5, tmp1
+ SMLAD tmp2, x_7_5, mult_20_01, tmp2
+ SMLATB tmp3, x_7_5, mult_20_m5, tmp3
+ LDR x_7_5, [ref], #4
+ SMLAD tmp4, x_2_0, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_2_0, mult_20_m5, tmp1
+ SMLADX tmp2, x_2_0, mult_20_01, tmp2
+ SMLADX tmp3, x_2_0, mult_20_m5, tmp3
+ SMLADX tmp4, x_3_1, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_3_1, mult_20_01, tmp1
+ SMLABB tmp2, x_3_1, mult_20_m5, tmp2
+ UXTB16 x_6_4, x_7_5
+ SMLADX tmp3, x_3_1, mult_20_01, tmp3
+ SMLABB tmp4, x_6_4, mult_20_01, tmp4
+
+ SUBS count, count, #4<<28
+ STR tmp1, [mb], #4
+ STR tmp2, [mb], #4
+ STR tmp3, [mb], #4
+ STR tmp4, [mb], #4
+ BCS loop_x
+
+next_y
+ AND tmp3, count, #0x000F0000 ;// partWidth-1
+ SMLABB ref, count, mult_20_01, ref ;// +width
+ SBC ref, ref, tmp3, LSR #16 ;// -(partWidth-1)-1
+ ADDS count, count, #(1<<28)-(1<<20)
+ BGE loop_y
+
+ LDMFD sp!, {r4-r11, pc}
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_half.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_half.s
new file mode 100755
index 0000000..244fc6f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_half.s
@@ -0,0 +1,347 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerHalf function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateVerHalf register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+res RN 3
+y0 RN 3
+
+tmp1 RN 4
+
+tmp2 RN 5
+height RN 5
+
+tmp3 RN 6
+partW RN 6
+
+tmp4 RN 7
+partH RN 7
+
+tmp5 RN 8
+tmp6 RN 9
+
+tmpa RN 10
+tmpb RN 11
+width RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateVerHalf
+
+;// Approach to vertical interpolation
+;//
+;// Interpolation is done by using 32-bit loads and stores
+;// and by using 16-bit arithmetic. A 4x4 block is processed
+;// in each round.
+;//
+;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
+;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
+;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
+;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
+;// ..
+;// ..
+;// |a_m1|a_m1|a_m1|a_m1|...
+;// |b_m1|b_m1|b_m1|b_m1|...
+;// |c_m1|c_m1|c_m1|c_m1|...
+;// |d_m1|d_m1|d_m1|d_m1|...
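
A plain-C model of what one output pixel of the half-pel case works out to; this is not part of the patch, just the scalar equivalent of the packed 16-bit sequence in the loop below (USAT16 to 13 bits, the shift by 5 and the 0x00FF00FF/0xFF00FF00 masking amount to the clip-and-shift here):

    #include <stdint.h>

    /* Vertical 6-tap half-pel filter for one pixel; A,C,G,M,R,T are six
     * consecutive rows of the same column, one stride apart. */
    static uint8_t ver_half_pixel(const uint8_t *ref, int32_t width)
    {
        int32_t a = ref[0 * width], c = ref[1 * width], g = ref[2 * width];
        int32_t m = ref[3 * width], r = ref[4 * width], t = ref[5 * width];
        int32_t v = (16 + 20 * (g + m) + (a + t) - 5 * (c + r)) >> 5;

        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

Processing a 4x4 block per round lets the six row loads be reused across four output rows, which is the point of the register-rotation pattern (tmp1..tmp6 shifting roles between the "first" through "fourth" four-pixel groups).
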
+
+h264bsdInterpolateVerHalf
+ STMFD sp!, {r0-r11, lr}
+ SUB sp, sp, #0x1e4
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp5, x0, partW ;// (x0+partWidth)
+ LDR width, [sp,#0x218] ;// width
+ CMP tmp5, width
+ BHI do_fill ;// (x0+partW)>width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp6, y0, partH ;// (y0+partHeight)
+ ADD tmp6, tmp6, #5 ;// (y0+partH+5)
+ LDR height, [sp,#0x21c] ;// height
+ CMP tmp6, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp5, partH, #5 ;// r2 = partH + 5;
+ LDR height, [sp,#0x21c] ;// height
+ LDR partW, [sp,#0x220] ;// partWidth
+ STMIB sp, {height, partW} ;// sp+4 = height, sp+8 = partWidth
+ STR tmp5, [sp,#0xc] ;// sp+c partHeight+5
+ STR partW, [sp,#0x10] ;// sp+10 = partWidth
+ LDR width, [sp,#0x218] ;// width
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR partW, [sp,#0x218]
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ MLA tmp6, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp6 ;// ref += y0*width+x0
+ LDR mb, [sp, #0x1e8] ;// mb
+
+ ADD count, partW, partH, LSL #16 ;// |partH|partW|
+ LDR tmp5, = 0x00010001
+ SSUB16 count, count, tmp5; ;// |partH-1|partW-1|
+ LDR plus16, = 0x00100010
+
+ AND tmp1, count, #0x000000FF ;// partWidth-1
+
+
+loop_y
+ ADD count, count, tmp1, LSL #24 ;// partWidth-1 to top byte
+
+loop_x
+ LDR tmp1, [ref], width ;// |a4|a3|a2|a1|
+ LDR tmp2, [ref], width ;// |c4|c3|c2|c1|
+ LDR tmp3, [ref], width ;// |g4|g3|g2|g1|
+ LDR tmp4, [ref], width ;// |m4|m3|m2|m1|
+ LDR tmp5, [ref], width ;// |r4|r3|r2|r1|
+ LDR tmp6, [ref], width ;// |t4|t3|t2|t1|
+
+ ;// first four pixels
+ UXTB16 tmpa, tmp3 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp4 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp2 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+
+ UXTAB16 tmpb, tmpb, tmp5 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp3, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp2, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp5, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp1, [ref], width
+ LDR tmpa, = 0xFF00FF00
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #16 ;// next row (mb)
+
+ ;// tmp2 = |a4|a3|a2|a1|
+ ;// tmp3 = |c4|c3|c2|c1|
+ ;// tmp4 = |g4|g3|g2|g1|
+ ;// tmp5 = |m4|m3|m2|m1|
+ ;// tmp6 = |r4|r3|r2|r1|
+ ;// tmp1 = |t4|t3|t2|t1|
+
+ ;// second four pixels
+ UXTB16 tmpa, tmp4 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp5 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp3 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp6 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp4, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp5, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp3, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp6, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp2, [ref], width
+ LDR tmpa, = 0xFF00FF00
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp3 = |a4|a3|a2|a1|
+ ;// tmp4 = |c4|c3|c2|c1|
+ ;// tmp5 = |g4|g3|g2|g1|
+ ;// tmp6 = |m4|m3|m2|m1|
+ ;// tmp1 = |r4|r3|r2|r1|
+ ;// tmp2 = |t4|t3|t2|t1|
+
+ ;// third four pixels
+ UXTB16 tmpa, tmp5 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp6 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp4 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp1 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp5, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp4, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp1, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A+T
+
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp3, [ref]
+ LDR tmpa, = 0xFF00FF00
+
+ ;// decrement loop_x counter
+ SUBS count, count, #4<<24 ;// (partWidth-1) -= 4;
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp4 = |a4|a3|a2|a1|
+ ;// tmp5 = |c4|c3|c2|c1|
+ ;// tmp6 = |g4|g3|g2|g1|
+ ;// tmp1 = |m4|m3|m2|m1|
+ ;// tmp2 = |r4|r3|r2|r1|
+ ;// tmp3 = |t4|t3|t2|t1|
+
+ ;// fourth four pixels
+ UXTB16 tmpa, tmp6 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp1 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp5 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp2 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp4 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp6, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp5, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp2, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp4, = 0xFF00FF00
+
+ ;// calculate "ref" address for next round
+ SUB ref, ref, width, LSL #3 ;// ref -= 8*width;
+ ADD ref, ref, #4; ;// next column (4 pixels)
+ AND tmpa, tmp4, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #-44
+
+ BCS loop_x
+
+ ADDS count, count, #252<<16 ;// (partHeight-1) -= 4;
+ ADD ref, ref, width, LSL #2 ;// ref += 4*width
+ AND tmp1, count, #0x000000FF ;// partWidth-1
+ ADD tmp2, tmp1, #1 ;// partWidth
+ SUB ref, ref, tmp2 ;// ref -= partWidth
+ ADD mb, mb, #64;
+ SUB mb, mb, tmp2; ;// mb -= partWidth
+ BGE loop_y
+
+ ADD sp,sp,#0x1f4
+ LDMFD sp!, {r4-r11, pc}
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_quarter.s
new file mode 100755
index 0000000..5266c85
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_quarter.s
@@ -0,0 +1,374 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerQuarter function
+;--
+;-------------------------------------------------------------------------------
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateVerQuarter register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+res RN 3
+y0 RN 3
+
+tmp1 RN 4
+
+tmp2 RN 5
+height RN 5
+
+tmp3 RN 6
+partW RN 6
+
+tmp4 RN 7
+partH RN 7
+
+tmp5 RN 8
+tmp6 RN 9
+
+tmpa RN 10
+tmpb RN 11
+width RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateVerQuarter
+
+;// Approach to vertical interpolation
+;//
+;// Interpolation is done by using 32-bit loads and stores
+;// and by using 16-bit arithmetic. A 4x4 block is processed
+;// in each round.
+;//
+;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
+;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
+;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
+;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
+;// ..
+;// ..
+;// |a_m1|a_m1|a_m1|a_m1|...
+;// |b_m1|b_m1|b_m1|b_m1|...
+;// |c_m1|c_m1|c_m1|c_m1|...
+;// |d_m1|d_m1|d_m1|d_m1|...
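
The quarter-pel variant below computes the same half-pel value and then averages it with the nearest full-pel row, picked by verOffset (the MOVS/MVNEQ/MVNNE sequence selects the G or M row). The MVN/UHSUB8/EOR trio performs the rounded average on four packed bytes at once; a byte-wise model, not part of the patch, assuming arithmetic right shift of negative ints (which ARM compilers provide):

    #include <stdint.h>

    /* What UHSUB8 res, res, MVN(pix) followed by EOR 0x80808080 computes
     * per byte: equal to (half + pix + 1) >> 1 for all byte inputs. */
    static uint8_t quarter_avg(uint8_t half, uint8_t pix)
    {
        int32_t t = (half - (0xFF - pix)) >> 1;  /* halving subtract of ~pix */
        return (uint8_t)((t ^ 0x80) & 0xFF);     /* flip the sign bit back   */
    }

Working against the complement keeps the halving subtract in UHSUB8's unsigned range and saves unpacking each 32-bit word into four separate averages.
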
+
+h264bsdInterpolateVerQuarter
+ STMFD sp!, {r0-r11, lr}
+ SUB sp, sp, #0x1e4
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp5, x0, partW ;// (x0+partWidth)
+ LDR width, [sp,#0x218] ;// width
+ CMP tmp5, width
+ BHI do_fill ;// (x0+partW)>width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp6, y0, partH ;// (y0+partHeight)
+ ADD tmp6, tmp6, #5 ;// (y0+partH+5)
+ LDR height, [sp,#0x21c] ;// height
+ CMP tmp6, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp5, partH, #5 ;// r2 = partH + 5;
+ LDR height, [sp,#0x21c] ;// height
+ LDR partW, [sp,#0x220] ;// partWidth
+ STMIB sp, {height, partW} ;// sp+4 = height, sp+8 = partWidth
+ STR tmp5, [sp,#0xc] ;// sp+c partHeight+5
+ STR partW, [sp,#0x10] ;// sp+10 = partWidth
+ LDR width, [sp,#0x218] ;// width
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR partW, [sp,#0x218]
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ MLA tmp6, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp6 ;// ref += y0*width+x0
+ LDR mb, [sp, #0x1e8] ;// mb
+
+ ADD count, partW, partH, LSL #8 ;// |xx|xx|partH|partW|
+ LDR tmp5, = 0x00010100
+ RSB count, tmp5, count, LSL #8 ;// |xx|partH-1|partW-1|xx|
+ LDR tmp2, [sp, #0x228] ;// verOffset
+ ADD count, count, tmp2 ;// |xx|partH-1|partW-1|verOffset|
+ LDR plus16, = 0x00100010
+
+ AND tmp1, count, #0x0000FF00 ;// partWidth-1
+
+
+loop_y
+ ADD count, count, tmp1, LSL #16 ;// partWidth-1 to top byte
+
+loop_x
+ LDR tmp1, [ref], width ;// |a4|a3|a2|a1|
+ LDR tmp2, [ref], width ;// |c4|c3|c2|c1|
+ LDR tmp3, [ref], width ;// |g4|g3|g2|g1|
+ LDR tmp4, [ref], width ;// |m4|m3|m2|m1|
+ LDR tmp5, [ref], width ;// |r4|r3|r2|r1|
+ LDR tmp6, [ref], width ;// |t4|t3|t2|t1|
+
+ ;// first four pixels
+ UXTB16 tmpa, tmp3 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp4 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp2 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+
+ UXTAB16 tmpb, tmpb, tmp5 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp3, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp2, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp5, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ MOVS tmp1, count, LSL #31 ;// update flags (verOffset)
+ LDR tmpa, = 0xFF00FF00
+ MVNEQ tmp1, tmp3 ;// select verOffset=0
+ MVNNE tmp1, tmp4 ;// select verOffset=1
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp1 ;// bilinear interpolation
+ LDR tmp1, [ref], width ;// load next row
+ EOR res, res, tmpa ;// correct sign
+
+ STR res, [mb], #16 ;// next row (mb)
+
+
+ ;// tmp2 = |a4|a3|a2|a1|
+ ;// tmp3 = |c4|c3|c2|c1|
+ ;// tmp4 = |g4|g3|g2|g1|
+ ;// tmp5 = |m4|m3|m2|m1|
+ ;// tmp6 = |r4|r3|r2|r1|
+ ;// tmp1 = |t4|t3|t2|t1|
+
+ ;// second four pixels
+ UXTB16 tmpa, tmp4 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp5 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp3 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp6 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp4, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp5, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp3, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp6, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmpa, = 0xFF00FF00
+ MVNEQ tmp2, tmp4 ;// select verOffset=0
+ MVNNE tmp2, tmp5 ;// select verOffset=1
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp2 ;// bilinear interpolation
+ LDR tmp2, [ref], width ;// load next row
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp3 = |a4|a3|a2|a1|
+ ;// tmp4 = |c4|c3|c2|c1|
+ ;// tmp5 = |g4|g3|g2|g1|
+ ;// tmp6 = |m4|m3|m2|m1|
+ ;// tmp1 = |r4|r3|r2|r1|
+ ;// tmp2 = |t4|t3|t2|t1|
+
+ ;// third four pixels
+ UXTB16 tmpa, tmp5 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp6 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp4 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp1 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp5, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp4, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp1, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A+T
+
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmpa, = 0xFF00FF00
+ MVNEQ tmp3, tmp5 ;// select verOffset=0
+ MVNNE tmp3, tmp6 ;// select verOffset=1
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp3 ;// bilinear interpolation
+ LDR tmp3, [ref] ;// load next row
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp4 = |a4|a3|a2|a1|
+ ;// tmp5 = |c4|c3|c2|c1|
+ ;// tmp6 = |g4|g3|g2|g1|
+ ;// tmp1 = |m4|m3|m2|m1|
+ ;// tmp2 = |r4|r3|r2|r1|
+ ;// tmp3 = |t4|t3|t2|t1|
+
+ ;// fourth four pixels
+ UXTB16 tmpa, tmp6 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp1 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp5 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp2 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp4 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp6, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp5, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp2, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp4, = 0xFF00FF00
+ MVNEQ tmp5, tmp6 ;// select verOffset=0
+ MVNNE tmp5, tmp1 ;// select verOffset=1
+
+ AND tmpa, tmp4, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp5 ;// bilinear interpolation
+
+ ;// decrement loop_x counter
+ SUBS count, count, #4<<24 ;// (partWidth-1) -= 4;
+
+ ;// calculate "ref" address for next round
+ SUB ref, ref, width, LSL #3 ;// ref -= 8*width;
+ ADD ref, ref, #4; ;// next column (4 pixels)
+
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #-44
+
+ BCS loop_x
+
+ ADDS count, count, #252<<16 ;// (partHeight-1) -= 4;
+ ADD ref, ref, width, LSL #2 ;// ref += 4*width
+ AND tmp1, count, #0x0000FF00 ;// partWidth-1
+ MOV tmp2, #1
+ ADD tmp2, tmp2, tmp1, LSR #8 ;// partWidth
+ SUB ref, ref, tmp2 ;// ref -= partWidth
+ ADD mb, mb, #64;
+ SUB mb, mb, tmp2; ;// mb -= partWidth
+ BGE loop_y
+
+ ADD sp,sp,#0x1f4
+ LDMFD sp!, {r4-r11, pc}
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/win_asm.bat b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/win_asm.bat
new file mode 100644
index 0000000..1b8d88c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/win_asm.bat
@@ -0,0 +1,15 @@
+echo off
+set ASMFLAGS= -checkreglist -CPU ARM1136 -PreDefine "H264DEC_WINASM SETL {TRUE}"
+set ASM="D:\Program Files\Microsoft Visual Studio 8\VC\ce\bin\x86_arm\armasm"
+echo on
+
+%ASM% %ASMFLAGS% h264bsd_interpolate_chroma_ver.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_chroma_hor.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_hor_half.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_hor_quarter.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_hor_ver_quarter.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_ver_half.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_ver_quarter.s
+
+rem %ASM% %ASMFLAGS% h264bsd_interpolate_chroma_hor_ver.s
+rem %ASM% %ASMFLAGS% h264bsd_interpolate_mid_hor.s