path: root/media/libstagefright/codecs/on2/h264dec/source/arm11_asm
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm11_asm')
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s      | 298
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor_ver.s  | 339
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_ver.s      | 288
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s        | 251
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s     | 273
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s | 536
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s         | 163
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_half.s        | 347
-rwxr-xr-x  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_quarter.s     | 374
-rw-r--r--  media/libstagefright/codecs/on2/h264dec/source/arm11_asm/win_asm.bat                           | 15
10 files changed, 2884 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s
new file mode 100755
index 0000000..634a484
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor.s
@@ -0,0 +1,298 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+
+;// h264bsdInterpolateChromaHor register allocation
+
+ref RN 0
+ptrA RN 0
+
+mb RN 1
+block RN 1
+
+x0 RN 2
+count RN 2
+
+y0 RN 3
+valX RN 3
+
+width RN 4
+
+height RN 5
+tmp7 RN 5
+
+chrPW RN 6
+tmp8 RN 6
+
+tmp1 RN 7
+chrPH RN 7
+
+tmp2 RN 8
+
+tmp3 RN 9
+
+tmp4 RN 10
+
+tmp5 RN 11
+
+tmp6 RN 12
+
+c32 RN 14
+xFrac RN 14
+
+;// Function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateChromaHor
+
+;// Function arguments
+;//
+;// u8 *ref, : 0xc4
+;// u8 *predPartChroma, : 0xc8
+;// i32 x0, : 0xcc
+;// i32 y0, : 0xd0
+;// u32 width, : 0xf8
+;// u32 height, : 0xfc
+;// u32 xFrac, : 0x100
+;// u32 chromaPartWidth, : 0x104
+;// u32 chromaPartHeight : 0x108
+
+h264bsdInterpolateChromaHor
+ STMFD sp!, {r0-r11,lr}
+ SUB sp, sp, #0xc4
+
+ LDR chrPW, [sp, #0x104] ;// chromaPartWidth
+ LDR width, [sp, #0xf8] ;// width
+ CMP x0, #0
+ BLT do_fill
+
+ ADD tmp6, x0, chrPW ;// tmp6 = x0+ chromaPartWidth
+ ADD tmp6, tmp6, #1 ;// tmp6 = x0 + chromaPartWidth + 1
+ CMP tmp6, width ;// x0+chromaPartWidth+1 > width
+ BHI do_fill
+
+ CMP y0, #0
+ BLT do_fill
+ LDR chrPH, [sp, #0x108] ;// chromaPartHeight
+ LDR height, [sp, #0xfc] ;// height
+ ADD tmp6, y0, chrPH ;// tmp6 = y0 + chromaPartHeight
+ CMP tmp6, height
+ BLS skip_fill
+
+do_fill
+ LDR chrPH, [sp, #0x108] ;// chromaPartHeight
+ LDR height, [sp, #0xfc] ;// height
+ ADD tmp8, chrPW, #1 ;// tmp8 = chromaPartWidth+1
+ MOV tmp2, tmp8 ;// tmp2 = chromaPartWidth+1
+ STMIA sp,{width,height,tmp8,chrPH,tmp2}
+ ADD block, sp, #0x1c ;// block
+ BL h264bsdFillBlock
+
+ LDR x0, [sp, #0xcc]
+ LDR y0, [sp, #0xd0]
+ LDR ref, [sp, #0xc4] ;// ref
+ STMIA sp,{width,height,tmp8,chrPH,tmp2}
+ ADD block, sp, #0x1c ;// block
+ MLA ref, height, width, ref ;// ref += width * height;
+ MLA block, chrPH, tmp8, block;// block + (chromaPH)*(chromaPW+1)
+ BL h264bsdFillBlock
+
+ MOV x0, #0 ;// x0 = 0
+ MOV y0, #0 ;// y0 = 0
+ STR x0, [sp, #0xcc]
+ STR y0, [sp, #0xd0]
+ ADD ref, sp, #0x1c ;// ref = block
+ STR ref, [sp, #0xc4] ;// ref
+
+ STR chrPH, [sp, #0xfc] ;// height
+ STR tmp8, [sp, #0xf8] ;// width
+ MOV width, tmp8
+ SUB chrPW, chrPW, #1
+
+skip_fill
+ MLA tmp3, y0, width, x0 ;// tmp3 = y0*width+x0
+ LDR xFrac, [sp, #0x100] ;// xFrac
+ ADD ptrA, ref, tmp3 ;// ptrA = ref + y0*width+x0
+ RSB valX, xFrac, #8 ;// valX = 8-xFrac
+
+ LDR mb, [sp, #0xc8] ;// predPartChroma
+
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (chromaPartWidth-1)
+ ;// [27:24] loop_y (chromaPartHeight-1)
+ ;// [23:20] chromaPartWidth-1
+ ;// [19:16] chromaPartHeight-1
+ ;// [15:00] nothing
+
+ SUB tmp2, chrPH, #1 ;// chromaPartHeight-1
+ SUB tmp1, chrPW, #1 ;// chromaPartWidth-1
+ ADD count, count, tmp2, LSL #16 ;// chromaPartHeight-1
+ ADD count, count, tmp2, LSL #24 ;// loop_y
+ ADD count, count, tmp1, LSL #20 ;// chromaPartWidth-1
+ AND tmp2, count, #0x00F00000 ;// loop_x
+ PKHBT valX, valX, xFrac, LSL #16 ;// |xFrac|valX |
+ MOV valX, valX, LSL #3 ;// multiply by 8 in advance
+ MOV c32, #32
+
+
+ ;///////////////////////////////////////////////////////////////////////////
+ ;// Cb
+ ;///////////////////////////////////////////////////////////////////////////
+
+ ;// 2x2 pels per iteration
+ ;// bilinear vertical interpolation
+
+loop1_y
+ ADD count, count, tmp2, LSL #8
+ LDRB tmp1, [ptrA, width]
+ LDRB tmp2, [ptrA], #1
+
+loop1_x
+ LDRB tmp3, [ptrA, width]
+ LDRB tmp4, [ptrA], #1
+
+ PKHBT tmp5, tmp1, tmp3, LSL #16
+ PKHBT tmp6, tmp2, tmp4, LSL #16
+
+ LDRB tmp1, [ptrA, width]
+ LDRB tmp2, [ptrA], #1
+
+ SMLAD tmp5, tmp5, valX, c32 ;// multiply
+ SMLAD tmp6, tmp6, valX, c32 ;// multiply
+
+ PKHBT tmp7, tmp3, tmp1, LSL #16
+ PKHBT tmp8, tmp4, tmp2, LSL #16
+
+ SMLAD tmp7, tmp7, valX, c32 ;// multiply
+ SMLAD tmp8, tmp8, valX, c32 ;// multiply
+
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb,#8] ;// store row 2 col 1
+
+ MOV tmp6, tmp6, LSR #6 ;// scale down
+ STRB tmp6, [mb],#1 ;// store row 1 col 1
+
+ MOV tmp7, tmp7, LSR #6 ;// scale down
+ STRB tmp7, [mb,#8] ;// store row 2 col 2
+
+ MOV tmp8, tmp8, LSR #6 ;// scale down
+ STRB tmp8, [mb],#1 ;// store row 1 col 2
+
+ SUBS count, count, #2<<28
+ BCS loop1_x
+
+ AND tmp2, count, #0x00F00000
+
+ ADDS mb, mb, #16
+ SBC mb, mb, tmp2, LSR #20
+ ADD ptrA, ptrA, width, LSL #1
+ SBC ptrA, ptrA, tmp2, LSR #20
+ SUB ptrA, ptrA, #1
+
+ ADDS count, count, #0xE << 24
+ BGE loop1_y
+
+ ;///////////////////////////////////////////////////////////////////////////
+ ;// Cr
+ ;///////////////////////////////////////////////////////////////////////////
+ LDR height, [sp,#0xfc] ;// height
+ LDR ref, [sp, #0xc4] ;// ref
+ LDR tmp1, [sp, #0xd0] ;// y0
+ LDR tmp2, [sp, #0xcc] ;// x0
+ LDR mb, [sp, #0xc8] ;// predPartChroma
+
+ ADD tmp1, height, tmp1
+ MLA tmp3, tmp1, width, tmp2
+ ADD ptrA, ref, tmp3
+ ADD mb, mb, #64
+
+ AND count, count, #0x00FFFFFF
+ AND tmp1, count, #0x000F0000
+ ADD count, count, tmp1, LSL #8
+ AND tmp2, count, #0x00F00000
+
+ ;// 2x2 pels per iteration
+ ;// bilinear vertical interpolation
+loop2_y
+ ADD count, count, tmp2, LSL #8
+ LDRB tmp1, [ptrA, width]
+ LDRB tmp2, [ptrA], #1
+
+loop2_x
+ LDRB tmp3, [ptrA, width]
+ LDRB tmp4, [ptrA], #1
+
+ PKHBT tmp5, tmp1, tmp3, LSL #16
+ PKHBT tmp6, tmp2, tmp4, LSL #16
+
+ LDRB tmp1, [ptrA, width]
+ LDRB tmp2, [ptrA], #1
+
+ SMLAD tmp5, tmp5, valX, c32 ;// multiply
+ SMLAD tmp6, tmp6, valX, c32 ;// multiply
+
+ PKHBT tmp7, tmp3, tmp1, LSL #16
+ PKHBT tmp8, tmp4, tmp2, LSL #16
+
+ SMLAD tmp7, tmp7, valX, c32 ;// multiply
+ SMLAD tmp8, tmp8, valX, c32 ;// multiply
+
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb,#8] ;// store row 2 col 1
+
+ MOV tmp6, tmp6, LSR #6 ;// scale down
+ STRB tmp6, [mb],#1 ;// store row 1 col 1
+
+ MOV tmp7, tmp7, LSR #6 ;// scale down
+ STRB tmp7, [mb,#8] ;// store row 2 col 2
+
+ MOV tmp8, tmp8, LSR #6 ;// scale down
+ STRB tmp8, [mb],#1 ;// store row 1 col 2
+
+ SUBS count, count, #2<<28
+ BCS loop2_x
+
+ AND tmp2, count, #0x00F00000
+
+ ADDS mb, mb, #16
+ SBC mb, mb, tmp2, LSR #20
+ ADD ptrA, ptrA, width, LSL #1
+ SBC ptrA, ptrA, tmp2, LSR #20
+ SUB ptrA, ptrA, #1
+
+ ADDS count, count, #0xE << 24
+ BGE loop2_y
+
+ ADD sp,sp,#0xd4
+ LDMFD sp!, {r4-r11,pc}
+
+ END
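
For reference, once the block is known to lie inside the picture (otherwise the do_fill path above first builds a padded copy), the Cb/Cr loops implement the standard H.264 horizontal chroma filter ((8-xFrac)*A + xFrac*B + 4) >> 3; the assembly pre-scales the weights by 8 (MOV valX, valX, LSL #3) so it can reuse the +32, >>6 rounding of the 2-D case. Below is a minimal C sketch, not code from this commit; the signature is illustrative, and the 8x8-per-component output layout and the Cr plane following the Cb plane are read from the stores above.

    #include <stdint.h>

    /* Illustrative sketch only; assumes the Cr plane directly follows the
     * Cb plane in the reference picture and an 8-byte row stride per
     * component in the prediction buffer, as the stores above imply.    */
    static void chroma_hor_sketch(const uint8_t *ref, uint8_t *pred,
                                  int32_t x0, int32_t y0,
                                  uint32_t width, uint32_t height,
                                  uint32_t xFrac, uint32_t partW, uint32_t partH)
    {
        uint32_t valX = 8 - xFrac;
        for (int comp = 0; comp < 2; comp++) {               /* Cb, then Cr */
            const uint8_t *in  = ref + comp * width * height
                               + (uint32_t)y0 * width + (uint32_t)x0;
            uint8_t       *out = pred + comp * 64;
            for (uint32_t y = 0; y < partH; y++)
                for (uint32_t x = 0; x < partW; x++)
                    out[y * 8 + x] = (uint8_t)
                        ((valX * in[y * width + x] +
                          xFrac * in[y * width + x + 1] + 4) >> 3);
        }
    }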
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor_ver.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor_ver.s
new file mode 100755
index 0000000..7420ad3
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_hor_ver.s
@@ -0,0 +1,339 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHorVer
+;-- function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+
+;// h264bsdInterpolateChromaHorVer register allocation
+
+ref RN 0
+ptrA RN 0
+
+mb RN 1
+block RN 1
+
+x0 RN 2
+count RN 2
+
+y0 RN 3
+valY RN 3
+
+width RN 4
+
+tmp4 RN 5
+height RN 5
+
+tmp1 RN 6
+
+tmp2 RN 7
+
+tmp3 RN 8
+
+valX RN 9
+
+tmp5 RN 10
+chrPW RN 10
+
+tmp6 RN 11
+chrPH RN 11
+
+xFrac RN 12
+
+c32 RN 14
+yFrac RN 14
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateChromaHorVer
+
+;// Function arguments
+;//
+;// u8 *ref, : 0xc4
+;// u8 *predPartChroma, : 0xc8
+;// i32 x0, : 0xcc
+;// i32 y0, : 0xd0
+;// u32 width, : 0xf8
+;// u32 height, : 0xfc
+;// u32 xFrac, : 0x100
+;// u32 yFrac, : 0x104
+;// u32 chromaPartWidth, : 0x108
+;// u32 chromaPartHeight : 0x10c
+
+h264bsdInterpolateChromaHorVer
+ STMFD sp!, {r0-r11,lr}
+ SUB sp, sp, #0xc4
+
+ LDR chrPW, [sp, #0x108] ;// chromaPartWidth
+ LDR xFrac, [sp, #0x100] ;// xFrac
+ LDR width, [sp, #0xf8] ;// width
+ CMP x0, #0
+ BLT do_fill
+
+ ADD tmp1, x0, chrPW ;// tmp1 = x0+ chromaPartWidth
+ ADD tmp1, tmp1, #1 ;// tmp1 = x0+ chromaPartWidth+1
+ CMP tmp1, width ;// x0+chromaPartWidth+1 > width
+ BHI do_fill
+
+ CMP y0, #0
+ BLT do_fill
+ LDR chrPH, [sp, #0x10c] ;// chromaPartHeight
+ LDR height, [sp, #0xfc] ;// height
+ ADD tmp1, y0, chrPH ;// tmp1 = y0 + chromaPartHeight
+ ADD tmp1, tmp1, #1 ;// tmp1 = y0 + chromaPartHeight + 1
+ CMP tmp1, height
+ BLS skip_fill
+
+do_fill
+ LDR chrPH, [sp, #0x10c] ;// chromaPartHeight
+ LDR height, [sp, #0xfc] ;// height
+ ADD tmp3, chrPW, #1 ;// tmp3 = chromaPartWidth+1
+ ADD tmp1, chrPW, #1 ;// tmp1 = chromaPartWidth+1
+ ADD tmp2, chrPH, #1 ;// tmp2 = chromaPartHeight+1
+ STMIA sp,{width,height,tmp1,tmp2,tmp3}
+ ADD block, sp, #0x1c ;// block
+ BL h264bsdFillBlock
+
+ LDR x0, [sp, #0xcc]
+ LDR y0, [sp, #0xd0]
+ LDR ref, [sp, #0xc4] ;// ref
+ STMIA sp,{width,height,tmp1,tmp2,tmp3}
+ ADD block, sp, #0x1c ;// block
+ MLA ref, height, width, ref ;// ref += width * height;
+ MLA block, tmp2, tmp1, block;// block + (chromaPW+1)*(chromaPH+1)
+ BL h264bsdFillBlock
+
+ MOV x0, #0 ;// x0 = 0
+ MOV y0, #0 ;// y0 = 0
+ STR x0, [sp, #0xcc]
+ STR y0, [sp, #0xd0]
+ ADD ref, sp, #0x1c ;// ref = block
+ STR ref, [sp, #0xc4] ;// ref
+
+ STR tmp2, [sp, #0xfc] ;// height
+ STR tmp1, [sp, #0xf8] ;// width
+ MOV width, tmp1
+
+skip_fill
+ MLA tmp3, y0, width, x0 ;// tmp3 = y0*width+x0
+ LDR yFrac, [sp, #0x104] ;// yFrac
+ LDR xFrac, [sp, #0x100]
+ ADD ptrA, ref, tmp3 ;// ptrA = ref + y0*width+x0
+ RSB valX, xFrac, #8 ;// valX = 8-xFrac
+ RSB valY, yFrac, #8 ;// valY = 8-yFrac
+
+ LDR mb, [sp, #0xc8] ;// predPartChroma
+
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (chromaPartWidth-1)
+ ;// [27:24] loop_y (chromaPartHeight-1)
+ ;// [23:20] chromaPartWidth-1
+ ;// [19:16] chromaPartHeight-1
+ ;// [15:00] nothing
+
+ SUB tmp2, chrPH, #1 ;// chromaPartHeight-1
+ SUB tmp1, chrPW, #1 ;// chromaPartWidth-1
+ ADD count, count, tmp2, LSL #16 ;// chromaPartHeight-1
+ ADD count, count, tmp2, LSL #24 ;// loop_y
+ ADD count, count, tmp1, LSL #20 ;// chromaPartWidth-1
+ AND tmp2, count, #0x00F00000 ;// loop_x
+ PKHBT valY, valY, yFrac, LSL #16 ;// |yFrac|valY |
+ MOV c32, #32
+
+
+ ;///////////////////////////////////////////////////////////////////////////
+ ;// Cb
+ ;///////////////////////////////////////////////////////////////////////////
+
+ ;// 2x2 pels per iteration
+ ;// bilinear vertical and horizontal interpolation
+
+loop1_y
+ LDRB tmp1, [ptrA]
+ LDRB tmp3, [ptrA, width]
+ LDRB tmp5, [ptrA, width, LSL #1]
+
+ PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1|
+ PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3|
+
+ SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac)
+ SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac)
+
+ ADD count, count, tmp2, LSL #8
+loop1_x
+ ;// first
+ LDRB tmp2, [ptrA, #1]!
+ LDRB tmp4, [ptrA, width]
+ LDRB tmp6, [ptrA, width, LSL #1]
+
+ PKHBT tmp2, tmp2, tmp4, LSL #16 ;// |t4|t2|
+ PKHBT tmp4, tmp4, tmp6, LSL #16 ;// |t6|t4|
+
+ SMUAD tmp2, tmp2, valY ;// t2=(t2*valY + t4*yFrac)
+ MLA tmp5, tmp1, valX, c32 ;// t5=t1*valX+32
+ MLA tmp5, tmp2, xFrac, tmp5 ;// t5=t2*xFrac+t5
+
+ SMUAD tmp4, tmp4, valY ;// t4=(t4*valY + t6*yFrac)
+ MLA tmp6, tmp3, valX, c32 ;// t3=t3*valX+32
+ MLA tmp6, tmp4, xFrac, tmp6 ;// t6=t4*xFrac+t6
+
+ MOV tmp6, tmp6, LSR #6 ;// scale down
+ STRB tmp6, [mb, #8] ;// store pixel
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb], #1 ;// store pixel
+
+ ;// second
+ LDRB tmp1, [ptrA, #1]!
+ LDRB tmp3, [ptrA, width]
+ LDRB tmp5, [ptrA, width, LSL #1]
+
+ PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1|
+ PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3|
+
+ SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac)
+ MLA tmp5, tmp1, xFrac, c32 ;// t1=t1*xFrac+32
+ MLA tmp5, tmp2, valX, tmp5 ;// t5=t2*valX+t5
+
+ SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac)
+ MLA tmp6, tmp3, xFrac, c32 ;// t3=t3*xFrac+32
+ MLA tmp6, tmp4, valX, tmp6 ;// t6=t4*valX+t6
+
+ MOV tmp6, tmp6, LSR #6 ;// scale down
+ STRB tmp6, [mb, #8] ;// store pixel
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb], #1 ;// store pixel
+
+ SUBS count, count, #2<<28
+ BCS loop1_x
+
+ AND tmp2, count, #0x00F00000
+
+ ADDS mb, mb, #16
+ SBC mb, mb, tmp2, LSR #20
+ ADD ptrA, ptrA, width, LSL #1
+ SBC ptrA, ptrA, tmp2, LSR #20
+
+ ADDS count, count, #0xE << 24
+ BGE loop1_y
+
+ ;///////////////////////////////////////////////////////////////////////////
+ ;// Cr
+ ;///////////////////////////////////////////////////////////////////////////
+ LDR height, [sp,#0xfc] ;// height
+ LDR ref, [sp, #0xc4] ;// ref
+ LDR tmp1, [sp, #0xd0] ;// y0
+ LDR tmp2, [sp, #0xcc] ;// x0
+ LDR mb, [sp, #0xc8] ;// predPartChroma
+
+ ADD tmp1, height, tmp1
+ MLA tmp3, tmp1, width, tmp2
+ ADD ptrA, ref, tmp3
+ ADD mb, mb, #64
+
+ AND count, count, #0x00FFFFFF
+ AND tmp1, count, #0x000F0000
+ ADD count, count, tmp1, LSL #8
+ AND tmp2, count, #0x00F00000
+
+ ;// 2x2 pels per iteration
+ ;// bilinear vertical and horizontal interpolation
+loop2_y
+ LDRB tmp1, [ptrA]
+ LDRB tmp3, [ptrA, width]
+ LDRB tmp5, [ptrA, width, LSL #1]
+
+ PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1|
+ PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3|
+
+ SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac)
+ SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac)
+
+ ADD count, count, tmp2, LSL #8
+loop2_x
+ ;// first
+ LDRB tmp2, [ptrA, #1]!
+ LDRB tmp4, [ptrA, width]
+ LDRB tmp6, [ptrA, width, LSL #1]
+
+ PKHBT tmp2, tmp2, tmp4, LSL #16 ;// |t4|t2|
+ PKHBT tmp4, tmp4, tmp6, LSL #16 ;// |t6|t4|
+
+ SMUAD tmp2, tmp2, valY ;// t2=(t2*valY + t4*yFrac)
+ MLA tmp5, tmp1, valX, c32 ;// t5=t1*valX+32
+ MLA tmp5, tmp2, xFrac, tmp5 ;// t5=t2*xFrac+t5
+
+ SMUAD tmp4, tmp4, valY ;// t4=(t4*valY + t6*yFrac)
+ MLA tmp6, tmp3, valX, c32 ;// t3=t3*valX+32
+ MLA tmp6, tmp4, xFrac, tmp6 ;// t6=t4*xFrac+t6
+
+ MOV tmp6, tmp6, LSR #6 ;// scale down
+ STRB tmp6, [mb, #8] ;// store pixel
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb], #1 ;// store pixel
+
+ ;// second
+ LDRB tmp1, [ptrA, #1]!
+ LDRB tmp3, [ptrA, width]
+ LDRB tmp5, [ptrA, width, LSL #1]
+
+ PKHBT tmp1, tmp1, tmp3, LSL #16 ;// |t3|t1|
+ PKHBT tmp3, tmp3, tmp5, LSL #16 ;// |t5|t3|
+
+ SMUAD tmp1, tmp1, valY ;// t1=(t1*valY + t3*yFrac)
+ MLA tmp5, tmp1, xFrac, c32 ;// t1=t1*xFrac+32
+ MLA tmp5, tmp2, valX, tmp5 ;// t5=t2*valX+t5
+
+ SMUAD tmp3, tmp3, valY ;// t3=(t3*valY + t5*yFrac)
+ MLA tmp6, tmp3, xFrac, c32 ;// t3=t3*xFrac+32
+ MLA tmp6, tmp4, valX, tmp6 ;// t6=t4*valX+t6
+
+ MOV tmp6, tmp6, LSR #6 ;// scale down
+ STRB tmp6, [mb, #8] ;// store pixel
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb], #1 ;// store pixel
+
+ SUBS count, count, #2<<28
+ BCS loop2_x
+
+ AND tmp2, count, #0x00F00000
+
+ ADDS mb, mb, #16
+ SBC mb, mb, tmp2, LSR #20
+ ADD ptrA, ptrA, width, LSL #1
+ SBC ptrA, ptrA, tmp2, LSR #20
+
+ ADDS count, count, #0xE << 24
+ BGE loop2_y
+
+ ADD sp,sp,#0xd4
+ LDMFD sp!,{r4-r11,pc}
+
+ END
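
The hor+ver case above combines the same weights in both directions; per output pixel it is the standard 2-D bilinear chroma filter of the H.264 spec, evaluated vertically first and then horizontally by the SMUAD/MLA sequence. A one-function C sketch of the per-pixel arithmetic (illustrative only, not code from this commit), where A and B are horizontal neighbours on one row and C, D the same pair one row below:

    #include <stdint.h>

    /* ((8-xF)(8-yF)*A + xF(8-yF)*B + (8-xF)yF*C + xF*yF*D + 32) >> 6 */
    static inline uint8_t chroma_bilinear(uint32_t A, uint32_t B,
                                          uint32_t C, uint32_t D,
                                          uint32_t xFrac, uint32_t yFrac)
    {
        uint32_t valX = 8 - xFrac, valY = 8 - yFrac;
        return (uint8_t)((valX * valY * A + xFrac * valY * B +
                          valX * yFrac * C + xFrac * yFrac * D + 32) >> 6);
    }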
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_ver.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_ver.s
new file mode 100755
index 0000000..af9df1b
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_chroma_ver.s
@@ -0,0 +1,288 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaVer function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateChromaVer register allocation
+
+ref RN 0
+ptrA RN 0
+
+mb RN 1
+block RN 1
+
+x0 RN 2
+count RN 2
+
+y0 RN 3
+valY RN 3
+
+width RN 4
+
+height RN 5
+tmp7 RN 5
+
+chrPW RN 6
+tmp8 RN 6
+
+tmp1 RN 7
+
+tmp2 RN 8
+
+tmp3 RN 9
+
+tmp4 RN 10
+
+tmp5 RN 11
+chrPH RN 11
+
+tmp6 RN 12
+
+c32 RN 14
+yFrac RN 14
+
+;// Function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateChromaVer
+
+;// Function arguments
+;//
+;// u8 *ref, : 0xc4
+;// u8 *predPartChroma, : 0xc8
+;// i32 x0, : 0xcc
+;// i32 y0, : 0xd0
+;// u32 width, : 0xf8
+;// u32 height, : 0xfc
+;// u32 yFrac, : 0x100
+;// u32 chromaPartWidth, : 0x104
+;// u32 chromaPartHeight : 0x108
+
+h264bsdInterpolateChromaVer
+ STMFD sp!, {r0-r11,lr}
+ SUB sp, sp, #0xc4
+
+ LDR chrPW, [sp, #0x104] ;// chromaPartWidth
+ LDR width, [sp, #0xf8] ;// width
+ CMP x0, #0
+ BLT do_fill
+
+ ADD tmp1, x0, chrPW ;// tmp1 = x0+ chromaPartWidth
+ CMP tmp1, width ;// x0+chromaPartWidth > width
+ BHI do_fill
+
+ CMP y0, #0
+ BLT do_fill
+ LDR chrPH, [sp, #0x108] ;// chromaPartHeight
+ LDR height, [sp, #0xfc] ;// height
+ ADD tmp1, y0, chrPH ;// tmp1 = y0 + chromaPartHeight
+ ADD tmp1, tmp1, #1 ;// tmp1 = y0 + chromaPartHeight + 1
+ CMP tmp1, height
+ BLS skip_fill
+
+do_fill
+ LDR chrPH, [sp, #0x108] ;// chromaPartHeight
+ LDR height, [sp, #0xfc] ;// height
+ ADD tmp1, chrPH, #1 ;// tmp1 = chromaPartHeight+1
+ MOV tmp2, chrPW ;// tmp2 = chromaPartWidth
+ STMIA sp,{width,height,chrPW,tmp1,tmp2}
+ ADD block, sp, #0x1c ;// block
+ BL h264bsdFillBlock
+
+ LDR x0, [sp, #0xcc]
+ LDR y0, [sp, #0xd0]
+ LDR ref, [sp, #0xc4] ;// ref
+ STMIA sp,{width,height,chrPW,tmp1,tmp2}
+ ADD block, sp, #0x1c ;// block
+ MLA ref, height, width, ref ;// ref += width * height;
+ MLA block, chrPW, tmp1, block;// block + (chromaPW)*(chromaPH+1)
+ BL h264bsdFillBlock
+
+ MOV x0, #0 ;// x0 = 0
+ MOV y0, #0 ;// y0 = 0
+ STR x0, [sp, #0xcc]
+ STR y0, [sp, #0xd0]
+ ADD ref, sp, #0x1c ;// ref = block
+ STR ref, [sp, #0xc4] ;// ref
+
+ STR tmp1, [sp, #0xfc] ;// height
+ STR chrPW, [sp, #0xf8] ;// width
+ MOV width, chrPW
+
+skip_fill
+ MLA tmp3, y0, width, x0 ;// tmp3 = y0*width+x0
+ LDR yFrac, [sp, #0x100] ;// yFrac
+ ADD ptrA, ref, tmp3 ;// ptrA = ref + y0*width+x0
+ RSB valY, yFrac, #8 ;// valY = 8-yFrac
+
+ LDR mb, [sp, #0xc8] ;// predPartChroma
+
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (chromaPartWidth-1)
+ ;// [27:24] loop_y (chromaPartHeight-1)
+ ;// [23:20] chromaPartWidth-1
+ ;// [19:16] chromaPartHeight-1
+ ;// [15:00] nothing
+
+ SUB tmp2, chrPH, #1 ;// chromaPartHeight-1
+ SUB tmp1, chrPW, #1 ;// chromaPartWidth-1
+ ADD count, count, tmp2, LSL #16 ;// chromaPartHeight-1
+ ADD count, count, tmp2, LSL #24 ;// loop_y
+ ADD count, count, tmp1, LSL #20 ;// chromaPartWidth-1
+ AND tmp2, count, #0x00F00000 ;// loop_x
+ PKHBT valY, valY, yFrac, LSL #16 ;// |yFrac|valY |
+ MOV valY, valY, LSL #3 ;// multiply by 8 in advance
+ MOV c32, #32
+
+
+ ;///////////////////////////////////////////////////////////////////////////
+ ;// Cb
+ ;///////////////////////////////////////////////////////////////////////////
+
+ ;// 2x2 pels per iteration
+ ;// bilinear vertical interpolation
+
+loop1_y
+ ADD count, count, tmp2, LSL #8
+loop1_x
+ ;// Process 2x2 block
+ LDRB tmp2, [ptrA,width] ;// 2 row, 1 col
+ LDRB tmp3, [ptrA,width, LSL #1] ;// 3 row, 1 col
+ LDRB tmp1, [ptrA],#1 ;// 1 row, 1 col
+
+ LDRB tmp5, [ptrA,width] ;// 2 row, 2 col
+ LDRB tmp6, [ptrA,width, LSL #1] ;// 3 row, 2 col
+ LDRB tmp4, [ptrA],#1 ;// 1 row, 2 col
+
+ PKHBT tmp1, tmp1, tmp2, LSL #16 ;// |B|A|
+ PKHBT tmp2, tmp2, tmp3, LSL #16 ;// |C|B|
+ PKHBT tmp4, tmp4, tmp5, LSL #16 ;// |B|A|
+
+ SMLAD tmp7, tmp2, valY, c32 ;// multiply
+ PKHBT tmp5, tmp5, tmp6, LSL #16 ;// |C|B|
+ SMLAD tmp2, tmp1, valY, c32 ;// multiply
+ SMLAD tmp8, tmp5, valY, c32 ;// multiply
+ SMLAD tmp5, tmp4, valY, c32 ;// multiply
+
+ MOV tmp7, tmp7, LSR #6 ;// scale down
+ STRB tmp7, [mb,#8] ;// store row 2 col 1
+ MOV tmp2, tmp2, LSR #6 ;// scale down
+ STRB tmp2, [mb],#1 ;// store row 1 col 1
+
+ MOV tmp8, tmp8, LSR #6 ;// scale down
+ STRB tmp8, [mb,#8] ;// store row 2 col 2
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb],#1 ;// store row 1 col 2
+
+
+ SUBS count, count, #2<<28
+ BCS loop1_x
+
+ AND tmp2, count, #0x00F00000
+
+ ADDS mb, mb, #16
+ SBC mb, mb, tmp2, LSR #20
+ ADD ptrA, ptrA, width, LSL #1
+ SBC ptrA, ptrA, tmp2, LSR #20
+
+ ADDS count, count, #0xE << 24
+ BGE loop1_y
+
+ ;///////////////////////////////////////////////////////////////////////////
+ ;// Cr
+ ;///////////////////////////////////////////////////////////////////////////
+ LDR height, [sp,#0xfc] ;// height
+ LDR ref, [sp, #0xc4] ;// ref
+ LDR tmp1, [sp, #0xd0] ;// y0
+ LDR tmp2, [sp, #0xcc] ;// x0
+ LDR mb, [sp, #0xc8] ;// predPartChroma
+
+ ADD tmp1, height, tmp1
+ MLA tmp3, tmp1, width, tmp2
+ ADD ptrA, ref, tmp3
+ ADD mb, mb, #64
+
+ AND count, count, #0x00FFFFFF
+ AND tmp1, count, #0x000F0000
+ ADD count, count, tmp1, LSL #8
+ AND tmp2, count, #0x00F00000
+
+ ;// 2x2 pels per iteration
+ ;// bilinear vertical interpolation
+loop2_y
+ ADD count, count, tmp2, LSL #8
+loop2_x
+ ;// Process 2x2 block
+ LDRB tmp2, [ptrA,width] ;// 2 row, 1 col
+ LDRB tmp3, [ptrA,width, LSL #1] ;// 3 row, 1 col
+ LDRB tmp1, [ptrA],#1 ;// 1 row, 1 col
+
+ LDRB tmp5, [ptrA,width] ;// 2 row, 2 col
+ LDRB tmp6, [ptrA,width, LSL #1] ;// 3 row, 2 col
+ LDRB tmp4, [ptrA],#1 ;// 1 row, 2 col
+
+ PKHBT tmp1, tmp1, tmp2, LSL #16 ;// |B|A|
+ PKHBT tmp2, tmp2, tmp3, LSL #16 ;// |C|B|
+ PKHBT tmp4, tmp4, tmp5, LSL #16 ;// |B|A|
+
+ SMLAD tmp7, tmp2, valY, c32 ;// multiply
+ PKHBT tmp5, tmp5, tmp6, LSL #16 ;// |C|B|
+ SMLAD tmp2, tmp1, valY, c32 ;// multiply
+ SMLAD tmp8, tmp5, valY, c32 ;// multiply
+ SMLAD tmp5, tmp4, valY, c32 ;// multiply
+
+ MOV tmp7, tmp7, LSR #6 ;// scale down
+ STRB tmp7, [mb,#8] ;// store row 2 col 1
+ MOV tmp2, tmp2, LSR #6 ;// scale down
+ STRB tmp2, [mb],#1 ;// store row 1 col 1
+
+ MOV tmp8, tmp8, LSR #6 ;// scale down
+ STRB tmp8, [mb,#8] ;// store row 2 col 2
+ MOV tmp5, tmp5, LSR #6 ;// scale down
+ STRB tmp5, [mb],#1 ;// store row 1 col 2
+
+
+ SUBS count, count, #2<<28
+ BCS loop2_x
+
+ AND tmp2, count, #0x00F00000
+
+ ADDS mb, mb, #16
+ SBC mb, mb, tmp2, LSR #20
+ ADD ptrA, ptrA, width, LSL #1
+ SBC ptrA, ptrA, tmp2, LSR #20
+
+ ADDS count, count, #0xE << 24
+ BGE loop2_y
+
+ ADD sp,sp,#0xd4
+ LDMFD sp!, {r4-r11,pc}
+
+ END
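
All three chroma routines share the do_fill fallback seen above: if the source block (plus the extra row or column the filter reads) is not fully inside the picture, h264bsdFillBlock builds an edge-padded copy in the stack buffer, and the interpolation then runs on that copy with x0 = y0 = 0 and adjusted width/height. The exact argument list of h264bsdFillBlock is not visible in this diff; the helper below is only a conceptual stand-in showing the edge-replicating copy such a step performs.

    #include <stdint.h>

    /* Conceptual stand-in (not the decoder's real API): copy a blkW x blkH
     * window starting at (x0, y0), clamping source coordinates to the
     * picture so out-of-frame samples replicate the nearest edge pixel.  */
    static void fill_block_clamped(const uint8_t *pic, uint8_t *block,
                                   int32_t x0, int32_t y0,
                                   int32_t picW, int32_t picH,
                                   int32_t blkW, int32_t blkH)
    {
        for (int32_t y = 0; y < blkH; y++) {
            int32_t sy = y0 + y;
            if (sy < 0)          sy = 0;
            else if (sy >= picH) sy = picH - 1;
            for (int32_t x = 0; x < blkW; x++) {
                int32_t sx = x0 + x;
                if (sx < 0)          sx = 0;
                else if (sx >= picW) sx = picW - 1;
                block[y * blkW + x] = pic[sy * picW + sx];
            }
        }
    }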
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s
new file mode 100755
index 0000000..93968b6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s
@@ -0,0 +1,251 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateHorHalf register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+y0 RN 3
+x_2_0 RN 3
+
+width RN 4
+x_3_1 RN 4
+
+height RN 5
+x_6_4 RN 5
+
+partW RN 6
+x_7_5 RN 6
+
+partH RN 7
+tmp1 RN 7
+
+tmp2 RN 8
+
+tmp3 RN 9
+
+tmp4 RN 10
+
+mult_20_01 RN 11
+mult_20_m5 RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateHorHalf
+
+;// Horizontal filter approach
+;//
+;// Basic idea in horizontal filtering is to adjust coefficients
+;// like below. Calculation is done with 16-bit maths.
+;//
+;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0
+;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ...
+;// y_0 = 20 1 20 -5 -5 1
+;// y_1 = -5 20 1 1 20 -5
+;// y_2 = 1 -5 -5 20 1 20
+;// y_3 = 1 20 -5 -5 20 1
+
+
+h264bsdInterpolateHorHalf
+ STMFD sp!, {r0-r11, lr}
+ SUB sp, sp, #0x1e4
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp4, x0, partW ;// (x0+partWidth)
+ ADD tmp4, tmp4, #5 ;// (x0+partW+5)
+ LDR width, [sp,#0x218] ;// width
+ CMP tmp4, width
+ BHI do_fill ;// (x0+partW)>width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp2, y0, partH ;// (y0+partHeight)
+ LDR height, [sp,#0x21c] ;// height
+ CMP tmp2, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ LDR height, [sp,#0x21c] ;// height
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp4, partW, #5 ;// tmp4 = partW + 5;
+ STMIB sp, {height, tmp4} ;// sp+4 = height, sp+8 = partWidth+5
+ STR partH, [sp,#0xc] ;// sp+c = partHeight
+ STR tmp4, [sp,#0x10] ;// sp+10 = partWidth+5
+ LDR width, [sp,#0x218] ;// width
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR tmp4, [sp,#0x218] ;// width = partWidth+5
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ MLA tmp2, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp2 ;// ref += y0*width+x0
+ ADD ref, ref, #8 ;// ref = ref+8
+ LDR mb, [sp, #0x1e8] ;// mb
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (partWidth-1)
+ ;// [27:24] loop_y (partHeight-1)
+ ;// [23:20] partWidth-1
+ ;// [19:16] partHeight-1
+ ;// [15:00] width
+ MOV count, width
+ SUB partW, partW, #1;
+ SUB partH, partH, #1;
+ ADD tmp2, partH, partW, LSL #4
+ ADD count, count, tmp2, LSL #16
+
+
+ LDR mult_20_01, = 0x00140001
+ LDR mult_20_m5, = 0x0014FFFB
+ MOV plus16, #16
+ AND tmp1, count, #0x000F0000 ;// partHeight-1
+ AND tmp3, count, #0x00F00000 ;// partWidth-1
+ ADD count, count, tmp1, LSL #8
+loop_y
+ LDR x_3_1, [ref, #-8]
+ ADD count, count, tmp3, LSL #8
+ LDR x_7_5, [ref, #-4]
+ UXTB16 x_2_0, x_3_1
+ UXTB16 x_3_1, x_3_1, ROR #8
+ UXTB16 x_6_4, x_7_5
+
+loop_x
+ UXTB16 x_7_5, x_7_5, ROR #8
+
+ SMLAD tmp1, x_2_0, mult_20_01, plus16
+ SMLATB tmp3, x_2_0, mult_20_01, plus16
+ SMLATB tmp2, x_2_0, mult_20_m5, plus16
+ SMLATB tmp4, x_3_1, mult_20_01, plus16
+
+ SMLAD tmp1, x_3_1, mult_20_m5, tmp1
+ SMLATB tmp3, x_3_1, mult_20_m5, tmp3
+ SMLAD tmp2, x_3_1, mult_20_01, tmp2
+ LDR x_3_1, [ref], #4
+ SMLAD tmp4, x_6_4, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_6_4, mult_20_m5, tmp1
+ SMLADX tmp3, x_6_4, mult_20_m5, tmp3
+ SMLADX tmp2, x_6_4, mult_20_01, tmp2
+ SMLADX tmp4, x_7_5, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_7_5, mult_20_01, tmp1
+ UXTB16 x_2_0, x_3_1
+ SMLABB tmp2, x_7_5, mult_20_m5, tmp2
+ SMLADX tmp3, x_7_5, mult_20_01, tmp3
+ SMLABB tmp4, x_2_0, mult_20_01, tmp4
+
+ MOV tmp2, tmp2, ASR #5
+ MOV tmp1, tmp1, ASR #5
+ PKHBT tmp2, tmp2, tmp4, LSL #(16-5)
+ PKHBT tmp1, tmp1, tmp3, LSL #(16-5)
+ USAT16 tmp2, #8, tmp2
+ USAT16 tmp1, #8, tmp1
+
+ SUBS count, count, #4<<28
+ ORR tmp1, tmp1, tmp2, LSL #8
+ STR tmp1, [mb], #4
+ BCC next_y
+
+ UXTB16 x_3_1, x_3_1, ROR #8
+
+ SMLAD tmp1, x_6_4, mult_20_01, plus16
+ SMLATB tmp3, x_6_4, mult_20_01, plus16
+ SMLATB tmp2, x_6_4, mult_20_m5, plus16
+ SMLATB tmp4, x_7_5, mult_20_01, plus16
+
+ SMLAD tmp1, x_7_5, mult_20_m5, tmp1
+ SMLATB tmp3, x_7_5, mult_20_m5, tmp3
+ SMLAD tmp2, x_7_5, mult_20_01, tmp2
+ LDR x_7_5, [ref], #4
+ SMLAD tmp4, x_2_0, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_2_0, mult_20_m5, tmp1
+ SMLADX tmp3, x_2_0, mult_20_m5, tmp3
+ SMLADX tmp2, x_2_0, mult_20_01, tmp2
+ SMLADX tmp4, x_3_1, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_3_1, mult_20_01, tmp1
+ UXTB16 x_6_4, x_7_5
+ SMLABB tmp2, x_3_1, mult_20_m5, tmp2
+ SMLADX tmp3, x_3_1, mult_20_01, tmp3
+ SMLABB tmp4, x_6_4, mult_20_01, tmp4
+
+ MOV tmp2, tmp2, ASR #5
+ MOV tmp1, tmp1, ASR #5
+ PKHBT tmp2, tmp2, tmp4, LSL #(16-5)
+ PKHBT tmp1, tmp1, tmp3, LSL #(16-5)
+ USAT16 tmp2, #8, tmp2
+ USAT16 tmp1, #8, tmp1
+
+ SUBS count, count, #4<<28
+ ORR tmp1, tmp1, tmp2, LSL #8
+ STR tmp1, [mb], #4
+ BCS loop_x
+
+next_y
+ AND tmp3, count, #0x00F00000 ;// partWidth-1
+ SMLABB ref, count, mult_20_01, ref ;// +width
+ ADDS mb, mb, #16 ;// +16, Carry=0
+ SBC mb, mb, tmp3, LSR #20 ;// -(partWidth-1)-1
+ SBC ref, ref, tmp3, LSR #20 ;// -(partWidth-1)-1
+ ADDS count, count, #(1<<28)-(1<<24)
+ BGE loop_y
+
+ ADD sp,sp,#0x1f4
+ LDMFD sp!, {r4-r11, pc}
+
+ END
+
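
The coefficient table in the header comment describes how two pixels are packed per halfword so each SMLAD/SMLATB applies two taps at once; unpacked, every output sample of this routine is the H.264 half-pel six-tap filter. A per-pixel C sketch (illustrative only, not code from this commit):

    #include <stdint.h>

    /* (E - 5F + 20G + 20H - 5I + J + 16) >> 5, clipped to 8 bits, where
     * p points at E, i.e. two samples left of the half-pel centre.      */
    static inline uint8_t hor_half_pel(const uint8_t *p)
    {
        int32_t acc = p[0] - 5 * p[1] + 20 * p[2] + 20 * p[3]
                    - 5 * p[4] + p[5];
        acc = (acc + 16) >> 5;
        if (acc < 0)   acc = 0;
        if (acc > 255) acc = 255;
        return (uint8_t)acc;
    }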
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s
new file mode 100755
index 0000000..de243d4
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_quarter.s
@@ -0,0 +1,273 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateHorQuarter register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+y0 RN 3
+x_2_0 RN 3
+
+width RN 4
+x_3_1 RN 4
+
+height RN 5
+x_6_4 RN 5
+
+partW RN 6
+x_7_5 RN 6
+
+partH RN 7
+tmp1 RN 7
+
+tmp2 RN 8
+
+tmp3 RN 9
+
+tmp4 RN 10
+
+mult_20_01 RN 11
+
+mult_20_m5 RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateHorQuarter
+
+
+;// Horizontal filter approach
+;//
+;// Basic idea in horizontal filtering is to adjust coefficients
+;// like below. Calculation is done with 16-bit maths.
+;//
+;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0
+;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ...
+;// y_0 = 20 1 20 -5 -5 1
+;// y_1 = -5 20 1 1 20 -5
+;// y_2 = 1 -5 -5 20 1 20
+;// y_3 = 1 20 -5 -5 20 1
+
+
+h264bsdInterpolateHorQuarter
+ STMFD sp!, {r0-r11, lr}
+ SUB sp, sp, #0x1e4
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp4, x0, partW ;// (x0+partWidth)
+ ADD tmp4, tmp4, #5 ;// (x0+partW+5)
+ LDR width, [sp,#0x218] ;// width
+ CMP tmp4, width
+ BHI do_fill ;// (x0+partW)>width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp2, y0, partH ;// (y0+partHeight)
+ LDR height, [sp,#0x21c] ;// height
+ CMP tmp2, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ LDR height, [sp,#0x21c] ;// height
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp4, partW, #5 ;// tmp4 = partW + 5;
+ STMIB sp, {height, tmp4} ;// sp+4 = height, sp+8 = partWidth+5
+ STR partH, [sp,#0xc] ;// sp+c = partHeight
+ STR tmp4, [sp,#0x10] ;// sp+10 = partWidth+5
+ LDR width, [sp,#0x218] ;// width
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR tmp4, [sp,#0x218] ;// width = partWidth+5
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ MLA tmp2, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp2 ;// ref += y0*width+x0
+ ADD ref, ref, #8 ;// ref = ref+8
+ LDR mb, [sp, #0x1e8] ;// mb
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (partWidth-1)
+ ;// [27:24] loop_y (partHeight-1)
+ ;// [23:20] partWidth-1
+ ;// [19:16] partHeight-1
+ ;// [15:00] width
+ MOV count, width
+ SUB partW, partW, #1;
+ SUB partH, partH, #1;
+ ADD tmp2, partH, partW, LSL #4
+ ADD count, count, tmp2, LSL #16
+
+
+ LDR mult_20_01, = 0x00140001
+ LDR mult_20_m5, = 0x0014FFFB
+ MOV plus16, #16
+ AND tmp1, count, #0x000F0000 ;// partHeight-1
+ AND tmp3, count, #0x00F00000 ;// partWidth-1
+ ADD count, count, tmp1, LSL #8
+loop_y
+ LDR x_3_1, [ref, #-8]
+ ADD count, count, tmp3, LSL #8
+ LDR x_7_5, [ref, #-4]
+ UXTB16 x_2_0, x_3_1
+ UXTB16 x_3_1, x_3_1, ROR #8
+ UXTB16 x_6_4, x_7_5
+
+loop_x
+ UXTB16 x_7_5, x_7_5, ROR #8
+
+ SMLAD tmp1, x_2_0, mult_20_01, plus16
+ SMLATB tmp3, x_2_0, mult_20_01, plus16
+ SMLATB tmp2, x_2_0, mult_20_m5, plus16
+ SMLATB tmp4, x_3_1, mult_20_01, plus16
+
+ SMLAD tmp1, x_3_1, mult_20_m5, tmp1
+ SMLATB tmp3, x_3_1, mult_20_m5, tmp3
+ SMLAD tmp2, x_3_1, mult_20_01, tmp2
+ LDR x_3_1, [ref], #4
+ SMLAD tmp4, x_6_4, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_6_4, mult_20_m5, tmp1
+ SMLADX tmp3, x_6_4, mult_20_m5, tmp3
+ SMLADX tmp2, x_6_4, mult_20_01, tmp2
+ SMLADX tmp4, x_7_5, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_7_5, mult_20_01, tmp1
+ UXTB16 x_2_0, x_3_1
+ SMLABB tmp2, x_7_5, mult_20_m5, tmp2
+ SMLADX tmp3, x_7_5, mult_20_01, tmp3
+ SMLABB tmp4, x_2_0, mult_20_01, tmp4
+
+ MOV tmp2, tmp2, ASR #5
+ MOV tmp1, tmp1, ASR #5
+ PKHBT tmp2, tmp2, tmp4, LSL #(16-5)
+ PKHBT tmp1, tmp1, tmp3, LSL #(16-5)
+ LDR tmp4, [sp, #0x228]
+ USAT16 tmp2, #8, tmp2
+ USAT16 tmp1, #8, tmp1
+ SUB tmp4, tmp4, #10
+
+ SUBS count, count, #4<<28
+ LDR tmp3, [ref, tmp4]
+ ORR tmp1, tmp1, tmp2, LSL #8
+
+;// quarter pel position
+ LDR tmp2, = 0x80808080
+ MVN tmp3, tmp3
+ UHSUB8 tmp1, tmp1, tmp3
+ EOR tmp1, tmp1, tmp2
+ STR tmp1, [mb], #4
+
+ BCC next_y
+
+ UXTB16 x_3_1, x_3_1, ROR #8
+
+ SMLAD tmp1, x_6_4, mult_20_01, plus16
+ SMLATB tmp3, x_6_4, mult_20_01, plus16
+ SMLATB tmp2, x_6_4, mult_20_m5, plus16
+ SMLATB tmp4, x_7_5, mult_20_01, plus16
+
+ SMLAD tmp1, x_7_5, mult_20_m5, tmp1
+ SMLATB tmp3, x_7_5, mult_20_m5, tmp3
+ SMLAD tmp2, x_7_5, mult_20_01, tmp2
+ LDR x_7_5, [ref], #4
+ SMLAD tmp4, x_2_0, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_2_0, mult_20_m5, tmp1
+ SMLADX tmp3, x_2_0, mult_20_m5, tmp3
+ SMLADX tmp2, x_2_0, mult_20_01, tmp2
+ SMLADX tmp4, x_3_1, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_3_1, mult_20_01, tmp1
+ UXTB16 x_6_4, x_7_5
+ SMLABB tmp2, x_3_1, mult_20_m5, tmp2
+ SMLADX tmp3, x_3_1, mult_20_01, tmp3
+ SMLABB tmp4, x_6_4, mult_20_01, tmp4
+
+ MOV tmp2, tmp2, ASR #5
+ MOV tmp1, tmp1, ASR #5
+ PKHBT tmp2, tmp2, tmp4, LSL #(16-5)
+ PKHBT tmp1, tmp1, tmp3, LSL #(16-5)
+ LDR tmp4, [sp, #0x228]
+ USAT16 tmp2, #8, tmp2
+ USAT16 tmp1, #8, tmp1
+ SUB tmp4, tmp4, #10
+
+ SUBS count, count, #4<<28
+ LDR tmp3, [ref, tmp4]
+ ORR tmp1, tmp1, tmp2, LSL #8
+
+;// quarter pel
+ LDR tmp2, = 0x80808080
+ MVN tmp3, tmp3
+ UHSUB8 tmp1, tmp1, tmp3
+ EOR tmp1, tmp1, tmp2
+
+ STR tmp1, [mb], #4
+ BCS loop_x
+
+next_y
+ AND tmp3, count, #0x00F00000 ;// partWidth-1
+ SMLABB ref, count, mult_20_01, ref ;// +width
+ ADDS mb, mb, #16 ;// +16, Carry=0
+ SBC mb, mb, tmp3, LSR #20 ;// -(partWidth-1)-1
+ SBC ref, ref, tmp3, LSR #20 ;// -(partWidth-1)-1
+ ADDS count, count, #(1<<28)-(1<<24)
+ BGE loop_y
+
+ ADD sp,sp,#0x1f4
+ LDMFD sp!, {r4-r11, pc}
+
+ END
+
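
The quarter-pel routine differs from the half-pel one above only in its final step: after the same six-tap filter it averages the clipped half-pel value with the nearest integer-position sample, rounding upward (the LDR tmp3 / MVN / UHSUB8 / EOR tail of the loop; the byte-packed form of that average is sketched after the hor_ver_quarter file below). Per pixel it is simply the following, shown as an illustrative C one-liner rather than code from this commit:

    #include <stdint.h>

    /* H.264 quarter-pel samples a/c: rounding-up average of the half-pel
     * result and the adjacent full-pel sample.                           */
    static inline uint8_t hor_quarter_pel(uint8_t half_pel, uint8_t full_pel)
    {
        return (uint8_t)(((uint32_t)half_pel + full_pel + 1) >> 1);
    }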
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s
new file mode 100755
index 0000000..1c79b39
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_ver_quarter.s
@@ -0,0 +1,536 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorVerQuarter
+;-- function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateHorVerQuarter register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+y0 RN 3
+x_2_0 RN 3
+res RN 3
+
+x_3_1 RN 4
+tmp1 RN 4
+
+height RN 5
+x_6_4 RN 5
+tmp2 RN 5
+
+partW RN 6
+x_7_5 RN 6
+tmp3 RN 6
+
+partH RN 7
+tmp4 RN 7
+
+tmp5 RN 8
+
+tmp6 RN 9
+
+tmpa RN 10
+
+mult_20_01 RN 11
+tmpb RN 11
+
+mult_20_m5 RN 12
+width RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateHorVerQuarter
+
+;// Horizontal filter approach
+;//
+;// Basic idea in horizontal filtering is to adjust coefficients
+;// like below. Calculation is done with 16-bit maths.
+;//
+;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0
+;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ...
+;// y_0 = 20 1 20 -5 -5 1
+;// y_1 = -5 20 1 1 20 -5
+;// y_2 = 1 -5 -5 20 1 20
+;// y_3 = 1 20 -5 -5 20 1
+
+
+h264bsdInterpolateHorVerQuarter
+ STMFD sp!, {r0-r11, lr}
+ SUB sp, sp, #0x1e4
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ LDR width, [sp,#0x218] ;// width
+ ADD tmpa, x0, partW ;// (x0+partWidth)
+ ADD tmpa, tmpa, #5 ;// (x0+partW+5)
+ CMP tmpa, width
+ BHI do_fill ;// (x0+partW)>width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ LDR height, [sp,#0x21c] ;// height
+ ADD tmp5, y0, partH ;// (y0+partHeight)
+ ADD tmp5, tmp5, #5 ;// (y0+partH+5)
+ CMP tmp5, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ LDR partW, [sp,#0x220] ;// partWidth
+ LDR height, [sp,#0x21c] ;// height
+ ADD tmp5, partH, #5 ;// tmp5 = partH + 5
+ ADD tmpa, partW, #5 ;// tmpa = partW + 5
+ STMIB sp, {height, tmpa} ;// sp+4 = height, sp+8 = partWidth+5
+ LDR width, [sp,#0x218] ;// width
+ STR tmp5, [sp,#0xc] ;// sp+c = partHeight+5
+ STR tmpa, [sp,#0x10] ;// sp+10 = partWidth+5
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR tmpa, [sp,#0x218] ;// width = partWidth+5
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ LDR tmp6, [sp,#0x228] ;// horVerOffset
+ LDR mb, [sp, #0x1e8] ;// mb
+ MLA tmp5, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp5 ;// ref += y0*width+x0
+ STR ref, [sp, #0x1e4] ;// store "ref" for vertical filtering
+ AND tmp6, tmp6, #2 ;// calculate ref for horizontal filter
+ MOV tmpa, #2
+ ADD tmp6, tmpa, tmp6, LSR #1
+ MLA ref, tmp6, width, ref
+ ADD ref, ref, #8 ;// ref = ref+8
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (partWidth-1)
+ ;// [27:24] loop_y (partHeight-1)
+ ;// [23:20] partWidth-1
+ ;// [19:16] partHeight-1
+ ;// [15:00] width
+ MOV count, width
+ SUB partW, partW, #1;
+ SUB partH, partH, #1;
+ ADD tmp5, partH, partW, LSL #4
+ ADD count, count, tmp5, LSL #16
+
+
+ LDR mult_20_01, = 0x00140001 ;// constant multipliers
+ LDR mult_20_m5, = 0x0014FFFB ;// constant multipliers
+ MOV plus16, #16 ;// constant for add
+ AND tmp4, count, #0x000F0000 ;// partHeight-1
+ AND tmp6, count, #0x00F00000 ;// partWidth-1
+ ADD count, count, tmp4, LSL #8 ;// partH-1 to lower part of top byte
+
+;// HORIZONTAL PART
+
+loop_y_hor
+ LDR x_3_1, [ref, #-8]
+ ADD count, count, tmp6, LSL #8 ;// partW-1 to upper part of top byte
+ LDR x_7_5, [ref, #-4]
+ UXTB16 x_2_0, x_3_1
+ UXTB16 x_3_1, x_3_1, ROR #8
+ UXTB16 x_6_4, x_7_5
+
+loop_x_hor
+ UXTB16 x_7_5, x_7_5, ROR #8
+
+ SMLAD tmp4, x_2_0, mult_20_01, plus16
+ SMLATB tmp6, x_2_0, mult_20_01, plus16
+ SMLATB tmp5, x_2_0, mult_20_m5, plus16
+ SMLATB tmpa, x_3_1, mult_20_01, plus16
+
+ SMLAD tmp4, x_3_1, mult_20_m5, tmp4
+ SMLATB tmp6, x_3_1, mult_20_m5, tmp6
+ SMLAD tmp5, x_3_1, mult_20_01, tmp5
+ LDR x_3_1, [ref], #4
+ SMLAD tmpa, x_6_4, mult_20_m5, tmpa
+
+ SMLABB tmp4, x_6_4, mult_20_m5, tmp4
+ SMLADX tmp6, x_6_4, mult_20_m5, tmp6
+ SMLADX tmp5, x_6_4, mult_20_01, tmp5
+ SMLADX tmpa, x_7_5, mult_20_m5, tmpa
+
+ SMLABB tmp4, x_7_5, mult_20_01, tmp4
+ UXTB16 x_2_0, x_3_1
+ SMLABB tmp5, x_7_5, mult_20_m5, tmp5
+ SMLADX tmp6, x_7_5, mult_20_01, tmp6
+ SMLABB tmpa, x_2_0, mult_20_01, tmpa
+
+ MOV tmp5, tmp5, ASR #5
+ MOV tmp4, tmp4, ASR #5
+ PKHBT tmp5, tmp5, tmpa, LSL #(16-5)
+ PKHBT tmp4, tmp4, tmp6, LSL #(16-5)
+ USAT16 tmp5, #8, tmp5
+ USAT16 tmp4, #8, tmp4
+
+ SUBS count, count, #4<<28
+ ORR tmp4, tmp4, tmp5, LSL #8
+ STR tmp4, [mb], #4
+ BCC next_y_hor
+
+ UXTB16 x_3_1, x_3_1, ROR #8
+
+ SMLAD tmp4, x_6_4, mult_20_01, plus16
+ SMLATB tmp6, x_6_4, mult_20_01, plus16
+ SMLATB tmp5, x_6_4, mult_20_m5, plus16
+ SMLATB tmpa, x_7_5, mult_20_01, plus16
+
+ SMLAD tmp4, x_7_5, mult_20_m5, tmp4
+ SMLATB tmp6, x_7_5, mult_20_m5, tmp6
+ SMLAD tmp5, x_7_5, mult_20_01, tmp5
+ LDR x_7_5, [ref], #4
+ SMLAD tmpa, x_2_0, mult_20_m5, tmpa
+
+ SMLABB tmp4, x_2_0, mult_20_m5, tmp4
+ SMLADX tmp6, x_2_0, mult_20_m5, tmp6
+ SMLADX tmp5, x_2_0, mult_20_01, tmp5
+ SMLADX tmpa, x_3_1, mult_20_m5, tmpa
+
+ SMLABB tmp4, x_3_1, mult_20_01, tmp4
+ UXTB16 x_6_4, x_7_5
+ SMLABB tmp5, x_3_1, mult_20_m5, tmp5
+ SMLADX tmp6, x_3_1, mult_20_01, tmp6
+ SMLABB tmpa, x_6_4, mult_20_01, tmpa
+
+ MOV tmp5, tmp5, ASR #5
+ MOV tmp4, tmp4, ASR #5
+ PKHBT tmp5, tmp5, tmpa, LSL #(16-5)
+ PKHBT tmp4, tmp4, tmp6, LSL #(16-5)
+ USAT16 tmp5, #8, tmp5
+ USAT16 tmp4, #8, tmp4
+
+ SUBS count, count, #4<<28
+ ORR tmp4, tmp4, tmp5, LSL #8
+ STR tmp4, [mb], #4
+ BCS loop_x_hor
+
+next_y_hor
+ AND tmp6, count, #0x00F00000 ;// partWidth-1
+ SMLABB ref, count, mult_20_01, ref ;// +width
+ ADDS mb, mb, #16 ;// +16, Carry=0
+ SBC mb, mb, tmp6, LSR #20 ;// -(partWidth-1)-1
+ SBC ref, ref, tmp6, LSR #20 ;// -(partWidth-1)-1
+ ADDS count, count, #(1<<28)-(1<<24) ;// decrement counter (partW)
+ BGE loop_y_hor
+
+
+
+;// VERTICAL PART
+;//
+;// Approach to vertical interpolation
+;//
+;// Interpolation is done by using 32-bit loads and stores
+;// and by using 16 bit arithmetic. 4x4 block is processed
+;// in each round.
+;//
+;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
+;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
+;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
+;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
+;// ..
+;// ..
+;// |a_m1|a_m1|a_m1|a_m1|...
+;// |b_m1|b_m1|b_m1|b_m1|...
+;// |c_m1|c_m1|c_m1|c_m1|...
+;// |d_m1|d_m1|d_m1|d_m1|...
+
+;// Approach to bilinear interpolation to quarter pel position.
+;// 4 bytes are processed in parallel
+;//
+;// algorithm (a+b+1)/2. Rounding upwards (+1) can be achieved by
+;// negating the second operand to get its one's complement (instead of 2's)
+;// and using subtraction; EOR is then used to correct the sign.
+;//
+;// MVN b, b
+;// UHSUB8 a, a, b
+;// EOR a, a, 0x80808080
+
+
+ LDR ref, [sp, #0x1e4] ;// ref
+ LDR tmpa, [sp, #0x228] ;// horVerOffset
+ LDR mb, [sp, #0x1e8] ;// mb
+ LDR width, [sp, #0x218] ;// width
+ ADD ref, ref, #2 ;// calculate correct position
+ AND tmpa, tmpa, #1
+ ADD ref, ref, tmpa
+ LDR plus16, = 0x00100010 ;// +16 to lower and upper halfwords
+ AND count, count, #0x00FFFFFF ;// partWidth-1
+
+ AND tmpa, count, #0x000F0000 ;// partHeight-1
+ ADD count, count, tmpa, LSL #8
+
+loop_y
+ ADD count, count, tmp6, LSL #8 ;// partWidth-1
+
+loop_x
+ LDR tmp1, [ref], width ;// |a4|a3|a2|a1|
+ LDR tmp2, [ref], width ;// |c4|c3|c2|c1|
+ LDR tmp3, [ref], width ;// |g4|g3|g2|g1|
+ LDR tmp4, [ref], width ;// |m4|m3|m2|m1|
+ LDR tmp5, [ref], width ;// |r4|r3|r2|r1|
+ LDR tmp6, [ref], width ;// |t4|t3|t2|t1|
+
+ ;// first four pixels
+ UXTB16 tmpa, tmp3 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp4 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp2 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+
+ UXTAB16 tmpb, tmpb, tmp5 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp3, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp2, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp5, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp1, [mb]
+ LDR tmpa, = 0xFF00FF00
+ MVN tmp1, tmp1
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp1 ;// bilinear interpolation
+ LDR tmp1, [ref], width ;// load next row
+ EOR res, res, tmpa ;// correct sign
+
+ STR res, [mb], #16 ;// next row (mb)
+
+
+ ;// tmp2 = |a4|a3|a2|a1|
+ ;// tmp3 = |c4|c3|c2|c1|
+ ;// tmp4 = |g4|g3|g2|g1|
+ ;// tmp5 = |m4|m3|m2|m1|
+ ;// tmp6 = |r4|r3|r2|r1|
+ ;// tmp1 = |t4|t3|t2|t1|
+
+ ;// second four pixels
+ UXTB16 tmpa, tmp4 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp5 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp3 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp6 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp4, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp5, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp3, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp6, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp2, [mb]
+ LDR tmpa, = 0xFF00FF00
+ MVN tmp2, tmp2
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp2 ;// bilinear interpolation
+ LDR tmp2, [ref], width ;// load next row
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp3 = |a4|a3|a2|a1|
+ ;// tmp4 = |c4|c3|c2|c1|
+ ;// tmp5 = |g4|g3|g2|g1|
+ ;// tmp6 = |m4|m3|m2|m1|
+ ;// tmp1 = |r4|r3|r2|r1|
+ ;// tmp2 = |t4|t3|t2|t1|
+
+ ;// third four pixels
+ UXTB16 tmpa, tmp5 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp6 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp4 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp1 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp5, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp4, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp1, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A+T
+
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp3, [mb]
+ LDR tmpa, = 0xFF00FF00
+ MVN tmp3, tmp3
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp3 ;// bilinear interpolation
+ LDR tmp3, [ref] ;// load next row
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp4 = |a4|a3|a2|a1|
+ ;// tmp5 = |c4|c3|c2|c1|
+ ;// tmp6 = |g4|g3|g2|g1|
+ ;// tmp1 = |m4|m3|m2|m1|
+ ;// tmp2 = |r4|r3|r2|r1|
+ ;// tmp3 = |t4|t3|t2|t1|
+
+ ;// fourth four pixels
+ UXTB16 tmpa, tmp6 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp1 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp5 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp2 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp4 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp6, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp5, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp2, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp5, [mb]
+ LDR tmp4, = 0xFF00FF00
+ MVN tmp5, tmp5
+
+ AND tmpa, tmp4, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp5 ;// bilinear interpolation
+
+ ;// decrement loop_x counter
+ SUBS count, count, #4<<28 ;// decrement x loop counter
+
+ ;// calculate "ref" address for next round
+ SUB ref, ref, width, LSL #3 ;// ref -= 8*width;
+ ADD ref, ref, #4 ;// next column (4 pixels)
+
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #-44
+
+ BCS loop_x
+
+ ADDS mb, mb, #64 ;// set Carry=0
+ ADD ref, ref, width, LSL #2 ;// ref += 4*width
+ AND tmp6, count, #0x00F00000 ;// partWidth-1
+ SBC ref, ref, tmp6, LSR #20 ;// -(partWidth-1)-1
+ SBC mb, mb, tmp6, LSR #20 ;// -(partWidth-1)-1
+
+ ADDS count, count, #0xC << 24 ;// decrement y loop counter
+ BGE loop_y
+
+ ADD sp, sp, #0x1f4
+ LDMFD sp!, {r4-r11, pc}
+
+ END
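
The MVN / UHSUB8 / EOR recipe documented in the comment block before the vertical part works on four bytes at once because UHSUB8 halves each byte difference without letting borrows cross lanes. Below is a single-lane C model of the identity (illustrative only, not code from this commit; the >> on a negative value is taken as arithmetic, which is what UHSUB8 does), with a main() that checks it exhaustively against (a + b + 1) >> 1.

    #include <assert.h>
    #include <stdint.h>

    /* One byte lane of the sequence: MVN gives 255 - b, UHSUB8 computes the
     * arithmetically halved difference (a - (255 - b)) >> 1, and EOR 0x80
     * adds the 128 bias back, yielding (a + b + 1) >> 1.                  */
    static uint8_t lane_avg_round_up(uint8_t a, uint8_t b)
    {
        int32_t nb = (uint8_t)~b;            /* one's complement of b       */
        int32_t h  = (a - nb) >> 1;          /* arithmetic shift, as UHSUB8 */
        return (uint8_t)(h ^ 0x80);          /* undo the -255/2 bias        */
    }

    int main(void)
    {
        for (int a = 0; a < 256; a++)
            for (int b = 0; b < 256; b++)
                assert(lane_avg_round_up((uint8_t)a, (uint8_t)b) ==
                       (uint8_t)((a + b + 1) >> 1));
        return 0;
    }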
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s
new file mode 100755
index 0000000..a81aed7
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s
@@ -0,0 +1,163 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of the horizontal part of
+;-- h264bsdInterpolateMid functions
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+
+;// Register allocation
+
+ref RN 0 ;// pointer to current position in reference image
+mb RN 1 ;// pointer to current position in interpolated mb
+count RN 2 ;// bit-packed width and count values
+
+x_2_0 RN 4
+x_3_1 RN 5
+x_6_4 RN 6
+x_7_5 RN 7
+
+tmp1 RN 8
+tmp2 RN 9
+tmp3 RN 10
+tmp4 RN 11
+
+mult_20_01 RN 12 ;// [20, 1]
+mult_20_m5 RN 14 ;// [20, -5]
+
+
+ EXPORT h264bsdInterpolateMidHorPart
+
+;// Horizontal filter approach
+;//
+;// The basic idea in horizontal filtering is to adjust the filter
+;// coefficients as shown below. The calculation is done with 16-bit
+;// arithmetic.
+;//
+;// Reg x_2_0 x_3_1 x_6_4 x_7_5 x_2_0
+;// [ 2 0 ] [ 3 1 ] [ 6 4 ] [ 7 5 ] [ 10 8 ] ...
+;// y_0 = 20 1 20 -5 -5 1
+;// y_1 = -5 20 1 1 20 -5
+;// y_2 = 1 -5 -5 20 1 20
+;// y_3 = 1 20 -5 -5 20 1
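+;//
+;// As a rough illustration only (the "src"/"dst" names are invented for
+;// this sketch and do not appear in the code), one output sample of this
+;// horizontal pass corresponds to the C-style expression
+;//
+;//   /* 6-tap FIR, coefficients (1,-5,20,20,-5,1); no rounding or
+;//      clipping here, since the result feeds the vertical pass */
+;//   dst[i] = (int16_t)( src[i-2] - 5*src[i-1] + 20*src[i]
+;//                     + 20*src[i+1] - 5*src[i+2] + src[i+3] );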
+
+
+h264bsdInterpolateMidHorPart
+ STMFD sp!, {r4-r11, lr}
+
+ ;// pack values to count register
+ ;// [31:28] loop_x (partWidth-1)
+ ;// [27:24] loop_y (partHeight-1)
+ ;// [23:20] partWidth-1
+ ;// [19:16] partHeight-1
+ ;// [15:00] width
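+ ;//
+ ;// (Illustration only, restating the layout above as C-style field
+ ;//  extraction:
+ ;//    width  = count & 0xFFFF;        /* [15:00] */
+ ;//    loop_y = (count >> 24) & 0xF;   /* [27:24] */
+ ;//    loop_x = (count >> 28) & 0xF;   /* [31:28] */
+ ;//  the two remaining nibbles hold the partWidth-1/partHeight-1 copies
+ ;//  used to reload the loop counters.)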
+
+
+ LDR mult_20_01, = 0x00140001
+ LDR mult_20_m5, = 0x0014FFFB
+ AND tmp3, count, #0x000F0000 ;// partWidth-1
+loop_y
+ LDR x_3_1, [ref, #-8]
+ ADD count, count, tmp3, LSL #12
+ LDR x_7_5, [ref, #-4]
+ UXTB16 x_2_0, x_3_1
+ UXTB16 x_3_1, x_3_1, ROR #8
+ UXTB16 x_6_4, x_7_5
+
+loop_x
+ UXTB16 x_7_5, x_7_5, ROR #8
+
+ SMUAD tmp1, x_2_0, mult_20_01
+ SMULTB tmp2, x_2_0, mult_20_m5
+ SMULTB tmp3, x_2_0, mult_20_01
+ SMULTB tmp4, x_3_1, mult_20_01
+
+ SMLAD tmp1, x_3_1, mult_20_m5, tmp1
+ SMLAD tmp2, x_3_1, mult_20_01, tmp2
+ SMLATB tmp3, x_3_1, mult_20_m5, tmp3
+ LDR x_3_1, [ref], #4
+ SMLAD tmp4, x_6_4, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_6_4, mult_20_m5, tmp1
+ SMLADX tmp2, x_6_4, mult_20_01, tmp2
+ SMLADX tmp3, x_6_4, mult_20_m5, tmp3
+ SMLADX tmp4, x_7_5, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_7_5, mult_20_01, tmp1
+ SMLABB tmp2, x_7_5, mult_20_m5, tmp2
+ UXTB16 x_2_0, x_3_1
+ SMLADX tmp3, x_7_5, mult_20_01, tmp3
+ SMLABB tmp4, x_2_0, mult_20_01, tmp4
+
+ SUBS count, count, #4<<28
+ STR tmp1, [mb], #4
+ STR tmp2, [mb], #4
+ STR tmp3, [mb], #4
+ STR tmp4, [mb], #4
+ BCC next_y
+
+ UXTB16 x_3_1, x_3_1, ROR #8
+
+ SMUAD tmp1, x_6_4, mult_20_01
+ SMULTB tmp2, x_6_4, mult_20_m5
+ SMULTB tmp3, x_6_4, mult_20_01
+ SMULTB tmp4, x_7_5, mult_20_01
+
+ SMLAD tmp1, x_7_5, mult_20_m5, tmp1
+ SMLAD tmp2, x_7_5, mult_20_01, tmp2
+ SMLATB tmp3, x_7_5, mult_20_m5, tmp3
+ LDR x_7_5, [ref], #4
+ SMLAD tmp4, x_2_0, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_2_0, mult_20_m5, tmp1
+ SMLADX tmp2, x_2_0, mult_20_01, tmp2
+ SMLADX tmp3, x_2_0, mult_20_m5, tmp3
+ SMLADX tmp4, x_3_1, mult_20_m5, tmp4
+
+ SMLABB tmp1, x_3_1, mult_20_01, tmp1
+ SMLABB tmp2, x_3_1, mult_20_m5, tmp2
+ UXTB16 x_6_4, x_7_5
+ SMLADX tmp3, x_3_1, mult_20_01, tmp3
+ SMLABB tmp4, x_6_4, mult_20_01, tmp4
+
+ SUBS count, count, #4<<28
+ STR tmp1, [mb], #4
+ STR tmp2, [mb], #4
+ STR tmp3, [mb], #4
+ STR tmp4, [mb], #4
+ BCS loop_x
+
+next_y
+ AND tmp3, count, #0x000F0000 ;// partWidth-1
+ SMLABB ref, count, mult_20_01, ref ;// +width
+ SBC ref, ref, tmp3, LSR #16 ;// -(partWidth-1)-1
+ ADDS count, count, #(1<<28)-(1<<20)
+ BGE loop_y
+
+ LDMFD sp!, {r4-r11, pc}
+
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_half.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_half.s
new file mode 100755
index 0000000..244fc6f
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_half.s
@@ -0,0 +1,347 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerHalf function
+;--
+;-------------------------------------------------------------------------------
+
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateVerHalf register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+res RN 3
+y0 RN 3
+
+tmp1 RN 4
+
+tmp2 RN 5
+height RN 5
+
+tmp3 RN 6
+partW RN 6
+
+tmp4 RN 7
+partH RN 7
+
+tmp5 RN 8
+tmp6 RN 9
+
+tmpa RN 10
+tmpb RN 11
+width RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateVerHalf
+
+;// Approach to vertical interpolation
+;//
+;// Interpolation is done by using 32-bit loads and stores
+;// and by using 16-bit arithmetic. A 4x4 block is processed
+;// in each round.
+;//
+;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
+;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
+;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
+;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
+;// ..
+;// ..
+;// |a_m1|a_m1|a_m1|a_m1|...
+;// |b_m1|b_m1|b_m1|b_m1|...
+;// |c_m1|c_m1|c_m1|c_m1|...
+;// |d_m1|d_m1|d_m1|d_m1|...
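+;//
+;// As a rough illustration only (A,C,G,M,R,T name the six vertically
+;// adjacent source rows loaded in the loop below; "clip" clamps to the
+;// 0..255 range; the names are invented for this sketch), each output
+;// byte is
+;//
+;//   out = clip( (A - 5*C + 20*G + 20*M - 5*R + T + 16) >> 5 );
+;//
+;// i.e. the 6-tap half-sample filter with rounding.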
+
+h264bsdInterpolateVerHalf
+ STMFD sp!, {r0-r11, lr}
+ SUB sp, sp, #0x1e4
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp5, x0, partW ;// (x0+partWidth)
+ LDR width, [sp,#0x218] ;// width
+ CMP tmp5, width
+ BHI do_fill ;// (x0+partW)>width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp6, y0, partH ;// (y0+partHeight)
+ ADD tmp6, tmp6, #5 ;// (y0+partH+5)
+ LDR height, [sp,#0x21c] ;// height
+ CMP tmp6, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp5, partH, #5 ;// r2 = partH + 5;
+ LDR height, [sp,#0x21c] ;// height
+ LDR partW, [sp,#0x220] ;// partWidth
+ STMIB sp, {height, partW} ;// sp+4 = height, sp+8 = partWidth
+ STR tmp5, [sp,#0xc] ;// sp+c partHeight+5
+ STR partW, [sp,#0x10] ;// sp+10 = partWidth
+ LDR width, [sp,#0x218] ;// width
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR partW, [sp,#0x218]
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ MLA tmp6, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp6 ;// ref += y0*width+x0
+ LDR mb, [sp, #0x1e8] ;// mb
+
+ ADD count, partW, partH, LSL #16 ;// |partH|partW|
+ LDR tmp5, = 0x00010001
+ SSUB16 count, count, tmp5; ;// |partH-1|partW-1|
+ LDR plus16, = 0x00100010
+
+ AND tmp1, count, #0x000000FF ;// partWidth
+
+
+loop_y
+ ADD count, count, tmp1, LSL #24 ;// partWidth-1 to top byte
+
+loop_x
+ LDR tmp1, [ref], width ;// |a4|a3|a2|a1|
+ LDR tmp2, [ref], width ;// |c4|c3|c2|c1|
+ LDR tmp3, [ref], width ;// |g4|g3|g2|g1|
+ LDR tmp4, [ref], width ;// |m4|m3|m2|m1|
+ LDR tmp5, [ref], width ;// |r4|r3|r2|r1|
+ LDR tmp6, [ref], width ;// |t4|t3|t2|t1|
+
+ ;// first four pixels
+ UXTB16 tmpa, tmp3 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp4 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp2 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+
+ UXTAB16 tmpb, tmpb, tmp5 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp3, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp2, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp5, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp1, [ref], width
+ LDR tmpa, = 0xFF00FF00
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #16 ;// next row (mb)
+
+ ;// tmp2 = |a4|a3|a2|a1|
+ ;// tmp3 = |c4|c3|c2|c1|
+ ;// tmp4 = |g4|g3|g2|g1|
+ ;// tmp5 = |m4|m3|m2|m1|
+ ;// tmp6 = |r4|r3|r2|r1|
+ ;// tmp1 = |t4|t3|t2|t1|
+
+ ;// second four pixels
+ UXTB16 tmpa, tmp4 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp5 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp3 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp6 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp4, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp5, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp3, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp6, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp2, [ref], width
+ LDR tmpa, = 0xFF00FF00
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp3 = |a4|a3|a2|a1|
+ ;// tmp4 = |c4|c3|c2|c1|
+ ;// tmp5 = |g4|g3|g2|g1|
+ ;// tmp6 = |m4|m3|m2|m1|
+ ;// tmp1 = |r4|r3|r2|r1|
+ ;// tmp2 = |t4|t3|t2|t1|
+
+ ;// third four pixels
+ UXTB16 tmpa, tmp5 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp6 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp4 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp1 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp5, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp4, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp1, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A+T
+
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp3, [ref]
+ LDR tmpa, = 0xFF00FF00
+
+ ;// decrement loop_x counter
+ SUBS count, count, #4<<24 ;// (partWidth-1) -= 4;
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp4 = |a4|a3|a2|a1|
+ ;// tmp5 = |c4|c3|c2|c1|
+ ;// tmp6 = |g4|g3|g2|g1|
+ ;// tmp1 = |m4|m3|m2|m1|
+ ;// tmp2 = |r4|r3|r2|r1|
+ ;// tmp3 = |t4|t3|t2|t1|
+
+ ;// fourth four pixels
+ UXTB16 tmpa, tmp6 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp1 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp5 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp2 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp4 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp6, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp5, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp2, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp4, = 0xFF00FF00
+
+ ;// calculate "ref" address for next round
+ SUB ref, ref, width, LSL #3 ;// ref -= 8*width;
+ ADD ref, ref, #4; ;// next column (4 pixels)
+ AND tmpa, tmp4, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ STR res, [mb], #-44
+
+ BCS loop_x
+
+ ADDS count, count, #252<<16 ;// (partHeight-1) -= 4;
+ ADD ref, ref, width, LSL #2 ;// ref += 4*width
+ AND tmp1, count, #0x000000FF ;// partWidth-1
+ ADD tmp2, tmp1, #1 ;// partWidth
+ SUB ref, ref, tmp2 ;// ref -= partWidth
+ ADD mb, mb, #64;
+ SUB mb, mb, tmp2; ;// mb -= partWidth
+ BGE loop_y
+
+ ADD sp,sp,#0x1f4
+ LDMFD sp!, {r4-r11, pc}
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_quarter.s b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_quarter.s
new file mode 100755
index 0000000..5266c85
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_ver_quarter.s
@@ -0,0 +1,374 @@
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+
+;-------------------------------------------------------------------------------
+;--
+;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerQuarter function
+;--
+;-------------------------------------------------------------------------------
+
+ IF :DEF: H264DEC_WINASM
+ ;// We don't use REQUIRE8 and PRESERVE8 for winasm
+ ELSE
+ REQUIRE8
+ PRESERVE8
+ ENDIF
+
+ AREA |.text|, CODE
+
+;// h264bsdInterpolateVerQuarter register allocation
+
+ref RN 0
+
+mb RN 1
+buff RN 1
+
+count RN 2
+x0 RN 2
+
+res RN 3
+y0 RN 3
+
+tmp1 RN 4
+
+tmp2 RN 5
+height RN 5
+
+tmp3 RN 6
+partW RN 6
+
+tmp4 RN 7
+partH RN 7
+
+tmp5 RN 8
+tmp6 RN 9
+
+tmpa RN 10
+tmpb RN 11
+width RN 12
+
+plus16 RN 14
+
+
+;// function exports and imports
+
+ IMPORT h264bsdFillBlock
+
+ EXPORT h264bsdInterpolateVerQuarter
+
+;// Approach to vertical interpolation
+;//
+;// Interpolation is done by using 32-bit loads and stores
+;// and by using 16-bit arithmetic. A 4x4 block is processed
+;// in each round.
+;//
+;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
+;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
+;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
+;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
+;// ..
+;// ..
+;// |a_m1|a_m1|a_m1|a_m1|...
+;// |b_m1|b_m1|b_m1|b_m1|...
+;// |c_m1|c_m1|c_m1|c_m1|...
+;// |d_m1|d_m1|d_m1|d_m1|...
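+;//
+;// As a rough illustration only (A,C,G,M,R,T name the six vertically
+;// adjacent source rows, as in the half-sample routine; the names are
+;// invented for this sketch), each output byte is the vertical
+;// half-sample value
+;//
+;//   half = clip( (A - 5*C + 20*G + 20*M - 5*R + T + 16) >> 5 );
+;//
+;// averaged with the nearest full-sample row, selected by verOffset:
+;//
+;//   out = ( half + (verOffset ? M : G) + 1 ) >> 1;
+;//
+;// The rounded average is implemented below with UHSUB8 against the
+;// bitwise complement of the selected row followed by EOR 0x80808080,
+;// which computes (a + b + 1) >> 1 for each byte.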
+
+h264bsdInterpolateVerQuarter
+ STMFD sp!, {r0-r11, lr}
+ SUB sp, sp, #0x1e4
+
+ CMP x0, #0
+ BLT do_fill ;// (x0 < 0)
+ LDR partW, [sp,#0x220] ;// partWidth
+ ADD tmp5, x0, partW ;// (x0+partWidth)
+ LDR width, [sp,#0x218] ;// width
+ CMP tmp5, width
+ BHI do_fill ;// (x0+partW)>width
+
+ CMP y0, #0
+ BLT do_fill ;// (y0 < 0)
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp6, y0, partH ;// (y0+partHeight)
+ ADD tmp6, tmp6, #5 ;// (y0+partH+5)
+ LDR height, [sp,#0x21c] ;// height
+ CMP tmp6, height
+ BLS skip_fill ;// no overfill needed
+
+
+do_fill
+ LDR partH, [sp,#0x224] ;// partHeight
+ ADD tmp5, partH, #5 ;// r2 = partH + 5;
+ LDR height, [sp,#0x21c] ;// height
+ LDR partW, [sp,#0x220] ;// partWidth
+ STMIB sp, {height, partW} ;// sp+4 = height, sp+8 = partWidth
+ STR tmp5, [sp,#0xc] ;// sp+c partHeight+5
+ STR partW, [sp,#0x10] ;// sp+10 = partWidth
+ LDR width, [sp,#0x218] ;// width
+ STR width, [sp,#0] ;// sp+0 = width
+ ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
+ BL h264bsdFillBlock
+
+ MOV x0, #0
+ STR x0,[sp,#0x1ec] ;// x0 = 0
+ STR x0,[sp,#0x1f0] ;// y0 = 0
+ ADD ref,sp,#0x28 ;// ref = p1
+ STR partW, [sp,#0x218]
+
+
+skip_fill
+ LDR x0 ,[sp,#0x1ec] ;// x0
+ LDR y0 ,[sp,#0x1f0] ;// y0
+ LDR width, [sp,#0x218] ;// width
+ MLA tmp6, width, y0, x0 ;// y0*width+x0
+ ADD ref, ref, tmp6 ;// ref += y0*width+x0
+ LDR mb, [sp, #0x1e8] ;// mb
+
+ ADD count, partW, partH, LSL #8 ;// |xx|xx|partH|partW|
+ LDR tmp5, = 0x00010100
+ RSB count, tmp5, count, LSL #8 ;// |xx|partH-1|partW-1|xx|
+ LDR tmp2, [sp, #0x228] ;// verOffset
+ ADD count, count, tmp2 ;// |xx|partH-1|partW-1|verOffset|
+ LDR plus16, = 0x00100010
+
+ AND tmp1, count, #0x0000FF00 ;// partWidth
+
+
+loop_y
+ ADD count, count, tmp1, LSL #16 ;// partWidth-1 to top byte
+
+loop_x
+ LDR tmp1, [ref], width ;// |a4|a3|a2|a1|
+ LDR tmp2, [ref], width ;// |c4|c3|c2|c1|
+ LDR tmp3, [ref], width ;// |g4|g3|g2|g1|
+ LDR tmp4, [ref], width ;// |m4|m3|m2|m1|
+ LDR tmp5, [ref], width ;// |r4|r3|r2|r1|
+ LDR tmp6, [ref], width ;// |t4|t3|t2|t1|
+
+ ;// first four pixels
+ UXTB16 tmpa, tmp3 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp4 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp2 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+
+ UXTAB16 tmpb, tmpb, tmp5 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp3, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp2, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp5, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ MOVS tmp1, count, LSL #31 ;// update flags (verOffset)
+ LDR tmpa, = 0xFF00FF00
+ MVNEQ tmp1, tmp3 ;// select verOffset=0
+ MVNNE tmp1, tmp4 ;// select verOffset=1
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp1 ;// bilinear interpolation
+ LDR tmp1, [ref], width ;// load next row
+ EOR res, res, tmpa ;// correct sign
+
+ STR res, [mb], #16 ;// next row (mb)
+
+
+ ;// tmp2 = |a4|a3|a2|a1|
+ ;// tmp3 = |c4|c3|c2|c1|
+ ;// tmp4 = |g4|g3|g2|g1|
+ ;// tmp5 = |m4|m3|m2|m1|
+ ;// tmp6 = |r4|r3|r2|r1|
+ ;// tmp1 = |t4|t3|t2|t1|
+
+ ;// second four pixels
+ UXTB16 tmpa, tmp4 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp5 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp3 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp6 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp4, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp5, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp3, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp6, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmpa, = 0xFF00FF00
+ MVNEQ tmp2, tmp4 ;// select verOffset=0
+ MVNNE tmp2, tmp5 ;// select verOffset=1
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp2 ;// bilinear interpolation
+ LDR tmp2, [ref], width ;// load next row
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp3 = |a4|a3|a2|a1|
+ ;// tmp4 = |c4|c3|c2|c1|
+ ;// tmp5 = |g4|g3|g2|g1|
+ ;// tmp6 = |m4|m3|m2|m1|
+ ;// tmp1 = |r4|r3|r2|r1|
+ ;// tmp2 = |t4|t3|t2|t1|
+
+ ;// third four pixels
+ UXTB16 tmpa, tmp5 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp6 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp4 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp1 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp5, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp6, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp4, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp1, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp2, ROR #8 ;// 16+20(G+M)+A+T
+
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmpa, = 0xFF00FF00
+ MVNEQ tmp3, tmp5 ;// select verOffset=0
+ MVNNE tmp3, tmp6 ;// select verOffset=1
+
+ AND tmpa, tmpa, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp3 ;// bilinear interpolation
+ LDR tmp3, [ref] ;// load next row
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #16 ;// next row
+
+ ;// tmp4 = |a4|a3|a2|a1|
+ ;// tmp5 = |c4|c3|c2|c1|
+ ;// tmp6 = |g4|g3|g2|g1|
+ ;// tmp1 = |m4|m3|m2|m1|
+ ;// tmp2 = |r4|r3|r2|r1|
+ ;// tmp3 = |t4|t3|t2|t1|
+
+ ;// fourth four pixels
+ UXTB16 tmpa, tmp6 ;// |g3|g1|
+ UXTAB16 tmpa, tmpa, tmp1 ;// |g3+m3|g1+m1|
+ UXTB16 tmpb, tmp5 ;// |c3|c1|
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTAB16 tmpb, tmpb, tmp2 ;// |c3+r3|c1+r1|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpa, tmpa, tmp4 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR res, = 0x00FF00FF
+ UXTB16 tmpa, tmp6, ROR #8 ;// |g4|g2|
+ UXTAB16 tmpa, tmpa, tmp1, ROR #8 ;// |g4+m4|g2+m2|
+ AND res, res, tmpb, LSR #5 ;// mask and divide by 32
+
+ ADD tmpa, tmpa, tmpa, LSL #2 ;// 5(G+M)
+ UXTB16 tmpb, tmp5, ROR #8 ;// |c4|c2|
+ ADD tmpa, plus16, tmpa, LSL #2 ;// 16+20(G+M)
+ UXTAB16 tmpb, tmpb, tmp2, ROR #8 ;// |c4+r4|c2+r2|
+ UXTAB16 tmpa, tmpa, tmp4, ROR #8 ;// 16+20(G+M)+A
+ UXTAB16 tmpa, tmpa, tmp3, ROR #8 ;// 16+20(G+M)+A+T
+
+ ADD tmpb, tmpb, tmpb, LSL #2 ;// 5(C+R)
+ SSUB16 tmpa, tmpa, tmpb ;// 16+20(G+M)+(A+T)-5(C+R)
+
+ USAT16 tmpb, #13, tmpa ;// saturate
+ LDR tmp4, = 0xFF00FF00
+ MVNEQ tmp5, tmp6 ;// select verOffset=0
+ MVNNE tmp5, tmp1 ;// select verOffset=1
+
+ AND tmpa, tmp4, tmpb, LSL #3 ;// mask and divide by 32
+ ORR res, res, tmpa
+ LDR tmpa, = 0x80808080
+ UHSUB8 res, res, tmp5 ;// bilinear interpolation
+
+ ;// decrement loop_x counter
+ SUBS count, count, #4<<24 ;// (partWidth-1) -= 4;
+
+ ;// calculate "ref" address for next round
+ SUB ref, ref, width, LSL #3 ;// ref -= 8*width;
+ ADD ref, ref, #4; ;// next column (4 pixels)
+
+ EOR res, res, tmpa ;// correct sign
+ STR res, [mb], #-44
+
+ BCS loop_x
+
+ ADDS count, count, #252<<16 ;// (partHeight-1) -= 4;
+ ADD ref, ref, width, LSL #2 ;// ref += 4*width
+ AND tmp1, count, #0x0000FF00 ;// partWidth-1
+ MOV tmp2, #1
+ ADD tmp2, tmp2, tmp1, LSR #8 ;// partWidth
+ SUB ref, ref, tmp2 ;// ref -= partWidth
+ ADD mb, mb, #64;
+ SUB mb, mb, tmp2; ;// mb -= partWidth
+ BGE loop_y
+
+ ADD sp,sp,#0x1f4
+ LDMFD sp!, {r4-r11, pc}
+
+ END
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/win_asm.bat b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/win_asm.bat
new file mode 100644
index 0000000..1b8d88c
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm11_asm/win_asm.bat
@@ -0,0 +1,15 @@
+echo off
+set ASMFLAGS= -checkreglist -CPU ARM1136 -PreDefine "H264DEC_WINASM SETL {TRUE}"
+set ASM="D:\Program Files\Microsoft Visual Studio 8\VC\ce\bin\x86_arm\armasm"
+echo on
+
+%ASM% %ASMFLAGS% h264bsd_interpolate_chroma_ver.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_chroma_hor.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_hor_half.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_hor_quarter.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_hor_ver_quarter.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_ver_half.s
+%ASM% %ASMFLAGS% h264bsd_interpolate_ver_quarter.s
+
+rem %ASM% %ASMFLAGS% h264bsd_interpolate_chroma_hor_ver.s
+rem %ASM% %ASMFLAGS% h264bsd_interpolate_mid_hor.s