summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
diff options
context:
space:
mode:
authorJames Dong <jdong@google.com>2011-05-31 18:53:46 -0700
committerJames Dong <jdong@google.com>2011-06-02 12:32:46 -0700
commit0c1bc742181ded4930842b46e9507372f0b1b963 (patch)
treec952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
parent92a746c3b18d035189f596ce32847bf26247aaca (diff)
downloadframeworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2
Initial-checkin for ON2 Software AVC/H264 decoder
o when neon is present, the performance gain of On2 AVC software decoder over PV software decoder is more than 30%. o In addition, it fixes some known PV software decoder issues like missing output frames o allow both pv and on2 software avc to be available for easy comparison o change output frames from 8 to 16 Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S')
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S239
1 files changed, 239 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
new file mode 100644
index 0000000..53268f6
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src_gcc/omxVCM4P10_PredictIntra_16x16_s.S
@@ -0,0 +1,239 @@
+/*
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ */
+
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+
+ .arm
+ .fpu neon
+
+ .section .rodata
+ .align 4
+;//-------------------------------------------------------
+;// Jump table for the prediction-mode dispatch (the asm
+;// equivalent of a C switch statement).
+;// omxVCM4P10_PredictIntra_16x16 loads entry N of this table
+;// straight into pc, so entry N must be the address of the
+;// handler for mode index N:
+;//   0 = VERT, 1 = HOR, 2 = DC, 3 = PLANE
+;// Each entry is one 32-bit word.
+;//-------------------------------------------------------
+
+armVCM4P10_pIndexTable16x16:
+ .word OMX_VC_16X16_VERT
+ .word OMX_VC_16X16_HOR
+ .word OMX_VC_16X16_DC
+ .word OMX_VC_16X16_PLANE
+
+
+
+;//-------------------------------------------------------
+;// 16-bit constants for the PLANE handler of
+;// omxVCM4P10_PredictIntra_16x16. The handler reads them
+;// eight halfwords (one Q register) at a time, in this order:
+;//   halfwords  0..7  : weights 7,6,5,4 / 3,2,1,8 applied to
+;//                      the neighbour differences
+;//   halfwords  8..15 : column indices 0..7
+;//   halfwords 16..23 : column indices 8..15
+;// Each line below is one D register worth of data.
+;//-------------------------------------------------------
+armVCM4P10_MultiplierTable16x16:
+ .hword 7, 6, 5, 4
+ .hword 3, 2, 1, 8
+ .hword 0, 1, 2, 3
+ .hword 4, 5, 6, 7
+ .hword 8, 9, 10, 11
+ .hword 12, 13, 14, 15
+
+ .text
+
+;//-------------------------------------------------------
+;// omxVCM4P10_PredictIntra_16x16: entry point and mode dispatch.
+;//
+;// Register arguments r0-r3 are left untouched for the mode
+;// handlers. Four further 32-bit arguments are read from the
+;// caller stack: r4 = first, r5 = second, r6 = third, r7 = fourth.
+;// NOTE(review): these presumably correspond to leftStep, dstStep,
+;// predMode and availability of the OpenMAX DL prototype - confirm
+;// against the header.
+;//
+;// r6 indexes armVCM4P10_pIndexTable16x16 and the selected word is
+;// loaded into pc, with r12 = 16 as a row counter for the handlers.
+;// NOTE(review): r6 is not range-checked before the table load.
+;// Each handler pops d8-d15 and r4-r12 again and returns through
+;// the saved lr.
+;//-------------------------------------------------------
+
+;// Bytes the prologue pushes below the stack arguments.
+ .equ PI16_CORE_SAVE_BYTES, 10*4
+ .equ PI16_NEON_SAVE_BYTES, 8*8
+ .equ PI16_STACK_ARGS, PI16_CORE_SAVE_BYTES + PI16_NEON_SAVE_BYTES
+
+ .global omxVCM4P10_PredictIntra_16x16
+ .func omxVCM4P10_PredictIntra_16x16
+omxVCM4P10_PredictIntra_16x16:
+ PUSH {r4-r12,lr}
+ VPUSH {d8-d15}
+ LDR r4,[sp,#PI16_STACK_ARGS+0]
+ LDR r5,[sp,#PI16_STACK_ARGS+4]
+ LDR r6,[sp,#PI16_STACK_ARGS+8]
+ LDR r7,[sp,#PI16_STACK_ARGS+12]
+ LDR r9, =armVCM4P10_pIndexTable16x16
+ MOV r12,#16
+ LDR pc,[r9,r6,LSL #2]
+;//-------------------------------------------------------
+;// OMX_VC_16X16_VERT (table index 0)
+;// Copies the 16 bytes at r1 into each of the 16 rows of the
+;// block at r3, rows being r5 bytes apart. Returns 0 in r0.
+;//-------------------------------------------------------
+OMX_VC_16X16_VERT:
+ ADD r10,r5,r5
+ ADD r8,r3,r5
+ VLD1.8 {d0,d1},[r1]
+;// r3 walks the even rows and r8 the odd rows; both advance
+;// by r10 = 2 * r5 after every store.
+ .rept 7
+ VST1.8 {d0,d1},[r3],r10
+ VST1.8 {d0,d1},[r8],r10
+ .endr
+;// Rows 14 and 15: last pair, no pointer write-back needed.
+ VST1.8 {d0,d1},[r3]
+ VST1.8 {d0,d1},[r8]
+ VPOP {d8-d15}
+ MOV r0,#0
+ POP {r4-r12,pc}
+;//-------------------------------------------------------
+;// OMX_VC_16X16_HOR (table index 1)
+;// For each of the 16 rows, reads one byte from the column that
+;// starts at r0 (entries r4 bytes apart) and writes it to all 16
+;// bytes of the matching row at r3 (rows r5 bytes apart).
+;// Expects r12 = 16 on entry. Returns 0 in r0.
+;//-------------------------------------------------------
+OMX_VC_16X16_HOR:
+;// Even rows go r0 -> r3, odd rows go r8 -> r11. The two strides
+;// are doubled in place so each pointer skips the rows owned by
+;// the other one.
+ ADD r8,r0,r4
+ ADD r11,r3,r5
+ ADD r4,r4,r4
+ ADD r5,r5,r5
+HOR_16X16_RowLoop:
+;// 8 rows per pass, 2 passes. The NEON loads and stores do not
+;// modify the flags, so the BNE below sees the result of this SUBS.
+ SUBS r12,r12,#8
+ .rept 4
+ VLD1.8 {d2[],d3[]},[r0],r4
+ VLD1.8 {d0[],d1[]},[r8],r4
+ VST1.8 {d2,d3},[r3],r5
+ VST1.8 {d0,d1},[r11],r5
+ .endr
+ BNE HOR_16X16_RowLoop
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+;//-------------------------------------------------------
+;// OMX_VC_16X16_DC (table index 2)
+;// Fills the 16x16 block at r3 (rows r5 bytes apart) with one
+;// byte value derived from the neighbouring samples:
+;//   r7 bit 1 set : sum the 16 bytes of the column starting at
+;//                  r0 (entries r4 bytes apart)
+;//   r7 bit 0 set : sum the 16 contiguous bytes at r1
+;//   both sums    : value = (sumColumn + sumRow + 16) >> 5
+;//   one sum      : value = (sum + 8) >> 4
+;//   no sum       : value = 0x80
+;// r11 counts how many of the two sums were taken.
+;// Returns 0 in r0.
+;//-------------------------------------------------------
+OMX_VC_16X16_DC:
+ MOV r11,#0
+ TST r7,#2
+ BEQ L0x14c
+;// Gather the column into d2:d3, one byte per lane. r0 reads the
+;// even entries and r8 the odd ones, each stepping by r10 = 2 * r4.
+ ADD r8,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d2[0]},[r0],r10
+ VLD1.8 {d2[1]},[r8],r10
+ VLD1.8 {d2[2]},[r0],r10
+ VLD1.8 {d2[3]},[r8],r10
+ VLD1.8 {d2[4]},[r0],r10
+ VLD1.8 {d2[5]},[r8],r10
+ VLD1.8 {d2[6]},[r0],r10
+ VLD1.8 {d2[7]},[r8],r10
+ VLD1.8 {d3[0]},[r0],r10
+ VLD1.8 {d3[1]},[r8],r10
+ VLD1.8 {d3[2]},[r0],r10
+ VLD1.8 {d3[3]},[r8],r10
+ VLD1.8 {d3[4]},[r0],r10
+ VLD1.8 {d3[5]},[r8],r10
+ VLD1.8 {d3[6]},[r0],r10
+ VLD1.8 {d3[7]},[r8]
+;// Pairwise reduction 16 x u8 -> 8 x u16 -> 4 x u16 -> 2 x u32 ->
+;// 1 x u64, leaving the column sum in d6.
+ VPADDL.U8 q0,q1
+ ADD r11,r11,#1
+ VPADD.I16 d0,d0,d1
+ VPADDL.U16 d0,d0
+ VPADDL.U32 d6,d0
+;// Provisional result for the column-only case: (sum + 8) >> 4.
+ VRSHR.U64 d8,d6,#4
+L0x14c:
+ TST r7,#1
+ BEQ L0x170
+;// Same reduction for the 16 bytes at r1; the row sum ends up in d7.
+ VLD1.8 {d0,d1},[r1]
+ ADD r11,r11,#1
+ VPADDL.U8 q0,q0
+ VPADD.I16 d0,d0,d1
+ VPADDL.U16 d0,d0
+ VPADDL.U32 d7,d0
+;// Provisional result for the row-only case. This overwrites the
+;// column-only value in d8; the both-sums case is redone below.
+ VRSHR.U64 d8,d7,#4
+L0x170:
+ CMP r11,#2
+ BNE L0x180
+;// Both sums taken: (sumColumn + sumRow + 16) >> 5.
+ VADD.I64 d8,d7,d6
+ VRSHR.U64 d8,d8,#5
+L0x180:
+;// Broadcast the low byte of d8 over q3 (d6:d7). When r11 is 0, d8
+;// was never written by this handler, but q3 is then replaced by
+;// 0x80 below. The two ADDs do not set flags, so the BNE still
+;// tests the CMP against 0.
+ VDUP.8 q3,d8[0]
+ CMP r11,#0
+ ADD r8,r3,r5
+ ADD r10,r5,r5
+ BNE L0x198
+ VMOV.I8 q3,#0x80
+L0x198:
+;// Write the 16 rows: r3 takes the even rows, r8 the odd rows,
+;// both stepping by r10 = 2 * r5.
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ VST1.8 {d6,d7},[r3],r10
+ VST1.8 {d6,d7},[r8],r10
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+;//-------------------------------------------------------
+;// OMX_VC_16X16_PLANE (table index 3)
+;// Fills the 16x16 block at r3 (rows r5 bytes apart) with a
+;// linear gradient fitted to the neighbouring samples.
+;// Notation used in the comments below:
+;//   a[0..15] = the 16 contiguous bytes at r1
+;//   l[0..15] = the 16 bytes of the column starting at r0
+;//              (entries r4 bytes apart)
+;//   k        = the single byte at r2
+;//   H = sum over i=0..6 of (7-i) * (a[14-i] - a[i])  +  8 * (a[15] - k)
+;//   V = same expression with l[] in place of a[]
+;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
+;//   base = 16 * (a[15] + l[15])
+;//   out[y][x] = saturate_u8((base + b*(x-7) + c*(y-7) + 16) >> 5)
+;// NOTE(review): this appears to be the H.264 Intra 16x16 plane
+;// predictor - confirm against the specification.
+;// r7 is not consulted on this path. Expects r12 = 16 on entry.
+;// Returns 0 in r0.
+;//-------------------------------------------------------
+OMX_VC_16X16_PLANE:
+ LDR r9, =armVCM4P10_MultiplierTable16x16
+ VLD1.8 {d0,d1},[r1]
+ VLD1.8 {d4[0]},[r2]
+;// Gather l[0..15] into d2:d3, one byte per lane. r0 reads the even
+;// entries and r8 the odd ones, each stepping by r10 = 2 * r4.
+ ADD r8,r0,r4
+ ADD r10,r4,r4
+ VLD1.8 {d2[0]},[r0],r10
+ VLD1.8 {d2[1]},[r8],r10
+ VLD1.8 {d2[2]},[r0],r10
+ VLD1.8 {d2[3]},[r8],r10
+ VLD1.8 {d2[4]},[r0],r10
+ VLD1.8 {d2[5]},[r8],r10
+ VLD1.8 {d2[6]},[r0],r10
+ VLD1.8 {d2[7]},[r8],r10
+ VLD1.8 {d3[0]},[r0],r10
+ VLD1.8 {d3[1]},[r8],r10
+ VLD1.8 {d3[2]},[r0],r10
+ VLD1.8 {d3[3]},[r8],r10
+ VLD1.8 {d3[4]},[r0],r10
+ VLD1.8 {d3[5]},[r8],r10
+ VLD1.8 {d3[6]},[r0],r10
+ VLD1.8 {d3[7]},[r8]
+;// Build the eight 16-bit differences for H in d8:d9.
+;// d5 = a[15..8] (byte-reversed upper half).
+ VREV64.8 d5,d1
+;// Only lane 0 of q3 (a[15] - k) is used later; the other lanes of
+;// d4 were never loaded and their differences are discarded.
+ VSUBL.U8 q3,d5,d4
+;// Drop a[15]: d5 = a[14..8] followed by a zero byte.
+ VSHR.U64 d5,d5,#8
+;// q4 lane i = a[14-i] - a[i] for i = 0..6; lane 7 is discarded.
+ VSUBL.U8 q4,d5,d0
+;// Shift d9 up one 16-bit lane, then take lanes 1..3 of d9 plus
+;// lane 0 of d6: d9 = { a10-a4, a9-a5, a8-a6, a15-k }.
+ VSHL.I64 d9,d9,#16
+ VEXT.8 d9,d9,d6,#2
+;// Same construction for V, from l[] in d2:d3, into d16:d17.
+ VREV64.8 d12,d3
+ VSUBL.U8 q7,d12,d4
+ VSHR.U64 d12,d12,#8
+ VSUBL.U8 q8,d12,d2
+;// q10 = weights {7,6,5,4,3,2,1,8}; r9 advances by 16 bytes.
+ VLD1.16 {d20,d21},[r9]!
+ VSHL.I64 d17,d17,#16
+ VEXT.8 d17,d17,d14,#2
+;// Weighted products, 32-bit: q11 accumulates H terms, q12 V terms.
+ VMULL.S16 q11,d8,d20
+ VMULL.S16 q12,d16,d20
+ VMLAL.S16 q11,d9,d21
+ VMLAL.S16 q12,d17,d21
+;// Horizontal adds: d22 = H and d23 = V as signed 64-bit values.
+ VPADD.I32 d22,d23,d22
+ VPADD.I32 d23,d25,d24
+ VPADDL.S32 q11,q11
+;// q11 = { b, c } = (5 * {H, V} + 32) >> 6.
+ VSHL.I64 q12,q11,#2
+ VADD.I64 q11,q11,q12
+ VRSHR.S64 q11,q11,#6
+;// q12 = { 7*b, 7*c } computed as 8*x - x.
+ VSHL.I64 q12,q11,#3
+ VSUB.I64 q12,q12,q11
+;// q10 = column indices {0..7}; r9 advances by 16 bytes.
+ VLD1.16 {d20,d21},[r9]!
+;// Broadcast the low 16 bits: q6 = b in every lane, q7 = c in every lane.
+ VDUP.16 q6,d22[0]
+ VDUP.16 q7,d23[0]
+;// base = 16 * (a[15] + l[15]), broadcast from lane 7 of the widened sum.
+ VADDL.U8 q11,d1,d3
+ VSHL.I16 q11,q11,#4
+ VDUP.16 q11,d23[3]
+;// d1 = 7*b + 7*c. q12 is free after this and is reloaded with the
+;// column indices {8..15}.
+ VADD.I64 d1,d24,d25
+ VLD1.16 {d24,d25},[r9]
+;// q13 = base - 7*(b + c): the value at x = 0, y = 0 before rounding.
+ VDUP.16 q13,d1[0]
+ VSUB.I16 q13,q11,q13
+;// q0:q1 = row 0 before the final shift: q13 + b * x for x = 0..15.
+ VMUL.I16 q5,q6,q10
+ VMUL.I16 q6,q6,q12
+ VADD.I16 q0,q5,q13
+ VADD.I16 q1,q6,q13
+L0x2d4:
+;// One row per pass, 16 passes: round, shift right by 5, saturate
+;// to unsigned 8 bits, store, then add c to step to the next row.
+;// The NEON instructions after SUBS do not modify the flags.
+ VQRSHRUN.S16 d6,q0,#5
+ VQRSHRUN.S16 d7,q1,#5
+ SUBS r12,r12,#1
+ VST1.8 {d6,d7},[r3],r5
+ VADD.I16 q0,q0,q7
+ VADD.I16 q1,q1,q7
+ BNE L0x2d4
+ MOV r0,#0
+ VPOP {d8-d15}
+ POP {r4-r12,pc}
+ .endfunc
+
+ .end
+