summary refs log tree commit diff stats
path: root/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm
diff options
context:
space:
mode:
author James Dong <jdong@google.com> 2011-05-31 18:53:46 -0700
committer James Dong <jdong@google.com> 2011-06-02 12:32:46 -0700
commit 0c1bc742181ded4930842b46e9507372f0b1b963 (patch)
tree c952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm
parent 92a746c3b18d035189f596ce32847bf26247aaca (diff)
download frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz
frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2
Initial-checkin for ON2 Software AVC/H264 decoder
o When NEON is present, the performance gain of the On2 AVC software decoder over the PV software decoder is more than 30%.
o In addition, it fixes some known PV software decoder issues, such as missing output frames.
o Allow both the PV and On2 software AVC decoders to be available for easy comparison.
o Change output frames from 8 to 16.
Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm')
-rw-r--r-- media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdClearMbLayer.s 66
-rw-r--r-- media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdCountLeadingZeros.s 49
-rw-r--r-- media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFillRow7.s 180
-rw-r--r-- media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFlushBits.s 82
-rw-r--r-- media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdWriteMacroblock.s 152
5 files changed, 529 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdClearMbLayer.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdClearMbLayer.s
new file mode 100644
index 0000000..db11654
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdClearMbLayer.s
@@ -0,0 +1,66 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE
+
+ EXPORT h264bsdClearMbLayer
+
+; Input / output registers
+pMbLayer RN 0
+size RN 1
+pTmp RN 2
+step RN 3
+
+; -- NEON registers --
+
+qZero QN Q0.U8
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdClearMbLayer
+;
+; Functional description:
+; Zero-fill the buffer at pMbLayer with NEON stores, 64 bytes per loop pass (four 16-byte stores).
+; Inputs:
+; r0 = pMbLayer, start of the buffer; r1 = size in bytes, expected to be a non-zero multiple of 64
+; Outputs:
+; Whole 64-byte chunks are cleared; a trailing partial chunk is left untouched and size < 64 writes nothing
+; Returns:
+; none (r0-r3, q0 and the condition flags are used as scratch)
+;------------------------------------------------------------------------------*/
+
+h264bsdClearMbLayer
+
+ VMOV qZero, #0 ; q0 = 16 zero bytes, the source of every store
+ ADD pTmp, pMbLayer, #16 ; second write pointer, 16 bytes ahead of the first
+ MOV step, #32 ; each pointer skips the 16 bytes written through the other one
+ SUBS size, size, #64 ; pre-decrement; carry clear (borrow) means size < 64
+ BXCC lr ; guard: otherwise the counter wraps and the loop below runs far past the buffer
+loop
+ VST1 qZero, [pMbLayer], step ; bytes 0..15 of this chunk
+ SUBS size, size, #64 ; carry stays set while another full chunk remains
+ VST1 qZero, [pTmp], step ; bytes 16..31
+ VST1 qZero, [pMbLayer], step ; bytes 32..47
+ VST1 qZero, [pTmp], step ; bytes 48..63
+ BCS loop ; the VST1 stores leave the flags from SUBS intact
+
+ BX lr
+ END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdCountLeadingZeros.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdCountLeadingZeros.s
new file mode 100644
index 0000000..c7bd73e
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdCountLeadingZeros.s
@@ -0,0 +1,49 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE
+
+ EXPORT h264bsdCountLeadingZeros
+
+; Input / output registers
+value RN 0
+
+; -- NEON registers --
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdCountLeadingZeros
+;
+; Functional description:
+; Count the zero bits above the most significant set bit of a 32-bit word, using a single CLZ.
+; Inputs:
+; r0 = value, the word to examine
+; Outputs:
+; none (no memory is written; only r0 changes)
+; Returns:
+; r0 = number of leading zero bits, 0..32 (32 when value is 0)
+;------------------------------------------------------------------------------*/
+
+h264bsdCountLeadingZeros
+
+ CLZ value, value ; the result overwrites the argument in r0
+ BX lr
+ END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFillRow7.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFillRow7.s
new file mode 100644
index 0000000..5bfac92
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFillRow7.s
@@ -0,0 +1,180 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE
+
+ EXPORT h264bsdFillRow7
+
+; Input / output registers
+
+ref RN 0
+fill RN 1
+left RN 2
+tmp2 RN 2
+center RN 3
+right RN 4
+tmp1 RN 5
+
+; -- NEON registers --
+
+qTmp0 QN Q0.U8
+qTmp1 QN Q1.U8
+dTmp0 DN D0.U8
+dTmp1 DN D1.U8
+dTmp2 DN D2.U8
+dTmp3 DN D3.U8
+
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdFillRow7
+;
+; Functional description:
+; Build one row at fill: left copies of ref[0], then center bytes copied from ref, then right copies of the byte just before the final ref position.
+; Inputs:
+; r0 = ref (source), r1 = fill (destination), r2 = left, r3 = center, [sp] = right; byte counts, presumably non-negative (TODO confirm with callers)
+; Outputs:
+; left + center + right bytes are written starting at fill
+; Returns:
+; none; r4-r6 are restored, r0-r3 are modified and d0-d3 may be modified
+;------------------------------------------------------------------------------*/
+
+h264bsdFillRow7
+ PUSH {r4-r6,lr} ; r4/r5 hold right/tmp1; r6 is saved but not referenced in this routine
+ CMP left, #0
+ LDR right, [sp,#0x10] ; fifth argument, just above the 16 bytes pushed; LDR leaves the CMP flags intact
+ BEQ switch_center ; no left padding requested
+ LDRB tmp1, [ref,#0] ; first source byte, replicated to the left
+
+loop_left
+ SUBS left, left, #1
+ STRB tmp1, [fill], #1
+ BNE loop_left
+
+switch_center
+ ASR tmp2,center,#2 ; whole 4-byte words in center (tmp2 reuses r2; left is finished)
+ CMP tmp2,#9
+ ADDCC pc,pc,tmp2,LSL #2 ; jump table for 0..8 words; in ARM state pc reads as this address + 8, so index 0 is the second B below
+ B loop_center ; reached when tmp2 >= 9 (unsigned compare): copy everything bytewise
+ B loop_center ; 0 words
+ B case_1
+ B case_2
+ B case_3
+ B case_4
+ B case_5
+ B case_6
+ B case_7
+ B case_8
+;case_8
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_7
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_6
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill],#4
+;case_5
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill],#4
+;case_4
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_3
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_2
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_1
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+
+case_8
+ VLD1 {qTmp0, qTmp1}, [ref]! ; 32 bytes through q0/q1
+ SUB center, center, #32
+ VST1 qTmp0, [fill]!
+ VST1 qTmp1, [fill]!
+ B loop_center
+case_7
+ VLD1 {dTmp0,dTmp1,dTmp2}, [ref]! ; 24 bytes through d0-d2 ...
+ SUB center, center, #28
+ LDR tmp2, [ref], #4 ; ... plus one 4-byte word = 28 bytes
+ VST1 {dTmp0,dTmp1,dTmp2}, [fill]!
+ STR tmp2, [fill],#4
+ B loop_center
+case_6
+ VLD1 {dTmp0,dTmp1,dTmp2}, [ref]! ; 24 bytes through d0-d2
+ SUB center, center, #24
+ VST1 {dTmp0,dTmp1,dTmp2}, [fill]!
+ B loop_center
+case_5
+ VLD1 qTmp0, [ref]! ; 16 bytes through q0 ...
+ SUB center, center, #20
+ LDR tmp2, [ref], #4 ; ... plus one 4-byte word = 20 bytes
+ VST1 qTmp0, [fill]!
+ STR tmp2, [fill],#4
+ B loop_center
+case_4
+ VLD1 qTmp0, [ref]! ; 16 bytes through q0
+ SUB center, center, #16
+ VST1 qTmp0, [fill]!
+ B loop_center
+case_3
+ VLD1 dTmp0, [ref]! ; 8 bytes through d0 ...
+ SUB center, center, #12
+ LDR tmp2, [ref], #4 ; ... plus one 4-byte word = 12 bytes
+ VST1 dTmp0, [fill]!
+ STR tmp2, [fill],#4
+ B loop_center
+case_2
+ LDR tmp2, [ref],#4 ; first of two words; falls through into case_1 for the second
+ SUB center, center, #4
+ STR tmp2, [fill], #4
+case_1
+ LDR tmp2, [ref],#4 ; single 4-byte word
+ SUB center, center, #4
+ STR tmp2, [fill], #4
+
+loop_center
+ CMP center, #0 ; copy whatever is left of center one byte at a time
+ LDRBNE tmp2, [ref], #1
+ SUBNE center, center, #1
+ STRBNE tmp2, [fill], #1
+ BNE loop_center
+ CMP right,#0
+ POPEQ {r4-r6,pc} ; no right padding requested: return
+ LDRB tmp2, [ref,#-1] ; byte just before the current ref; if center was 0 this is the byte preceding the original ref
+
+loop_right
+ STRB tmp2, [fill], #1
+ SUBS right, right, #1
+ BNE loop_right
+
+ POP {r4-r6,pc}
+ END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFlushBits.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFlushBits.s
new file mode 100644
index 0000000..21335b8
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFlushBits.s
@@ -0,0 +1,82 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE
+
+ EXPORT h264bsdFlushBits
+
+; Input / output registers
+pStrmData RN 0
+numBits RN 1
+readBits RN 2
+strmBuffSize RN 3
+pStrmBuffStart RN 1
+pStrmCurrPos RN 2
+bitPosInWord RN 1
+
+; -- NEON registers --
+
+
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdFlushBits
+;
+; Functional description:
+; Advance the read-bit counter stored at [pStrmData,#0x10] by numBits, then refresh the bit position and the current byte pointer.
+; Inputs:
+; r0 = pStrmData (stream descriptor, accessed at offsets 0x0..0x10), r1 = numBits
+; Outputs:
+; [pStrmData,#0x10] and [pStrmData,#0x8] are always written; [pStrmData,#0x4] only while the counter is inside the buffer
+; Returns:
+; r0 = 0 on success, 0xFFFFFFFF when the new bit count exceeds 8 * [pStrmData,#0xC]
+;------------------------------------------------------------------------------*/
+
+h264bsdFlushBits
+; Leaf routine: only r0-r3 are touched, so nothing is saved on the stack.
+
+; Fetch the buffer size and the running bit counter.
+ LDR strmBuffSize, [pStrmData, #0xC]
+ LDR readBits, [pStrmData, #0x10]
+
+; New counter and its position inside the current byte.
+; numBits (r1) must be consumed before bitPosInWord reuses r1.
+ ADD readBits, readBits, numBits
+ AND bitPosInWord, readBits, #7
+
+; Both values are stored whether or not the stream has run out.
+ STR bitPosInWord, [pStrmData, #0x8]
+ STR readBits, [pStrmData, #0x10]
+
+; r1 is reused once more, now for the buffer start address.
+ LDR pStrmBuffStart, [pStrmData, #0x0]
+ CMP readBits, strmBuffSize, LSL #3
+
+; LS: still inside the buffer, so update the byte pointer and report 0.
+ ADDLS pStrmCurrPos, pStrmBuffStart, readBits, LSR #3
+ STRLS pStrmCurrPos, [pStrmData, #0x4]
+ MOVLS r0, #0
+; HI: read past the end of the buffer, so report 0xFFFFFFFF and leave the pointer alone.
+ MVNHI r0, #0
+
+ BX lr
+
+ END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdWriteMacroblock.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdWriteMacroblock.s
new file mode 100644
index 0000000..38a0781
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdWriteMacroblock.s
@@ -0,0 +1,152 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE
+
+ EXPORT h264bsdWriteMacroblock
+
+; Input / output registers
+image RN 0
+data RN 1
+width RN 2
+luma RN 3
+cb RN 4
+cr RN 5
+cwidth RN 6
+
+; -- NEON registers --
+
+qRow0 QN Q0.U8
+qRow1 QN Q1.U8
+qRow2 QN Q2.U8
+qRow3 QN Q3.U8
+qRow4 QN Q4.U8
+qRow5 QN Q5.U8
+qRow6 QN Q6.U8
+qRow7 QN Q7.U8
+qRow8 QN Q8.U8
+qRow9 QN Q9.U8
+qRow10 QN Q10.U8
+qRow11 QN Q11.U8
+qRow12 QN Q12.U8
+qRow13 QN Q13.U8
+qRow14 QN Q14.U8
+qRow15 QN Q15.U8
+
+dRow0 DN D0.U8
+dRow1 DN D1.U8
+dRow2 DN D2.U8
+dRow3 DN D3.U8
+dRow4 DN D4.U8
+dRow5 DN D5.U8
+dRow6 DN D6.U8
+dRow7 DN D7.U8
+dRow8 DN D8.U8
+dRow9 DN D9.U8
+dRow10 DN D10.U8
+dRow11 DN D11.U8
+dRow12 DN D12.U8
+dRow13 DN D13.U8
+dRow14 DN D14.U8
+dRow15 DN D15.U8
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdWriteMacroblock
+;
+; Functional description:
+; Write one macroblock into the image. Both luma and chroma
+; components will be written at the same time.
+; Loads and stores are interleaved; chroma reuses q0-q7 once the luma rows held there have been stored.
+; Inputs:
+; data pointer to macroblock data to be written, 256 values for
+; luma followed by 64 values for both chroma components
+; image (r0) is read at offsets 4, 0xC, 0x10 and 0x14 for width and the luma/cb/cr destination pointers
+; Outputs:
+; image pointer to the image where the macroblock will be written
+; luma/cb/cr destinations must satisfy the 16-byte/8-byte alignment hints used by the stores
+; Returns:
+; none
+; (r4-r6 and q4-q7 are saved and restored; r1-r3, q0-q3 and q8-q15 are clobbered)
+;------------------------------------------------------------------------------*/
+
+h264bsdWriteMacroblock
+ PUSH {r4-r6,lr}
+ VPUSH {q4-q7} ; q4-q7 (d8-d15) are callee-saved under the AAPCS and are overwritten below
+
+ LDR width, [image, #4]
+ LDR luma, [image, #0xC]
+ LDR cb, [image, #0x10]
+ LDR cr, [image, #0x14]
+
+
+; Write luma
+ VLD1 {qRow0, qRow1}, [data]! ; each pair load fetches two 16-byte luma rows
+ LSL width, width, #4 ; luma row stride in bytes = width * 16 (width presumably counted in macroblocks, TODO confirm)
+ VLD1 {qRow2, qRow3}, [data]!
+ LSR cwidth, width, #1 ; chroma row stride = half the luma stride
+ VST1 {qRow0}, [luma@128], width ; @128: 16-byte alignment hint; post-increment by one row stride
+ VLD1 {qRow4, qRow5}, [data]!
+ VST1 {qRow1}, [luma@128], width
+ VLD1 {qRow6, qRow7}, [data]!
+ VST1 {qRow2}, [luma@128], width
+ VLD1 {qRow8, qRow9}, [data]!
+ VST1 {qRow3}, [luma@128], width
+ VLD1 {qRow10, qRow11}, [data]!
+ VST1 {qRow4}, [luma@128], width
+ VLD1 {qRow12, qRow13}, [data]!
+ VST1 {qRow5}, [luma@128], width
+ VLD1 {qRow14, qRow15}, [data]!
+ VST1 {qRow6}, [luma@128], width
+
+ VLD1 {qRow0, qRow1}, [data]! ;cb rows 0,1,2,3 (q0/q1 are free: luma rows 0 and 1 are already stored)
+ VST1 {qRow7}, [luma@128], width ; must precede the reload of q6/q7 below
+ VLD1 {qRow2, qRow3}, [data]! ;cb rows 4,5,6,7
+ VST1 {qRow8}, [luma@128], width
+ VLD1 {qRow4, qRow5}, [data]! ;cr rows 0,1,2,3
+ VST1 {qRow9}, [luma@128], width
+ VLD1 {qRow6, qRow7}, [data]! ;cr rows 4,5,6,7
+ VST1 {qRow10}, [luma@128], width
+ VST1 {dRow0}, [cb@64], cwidth ; cb rows 0..7 live in d0-d7 (q0-q3)
+ VST1 {dRow8}, [cr@64], cwidth ; cr rows 0..7 live in d8-d15 (q4-q7)
+ VST1 {qRow11}, [luma@128], width
+ VST1 {dRow1}, [cb@64], cwidth
+ VST1 {dRow9}, [cr@64], cwidth
+ VST1 {qRow12}, [luma@128], width
+ VST1 {dRow2}, [cb@64], cwidth
+ VST1 {dRow10}, [cr@64], cwidth
+ VST1 {qRow13}, [luma@128], width
+ VST1 {dRow3}, [cb@64], cwidth
+ VST1 {dRow11}, [cr@64], cwidth
+ VST1 {qRow14}, [luma@128], width
+ VST1 {dRow4}, [cb@64], cwidth
+ VST1 {dRow12}, [cr@64], cwidth
+ VST1 {qRow15}, [luma] ; last luma row: no post-increment and no alignment hint
+ VST1 {dRow5}, [cb@64], cwidth
+ VST1 {dRow13}, [cr@64], cwidth
+ VST1 {dRow6}, [cb@64], cwidth
+ VST1 {dRow14}, [cr@64], cwidth
+ VST1 {dRow7}, [cb@64] ; last chroma rows: no post-increment
+ VST1 {dRow15}, [cr@64]
+
+ VPOP {q4-q7}
+ POP {r4-r6,pc}
+ END
+
+