diff options
author | James Dong <jdong@google.com> | 2011-05-31 18:53:46 -0700 |
---|---|---|
committer | James Dong <jdong@google.com> | 2011-06-02 12:32:46 -0700 |
commit | 0c1bc742181ded4930842b46e9507372f0b1b963 (patch) | |
tree | c952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm | |
parent | 92a746c3b18d035189f596ce32847bf26247aaca (diff) | |
download | frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2 |
Initial-checkin for ON2 Software AVC/H264 decoder
o when neon is present, the performance gain of On2 AVC software decoder
over PV software decoder is more than 30%.
o In addition, it fixes some known PV software decoder issues like missing
output frames
o allow both pv and on2 software avc to be available for easy comparison
o change output frames from 8 to 16
Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm')
5 files changed, 529 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdClearMbLayer.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdClearMbLayer.s
new file mode 100644
index 0000000..db11654
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdClearMbLayer.s
@@ -0,0 +1,66 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+    REQUIRE8
+    PRESERVE8
+
+    AREA |.text|, CODE
+
+    EXPORT h264bsdClearMbLayer
+
+; Input / output registers (pTmp and step are scratch set up by the routine itself)
+pMbLayer RN 0                           ; store pointer A: bytes 0-15 and 32-47 of each 64-byte chunk
+size RN 1                               ; byte count, decremented by 64 per pass
+pTmp RN 2                               ; store pointer B: bytes 16-31 and 48-63 of each 64-byte chunk
+step RN 3                               ; post-increment applied to both pointers after every store
+
+; -- NEON registers --
+
+qZero QN Q0.U8                          ; all-zero source for every store
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdClearMbLayer
+;
+; Functional description:
+;   Zero-fills memory starting at pMbLayer with 16-byte NEON stores, one whole 64-byte chunk per loop pass.
+; Inputs:
+;   pMbLayer (r0) destination address; size (r1) byte count, consumed 64 at a time (no partial chunk is written).
+; Outputs:
+;   Memory at pMbLayer is zeroed; r1, r2, r3, q0 and the condition flags are overwritten.
+; Returns:
+;   Nothing is placed in r0 explicitly; it is left holding the advanced store pointer.
+;------------------------------------------------------------------------------*/
+
+h264bsdClearMbLayer
+
+    VMOV    qZero, #0                   ; q0 = 16 zero bytes
+    ADD     pTmp, pMbLayer, #16         ; second pointer starts 16 bytes in
+    MOV     step, #32                   ; each pointer skips the 16 bytes written by the other one
+    SUBS    size, size, #64             ; size = bytes left after the first pass; NOTE(review): wraps if size < 64 - confirm callers pass a nonzero multiple of 64
+
+loop
+    VST1    qZero, [pMbLayer], step     ; chunk bytes 0-15
+    SUBS    size, size, #64             ; produces the carry tested by BCS; the stores below do not alter flags
+    VST1    qZero, [pTmp], step         ; chunk bytes 16-31
+    VST1    qZero, [pMbLayer], step     ; chunk bytes 32-47
+    VST1    qZero, [pTmp], step         ; chunk bytes 48-63
+    BCS     loop                        ; carry set = no borrow: at least 64 more bytes remain, run another pass
+
+    BX      lr
+    END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdCountLeadingZeros.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdCountLeadingZeros.s
new file mode 100644
index 0000000..c7bd73e
--- /dev/null
+++
b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdCountLeadingZeros.s
@@ -0,0 +1,49 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+    REQUIRE8
+    PRESERVE8
+
+    AREA |.text|, CODE
+
+    EXPORT h264bsdCountLeadingZeros
+
+; Input / output registers
+value RN 0                              ; argument on entry, result on exit
+
+; -- NEON registers -- (none are used by this routine)
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdCountLeadingZeros
+;
+; Functional description:
+;   Counts the leading zero bits of a 32-bit word with a single CLZ instruction.
+; Inputs:
+;   value (r0) word to examine.
+; Outputs:
+;   No memory is written; only r0 is modified.
+; Returns:
+;   r0 = number of leading zero bits (CLZ yields 32 for an input of 0).
+;------------------------------------------------------------------------------*/
+
+h264bsdCountLeadingZeros
+
+    CLZ     value, value                ; result replaces the argument in r0
+    BX      lr
+    END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFillRow7.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFillRow7.s
new file mode 100644
index 0000000..5bfac92
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFillRow7.s
@@ -0,0 +1,180 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+    REQUIRE8
+    PRESERVE8
+
+    AREA |.text|, CODE
+
+    EXPORT h264bsdFillRow7
+
+; Input / output registers
+
+ref RN 0                                ; source pointer, post-incremented by the center copy
+fill RN 1                               ; destination pointer, post-incremented by every store
+left RN 2                               ; number of leading bytes filled with ref[0]
+tmp2 RN 2                               ; scratch; shares r2 with left and is used only after left is consumed
+center RN 3                             ; number of bytes copied from ref to fill
+right RN 4                              ; number of trailing fill bytes, loaded from the stack
+tmp1 RN 5                               ; holds the byte replicated on the left side
+
+; -- NEON registers -- (Q0/Q1 and D0-D3 name the same storage; used as copy buffers)
+
+qTmp0 QN Q0.U8
+qTmp1 QN Q1.U8
+dTmp0 DN D0.U8
+dTmp1 DN D1.U8
+dTmp2 DN D2.U8
+dTmp3 DN D3.U8                          ; declared but not referenced by the code below
+
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdFillRow7
+;
+; Functional description:
+;   Builds one row at fill: ref[0] repeated left times, then center bytes copied from ref, then the byte before the advanced ref repeated right times.
+; Inputs:
+;   ref (r0), fill (r1), left (r2), center (r3) in registers; right is read from the stack into r4.
+; Outputs:
+;   left + center + right bytes are written at fill (the loops treat the three counts as unsigned).
+; Returns:
+;   No return value is set up; r0/r1 are left advanced, and r2, r3, the flags and (on NEON paths) q0/q1 are overwritten.
+;------------------------------------------------------------------------------*/
+
+h264bsdFillRow7
+    PUSH    {r4-r6,lr}                  ; r4/r5 are used below; r6 is not (presumably pads the push to 16 bytes - confirm)
+    CMP     left, #0
+    LDR     right, [sp,#0x10]           ; word that was at [sp] on entry (16 bytes were just pushed); presumably the fifth argument
+    BEQ     switch_center               ; left == 0: skip the left fill (LDR does not disturb the CMP flags)
+    LDRB    tmp1, [ref,#0]              ; byte replicated on the left side
+
+loop_left
+    SUBS    left, left, #1
+    STRB    tmp1, [fill], #1
+    BNE     loop_left
+
+switch_center
+    ASR     tmp2,center,#2              ; tmp2 = center >> 2, the number of whole 4-byte words
+    CMP     tmp2,#9
+    ADDCC   pc,pc,tmp2,LSL #2           ; tmp2 < 9 (unsigned): jump into the table; relies on pc reading as this instruction + 8
+    B       loop_center                 ; executed only when the ADDCC is skipped (tmp2 >= 9): the byte loop copies everything
+    B       loop_center                 ; table entry 0: no whole word to copy
+    B       case_1
+    B       case_2
+    B       case_3
+    B       case_4
+    B       case_5
+    B       case_6
+    B       case_7
+    B       case_8
+;case_8
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_7
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_6
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill],#4
+;case_5
+; LDR tmp2, [ref], #4
+; SUB center, center, #4
+; STR tmp2, [fill],#4
+;case_4
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_3
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_2
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+;case_1
+; LDR tmp2, [ref],#4
+; SUB center, center, #4
+; STR tmp2, [fill], #4
+
+case_8
+    VLD1    {qTmp0, qTmp1}, [ref]!      ; 8 words: 32 bytes through q0/q1
+    SUB     center, center, #32
+    VST1    qTmp0, [fill]!
+    VST1    qTmp1, [fill]!
+    B       loop_center
+case_7
+    VLD1    {dTmp0,dTmp1,dTmp2}, [ref]! ; 7 words: 24 bytes through d0-d2 ...
+    SUB     center, center, #28
+    LDR     tmp2, [ref], #4             ; ... plus 4 bytes through r2
+    VST1    {dTmp0,dTmp1,dTmp2}, [fill]!
+    STR     tmp2, [fill],#4
+    B       loop_center
+case_6
+    VLD1    {dTmp0,dTmp1,dTmp2}, [ref]! ; 6 words: 24 bytes through d0-d2
+    SUB     center, center, #24
+    VST1    {dTmp0,dTmp1,dTmp2}, [fill]!
+    B       loop_center
+case_5
+    VLD1    qTmp0, [ref]!               ; 5 words: 16 bytes through q0 ...
+    SUB     center, center, #20
+    LDR     tmp2, [ref], #4             ; ... plus 4 bytes through r2
+    VST1    qTmp0, [fill]!
+    STR     tmp2, [fill],#4
+    B       loop_center
+case_4
+    VLD1    qTmp0, [ref]!               ; 4 words: 16 bytes through q0
+    SUB     center, center, #16
+    VST1    qTmp0, [fill]!
+    B       loop_center
+case_3
+    VLD1    dTmp0, [ref]!               ; 3 words: 8 bytes through d0 ...
+    SUB     center, center, #12
+    LDR     tmp2, [ref], #4             ; ... plus 4 bytes through r2
+    VST1    dTmp0, [fill]!
+    STR     tmp2, [fill],#4
+    B       loop_center
+case_2
+    LDR     tmp2, [ref],#4              ; first of 2 words, then falls through into case_1
+    SUB     center, center, #4
+    STR     tmp2, [fill], #4
+case_1
+    LDR     tmp2, [ref],#4              ; single word copy
+    SUB     center, center, #4
+    STR     tmp2, [fill], #4
+
+loop_center
+    CMP     center, #0                  ; bytes still to copy: the sub-word remainder after case_N, or all of center when tmp2 >= 9
+    LDRBNE  tmp2, [ref], #1
+    SUBNE   center, center, #1
+    STRBNE  tmp2, [fill], #1
+    BNE     loop_center                 ; still testing the CMP result: the conditional instructions above do not set flags
+    CMP     right,#0
+    POPEQ   {r4-r6,pc}                  ; right == 0: nothing to replicate, return
+    LDRB    tmp2, [ref,#-1]             ; byte immediately before the current ref position
+
+loop_right
+    STRB    tmp2, [fill], #1
+    SUBS    right, right, #1
+    BNE     loop_right
+
+    POP     {r4-r6,pc}
+    END
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFlushBits.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFlushBits.s
new file mode 100644
index 0000000..21335b8
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdFlushBits.s
@@ -0,0 +1,82 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+    REQUIRE8
+    PRESERVE8
+
+    AREA |.text|, CODE
+
+    EXPORT h264bsdFlushBits
+
+; Input / output registers (r1 and r2 each carry several aliases, valid at different points)
+pStrmData RN 0                          ; base pointer for every field access below
+numBits RN 1                            ; number of bits added to the read position
+readBits RN 2                           ; word at offset 0x10
+strmBuffSize RN 3                       ; word at offset 0xC
+pStrmBuffStart RN 1                     ; word at offset 0x0 (r1 reused once numBits is consumed)
+pStrmCurrPos RN 2                       ; value stored to offset 0x4 (r2 reused)
+bitPosInWord RN 1                       ; value stored to offset 0x8 (r1 reused)
+
+; -- NEON registers -- (none are used by this routine)
+
+
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdFlushBits
+;
+; Functional description:
+;   Advances the read position stored at offset 0x10 by numBits and refreshes the fields derived from it.
+; Inputs:
+;   pStrmData (r0) stream state, accessed at offsets 0x0-0x10; numBits (r1) bits to skip.
+; Outputs:
+;   Offsets 0x10 and 0x8 are always rewritten; offset 0x4 only when the new position is in range.
+; Returns:
+;   r0 = 0 when the new position is in range, 0xFFFFFFFF when it exceeds strmBuffSize * 8.
+;------------------------------------------------------------------------------*/
+
+h264bsdFlushBits
+;// PUSH {r4-r6,lr}
+
+    LDR     readBits, [pStrmData, #0x10]
+    LDR     strmBuffSize, [pStrmData, #0xC]
+
+    ADD     readBits, readBits, numBits
+    AND     bitPosInWord, readBits, #7  ; low three bits of the new position; overwrites numBits (both are r1)
+
+    STR     readBits, [pStrmData, #0x10] ; both fields are written back before the range check
+    STR     bitPosInWord, [pStrmData, #0x8]
+
+    LDR     pStrmBuffStart, [pStrmData, #0x0] ; r1 reused again now that bitPosInWord is stored
+
+    CMP     readBits, strmBuffSize, LSL #3 ; compare with strmBuffSize * 8 (presumably bytes converted to bits - confirm)
+
+    BHI     end_of_stream               ; unsigned readBits > strmBuffSize * 8
+
+    ADD     pStrmCurrPos, pStrmBuffStart, readBits, LSR #3 ; start + readBits / 8; result replaces readBits in r2
+    STR     pStrmCurrPos, [pStrmData, #0x4]
+    MOV     r0, #0                      ; return 0
+    BX      lr
+;// POP {r4-r6,pc}
+
+end_of_stream
+    MVN     r0, #0                      ; return 0xFFFFFFFF; offset 0x4 is left unchanged on this path
+    BX      lr
+;// POP {r4-r6,pc}
+
+    END
+
+
diff --git a/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdWriteMacroblock.s b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdWriteMacroblock.s
new file mode 100644
index 0000000..38a0781
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm/h264bsdWriteMacroblock.s
@@ -0,0 +1,152 @@
+;
+; Copyright (C) 2009 The Android Open Source Project
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+
+    REQUIRE8
+    PRESERVE8
+
+    AREA |.text|, CODE
+
+    EXPORT h264bsdWriteMacroblock
+
+; Input / output registers (only image and data are live on entry; the rest are loaded or computed below)
+image RN 0                              ; structure read at offsets 4, 0xC, 0x10 and 0x14
+data RN 1                               ; source pointer, advanced by every VLD1
+width RN 2                              ; loaded from image+4, then scaled into the luma row stride
+luma RN 3                               ; luma destination, loaded from image+0xC
+cb RN 4                                 ; cb destination, loaded from image+0x10
+cr RN 5                                 ; cr destination, loaded from image+0x14
+cwidth RN 6                             ; chroma row stride, half the luma stride
+
+; -- NEON registers -- (qRowN holds luma row N; q0-q7 are reloaded with chroma once their luma rows are stored)
+
+qRow0 QN Q0.U8
+qRow1 QN Q1.U8
+qRow2 QN Q2.U8
+qRow3 QN Q3.U8
+qRow4 QN Q4.U8
+qRow5 QN Q5.U8
+qRow6 QN Q6.U8
+qRow7 QN Q7.U8
+qRow8 QN Q8.U8
+qRow9 QN Q9.U8
+qRow10 QN Q10.U8
+qRow11 QN Q11.U8
+qRow12 QN Q12.U8
+qRow13 QN Q13.U8
+qRow14 QN Q14.U8
+qRow15 QN Q15.U8
+
+dRow0 DN D0.U8                          ; D0-D15 are the halves of Q0-Q7: after the chroma loads dRow0-7 = cb rows, dRow8-15 = cr rows
+dRow1 DN D1.U8
+dRow2 DN D2.U8
+dRow3 DN D3.U8
+dRow4 DN D4.U8
+dRow5 DN D5.U8
+dRow6 DN D6.U8
+dRow7 DN D7.U8
+dRow8 DN D8.U8
+dRow9 DN D9.U8
+dRow10 DN D10.U8
+dRow11 DN D11.U8
+dRow12 DN D12.U8
+dRow13 DN D13.U8
+dRow14 DN D14.U8
+dRow15 DN D15.U8
+
+;/*------------------------------------------------------------------------------
+;
+; Function: h264bsdWriteMacroblock
+;
+; Functional description:
+; Write one macroblock into the image. Both luma and chroma
+; components will be written at the same time.
+; Loads from data are interleaved with the stores; row strides are derived from the width field of image.
+; Inputs:
+; data pointer to macroblock data to be written, 256 values for
+; luma followed by 64 values for both chroma components
+; image (r0) is read at offsets 4 (width), 0xC (luma), 0x10 (cb) and 0x14 (cr).
+; Outputs:
+; image pointer to the image where the macroblock will be written
+; 16 luma rows of 16 bytes, then 8 cb rows and 8 cr rows of 8 bytes each are stored.
+; Returns:
+; none
+;
+;------------------------------------------------------------------------------*/
+
+h264bsdWriteMacroblock
+    PUSH    {r4-r6,lr}
+    VPUSH   {q4-q7}                     ; q4-q7 are overwritten below and restored by the VPOP at exit
+
+    LDR     width, [image, #4]
+    LDR     luma, [image, #0xC]
+    LDR     cb, [image, #0x10]
+    LDR     cr, [image, #0x14]
+
+
+; Write luma (each Q register carries one 16-byte row; loads run ahead of the stores)
+    VLD1    {qRow0, qRow1}, [data]!
+    LSL     width, width, #4            ; luma row stride = width * 16 (width presumably counted in macroblocks - confirm)
+    VLD1    {qRow2, qRow3}, [data]!
+    LSR     cwidth, width, #1           ; chroma row stride = luma stride / 2
+    VST1    {qRow0}, [luma@128], width  ; @128 declares a 16-byte aligned address; luma then advances by one row
+    VLD1    {qRow4, qRow5}, [data]!
+    VST1    {qRow1}, [luma@128], width
+    VLD1    {qRow6, qRow7}, [data]!
+    VST1    {qRow2}, [luma@128], width
+    VLD1    {qRow8, qRow9}, [data]!
+    VST1    {qRow3}, [luma@128], width
+    VLD1    {qRow10, qRow11}, [data]!
+    VST1    {qRow4}, [luma@128], width
+    VLD1    {qRow12, qRow13}, [data]!
+    VST1    {qRow5}, [luma@128], width
+    VLD1    {qRow14, qRow15}, [data]!   ; last luma rows; data now points at the chroma samples
+    VST1    {qRow6}, [luma@128], width
+
+    VLD1    {qRow0, qRow1}, [data]! ;cb rows 0,1,2,3 (luma rows 0 and 1 were already stored, so q0/q1 are free)
+    VST1    {qRow7}, [luma@128], width
+    VLD1    {qRow2, qRow3}, [data]! ;cb rows 4,5,6,7
+    VST1    {qRow8}, [luma@128], width
+    VLD1    {qRow4, qRow5}, [data]! ;cr rows 0,1,2,3
+    VST1    {qRow9}, [luma@128], width
+    VLD1    {qRow6, qRow7}, [data]! ;cr rows 4,5,6,7 (luma row 7 was stored three instructions earlier)
+    VST1    {qRow10}, [luma@128], width
+    VST1    {dRow0}, [cb@64], cwidth    ; @64 declares an 8-byte aligned address; one 8-byte chroma row per store
+    VST1    {dRow8}, [cr@64], cwidth
+    VST1    {qRow11}, [luma@128], width
+    VST1    {dRow1}, [cb@64], cwidth
+    VST1    {dRow9}, [cr@64], cwidth
+    VST1    {qRow12}, [luma@128], width
+    VST1    {dRow2}, [cb@64], cwidth
+    VST1    {dRow10}, [cr@64], cwidth
+    VST1    {qRow13}, [luma@128], width
+    VST1    {dRow3}, [cb@64], cwidth
+    VST1    {dRow11}, [cr@64], cwidth
+    VST1    {qRow14}, [luma@128], width
+    VST1    {dRow4}, [cb@64], cwidth
+    VST1    {dRow12}, [cr@64], cwidth
+    VST1    {qRow15}, [luma]            ; final luma row: no alignment qualifier and no post-increment
+    VST1    {dRow5}, [cb@64], cwidth
+    VST1    {dRow13}, [cr@64], cwidth
+    VST1    {dRow6}, [cb@64], cwidth
+    VST1    {dRow14}, [cr@64], cwidth
+    VST1    {dRow7}, [cb@64]            ; final cb row: no post-increment
+    VST1    {dRow15}, [cr@64]           ; final cr row: no post-increment
+
+    VPOP    {q4-q7}
+    POP     {r4-r6,pc}
+    END
+
+
|