diff options
author | James Dong <jdong@google.com> | 2011-05-31 18:53:46 -0700 |
---|---|---|
committer | James Dong <jdong@google.com> | 2011-06-02 12:32:46 -0700 |
commit | 0c1bc742181ded4930842b46e9507372f0b1b963 (patch) | |
tree | c952bfcb03ff7cce5e0f91ad7d25c67a2fdd39cb /media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s | |
parent | 92a746c3b18d035189f596ce32847bf26247aaca (diff) | |
download | frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.zip frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.gz frameworks_av-0c1bc742181ded4930842b46e9507372f0b1b963.tar.bz2 |
Initial-checkin for ON2 Software AVC/H264 decoder
o when neon is present, the performance gain of On2 AVC software decoder
over PV software decoder is more than 30%.
o In addition, it fixes some known PV software decoder issues like missing
output frames
o allow both pv and on2 software avc to be available for easy comparision
o change output frames from 8 to 16
Change-Id: I567ad1842025ead7092f0c47e3513d6d9ca232dd
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s')
-rw-r--r-- | media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s | 469 |
1 files changed, 469 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s new file mode 100644 index 0000000..73caec2 --- /dev/null +++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s @@ -0,0 +1,469 @@ +;// +;// +;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s +;// OpenMAX DL: v1.0.2 +;// Revision: 9641 +;// Date: Thursday, February 7, 2008 +;// +;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. +;// +;// +;// +;// Description: +;// H.264 inverse quantize and transform module +;// +;// + +;// Include standard headers + + INCLUDE omxtypes_s.h + INCLUDE armCOMM_s.h + +;// Import/Export symbols required from/to other files +;// (For example tables) + + IMPORT armVCM4P10_UnpackBlock4x4 + IMPORT armVCM4P10_QPDivTable + IMPORT armVCM4P10_VMatrixQPModTable + + M_VARIANTS ARM1136JS + +;// Set debugging level +;//DEBUG_ON SETL {TRUE} + + +;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4 + + +;// Guarding implementation by the processor name + + IF ARM1136JS + + +;//Input Registers +pData RN 0 +QP RN 1 + +;//Output Registers + + +;//Local Scratch Registers + +;// Packed Input pixels +in00 RN 2 ;// Src[0] & Src[1] +in02 RN 3 ;// Src[2] & Src[3] +in10 RN 4 ;// Src[4] & Src[5] +in12 RN 5 ;// Src[6] & Src[7] +in20 RN 6 ;// Src[8] & Src[9] +in22 RN 7 ;// Src[10] & Src[11] +in30 RN 8 ;// Src[12] & Src[13] +in32 RN 9 ;// Src[14] & Src[15] + +;// Transpose for Row operations (Rows to cols) +trRow00 RN 2 +trRow10 RN 10 +trRow02 RN 3 +trRow12 RN 5 +trRow20 RN 11 +trRow30 RN 12 +trRow32 RN 14 +trRow22 RN 7 + +;// Intermediate calculations +rowSum1 RN 4 +rowSum2 RN 6 +rowDiff1 RN 8 +rowDiff2 RN 9 + + +;// Row operated pixels +rowOp00 RN 2 +rowOp10 RN 10 +rowOp20 RN 11 +rowOp30 RN 12 +rowOp02 RN 3 +rowOp12 RN 5 +rowOp22 RN 7 +rowOp32 RN 14 + +;// Transpose for colulmn operations +trCol00 RN 2 +trCol02 RN 3 +trCol10 RN 4 +trCol12 RN 5 +trCol20 RN 6 +trCol22 RN 7 +trCol30 RN 8 +trCol32 RN 9 + +;// Intermediate calculations +colSum1 RN 10 +colSum2 RN 11 +colDiff1 RN 12 +colDiff2 RN 14 + + +;// Coloumn operated pixels +colOp00 RN 2 +colOp02 RN 3 +colOp10 RN 4 +colOp12 RN 5 +colOp20 RN 6 +colOp22 RN 7 +colOp30 RN 8 +colOp32 RN 9 + +;// Temporary scratch varaibles +pQPDivTable RN 0 +pQPModTable RN 11 +Shift RN 10 +Scale RN 14 +Round RN 0 + +temp1 RN 10 +temp2 RN 11 +temp3 RN 12 +temp4 RN 1 + + + +;// InvTransformed and Dequantized pixels +out00 RN 2 +out02 RN 3 +out10 RN 4 +out12 RN 5 +out20 RN 6 +out22 RN 7 +out30 RN 8 +out32 RN 9 + + + + + ;// Allocate stack memory required by the function + M_ALLOC4 pDataOnStack, 4 + + ;// Write function header + M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11 + + ;****************************************************************** + ;// The strategy used in implementing the transform is as follows:* + ;// Load the 4x4 block into 8 registers * + ;// Transpose the 4x4 matrix * + ;// Perform the row operations (on columns) using SIMD * + ;// Transpose the 4x4 result matrix * + ;// Perform the coloumn operations * + ;// Store the 4x4 block at one go * + ;****************************************************************** + + ;// Load all the 4x4 pixels + + LDMIA pData,{in00,in02,in10,in12,in20,in22,in30,in32} + + ;//***************************************************************** + ;// + ;// Transpose the matrix inorder to perform row ops as coloumn ops + ;// Input: in[][] = original matrix + ;// Output: trRow[][]= transposed matrix + ;// Step1: Obtain the LL part of the transposed matrix + ;// Step2: Obtain the HL part + ;// step3: Obtain the LH part + ;// Step4: Obtain the HH part + ;// + ;//***************************************************************** + + ;// LL 2x2 transposed matrix + ;// d0 d1 - - + ;// d4 d5 - - + ;// - - - - + ;// - - - - + + PKHTB trRow10,in10,in00,ASR #16 ;// [5 4] = [f5:f1] + PKHBT trRow00,in00,in10,LSL #16 ;// [1 0] = [f4:f0] + + ;// HL 2x2 transposed matrix + ;// - - - - + ;// - - - - + ;// d8 d9 - - + ;// d12 d13 - - + + + PKHTB trRow30,in12,in02,ASR #16 ;// [13 12] = [7 3] + PKHBT trRow20,in02,in12,LSL #16 ;// [9 8] = [6 2] + + ;// LH 2x2 transposed matrix + ;// - - d2 d3 + ;// - - d6 d7 + ;// - - - - + ;// - - - - + + PKHBT trRow02,in20,in30,LSL #16 ;// [3 2] = [f12:f8] + PKHTB trRow12,in30,in20,ASR #16 ;// [7 6] = [f13:f9] + + + + + ;// HH 2x2 transposed matrix + ;// - - - - + ;// - - - - + ;// - - d10 d11 + ;// - - d14 d15 + + PKHTB trRow32,in32,in22,ASR #16 ;// [15 14] = [15 11] + PKHBT trRow22,in22,in32,LSL #16 ;// [11 10] = [14 10] + + + ;**************************************** + ;// Row Operations (Performed on columns) + ;**************************************** + + + ;// SIMD operations on first two columns(two rows of the original matrix) + + SADD16 rowSum1,trRow00,trRow10 ;// (c0+c1) + SADD16 rowSum2,trRow20,trRow30 ;// (c2+c3) + SSUB16 rowDiff1,trRow00,trRow10 ;// (c0-c1) + SSUB16 rowDiff2,trRow20,trRow30 ;// (c2-c3) + SADD16 rowOp00,rowSum1,rowSum2 ;// (c0+c1+c2+c3) + SSUB16 rowOp10,rowSum1,rowSum2 ;// (c0+c1-c2-c3) + SSUB16 rowOp20,rowDiff1,rowDiff2 ;// (c0-c1-c2+c3) + SADD16 rowOp30,rowDiff1,rowDiff2 ;// (c0-c1+c2-c3) + + + ;// SIMD operations on next two columns(next two rows of the original matrix) + + SADD16 rowSum1,trRow02,trRow12 ;// (c0+c1) + SADD16 rowSum2,trRow22,trRow32 ;// (c2+c3) + SSUB16 rowDiff1,trRow02,trRow12 ;// (c0-c1) + SSUB16 rowDiff2,trRow22,trRow32 ;// (c2-c3) + SADD16 rowOp02,rowSum1,rowSum2 ;// (c0+c1+c2+c3) + SSUB16 rowOp12,rowSum1,rowSum2 ;// (c0+c1-c2-c3) + SSUB16 rowOp22,rowDiff1,rowDiff2 ;// (c0-c1-c2+c3) + SADD16 rowOp32,rowDiff1,rowDiff2 ;// (c0-c1+c2-c3) + + + + ;***************************************************************** + ;// Transpose the resultant matrix + ;// Input: rowOp[][] + ;// Output: trCol[][] + ;***************************************************************** + + ;// LL 2x2 transposed matrix + ;// d0 d1 - - + ;// d4 d5 - - + ;// - - - - + ;// - - - - + + PKHTB trCol10,rowOp10,rowOp00,ASR #16 ;// [5 4] = [f5:f1] + PKHBT trCol00,rowOp00,rowOp10,LSL #16 ;// [1 0] = [f4:f0] + + ;// HL 2x2 transposed matrix + ;// - - - - + ;// - - - - + ;// d8 d9 - - + ;// d12 d13 - - + + + PKHTB trCol30,rowOp12,rowOp02,ASR #16 ;// [13 12] = [7 3] + PKHBT trCol20,rowOp02,rowOp12,LSL #16 ;// [9 8] = [6 2] + + ;// LH 2x2 transposed matrix + ;// - - d2 d3 + ;// - - d6 d7 + ;// - - - - + ;// - - - - + + PKHBT trCol02,rowOp20,rowOp30,LSL #16 ;// [3 2] = [f12:f8] + PKHTB trCol12,rowOp30,rowOp20,ASR #16 ;// [7 6] = [f13:f9] + + + + + ;// HH 2x2 transposed matrix + ;// - - - - + ;// - - - - + ;// - - d10 d11 + ;// - - d14 d15 + + PKHTB trCol32,rowOp32,rowOp22,ASR #16 ;// [15 14] = [15 11] + PKHBT trCol22,rowOp22,rowOp32,LSL #16 ;// [11 10] = [14 10] + + + ;******************************* + ;// Coloumn Operations + ;******************************* + + ;//-------------------------------------------------------------------------------------- + ;// Store pData(RN0) on stack and restore it only at the final store back + ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls + ;//-------------------------------------------------------------------------------------- + M_STR pData,pDataOnStack + + + ;// SIMD operations on first two columns(two rows of the original matrix) + + SADD16 colSum1,trCol00,trCol10 ;// (c0+c1) + SADD16 colSum2,trCol20,trCol30 ;// (c2+c3) + SSUB16 colDiff1,trCol00,trCol10 ;// (c0-c1) + SSUB16 colDiff2,trCol20,trCol30 ;// (c2-c3) + SADD16 colOp00,colSum1,colSum2 ;// (c0+c1+c2+c3) + SSUB16 colOp10,colSum1,colSum2 ;// (c0+c1-c2-c3) + SSUB16 colOp20,colDiff1,colDiff2 ;// (c0-c1-c2+c3) + SADD16 colOp30,colDiff1,colDiff2 ;// (c0-c1+c2-c3) + + + ;// SIMD operations on next two columns(next two rows of the original matrix) + + LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer + SADD16 colSum1,trCol02,trCol12 ;// (c0+c1) + SADD16 colSum2,trCol22,trCol32 ;// (c2+c3) + SSUB16 colDiff1,trCol02,trCol12 ;// (c0-c1) + SSUB16 colDiff2,trCol22,trCol32 ;// (c2-c3) + SADD16 colOp02,colSum1,colSum2 ;// (c0+c1+c2+c3) + SSUB16 colOp12,colSum1,colSum2 ;// (c0+c1-c2-c3) + LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer + LDRSB Shift, [pQPDivTable, QP] ;// Shift = pQPDivTable[QP] + SSUB16 colOp22,colDiff1,colDiff2 ;// (c0-c1-c2+c3) + SADD16 colOp32,colDiff1,colDiff2 ;// (c0-c1+c2-c3) + + + LDRSB Scale, [pQPModTable, QP] ;// Scale = pQPModTable[QP] + + ;//---------------------------------------------------------------------- + ;// + ;// <Dequantize> improves on the c-reference code + ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together + ;// We do not subtract 2 from Shift as in C reference, instead perform a + ;// Scale << Shift once in the beginning and do a right shift by a + ;// constant 2 after the Multiplication. The value of Round would be 2 + ;// + ;// By doing this we aviod the Branches required and also + ;// reduce the code size substantially + ;// + ;//---------------------------------------------------------------------- + + MOV Round, #2 ;// Round = 2 + LSL Scale, Scale, Shift ;// Scale = Scale << Shift + + + ;// Row 1 + SMLABB temp1, colOp00, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round + SMLABB temp3, colOp02, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round + SMLATB temp2, colOp00, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round + SMLATB temp4, colOp02, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round + + ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2 + ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2 + PKHBT out00, temp1, temp2, LSL #14 ;// c0w0 = | Temp2 | Temp1 | + PKHBT out02, temp3, temp4, LSL #14 ;// c1w0 = | Temp2 | Temp1 | + + + ;// Row 2 + SMLABB temp1, colOp10, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round + SMLABB temp3, colOp12, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round + SMLATB temp2, colOp10, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round + SMLATB temp4, colOp12, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round + + ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2 + ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2 + PKHBT out10, temp1, temp2, LSL #14 ;// c0w0 = | Temp2 | Temp1 | + PKHBT out12, temp3, temp4, LSL #14 ;// c1w0 = | Temp2 | Temp1 | + + ;// Row 3 + SMLABB temp1, colOp20, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round + SMLABB temp3, colOp22, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round + SMLATB temp2, colOp20, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round + SMLATB temp4, colOp22, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round + + ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2 + ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2 + PKHBT out20, temp1, temp2, LSL #14 ;// c0w0 = | Temp2 | Temp1 | + PKHBT out22, temp3, temp4, LSL #14 ;// c1w0 = | Temp2 | Temp1 | + + ;// Row 4 + SMLABB temp1, colOp30, Scale, Round ;// Temp1 = B(c0w0) * Scale + Round + SMLABB temp3, colOp32, Scale, Round ;// Temp3 = B(c1w0) * Scale + Round + SMLATB temp2, colOp30, Scale, Round ;// Temp2 = T(c0w0) * Scale + Round + SMLATB temp4, colOp32, Scale, Round ;// Temp4 = T(c1w0) * Scale + Round + + M_LDR pData,pDataOnStack ;// Restore pData pointer from stack + ASR temp1, temp1, #2 ;// Temp1 = Temp1 >> 2 + ASR temp3, temp3, #2 ;// Temp3 = Temp3 >> 2 + PKHBT out30, temp1, temp2, LSL #14 ;// c0w0 = | Temp2 | Temp1 | + PKHBT out32, temp3, temp4, LSL #14 ;// c1w0 = | Temp2 | Temp1 | + + + + ;*************************** + ;// Store all the 4x4 pixels + ;*************************** + +store_coeff + + STMIA pData,{out00,out02,out10,out12,out20,out22,out30,out32} + + + + ;// Set return value + + + ;// Write function tail + M_END + + ENDIF ;//ARM1136JS + + +;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4 + +;// Guarding implementation by the processor name + + + + +;// Function: omxVCM4P10_TransformDequantLumaDCFromPair + +;//Input Registers +ppSrc RN 0 +pDst RN 1 +QPR2 RN 2 + +;//Output Registers +result RN 0 + +;//Local Scratch Registers +pDstR4 RN 4 +pDstR0 RN 0 +QPR1 RN 1 +QPR5 RN 5 + +;// Guarding implementation by the processor name + + IF ARM1136JS + + ;// Allocate stack memory required by the function + + + ;// Write function header + M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5 + + MOV pDstR4,pDst ;// Saving register r1 + MOV QPR5,QPR2 ;// Saving register r2 + BL armVCM4P10_UnpackBlock4x4 + + MOV pDstR0,pDstR4 ;// Setting up register r0 + MOV QPR1,QPR5 ;// Setting up register r1 + BL armVCM4P10_InvTransformDequantLumaDC4x4 + + + ;// Set return value + MOV result,#OMX_Sts_NoErr + + ;// Write function tail + M_END + + + ENDIF ;//ARM1136JS + + + END
\ No newline at end of file |