diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s')
-rw-r--r-- | media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s | 480 |
1 files changed, 480 insertions, 0 deletions
;//
;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// File (as named by the originating diff, new file, mode 100644, blob 2b71486):
;//   media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/
;//   omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;//
;// Syntax : ARM armasm (RN register aliases, IF/ENDIF, INCLUDE/IMPORT).
;// Macros : M_VARIANTS, M_START, M_END, M_ALLOC8, M_ARG, M_ADR, M_LDR come from
;//          armCOMM_s.h, which is not part of this file. Their exact save/restore
;//          behaviour is assumed below, not shown here.
;//
;// Contents:
;//   armVCM4P10_DequantLumaAC4x4                        (ARM1136JS variant)
;//   armVCM4P10_DequantLumaAC4x4                        (ARM1136JS_U variant)
;//   omxVCM4P10_DequantTransformResidualFromPairAndAdd  (ARM1136JS variant only)
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_TransformResidual4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixU16
        IMPORT armVCM4P10_QPModuloTable

        M_VARIANTS ARM1136JS, ARM1136JS_U

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//-----------------------------------------------------------------------------
;// Static Function: armVCM4P10_DequantLumaAC4x4          (ARM1136JS variant)
;//
;// Scales the 16 signed 16-bit coefficients at pSrcDst in place:
;//     pSrcDst[i] = low16( pSrcDst[i] * (pVRow[col(i)] << (QP/6)) )
;// where pVRow is the row of armVCM4P10_VMatrixU16 selected through
;// armVCM4P10_QPModuloTable[QP] and col(i) is 0, 1 or 2 (see the unrolled
;// code below for the per-position column).
;//
;// In    : r0 = pSrcDst  (read and written as 8 words with LDMIA/STMIA)
;//         r1 = QP       (used as a byte index into the two QP tables)
;// Out   : none set by this routine; r0 is never written.
;// Uses  : r1-r12 and r14 as scratch. M_START is invoked with r11, which is
;//         presumed to preserve r4-r11 and the return address -- confirm in
;//         armCOMM_s.h. r14 (temp3) is overwritten inside the body.
;//
;// NOTE: several aliases below name the SAME physical register
;//       (QP/rowLuma01 = r1, pVRow/temp1 = r2, shift/temp2 = r3,
;//        pQPdiv/rowLuma23 = r4, pQPmod/SrcDst00 = r5, QPmod/SrcDst02 = r6).
;//       The instruction order is what keeps each value alive long enough;
;//       do not reorder without re-checking the aliases.
;//-----------------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF  ARM1136JS

;//Input Registers
pSrcDst       RN  0
QP            RN  1


;//Output Registers


;//Local Scratch Registers
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1
rowLuma23       RN  4

SrcDst00        RN  5
SrcDst02        RN  6
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2
temp2           RN  3
temp3           RN  14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16

        LDRSB   QPmod,[pQPmod,QP]                       ;// (QP%6) * 6
        LDRSB   shift,[pQPdiv,QP]                       ;// Shift = QP / 6

        ;// Halfword loads: this variant builds the packed pair with LDRH + ORR
        ;// rather than a single word load. The '!' leaves pVRow pointing at the
        ;// selected row.
        LDRH    rowLuma01,[pVRow,QPmod]!                ;// rowLuma01 = [00|0a]
        LDRH    temp3,[pVRow,#2]                        ;// temp3     = [00|0b]
        LDRH    rowLuma23,[pVRow,#4]                    ;// rowLuma23 = [00|0c]
        ORR     rowLuma01,rowLuma01,temp3,LSL #16       ;// rowLuma01 = [0b|0a]

        ;// Load all the 16 'src' values
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;//
        ;//*********************************************************************************************

        LSL     rowLuma01,rowLuma01,shift
        LSL     rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the Loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication results into a word and store at one go
        ;//
        ;//**********************************************************************************************


        ;// Row 1


        SMULTB  temp1,SrcDst00,rowLuma23                ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01             ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01             ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16         ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23             ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16         ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23             ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16         ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01             ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16         ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01             ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16         ;// Pack the next two product values



        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23             ;// pSrcDst[12] * (pVRow[2]<<Shift)

        ;// temp3 (r14) is the third temporary here because temp1 and temp2
        ;// still hold products that have not been packed yet.
        SMULTT  temp3,SrcDst32,rowLuma01                ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23             ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16         ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write all 16 scaled values back over the input
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value
        ;// (nothing is set; r0 still holds pSrcDst)



        ;// Write function tail
        M_END

    ENDIF                                                       ;//ARM1136JS


;//-----------------------------------------------------------------------------
;// Static Function: armVCM4P10_DequantLumaAC4x4          (ARM1136JS_U variant)
;//
;// Same contract and same multiply/pack sequence as the ARM1136JS variant.
;// The only difference is how the V-matrix row is fetched: two word loads
;// (LDR) replace the three LDRH + ORR.
;//
;// In    : r0 = pSrcDst, r1 = QP
;// Out   : none set by this routine; r0 is never written.
;// Uses  : r1-r12 and r14 as scratch (same aliasing hazards as above).
;//
;// NOTE(review): the word loads at [pVRow] and [pVRow,#4] sit on a row base
;//       that advances by the QPModuloTable byte offset, so they are
;//       presumably not always word-aligned -- the variant appears to rely on
;//       unaligned-access support; confirm.
;// NOTE(review): 'LDR rowLuma23,[pVRow,#4]' also fetches the halfword that
;//       follows pVRow[2]. Its value is never used (only the bottom half of
;//       rowLuma23 is multiplied), but the access itself must be legal for
;//       the last row of the table -- confirm against the table definition.
;//-----------------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF  ARM1136JS_U

;//Input Registers
pSrcDst       RN  0
QP            RN  1


;//Output Registers


;//Local Scratch Registers
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1
rowLuma23       RN  4

SrcDst00        RN  5
SrcDst02        RN  6
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2
temp2           RN  3
temp3           RN  14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16

        LDRSB   QPmod,[pQPmod,QP]                       ;// (QP%6) * 6
        LDRSB   shift,[pQPdiv,QP]                       ;// Shift = QP / 6

        LDR     rowLuma01,[pVRow,QPmod]!                ;// rowLuma01 = [0b|0a]
        LDR     rowLuma23,[pVRow,#4]                    ;// rowLuma23 = [0d|0c]

        ;// Load all the 16 'src' values
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;//
        ;//*********************************************************************************************

        LSL     rowLuma01,rowLuma01,shift
        LSL     rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the Loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication results into a word and store at one go
        ;//
        ;//**********************************************************************************************


        ;// Row 1


        SMULTB  temp1,SrcDst00,rowLuma23                ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01             ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01             ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16         ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23             ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16         ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23             ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16         ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01             ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16         ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01             ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16         ;// Pack the next two product values



        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23             ;// pSrcDst[12] * (pVRow[2]<<Shift)

        ;// temp3 (r14) is the third temporary here because temp1 and temp2
        ;// still hold products that have not been packed yet.
        SMULTT  temp3,SrcDst32,rowLuma01                ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23             ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16         ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write all 16 scaled values back over the input
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value
        ;// (nothing is set; r0 still holds pSrcDst)



        ;// Write function tail
        M_END

    ENDIF                                                       ;//ARM1136JS_U





;//-----------------------------------------------------------------------------
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd   (ARM1136JS)
;//
;// Builds a 4x4 block of 16-bit residuals in a local buffer, adds it to a
;// 4x4 block of 8-bit prediction samples, clips each sum to [0,255] and
;// stores the result to the destination.
;//
;// In    : r0      = ppSrc    (passed straight to armVCM4P10_UnpackBlock4x4)
;//         r1      = pPred    (prediction rows, read one word per row)
;//         r2      = pDC      (pointer to a signed 16-bit DC value, may be 0
;//                             only when AC != 0 -- see below)
;//         r3      = pDst     (destination rows, written one word per row)
;//         stack   = predStep, dstStep, QP, AC   (in that M_ARG order)
;// Out   : r0      = OMX_Sts_NoErr (always; no argument checking is done here)
;// Local : pBuffer, 32 bytes, declared with M_ALLOC8 (STRD is used on it).
;//
;// Paths :
;//   AC != 0 : armVCM4P10_UnpackBlock4x4(ppSrc, pDelta)
;//             armVCM4P10_DequantLumaAC4x4(pDelta, QP)
;//             if (pDC != 0) pDelta[0] = *pDC
;//             armVCM4P10_TransformResidual4x4(pDelta, pDelta)
;//   AC == 0 : every pDelta[i] = (*pDC + 32) >> 6.
;//             pDC is dereferenced WITHOUT a NULL check on this path, so the
;//             caller must supply a valid pDC whenever AC == 0.
;//
;// The values kept in r4 (pDelta), r7 (pPredTemp), r8 (pDCTemp) and
;// r9 (pDstTemp) must survive the three BL calls; this assumes the callees
;// preserve those registers -- confirm for the imported routines.
;//
;// NOTE(review): pPred and pDst are accessed with word LDR/STR; any alignment
;//       requirement that implies is the caller's responsibility.
;//-----------------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF  ARM1136JS

;//Input Registers
ppSrc       RN  0
pPred       RN  1
pDC         RN  2
pDst        RN  3


;//Output Registers
result      RN  0

;//Local Scratch Registers
pDelta      RN  4
pDeltaTmp   RN  6
AC          RN  5                   ;//Load from stack
pPredTemp   RN  7
pDCTemp     RN  8
pDstTemp    RN  9
pDeltaArg1  RN  1
pDeltaArg0  RN  0
QP          RN  1                   ;//Load from stack
DCval       RN  10
DCvalCopy   RN  11
predstep    RN  1
dstStep     RN  10
ycounter    RN  0
PredVal1    RN  3
PredVal2    RN  5
DeltaVal1   RN  2
DeltaVal2   RN  11
PredVal     RN  8
tmpDeltaVal RN  6
sum1        RN  12
sum2        RN  14



        ;// Allocate stack memory required by the function
        M_ALLOC8 pBuffer, 32


        ;// Write function header
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11

        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4


        M_ADR   pDelta,pBuffer
        M_LDR   AC,ACOnStack


        ;// Save registers r1,r2,r3 before function call
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst

        CMP     AC,#0
        BEQ     DCcase
        MOV     pDeltaArg1,pDelta                       ;// Set up r1 for armVCM4P10_UnpackBlock4x4

        BL      armVCM4P10_UnpackBlock4x4

        M_LDR   QP,QPOnStack                            ;// Set up r1 for DequantLumaAC4x4
        MOV     pDeltaArg0,pDelta                       ;// Set up r0 for DequantLumaAC4x4

        BL      armVCM4P10_DequantLumaAC4x4


        ;// If a DC pointer was supplied, its value replaces pDelta[0].
        ;// The two MOVs do not touch the flags, so the NE condition set by
        ;// CMP is still valid for STRHNE.
        CMP     pDCTemp,#0
        LDRSHNE DCval,[pDCTemp]
        MOV     pDeltaArg0,pDelta                       ;// Set up r0 for armVCM4P10_TransformResidual4x4
        MOV     pDeltaArg1,pDelta                       ;// Set up r1 for armVCM4P10_TransformResidual4x4
        STRHNE  DCval,[pDelta]

        BL      armVCM4P10_TransformResidual4x4
        B       OutDCcase


DCcase
        ;// AC == 0: every residual is the rounded DC value (*pDC + 32) >> 6.
        ;// pDCTemp is not checked for NULL here.
        LDRSH   DCval,[pDCTemp]
        ADD     DCval,DCval,#32
        ASR     DCval,DCval,#6
        PKHBT   DCval,DCval,DCval,LSL #16               ;// Duplicating the Lower halfword
        MOV     DCvalCopy, DCval                        ;// Needed for STRD
        STRD    DCval, [pDelta, #0]                     ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
        STRD    DCval, [pDelta, #8]                     ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
        STRD    DCval, [pDelta, #16]                    ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        STRD    DCval, [pDelta, #24]                    ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


OutDCcase
        ;// From here on r1, r8, r10 and r11 are reused under new aliases
        ;// (predstep, PredVal, dstStep, DeltaVal2).
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack

        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}         ;// Pre load
        MOV     ycounter,#4                             ;// Counter for the PredPlusDeltaLoop
        LDR     PredVal,[pPredTemp]                     ;// Pre load

PredPlusDeltaLoop
        ;// One iteration per row. The flags written by SUBS are consumed by
        ;// LDRGT, LDMGTIA and the closing BGT; no instruction in between
        ;// uses a flag-setting (S) form.

        SUBS    ycounter,ycounter,#1
        ADD     pPredTemp,pPredTemp,predstep            ;// Increment pPred ptr

        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16 ;// Deltaval1 = [C A]
        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16 ;// DeltaVal2 = [D B]

        UXTB16  PredVal1,PredVal                        ;// PredVal1 = [0c0a]
        UXTB16  PredVal2,PredVal,ROR #8                 ;// PredVal2 = [0d0b]

        LDRGT   PredVal,[pPredTemp]                     ;// Pre load

        QADD16  sum2,DeltaVal2,PredVal2                 ;// Add and saturate to 16 bits
        QADD16  sum1,DeltaVal1,PredVal1

        USAT16  sum2,#8,sum2                            ;// armClip(0,255,sum2)
        USAT16  sum1,#8,sum1

        LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2}         ;// Pre load

        ORR     sum1,sum1,sum2,LSL #8                   ;// sum1 = [dcba]
        STR     sum1,[pDstTemp]

        ADD     pDstTemp,pDstTemp,dstStep               ;// Increment pDst ptr
        BGT     PredPlusDeltaLoop


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End


        ;// Write function tail

        M_END

    ENDIF                                                       ;//ARM1136JS


;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd

;// Guarding implementation by the processor name
;// NOTE(review): no ARM1136JS_U implementation of this function follows in
;//       this file; only the header comments above are present.




    END