summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
diff options
context:
space:
mode:
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s')
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s480
1 files changed, 480 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
new file mode 100644
index 0000000..2b71486
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
@@ -0,0 +1,480 @@
+;//
+;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;//
+;//
+
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Import symbols required from other files
+;// (For example tables)
+
+ IMPORT armVCM4P10_UnpackBlock4x4
+ IMPORT armVCM4P10_TransformResidual4x4
+ IMPORT armVCM4P10_QPDivTable
+ IMPORT armVCM4P10_VMatrixU16
+ IMPORT armVCM4P10_QPModuloTable
+
+ M_VARIANTS ARM1136JS, ARM1136JS_U
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_DequantLumaAC4x4
+;// Scales the 16 halfword values at pSrcDst in place: value[i] = low 16 bits of value[i] * (factor << shift).
+;// Guarding implementation by the processor name
+;// ARM1136JS variant: the three factors are fetched with individual halfword loads.
+ IF ARM1136JS
+
+;//Input Registers
+pSrcDst RN 0 ;// r0: address of the 16 halfword values (loaded as 8 words, overwritten with the results)
+QP RN 1 ;// r1: byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable
+
+
+;//Output Registers
+;// None: this routine sets no return value; results go back to memory through pSrcDst.
+;// Register names below overlap on purpose: each alias is first written only after the last use of the name it replaces.
+;//Local Scratch Registers
+pQPdiv RN 4 ;// address of armVCM4P10_QPDivTable
+pQPmod RN 5 ;// address of armVCM4P10_QPModuloTable
+pVRow RN 2 ;// address of armVCM4P10_VMatrixU16, then advanced by QPmod bytes
+QPmod RN 6 ;// byte offset of the factor row inside armVCM4P10_VMatrixU16
+shift RN 3 ;// left-shift amount applied to the factors
+rowLuma01 RN 1 ;// [pVRow[1] | pVRow[0]] packed as halfwords (takes over r1 from QP)
+rowLuma23 RN 4 ;// pVRow[2] in the bottom halfword (takes over r4 from pQPdiv)
+;// SrcDst<y><x>: one word = two adjacent halfword values of row y, starting at column x
+SrcDst00 RN 5
+SrcDst02 RN 6
+SrcDst10 RN 7
+SrcDst12 RN 8
+SrcDst20 RN 9
+SrcDst22 RN 10
+SrcDst30 RN 11
+SrcDst32 RN 12
+;// Temporaries that hold the product of the upper halfword of a pair until it is packed
+temp1 RN 2
+temp2 RN 3
+temp3 RN 14 ;// r14 is the link register; presumably saved/restored by M_START/M_END - confirm in armCOMM_s.h
+
+
+ ;// Allocate stack memory required by the function
+ ;// (none is allocated for this routine)
+ ;// Write function header
+ M_START armVCM4P10_DequantLumaAC4x4,r11
+ ;// Fetch the addresses of the three imported lookup tables
+ LDR pQPmod,=armVCM4P10_QPModuloTable
+ LDR pQPdiv,=armVCM4P10_QPDivTable
+ LDR pVRow,=armVCM4P10_VMatrixU16
+
+ LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6
+ LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
+ ;// QP (r1) is dead from here on; r1 is reused as rowLuma01. The writeback (!) leaves pVRow at the selected row.
+ LDRH rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [00|0a]
+ LDRH temp3,[pVRow,#2] ;// temp3 = [00|0b]
+ LDRH rowLuma23,[pVRow,#4] ;// rowLuma23 = [00|0c]
+ ORR rowLuma01,rowLuma01,temp3,LSL #16 ;// rowLuma01 = [0b|0a]
+
+ ;// Load all the 16 'src' values
+ LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+ ;//*********************************************************************************************
+ ;//
+ ;// 'Shift' ranges between [0,8]
+ ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
+ ;//
+ ;//*********************************************************************************************
+
+ LSL rowLuma01,rowLuma01,shift
+ LSL rowLuma23,rowLuma23,shift
+
+
+ ;//**********************************************************************************************
+ ;//
+ ;// The idea is to unroll the loop completely
+ ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
+ ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
+ ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
+ ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
+ ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
+ ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
+ ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
+ ;//
+ ;// We then pack the two 16-bit multiplication results into a word and store them all in one go
+ ;//
+ ;//**********************************************************************************************
+
+ ;// From here on temp1/temp2 replace pVRow/shift in r2/r3; both were last read above.
+ ;// Row 1
+
+ ;// The upper-halfword product is computed first because the lower-halfword product overwrites the source word.
+ SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
+ SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
+
+ SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
+ SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
+ ;// PKHBT keeps the bottom halfword of each product: low half from the first operand, high half from the second.
+ PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values
+
+
+ ;// Row 2
+ SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
+ SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
+
+ PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
+ SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
+ SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
+
+ PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
+
+
+ ;// Row 3
+
+ SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
+ SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
+
+ PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
+ SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
+ SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
+
+ PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
+
+
+
+ ;// Row 4
+ ;// temp2 still holds the pSrcDst[11] product (packed below), so the pSrcDst[15] product goes to temp3.
+ SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
+ SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
+
+ SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
+ SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
+
+ PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
+ PKHBT SrcDst30,SrcDst30,temp1,LSL #16
+ PKHBT SrcDst32,SrcDst32,temp3,LSL #16
+
+ ;// Write all 16 scaled halfwords back over the input (no writeback, so pSrcDst itself is unchanged)
+ STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+ ;// Set return value
+ ;// (no return value is produced by this routine)
+
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//ARM1136JS
+
+
+;// Guarding implementation by the processor name
+;// ARM1136JS_U variant of armVCM4P10_DequantLumaAC4x4: same in-place scaling of the 16 halfword values at pSrcDst,
+ IF ARM1136JS_U
+;// but pVRow[0] and pVRow[1] are fetched together with one word load instead of two halfword loads.
+;//Input Registers
+pSrcDst RN 0 ;// r0: address of the 16 halfword values (loaded as 8 words, overwritten with the results)
+QP RN 1 ;// r1: byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable
+
+
+;//Output Registers
+;// None: this routine sets no return value; results go back to memory through pSrcDst.
+;// Register names below overlap on purpose: each alias is first written only after the last use of the name it replaces.
+;//Local Scratch Registers
+pQPdiv RN 4 ;// address of armVCM4P10_QPDivTable
+pQPmod RN 5 ;// address of armVCM4P10_QPModuloTable
+pVRow RN 2 ;// address of armVCM4P10_VMatrixU16, then advanced by QPmod bytes
+QPmod RN 6 ;// byte offset of the factor row inside armVCM4P10_VMatrixU16
+shift RN 3 ;// left-shift amount applied to the factors
+rowLuma01 RN 1 ;// [pVRow[1] | pVRow[0]] packed as halfwords (takes over r1 from QP)
+rowLuma23 RN 4 ;// pVRow[2] in the bottom halfword (takes over r4 from pQPdiv)
+;// SrcDst<y><x>: one word = two adjacent halfword values of row y, starting at column x
+SrcDst00 RN 5
+SrcDst02 RN 6
+SrcDst10 RN 7
+SrcDst12 RN 8
+SrcDst20 RN 9
+SrcDst22 RN 10
+SrcDst30 RN 11
+SrcDst32 RN 12
+;// Temporaries that hold the product of the upper halfword of a pair until it is packed
+temp1 RN 2
+temp2 RN 3
+temp3 RN 14 ;// r14 is the link register; presumably saved/restored by M_START/M_END - confirm in armCOMM_s.h
+
+
+ ;// Allocate stack memory required by the function
+ ;// (none is allocated for this routine)
+ ;// Write function header
+ M_START armVCM4P10_DequantLumaAC4x4,r11
+ ;// Fetch the addresses of the three imported lookup tables
+ LDR pQPmod,=armVCM4P10_QPModuloTable
+ LDR pQPdiv,=armVCM4P10_QPDivTable
+ LDR pVRow,=armVCM4P10_VMatrixU16
+
+ LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6
+ LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
+ ;// QP (r1) is dead from here on; r1 is reused as rowLuma01. The writeback (!) leaves pVRow at the selected row.
+ LDR rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [0b|0a] (one word load; the address need not be word-aligned)
+ LDRH rowLuma23,[pVRow,#4] ;// rowLuma23 = [00|0c]; only this halfword is ever used, so nothing past pVRow[2] is read
+
+ ;// Load all the 16 'src' values
+ LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+ ;//*********************************************************************************************
+ ;//
+ ;// 'Shift' ranges between [0,8]
+ ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
+ ;//
+ ;//*********************************************************************************************
+
+ LSL rowLuma01,rowLuma01,shift
+ LSL rowLuma23,rowLuma23,shift
+
+
+ ;//**********************************************************************************************
+ ;//
+ ;// The idea is to unroll the loop completely
+ ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
+ ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
+ ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
+ ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
+ ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
+ ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
+ ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
+ ;//
+ ;// We then pack the two 16-bit multiplication results into a word and store them all in one go
+ ;//
+ ;//**********************************************************************************************
+
+ ;// From here on temp1/temp2 replace pVRow/shift in r2/r3; both were last read above.
+ ;// Row 1
+
+ ;// The upper-halfword product is computed first because the lower-halfword product overwrites the source word.
+ SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
+ SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
+
+ SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
+ SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
+ ;// PKHBT keeps the bottom halfword of each product: low half from the first operand, high half from the second.
+ PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values
+
+
+ ;// Row 2
+ SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
+ SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
+
+ PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
+ SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
+ SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
+
+ PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
+
+
+ ;// Row 3
+
+ SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
+ SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
+
+ PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
+ SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
+ SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
+
+ PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
+
+
+
+ ;// Row 4
+ ;// temp2 still holds the pSrcDst[11] product (packed below), so the pSrcDst[15] product goes to temp3.
+ SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
+ SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
+
+ SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
+ SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
+
+ PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
+ PKHBT SrcDst30,SrcDst30,temp1,LSL #16
+ PKHBT SrcDst32,SrcDst32,temp3,LSL #16
+
+ ;// Write all 16 scaled halfwords back over the input (no writeback, so pSrcDst itself is unchanged)
+ STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+ ;// Set return value
+ ;// (no return value is produced by this routine)
+
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//ARM1136JS_U
+
+
+
+
+
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
+;// Builds 16 halfword residuals in a 32-byte stack buffer, adds them to 4 rows of 4 prediction bytes, clips to 0..255, writes 4 bytes per row to pDst.
+;// Guarding implementation by the processor name
+;// AC != 0: UnpackBlock4x4 -> DequantLumaAC4x4 -> optional DC overwrite -> TransformResidual4x4.  AC == 0: every residual = (*pDC + 32) >> 6.
+ IF ARM1136JS
+
+;//Input Registers
+ppSrc RN 0 ;// r0: handed unchanged to armVCM4P10_UnpackBlock4x4 (AC path only)
+pPred RN 1 ;// r1: address of the first prediction row (read one word per row)
+pDC RN 2 ;// r2: address of a halfword DC value; may be 0 on the AC path
+pDst RN 3 ;// r3: address of the first destination row (written one word per row)
+
+;// Remaining arguments (predStep, dstStep, QP, AC) arrive on the stack; see the M_ARG declarations below.
+;//Output Registers
+result RN 0 ;// r0: always OMX_Sts_NoErr
+;// Many names below share a register; each later name is first written only after the earlier one is no longer needed.
+;//Local Scratch Registers
+pDelta RN 4 ;// address of the 32-byte residual buffer; advanced while the loop consumes it
+pDeltaTmp RN 6 ;// not used in this block
+AC RN 5 ;//Load from stack
+pPredTemp RN 7 ;// copy of pPred, stepped by predstep each row
+pDCTemp RN 8 ;// copy of pDC
+pDstTemp RN 9 ;// copy of pDst, stepped by dstStep each row
+pDeltaArg1 RN 1 ;// second argument register for the helper calls
+pDeltaArg0 RN 0 ;// first argument register for the helper calls
+QP RN 1 ;//Load from stack
+DCval RN 10 ;// DC value; even register of the STRD pair
+DCvalCopy RN 11 ;// odd register of the STRD pair
+predstep RN 1 ;// byte step between prediction rows
+dstStep RN 10 ;// byte step between destination rows (takes over r10 from DCval)
+ycounter RN 0 ;// rows left to process
+PredVal1 RN 3 ;// prediction bytes 0 and 2, zero-extended to halfwords
+PredVal2 RN 5 ;// prediction bytes 1 and 3, zero-extended to halfwords
+DeltaVal1 RN 2 ;// residuals 0 and 2 of the current row
+DeltaVal2 RN 11 ;// second residual word as loaded, then residuals 1 and 3
+PredVal RN 8 ;// four prediction bytes of the current row (takes over r8 from pDCTemp)
+tmpDeltaVal RN 6 ;// first residual word of the current row as loaded
+sum1 RN 12 ;// clipped sums for bytes 0 and 2, then the packed output word
+sum2 RN 14 ;// clipped sums for bytes 1 and 3 (r14 is lr; presumably saved by M_START - confirm)
+
+
+
+ ;// Allocate stack memory required by the function
+ M_ALLOC8 pBuffer, 32
+ ;// 32 bytes = 16 halfword residuals
+
+ ;// Write function header
+ M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
+
+ ;// Define stack arguments
+ M_ARG predStepOnStack, 4
+ M_ARG dstStepOnStack,4
+ M_ARG QPOnStack, 4
+ M_ARG ACOnStack,4
+
+
+ M_ADR pDelta,pBuffer
+ M_LDR AC,ACOnStack
+
+
+ ;// Save registers r1,r2,r3 before function call
+ MOV pPredTemp,pPred
+ MOV pDCTemp,pDC
+ MOV pDstTemp,pDst
+
+ CMP AC,#0
+ BEQ DCcase
+ MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4
+ ;// r0 still holds ppSrc here
+ BL armVCM4P10_UnpackBlock4x4
+
+ M_LDR QP,QPOnStack ;// Set up r1 for DequantLumaAC4x4
+ MOV pDeltaArg0,pDelta ;// Set up r0 for DequantLumaAC4x4
+
+ BL armVCM4P10_DequantLumaAC4x4
+
+ ;// If a DC pointer was supplied, its halfword replaces pDelta[0] before the transform
+ CMP pDCTemp,#0
+ LDRSHNE DCval,[pDCTemp]
+ MOV pDeltaArg0,pDelta ;// Set up r0 for armVCM4P10_TransformResidual4x4
+ MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_TransformResidual4x4
+ STRHNE DCval,[pDelta]
+
+ BL armVCM4P10_TransformResidual4x4
+ B OutDCcase
+
+ ;// DC-only path. NOTE(review): pDCTemp is dereferenced without a NULL check - presumably callers guarantee pDC when AC == 0; confirm.
+DCcase
+ LDRSH DCval,[pDCTemp]
+ ADD DCval,DCval,#32
+ ASR DCval,DCval,#6
+ PKHBT DCval,DCval,DCval,LSL #16 ;// Duplicating the Lower halfword
+ MOV DCvalCopy, DCval ;// Needed for STRD
+ STRD DCval, [pDelta, #0] ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval
+ STRD DCval, [pDelta, #8] ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval
+ STRD DCval, [pDelta, #16] ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval
+ STRD DCval, [pDelta, #24] ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
+
+ ;// Both paths meet here with the 16 residuals stored at pDelta
+OutDCcase
+ M_LDR predstep,predStepOnStack
+ M_LDR dstStep,dstStepOnStack
+ ;// The first row's residuals and prediction are loaded ahead of the loop; later rows are preloaded inside it
+ LDMIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load
+ MOV ycounter,#4 ;// Counter for the PredPlusDeltaLoop
+ LDR PredVal,[pPredTemp] ;// Pre load
+
+PredPlusDeltaLoop
+ ;// The flags set by SUBS drive LDRGT, LDMGTIA and BGT below; no instruction in between carries an S suffix
+
+ SUBS ycounter,ycounter,#1
+ ADD pPredTemp,pPredTemp,predstep ;// Increment pPred ptr
+ ;// Residuals arrive as [B A] and [D C]; regroup them into even/odd columns to match the UXTB16 split of the prediction
+ PKHBT DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16 ;// Deltaval1 = [C A]
+ PKHTB DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16 ;// DeltaVal2 = [D B]
+
+ UXTB16 PredVal1,PredVal ;// PredVal1 = [0c0a]
+ UXTB16 PredVal2,PredVal,ROR #8 ;// PredVal2 = [0d0b]
+ ;// Skipped on the last pass, so no prediction row beyond the fourth is read
+ LDRGT PredVal,[pPredTemp] ;// Pre load
+
+ QADD16 sum2,DeltaVal2,PredVal2 ;// Add and saturate to 16 bits
+ QADD16 sum1,DeltaVal1,PredVal1
+
+ USAT16 sum2,#8,sum2 ;// armClip(0,255,sum2)
+ USAT16 sum1,#8,sum1
+ ;// Skipped on the last pass, so nothing beyond the 32-byte buffer is read
+ LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load
+
+ ORR sum1,sum1,sum2,LSL #8 ;// sum1 = [dcba]
+ STR sum1,[pDstTemp]
+
+ ADD pDstTemp,pDstTemp,dstStep ;// Increment pDst ptr
+ BGT PredPlusDeltaLoop
+
+
+ ;// Set return value
+ MOV result,#OMX_Sts_NoErr
+ ;// The End label below is not referenced anywhere in this block
+End
+
+
+ ;// Write function tail
+
+ M_END
+
+ ENDIF ;//ARM1136JS
+
+
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
+
+;// Guarding implementation by the processor name
+
+
+
+
+ END